#!/usr/bin/python

# Extracts anchor texts and URLs from a given HTML/XHTML document
# Copyright (C) 2007 Susam Pal
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

VERSION = '0.2'
COPYRIGHT = 'Copyright (C) 2007 Susam Pal'

import sys
from HTMLParser import HTMLParser
from urlparse import urljoin
from urllib2 import urlopen

class AnchorParser(HTMLParser):
  """Collect the anchors (<a href="...">text</a>) of an HTML document.

  After parse() has run, self.links maps each anchor text (with runs of
  whitespace collapsed to single spaces) to its URL resolved against
  baseURL. If the same text occurs more than once, the first URL seen
  for it is kept.
  """

  def __init__(self, baseURL):
    """Create a parser that resolves relative hrefs against baseURL."""
    HTMLParser.__init__(self)
    self.baseURL = baseURL
    self.links = {}
    # href and normalised text of the most recently handled anchor.
    self.url = None
    self.data = ''
    # Text chunks of the anchor currently being read; None whenever we
    # are not inside an <a> element that carries an href attribute.
    self._chunks = None

  def parse(self, s):
    """Feed the whole document s to the parser and finish parsing."""
    self.feed(s)
    self.close()

  def handle_starttag(self, tag, attrs):
    """Start collecting text when an <a> tag with an href is opened."""
    if tag != 'a':
      return
    for name, value in attrs:
      if name == 'href':
        self.url = value
        self._chunks = []
        return
    # An <a> without href (e.g. <a name="top">) is not a link, so any
    # text inside it must not be paired with an earlier anchor's URL.
    self._chunks = None

  def handle_data(self, data):
    """Accumulate text that appears inside the current anchor."""
    # Text may arrive in several pieces when the anchor contains nested
    # tags such as <b>; keep all of them instead of only the last one.
    if self._chunks is not None:
      self._chunks.append(data)

  def handle_endtag(self, tag):
    """Record the anchor text and its absolute URL when </a> is seen."""
    if tag != 'a' or self._chunks is None:
      # Not an anchor, or a stray </a> with no open <a href>.
      return
    self.data = ' '.join(''.join(self._chunks).split())
    self._chunks = None
    # The first URL seen for a given text wins.
    if self.data not in self.links:
      self.links[self.data] = urljoin(self.baseURL, self.url)

  def printlinks(self):
    """Print every collected link as TEXT:URL, one per line."""
    for text in self.links:
      print('%s:%s' % (text, self.links[text]))

def printhelp(exitcode):
  """Print the usage text on standard output and exit.

  The command name shown in the text is taken from sys.argv[0].

  exitcode -- status passed to sys.exit() after the text is printed
  """
  lines = (
    'Usage: %(cmd)s BASEURL [FILE]',
    '   or: %(cmd)s OPTION',
    '',
    'If only BASEURL is specified, the HTML content from the specified URL',
    'is fetched and anchors are extracted from it. All relative URLs are',
    'properly resolved to absolute URLs.',
    # This separator line consists of two spaces in the emitted text.
    '  ',
    'If FILE is specified, anchors are extracted from the HTML content',
    'present in the file and BASEURL is only used for resolving relative',
    'URLs.',
    '',
    'Options:',
    '  -v, --version   print version',
    '  -h, --help      print help',
    '',
    'e.g. %(cmd)s http://susam.in/',
    '     %(cmd)s http://susam.in/ susam.html',
    '     %(cmd)s . susam.html',
    '     %(cmd)s / susam.html',
    '',
    'Caveat: If the same hypertext occurs twice or more pointing to various',
    'URLs, only the first hypertext appears in the result.',
  )
  # Substitute the command name once over the assembled text.
  print('\n'.join(lines) % {'cmd': sys.argv[0]})
  sys.exit(exitcode)

def printversion():
  """Print the program name, version and copyright line, then exit 0."""
  banner = 'LinkClip, version %s' % VERSION
  print(banner)
  print(COPYRIGHT)
  sys.exit(0)

# Execution begins here
if __name__ == '__main__':
  # Print help on invalid command line
  if len(sys.argv) < 2:
    printhelp(1)

  # Print help or version if mentioned in command line
  # (printhelp and printversion exit the process themselves)
  option = sys.argv[1]
  if option in ('-h', '--help'):
    printhelp(0)
  elif option in ('-v', '--version'):
    printversion()

  # The first argument is the base URL; an optional second argument
  # names a local file to read the HTML from instead of fetching it.
  baseURL = sys.argv[1]
  fileName = None
  if len(sys.argv) > 2:
    fileName = sys.argv[2]

  # Input directly from the web or a local file?
  if fileName is None:
    stream = urlopen(baseURL)
  else:
    stream = open(fileName, 'r')

  # Read everything up front and always release the connection or file
  # handle, even if reading fails.
  try:
    html = stream.read()
  finally:
    stream.close()

  # Extract the anchors and print
  parser = AnchorParser(baseURL)
  parser.parse(html)
  parser.printlinks()
 

