#!/usr/bin/python

# Extracts anchor texts and URLs from a given HTML/XHTML document
# Copyright (C) 2007 Susam Pal
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

VERSION = '0.2'
COPYRIGHT = 'Copyright (C) 2007 Susam Pal'

import sys

try:
    # Python 3 module locations.
    from html.parser import HTMLParser
    from urllib.parse import urljoin
    from urllib.request import urlopen
except ImportError:
    # Python 2 module locations (the ones this script was written for).
    from HTMLParser import HTMLParser
    from urlparse import urljoin
    from urllib2 import urlopen


class AnchorParser(HTMLParser):
    """Collect the text and absolute URL of every <a href=...> element.

    After parsing, ``links`` maps each anchor text (whitespace runs
    collapsed to single spaces) to its href resolved against
    ``baseURL``.  When the same text occurs more than once, only the
    first occurrence is kept.
    """

    def __init__(self, baseURL):
        """Create a parser that resolves relative hrefs against baseURL."""
        HTMLParser.__init__(self)
        self.baseURL = baseURL
        self.links = {}
        # href of the anchor currently being read; None outside an anchor
        # and inside anchors that carry no href attribute.
        self.url = None
        # Normalised text of the most recently completed anchor.
        self.data = ''
        # Text fragments of the anchor currently being read; None while no
        # href-bearing anchor is open, so stray text is not collected.
        self._text_parts = None

    def parse(self, s):
        """Feed the complete document s to the parser and finish parsing."""
        self.feed(s)
        self.close()

    def handle_starttag(self, tag, attrs):
        """Start collecting text when an <a> with an href attribute opens."""
        if tag != 'a':
            return
        # Reset per-anchor state so nothing leaks in from an earlier anchor.
        self.url = None
        self._text_parts = None
        for name, value in attrs:
            if name == 'href':
                # A value-less href is reported as None; treat it as an
                # empty reference, which urljoin resolves to the base URL.
                self.url = value or ''
                self._text_parts = []
                break

    def handle_data(self, data):
        """Accumulate text that appears inside the currently open anchor."""
        # The anchor text may arrive in several pieces (for example when it
        # contains nested tags), so append instead of overwriting.
        if self._text_parts is not None:
            self._text_parts.append(data)

    def handle_endtag(self, tag):
        """Record the finished anchor unless its text was already seen."""
        if tag != 'a' or self._text_parts is None:
            return
        self.data = ' '.join(''.join(self._text_parts).split())
        if self.data not in self.links:
            self.links[self.data] = urljoin(self.baseURL, self.url)
        self.url = None
        self._text_parts = None

    def printlinks(self):
        """Print every collected link as TEXT:URL, one per line."""
        for text, url in self.links.items():
            print('%s:%s' % (text, url))


def printhelp(exitcode):
    """Print the usage message and exit with the given exit code."""
    subst = {'cmd': sys.argv[0]}
    print('Usage: %(cmd)s BASEURL [FILE]' % subst)
    print(' or: %(cmd)s OPTION' % subst)
    print("""
If only BASEURL is specified, the HTML content from the specified URL is
fetched and anchors are extracted from it. All relative URLs are
properly resolved to absolute URLs.

If FILE is specified, anchors are extracted from the HTML content
present in the file and BASEURL is only used for resolving relative
URLs.

Options:
  -v, --version   print version
  -h, --help      print help

e.g.
  %(cmd)s http://susam.in/
  %(cmd)s http://susam.in/ susam.html
  %(cmd)s . susam.html
  %(cmd)s / susam.html

Caveat:
If the same hypertext occurs twice or more pointing to various URLs,
only the first hypertext appears in the result.""" % subst)
    sys.exit(exitcode)


def printversion():
    """Print the program name, version and copyright, then exit with 0."""
    print('LinkClip, version %s' % VERSION)
    print(COPYRIGHT)
    sys.exit(0)


def _decode(raw, charset=None):
    """Return raw as a native str, decoding bytes when necessary.

    On Python 2 the data read from a file or URL is already a str and is
    returned unchanged.  On Python 3 it is bytes and is decoded with the
    given charset, falling back to UTF-8; undecodable bytes are replaced.
    """
    if isinstance(raw, str):
        return raw
    try:
        return raw.decode(charset or 'utf-8', 'replace')
    except LookupError:
        # The declared charset name is not known to Python.
        return raw.decode('utf-8', 'replace')


def _read_html(baseURL, fileName=None):
    """Return the HTML text from fileName, or from baseURL if it is None."""
    if fileName is None:
        stream = urlopen(baseURL)
    else:
        stream = open(fileName, 'rb')
    try:
        raw = stream.read()
        # Python 3 HTTP responses expose the declared charset via their
        # headers; plain files and Python 2 responses do not.
        headers = getattr(stream, 'headers', None)
        charset = None
        if hasattr(headers, 'get_content_charset'):
            charset = headers.get_content_charset()
    finally:
        # Always release the file handle / network connection.
        stream.close()
    return _decode(raw, charset)


def main():
    """Parse the command line, extract the anchors and print them."""
    # Print help on invalid command line
    if len(sys.argv) < 2:
        printhelp(1)

    # Print help or version if mentioned in command line
    option = sys.argv[1]
    if option in ('-h', '--help'):
        printhelp(0)
    elif option in ('-v', '--version'):
        printversion()

    # Decide source for HTML: directly from the web or a local file
    baseURL = sys.argv[1]
    fileName = None
    if len(sys.argv) > 2:
        fileName = sys.argv[2]

    # Extract the anchors and print
    parser = AnchorParser(baseURL)
    parser.parse(_read_html(baseURL, fileName))
    parser.printlinks()


# Execution begins here
if __name__ == '__main__':
    main()