getWebFiles.py

I wrote this to download every file of a given type (identified by its file extension) that is linked from a web page.

Here is the code:

#!/usr/bin/env python

"""
$Id: getWebFiles.py 1297 2003-07-02 12:59:34Z david $

Simple tool to download all the files with a given extension from
a web page.
"""


import sys, os
from sgmllib import SGMLParser
from urllib import urlopen
from urlparse import urljoin


# Usage text that main() writes to stderr when the command-line arguments
# are missing; the script's own file name stands in for the program name.
USAGE = "%s <URL> <extension>\n" % (
	os.path.basename(sys.argv[0]),
)


class LinkExtractor (SGMLParser):
	"""Collect the href targets of the <a> tags in an HTML document.

	Feed the document with feed(), call close(), then read the result
	from getLinks().  Links are kept in the order they were seen, with
	any "#fragment" and all whitespace removed; links that are empty
	after cleaning are not recorded.
	"""

	def __init__(self):
		self.links = []
		SGMLParser.__init__(self)

	def do_a(self, attributes):
		"""Record the cleaned href value(s) of one <a> tag.

		attributes is the list of (name, value) pairs for the tag.
		"""
		for name, value in attributes:
			if name != "href":
				continue
			link = self.cleanLink(value)
			# A link that cleans to "" (for example a bare "#top"
			# anchor) points nowhere useful, so skip it.
			if link:
				self.links.append(link)

	def cleanLink(self, link):
		"""Return link without its fragment and without any whitespace.

		Everything from the first "#" onwards is dropped, so a
		fragment-only link such as "#top" cleans to the empty string.
		"""
		# split("#", 1)[0] also handles a "#" at position 0, which the
		# previous "index > 0" test let through unchanged.
		target = link.split("#", 1)[0]
		return "".join(target.split())

	def getLinks(self):
		"""Return the list of cleaned links collected so far."""
		return self.links


def main ():
	"""Download every file with a given extension linked from a web page.

	The page URL and the extension are read from sys.argv[1] and
	sys.argv[2].  The extension is matched case-insensitively and may be
	given with or without its leading dot ("mp3", "MP3" and ".mp3" are
	equivalent).  Each matching link is fetched with "wget -c"; if wget
	cannot be started, the file is downloaded directly and written to
	the current directory under the link's base name.

	Writes USAGE to stderr and exits with status 1 when an argument is
	missing.
	"""
	# Local import so this function does not depend on the file's
	# top-level import block.
	import subprocess

	# Extract the URL to be processed
	try:
		url = sys.argv[1]
		ext = sys.argv[2]
	except IndexError:
		sys.stderr.write(USAGE)
		sys.exit(1)

	# Compare against the whole suffix rather than the last four
	# characters, so extensions of any length ("py", "jpeg") work.
	suffix = "." + ext.lstrip(".").upper()

	# Obtain the web data and parse it for links.  try/finally rather
	# than "with": the object returned by Python 2's urllib.urlopen is
	# not a context manager.
	page = urlopen(url)
	try:
		data = page.read()
	finally:
		page.close()
	extractor = LinkExtractor()
	extractor.feed(data)
	extractor.close()

	# Build the absolute URLs for the links with the wanted extension
	fileLinks = [
		urljoin(url, link)
		for link in extractor.getLinks()
		if link.upper().endswith(suffix)
	]

	# Download each file and write to disk
	for fileLink in fileLinks:
		filename = os.path.basename(fileLink)
		try:
			# The link text comes from an untrusted page, so pass it as
			# a single argv entry instead of interpolating it into a
			# shell command line.
			status = subprocess.call(["wget", "-c", fileLink])
		except OSError:
			# wget could not be started (e.g. it is not installed),
			# so fetch the file ourselves.
			sys.stdout.write("Downloading %s... " % filename)
			sys.stdout.flush()
			remote = urlopen(fileLink)
			try:
				fileData = remote.read()
			finally:
				remote.close()
			# Binary mode: the payload is arbitrary file data and must
			# not go through newline translation.
			with open(filename, "wb") as output:
				output.write(fileData)
			sys.stdout.write("done\n")
			sys.stdout.flush()
		else:
			if status != 0:
				sys.stderr.write(
					"wget failed for %s (exit status %d)\n"
					% (fileLink, status)
				)


# Run the downloader only when this file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == "__main__":
	main()