|
knet's haven » Software » Snippets » getWebFiles.py |
|||||||||||||||||
|
getWebFiles.py — I wrote this to download all the files of a given file type (determined by file extension) that are linked from a web page. Here is the code:
#!/usr/bin/env python
"""
$Id: getWebFiles.py 1297 2003-07-02 12:59:34Z david $
Simple tool to download all the files with a given extension from
a web page.
"""
import sys, os
from sgmllib import SGMLParser
from urllib import urlopen
from urlparse import urljoin
# Usage line written to stderr when the URL or extension argument is
# missing; %s is filled in with this script's own file name.
USAGE = "%s <URL> <extension>\n" % os.path.basename(sys.argv[0])
class LinkExtractor(SGMLParser):
    """Collect the href targets of the <a> tags in an HTML document.

    Feed the document text with feed(), call close(), then read the
    collected links with getLinks().  Links are stored in document
    order, with fragments and whitespace removed; links that are empty
    after cleaning (for example the in-page anchor "#top") are skipped.
    """

    def __init__(self):
        self.links = []
        SGMLParser.__init__(self)

    def do_a(self, attributes):
        """Record the cleaned href of one <a> start tag.

        attributes is the list of (name, value) pairs for the tag.
        Hrefs that clean down to an empty string are not recorded.
        """
        for name, value in attributes:
            if name != "href":
                continue
            cleaned = self.cleanLink(value)
            if cleaned:
                self.links.append(cleaned)

    def cleanLink(self, link):
        """Return link without its "#fragment" and without whitespace.

        The fragment is cut at the first "#" wherever it occurs, so a
        fragment-only link such as "#top" yields "" and can be dropped
        by the caller.  Whitespace is removed everywhere, including
        inside the link.
        """
        without_fragment = link.strip().split("#", 1)[0]
        return "".join(without_fragment.split())

    def getLinks(self):
        """Return the list of links collected so far."""
        return self.links
def _wanted_links(links, base_url, ext):
    """Return absolute URLs for the links that end in "." + ext.

    The comparison is case-insensitive and works for extensions of any
    length; ext may be given with or without a leading dot.  Relative
    links are resolved against base_url.
    """
    suffix = "." + ext.lstrip(".").upper()
    return [urljoin(base_url, link) for link in links
            if link.upper().endswith(suffix)]


def _fetch(file_url):
    """Download file_url into the current working directory.

    wget is tried first because "-c" lets it resume partial downloads.
    The URL comes from an arbitrary web page, so it is passed as a
    separate argument (never through a shell), with "--" in front so it
    cannot be read as a wget option.  If wget cannot be started, the
    file is fetched with urlopen and written in binary mode instead.
    """
    import subprocess

    try:
        subprocess.call(["wget", "-c", "--", file_url])
    except OSError:
        # wget is not installed or not executable: do it ourselves.
        filename = os.path.basename(file_url)
        sys.stdout.write("Downloading %s... " % filename)
        sys.stdout.flush()
        response = urlopen(file_url)
        try:
            payload = response.read()
        finally:
            response.close()
        # "wb": the payload is raw bytes and must not be newline-translated.
        out = open(filename, "wb")
        try:
            out.write(payload)
        finally:
            out.close()
        sys.stdout.write("done\n")
        sys.stdout.flush()


def main():
    """Download every file with a given extension linked from a page.

    Reads the page URL from sys.argv[1] and the file extension from
    sys.argv[2].  If either is missing, the usage line is written to
    stderr and the process exits with status 1.  Otherwise the page is
    fetched, its links are extracted, and each link with the requested
    extension is downloaded into the current directory.
    """
    # Extract the URL to be processed
    try:
        url = sys.argv[1]
        ext = sys.argv[2]
    except IndexError:
        sys.stderr.write(USAGE)
        sys.exit(1)

    # Obtain the web data and parse it for links
    page = urlopen(url)
    try:
        data = page.read()
    finally:
        page.close()
    extractor = LinkExtractor()
    extractor.feed(data)
    extractor.close()

    # Download each matching file and write it to disk
    for file_url in _wanted_links(extractor.getLinks(), url, ext):
        _fetch(file_url)
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|