# html2mime.py

#!/usr/bin/env python

"""
$Id: html2mime.py 1059 2003-05-04 05:12:23Z david $

This tool will convert HTML (including images) into a MIME
multipart/alternative file suitable for mailing.

A plain text version of the HTML is extracted and provided in the file.
This means that the file can be understood by MUAs that do not support
HTML.  Quoted-printable encoding is used for the text.

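For the moment, the HTML must exist as a file on the local filesystem.
Similarly, the images must also exist as local files.  Image paths are
used exactly as they are written in the HTML, so relative paths are
resolved against the current working directory: run the tool from the
directory that contains the HTML.  Images that cannot be found will not
be included, meaning the encoded result may not match the look of the
original.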

It's a little unclean and kludgy.  I re-open the same file too many times
(more than once).  I jigger the HTML to get the cid/Content-ID stuff
working (cStringIO would probably help here).  I'm nasty and deserve to
be spanked.....
"""

import os, sys, quopri, base64, string, re
from MimeWriter import MimeWriter
from htmllib import HTMLParser
from formatter import NullFormatter, AbstractFormatter, DumbWriter
from cStringIO import StringIO
from tempfile import mktemp


# Usage message written to stderr when no HTML file is named on the
# command line.  It shows the name the script was invoked as.
USAGE = "Usage: %s <HTML file>\n" % (os.path.basename(sys.argv[0]),)

# Blurb written into the output right after the top-level multipart
# body has been started (see main), aimed at readers whose e-mail client
# does not understand MIME.
MIMEBLURB = (
	"This is a multi-part message in MIME format.  You need a MIME-aware\n"
	"e-mail client to view this e-mail as intended.\n"
)

# Maps a lower-case image file extension (including the leading dot) to
# the image type.  This is used when determining the Content-type for an
# image: the type becomes the subtype in "image/<type>".  Extensions that
# are not listed here are reported as "unknown" by mapImageToType().
IMAGETYPEMAP = {
	".jpg"  : "jpeg",
	".jpeg" : "jpeg",
	".jpe"  : "jpeg",
	".tif"  : "tiff",
	".tiff" : "tiff",
	".gif"  : "gif",
	".png"  : "png",
	".bmp"  : "bmp",
}


class NotAFileException(Exception):
	"""
	Raised when a path that was expected to name a regular file does
	not.  The offending path is passed as the exception argument.
	"""

class EncodedImageData:
	"""
	The EncodedImageData class is used as a container for data that
	describes an image.  Specifically, it contains:

	  filename -- the path of the image file, as given by the caller
	  raw      -- the unencoded contents of the file
	  base64   -- the contents of the file, base64 encoded

	The file is read as soon as the object is created, so the
	constructor raises NotAFileException when filename does not name
	an existing regular file.
	"""
	def __init__(self, filename):
		self._setFilename(filename)
		self.raw = None
		self.base64 = None
		self.encodeImageData()

	def _setFilename(self, filename):
		"""
		First checks that the filename represents a valid file.
		If it does, self.filename is updated.  If it does not,
		raises NotAFileException and leaves self.filename untouched.
		"""
		if not os.path.isfile(filename):
			raise NotAFileException(filename)
		self.filename = filename

	def encodeImageData(self, filename=None):
		"""
		Loads the image data into self.raw, then stores the base64
		encoded form in self.base64.  If a new filename is provided,
		self.filename is updated first (NotAFileException is raised
		if it is not a file) and the image data is loaded from there.
		"""
		if filename:
			self._setFilename(filename)
		# Images are binary: read in "rb" mode so that no newline
		# translation can corrupt the data, and always close the file.
		imageFile = open(self.filename, "rb")
		try:
			self.raw = imageFile.read()
		finally:
			imageFile.close()
		# base64.encodestring() is the historical name of
		# base64.encodebytes(); use the newer name when the running
		# Python provides it and fall back to the old one otherwise.
		encode = getattr(base64, "encodebytes", None) or base64.encodestring
		self.base64 = encode(self.raw)


class ImageAndTextExtractor(HTMLParser):
	"""
	HTML parser that collects the images a document refers to and, at
	the same time, writes out the text of the document.

	Images are taken from <img> tags and from the "background"
	attribute of <html>, <head>, <body> and of any tag that reaches
	unknown_starttag.  For each image an EncodedImageData object is
	created (filename, raw data and base64 encoded data).  These
	objects are kept in the imagePool dictionary, keyed by the image
	name exactly as it is written in the HTML.

	The text is written to the file object given to the constructor,
	using the AbstractFormatter and the DumbWriter.
	"""
	def __init__(self, textOut):
		self.imagePool = {}
		HTMLParser.__init__(self, AbstractFormatter(DumbWriter(textOut)))

	def unknown_starttag(self, tag, attrs):
		self.extractImage(tag, attrs)

	def start_body(self, attrs):
		self.extractImage("body", attrs)

	def start_head(self, attrs):
		self.extractImage("head", attrs)

	def start_html(self, attrs):
		self.extractImage("html", attrs)

	def handle_image(self, src, alt, ismap=0, align=0, width=0, height=0):
		"""
		Handles an <img> tag.  Only src is used; the remaining
		arguments are accepted and ignored.
		"""
		self.storeImage(src)

	def extractImage(self, tag, attrs):
		"""
		Stores the value of every "background" attribute found in
		attrs, a sequence of (name, value) pairs.  The attribute name
		is compared ignoring case and surrounding whitespace.  The
		tag name itself is not used.
		"""
		for name, value in attrs:
			if name.strip().lower() != "background":
				continue
			self.storeImage(value)

	def storeImage(self, image):
		"""
		Loads the named image as an EncodedImageData object and puts
		it in the imagePool dictionary under that name.  If the image
		cannot be loaded (NotAFileException), a message is written to
		stderr and the pool is left unchanged.
		"""
		try:
			encoded = EncodedImageData(image)
		except NotAFileException:
			sys.stderr.write("Unable to load image: %s\n" % image)
		else:
			self.imagePool[image] = encoded


def mapImageToType(imageName):
	"""
	Returns the image type (eg, jpeg) for the named image.  The type
	is looked up in IMAGETYPEMAP using the lower-cased file extension
	of imageName; "unknown" is returned for an extension that is not
	in the map.
	"""
	root, extension = os.path.splitext(imageName)
	return IMAGETYPEMAP.get(extension.lower(), "unknown")


def addText(writer, text):
	"""
	Adds the plain text version to the MIME content.

	A new part is started on writer with a
	"Content-Transfer-Encoding: quoted-printable" header and a
	"text/plain" body whose charset parameter is us-ascii.  The text
	is written into that body quoted-printable encoded.
	"""
	textPart = writer.nextpart()
	textPart.addheader("Content-Transfer-Encoding", "quoted-printable")
	body = textPart.startbody("text/plain", [("charset", "us-ascii")])
	body.write(quopri.encodestring(text, 0))

def addHtml(writer, html):
	"""
	Adds the HTML version to the MIME content.

	A new part is started on writer with a
	"Content-Transfer-Encoding: quoted-printable" header and a
	"text/html" body whose charset parameter is us-ascii.  The HTML
	is written into that body quoted-printable encoded.
	"""
	htmlPart = writer.nextpart()
	htmlPart.addheader("Content-Transfer-Encoding", "quoted-printable")
	body = htmlPart.startbody("text/html", [("charset", "us-ascii")])
	body.write(quopri.encodestring(html, 0))
	

def addImages(writer, imagePool):
	"""
	For each image we need to create a new part of the MIME and
	add the encoded image data.  imagePool is a dictionary whose
	values are EncodedImageData objects; one part is added to writer
	per value.

	NOTE: The '<' and '>' around the value of the Content-ID is
	absolutely necessary.  The 'cid:' used in the HTML must not
	be present in the Content-ID.  The Content-ID is the image's
	filename exactly as stored, since that is the name the HTML
	refers to.
	"""
	for image in imagePool.values():
		# Only the last path component is offered as the suggested
		# filename: the directory layout of the sending machine is
		# of no use to the recipient.
		displayName = os.path.basename(image.filename)
		part = writer.nextpart()
		part.addheader("Content-Transfer-Encoding", "base64")
		part.addheader("Content-ID", "<%s>" % image.filename)
		# A header parameter value is quoted with double quotes;
		# single quotes would be taken as part of the filename.
		part.addheader("Content-Disposition",
			'inline; filename="%s"' % displayName)
		contentType = "image/%s" % mapImageToType(image.filename)
		imageOut = part.startbody(contentType,
			[("name", displayName)])
		imageOut.write(image.base64)


def _readFile(path):
	"""
	Returns the entire contents of the named file, closing the file
	afterwards.
	"""
	handle = open(path)
	try:
		return handle.read()
	finally:
		handle.close()


def _useContentIds(html, imageNames):
	"""
	Returns html with every occurrence of a name in imageNames
	prefixed by "cid:", so that the reference points at the MIME part
	whose Content-ID is that name.
	"""
	if not imageNames:
		# An empty pattern would match (and prefix) every position.
		return html
	# The names are matched literally, longest first, in one pass.
	# Literal matching means a name containing regular expression
	# characters (".", "+", "(" ...) cannot mis-match; the single pass
	# means a name that is part of a longer name (a.gif / data.gif)
	# cannot be rewritten twice.
	decorated = [(len(name), name) for name in imageNames]
	decorated.sort()
	decorated.reverse()
	pattern = "|".join([re.escape(name) for length, name in decorated])
	return re.sub(pattern, lambda match: "cid:" + match.group(0), html)


def main():
	"""
	Command line entry point.  Converts the HTML file named by
	sys.argv[1] into a MIME message written to the file "mime.out" in
	the current directory.

	If a plain text version of the HTML can be extracted, the message
	is multipart/alternative: the text first, then a
	multipart/related part holding the HTML and its images.  If no
	text can be extracted, a warning is written to stderr and the
	multipart/related part is the message itself.

	Writes a message to stderr and exits with status 1 when no
	argument is given or the argument is not a file.
	"""
	try:
		htmlFile = sys.argv[1]
	except IndexError:
		sys.stderr.write(USAGE)
		sys.exit(1)

	if not os.path.isfile(htmlFile):
		sys.stderr.write("ERROR: %s: not a file\n" % htmlFile)
		sys.exit(1)

	# The HTML is read once and reused for extraction and output.
	html = _readFile(htmlFile)

	# If possible, extract the plain text version of the HTML as
	# well as the images.  The images will be in extractor.imagePool.
	# The text is collected in memory, so there is no temporary file
	# to create safely or to clean up.
	textOut = StringIO()
	extractor = ImageAndTextExtractor(textOut)
	extractor.feed(html)
	extractor.close()
	text = textOut.getvalue()

	# Initialise the output MIME file.
	mimeFile = open("mime.out", "w")
	try:
		topWriter = MimeWriter(mimeFile)
		topWriter.addheader("MIME-Version", "1.0")

		if not text:
			# No text version .. so the HTML is included at the
			# top level
			sys.stderr.write("WARNING: unable to produce text version\n")
			htmlWriter = topWriter
		else:
			# A text version .. so the HTML will have to be in a
			# sub-part
			topWriter.startmultipartbody("alternative")
			# Add the blurb about MIME e-mail clients
			mimeFile.write(MIMEBLURB)
			addText(topWriter, text)
			htmlWriter = topWriter.nextpart()

		# Correct the image references
		imageNames = [image.filename
			for image in extractor.imagePool.values()]
		html = _useContentIds(html, imageNames)

		# Create the HTML part of the MIME.
		htmlWriter.startmultipartbody("related")
		addHtml(htmlWriter, html)
		addImages(htmlWriter, extractor.imagePool)

		# Every multipart body needs its own closing boundary: close
		# the nested "related" part (when there is one) before the
		# top-level message.
		if htmlWriter is not topWriter:
			htmlWriter.lastpart()
		topWriter.lastpart()
	finally:
		mimeFile.close()


# Run the conversion only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
	main()