| |
html2mime.py
#!/usr/bin/env python
"""
$Id: html2mime.py 1059 2003-05-04 05:12:23Z david $
This tool will convert HTML (including images) into a MIME
multipart/alternative file suitable for mailing.
A plain text version of the HTML is extracted and provided in the file.
This means that the file can be understood by MUAs that do not support
HTML. Quoted-printable encoding is used for the text.
For the moment, the HTML must exist as a file on the local filesystem.
Similarly, the images must also exist as files relative to the location
of the HTML. If they do not, they will not be included, meaning the
encoded result may not match the look of the original.
It's a little unclean and kludgy. I re-open the same file too many times
(more than once). I jigger the HTML to get the cid/Content-ID stuff
working (cStringIO would probably help here). I'm nasty and deserve to
be spanked.....
"""
import os, sys, quopri, base64, string, re
from MimeWriter import MimeWriter
from htmllib import HTMLParser
from formatter import NullFormatter, AbstractFormatter, DumbWriter
from cStringIO import StringIO
from tempfile import mktemp
# Written to stderr by main() when no HTML file argument is supplied.
USAGE = "Usage: %s <HTML file>\n" % os.path.basename(sys.argv[0])

# Written by main() straight after the top-level multipart header, ahead
# of the first body part.
MIMEBLURB = """\
This is a multi-part message in MIME format. You need a MIME-aware
e-mail client to view this e-mail as intended.
"""

# Maps an image file extension to the image type. This is used when
# determining the Content-type for an image. Keys are lower case and
# include the leading dot, which is the form os.path.splitext() returns.
IMAGETYPEMAP = {
    ".jpg" : "jpeg",
    ".jpeg" : "jpeg",
    ".jpe" : "jpeg",
    ".tif" : "tiff",
    ".tiff" : "tiff",
    ".gif" : "gif",
    ".png" : "png",
    ".bmp" : "bmp",
}
class NotAFileException(Exception):
    """
    Raised when a path that is expected to name an existing regular
    file does not. The offending path is passed as the exception value.
    """
class EncodedImageData:
    """
    The EncodedImageData class is used as a container for data that
    describes an image.

    Attributes:
        filename -- path of the image file, exactly as supplied
        raw      -- contents of the file, read in binary mode
        base64   -- the same contents, base64 encoded
    """

    def __init__(self, filename):
        """
        Validates the filename, then loads and encodes the image
        straight away. Raises NotAFileException if filename does not
        name an existing file.
        """
        self._setFilename(filename)
        self.raw = None
        self.base64 = None
        self.encodeImageData()

    def _setFilename(self, filename):
        """
        First checks that the filename represents a valid file.
        If it does, self.filename is updated. If it does not,
        raises NotAFileException.
        """
        if not os.path.isfile(filename):
            # Call form of raise; the "raise Class, value" statement is
            # not accepted by newer interpreters.
            raise NotAFileException(filename)
        self.filename = filename

    def encodeImageData(self, filename=None):
        """
        Loads the image data into self.raw, encodes it, then stores
        the result in self.base64. If a new filename is provided,
        self.filename is updated appropriately (raising
        NotAFileException if it is not a file) and the image data is
        loaded from there.
        """
        if filename:
            self._setFilename(filename)
        # Images are binary data: text mode would corrupt them on
        # platforms that translate line endings.
        imageFile = open(self.filename, "rb")
        try:
            self.raw = imageFile.read()
        finally:
            # Close explicitly rather than relying on garbage collection.
            imageFile.close()
        # Newer Pythons call this function encodebytes; older ones only
        # have encodestring. Both produce the same encoded output.
        encode = getattr(base64, "encodebytes", None) or base64.encodestring
        self.base64 = encode(self.raw)
class ImageAndTextExtractor(HTMLParser):
    """
    Extracts images from HTML <img> tags and from "background"
    attributes. For each image an EncodedImageData object is created.
    Each EncodedImageData object contains the filename, raw data, and
    base64 encoded data. These objects are stored in the imagePool
    dictionary, keyed by the image reference as written in the HTML.
    Also, extracts text and writes it out to a given file using the
    AbstractFormatter and the DumbWriter.
    """

    def __init__(self, textOut):
        """
        textOut is the file-like object that receives the plain text
        rendering of the HTML.
        """
        self.imagePool = {}
        formatter = AbstractFormatter(DumbWriter(textOut))
        HTMLParser.__init__(self, formatter)

    # The handlers below only look for a "background" attribute on the
    # tag they are given.

    def unknown_starttag(self, tag, attrs):
        self.extractImage(tag, attrs)

    def start_body(self, attrs):
        self.extractImage("body", attrs)

    def start_head(self, attrs):
        self.extractImage("head", attrs)

    def start_html(self, attrs):
        self.extractImage("html", attrs)

    def handle_image(self, src, alt, ismap=0, align=0, width=0, height=0):
        """
        Handles an <img> tag. Only src is used; the other arguments
        are accepted and ignored.
        """
        self.storeImage(src)

    def extractImage(self, tag, attrs):
        """
        Stores the image named by any "background" attribute found in
        attrs, a sequence of (name, value) pairs. The tag argument is
        not used.
        """
        for key, val in attrs:
            # str methods instead of the deprecated string-module functions
            if key.strip().lower() == "background":
                self.storeImage(val)

    def storeImage(self, image):
        """
        Loads an EncodedImageData object into the imagePool dictionary
        under the key image. Catches NotAFileException, which occurs
        when the image cannot be loaded, and reports it on stderr.
        """
        if image in self.imagePool:
            # Already loaded: do not read and encode the same file again.
            return
        try:
            self.imagePool[image] = EncodedImageData(image)
        except NotAFileException:
            sys.stderr.write("Unable to load image: %s\n" % image)
def mapImageToType(imageName):
    """
    Given the name of an image, extracts its file extension. Maps
    the extension to an image type (eg, jpeg) using IMAGETYPEMAP.
    Returns the type, or "unknown" if the extension is not in the map.
    The lookup ignores the case of the extension.
    """
    # str.lower() replaces the deprecated string.lower() function.
    extension = os.path.splitext(imageName)[-1].lower()
    return IMAGETYPEMAP.get(extension, "unknown")
def addText(writer, text):
    """
    Appends a new part holding the plain text to the MIME content.
    The part carries a "Content-Transfer-Encoding: quoted-printable"
    header and a text/plain content type with charset us-ascii; its
    body is the text in quoted-printable form.
    """
    textPart = writer.nextpart()
    textPart.addheader("Content-Transfer-Encoding", "quoted-printable")
    body = textPart.startbody("text/plain", [("charset", "us-ascii")])
    # Second argument is quopri's quotetabs flag, left switched off.
    body.write(quopri.encodestring(text, 0))
def addHtml(writer, html):
    """
    Appends a new part holding the HTML to the MIME content. The part
    carries a "Content-Transfer-Encoding: quoted-printable" header and
    a text/html content type with charset us-ascii; its body is the
    HTML in quoted-printable form.
    """
    htmlPart = writer.nextpart()
    htmlPart.addheader("Content-Transfer-Encoding", "quoted-printable")
    contentParams = [("charset", "us-ascii")]
    sink = htmlPart.startbody("text/html", contentParams)
    # Second argument is quopri's quotetabs flag, left switched off.
    quoted = quopri.encodestring(html, 0)
    sink.write(quoted)
def addImages(writer, imagePool):
    """
    For each image we need to create a new part of the MIME and
    add the encoded image data.

    writer is the multipart writer that receives the parts; imagePool
    is a dictionary whose values provide "filename" and "base64"
    attributes.

    NOTE: The '<' and '>' around the value of the Content-ID is
    absolutely necessary. The 'cid:' used in the HTML must not
    be present in the Content-ID.
    """
    for image in imagePool.values():
        part = writer.nextpart()
        part.addheader("Content-Transfer-Encoding", "base64")
        part.addheader("Content-ID", "<%s>" % image.filename)
        # A MIME quoted-string is delimited by double quotes; single
        # quotes would be taken as part of the filename itself. Backslash
        # and double quote inside the value are escaped accordingly.
        quotedName = image.filename.replace("\\", "\\\\").replace('"', '\\"')
        part.addheader("Content-Disposition",
                       'inline; filename="%s"' % quotedName)
        contentType = "image/%s" % mapImageToType(image.filename)
        imageOut = part.startbody(contentType,
                                  [("name", image.filename)])
        imageOut.write(image.base64)
def _rewriteImageReferences(html, imageNames):
    """
    Returns html with every occurrence of a name in imageNames
    prefixed with "cid:", so that it refers to the matching Content-ID.
    """
    names = [name for name in imageNames if name]
    if not names:
        return html
    # Longest names first, so that at any position the longest matching
    # name wins over a shorter name that is a prefix of it.
    decorated = [(-len(name), name) for name in names]
    decorated.sort()
    # The names are literal text, not patterns: escape them. A single
    # pass also guarantees that inserted "cid:" text is never rewritten.
    pattern = re.compile("|".join([re.escape(name) for _, name in decorated]))
    return pattern.sub(lambda match: "cid:" + match.group(0), html)


def main():
    """
    Converts the HTML file named by the first command line argument
    into a MIME message written to "mime.out" in the current directory.

    When a plain text version can be extracted, the message is
    multipart/alternative holding the text and a multipart/related
    part (HTML plus images). Otherwise the multipart/related part is
    the top level of the message. Exits with status 1 if the argument
    is missing or is not a file.
    """
    try:
        htmlFile = sys.argv[1]
    except IndexError:
        sys.stderr.write(USAGE)
        sys.exit(1)
    if not os.path.isfile(htmlFile):
        sys.stderr.write("ERROR: %s: not a file\n" % htmlFile)
        sys.exit(1)

    # Read the HTML once and reuse it for both extraction and output.
    source = open(htmlFile)
    try:
        html = source.read()
    finally:
        source.close()

    # If possible, extract the plain text version of the HTML as
    # well as the images. The images will be in extractor.imagePool.
    # The text is collected in memory, which avoids the race-prone
    # mktemp() temporary file.
    textBuffer = StringIO()
    extractor = ImageAndTextExtractor(textBuffer)
    extractor.feed(html)
    extractor.close()
    text = textBuffer.getvalue()

    # Correct the image references
    imageNames = [image.filename for image in extractor.imagePool.values()]
    html = _rewriteImageReferences(html, imageNames)

    # Initialise the output MIME file.
    mimeFile = open("mime.out", "w")
    try:
        topWriter = MimeWriter(mimeFile)
        topWriter.addheader("MIME-Version", "1.0")
        if not text:
            # No text version .. so the HTML is included at the top level
            sys.stderr.write("WARNING: unable to produce text version\n")
            htmlWriter = topWriter
        else:
            # A text version .. so the HTML will have to be in a sub-part
            topWriter.startmultipartbody("alternative")
            # Add the blurb about MIME e-mail clients
            mimeFile.write(MIMEBLURB)
            addText(topWriter, text)
            htmlWriter = topWriter.nextpart()

        # Create the HTML part of the MIME.
        htmlWriter.startmultipartbody("related")
        addHtml(htmlWriter, html)
        addImages(htmlWriter, extractor.imagePool)

        # Every multipart body needs its own closing boundary: finish
        # the nested "related" part before the enclosing one.
        if htmlWriter is not topWriter:
            htmlWriter.lastpart()
        topWriter.lastpart()
    finally:
        # Close the output even if building the message failed.
        mimeFile.close()


if __name__ == "__main__":
    main()
|