import domhtml import pxdom import tidy import urllib2 tidy_options = dict(output_xhtml=1, add_xml_decl=1, tidy_mark=0) def get(uri): """ Parse complete document from a URI into an HTMLDocument """ stream = urllib2.urlopen(uri) tidy_doc = tidy.parseString(stream.read(), **tidy_options) document = domhtml.parseString(str(tidy_doc), uri) return document