tags with HTML-ified text.'''
pres = self._soup.findAll('pre')
for pre in pres:
pre.replaceWith(self._FixPreContents(str(pre.contents[0])))
def _FixPreContents(self, text):
    '''Re-flow the text taken from a pre tag.

    The text is cut into chunks, every match of self.WHITESPACE_RE inside
    each chunk is replaced by a single space, and the chunks are glued
    back together.

    When self.unfill is true the chunks are separated by blank lines
    ('\\n\\n') and are joined with nothing between them; otherwise the
    text is processed line by line and the lines are joined with a
    newline.

    Args:
      text: the contents of the pre tag, as a string.

    Returns:
      The rebuilt string.
    '''
    if self.unfill:
        line_splitter = '\n\n'
        line_joiner = ''
    else:
        line_splitter = '\n'
        # A line break has to be spelled as an escape: a raw newline inside
        # a single-quoted literal is a syntax error.
        # NOTE(review): this method is meant to HTML-ify pre contents, so
        # the joiners (and the replacement text below) may originally have
        # been HTML markup -- confirm against the upstream file.
        line_joiner = '\n'
    # sub() returns exactly the string that subn() returns as element [0].
    replace_whitespace = self.WHITESPACE_RE.sub
    lines = [replace_whitespace(' ', line)
             for line in text.split(line_splitter)]
    return line_joiner.join(lines)
def _RemoveUnsupported(self):
    '''Strip out the tags the kindle cannot handle.

    Every script and style element found in self._soup is detached from
    the tree with extract(). Script elements are handled first; the
    style elements are only looked up after that.
    '''
    # TODO(chatham): tags to script?
    soup = self._soup
    # Lazy on purpose: the search for the second tag kind runs only after
    # all elements of the first kind have been extracted.
    doomed = (node
              for kind in ('script', 'style')
              for node in soup.findAll(kind))
    for node in doomed:
        node.extract()
def RenameAnchors(self, prefix):
    '''Prefix every internal anchor, then return the body contents.

    Links whose href starts with '#' get the prefix inserted directly
    after the '#'. Every a tag with a non-empty name attribute gets the
    prefix put in front of that name.

    Args:
      prefix: string to prepend to anchor names and fragment targets.

    Returns:
      The children of the body tag, each converted with unicode() and
      joined with newlines; the empty string when the soup has no body.
    '''
    internal_link = re.compile('^#')
    for link in self._soup.findAll('a', href=internal_link):
        fragment = link['href'][1:]
        link['href'] = '#' + prefix + fragment

    for tag in self._soup.findAll('a'):
        if not tag.get('name'):
            continue
        tag['name'] = prefix + tag['name']

    # TODO(chatham): figure out how to fix this. sometimes body comes out
    # as NoneType.
    body = self._soup.body
    if body is None:
        return ''
    return '\n'.join([unicode(child) for child in body.contents])
def CleanHtml(self):
    '''Run the cleanup passes in order and return the final result.

    Calls, in this order, _RemoveUnsupported, _StubInternalAnchors and
    _FixPreTags, then hands back whatever _ReplaceAnchorStubs returns.
    '''
    # TODO(chatham): fix_html_br, fix_html
    self._RemoveUnsupported()
    self._StubInternalAnchors()
    self._FixPreTags()
    cleaned = self._ReplaceAnchorStubs()
    return cleaned
if __name__ == '__main__':
    # Ad-hoc manual check: run the cleaner over a local HTML file.
    # Alternative inputs, kept for convenience:
    #FILE = '/tmp/documentation.html'
    #FILE = '/tmp/multipre.html'
    FILE = '/tmp/view.html'
    import codecs
    # Close the file even if the read fails; try/finally is used rather
    # than a with-statement so this also runs on old Python 2 releases.
    input_file = open(FILE)
    try:
        d = input_file.read()
    finally:
        input_file.close()
    h = HtmlProcessor(d)
    s = h.CleanHtml()
    #print s