Generalize handling of &#<number>; entities. Generalize method to create a valid filename.

Remove print from fictionalley.py that was causing the output-before-302-header issue.
Some changes to chapter URL formation due to apparent differences in handling '//' in URLs in appengine vs download.py in 2.5.2.
This commit is contained in:
retiefjimm 2010-10-10 12:54:41 -05:00
parent 275a1dd6ad
commit dafa33c64e
5 changed files with 57 additions and 38 deletions

View file

@ -115,7 +115,7 @@ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
acceptable_attributes = ['href']
entities = { '&ndash;' : ' - ', '&mdash;' : ' - ', '&rdquo;' : '"', '&ldquo;' : '"', '&rsquo;' : '\'', '&#039;' : '\'',
entities = { '&ndash;' : ' - ', '&mdash;' : ' - ', '&rdquo;' : '"', '&ldquo;' : '"', '&rsquo;' : '\'',
'&lsquo;' : '\'', '&quot;' : '"', '&hellip;' : '...', '&amp;' : '&', '&pound;' : '£', '&nbsp;' : ' ' }
FB2_PROLOGUE = '<FictionBook>'

View file

@ -72,7 +72,7 @@ class FictionAlley(FanfictionSiteAdapter):
title = a.string
result.append((url,title))
print('Story "%s" by %s' % (self.storyName, self.authorName))
#print('Story "%s" by %s' % (self.storyName, self.authorName))
return result
@ -122,4 +122,4 @@ if __name__ == '__main__':
fw = FictionAlley(url)
urls = fw.extractIndividualUrls(data, host, url)
pp.pprint(urls)
print(fw.getText(data))
print(fw.getText(data))

View file

@ -58,7 +58,7 @@ class FicWad(FanfictionSiteAdapter):
else:
allOptions = select.findAll('option')
for o in allOptions:
url = o['value']
url = 'http://' + self.host + o['value']
title = o.string
# ficwad includes 'Story Index' in the dropdown of chapters,
# but it's not a real chapter.
@ -74,9 +74,6 @@ class FicWad(FanfictionSiteAdapter):
return self.authorName
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
data = u2.urlopen(url).read()
soup = bs.BeautifulStoneSoup(data)
@ -109,4 +106,4 @@ if __name__ == '__main__':
fw = FicWad(url)
urls = fw.extractIndividualUrls()
pp.pprint(urls)
print(fw.getText(data))
print(fw.getText(data))

View file

@ -65,7 +65,7 @@ class HPFiction(FanfictionSiteAdapter):
else:
for o in select.findAll('option'):
if 'value' in o._getAttrMap():
url = 'http://' + self.host + '/' + self.path + o['value']
url = 'http://' + self.host + self.path + o['value']
title = o.string
if title != "Story Index":
urls.append((url,title))
@ -110,4 +110,4 @@ class FF_UnitTests(unittest.TestCase):
if __name__ == '__main__':
unittest.main()

View file

@ -59,10 +59,10 @@ class HTMLWriter(FanficWriter):
def __init__(self, base, name, author, inmemory=False, compress=False):
self.basePath = base
self.name = re.sub('&\#[0-9]+;', '_', name.replace(" ", "_").replace(":","_"))
self.storyTitle = name
self.fileName = self.basePath + '/' + re.sub('[^a-zA-Z0-9_\'-]+','',self.name) + '.html'
self.authorName = author
self.storyTitle = removeEntities(name)
self.name = makeAcceptableFilename(name)
self.fileName = self.basePath + '/' + self.name + '.html'
self.authorName = removeEntities(author)
self.inmemory = inmemory
@ -133,12 +133,12 @@ class EPubFanficWriter(FanficWriter):
def __init__(self, base, name, author, inmemory=False, compress=True):
self.basePath = base
self.name = re.sub('&\#[0-9]+;', '_', name.replace(" ", "_").replace(":","_"))
self.storyTitle = self._removeEntities(name)
self.directory = self.basePath + '/' + re.sub('[^a-zA-Z0-9_\'-]+','',self.name)
self.storyTitle = removeEntities(name)
self.name = makeAcceptableFilename(name)
self.directory = self.basePath + '/' + self.name
self.authorName = removeEntities(author)
self.inmemory = inmemory
self.authorName = self._removeEntities(author)
self.files = {}
self.chapters = []
@ -164,21 +164,8 @@ class EPubFanficWriter(FanficWriter):
self._writeFile('META-INF/container.xml', CONTAINER)
self._writeFile('OEBPS/stylesheet.css', CSS)
def _removeEntities(self, text):
for e in entities:
v = entities[e]
try:
text = text.replace(e, v)
except UnicodeDecodeError, ex:
# for the pound symbol in constants.py
text = text.replace(e, v.decode('utf-8'))
# &lt; &gt; and &amp; are the only html entities allowed in xhtml.
text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')
return text
def writeChapter(self, index, title, text):
title = removeEntities(title)
logging.debug("Writing chapter: %s" % title)
fileName="chapter%04d.xhtml" % index
@ -188,7 +175,7 @@ class EPubFanficWriter(FanficWriter):
# f = open(filePath, 'w')
text = self._removeEntities(text)
text = removeEntities(text)
# BeautifulStoneSoup doesn't have any selfClosingTags by default.
# hr & br needs to be if they're going to work.
@ -216,12 +203,10 @@ class EPubFanficWriter(FanficWriter):
text = self.soup.__str__('utf8')
tt = self._removeEntities(title)
self._writeFile(fn, XHTML_START % (tt, tt))
self._writeFile(fn, XHTML_START % (title, title))
self._writeFile(fn, text)
self._writeFile(fn, XHTML_END)
# print >> f, XHTML_START % (tt, tt)
# print >> f, XHTML_START % (title, title)
# f.write(text)
# print >> f, XHTML_END
@ -248,7 +233,7 @@ class EPubFanficWriter(FanficWriter):
for t,f in self.chapters:
chapterId = "chapter%04d" % i
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, cgi.escape(t), f))
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
ids.append(chapterId)
@ -279,3 +264,40 @@ class EPubFanficWriter(FanficWriter):
self.output = zipdata
# zipdir.toZip(filename, self.directory)
def unirepl(match):
    """Return the character for a decimal numeric entity such as ``&#039;``.

    Intended as the replacement callback for ``re.sub``: *match* is a regex
    match whose full text has the form ``&#<digits>;``.

    Returns the single character with that code point.  If the number is not
    a code point this interpreter can represent, the entity text is returned
    unchanged instead of raising.
    """
    s = match.group()
    # Strip the leading '&#' and the trailing ';' to leave just the digits.
    value = int(s[2:-1])
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # no unichr builtin available; chr plays the same role
    try:
        return to_char(value)
    except (ValueError, OverflowError):
        # Out-of-range number: keep the original text rather than crash.
        return s
def replaceNumberEntities(data):
    """Expand every decimal ``&#<number>;`` entity in *data* using ``unirepl``."""
    return re.sub(r'&#(\d+);', unirepl, data)
def removeEntities(text):
    """Convert HTML entities in *text* to plain characters, keeping it XHTML-safe.

    Numeric entities are expanded with ``replaceNumberEntities`` and the
    named entities listed in ``entities`` (see constants.py) are replaced by
    their mapped text.  Afterwards every ``&`` is escaped again, so the only
    entities left in the result are ``&amp;``, ``&lt;`` and ``&gt;``.
    """
    # replace numeric versions of [&<>] with named versions, so the numeric
    # expansion below does not turn them into raw markup characters.
    text = re.sub(r'&#0*38;', '&amp;', text)
    text = re.sub(r'&#0*60;', '&lt;', text)
    text = re.sub(r'&#0*62;', '&gt;', text)

    # replace remaining &#000; entities with unicode value, such as &#039; -> '
    text = replaceNumberEntities(text)

    # replace several named entities with character, such as &mdash; -> -
    # see constants.py for the list.
    # Entities whose replacement contains '&' (i.e. &amp;) are applied last;
    # otherwise the result depends on dict order, e.g. '&amp;quot;' could be
    # unescaped twice and come out as '"'.
    for e in sorted(entities, key=lambda name: '&' in entities[name]):
        v = entities[e]
        try:
            text = text.replace(e, v)
        except UnicodeDecodeError:
            # for the pound symbol in constants.py
            text = text.replace(e, v.decode('utf-8'))

    # &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
    text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')
    return text
def makeAcceptableFilename(text):
    """Return *text* reduced to characters that are acceptable in a file name.

    Entities are resolved with ``removeEntities``, spaces and colons become
    underscores, and every character other than ASCII letters, digits,
    ``_``, ``'`` and ``-`` is dropped.
    """
    text = removeEntities(text)
    # removeEntities leaves &amp; &lt; &gt; escaped for XHTML; drop them whole,
    # otherwise their names leak into the result ('A & B' -> 'A_amp_B').
    text = re.sub(r'&(?:amp|lt|gt);', '', text)
    text = text.replace(" ", "_").replace(":", "_")
    return re.sub(r"[^a-zA-Z0-9_'-]+", '', text)