diff --git a/constants.py b/constants.py index 5da65e78..2bcc1a67 100644 --- a/constants.py +++ b/constants.py @@ -115,7 +115,7 @@ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big', acceptable_attributes = ['href'] -entities = { '–' : ' - ', '—' : ' - ', '”' : '"', '“' : '"', '’' : '\'', ''' : '\'', +entities = { '–' : ' - ', '—' : ' - ', '”' : '"', '“' : '"', '’' : '\'', '‘' : '\'', '"' : '"', '…' : '...', '&' : '&', '£' : '£', ' ' : ' ' } FB2_PROLOGUE = '' diff --git a/fictionalley.py b/fictionalley.py index 2ab11167..35ae71e2 100644 --- a/fictionalley.py +++ b/fictionalley.py @@ -72,7 +72,7 @@ class FictionAlley(FanfictionSiteAdapter): title = a.string result.append((url,title)) - print('Story "%s" by %s' % (self.storyName, self.authorName)) + #print('Story "%s" by %s' % (self.storyName, self.authorName)) return result @@ -122,4 +122,4 @@ if __name__ == '__main__': fw = FictionAlley(url) urls = fw.extractIndividualUrls(data, host, url) pp.pprint(urls) - print(fw.getText(data)) \ No newline at end of file + print(fw.getText(data)) diff --git a/ficwad.py b/ficwad.py index dab4fe0d..28b71584 100644 --- a/ficwad.py +++ b/ficwad.py @@ -58,7 +58,7 @@ class FicWad(FanfictionSiteAdapter): else: allOptions = select.findAll('option') for o in allOptions: - url = o['value'] + url = 'http://' + self.host + o['value'] title = o.string # ficwad includes 'Story Index' in the dropdown of chapters, # but it's not a real chapter. 
@@ -74,9 +74,6 @@ class FicWad(FanfictionSiteAdapter): return self.authorName def getText(self, url): - if url.find('http://') == -1: - url = 'http://' + self.host + '/' + url - data = u2.urlopen(url).read() soup = bs.BeautifulStoneSoup(data) @@ -109,4 +106,4 @@ if __name__ == '__main__': fw = FicWad(url) urls = fw.extractIndividualUrls() pp.pprint(urls) - print(fw.getText(data)) \ No newline at end of file + print(fw.getText(data)) diff --git a/hpfiction.py b/hpfiction.py index 1108cf22..75cb4597 100644 --- a/hpfiction.py +++ b/hpfiction.py @@ -65,7 +65,7 @@ class HPFiction(FanfictionSiteAdapter): else: for o in select.findAll('option'): if 'value' in o._getAttrMap(): - url = 'http://' + self.host + '/' + self.path + o['value'] + url = 'http://' + self.host + self.path + o['value'] title = o.string if title != "Story Index": urls.append((url,title)) @@ -110,4 +110,4 @@ class FF_UnitTests(unittest.TestCase): if __name__ == '__main__': unittest.main() - \ No newline at end of file + diff --git a/output.py b/output.py index 70a4cf01..4a5387c6 100644 --- a/output.py +++ b/output.py @@ -59,10 +59,10 @@ class HTMLWriter(FanficWriter): def __init__(self, base, name, author, inmemory=False, compress=False): self.basePath = base - self.name = re.sub('&\#[0-9]+;', '_', name.replace(" ", "_").replace(":","_")) - self.storyTitle = name - self.fileName = self.basePath + '/' + re.sub('[^a-zA-Z0-9_\'-]+','',self.name) + '.html' - self.authorName = author + self.storyTitle = removeEntities(name) + self.name = makeAcceptableFilename(name) + self.fileName = self.basePath + '/' + self.name + '.html' + self.authorName = removeEntities(author) self.inmemory = inmemory @@ -133,12 +133,12 @@ class EPubFanficWriter(FanficWriter): def __init__(self, base, name, author, inmemory=False, compress=True): self.basePath = base - self.name = re.sub('&\#[0-9]+;', '_', name.replace(" ", "_").replace(":","_")) - self.storyTitle = self._removeEntities(name) - self.directory = self.basePath + '/' + 
re.sub('[^a-zA-Z0-9_\'-]+','',self.name) + self.storyTitle = removeEntities(name) + self.name = makeAcceptableFilename(name) + self.directory = self.basePath + '/' + self.name + self.authorName = removeEntities(author) self.inmemory = inmemory - self.authorName = self._removeEntities(author) self.files = {} self.chapters = [] @@ -164,21 +164,8 @@ class EPubFanficWriter(FanficWriter): self._writeFile('META-INF/container.xml', CONTAINER) self._writeFile('OEBPS/stylesheet.css', CSS) - def _removeEntities(self, text): - for e in entities: - v = entities[e] - try: - text = text.replace(e, v) - except UnicodeDecodeError, ex: - # for the pound symbol in constants.py - text = text.replace(e, v.decode('utf-8')) - - # &lt; &lt; and &amp; are the only html entities allowed in xhtml. - text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;') - - return text - def writeChapter(self, index, title, text): + title = removeEntities(title) logging.debug("Writing chapter: %s" % title) fileName="chapter%04d.xhtml" % index @@ -188,7 +175,7 @@ class EPubFanficWriter(FanficWriter): # f = open(filePath, 'w') - text = self._removeEntities(text) + text = removeEntities(text) # BeautifulStoneSoup doesn't have any selfClosingTags by default. # hr & br needs to be if they're going to work. 
@@ -216,12 +203,10 @@ class EPubFanficWriter(FanficWriter): text = self.soup.__str__('utf8') - tt = self._removeEntities(title) - - self._writeFile(fn, XHTML_START % (tt, tt)) + self._writeFile(fn, XHTML_START % (title, title)) self._writeFile(fn, text) self._writeFile(fn, XHTML_END) -# print >> f, XHTML_START % (tt, tt) +# print >> f, XHTML_START % (title, title) # f.write(text) # print >> f, XHTML_END @@ -248,7 +233,7 @@ class EPubFanficWriter(FanficWriter): for t,f in self.chapters: chapterId = "chapter%04d" % i - self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, cgi.escape(t), f)) + self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f)) self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f)) ids.append(chapterId) @@ -279,3 +264,40 @@ class EPubFanficWriter(FanficWriter): self.output = zipdata # zipdir.toZip(filename, self.directory) + +def unirepl(match): + "Return the unicode string for a decimal number" + s = match.group() + value = int(s[2:-1]) + return unichr(value) + +def replaceNumberEntities(data): + p = re.compile(r'&#(\d+);') + return p.sub(unirepl, data) + +def removeEntities(text): + # replace numeric versions of [&<>] with named versions. + text = re.sub(r'&#0*38;','&amp;',text) + text = re.sub(r'&#0*60;','&lt;',text) + text = re.sub(r'&#0*62;','&gt;',text) + + # replace remaining &#000; entities with unicode value, such as &#039; -> ' + text = replaceNumberEntities(text) + + # replace several named entities with character, such as &mdash; -> - + # see constants.py for the list. + for e in entities: + v = entities[e] + try: + text = text.replace(e, v) + except UnicodeDecodeError, ex: + # for the pound symbol in constants.py + text = text.replace(e, v.decode('utf-8')) + + # &lt; &lt; and &amp; are the only html entities allowed in xhtml, put those back. + text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;') + + return text + +def makeAcceptableFilename(text): + return re.sub('[^a-zA-Z0-9_\'-]+','',removeEntities(text).replace(" ", "_").replace(":","_"))