mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-15 21:32:28 +01:00
Generalize handling of &#<number>; entities. Generalize method to create a valid filename.
Remove print from fictionalley.py that was causing the output-before-302-header issue. Some changes to chapter URL formation due to apparent differences in handling '//' in URLs in appengine vs download.py in 2.5.2.
This commit is contained in:
parent
275a1dd6ad
commit
dafa33c64e
5 changed files with 57 additions and 38 deletions
|
|
@ -115,7 +115,7 @@ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
|
|||
|
||||
acceptable_attributes = ['href']
|
||||
|
||||
entities = { '–' : ' - ', '—' : ' - ', '”' : '"', '“' : '"', '’' : '\'', ''' : '\'',
|
||||
# Named HTML entities mapped to the plain-text replacement used when cleaning
# story text.  Keys are the literal entity spellings as they appear in the
# fetched markup.  Numeric references such as &#39; are deliberately absent:
# they are handled generically by the numeric-entity replacement code.
entities = {
    '&ndash;': ' - ',
    '&mdash;': ' - ',
    '&rdquo;': '"',
    '&ldquo;': '"',
    '&rsquo;': '\'',
    '&lsquo;': '\'',
    '&quot;': '"',
    '&hellip;': '...',
    '&amp;': '&',
    '&pound;': '£',
    '&nbsp;': ' ',
}
|
||||
|
||||
FB2_PROLOGUE = '<FictionBook>'
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ class FictionAlley(FanfictionSiteAdapter):
|
|||
title = a.string
|
||||
result.append((url,title))
|
||||
|
||||
print('Story "%s" by %s' % (self.storyName, self.authorName))
|
||||
#print('Story "%s" by %s' % (self.storyName, self.authorName))
|
||||
|
||||
return result
|
||||
|
||||
|
|
@ -122,4 +122,4 @@ if __name__ == '__main__':
|
|||
fw = FictionAlley(url)
|
||||
urls = fw.extractIndividualUrls(data, host, url)
|
||||
pp.pprint(urls)
|
||||
print(fw.getText(data))
|
||||
print(fw.getText(data))
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ class FicWad(FanfictionSiteAdapter):
|
|||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = o['value']
|
||||
url = 'http://' + self.host + o['value']
|
||||
title = o.string
|
||||
# ficwad includes 'Story Index' in the dropdown of chapters,
|
||||
# but it's not a real chapter.
|
||||
|
|
@ -74,9 +74,6 @@ class FicWad(FanfictionSiteAdapter):
|
|||
return self.authorName
|
||||
|
||||
def getText(self, url):
|
||||
if url.find('http://') == -1:
|
||||
url = 'http://' + self.host + '/' + url
|
||||
|
||||
data = u2.urlopen(url).read()
|
||||
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
|
@ -109,4 +106,4 @@ if __name__ == '__main__':
|
|||
fw = FicWad(url)
|
||||
urls = fw.extractIndividualUrls()
|
||||
pp.pprint(urls)
|
||||
print(fw.getText(data))
|
||||
print(fw.getText(data))
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ class HPFiction(FanfictionSiteAdapter):
|
|||
else:
|
||||
for o in select.findAll('option'):
|
||||
if 'value' in o._getAttrMap():
|
||||
url = 'http://' + self.host + '/' + self.path + o['value']
|
||||
url = 'http://' + self.host + self.path + o['value']
|
||||
title = o.string
|
||||
if title != "Story Index":
|
||||
urls.append((url,title))
|
||||
|
|
@ -110,4 +110,4 @@ class FF_UnitTests(unittest.TestCase):
|
|||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
78
output.py
78
output.py
|
|
@ -59,10 +59,10 @@ class HTMLWriter(FanficWriter):
|
|||
|
||||
def __init__(self, base, name, author, inmemory=False, compress=False):
|
||||
self.basePath = base
|
||||
self.name = re.sub('&\#[0-9]+;', '_', name.replace(" ", "_").replace(":","_"))
|
||||
self.storyTitle = name
|
||||
self.fileName = self.basePath + '/' + re.sub('[^a-zA-Z0-9_\'-]+','',self.name) + '.html'
|
||||
self.authorName = author
|
||||
self.storyTitle = removeEntities(name)
|
||||
self.name = makeAcceptableFilename(name)
|
||||
self.fileName = self.basePath + '/' + self.name + '.html'
|
||||
self.authorName = removeEntities(author)
|
||||
|
||||
self.inmemory = inmemory
|
||||
|
||||
|
|
@ -133,12 +133,12 @@ class EPubFanficWriter(FanficWriter):
|
|||
|
||||
def __init__(self, base, name, author, inmemory=False, compress=True):
|
||||
self.basePath = base
|
||||
self.name = re.sub('&\#[0-9]+;', '_', name.replace(" ", "_").replace(":","_"))
|
||||
self.storyTitle = self._removeEntities(name)
|
||||
self.directory = self.basePath + '/' + re.sub('[^a-zA-Z0-9_\'-]+','',self.name)
|
||||
self.storyTitle = removeEntities(name)
|
||||
self.name = makeAcceptableFilename(name)
|
||||
self.directory = self.basePath + '/' + self.name
|
||||
self.authorName = removeEntities(author)
|
||||
|
||||
self.inmemory = inmemory
|
||||
self.authorName = self._removeEntities(author)
|
||||
|
||||
self.files = {}
|
||||
self.chapters = []
|
||||
|
|
@ -164,21 +164,8 @@ class EPubFanficWriter(FanficWriter):
|
|||
self._writeFile('META-INF/container.xml', CONTAINER)
|
||||
self._writeFile('OEBPS/stylesheet.css', CSS)
|
||||
|
||||
def _removeEntities(self, text):
    """Replace the named entities listed in ``entities`` and re-escape for XHTML.

    Every key of the module-level ``entities`` mapping found in *text* is
    replaced by its mapped value.  Afterwards every bare ``&`` is escaped
    to ``&amp;``, while ``&lt;`` and ``&gt;`` are kept as they are.

    Returns the cleaned text.
    """
    for entity, replacement in entities.items():
        try:
            text = text.replace(entity, replacement)
        except UnicodeDecodeError:
            # for the pound symbol in constants.py: the replacement is a
            # UTF-8 byte string that has to be decoded before it can be
            # mixed into unicode text.
            text = text.replace(entity, replacement.decode('utf-8'))

    # &lt;, &gt; and &amp; are the only html entities allowed in xhtml.
    # Escape every ampersand first, then undo the double escaping that
    # this causes for already-escaped &lt; and &gt;.
    text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')

    return text
|
||||
|
||||
def writeChapter(self, index, title, text):
|
||||
title = removeEntities(title)
|
||||
logging.debug("Writing chapter: %s" % title)
|
||||
fileName="chapter%04d.xhtml" % index
|
||||
|
||||
|
|
@ -188,7 +175,7 @@ class EPubFanficWriter(FanficWriter):
|
|||
|
||||
# f = open(filePath, 'w')
|
||||
|
||||
text = self._removeEntities(text)
|
||||
text = removeEntities(text)
|
||||
|
||||
# BeautifulStoneSoup doesn't have any selfClosingTags by default.
|
||||
# hr & br needs to be if they're going to work.
|
||||
|
|
@ -216,12 +203,10 @@ class EPubFanficWriter(FanficWriter):
|
|||
|
||||
text = self.soup.__str__('utf8')
|
||||
|
||||
tt = self._removeEntities(title)
|
||||
|
||||
self._writeFile(fn, XHTML_START % (tt, tt))
|
||||
self._writeFile(fn, XHTML_START % (title, title))
|
||||
self._writeFile(fn, text)
|
||||
self._writeFile(fn, XHTML_END)
|
||||
# print >> f, XHTML_START % (tt, tt)
|
||||
# print >> f, XHTML_START % (title, title)
|
||||
# f.write(text)
|
||||
# print >> f, XHTML_END
|
||||
|
||||
|
|
@ -248,7 +233,7 @@ class EPubFanficWriter(FanficWriter):
|
|||
for t,f in self.chapters:
|
||||
chapterId = "chapter%04d" % i
|
||||
|
||||
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, cgi.escape(t), f))
|
||||
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
|
||||
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
|
||||
|
||||
ids.append(chapterId)
|
||||
|
|
@ -279,3 +264,40 @@ class EPubFanficWriter(FanficWriter):
|
|||
self.output = zipdata
|
||||
|
||||
# zipdir.toZip(filename, self.directory)
|
||||
|
||||
def unirepl(match):
    """Return the unicode string for a decimal number entity.

    *match* is a regular-expression match whose full text has the form
    ``&#NNN;``.  The digits between ``&#`` and ``;`` are read as a decimal
    code point and converted to the corresponding character.

    If the number is not a code point the interpreter can represent, the
    matched text is returned unchanged so that one bad reference does not
    abort the whole conversion.
    """
    s = match.group()
    value = int(s[2:-1])  # strip the leading '&#' and the trailing ';'

    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # interpreters without unichr

    try:
        return to_char(value)
    except (ValueError, OverflowError):
        # Out-of-range code point: leave the entity as it was.
        return s
|
||||
|
||||
def replaceNumberEntities(data):
    """Substitute each decimal ``&#NNN;`` reference in *data* using ``unirepl``.

    Returns the resulting string; text without such references is
    returned unchanged.
    """
    return re.sub(r'&#(\d+);', unirepl, data)
|
||||
|
||||
def removeEntities(text):
    """Normalise HTML entities in *text* so that it is safe to embed in XHTML.

    Steps, in order:

    1. Numeric references for ``&``, ``<`` and ``>`` (38, 60, 62, with any
       number of leading zeros) become their named forms.
    2. All remaining decimal ``&#NNN;`` references are replaced through
       ``replaceNumberEntities``.
    3. The named entities listed in the module-level ``entities`` mapping
       are replaced by their mapped text.
    4. Every bare ``&`` is escaped to ``&amp;``, keeping ``&lt;`` and
       ``&gt;`` intact.

    Returns the cleaned text.
    """
    # replace numeric versions of [&<>] with named versions, so the generic
    # numeric pass below does not turn them into raw markup characters.
    text = re.sub(r'&#0*38;', '&amp;', text)
    text = re.sub(r'&#0*60;', '&lt;', text)
    text = re.sub(r'&#0*62;', '&gt;', text)

    # replace remaining &#NNN; entities with unicode value, such as &#39; -> '
    text = replaceNumberEntities(text)

    # replace several named entities with character, such as &mdash; -> -
    # see constants.py for the list.
    for entity, replacement in entities.items():
        try:
            text = text.replace(entity, replacement)
        except UnicodeDecodeError:
            # for the pound symbol in constants.py: the replacement is a
            # UTF-8 byte string that has to be decoded before it can be
            # mixed into unicode text.
            text = text.replace(entity, replacement.decode('utf-8'))

    # &lt;, &gt; and &amp; are the only html entities allowed in xhtml, put
    # those back: escape every ampersand, then undo the double escaping
    # this causes for already-escaped &lt; and &gt;.
    text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')

    return text
|
||||
|
||||
def makeAcceptableFilename(text):
    """Derive a filename-safe string from *text*.

    The text is first passed through ``removeEntities``; spaces and colons
    are then turned into underscores, and finally every character other
    than ASCII letters, digits, underscore, apostrophe and hyphen is
    dropped.
    """
    cleaned = removeEntities(text)
    underscored = cleaned.replace(" ", "_").replace(":","_")
    return re.sub('[^a-zA-Z0-9_\'-]+','',underscored)
|
||||
|
|
|
|||
Loading…
Reference in a new issue