mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-04 18:40:51 +02:00
Stop running prettify on the HTML; it sometimes introduces extraneous whitespace. Change the chapter file names and ids inside the metadata to sequential chapter0001 rather than base64 of the chapter title. Replace invalid center & u tags with div styled centered and span styled underlined. Remove dead ffa.py file. Change downloader.py to not call ffa.py, but do call fictionalley.
This commit is contained in:
parent
773ff3c03c
commit
f75910ce7d
9 changed files with 32 additions and 302 deletions
|
|
@ -48,13 +48,15 @@ CONTENT_START = '''<?xml version="1.0"?>
|
|||
<item id="style" href="stylesheet.css" media-type="text/css" />
|
||||
'''
|
||||
|
||||
CONTENT_ITEM = '<item id="%s" href="%s" media-type="application/xhtml+xml" />'
|
||||
CONTENT_ITEM = '''<item id="%s" href="%s" media-type="application/xhtml+xml" />
|
||||
'''
|
||||
|
||||
CONTENT_END_MANIFEST = '''</manifest>
|
||||
<spine toc="ncx">
|
||||
'''
|
||||
|
||||
CONTENT_ITEMREF = '''<itemref idref="%s" />'''
|
||||
CONTENT_ITEMREF = '''<itemref idref="%s" />
|
||||
'''
|
||||
|
||||
CONTENT_END = '''</spine>
|
||||
</package>
|
||||
|
|
|
|||
|
|
@ -13,7 +13,6 @@ import BeautifulSoup as bs
|
|||
import htmlentitydefs as hdefs
|
||||
|
||||
|
||||
import ffa
|
||||
import ffnet
|
||||
import ficwad
|
||||
import output
|
||||
|
|
@ -51,13 +50,13 @@ class FanficLoader:
|
|||
urls = self.adapter.extractIndividualUrls()
|
||||
self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName(), inmemory=self.inmemory, compress=self.compress)
|
||||
|
||||
i = 0
|
||||
i = 1
|
||||
for u,n in urls:
|
||||
if not self.quiet:
|
||||
print('Downloading chapter %d/%d' % (i, len(urls)))
|
||||
i = i+1
|
||||
text = self.adapter.getText(u)
|
||||
self.writer.writeChapter(n, text)
|
||||
self.writer.writeChapter(i, n, text)
|
||||
i = i+1
|
||||
|
||||
self.writer.finalise()
|
||||
|
||||
|
|
@ -78,11 +77,10 @@ if __name__ == '__main__':
|
|||
writerClass = None
|
||||
|
||||
if url.find('fanficauthors') != -1:
|
||||
adapter = ffa.FFA(url)
|
||||
print >> sys.stderr, "fanficauthors.net already provides ebooks"
|
||||
sys.exit(0)
|
||||
elif url.find('fictionalley') != -1:
|
||||
adapter = fictionalley.FictionAlley(url)
|
||||
#print >> sys.stderr, "FictionAlley adapter is broken, try to find this fic on fanfiction.net or fanficauthors"
|
||||
#sys.exit(0)
|
||||
elif url.find('ficwad') != -1:
|
||||
adapter = ficwad.FicWad(url)
|
||||
elif url.find('fanfiction.net') != -1 or url.find('fictionpress.com') != -1:
|
||||
|
|
|
|||
235
ffa.py
235
ffa.py
|
|
@ -1,235 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import base64
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
||||
try:
|
||||
import login_password
|
||||
except:
|
||||
# tough luck
|
||||
pass
|
||||
|
||||
class FFA(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
|
||||
logging.debug("Created FFA: url=%s" % (self.url))
|
||||
|
||||
def _getLoginScript(self):
|
||||
return self.path
|
||||
|
||||
def reqLoginData(self, data):
|
||||
if data.find('<legend>Please login to continue</legend>') != -1 or data.find('<h4>Username or password not found. Please') != -1 or data.find("This story is rated Mature, you must be logged in to view it") != -1:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def requiresLogin(self, url = None):
|
||||
if url == None:
|
||||
u = self.url
|
||||
else:
|
||||
u = url
|
||||
|
||||
resp = self.opener.open(u)
|
||||
data = resp.read()
|
||||
return self.reqLoginData(data)
|
||||
|
||||
def performLogin(self, url = None):
|
||||
if url == None:
|
||||
url = self.url
|
||||
|
||||
data = {}
|
||||
|
||||
data['username'] = self.login
|
||||
data['password'] = self.password
|
||||
data['submit'] = 'Submit'
|
||||
|
||||
urlvals = u.urlencode(data)
|
||||
loginUrl = 'http://' + self.host + self._getLoginScript()
|
||||
logging.debug("Will now login to URL %s" % loginUrl)
|
||||
|
||||
req = self.opener.open(loginUrl, urlvals)
|
||||
|
||||
d = req.read()
|
||||
|
||||
if self.reqLoginData(d) :
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = self.opener.open(self.url).read()
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
if self.reqLoginData(data):
|
||||
logging.debug('Data requires login, trying to login')
|
||||
if not self.performLogin(url):
|
||||
logging.error('Cannot login, raising exception ... ')
|
||||
raise LoginRequiredException(url)
|
||||
else:
|
||||
data = self.opener.open(url).read()
|
||||
|
||||
|
||||
self.author = str(soup.find('a', {'href' : '/contact/'}).string)
|
||||
self.storyName = str(soup.find('h1', {'class' : 'textCenter'}).contents[0]).strip()
|
||||
|
||||
logging.debug("Story `%s` by `%s`" % (self.storyName, self.author))
|
||||
|
||||
selector = soup.find('select', {'class' : 'tinput'})
|
||||
options = selector.findAll('option')
|
||||
|
||||
urls = []
|
||||
|
||||
for o in options:
|
||||
title = o.string
|
||||
url = o['value']
|
||||
|
||||
urls.append((url,title))
|
||||
|
||||
return urls
|
||||
|
||||
def getText(self, url):
|
||||
if url.find('http://') == -1:
|
||||
url = 'http://' + self.host + '/' + url
|
||||
|
||||
logging.info('Downloading: %s' % url)
|
||||
data = self.opener.open(url).read()
|
||||
|
||||
if self.reqLoginData(data):
|
||||
logging.debug('Data requires login, trying to login')
|
||||
if not self.performLogin(url):
|
||||
logging.error('Cannot login, raising exception ... ')
|
||||
raise LoginRequiredException(url)
|
||||
else:
|
||||
data = self.opener.open(url).read()
|
||||
|
||||
lines = data.split('\n')
|
||||
|
||||
emit = False
|
||||
|
||||
post = ''
|
||||
|
||||
for l in lines:
|
||||
if l.find('</div></form>') != -1:
|
||||
logging.debug('emit = True')
|
||||
emit = True
|
||||
continue
|
||||
elif l.find('<form action="#">') != -1:
|
||||
logging.debug('emit = False')
|
||||
if emit:
|
||||
break
|
||||
else:
|
||||
emit = False
|
||||
|
||||
if emit:
|
||||
post = post + l + '\n'
|
||||
|
||||
return post
|
||||
|
||||
def setLogin(self, login):
|
||||
self.login = login
|
||||
|
||||
def setPassword(self, password):
|
||||
self.password = password
|
||||
|
||||
def getStoryName(self):
|
||||
return self.storyName
|
||||
|
||||
def getAuthorName(self):
|
||||
return self.author
|
||||
|
||||
def getPrintableUrl(self, url):
|
||||
return url
|
||||
|
||||
class FFA_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
pass
|
||||
|
||||
def testRequiresLoginNeg(self):
|
||||
f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
|
||||
self.assertFalse(f.requiresLogin())
|
||||
|
||||
def testRequiresLogin(self):
|
||||
f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
|
||||
self.assertTrue(f.requiresLogin())
|
||||
|
||||
def testPerformLogin(self):
|
||||
f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
|
||||
|
||||
if login_password != None:
|
||||
f.setLogin(login_password.login)
|
||||
f.setPassword(login_password.password)
|
||||
|
||||
self.assertTrue(f.performLogin(None))
|
||||
|
||||
def testExtractURLsAuthorStoryName(self):
|
||||
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
|
||||
f.extractIndividualUrls()
|
||||
|
||||
self.assertEquals('Draco664', f.getAuthorName())
|
||||
self.assertEquals('Apprentice Potter', f.getStoryName())
|
||||
|
||||
def testExtractUrls(self):
|
||||
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
|
||||
urls = f.extractIndividualUrls()
|
||||
self.assertEquals(25, len(urls))
|
||||
|
||||
self.assertEquals('Grievances', urls[2][1])
|
||||
self.assertEquals('/Apprentice_Potter/Prologue/', urls[0][0])
|
||||
|
||||
def testGetText(self):
|
||||
f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
|
||||
data = f.getText('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
|
||||
|
||||
self.assertTrue(data.find('smiled slightly, and settled back in her rocking chair') != -1)
|
||||
|
||||
def testGetTextLogin(self):
|
||||
url = 'http://viridian.fanficauthors.net/Out_of_the_Darkness_A_Jinchuurikis_Tale/A_Harrowing_Escape/'
|
||||
f = FFA(url)
|
||||
|
||||
if login_password != None:
|
||||
f.setLogin(login_password.login)
|
||||
f.setPassword(login_password.password)
|
||||
|
||||
if f.requiresLogin():
|
||||
f.performLogin()
|
||||
|
||||
data = f.getText(url)
|
||||
seek = 'So Hokage-sama” I said, “this is how we came'
|
||||
self.assertTrue(data.find(seek) != -1)
|
||||
|
||||
def testSemiLoginRequired(self):
|
||||
f = FFA('http://viridian.fanficauthors.net/Harry_Potter_and_the_Nightmares_of_Futures_Past/The_End_of_Days/')
|
||||
|
||||
urls = f.extractIndividualUrls()
|
||||
|
||||
try:
|
||||
data = f.getText('http://viridian.fanficauthors.net/Harry_Potter_and_the_Nightmares_of_Futures_Past/Doing_the_Mungo_Shuffle/')
|
||||
self.assertTrue(False)
|
||||
except LoginRequiredException, e:
|
||||
self.assertTrue(True)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
23
ffnet.py
23
ffnet.py
|
|
@ -6,7 +6,6 @@ import sys
|
|||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import base64
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
|
|
@ -136,26 +135,8 @@ class FFNet(FanfictionSiteAdapter):
|
|||
logging.error("Error downloading Chapter: %s" % url)
|
||||
exit(1)
|
||||
return '<html/>'
|
||||
|
||||
return div.prettify()
|
||||
|
||||
#
|
||||
# for l in lines:
|
||||
# if l.find("<div id=storytextp class=storytextp") != -1 or l.find('<!-- start story -->') != -1 or l.find('<div id="storytextp"') != -1:
|
||||
# logging.debug("starting at line: %s" % l)
|
||||
# #s2 = bs.BeautifulStoneSoup(l)
|
||||
# #return s2.div.prettify()
|
||||
# emit = True
|
||||
#
|
||||
# if emit:
|
||||
# textbuf = textbuf + "\n" + l
|
||||
#
|
||||
# if l.find("</div><div style='height:10px'></div> ") != -1 or l.find('<!-- end story -->') != -1:
|
||||
# emit = False
|
||||
#
|
||||
# s2 = bs.BeautifulStoneSoup(textbuf)
|
||||
# return s2.div.prettify()
|
||||
|
||||
|
||||
return div.__str__('utf8')
|
||||
|
||||
def setLogin(self, login):
|
||||
self.login = login
|
||||
|
|
|
|||
|
|
@ -98,7 +98,7 @@ class FictionAlley(FanfictionSiteAdapter):
|
|||
exit(1)
|
||||
return '<html/>'
|
||||
|
||||
return div.prettify()
|
||||
return div.__str__('utf8')
|
||||
|
||||
def getPrintableUrl(self, url):
|
||||
return url
|
||||
|
|
|
|||
|
|
@ -84,7 +84,7 @@ class FicWad(FanfictionSiteAdapter):
|
|||
logging.error("Error downloading Chapter: %s" % url)
|
||||
exit(1)
|
||||
return '<html/>'
|
||||
return div.prettify()
|
||||
return div.__str__('utf8')
|
||||
|
||||
def getPrintableUrl(self, url):
|
||||
return url
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@ import sys
|
|||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import base64
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
|
|
@ -86,7 +85,7 @@ class HPFiction(FanfictionSiteAdapter):
|
|||
if None == divtext:
|
||||
logging.error("Error downloading Chapter: %s" % url)
|
||||
exit(1)
|
||||
return divtext.prettify()
|
||||
return divtext.__str__('utf8')
|
||||
|
||||
class FF_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
|
|
|
|||
49
output.py
49
output.py
|
|
@ -8,7 +8,6 @@ import uuid
|
|||
import codecs
|
||||
import shutil
|
||||
import string
|
||||
import base64
|
||||
import os.path
|
||||
import zipfile
|
||||
import StringIO
|
||||
|
|
@ -33,7 +32,7 @@ class FanficWriter:
|
|||
def __init__(self):
|
||||
pass
|
||||
|
||||
def writeChapter(self, title, text):
|
||||
def writeChapter(self, index, title, text):
|
||||
pass
|
||||
|
||||
def finalise(self):
|
||||
|
|
@ -45,8 +44,8 @@ class TextWriter(FanficWriter):
|
|||
def __init__(self, base, name, author, inmemory=False, compress=False):
|
||||
self.htmlWriter = HTMLWriter(base, name, author, True, False)
|
||||
|
||||
def writeChapter(self, title, text):
|
||||
self.htmlWriter.writeChapter(title, text)
|
||||
def writeChapter(self, index, title, text):
|
||||
self.htmlWriter.writeChapter(index, title, text)
|
||||
|
||||
def finalise(self):
|
||||
self.htmlWriter.finalise()
|
||||
|
|
@ -85,7 +84,7 @@ class HTMLWriter(FanficWriter):
|
|||
except:
|
||||
return text
|
||||
|
||||
def writeChapter(self, title, text):
|
||||
def writeChapter(self, index, title, text):
|
||||
title = self._printableVersion(title) #title.decode('utf-8')
|
||||
text = self._printableVersion(text) #text.decode('utf-8')
|
||||
self.body = self.body + '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
|
||||
|
|
@ -94,7 +93,7 @@ class HTMLWriter(FanficWriter):
|
|||
def finalise(self):
|
||||
html = self.xhtmlTemplate.substitute({'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body})
|
||||
soup = bs.BeautifulSoup(html)
|
||||
result = soup.prettify()
|
||||
result = soup.__str__('utf8')
|
||||
|
||||
# f = open(self.fileName, 'w')
|
||||
# f.write(result)
|
||||
|
|
@ -178,16 +177,9 @@ class EPubFanficWriter(FanficWriter):
|
|||
|
||||
return text
|
||||
|
||||
def writeChapter(self, title, text):
|
||||
def writeChapter(self, index, title, text):
|
||||
logging.debug("Writing chapter: %s" % title)
|
||||
try:
|
||||
fileName = base64.b64encode(title) + ".xhtml"
|
||||
except UnicodeEncodeError, e:
|
||||
fileName = base64.b64encode(title.encode('utf-8')) + ".xhtml"
|
||||
# Base64 can include +, / and =, which XML technically doesn't like
|
||||
# in it's id attributes. _ and - are okay and not otherwise used in Base64.
|
||||
# The = for padding is superfluous
|
||||
fileName = fileName.replace('/', '_').replace('+', '-').replace('=','')
|
||||
fileName="chapter%04d.xhtml" % index
|
||||
|
||||
filePath = self.directory + "/OEBPS/" + fileName
|
||||
|
||||
|
|
@ -207,21 +199,21 @@ class EPubFanficWriter(FanficWriter):
|
|||
for attr in t._getAttrMap().keys():
|
||||
if attr not in acceptable_attributes:
|
||||
del t[attr]
|
||||
# these are not acceptable strict XHTML. But we do already have
|
||||
# CSS classes of the same names defined in constants.py
|
||||
if t.name in ('u'):
|
||||
t['class']=t.name
|
||||
t.name='span'
|
||||
if t.name in ('center'):
|
||||
t['class']=t.name
|
||||
t.name='div'
|
||||
|
||||
allPs = self.soup.findAll(recursive=True)
|
||||
for p in allPs:
|
||||
if p.string != None and len(p.string.strip()) == 0 :
|
||||
p.extract()
|
||||
|
||||
# xhtml doesn't like <p> nesting in <p>, so leave divs.
|
||||
# allBrs = self.soup.findAll(recursive=True, name = ['div'])
|
||||
# for br in allBrs:
|
||||
# if (br.string != None and len(br.string.strip()) != 0) or (br.contents != None):
|
||||
# br.name = 'p'
|
||||
|
||||
# cleanup(self.soup )
|
||||
|
||||
text = self.soup.prettify()
|
||||
text = self.soup.__str__('utf8')
|
||||
|
||||
tt = self._removeEntities(title)
|
||||
|
||||
|
|
@ -253,14 +245,7 @@ class EPubFanficWriter(FanficWriter):
|
|||
|
||||
i = 1
|
||||
for t,f in self.chapters:
|
||||
try:
|
||||
chapterId = base64.b64encode(t)
|
||||
except UnicodeEncodeError, e:
|
||||
chapterId = base64.b64encode(t.encode('utf-8'))
|
||||
# Base64 can include +, / and =, which XML technically doesn't like
|
||||
# in it's id attributes. _ and - are okay and not otherwise used in Base64.
|
||||
# The = for padding is superfluous
|
||||
chapterId = chapterId.replace('/', '_').replace('+', '-').replace('=','')
|
||||
chapterId = "chapter%04d" % i
|
||||
|
||||
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, cgi.escape(t), f))
|
||||
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
|
||||
|
|
|
|||
|
|
@ -109,7 +109,7 @@ class Twilighted(FanfictionSiteAdapter):
|
|||
if None == div:
|
||||
return '<html/>'
|
||||
|
||||
return div.prettify()
|
||||
return div.__str__('utf8')
|
||||
|
||||
def _getLoginScript(self):
|
||||
return '/user.php?action=login'
|
||||
|
|
|
|||
Loading…
Reference in a new issue