Stop calling prettify on the HTML; it sometimes introduces extraneous whitespace. Change the chapter file names and ids inside the metadata to sequential names (chapter0001, chapter0002, ...) rather than the base64 of the chapter title. Replace the invalid center and u tags with a div styled as centered and a span styled as underlined. Remove the dead ffa.py file. Change downloader.py so that it no longer calls ffa.py, but does call fictionalley.

This commit is contained in:
retiefjimm 2010-09-30 21:33:58 -05:00
parent 773ff3c03c
commit f75910ce7d
9 changed files with 32 additions and 302 deletions

View file

@ -48,13 +48,15 @@ CONTENT_START = '''<?xml version="1.0"?>
<item id="style" href="stylesheet.css" media-type="text/css" />
'''
CONTENT_ITEM = '<item id="%s" href="%s" media-type="application/xhtml+xml" />'
CONTENT_ITEM = '''<item id="%s" href="%s" media-type="application/xhtml+xml" />
'''
CONTENT_END_MANIFEST = '''</manifest>
<spine toc="ncx">
'''
CONTENT_ITEMREF = '''<itemref idref="%s" />'''
CONTENT_ITEMREF = '''<itemref idref="%s" />
'''
CONTENT_END = '''</spine>
</package>

View file

@ -13,7 +13,6 @@ import BeautifulSoup as bs
import htmlentitydefs as hdefs
import ffa
import ffnet
import ficwad
import output
@ -51,13 +50,13 @@ class FanficLoader:
urls = self.adapter.extractIndividualUrls()
self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName(), inmemory=self.inmemory, compress=self.compress)
i = 0
i = 1
for u,n in urls:
if not self.quiet:
print('Downloading chapter %d/%d' % (i, len(urls)))
i = i+1
text = self.adapter.getText(u)
self.writer.writeChapter(n, text)
self.writer.writeChapter(i, n, text)
i = i+1
self.writer.finalise()
@ -78,11 +77,10 @@ if __name__ == '__main__':
writerClass = None
if url.find('fanficauthors') != -1:
adapter = ffa.FFA(url)
print >> sys.stderr, "fanficauthors.net already provides ebooks"
sys.exit(0)
elif url.find('fictionalley') != -1:
adapter = fictionalley.FictionAlley(url)
#print >> sys.stderr, "FictionAlley adapter is broken, try to find this fic on fanfiction.net or fanficauthors"
#sys.exit(0)
elif url.find('ficwad') != -1:
adapter = ficwad.FicWad(url)
elif url.find('fanfiction.net') != -1 or url.find('fictionpress.com') != -1:

235
ffa.py
View file

@ -1,235 +0,0 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import shutil
import base64
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
from constants import *
from adapter import *
# login_password is an optional, user-supplied module holding site
# credentials (the tests below read its `login` and `password` attributes).
# When it is absent, bind the name to None so that the later
# `login_password != None` checks work instead of raising NameError.
try:
    import login_password
except ImportError:
    login_password = None
class FFA(FanfictionSiteAdapter):
    """Site adapter that downloads stories from fanficauthors.net.

    The adapter keeps one cookie-aware URL opener for its whole lifetime so
    that a successful login is remembered for subsequent chapter requests.
    """

    # Fragments whose presence in a page means the site wants a login.
    _LOGIN_MARKERS = (
        '<legend>Please login to continue</legend>',
        '<h4>Username or password not found. Please',
        "This story is rated Mature, you must be logged in to view it",
    )

    def __init__(self, url):
        """Remember the story URL and build a cookie-aware opener for it."""
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        logging.debug("Created FFA: url=%s" % (self.url))

    def _getLoginScript(self):
        """Return the path the login form is posted to (the story path)."""
        return self.path

    def reqLoginData(self, data):
        """Return True if the page text `data` contains a login marker."""
        return any(marker in data for marker in self._LOGIN_MARKERS)

    def requiresLogin(self, url=None):
        """Fetch `url` (default: the story URL) and report if it needs login."""
        target = self.url if url is None else url
        return self.reqLoginData(self.opener.open(target).read())

    def performLogin(self, url=None):
        """Post the stored credentials; return True when login succeeded.

        `url` is accepted for interface compatibility; the form is always
        posted to this adapter's own host and login path. The credentials
        must have been supplied through setLogin() / setPassword() first.
        """
        if url is None:
            url = self.url
        urlvals = u.urlencode({
            'username': self.login,
            'password': self.password,
            'submit': 'Submit',
        })
        loginUrl = 'http://' + self.host + self._getLoginScript()
        logging.debug("Will now login to URL %s" % loginUrl)
        response = self.opener.open(loginUrl, urlvals).read()
        # Still seeing a login marker in the response means login failed.
        return not self.reqLoginData(response)

    def _fetchLoggedIn(self, url):
        """Return the body of `url`, logging in and refetching if required.

        Raises LoginRequiredException when the page wants a login and the
        login attempt fails.
        """
        data = self.opener.open(url).read()
        if self.reqLoginData(data):
            logging.debug('Data requires login, trying to login')
            if not self.performLogin(url):
                logging.error('Cannot login, raising exception ... ')
                raise LoginRequiredException(url)
            data = self.opener.open(url).read()
        return data

    def extractIndividualUrls(self):
        """Read the story page and return a list of (url, title) chapters.

        As a side effect the author and story name are stored on the
        adapter. Raises LoginRequiredException if the page needs a login
        that cannot be performed.
        """
        # Previously this method used the local name `url` before the loop
        # below had assigned it (UnboundLocalError whenever a login was
        # needed) and parsed the page fetched *before* logging in. Fetch
        # against self.url and parse only the final page.
        data = self._fetchLoggedIn(self.url)
        soup = bs.BeautifulStoneSoup(data)

        self.author = str(soup.find('a', {'href': '/contact/'}).string)
        heading = soup.find('h1', {'class': 'textCenter'})
        self.storyName = str(heading.contents[0]).strip()
        logging.debug("Story `%s` by `%s`" % (self.storyName, self.author))

        selector = soup.find('select', {'class': 'tinput'})
        return [(option['value'], option.string)
                for option in selector.findAll('option')]

    def getText(self, url):
        """Download one chapter and return its story text.

        A `url` that does not contain 'http://' is prefixed with this
        adapter's host. The text returned is every line between the first
        line containing '</div></form>' and the next line containing
        '<form action="#">', both exclusive. Raises LoginRequiredException
        if the chapter needs a login that cannot be performed.
        """
        if url.find('http://') == -1:
            url = 'http://' + self.host + '/' + url
        logging.info('Downloading: %s' % url)
        data = self._fetchLoggedIn(url)

        collected = []
        emit = False
        for line in data.split('\n'):
            if '</div></form>' in line:
                logging.debug('emit = True')
                emit = True
                continue
            if '<form action="#">' in line:
                logging.debug('emit = False')
                if emit:
                    # The form after the story body marks the end of it.
                    break
            if emit:
                collected.append(line + '\n')
        return ''.join(collected)

    def setLogin(self, login):
        """Store the user name used by performLogin()."""
        self.login = login

    def setPassword(self, password):
        """Store the password used by performLogin()."""
        self.password = password

    def getStoryName(self):
        """Return the story name found by extractIndividualUrls()."""
        return self.storyName

    def getAuthorName(self):
        """Return the author name found by extractIndividualUrls()."""
        return self.author

    def getPrintableUrl(self, url):
        """Return `url` unchanged."""
        return url
class FFA_UnitTests(unittest.TestCase):
    """Tests for the FFA adapter; they talk to the live fanficauthors.net."""

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testRequiresLoginNeg(self):
        adapter = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
        self.assertFalse(adapter.requiresLogin())

    def testRequiresLogin(self):
        adapter = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
        self.assertTrue(adapter.requiresLogin())

    def testPerformLogin(self):
        adapter = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
        if login_password is not None:
            adapter.setLogin(login_password.login)
            adapter.setPassword(login_password.password)
        logged_in = adapter.performLogin(None)
        self.assertTrue(logged_in)

    def testExtractURLsAuthorStoryName(self):
        adapter = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
        adapter.extractIndividualUrls()
        self.assertEquals('Draco664', adapter.getAuthorName())
        self.assertEquals('Apprentice Potter', adapter.getStoryName())

    def testExtractUrls(self):
        adapter = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
        chapters = adapter.extractIndividualUrls()
        self.assertEquals(25, len(chapters))
        self.assertEquals('Grievances', chapters[2][1])
        self.assertEquals('/Apprentice_Potter/Prologue/', chapters[0][0])

    def testGetText(self):
        adapter = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
        text = adapter.getText('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
        self.assertTrue(text.find('smiled slightly, and settled back in her rocking chair') != -1)

    def testGetTextLogin(self):
        chapter_url = 'http://viridian.fanficauthors.net/Out_of_the_Darkness_A_Jinchuurikis_Tale/A_Harrowing_Escape/'
        adapter = FFA(chapter_url)
        if login_password is not None:
            adapter.setLogin(login_password.login)
            adapter.setPassword(login_password.password)
        if adapter.requiresLogin():
            adapter.performLogin()
        text = adapter.getText(chapter_url)
        expected = 'So Hokage-sama” I said, “this is how we came'
        self.assertTrue(text.find(expected) != -1)

    def testSemiLoginRequired(self):
        adapter = FFA('http://viridian.fanficauthors.net/Harry_Potter_and_the_Nightmares_of_Futures_Past/The_End_of_Days/')
        adapter.extractIndividualUrls()
        # A later chapter needs a login that was never supplied, so the
        # download is expected to fail with LoginRequiredException.
        self.assertRaises(
            LoginRequiredException,
            adapter.getText,
            'http://viridian.fanficauthors.net/Harry_Potter_and_the_Nightmares_of_Futures_Past/Doing_the_Mungo_Shuffle/')
# Run the unit tests above when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()

View file

@ -6,7 +6,6 @@ import sys
import cgi
import uuid
import shutil
import base64
import os.path
import logging
import unittest
@ -136,26 +135,8 @@ class FFNet(FanfictionSiteAdapter):
logging.error("Error downloading Chapter: %s" % url)
exit(1)
return '<html/>'
return div.prettify()
#
# for l in lines:
# if l.find("<div id=storytextp class=storytextp") != -1 or l.find('<!-- start story -->') != -1 or l.find('<div id="storytextp"') != -1:
# logging.debug("starting at line: %s" % l)
# #s2 = bs.BeautifulStoneSoup(l)
# #return s2.div.prettify()
# emit = True
#
# if emit:
# textbuf = textbuf + "\n" + l
#
# if l.find("</div><div style='height:10px'></div> ") != -1 or l.find('<!-- end story -->') != -1:
# emit = False
#
# s2 = bs.BeautifulStoneSoup(textbuf)
# return s2.div.prettify()
return div.__str__('utf8')
def setLogin(self, login):
self.login = login

View file

@ -98,7 +98,7 @@ class FictionAlley(FanfictionSiteAdapter):
exit(1)
return '<html/>'
return div.prettify()
return div.__str__('utf8')
def getPrintableUrl(self, url):
return url

View file

@ -84,7 +84,7 @@ class FicWad(FanfictionSiteAdapter):
logging.error("Error downloading Chapter: %s" % url)
exit(1)
return '<html/>'
return div.prettify()
return div.__str__('utf8')
def getPrintableUrl(self, url):
return url

View file

@ -6,7 +6,6 @@ import sys
import cgi
import uuid
import shutil
import base64
import os.path
import logging
import unittest
@ -86,7 +85,7 @@ class HPFiction(FanfictionSiteAdapter):
if None == divtext:
logging.error("Error downloading Chapter: %s" % url)
exit(1)
return divtext.prettify()
return divtext.__str__('utf8')
class FF_UnitTests(unittest.TestCase):
def setUp(self):

View file

@ -8,7 +8,6 @@ import uuid
import codecs
import shutil
import string
import base64
import os.path
import zipfile
import StringIO
@ -33,7 +32,7 @@ class FanficWriter:
def __init__(self):
pass
def writeChapter(self, title, text):
def writeChapter(self, index, title, text):
pass
def finalise(self):
@ -45,8 +44,8 @@ class TextWriter(FanficWriter):
def __init__(self, base, name, author, inmemory=False, compress=False):
self.htmlWriter = HTMLWriter(base, name, author, True, False)
def writeChapter(self, title, text):
self.htmlWriter.writeChapter(title, text)
def writeChapter(self, index, title, text):
self.htmlWriter.writeChapter(index, title, text)
def finalise(self):
self.htmlWriter.finalise()
@ -85,7 +84,7 @@ class HTMLWriter(FanficWriter):
except:
return text
def writeChapter(self, title, text):
def writeChapter(self, index, title, text):
title = self._printableVersion(title) #title.decode('utf-8')
text = self._printableVersion(text) #text.decode('utf-8')
self.body = self.body + '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
@ -94,7 +93,7 @@ class HTMLWriter(FanficWriter):
def finalise(self):
html = self.xhtmlTemplate.substitute({'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body})
soup = bs.BeautifulSoup(html)
result = soup.prettify()
result = soup.__str__('utf8')
# f = open(self.fileName, 'w')
# f.write(result)
@ -178,16 +177,9 @@ class EPubFanficWriter(FanficWriter):
return text
def writeChapter(self, title, text):
def writeChapter(self, index, title, text):
logging.debug("Writing chapter: %s" % title)
try:
fileName = base64.b64encode(title) + ".xhtml"
except UnicodeEncodeError, e:
fileName = base64.b64encode(title.encode('utf-8')) + ".xhtml"
# Base64 can include +, / and =, which XML technically doesn't like
# in it's id attributes. _ and - are okay and not otherwise used in Base64.
# The = for padding is superfluous
fileName = fileName.replace('/', '_').replace('+', '-').replace('=','')
fileName="chapter%04d.xhtml" % index
filePath = self.directory + "/OEBPS/" + fileName
@ -207,21 +199,21 @@ class EPubFanficWriter(FanficWriter):
for attr in t._getAttrMap().keys():
if attr not in acceptable_attributes:
del t[attr]
# these are not acceptable strict XHTML. But we do already have
# CSS classes of the same names defined in constants.py
if t.name in ('u'):
t['class']=t.name
t.name='span'
if t.name in ('center'):
t['class']=t.name
t.name='div'
allPs = self.soup.findAll(recursive=True)
for p in allPs:
if p.string != None and len(p.string.strip()) == 0 :
p.extract()
# xhtml doesn't like <p> nesting in <p>, so leave divs.
# allBrs = self.soup.findAll(recursive=True, name = ['div'])
# for br in allBrs:
# if (br.string != None and len(br.string.strip()) != 0) or (br.contents != None):
# br.name = 'p'
# cleanup(self.soup )
text = self.soup.prettify()
text = self.soup.__str__('utf8')
tt = self._removeEntities(title)
@ -253,14 +245,7 @@ class EPubFanficWriter(FanficWriter):
i = 1
for t,f in self.chapters:
try:
chapterId = base64.b64encode(t)
except UnicodeEncodeError, e:
chapterId = base64.b64encode(t.encode('utf-8'))
# Base64 can include +, / and =, which XML technically doesn't like
# in it's id attributes. _ and - are okay and not otherwise used in Base64.
# The = for padding is superfluous
chapterId = chapterId.replace('/', '_').replace('+', '-').replace('=','')
chapterId = "chapter%04d" % i
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, cgi.escape(t), f))
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))

View file

@ -109,7 +109,7 @@ class Twilighted(FanfictionSiteAdapter):
if None == div:
return '<html/>'
return div.prettify()
return div.__str__('utf8')
def _getLoginScript(self):
return '/user.php?action=login'