sigizmund 2009-12-15 15:23:48 +00:00
commit c0459faa43
7 changed files with 2415 additions and 0 deletions

1711
BeautifulSoup.py Normal file

File diff suppressed because it is too large

135
constants.py Normal file

@@ -0,0 +1,135 @@
CSS = '''body { margin-left: 5%; margin-right: 5%; margin-top: 5%; margin-bottom: 5%; text-align: justify; }
pre { font-size: x-small; }
h1 { text-align: center; }
h2 { text-align: center; }
h3 { text-align: center; }
h4 { text-align: center; }
h5 { text-align: center; }
h6 { text-align: center; }
.CI {
text-align:center;
margin-top:0px;
margin-bottom:0px;
padding:0px;
}
.center {text-align: center;}
.smcap {font-variant: small-caps;}
.u {text-decoration: underline;}
.bold {font-weight: bold;}
'''
MIMETYPE = '''application/epub+zip'''
CONTAINER = '''<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
'''
CONTENT_START = '''<?xml version="1.0"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf"
unique-identifier="BookId-Epub-%s">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:opf="http://www.idpf.org/2007/opf">
<dc:title>%s</dc:title>
<dc:creator opf:role="aut">%s</dc:creator>
<dc:language>en-UK</dc:language>
<dc:rights></dc:rights>
<dc:publisher>sgzmd</dc:publisher>
<dc:identifier id="BookId">urn:uuid:sigizmund.com062820072147132</dc:identifier>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="text/xml" />
<item id="style" href="stylesheet.css" media-type="text/css" />
'''
CONTENT_ITEM = '<item id="%s" href="%s" media-type="application/xhtml+xml" />'
CONTENT_END_MANIFEST = '''</manifest>
<spine toc="ncx">
'''
CONTENT_ITEMREF = '''<itemref idref="%s" />'''
CONTENT_END = '''</spine>
</package>
'''
TOC_START = '''<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="sigizmund.com062820072147132"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
'''
TOC_ITEM = '''<navPoint id="%s" playOrder="%d">
<navLabel>
<text>%s</text>
</navLabel>
<content src="%s"/>
</navPoint>
'''
TOC_END = '''</navMap>
</ncx>
'''
XHTML_START = '''<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>%s</title>
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
</head>
<body>
<div>
<h3>%s</h3>
'''
XHTML_END = '''</div>
</body>
</html>
'''
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
'blockquote', 'br', 'center', 'cite', 'code', 'col',
'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em',
'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i',
'ins', 'kbd', 'label', 'li', 'ol',
'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike',
'strong', 'sub', 'sup', 'u', 'ul']
acceptable_attributes = ['href']
entities = { '&ndash;' : ' - ', '&mdash;' : ' - ', '&rdquo;' : '"', '&ldquo;' : '"', '&rsquo;' : '\'', '&lsquo;' : '\'', '&quot;' : '"' }
FB2_PROLOGUE = '<FictionBook>'
FB2_DESCRIPTION = '''<description>
<title-info>
<genre>fanfiction</genre>
<author>
<first-name></first-name>
<middle-name></middle-name>
<last-name>%s</last-name>
</author>
<book-title>%s</book-title>
<lang>eng</lang>
</title-info>
<document-info>
<author>
<nickname>sgzmd</nickname>
</author>
<date value="%s">%s</date>
<id>sgzmd_%s</id>
<version>2.0</version>
</document-info>
</description>'''
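
The OPF/NCX templates above are plain %-format strings; a minimal sketch of how they are meant to be filled (the chapter id and file name here are made up for illustration; output.py below actually derives them from a base64 of the chapter title):

from constants import CONTENT_ITEM, TOC_ITEM, CONTENT_ITEMREF

# illustrative values only
chapter_id = 'chap1'
chapter_file = 'chap1.xhtml'

manifest_line = CONTENT_ITEM % (chapter_id, chapter_file)
toc_entry = TOC_ITEM % (chapter_id, 1, 'Chapter 1', chapter_file)
spine_line = CONTENT_ITEMREF % chapter_id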

74
downaloder.py Normal file

@@ -0,0 +1,74 @@
import os
import re
import sys
import shutil
import os.path
import getpass
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import ffa
import ficwad
import output
import fictionalley
class FanficLoader:
    '''A controller class which handles the interaction between various specific downloaders and writers'''
    booksDirectory = "books"

    def __init__(self, adapter, writerClass):
        self.adapter = adapter
        self.writerClass = writerClass

    def download(self):
        urls = self.adapter.extractIndividualUrls()
        self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName())
        for u, n in urls:
            text = self.adapter.getText(u)
            self.writer.writeChapter(n, text)
        self.writer.finalise()

if __name__ == '__main__':
    (url, format) = sys.argv[1:]

    if type(url) is unicode:
        print('URL is unicode')
        url = url.encode('latin1')

    adapter = None
    writerClass = None

    # NOTE: in this revision only the FicWad adapter takes the url in its constructor;
    # FFA and FictionAlley still use the older data/host/first call style.
    if url.find('fanficauthors') != -1:
        adapter = ffa.FFA(url)
    elif url.find('fictionalley') != -1:
        adapter = fictionalley.FictionAlley(url)
    elif url.find('ficwad') != -1:
        adapter = ficwad.FicWad(url)
    else:
        print >> sys.stderr, "Oi! I can haz not appropriate adapter for URL %s!" % url
        sys.exit(1)

    if format == 'epub':
        writerClass = output.EPubFanficWriter

    if adapter.requiresLogin(url):
        print("Meow, URL %s requires you to haz been logged in! Please can I haz this datas?" % url)
        sys.stdout.write("Can I haz ur login? ")
        login = sys.stdin.readline().strip()
        password = getpass.getpass(prompt='Can I haz ur password? ')
        print("Login: `%s`, Password: `%s`" % (login, password))
        adapter.setLogin(login)
        adapter.setPassword(password)

    loader = FanficLoader(adapter, writerClass)
    loader.download()
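
A rough usage sketch for the script above (the file name typo downaloder.py is the commit's own; the URL is the FicWad example used elsewhere in this commit):

python downaloder.py http://www.ficwad.com/story/14536 epub

# equivalent programmatic use, assuming the same FicWad story URL
import ficwad
import output
from downaloder import FanficLoader

loader = FanficLoader(ficwad.FicWad('http://www.ficwad.com/story/14536'), output.EPubFanficWriter)
loader.download()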

187
ffa.py Normal file

@@ -0,0 +1,187 @@
import os
import re
import sys
import cgi
import uuid
import shutil
import base64
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
from constants import *
from ficwad import *
from output import EPubFanficWriter  # used by Downloader.download() below
class FFA:
    storyName = None

    def __init__(self):
        self.grabUrl = re.compile('(\<option.+value=\")(.+?)\"\>(.+?)\<')
        self.grabAuthor = re.compile('.+pemail.+\'(\w+)')

    def getPasswordLine(self):
        return '<input type="password" name="pass"'

    def getLoginScript(self):
        return '/scripts/login.php'

    def getLoginPasswordOthers(self):
        login = dict(login = 'name', password = 'pass')
        other = dict(submit = 'Log In', remember='yes')
        return (login, other)

    def getPrintableUrl(self, url):
        return url + '?print=yes'

    def _findIndex(self, lines, what, start):
        for i in range(start, len(lines)):
            if lines[i].find(what) != -1:
                return i
        return -1

    def extractIndividualUrls(self, data, host, first, fetch = False):
        lines = data.split('\n')
        optionLines = filter(lambda x : x.find('<option value="') != -1, lines)
        authorLines = filter(lambda x : x.find('pemail') != -1, lines)

        for al in authorLines:
            m = self.grabAuthor.match(al)
            if m != None:
                self.authorName = m.group(1)
                break

        storyName = first.split("/")[1]

        result = []
        urls = []
        # duplicate option values are skipped below
        for line in optionLines:
            m = self.grabUrl.match(line)
            u = m.group(2)
            if u.find('" selected="selected') != -1:
                u = u.replace('" selected="selected', '')
            if u in urls:
                continue
            else:
                urls.append(u)
            result.append((self.getPrintableUrl(storyName + "/" + u), m.group(3)))

        self.soup = bs.BeautifulSoup(data)
        titles = self.soup.findAll(name = 'title', recursive=True)
        if len(titles) > 0:
            title = titles[0]
            print(title)
            (website, rest) = title.string.split('::')
            story_chapter = rest.split("-")
            story = story_chapter[0].strip()
            self.storyName = story

        return result

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName

    def getText(self, data, fetch = False):
        lines = data.split('\n')
        begin = self._findIndex(lines, '</select>', 0) + 1
        if begin == 0:
            begin = self._findIndex(lines, '<div><p>', 24)
            if begin == -1:
                print('BAD start')
                pp.pprint(lines)
                sys.exit(1)

        end = self._findIndex(lines, '<form action="index.php"><div class="topandbotline"', begin)
        print('<!-- ========= begin=%d, end=%d ============= -->' % (begin, end))
        return "\n".join(lines[begin:end])
class Downloader:
    login = None
    password = None
    url = None
    host = None
    first = None
    opener = None
    writer = None

    def __init__(self, url, login, password):
        self.login = login
        self.password = password
        self.url = url
        self.infoProvider = FFA()  # was FicWad(), whose constructor and methods no longer match this call pattern
        parse = up.urlparse(url)
        self.host = parse.scheme + '://' + parse.netloc
        self.first = parse.path
        self.loginUrl = self.host + self.infoProvider.getLoginScript()
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())

    def _loginRequired(self):
        print('is login required?')
        resp = self.opener.open(self.url)
        data = resp.read()
        if data.find(self.infoProvider.getPasswordLine()) != -1:
            print('yep')
            return True
        else:
            print('nada')
            return False

    def _login(self):
        (login, data) = self.infoProvider.getLoginPasswordOthers()
        data[login['login']] = self.login
        data[login['password']] = self.password
        urlvals = u.urlencode(data)
        req = self.opener.open(self.loginUrl, urlvals)
        if req.read().find(self.infoProvider.getPasswordLine()) != -1:
            return False
        else:
            return True

    def _getContent(self, url):
        print("<!-- Opening %s -->" % url)
        return self.opener.open(url).read()

    def download(self):
        first = self._getContent(self.host + self.first)
        urls = self.infoProvider.extractIndividualUrls(first, self.host, self.first)
        self.writer = EPubFanficWriter("books", self.infoProvider.getStoryName(), self.infoProvider.getAuthorName())
        for u, n in urls:
            text = self.infoProvider.getText(self._getContent(self.host + "/" + u))
            self.writer.writeChapter(n, text)
        self.writer.finalise()

if __name__ == '__main__':
    f = Downloader(sys.argv[1], 'sigizmund', '***************')
    if f._loginRequired():
        f._login()
    f.download()
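
To make the scraping in extractIndividualUrls easier to follow, here is what the grabUrl pattern pulls out; the sample line is invented but shaped like the chapter <option> rows the parser expects:

import re

grabUrl = re.compile('(\<option.+value=\")(.+?)\"\>(.+?)\<')

# invented sample of a chapter selector row
line = '<option value="chapter1.html">1. The Beginning</option>'
m = grabUrl.match(line)
print(m.group(2))   # chapter1.html
print(m.group(3))   # 1. The Beginning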

75
fictionalley.py Normal file

@@ -0,0 +1,75 @@
import os
import re
import sys
import shutil
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
class FictionAlley:
    def __init__(self):
        pass

    def extractIndividualUrls(self, data, host, contents):
        soup = bs.BeautifulStoneSoup(data)
        title = soup.find('title').string
        self.storyName = "-".join(title.split('-')[1:]).strip()

        authors = soup.findAll('a')
        # authorName is not scraped from the page here; callers are expected to set it (see __main__ below)
        print('Story "%s" by %s' % (self.storyName, self.authorName))

        links = soup.findAll('a', { 'class' : 'chapterlink' } )
        result = []
        for a in links:
            url = a['href']
            title = a.string
            result.append((url, title))
        return result

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName

    def getText(self, data, fetch = False):
        soup = bs.BeautifulStoneSoup(data)
        div = soup.find('div', {'id' : 'storytext'})
        if div is None:
            return '<html/>'
        return div.prettify()

    def getPrintableUrl(self, url):
        return url

    def getPasswordLine(self):
        return 'opaopapassword'

    def getLoginScript(self):
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        login = dict(login = 'name', password = 'pass')
        other = dict(submit = 'Log In', remember='yes')
        return (login, other)

if __name__ == '__main__':
    url = 'http://www.fictionalley.org/authors/drt/DA.html'
    data = u2.urlopen(url).read()
    host = up.urlparse(url).netloc
    fw = FictionAlley()
    fw.authorName = 'DrT'
    urls = fw.extractIndividualUrls(data, host, url)
    pp.pprint(urls)
    print(fw.getText(data))
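
getText above falls back to a '<html/>' stub when no storytext div is present; a tiny sketch of both paths against an invented snippet:

import fictionalley

# invented snippet shaped like a FictionAlley chapter page
sample = '<html><body><div id="storytext"><p>Once upon a time...</p></div></body></html>'
fa = fictionalley.FictionAlley()
print(fa.getText(sample))               # prettified storytext div
print(fa.getText('<html></html>'))      # no storytext div, so the '<html/>' stub comes back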

97
ficwad.py Normal file

@@ -0,0 +1,97 @@
import os
import re
import sys
import shutil
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
class FicWad:
    def __init__(self, url):
        self.url = url
        self.host = up.urlparse(url).netloc

    def requiresLogin(self, url):
        return False

    def performLogin(self, url):
        pass

    def setLogin(self, login):
        self.login = login

    def setPassword(self, password):
        self.password = password

    def extractIndividualUrls(self):
        data = u2.urlopen(self.url).read()
        soup = bs.BeautifulStoneSoup(data)

        title = soup.find('title').string
        self.storyName = title.split('::')[0].strip()

        author = soup.find('span', {'class' : 'author'})
        self.authorName = author.a.string

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        select = soup.find('select', { 'name' : 'goto' } )
        allOptions = select.findAll('option')
        result = []
        for o in allOptions:
            url = o['value']
            # if type(url) is unicode:
            #     url = url.encode('utf-8')
            title = o.string
            result.append((url, title))
        return result

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName

    def getText(self, url):
        print(type(url))
        if url.find('http://') == -1:
            url = 'http://' + self.host + '/' + url
        data = u2.urlopen(url).read()
        soup = bs.BeautifulStoneSoup(data)
        div = soup.find('div', {'id' : 'storytext'})
        if div is None:
            return '<html/>'
        return div.prettify()

    def getPrintableUrl(self, url):
        return url

    def getPasswordLine(self):
        return 'opaopapassword'

    def getLoginScript(self):
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        login = dict(login = 'name', password = 'pass')
        other = dict(submit = 'Log In', remember='yes')
        return (login, other)
if __name__ == '__main__':
    # the FicWad adapter fetches pages itself, so it only needs the story url
    url = 'http://www.ficwad.com/story/14536'
    fw = FicWad(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    print(fw.getText(urls[0][0]))
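
extractIndividualUrls above leans on three pieces of page markup: the <title>, the author <span>, and the goto <select>. A small sketch against an invented snippet shaped the same way:

import BeautifulSoup as bs

# invented snippet shaped like a FicWad story page
sample = '''<html><head><title>My Story :: FicWad</title></head>
<body><span class="author">by <a href="/a/1">someauthor</a></span>
<select name="goto">
  <option value="/story/14536">1. Prologue</option>
  <option value="/story/14537">2. Chapter One</option>
</select></body></html>'''

soup = bs.BeautifulStoneSoup(sample)
print(soup.find('title').string.split('::')[0].strip())      # My Story
print(soup.find('span', {'class' : 'author'}).a.string)      # someauthor
print([(o['value'], o.string) for o in soup.find('select', {'name' : 'goto'}).findAll('option')])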

136
output.py Normal file

@@ -0,0 +1,136 @@
import os
import re
import sys
import cgi
import uuid
import shutil
import base64
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
from constants import *
class FanficWriter:
    def __init__(self):
        pass

    def writeChapter(self, title, text):
        pass

    def finalise(self):
        pass

class HTMLWriter(FanficWriter):
    def __init__(self, base, name, author):
        pass

    def writeChapter(self, title, text):
        pass

    def finalise(self):
        pass

class EPubFanficWriter(FanficWriter):
    def __init__(self, base, name, author):
        self.chapters = []  # (title, fileName) pairs, filled in by writeChapter
        self.basePath = base
        self.name = name.replace(" ", "_")
        self.storyTitle = name
        self.directory = self.basePath + '/' + self.name
        self.authorName = author

        if os.path.exists(self.directory):
            shutil.rmtree(self.directory)

        os.mkdir(self.directory)
        os.mkdir(self.directory + '/META-INF')
        os.mkdir(self.directory + '/OEBPS')

        print >> open(self.directory + '/mimetype', 'w'), MIMETYPE
        print >> open(self.directory + '/META-INF/container.xml', 'w'), CONTAINER
        print >> open(self.directory + '/OEBPS/stylesheet.css', 'w'), CSS

    def _removeEntities(self, text):
        for e in entities:
            v = entities[e]
            text = text.replace(e, v)
        return text

    def writeChapter(self, title, text):
        fileName = base64.b64encode(title) + ".xhtml"
        filePath = self.directory + "/OEBPS/" + fileName
        f = open(filePath, 'w')

        text = self._removeEntities(text)

        self.soup = bs.BeautifulStoneSoup(text)

        # strip every attribute that is not in the whitelist
        allTags = self.soup.findAll(recursive=True)
        for t in allTags:
            for attr in t._getAttrMap().keys():
                if attr not in acceptable_attributes:
                    del t[attr]

        # drop tags whose only content is whitespace or a bare &nbsp;
        allPs = self.soup.findAll(recursive=True)
        for p in allPs:
            if p.string != None and (len(p.string.strip()) == 0 or p.string.strip() == '&nbsp;'):
                p.extract()

        # turn <br> and <hr> elements into <p> elements
        allBrs = self.soup.findAll(recursive=True, name = ["br", "hr"])
        for br in allBrs:
            if (br.string != None and len(br.string.strip()) != 0) or (br.contents != None):
                br.name = 'p'

        # cleanup(self.soup)

        text = self.soup.prettify()

        print >> f, XHTML_START % (title, title)
        print >> f, text
        print >> f, XHTML_END

        self.chapters.append((title, fileName))

    def finalise(self):
        ### writing table of contents -- ncx file
        tocFilePath = self.directory + "/OEBPS/toc.ncx"
        toc = open(tocFilePath, 'w')
        print >> toc, TOC_START % self.storyTitle

        ### writing content -- opf file
        opfFilePath = self.directory + "/OEBPS/content.opf"
        opf = open(opfFilePath, 'w')
        print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName)

        ids = []
        i = 0
        for t, f in self.chapters:
            chapterId = base64.b64encode(t)
            print >> toc, TOC_ITEM % (chapterId, i, cgi.escape(t), f)
            print >> opf, CONTENT_ITEM % (chapterId, f)
            ids.append(chapterId)
            i = i + 1

        print >> toc, TOC_END
        print >> opf, CONTENT_END_MANIFEST

        for chapterId in ids:
            print >> opf, CONTENT_ITEMREF % chapterId

        print >> opf, CONTENT_END

        # make sure the generated toc.ncx and content.opf are flushed to disk
        toc.close()
        opf.close()
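
The writer above leaves an exploded EPUB directory tree; this commit never packs it into a single .epub file. A sketch of that final step (not part of this code), following the usual convention of storing the mimetype entry first and uncompressed:

# not part of this commit -- a sketch of zipping the exploded directory into an .epub
import os
import zipfile

def pack_epub(directory, epub_path):
    z = zipfile.ZipFile(epub_path, 'w', zipfile.ZIP_DEFLATED)
    # the mimetype entry must come first and be stored uncompressed
    z.write(os.path.join(directory, 'mimetype'), 'mimetype', zipfile.ZIP_STORED)
    for root, dirs, files in os.walk(directory):
        for name in files:
            if name == 'mimetype':
                continue
            full = os.path.join(root, name)
            z.write(full, os.path.relpath(full, directory))
    z.close()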