mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 00:43:00 +01:00
Whole lot of fixes related to appengine
This commit is contained in:
commit
1f897843e0
12 changed files with 2851 additions and 0 deletions
1711
BeautifulSoup.py
Normal file
1711
BeautifulSoup.py
Normal file
File diff suppressed because it is too large
Load diff
0
__init__.py
Normal file
0
__init__.py
Normal file
32
adapter.py
Normal file
32
adapter.py
Normal file
|
|
@@ -0,0 +1,32 @@
|
|||
class FanfictionSiteAdapter:
    """Base interface for site-specific story adapters.

    Concrete adapters (FFA, FFNet, FicWad, ...) override every hook below.
    The default implementations do nothing and return None.
    """

    # Credentials, populated via setLogin()/setPassword() in subclasses.
    login = ''
    password = ''

    def __init__(self, url):
        """Remember nothing by default; subclasses parse *url* themselves."""

    def requiresLogin(self, url = None):
        """Report whether the story needs an authenticated session."""

    def performLogin(self, url = None):
        """Authenticate against the site."""

    def extractIndividualUrls(self):
        """Return a list of (chapter_url, chapter_title) pairs."""

    def getText(self, url):
        """Return the chapter body for *url*."""

    def setLogin(self, login):
        """Record the login name to use."""

    def setPassword(self, password):
        """Record the password to use."""

    def getStoryName(self):
        """Return the story title."""

    def getAuthorName(self):
        """Return the author name."""

    def getPrintableUrl(self, url):
        """Return a user-displayable form of *url*."""
|
||||
135
constants.py
Normal file
135
constants.py
Normal file
|
|
@@ -0,0 +1,135 @@
|
|||
# Shared templates and tables for the EPUB / FB2 writers (see output.py).

# Stylesheet embedded into every generated EPUB as OEBPS/stylesheet.css.
CSS = '''body { margin-left: 5%; margin-right: 5%; margin-top: 5%; margin-bottom: 5%; text-align: justify; }
pre { font-size: x-small; }
h1 { text-align: center; }
h2 { text-align: center; }
h3 { text-align: center; }
h4 { text-align: center; }
h5 { text-align: center; }
h6 { text-align: center; }
.CI {
text-align:center;
margin-top:0px;
margin-bottom:0px;
padding:0px;
}
.center {text-align: center;}
.smcap {font-variant: small-caps;}
.u {text-decoration: underline;}
.bold {font-weight: bold;}
'''

# Content of the mandatory 'mimetype' member of an EPUB archive.
MIMETYPE = '''application/epub+zip'''

# META-INF/container.xml: points readers at the OPF package file.
CONTAINER = '''<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
'''

# Opening of OEBPS/content.opf; %-substituted with (uuid, title, author).
CONTENT_START = '''<?xml version="1.0"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf"
unique-identifier="BookId-Epub-%s">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:opf="http://www.idpf.org/2007/opf">
<dc:title>%s</dc:title>
<dc:creator opf:role="aut">%s</dc:creator>
<dc:language>en-UK</dc:language>
<dc:rights></dc:rights>
<dc:publisher>sgzmd</dc:publisher>
<dc:identifier id="BookId">urn:uuid:sigizmund.com062820072147132</dc:identifier>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="style" href="stylesheet.css" media-type="text/css" />
'''

# One manifest entry per chapter; %-substituted with (id, href).
CONTENT_ITEM = '<item id="%s" href="%s" media-type="application/xhtml+xml" />'

# Closes the manifest and opens the spine.
CONTENT_END_MANIFEST = '''</manifest>
<spine toc="ncx">
'''

# One spine entry per chapter; %-substituted with the chapter id.
CONTENT_ITEMREF = '''<itemref idref="%s" />'''

# Closes the spine and the package document.
CONTENT_END = '''</spine>
</package>
'''

# Opening of OEBPS/toc.ncx; %-substituted with the story title.
TOC_START = '''<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="sigizmund.com062820072147132"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
'''

# One navMap entry per chapter; %-substituted with (id, order, title, href).
TOC_ITEM = '''<navPoint id="%s" playOrder="%d">
<navLabel>
<text>%s</text>
</navLabel>
<content src="%s"/>
</navPoint>
'''

# Closes the navMap and the NCX document.
TOC_END = '''</navMap>
</ncx>
'''

# Per-chapter XHTML skeleton; %-substituted with (title, title).
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>%s</title>
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
</head>
<body>
<div>
<h3>%s</h3>
'''

XHTML_END = '''</div>
</body>
</html>
'''

# HTML tag whitelist used when sanitising scraped chapter markup.
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
'blockquote', 'br', 'center', 'cite', 'code', 'col',
'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em',
'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i',
'ins', 'kbd', 'label', 'li', 'ol',
'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike',
'strong', 'sub', 'sup', 'u', 'ul']

# Attribute whitelist applied by output.EPubFanficWriter.writeChapter().
acceptable_attributes = ['href']

# Character-replacement table consumed by output._removeEntities().
# NOTE(review): the keys were most likely literal HTML entity references
# (e.g. '&#8211;', '&quot;') that this scraped copy has rendered into the
# characters below -- the final '"' -> '"' self-mapping only makes sense
# for an original '&quot;' key.  Verify against the upstream repository.
entities = { '–' : ' - ', '—' : ' - ', '”' : '"', '“' : '"', '’' : '\'', '‘' : '\'', '"' : '"' }

# Minimal FictionBook 2 skeleton fragments (FB2 output path).
FB2_PROLOGUE = '<FictionBook>'
# %-substituted with (author, title, date, date, id).
FB2_DESCRIPTION = '''<description>
<title-info>
<genre>fanfiction</genre>
<author>
<first-name></first-name>
<middle-name></middle-name>
<last-name>%s</last-name>
</author>
<book-title>%s</book-title>
<lang>eng</lang>
</title-info>
<document-info>
<author>
<nickname>sgzmd</nickname>
</author>
<date value="%s">%s</date>
<id>sgzmd_%s</id>
<version>2.0</version>
</document-info>
</description>'''
|
||||
103
downaloder.py
Normal file
103
downaloder.py
Normal file
|
|
@@ -0,0 +1,103 @@
|
|||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import getpass
|
||||
import logging
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
|
||||
import ffa
|
||||
import ffnet
|
||||
import ficwad
|
||||
import output
|
||||
import fictionalley
|
||||
|
||||
class FanficLoader:
    """Controller that wires a site adapter to an output writer: logs in
    when needed, pulls every chapter and streams it into the writer."""

    # Default directory finished books are written into.
    booksDirectory = "books"

    def __init__(self, adapter, writerClass, quiet = False, inmemory = False, compress=True):
        self.adapter = adapter
        self.writerClass = writerClass
        self.quiet = quiet
        self.inmemory = inmemory
        self.compress = compress
        self.badLogin = False

    def download(self):
        """Run the whole download.

        Returns the writer's in-memory output when inmemory is set,
        otherwise None.  On a failed login sets self.badLogin and returns
        None without downloading anything.
        """
        logging.debug("Trying to download the story")

        if self.adapter.requiresLogin():
            logging.debug("Story requires login")
            if not self.adapter.performLogin():
                logging.debug("Login/password problem")
                self.badLogin = True
                return None

        chapterUrls = self.adapter.extractIndividualUrls()
        self.writer = self.writerClass(self.booksDirectory,
                                       self.adapter.getStoryName(),
                                       self.adapter.getAuthorName(),
                                       inmemory=self.inmemory,
                                       compress=self.compress)

        total = len(chapterUrls)
        for index, (chapterUrl, chapterTitle) in enumerate(chapterUrls):
            if not self.quiet:
                print('Downloading chapter %d/%d' % (index, total))
            self.writer.writeChapter(chapterTitle, self.adapter.getText(chapterUrl))

        self.writer.finalise()

        if self.inmemory:
            self.name = self.writer.name
            return self.writer.output.getvalue()
|
||||
|
||||
|
||||
# Command-line entry point.
# Usage: downaloder.py <story-url> <format>   where format is 'epub' or 'html'.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    # Exactly two arguments are expected; anything else raises ValueError.
    (url, format) = sys.argv[1:]

    # Normalise a unicode argv entry to a byte string (Python 2).
    if type(url) is unicode:
        print('URL is unicode')
        url = url.encode('latin1')

    adapter = None
    writerClass = None

    # Pick the site adapter by substring-matching the URL.
    if url.find('fanficauthors') != -1:
        adapter = ffa.FFA(url)
    elif url.find('fictionalley') != -1:
        # FictionAlley support is deliberately disabled.
        # NOTE(review): FictionAlley.__init__ takes no url argument, so this
        # call would raise TypeError before the message below is printed --
        # confirm against fictionalley.py.
        adapter = fictionalley.FictionAlley(url)
        print >> sys.stderr, "FictionAlley adapter is broken, try to find this fic on fanfiction.net or fanficauthors"
        sys.exit(0)
    elif url.find('ficwad') != -1:
        adapter = ficwad.FicWad(url)
    elif url.find('fanfiction.net') != -1:
        adapter = ffnet.FFNet(url)
    else:
        print >> sys.stderr, "Oi! I can haz not appropriate adapter for URL %s!" % url
        sys.exit(1)

    # Pick the output writer; an unrecognised format leaves writerClass as
    # None and fails later when FanficLoader instantiates it.
    if format == 'epub':
        writerClass = output.EPubFanficWriter
    elif format == 'html':
        writerClass = output.HTMLWriter

    # Interactively collect credentials when the site needs a login.
    if adapter.requiresLogin(url):
        print("Meow, URL %s requires you to haz been logged in! Please can I haz this datas?" % url)
        sys.stdout.write("Can I haz ur login? ")
        login = sys.stdin.readline().strip()
        password = getpass.getpass(prompt='Can I haz ur password? ')
        # NOTE(review): echoes the password back to the terminal.
        print("Login: `%s`, Password: `%s`" % (login, password))

        adapter.setLogin(login)
        adapter.setPassword(password)

    loader = FanficLoader(adapter, writerClass)
    loader.download()
|
||||
|
||||
197
ffa.py
Normal file
197
ffa.py
Normal file
|
|
@@ -0,0 +1,197 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import base64
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
||||
try:
|
||||
import login_password
|
||||
except:
|
||||
# tough luck
|
||||
pass
|
||||
|
||||
class FFA(FanfictionSiteAdapter):
    """Adapter for stories hosted on fanficauthors.net subdomains."""

    def __init__(self, url):
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        # Cookie-aware opener so the login session persists across requests.
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())

        logging.debug("Created FFA: url=%s" % (self.url))

    def _getLoginScript(self):
        # The login form posts back to the story's own path.
        return self.path

    def requiresLogin(self, url = None):
        """True when the story page shows the login prompt.

        The url parameter is accepted for interface compatibility but
        ignored; self.url is always checked.
        """
        page = self.opener.open(self.url).read()
        return page.find('<legend>Please login to continue</legend>') != -1

    def performLogin(self, url = None):
        """POST the stored credentials; True when the prompt disappears."""
        if url == None:
            url = self.url

        formFields = {}
        formFields['username'] = self.login
        formFields['password'] = self.password
        formFields['submit'] = 'Submit'

        encoded = u.urlencode(formFields)
        loginUrl = 'http://' + self.host + self._getLoginScript()
        logging.debug("Will now login to URL %s" % loginUrl)

        self.opener.open(loginUrl, encoded)

        return not self.requiresLogin()

    def extractIndividualUrls(self):
        """Parse the chapter list; also fills in author and storyName."""
        soup = bs.BeautifulStoneSoup(self.opener.open(self.url).read())

        self.author = soup.find('a', {'href' : '/contact/'}).string
        self.storyName = str(soup.find('h1', {'class' : 'textCenter'}).contents[0]).strip()

        logging.debug("Story `%s` by `%s`" % (self.storyName, self.author))

        chapterSelect = soup.find('select', {'class' : 'tinput'})
        return [(option['value'], option.string)
                for option in chapterSelect.findAll('option')]

    def getText(self, url):
        """Return the chapter markup between the site's body markers."""
        if url.find('http://') == -1:
            url = 'http://' + self.host + '/' + url

        logging.info('Downloading: %s' % url)
        data = self.opener.open(url).read()

        # Copy the lines between '</div></form>' and the following
        # '<form action="#">' marker.
        collecting = False
        collected = []
        for line in data.split('\n'):
            if line.find('</div></form>') != -1:
                logging.debug('emit = True')
                collecting = True
                continue
            elif line.find('<form action="#">') != -1:
                logging.debug('emit = False')
                if collecting:
                    break
                else:
                    collecting = False

            if collecting:
                collected.append(line)

        return ''.join(line + '\n' for line in collected)

    def setLogin(self, login):
        self.login = login

    def setPassword(self, password):
        self.password = password

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.author

    def getPrintableUrl(self, url):
        return url
|
||||
|
||||
class FFA_UnitTests(unittest.TestCase):
    """Live-site tests for FFA.  These hit fanficauthors.net over the
    network; the authenticated cases need a login_password module."""

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testRequiresLoginNeg(self):
        adapter = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
        self.assertFalse(adapter.requiresLogin())

    def testRequiresLogin(self):
        adapter = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
        self.assertTrue(adapter.requiresLogin())

    def testPerformLogin(self):
        adapter = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')

        if login_password != None:
            adapter.setLogin(login_password.login)
            adapter.setPassword(login_password.password)

        self.assertTrue(adapter.performLogin(None))

    def testExtractURLsAuthorStoryName(self):
        adapter = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
        adapter.extractIndividualUrls()

        self.assertEquals('Draco664', adapter.getAuthorName())
        self.assertEquals('Apprentice Potter', adapter.getStoryName())

    def testExtractUrls(self):
        adapter = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
        chapterUrls = adapter.extractIndividualUrls()
        self.assertEquals(25, len(chapterUrls))

        self.assertEquals('Grievances', chapterUrls[2][1])
        self.assertEquals('/Apprentice_Potter/Prologue/', chapterUrls[0][0])

    def testGetText(self):
        adapter = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
        data = adapter.getText('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')

        self.assertTrue(data.find('smiled slightly, and settled back in her rocking chair') != -1)

    def testGetTextLogin(self):
        url = 'http://viridian.fanficauthors.net/Out_of_the_Darkness_A_Jinchuurikis_Tale/A_Harrowing_Escape/'
        adapter = FFA(url)

        if login_password != None:
            adapter.setLogin(login_password.login)
            adapter.setPassword(login_password.password)

        if adapter.requiresLogin():
            adapter.performLogin()

        data = adapter.getText(url)
        seek = 'So Hokage-sama” I said, “this is how we came'
        self.assertTrue(data.find(seek) != -1)
|
||||
|
||||
# Allow running this module directly to execute the unit tests above.
if __name__ == '__main__':
    unittest.main()
|
||||
162
ffnet.py
Normal file
162
ffnet.py
Normal file
|
|
@@ -0,0 +1,162 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import base64
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
||||
try:
|
||||
import login_password
|
||||
except:
|
||||
# tough luck
|
||||
pass
|
||||
|
||||
try:
|
||||
from google.appengine.api.urlfetch import fetch as googlefetch
|
||||
appEngine = True
|
||||
except:
|
||||
appEngine = False
|
||||
|
||||
class FFNet(FanfictionSiteAdapter):
    """Adapter for fanfiction.net stories.

    Fetches either through a urllib2 cookie-aware opener or, when the
    module-level appEngine flag is set, through Google App Engine's
    urlfetch (googlefetch).
    """

    def __init__(self, url):
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path

        # Placeholders until extractIndividualUrls() parses the real values.
        self.storyName = 'FF.Net story'
        # BUG FIX: this second assignment previously overwrote
        # self.storyName with the author placeholder; getAuthorName()
        # reads self.authorName, which was otherwise unset until
        # extractIndividualUrls() ran.
        self.authorName = 'FF.Net author'

        # Story paths look like /s/<storyId>/<chapter>[/<title-slug>];
        # drop the trailing slug when present.
        spl = self.path.split('/')
        if len(spl) == 5:
            self.path = "/".join(spl[1:-1])

        if self.path.startswith('/'):
            self.path = self.path[1:]

        if self.path.endswith('/'):
            self.path = self.path[:-1]

        (s, self.storyId, chapter) = self.path.split('/')

        logging.debug('self.storyId=%s, chapter=%s' % (self.storyId, chapter))

        if not appEngine:
            self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        else:
            # urlfetch needs no opener object.
            self.opener = None

        logging.debug("Created FF.Net: url=%s" % (self.url))

    def _getLoginScript(self):
        return self.path

    def requiresLogin(self, url = None):
        # fanfiction.net stories are publicly readable.
        return False

    def performLogin(self, url = None):
        return True

    def _fetchUrl(self, url):
        """Fetch *url* through whichever mechanism is available."""
        if not appEngine:
            return self.opener.open(url).read().decode('utf-8')
        else:
            return googlefetch(url).content

    def extractIndividualUrls(self):
        """Return (url, title) pairs for every chapter.

        Also fills in self.storyName and self.authorName as a side effect
        of scraping the page.
        """
        data = self._fetchUrl(self.url)

        urls = []
        lines = data.split('\n')
        for l in lines:
            if l.find("<img src='http://c.fanfiction.net/static/ficons/script.png' width=16 height=16 border=0 align=absmiddle>") != -1:
                s2 = bs.BeautifulStoneSoup(l)
                self.storyName = s2.find('b').string
            elif l.find("<a href='/u/") != -1:
                s2 = bs.BeautifulStoneSoup(l)
                self.authorName = s2.a.string
            elif l.find("<SELECT title='chapter navigation'") != -1:
                # The chapter <SELECT> appears more than once per page;
                # only parse the first occurrence.
                if len(urls) > 0:
                    continue
                u = l.decode('utf-8')
                # Strip numeric character references before parsing.
                u = re.sub('&\#[0-9]+;', ' ', u)
                s2 = bs.BeautifulSoup(u)
                options = s2.findAll('option')
                for o in options:
                    url = 'http://fanfiction.net/s/' + self.storyId + '/' + o['value']
                    title = o.string
                    logging.debug('URL = `%s`, Title = `%s`' % (url, title))
                    urls.append((url,title))

        return urls

    def getText(self, url):
        """Return the prettified chapter body div.

        Returns None when the '<!-- start story -->' marker is missing
        from the page; callers should be prepared for that.
        """
        data = self._fetchUrl(url)
        lines = data.split('\n')
        for l in lines:
            if l.find('<!-- start story -->') != -1:
                s2 = bs.BeautifulStoneSoup(l)
                return s2.div.prettify()

    def setLogin(self, login):
        self.login = login

    def setPassword(self, password):
        self.password = password

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName
|
||||
|
||||
class FFA_UnitTests(unittest.TestCase):
    """Live-site tests for the FFNet adapter (require network access)."""

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testChaptersAuthStory(self):
        adapter = FFNet('http://www.fanfiction.net/s/5257563/1')
        adapter.extractIndividualUrls()

        self.assertEquals('Beka0502', adapter.getAuthorName())
        self.assertEquals("Draco's Redemption", adapter.getStoryName())

    def testChaptersCountNames(self):
        adapter = FFNet('http://www.fanfiction.net/s/5257563/1')
        chapterUrls = adapter.extractIndividualUrls()

        self.assertEquals(8, len(chapterUrls))

    def testGetText(self):
        url = 'http://www.fanfiction.net/s/5257563/1'
        adapter = FFNet(url)
        text = adapter.getText(url)
        self.assertTrue(text.find('He was just about to look at some photos when he heard a crack') != -1)

    def testBrokenWands(self):
        # Only checks that a long multi-chapter story can be fetched and
        # parsed without raising; no assertions on the content.
        url = 'http://www.fanfiction.net/s/1527263/30/Harry_Potter_and_Broken_Wands'
        adapter = FFNet(url)
        text = adapter.getText(url)

        chapterUrls = adapter.extractIndividualUrls()
|
||||
|
||||
|
||||
# Allow running this module directly to execute the unit tests above.
if __name__ == '__main__':
    unittest.main()
|
||||
75
fictionalley.py
Normal file
75
fictionalley.py
Normal file
|
|
@@ -0,0 +1,75 @@
|
|||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
|
||||
class FictionAlley:
    """Scraper for fictionalley.org story pages.

    NOTE(review): unlike the other adapters this class does not subclass
    FanfictionSiteAdapter (adapter.py is not imported in this file), and
    self.authorName is never set internally -- callers must assign it
    before extractIndividualUrls() is called (see the __main__ harness).
    """

    def __init__(self):
        pass

    def extractIndividualUrls(self, data, host, contents):
        """Parse a pre-fetched story page and return (url, title) pairs.

        *data* is the raw page HTML; *host* and *contents* are accepted
        for interface compatibility but unused.  Sets self.storyName.
        """
        soup = bs.BeautifulStoneSoup(data)

        # Page titles look like "<prefix> - <story name>"; keep everything
        # after the first dash.
        title = soup.find('title').string
        self.storyName = "-".join(title.split('-')[1:]).strip()

        # BUG FIX: removed a dead "authors = soup.findAll('a')" lookup
        # whose result was never read.

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        links = soup.findAll('a', { 'class' : 'chapterlink' } )

        result = []
        for a in links:
            url = a['href']
            title = a.string
            result.append((url,title))

        return result

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName

    def getText(self, data, fetch = False):
        """Return the prettified story div from pre-fetched page HTML,
        or a placeholder document when the div is missing."""
        soup = bs.BeautifulStoneSoup(data)
        div = soup.find('div', {'id' : 'storytext'})
        if None == div:
            return '<html/>'

        return div.prettify()

    def getPrintableUrl(self, url):
        return url

    def getPasswordLine(self):
        # Placeholder value; FictionAlley login is not implemented.
        return 'opaopapassword'

    def getLoginScript(self):
        # Placeholder value; FictionAlley login is not implemented.
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        """Return (credential-field map, extra-form-field map) templates."""
        login = dict(login = 'name', password = 'pass')
        other = dict(submit = 'Log In', remember='yes')
        return (login, other)
|
||||
|
||||
|
||||
# Ad-hoc smoke test against a live FictionAlley author page.  Note that
# authorName is assigned externally here -- the class never sets it.
if __name__ == '__main__':
    url = 'http://www.fictionalley.org/authors/drt/DA.html'
    data = u2.urlopen(url).read()
    host = up.urlparse(url).netloc
    fw = FictionAlley()
    fw.authorName = 'DrT'
    urls = fw.extractIndividualUrls(data, host, url)
    pp.pprint(urls)
    print(fw.getText(data))
|
||||
98
ficwad.py
Normal file
98
ficwad.py
Normal file
|
|
@@ -0,0 +1,98 @@
|
|||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
|
||||
from adapter import *
|
||||
|
||||
class FicWad(FanfictionSiteAdapter):
    """Adapter for stories hosted on ficwad.com."""

    def __init__(self, url):
        self.url = url
        self.host = up.urlparse(url).netloc

    def requiresLogin(self, url):
        # FicWad stories are publicly readable.
        return False

    def performLogin(self, url):
        pass

    def setLogin(self, login):
        self.login = login

    def setPassword(self, password):
        self.password = password

    def extractIndividualUrls(self):
        """Fetch the story page and return (url, title) chapter pairs.

        Also fills in self.storyName and self.authorName.
        """
        page = u2.urlopen(self.url).read()
        soup = bs.BeautifulStoneSoup(page)

        # Titles look like "<story name> :: <rest>".
        self.storyName = soup.find('title').string.split('::')[0].strip()
        self.authorName = soup.find('span', {'class' : 'author'}).a.string

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        chapterSelect = soup.find('select', { 'name' : 'goto' } )
        return [(option['value'], option.string)
                for option in chapterSelect.findAll('option')]

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName

    def getText(self, url):
        """Fetch a chapter page and return its prettified story div, or a
        placeholder document when the div is missing."""
        if url.find('http://') == -1:
            url = 'http://' + self.host + '/' + url

        page = u2.urlopen(url).read()

        storyDiv = bs.BeautifulStoneSoup(page).find('div', {'id' : 'storytext'})
        if None == storyDiv:
            return '<html/>'

        return storyDiv.prettify()

    def getPrintableUrl(self, url):
        return url

    def getPasswordLine(self):
        # Placeholder value; FicWad login is not implemented.
        return 'opaopapassword'

    def getLoginScript(self):
        # Placeholder value; FicWad login is not implemented.
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        """Return (credential-field map, extra-form-field map) templates."""
        login = dict(login = 'name', password = 'pass')
        other = dict(submit = 'Log In', remember='yes')
        return (login, other)
|
||||
|
||||
|
||||
# Ad-hoc smoke test against a live FicWad story.
if __name__ == '__main__':
    url = 'http://www.ficwad.com/story/14536'
    # NOTE(review): fetched but unused since the fix below; kept so the
    # harness still exercises the plain fetch path.
    data = u2.urlopen(url).read()
    host = up.urlparse(url).netloc
    fw = FicWad(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    # BUG FIX: FicWad.getText() expects a chapter URL (it fetches the page
    # itself); it was previously handed the raw page bytes in `data`,
    # which cannot be opened as a URL.
    print(fw.getText(url))
|
||||
17
html_constants.py
Normal file
17
html_constants.py
Normal file
|
|
@@ -0,0 +1,17 @@
|
|||
# string.Template fragments (${...} placeholders) used by
# output.HTMLWriter to build the single-file XHTML rendition of a story.

# Document head and opening body; expects ${title}, ${author} and ${body}.
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${title} by ${author}</title>
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
</head>
<body>
<div>
<h1>${title} by ${author}</h1>
${body}
</body></html>
'''

# Heading emitted before each chapter; expects ${chapter}.
XHTML_CHAPTER_START = '''<h2>${chapter}</h2>'''

# Nothing to append -- XHTML_START already closes the document.
XHTML_END = ''''''
|
||||
252
output.py
Normal file
252
output.py
Normal file
|
|
@@ -0,0 +1,252 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import codecs
|
||||
import shutil
|
||||
import string
|
||||
import base64
|
||||
import os.path
|
||||
import zipfile
|
||||
import StringIO
|
||||
import logging
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
|
||||
import zipdir
|
||||
import html_constants
|
||||
from constants import *
|
||||
|
||||
|
||||
|
||||
class FanficWriter:
    """Base interface for story writers; concrete writers override all
    three hooks.  Default implementations do nothing."""

    def __init__(self):
        """No shared state to initialise."""

    def writeChapter(self, title, text):
        """Append one chapter to the output."""

    def finalise(self):
        """Flush and close the output."""
|
||||
|
||||
class HTMLWriter(FanficWriter):
    """Renders the whole story as one flat XHTML document, either to a
    file under basePath or into an in-memory buffer."""

    # Accumulated chapter markup, joined into the template in finalise().
    body = ''

    def __init__(self, base, name, author, inmemory=False, compress=False):
        self.basePath = base
        self.name = name.replace(" ", "_")
        self.storyTitle = name
        self.fileName = self.basePath + '/' + self.name + '.html'
        self.authorName = author
        self.inmemory = inmemory

        # Start from a clean slate when writing to disk.
        if not self.inmemory and os.path.exists(self.fileName):
            os.remove(self.fileName)

        self.output = StringIO.StringIO() if self.inmemory else open(self.fileName, 'w')

        self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
        self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)

    def writeChapter(self, title, text):
        """Append one chapter (utf-8 byte strings) to the pending body."""
        title = title.decode('utf-8')
        text = text.decode('utf-8')
        self.body += '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
        self.body += '\n' + text

    def finalise(self):
        """Substitute the accumulated body into the page template,
        pretty-print it and write it to the output target."""
        page = self.xhtmlTemplate.substitute({'title' : self.storyTitle,
                                              'author' : self.authorName,
                                              'body' : self.body})
        pretty = bs.BeautifulSoup(page).prettify()

        self.output.write(pretty)
        if not self.inmemory:
            self.output.close()
|
||||
|
||||
class EPubFanficWriter(FanficWriter):
    """Writes a story as an EPUB: one XHTML member per chapter plus the
    OPF/NCX metadata, assembled in memory and zipped by zipdir.inMemoryZip().
    """

    # Class-level defaults; __init__ rebinds both to fresh per-instance
    # objects, so these class attributes are never shared in practice.
    chapters = []
    files = {}

    def _writeFile(self, fileName, data):
        """Append *data* (utf-8 byte string) to the named archive member,
        creating its backing StringIO buffer / disk file on first use."""
        if fileName in self.files:
            self.files[fileName].write(data.decode('utf-8'))
        else:
            if self.inmemory:
                self.files[fileName] = StringIO.StringIO()
            else:
                self.files[fileName] = open(self.directory + '/' + fileName, 'w')

            # Retry now that the handle exists.
            self._writeFile(fileName, data)

    def _closeFiles(self):
        # In-memory buffers are left open so finalise() can still read
        # their contents via getvalue().
        if not self.inmemory:
            for f in self.files:
                self.files[f].close()

    def __init__(self, base, name, author, inmemory=False, compress=True):
        self.basePath = base
        self.name = name.replace(" ", "_")
        self.storyTitle = name
        self.directory = self.basePath + '/' + self.name
        self.inmemory = inmemory
        self.authorName = author

        self.files = {}
        self.chapters = []

        # Force in-memory assembly even for on-disk output; writeToFile
        # remembers that the finished zip must still be dumped to disk.
        if not self.inmemory:
            self.inmemory = True
            self.writeToFile = True
        else:
            self.writeToFile = False

        # NOTE(review): self.inmemory is always True past this point, so
        # this on-disk directory skeleton is dead code -- presumably an
        # App Engine accommodation (no filesystem writes).  TODO confirm.
        if not self.inmemory:
            if os.path.exists(self.directory):
                shutil.rmtree(self.directory)

            os.mkdir(self.directory)

            os.mkdir(self.directory + '/META-INF')
            os.mkdir(self.directory + '/OEBPS')

        # Fixed EPUB boilerplate members (see constants.py).
        self._writeFile('mimetype', MIMETYPE)
        self._writeFile('META-INF/container.xml', CONTAINER)
        self._writeFile('OEBPS/stylesheet.css', CSS)

    def _removeEntities(self, text):
        """Apply the replacement table from constants.entities to *text*."""
        for e in entities:
            v = entities[e]
            text = text.replace(e, v)

        # NOTE(review): replacing '&' with itself is a no-op; this was
        # most likely "'&amp;' -> '&'" before the source was HTML-mangled.
        text = text.replace('&', '&')

        return text

    def writeChapter(self, title, text):
        """Sanitise one chapter's markup and store it as an OEBPS member.

        The member name is the base64 of the title ('/' mapped to '_' so
        it stays a single path component).
        """
        fileName = base64.b64encode(title).replace('/', '_') + ".xhtml"
        # NOTE(review): filePath is computed but never used below.
        filePath = self.directory + "/OEBPS/" + fileName

        fn = 'OEBPS/' + fileName

        text = self._removeEntities(text)

        self.soup = bs.BeautifulStoneSoup(text)

        # Strip every attribute that is not whitelisted (constants.py).
        allTags = self.soup.findAll(recursive=True)
        for t in allTags:
            for attr in t._getAttrMap().keys():
                if attr not in acceptable_attributes:
                    del t[attr]

        # Drop tags whose entire text content is blank.
        # NOTE(review): the ' ' comparison was probably '&nbsp;' before the
        # source was HTML-mangled.
        allPs = self.soup.findAll(recursive=True)
        for p in allPs:
            if p.string != None and (len(p.string.strip()) == 0 or p.string.strip() == ' ' ) :
                p.extract()

        # Rewrite non-empty <br>/<hr> tags as paragraphs.
        allBrs = self.soup.findAll(recursive=True, name = ["br", "hr"])
        for br in allBrs:
            if (br.string != None and len(br.string.strip()) != 0) or (br.contents != None):
                br.name = 'p'

        text = self.soup.prettify()

        tt = self._removeEntities(title)

        # Wrap the sanitised body in the chapter XHTML skeleton.
        self._writeFile(fn, XHTML_START % (tt, tt))
        self._writeFile(fn, text)
        self._writeFile(fn, XHTML_END)

        self.chapters.append((title, fileName))

    def finalise(self):
        """Emit the NCX/OPF metadata and produce the final .epub zip."""
        logging.debug("Finalising...")

        # Table of contents (NCX).
        tocFilePath = "OEBPS/toc.ncx"
        self._writeFile(tocFilePath, TOC_START % self.storyTitle)

        # Package document (OPF).
        opfFilePath = "OEBPS/content.opf"
        self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName))

        ids = []

        # One TOC navPoint and one manifest item per chapter.
        i = 0
        for t,f in self.chapters:
            chapterId = base64.b64encode(t)
            self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, cgi.escape(t), f))
            self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))

            ids.append(chapterId)

            i = i + 1

        self._writeFile(tocFilePath, TOC_END)
        self._writeFile(opfFilePath, CONTENT_END_MANIFEST)

        # Spine entries, in chapter order.
        for chapterId in ids:
            self._writeFile(opfFilePath, CONTENT_ITEMREF % chapterId)

        self._writeFile(opfFilePath, CONTENT_END)

        self._closeFiles()

        filename = self.directory + '.epub'

        zipdata = zipdir.inMemoryZip(self.files)

        if self.writeToFile:
            f = open(filename, 'w')
            f.write(zipdata.getvalue())
            f.close()
        else:
            # Expose the in-memory zip for callers (see FanficLoader).
            self.output = zipdata
|
||||
69
zipdir.py
Normal file
69
zipdir.py
Normal file
|
|
@@ -0,0 +1,69 @@
|
|||
import os
|
||||
import zipfile
|
||||
import logging
|
||||
|
||||
import StringIO
|
||||
|
||||
def toZip(filename, directory):
    """Zip the top-level contents of *directory* into *filename*.

    Dot-prefixed entries are skipped; subdirectories are added recursively
    via addFolderToZip().
    """
    archive = zipfile.ZipFile(filename, "w", compression=zipfile.ZIP_DEFLATED)

    for entry in os.listdir(directory):
        if entry.startswith('.'):
            continue

        fullPath = os.path.join(directory, entry)
        print(fullPath)

        if os.path.isfile(fullPath):
            print(fullPath)
            archive.write(fullPath, arcname=entry)
        else:
            addFolderToZip(archive, entry, fullPath)

    archive.close()
|
||||
|
||||
def addFolderToZip(zippedHelp, folder, fpath):
    """Recursively add the directory at *fpath* to the open ZipFile
    *zippedHelp*, storing entries under the archive prefix *folder*."""
    if folder == '.' or folder == '..':
        return

    folderFiles = os.listdir(fpath)
    for f in folderFiles:
        fullPath = fpath + '/' + f
        if os.path.isfile(fullPath):
            zippedHelp.write(fullPath, folder + '/' + f, zipfile.ZIP_DEFLATED)
        # BUG FIX: the directory test used the bare name ('os.path.isdir(f)'),
        # which is resolved against the CWD instead of fpath, so nested
        # directories were silently skipped (or mis-detected).
        elif os.path.isdir(fullPath):
            # BUG FIX: the recursive call passed only two arguments to this
            # three-argument function and always raised TypeError; pass the
            # extended archive prefix and the full path instead.
            addFolderToZip(zippedHelp, folder + '/' + f, fullPath)
|
||||
|
||||
def inMemoryZip(files):
    """Build a zip archive entirely in memory.

    *files* maps archive paths to either plain strings or file-like
    objects exposing getvalue() (e.g. StringIO buffers).  Returns the
    StringIO buffer containing the finished archive.
    """
    zipBuffer = StringIO.StringIO()
    archive = zipfile.ZipFile(zipBuffer, 'a', compression=zipfile.ZIP_DEFLATED)
    archive.debug = 3

    for path in files:
        content = files[path]
        # Buffers are drained via getvalue(); strings are used as-is.
        if type(content) != type('str'):
            content = content.getvalue()

        archive.writestr(path, content.encode('utf-8'))

    # Mark each entry as created on a DOS/FAT system (create_system=0).
    for entry in archive.filelist:
        entry.create_system = 0

    archive.close()

    return zipBuffer
|
||||
|
||||
# Ad-hoc smoke test: build a two-entry zip in memory and dump it to disk.
if __name__ == '__main__':
    files = {'test.txt' : 'test', 'data/abc.txt' : 'abc'}
    data = inMemoryZip(files)
    f = open('res.zip', 'w')
    # BUG FIX: inMemoryZip() returns a StringIO buffer; writing the buffer
    # object itself raised TypeError.  Write its contents instead.
    f.write(data.getvalue())
    f.close()
|
||||
Loading…
Reference in a new issue