mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 08:52:55 +01:00
FFA changed, architecture changed, not quite ready yet
This commit is contained in:
parent
a9748061f0
commit
6ef95c634d
5 changed files with 174 additions and 151 deletions
|
|
@ -83,7 +83,7 @@ TOC_END = '''</navMap>
|
||||||
</ncx>
|
</ncx>
|
||||||
'''
|
'''
|
||||||
|
|
||||||
XHTML_START = '''<?xml version="1.0" encoding="iso-8859-1"?>
|
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
<head>
|
<head>
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,10 @@ class FanficLoader:
|
||||||
urls = self.adapter.extractIndividualUrls()
|
urls = self.adapter.extractIndividualUrls()
|
||||||
self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName())
|
self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName())
|
||||||
|
|
||||||
|
i = 0
|
||||||
for u,n in urls:
|
for u,n in urls:
|
||||||
|
print('Downloading chapter %d/%d' % (i, len(urls)))
|
||||||
|
i = i+1
|
||||||
text = self.adapter.getText(u)
|
text = self.adapter.getText(u)
|
||||||
self.writer.writeChapter(n, text)
|
self.writer.writeChapter(n, text)
|
||||||
|
|
||||||
|
|
|
||||||
289
ffa.py
289
ffa.py
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
@ -6,182 +8,183 @@ import uuid
|
||||||
import shutil
|
import shutil
|
||||||
import base64
|
import base64
|
||||||
import os.path
|
import os.path
|
||||||
|
import logging
|
||||||
|
import unittest
|
||||||
import urllib as u
|
import urllib as u
|
||||||
import pprint as pp
|
import pprint as pp
|
||||||
import urllib2 as u2
|
import urllib2 as u2
|
||||||
|
import login_password
|
||||||
import urlparse as up
|
import urlparse as up
|
||||||
import BeautifulSoup as bs
|
import BeautifulSoup as bs
|
||||||
import htmlentitydefs as hdefs
|
import htmlentitydefs as hdefs
|
||||||
|
|
||||||
from constants import *
|
from constants import *
|
||||||
|
|
||||||
from ficwad import *
|
|
||||||
|
|
||||||
class FFA:
|
class FFA:
|
||||||
storyName = None
|
def __init__(self, url):
|
||||||
|
self.url = url
|
||||||
|
parsedUrl = up.urlparse(url)
|
||||||
|
self.host = parsedUrl.netloc
|
||||||
|
self.path = parsedUrl.path
|
||||||
|
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||||
|
|
||||||
def __init__(self):
|
logging.debug("Created FFA: url=%s" % (self.url))
|
||||||
self.grabUrl = re.compile('(\<option.+value=\")(.+?)\"\>(.+?)\<')
|
|
||||||
self.grabAuthor = re.compile('.+pemail.+\'(\w+)')
|
|
||||||
|
|
||||||
def getPasswordLine(self):
|
def _getLoginScript(self):
|
||||||
return '<input type="password" name="pass"'
|
return self.path
|
||||||
|
|
||||||
|
def requiresLogin(self, url = None):
|
||||||
|
resp = self.opener.open(self.url)
|
||||||
|
data = resp.read()
|
||||||
|
if data.find('<legend>Please login to continue</legend>') != -1:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def performLogin(self, url = None):
|
||||||
|
if url == None:
|
||||||
|
url = self.url
|
||||||
|
|
||||||
def getLoginScript(self):
|
data = {}
|
||||||
return '/scripts/login.php'
|
|
||||||
|
|
||||||
def getLoginPasswordOthers(self):
|
data['username'] = self.login
|
||||||
login = dict(login = 'name', password = 'pass')
|
data['password'] = self.password
|
||||||
other = dict(submit = 'Log In', remember='yes')
|
data['submit'] = 'Submit'
|
||||||
return (login, other)
|
|
||||||
|
urlvals = u.urlencode(data)
|
||||||
|
loginUrl = 'http://' + self.host + self._getLoginScript()
|
||||||
|
logging.debug("Will now login to URL %s" % loginUrl)
|
||||||
|
|
||||||
|
req = self.opener.open(loginUrl, urlvals)
|
||||||
|
|
||||||
|
if self.requiresLogin():
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
def getPrintableUrl(self, url):
|
def extractIndividualUrls(self):
|
||||||
return url + '?print=yes'
|
data = self.opener.open(self.url).read()
|
||||||
|
soup = bs.BeautifulStoneSoup(data)
|
||||||
def _findIndex(self, lines, what, start):
|
|
||||||
for i in range(start, len(lines)):
|
self.author = soup.find('a', {'href' : '/contact/'}).string
|
||||||
if lines[i].find(what) != -1:
|
self.storyName = str(soup.find('h1', {'class' : 'textCenter'}).contents[0]).strip()
|
||||||
return i
|
|
||||||
return -1
|
logging.debug("Story `%s` by `%s`" % (self.storyName, self.author))
|
||||||
|
|
||||||
def extractIndividualUrls(self, data, host, first, fetch = False):
|
selector = soup.find('select', {'class' : 'tinput'})
|
||||||
|
options = selector.findAll('option')
|
||||||
|
|
||||||
|
urls = []
|
||||||
|
|
||||||
|
for o in options:
|
||||||
|
title = o.string
|
||||||
|
url = o['value']
|
||||||
|
|
||||||
|
urls.append((url,title))
|
||||||
|
|
||||||
|
return urls
|
||||||
|
|
||||||
|
def getText(self, url):
|
||||||
|
if url.find('http://') == -1:
|
||||||
|
url = 'http://' + self.host + '/' + url
|
||||||
|
|
||||||
|
data = self.opener.open(url).read()
|
||||||
|
|
||||||
lines = data.split('\n')
|
lines = data.split('\n')
|
||||||
|
|
||||||
optionLines = filter(lambda x : x.find('<option value="') != -1, lines)
|
emit = False
|
||||||
|
|
||||||
authorLines = filter(lambda x : x.find('pemail') != -1, lines)
|
post = ''
|
||||||
for al in authorLines:
|
|
||||||
m = self.grabAuthor.match(al)
|
|
||||||
if m != None:
|
|
||||||
self.authorName = m.group(1)
|
|
||||||
break
|
|
||||||
|
|
||||||
|
|
||||||
optionsLines = optionLines[:len(optionLines)/2]
|
for l in lines:
|
||||||
|
if l.find('</div></form>') != -1:
|
||||||
storyName = first.split("/")[1]
|
logging.debug('emit = True')
|
||||||
|
emit = True
|
||||||
result = []
|
|
||||||
urls = []
|
|
||||||
for line in optionLines:
|
|
||||||
m = self.grabUrl.match(line)
|
|
||||||
u = m.group(2)
|
|
||||||
if u.find('" selected="selected') != -1:
|
|
||||||
u = u.replace('" selected="selected', '')
|
|
||||||
|
|
||||||
if u in urls:
|
|
||||||
continue
|
continue
|
||||||
else:
|
elif l.find('<form action="#">') != -1:
|
||||||
urls.append(u)
|
logging.debug('emit = False')
|
||||||
|
if emit:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
emit = False
|
||||||
|
|
||||||
result.append((self.getPrintableUrl(storyName + "/" + u), m.group(3)))
|
if emit:
|
||||||
|
post = post + l + '\n'
|
||||||
|
|
||||||
self.soup = bs.BeautifulSoup(data)
|
return post
|
||||||
titles = self.soup.findAll(name = 'title', recursive=True)
|
|
||||||
if len(titles) > 0:
|
def setLogin(self, login):
|
||||||
title = titles[0]
|
self.login = login
|
||||||
print(title)
|
|
||||||
(website, rest) = title.string.split('::')
|
def setPassword(self, password):
|
||||||
story_chapter = rest.split("-")
|
self.password = password
|
||||||
|
|
||||||
story = story_chapter[0].strip()
|
|
||||||
self.storyName = story
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
def getStoryName(self):
|
def getStoryName(self):
|
||||||
return self.storyName
|
return self.storyName
|
||||||
|
|
||||||
def getAuthorName(self):
|
def getAuthorName(self):
|
||||||
return self.authorName
|
return self.author
|
||||||
|
|
||||||
def getText(self, data, fetch = False):
|
|
||||||
lines = data.split('\n')
|
|
||||||
begin = self._findIndex(lines, '</select>', 0)+1
|
|
||||||
if begin == 0:
|
|
||||||
begiun = self._findIndex(lines, '<div><p>', 24)
|
|
||||||
|
|
||||||
if begin == 0:
|
|
||||||
print('BAD start')
|
|
||||||
pp.pprint(lines)
|
|
||||||
sys.abort()
|
|
||||||
end = self._findIndex(lines, '<form action="index.php"><div class="topandbotline"', begin)
|
|
||||||
print('<!-- ========= begin=%d, end=%d ============= -->' % (begin, end))
|
|
||||||
return "\n".join(lines[begin:end])
|
|
||||||
|
|
||||||
class Downloader:
|
def getPrintableUrl(self, url):
|
||||||
login = None
|
return url
|
||||||
password = None
|
|
||||||
url = None
|
|
||||||
host = None
|
|
||||||
first = None
|
|
||||||
opener = None
|
|
||||||
|
|
||||||
writer = None
|
|
||||||
|
|
||||||
def __init__(self, url, login, password):
|
|
||||||
self.login = login
|
|
||||||
self.password = password
|
|
||||||
self.url = url
|
|
||||||
|
|
||||||
self.infoProvider = FicWad() #FFA()
|
class FFA_UnitTests(unittest.TestCase):
|
||||||
|
def setUp(self):
|
||||||
parse = up.urlparse(url)
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
self.host = parse.scheme + '://' + parse.netloc
|
pass
|
||||||
self.first = parse.path;
|
|
||||||
|
|
||||||
self.loginUrl = self.host + self.infoProvider.getLoginScript()
|
|
||||||
|
|
||||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
|
||||||
|
|
||||||
|
|
||||||
def _loginRequired(self):
|
def testRequiresLoginNeg(self):
|
||||||
print('is login required?')
|
f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
|
||||||
resp = self.opener.open(self.url)
|
self.assertFalse(f.requiresLogin())
|
||||||
data = resp.read()
|
|
||||||
if data.find(self.infoProvider.getPasswordLine()) != -1:
|
|
||||||
print('yep')
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
print('nada')
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _login(self):
|
|
||||||
(login, data) = self.infoProvider.getLoginPasswordOthers()
|
|
||||||
|
|
||||||
data[login['login']] = self.login
|
|
||||||
data[login['password']] = self.password
|
|
||||||
|
|
||||||
urlvals = u.urlencode(data)
|
|
||||||
req = self.opener.open(self.loginUrl, urlvals)
|
|
||||||
|
|
||||||
if req.read().find(self.infoProvider.getPasswordLine()) != -1:
|
|
||||||
return False
|
|
||||||
else:
|
|
||||||
return True
|
|
||||||
|
|
||||||
def _getContent(self, url):
|
def testRequiresLogin(self):
|
||||||
print("<!-- Opening %s -->" % url)
|
f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
|
||||||
return self.opener.open(url).read()
|
self.assertTrue(f.requiresLogin())
|
||||||
|
|
||||||
def download(self):
|
def testPerformLogin(self):
|
||||||
first = self._getContent(self.host + self.first)
|
f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
|
||||||
urls = self.infoProvider.extractIndividualUrls(first, self.host, self.first)
|
|
||||||
|
|
||||||
self.writer = EPubFanficWriter("books", self.infoProvider.getStoryName(), self.infoProvider.getAuthorName())
|
if login_password != None:
|
||||||
|
f.setLogin(login_password.login)
|
||||||
|
f.setPassword(login_password.password)
|
||||||
|
|
||||||
for u,n in urls:
|
self.assertTrue(f.performLogin(None))
|
||||||
text = self.infoProvider.getText(self._getContent(self.host+"/"+u))
|
|
||||||
self.writer.writeChapter(n, text)
|
|
||||||
|
|
||||||
self.writer.finalise()
|
def testExtractURLsAuthorStoryName(self):
|
||||||
|
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
|
||||||
|
f.extractIndividualUrls()
|
||||||
|
|
||||||
|
self.assertEquals('Draco664', f.getAuthorName())
|
||||||
|
self.assertEquals('Apprentice Potter', f.getStoryName())
|
||||||
|
|
||||||
|
def testExtractUrls(self):
|
||||||
|
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
|
||||||
|
urls = f.extractIndividualUrls()
|
||||||
|
self.assertEquals(25, len(urls))
|
||||||
|
|
||||||
|
self.assertEquals('Grievances', urls[2][1])
|
||||||
|
self.assertEquals('/Apprentice_Potter/Prologue/', urls[0][0])
|
||||||
|
|
||||||
|
def testGetText(self):
|
||||||
|
f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
|
||||||
|
data = f.getText('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
|
||||||
|
|
||||||
|
self.assertTrue(data.find('smiled slightly, and settled back in her rocking chair') != -1)
|
||||||
|
|
||||||
|
def testGetTextLogin(self):
|
||||||
|
url = 'http://viridian.fanficauthors.net/Out_of_the_Darkness_A_Jinchuurikis_Tale/A_Harrowing_Escape/'
|
||||||
|
f = FFA(url)
|
||||||
|
|
||||||
|
if login_password != None:
|
||||||
|
f.setLogin(login_password.login)
|
||||||
|
f.setPassword(login_password.password)
|
||||||
|
|
||||||
|
if f.requiresLogin():
|
||||||
|
f.performLogin()
|
||||||
|
|
||||||
|
data = f.getText(url)
|
||||||
|
seek = 'So Hokage-sama” I said, “this is how we came'
|
||||||
|
self.assertTrue(data.find(seek) != -1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
f = Downloader(sys.argv[1], 'sigizmund', '***************')
|
unittest.main()
|
||||||
if f._loginRequired():
|
|
||||||
f._login()
|
|
||||||
f.download()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -59,7 +59,6 @@ class FicWad:
|
||||||
return self.authorName
|
return self.authorName
|
||||||
|
|
||||||
def getText(self, url):
|
def getText(self, url):
|
||||||
print(type(url))
|
|
||||||
if url.find('http://') == -1:
|
if url.find('http://') == -1:
|
||||||
url = 'http://' + self.host + '/' + url
|
url = 'http://' + self.host + '/' + url
|
||||||
|
|
||||||
|
|
|
||||||
30
output.py
30
output.py
|
|
@ -1,11 +1,15 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import cgi
|
import cgi
|
||||||
import uuid
|
import uuid
|
||||||
|
import codecs
|
||||||
import shutil
|
import shutil
|
||||||
import base64
|
import base64
|
||||||
import os.path
|
import os.path
|
||||||
|
import zipfile
|
||||||
import urllib as u
|
import urllib as u
|
||||||
import pprint as pp
|
import pprint as pp
|
||||||
import urllib2 as u2
|
import urllib2 as u2
|
||||||
|
|
@ -15,6 +19,9 @@ import htmlentitydefs as hdefs
|
||||||
|
|
||||||
from constants import *
|
from constants import *
|
||||||
|
|
||||||
|
import zipdir
|
||||||
|
|
||||||
|
|
||||||
class FanficWriter:
|
class FanficWriter:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
|
@ -54,9 +61,9 @@ class EPubFanficWriter(FanficWriter):
|
||||||
os.mkdir(self.directory + '/META-INF')
|
os.mkdir(self.directory + '/META-INF')
|
||||||
os.mkdir(self.directory + '/OEBPS')
|
os.mkdir(self.directory + '/OEBPS')
|
||||||
|
|
||||||
print >> open(self.directory + '/mimetype', 'w'), MIMETYPE
|
print >> codecs.open(self.directory + '/mimetype', 'w', 'utf-8'), MIMETYPE
|
||||||
print >> open(self.directory + '/META-INF/container.xml', 'w'), CONTAINER
|
print >> codecs.open(self.directory + '/META-INF/container.xml', 'w', 'utf-8'), CONTAINER
|
||||||
print >> open(self.directory + '/OEBPS/stylesheet.css', 'w'), CSS
|
print >> codecs.open(self.directory + '/OEBPS/stylesheet.css', 'w', 'utf-8'), CSS
|
||||||
|
|
||||||
def _removeEntities(self, text):
|
def _removeEntities(self, text):
|
||||||
for e in entities:
|
for e in entities:
|
||||||
|
|
@ -68,6 +75,7 @@ class EPubFanficWriter(FanficWriter):
|
||||||
def writeChapter(self, title, text):
|
def writeChapter(self, title, text):
|
||||||
fileName = base64.b64encode(title) + ".xhtml"
|
fileName = base64.b64encode(title) + ".xhtml"
|
||||||
filePath = self.directory + "/OEBPS/" + fileName
|
filePath = self.directory + "/OEBPS/" + fileName
|
||||||
|
|
||||||
f = open(filePath, 'w')
|
f = open(filePath, 'w')
|
||||||
|
|
||||||
text = self._removeEntities(text)
|
text = self._removeEntities(text)
|
||||||
|
|
@ -93,21 +101,24 @@ class EPubFanficWriter(FanficWriter):
|
||||||
# cleanup(self.soup )
|
# cleanup(self.soup )
|
||||||
|
|
||||||
text = self.soup.prettify()
|
text = self.soup.prettify()
|
||||||
|
print(text)
|
||||||
|
|
||||||
print >> f, XHTML_START % (title, title)
|
print >> f, XHTML_START % (title, title)
|
||||||
print >> f, text
|
f.write(text)
|
||||||
print >> f, XHTML_END
|
print >> f, XHTML_END
|
||||||
|
|
||||||
self.chapters.append((title, fileName))
|
self.chapters.append((title, fileName))
|
||||||
|
|
||||||
def finalise(self):
|
def finalise(self):
|
||||||
|
print("Finalising...")
|
||||||
### writing table of contents -- ncx file
|
### writing table of contents -- ncx file
|
||||||
|
|
||||||
tocFilePath = self.directory + "/OEBPS/toc.ncx"
|
tocFilePath = self.directory + "/OEBPS/toc.ncx"
|
||||||
toc = open(tocFilePath, 'w')
|
toc = open(tocFilePath, 'w')
|
||||||
print >> toc, TOC_START % self.storyTitle
|
print >> toc, TOC_START % self.storyTitle
|
||||||
|
|
||||||
|
print("Printing toc and refs")
|
||||||
|
|
||||||
### writing content -- opf file
|
### writing content -- opf file
|
||||||
opfFilePath = self.directory + "/OEBPS/content.opf"
|
opfFilePath = self.directory + "/OEBPS/content.opf"
|
||||||
opf = open(opfFilePath, 'w')
|
opf = open(opfFilePath, 'w')
|
||||||
|
|
@ -126,6 +137,8 @@ class EPubFanficWriter(FanficWriter):
|
||||||
ids.append(chapterId)
|
ids.append(chapterId)
|
||||||
|
|
||||||
i = i + 1
|
i = i + 1
|
||||||
|
|
||||||
|
print('Toc and refs printed, proceesing to ref-ids....')
|
||||||
|
|
||||||
print >> toc, TOC_END
|
print >> toc, TOC_END
|
||||||
print >> opf, CONTENT_END_MANIFEST
|
print >> opf, CONTENT_END_MANIFEST
|
||||||
|
|
@ -133,4 +146,9 @@ class EPubFanficWriter(FanficWriter):
|
||||||
for chapterId in ids:
|
for chapterId in ids:
|
||||||
print >> opf, CONTENT_ITEMREF % chapterId
|
print >> opf, CONTENT_ITEMREF % chapterId
|
||||||
|
|
||||||
print >> opf, CONTENT_END
|
print >> opf, CONTENT_END
|
||||||
|
|
||||||
|
print('Finished')
|
||||||
|
|
||||||
|
filename = self.directory + '.epub'
|
||||||
|
zipdir.toZip(filename, self.directory)
|
||||||
Loading…
Reference in a new issue