mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 08:52:55 +01:00
FFA changed, architecture changed, not quite ready yet
This commit is contained in:
parent
a9748061f0
commit
6ef95c634d
5 changed files with 174 additions and 151 deletions
|
|
@ -83,7 +83,7 @@ TOC_END = '''</navMap>
|
|||
</ncx>
|
||||
'''
|
||||
|
||||
XHTML_START = '''<?xml version="1.0" encoding="iso-8859-1"?>
|
||||
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
|
|
|
|||
|
|
@ -28,7 +28,10 @@ class FanficLoader:
|
|||
urls = self.adapter.extractIndividualUrls()
|
||||
self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName())
|
||||
|
||||
i = 0
|
||||
for u,n in urls:
|
||||
print('Downloading chapter %d/%d' % (i, len(urls)))
|
||||
i = i+1
|
||||
text = self.adapter.getText(u)
|
||||
self.writer.writeChapter(n, text)
|
||||
|
||||
|
|
|
|||
259
ffa.py
259
ffa.py
|
|
@ -1,3 +1,5 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
|
@ -6,182 +8,183 @@ import uuid
|
|||
import shutil
|
||||
import base64
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import login_password
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
|
||||
from constants import *
|
||||
|
||||
from ficwad import *
|
||||
|
||||
class FFA:
|
||||
storyName = None
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
|
||||
def __init__(self):
|
||||
self.grabUrl = re.compile('(\<option.+value=\")(.+?)\"\>(.+?)\<')
|
||||
self.grabAuthor = re.compile('.+pemail.+\'(\w+)')
|
||||
logging.debug("Created FFA: url=%s" % (self.url))
|
||||
|
||||
def getPasswordLine(self):
|
||||
return '<input type="password" name="pass"'
|
||||
def _getLoginScript(self):
|
||||
return self.path
|
||||
|
||||
def getLoginScript(self):
|
||||
return '/scripts/login.php'
|
||||
def requiresLogin(self, url = None):
|
||||
resp = self.opener.open(self.url)
|
||||
data = resp.read()
|
||||
if data.find('<legend>Please login to continue</legend>') != -1:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def getLoginPasswordOthers(self):
|
||||
login = dict(login = 'name', password = 'pass')
|
||||
other = dict(submit = 'Log In', remember='yes')
|
||||
return (login, other)
|
||||
def performLogin(self, url = None):
|
||||
if url == None:
|
||||
url = self.url
|
||||
|
||||
def getPrintableUrl(self, url):
|
||||
return url + '?print=yes'
|
||||
data = {}
|
||||
|
||||
def _findIndex(self, lines, what, start):
|
||||
for i in range(start, len(lines)):
|
||||
if lines[i].find(what) != -1:
|
||||
return i
|
||||
return -1
|
||||
data['username'] = self.login
|
||||
data['password'] = self.password
|
||||
data['submit'] = 'Submit'
|
||||
|
||||
urlvals = u.urlencode(data)
|
||||
loginUrl = 'http://' + self.host + self._getLoginScript()
|
||||
logging.debug("Will now login to URL %s" % loginUrl)
|
||||
|
||||
req = self.opener.open(loginUrl, urlvals)
|
||||
|
||||
if self.requiresLogin():
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = self.opener.open(self.url).read()
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
self.author = soup.find('a', {'href' : '/contact/'}).string
|
||||
self.storyName = str(soup.find('h1', {'class' : 'textCenter'}).contents[0]).strip()
|
||||
|
||||
logging.debug("Story `%s` by `%s`" % (self.storyName, self.author))
|
||||
|
||||
selector = soup.find('select', {'class' : 'tinput'})
|
||||
options = selector.findAll('option')
|
||||
|
||||
urls = []
|
||||
|
||||
for o in options:
|
||||
title = o.string
|
||||
url = o['value']
|
||||
|
||||
urls.append((url,title))
|
||||
|
||||
return urls
|
||||
|
||||
def getText(self, url):
|
||||
if url.find('http://') == -1:
|
||||
url = 'http://' + self.host + '/' + url
|
||||
|
||||
data = self.opener.open(url).read()
|
||||
|
||||
def extractIndividualUrls(self, data, host, first, fetch = False):
|
||||
lines = data.split('\n')
|
||||
|
||||
optionLines = filter(lambda x : x.find('<option value="') != -1, lines)
|
||||
emit = False
|
||||
|
||||
authorLines = filter(lambda x : x.find('pemail') != -1, lines)
|
||||
for al in authorLines:
|
||||
m = self.grabAuthor.match(al)
|
||||
if m != None:
|
||||
self.authorName = m.group(1)
|
||||
break
|
||||
post = ''
|
||||
|
||||
|
||||
optionsLines = optionLines[:len(optionLines)/2]
|
||||
|
||||
storyName = first.split("/")[1]
|
||||
|
||||
result = []
|
||||
urls = []
|
||||
for line in optionLines:
|
||||
m = self.grabUrl.match(line)
|
||||
u = m.group(2)
|
||||
if u.find('" selected="selected') != -1:
|
||||
u = u.replace('" selected="selected', '')
|
||||
|
||||
if u in urls:
|
||||
for l in lines:
|
||||
if l.find('</div></form>') != -1:
|
||||
logging.debug('emit = True')
|
||||
emit = True
|
||||
continue
|
||||
elif l.find('<form action="#">') != -1:
|
||||
logging.debug('emit = False')
|
||||
if emit:
|
||||
break
|
||||
else:
|
||||
urls.append(u)
|
||||
emit = False
|
||||
|
||||
result.append((self.getPrintableUrl(storyName + "/" + u), m.group(3)))
|
||||
if emit:
|
||||
post = post + l + '\n'
|
||||
|
||||
self.soup = bs.BeautifulSoup(data)
|
||||
titles = self.soup.findAll(name = 'title', recursive=True)
|
||||
if len(titles) > 0:
|
||||
title = titles[0]
|
||||
print(title)
|
||||
(website, rest) = title.string.split('::')
|
||||
story_chapter = rest.split("-")
|
||||
return post
|
||||
|
||||
story = story_chapter[0].strip()
|
||||
self.storyName = story
|
||||
def setLogin(self, login):
|
||||
self.login = login
|
||||
|
||||
return result
|
||||
def setPassword(self, password):
|
||||
self.password = password
|
||||
|
||||
def getStoryName(self):
|
||||
return self.storyName
|
||||
|
||||
def getAuthorName(self):
|
||||
return self.authorName
|
||||
return self.author
|
||||
|
||||
def getText(self, data, fetch = False):
|
||||
lines = data.split('\n')
|
||||
begin = self._findIndex(lines, '</select>', 0)+1
|
||||
if begin == 0:
|
||||
begiun = self._findIndex(lines, '<div><p>', 24)
|
||||
def getPrintableUrl(self, url):
|
||||
return url
|
||||
|
||||
if begin == 0:
|
||||
print('BAD start')
|
||||
pp.pprint(lines)
|
||||
sys.abort()
|
||||
end = self._findIndex(lines, '<form action="index.php"><div class="topandbotline"', begin)
|
||||
print('<!-- ========= begin=%d, end=%d ============= -->' % (begin, end))
|
||||
return "\n".join(lines[begin:end])
|
||||
class FFA_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
pass
|
||||
|
||||
class Downloader:
|
||||
login = None
|
||||
password = None
|
||||
url = None
|
||||
host = None
|
||||
first = None
|
||||
opener = None
|
||||
def testRequiresLoginNeg(self):
|
||||
f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
|
||||
self.assertFalse(f.requiresLogin())
|
||||
|
||||
writer = None
|
||||
def testRequiresLogin(self):
|
||||
f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
|
||||
self.assertTrue(f.requiresLogin())
|
||||
|
||||
def __init__(self, url, login, password):
|
||||
self.login = login
|
||||
self.password = password
|
||||
self.url = url
|
||||
def testPerformLogin(self):
|
||||
f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
|
||||
|
||||
self.infoProvider = FicWad() #FFA()
|
||||
if login_password != None:
|
||||
f.setLogin(login_password.login)
|
||||
f.setPassword(login_password.password)
|
||||
|
||||
parse = up.urlparse(url)
|
||||
self.host = parse.scheme + '://' + parse.netloc
|
||||
self.first = parse.path;
|
||||
self.assertTrue(f.performLogin(None))
|
||||
|
||||
self.loginUrl = self.host + self.infoProvider.getLoginScript()
|
||||
def testExtractURLsAuthorStoryName(self):
|
||||
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
|
||||
f.extractIndividualUrls()
|
||||
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
self.assertEquals('Draco664', f.getAuthorName())
|
||||
self.assertEquals('Apprentice Potter', f.getStoryName())
|
||||
|
||||
def testExtractUrls(self):
|
||||
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
|
||||
urls = f.extractIndividualUrls()
|
||||
self.assertEquals(25, len(urls))
|
||||
|
||||
def _loginRequired(self):
|
||||
print('is login required?')
|
||||
resp = self.opener.open(self.url)
|
||||
data = resp.read()
|
||||
if data.find(self.infoProvider.getPasswordLine()) != -1:
|
||||
print('yep')
|
||||
return True
|
||||
else:
|
||||
print('nada')
|
||||
return False
|
||||
self.assertEquals('Grievances', urls[2][1])
|
||||
self.assertEquals('/Apprentice_Potter/Prologue/', urls[0][0])
|
||||
|
||||
def _login(self):
|
||||
(login, data) = self.infoProvider.getLoginPasswordOthers()
|
||||
def testGetText(self):
|
||||
f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
|
||||
data = f.getText('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
|
||||
|
||||
data[login['login']] = self.login
|
||||
data[login['password']] = self.password
|
||||
self.assertTrue(data.find('smiled slightly, and settled back in her rocking chair') != -1)
|
||||
|
||||
urlvals = u.urlencode(data)
|
||||
req = self.opener.open(self.loginUrl, urlvals)
|
||||
def testGetTextLogin(self):
|
||||
url = 'http://viridian.fanficauthors.net/Out_of_the_Darkness_A_Jinchuurikis_Tale/A_Harrowing_Escape/'
|
||||
f = FFA(url)
|
||||
|
||||
if req.read().find(self.infoProvider.getPasswordLine()) != -1:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
if login_password != None:
|
||||
f.setLogin(login_password.login)
|
||||
f.setPassword(login_password.password)
|
||||
|
||||
def _getContent(self, url):
|
||||
print("<!-- Opening %s -->" % url)
|
||||
return self.opener.open(url).read()
|
||||
|
||||
def download(self):
|
||||
first = self._getContent(self.host + self.first)
|
||||
urls = self.infoProvider.extractIndividualUrls(first, self.host, self.first)
|
||||
|
||||
self.writer = EPubFanficWriter("books", self.infoProvider.getStoryName(), self.infoProvider.getAuthorName())
|
||||
|
||||
for u,n in urls:
|
||||
text = self.infoProvider.getText(self._getContent(self.host+"/"+u))
|
||||
self.writer.writeChapter(n, text)
|
||||
|
||||
self.writer.finalise()
|
||||
if f.requiresLogin():
|
||||
f.performLogin()
|
||||
|
||||
data = f.getText(url)
|
||||
seek = 'So Hokage-sama” I said, “this is how we came'
|
||||
self.assertTrue(data.find(seek) != -1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
f = Downloader(sys.argv[1], 'sigizmund', '***************')
|
||||
if f._loginRequired():
|
||||
f._login()
|
||||
f.download()
|
||||
|
||||
|
||||
|
||||
unittest.main()
|
||||
|
|
@ -59,7 +59,6 @@ class FicWad:
|
|||
return self.authorName
|
||||
|
||||
def getText(self, url):
|
||||
print(type(url))
|
||||
if url.find('http://') == -1:
|
||||
url = 'http://' + self.host + '/' + url
|
||||
|
||||
|
|
|
|||
28
output.py
28
output.py
|
|
@ -1,11 +1,15 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import codecs
|
||||
import shutil
|
||||
import base64
|
||||
import os.path
|
||||
import zipfile
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
|
|
@ -15,6 +19,9 @@ import htmlentitydefs as hdefs
|
|||
|
||||
from constants import *
|
||||
|
||||
import zipdir
|
||||
|
||||
|
||||
class FanficWriter:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
|
@ -54,9 +61,9 @@ class EPubFanficWriter(FanficWriter):
|
|||
os.mkdir(self.directory + '/META-INF')
|
||||
os.mkdir(self.directory + '/OEBPS')
|
||||
|
||||
print >> open(self.directory + '/mimetype', 'w'), MIMETYPE
|
||||
print >> open(self.directory + '/META-INF/container.xml', 'w'), CONTAINER
|
||||
print >> open(self.directory + '/OEBPS/stylesheet.css', 'w'), CSS
|
||||
print >> codecs.open(self.directory + '/mimetype', 'w', 'utf-8'), MIMETYPE
|
||||
print >> codecs.open(self.directory + '/META-INF/container.xml', 'w', 'utf-8'), CONTAINER
|
||||
print >> codecs.open(self.directory + '/OEBPS/stylesheet.css', 'w', 'utf-8'), CSS
|
||||
|
||||
def _removeEntities(self, text):
|
||||
for e in entities:
|
||||
|
|
@ -68,6 +75,7 @@ class EPubFanficWriter(FanficWriter):
|
|||
def writeChapter(self, title, text):
|
||||
fileName = base64.b64encode(title) + ".xhtml"
|
||||
filePath = self.directory + "/OEBPS/" + fileName
|
||||
|
||||
f = open(filePath, 'w')
|
||||
|
||||
text = self._removeEntities(text)
|
||||
|
|
@ -93,21 +101,24 @@ class EPubFanficWriter(FanficWriter):
|
|||
# cleanup(self.soup )
|
||||
|
||||
text = self.soup.prettify()
|
||||
print(text)
|
||||
|
||||
print >> f, XHTML_START % (title, title)
|
||||
print >> f, text
|
||||
f.write(text)
|
||||
print >> f, XHTML_END
|
||||
|
||||
self.chapters.append((title, fileName))
|
||||
|
||||
def finalise(self):
|
||||
|
||||
print("Finalising...")
|
||||
### writing table of contents -- ncx file
|
||||
|
||||
tocFilePath = self.directory + "/OEBPS/toc.ncx"
|
||||
toc = open(tocFilePath, 'w')
|
||||
print >> toc, TOC_START % self.storyTitle
|
||||
|
||||
print("Printing toc and refs")
|
||||
|
||||
### writing content -- opf file
|
||||
opfFilePath = self.directory + "/OEBPS/content.opf"
|
||||
opf = open(opfFilePath, 'w')
|
||||
|
|
@ -127,6 +138,8 @@ class EPubFanficWriter(FanficWriter):
|
|||
|
||||
i = i + 1
|
||||
|
||||
print('Toc and refs printed, proceesing to ref-ids....')
|
||||
|
||||
print >> toc, TOC_END
|
||||
print >> opf, CONTENT_END_MANIFEST
|
||||
|
||||
|
|
@ -134,3 +147,8 @@ class EPubFanficWriter(FanficWriter):
|
|||
print >> opf, CONTENT_ITEMREF % chapterId
|
||||
|
||||
print >> opf, CONTENT_END
|
||||
|
||||
print('Finished')
|
||||
|
||||
filename = self.directory + '.epub'
|
||||
zipdir.toZip(filename, self.directory)
|
||||
Loading…
Reference in a new issue