sigizmund 2009-12-15 15:23:48 +00:00
commit c0459faa43
7 changed files with 2415 additions and 0 deletions

1711
BeautifulSoup.py Normal file

File diff suppressed because it is too large

135
constants.py Normal file

@@ -0,0 +1,135 @@
CSS = '''body { margin-left: 5%; margin-right: 5%; margin-top: 5%; margin-bottom: 5%; text-align: justify; }
pre { font-size: x-small; }
h1 { text-align: center; }
h2 { text-align: center; }
h3 { text-align: center; }
h4 { text-align: center; }
h5 { text-align: center; }
h6 { text-align: center; }
.CI {
text-align:center;
margin-top:0px;
margin-bottom:0px;
padding:0px;
}
.center {text-align: center;}
.smcap {font-variant: small-caps;}
.u {text-decoration: underline;}
.bold {font-weight: bold;}
'''
MIMETYPE = '''application/epub+zip'''
CONTAINER = '''<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
'''
CONTENT_START = '''<?xml version="1.0"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf"
unique-identifier="BookId-Epub-%s">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:opf="http://www.idpf.org/2007/opf">
<dc:title>%s</dc:title>
<dc:creator opf:role="aut">%s</dc:creator>
<dc:language>en-UK</dc:language>
<dc:rights></dc:rights>
<dc:publisher>sgzmd</dc:publisher>
<dc:identifier id="BookId">urn:uuid:sigizmund.com062820072147132</dc:identifier>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="text/xml" />
<item id="style" href="stylesheet.css" media-type="text/css" />
'''
CONTENT_ITEM = '<item id="%s" href="%s" media-type="application/xhtml+xml" />'
CONTENT_END_MANIFEST = '''</manifest>
<spine toc="ncx">
'''
CONTENT_ITEMREF = '''<itemref idref="%s" />'''
CONTENT_END = '''</spine>
</package>
'''
TOC_START = '''<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="sigizmund.com062820072147132"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
'''
TOC_ITEM = '''<navPoint id="%s" playOrder="%d">
<navLabel>
<text>%s</text>
</navLabel>
<content src="%s"/>
</navPoint>
'''
TOC_END = '''</navMap>
</ncx>
'''
XHTML_START = '''<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>%s</title>
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
</head>
<body>
<div>
<h3>%s</h3>
'''
XHTML_END = '''</div>
</body>
</html>
'''
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
'blockquote', 'br', 'center', 'cite', 'code', 'col',
'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em',
'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i',
'ins', 'kbd', 'label', 'li', 'ol',
'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike',
'strong', 'sub', 'sup', 'u', 'ul']
acceptable_attributes = ['href']
entities = { '&ndash;' : ' - ', '&mdash;' : ' - ', '&rdquo;' : '"', '&ldquo;' : '"', '&rsquo;' : '\'', '&lsquo;' : '\'', '&quot;' : '"' }
FB2_PROLOGUE = '<FictionBook>'
FB2_DESCRIPTION = '''<description>
<title-info>
<genre>fanfiction</genre>
<author>
<first-name></first-name>
<middle-name></middle-name>
<last-name>%s</last-name>
</author>
<book-title>%s</book-title>
<lang>eng</lang>
</title-info>
<document-info>
<author>
<nickname>sgzmd</nickname>
</author>
<date value="%s">%s</date>
<id>sgzmd_%s</id>
<version>2.0</version>
</document-info>
</description>'''
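
The OPF/NCX templates above are plain %-format strings; a minimal sketch of how they are meant to be filled (the chapter id and file name here are made up for illustration; output.py below actually derives them from a base64 of the chapter title):

from constants import CONTENT_ITEM, TOC_ITEM, CONTENT_ITEMREF

# illustrative values only
chapter_id = 'chap1'
chapter_file = 'chap1.xhtml'

manifest_line = CONTENT_ITEM % (chapter_id, chapter_file)
toc_entry = TOC_ITEM % (chapter_id, 1, 'Chapter 1', chapter_file)
spine_line = CONTENT_ITEMREF % chapter_id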

74
downaloder.py Normal file

@@ -0,0 +1,74 @@
import os
import re
import sys
import shutil
import os.path
import getpass
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import ffa
import ficwad
import output
import fictionalley
class FanficLoader:
    '''A controller class which handles the interaction between various specific downloaders and writers'''
    booksDirectory = "books"

    def __init__(self, adapter, writerClass):
        self.adapter = adapter
        self.writerClass = writerClass

    def download(self):
        urls = self.adapter.extractIndividualUrls()
        self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName())
        for u, n in urls:
            text = self.adapter.getText(u)
            self.writer.writeChapter(n, text)
        self.writer.finalise()

if __name__ == '__main__':
    (url, format) = sys.argv[1:]

    if type(url) is unicode:
        print('URL is unicode')
        url = url.encode('latin1')

    adapter = None
    writerClass = None

    # NOTE: in this revision only the FicWad adapter takes the url in its constructor;
    # FFA and FictionAlley still use the older data/host/first call style.
    if url.find('fanficauthors') != -1:
        adapter = ffa.FFA(url)
    elif url.find('fictionalley') != -1:
        adapter = fictionalley.FictionAlley(url)
    elif url.find('ficwad') != -1:
        adapter = ficwad.FicWad(url)
    else:
        print >> sys.stderr, "Oi! I can haz not appropriate adapter for URL %s!" % url
        sys.exit(1)

    if format == 'epub':
        writerClass = output.EPubFanficWriter

    if adapter.requiresLogin(url):
        print("Meow, URL %s requires you to haz been logged in! Please can I haz this datas?" % url)
        sys.stdout.write("Can I haz ur login? ")
        login = sys.stdin.readline().strip()
        password = getpass.getpass(prompt='Can I haz ur password? ')
        print("Login: `%s`, Password: `%s`" % (login, password))
        adapter.setLogin(login)
        adapter.setPassword(password)

    loader = FanficLoader(adapter, writerClass)
    loader.download()
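
A rough usage sketch for the script above (the file name typo downaloder.py is the commit's own; the URL is the FicWad example used elsewhere in this commit):

python downaloder.py http://www.ficwad.com/story/14536 epub

# equivalent programmatic use, assuming the same FicWad story URL
import ficwad
import output
from downaloder import FanficLoader

loader = FanficLoader(ficwad.FicWad('http://www.ficwad.com/story/14536'), output.EPubFanficWriter)
loader.download()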

187
ffa.py Normal file

@@ -0,0 +1,187 @@
import os
import re
import sys
import cgi
import uuid
import shutil
import base64
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
from constants import *
from ficwad import *
from output import EPubFanficWriter  # used by Downloader.download() below
class FFA:
    storyName = None

    def __init__(self):
        self.grabUrl = re.compile('(\<option.+value=\")(.+?)\"\>(.+?)\<')
        self.grabAuthor = re.compile('.+pemail.+\'(\w+)')

    def getPasswordLine(self):
        return '<input type="password" name="pass"'

    def getLoginScript(self):
        return '/scripts/login.php'

    def getLoginPasswordOthers(self):
        login = dict(login = 'name', password = 'pass')
        other = dict(submit = 'Log In', remember='yes')
        return (login, other)

    def getPrintableUrl(self, url):
        return url + '?print=yes'

    def _findIndex(self, lines, what, start):
        for i in range(start, len(lines)):
            if lines[i].find(what) != -1:
                return i
        return -1

    def extractIndividualUrls(self, data, host, first, fetch = False):
        lines = data.split('\n')
        optionLines = filter(lambda x : x.find('<option value="') != -1, lines)
        authorLines = filter(lambda x : x.find('pemail') != -1, lines)

        for al in authorLines:
            m = self.grabAuthor.match(al)
            if m != None:
                self.authorName = m.group(1)
                break

        storyName = first.split("/")[1]

        result = []
        urls = []
        # duplicate option values are skipped below
        for line in optionLines:
            m = self.grabUrl.match(line)
            u = m.group(2)
            if u.find('" selected="selected') != -1:
                u = u.replace('" selected="selected', '')
            if u in urls:
                continue
            else:
                urls.append(u)
            result.append((self.getPrintableUrl(storyName + "/" + u), m.group(3)))

        self.soup = bs.BeautifulSoup(data)
        titles = self.soup.findAll(name = 'title', recursive=True)
        if len(titles) > 0:
            title = titles[0]
            print(title)
            (website, rest) = title.string.split('::')
            story_chapter = rest.split("-")
            story = story_chapter[0].strip()
            self.storyName = story

        return result

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName

    def getText(self, data, fetch = False):
        lines = data.split('\n')
        begin = self._findIndex(lines, '</select>', 0) + 1
        if begin == 0:
            begin = self._findIndex(lines, '<div><p>', 24)
            if begin == -1:
                print('BAD start')
                pp.pprint(lines)
                sys.exit(1)

        end = self._findIndex(lines, '<form action="index.php"><div class="topandbotline"', begin)
        print('<!-- ========= begin=%d, end=%d ============= -->' % (begin, end))
        return "\n".join(lines[begin:end])
class Downloader:
    login = None
    password = None
    url = None
    host = None
    first = None
    opener = None
    writer = None

    def __init__(self, url, login, password):
        self.login = login
        self.password = password
        self.url = url
        self.infoProvider = FFA()  # was FicWad(), whose constructor and methods no longer match this call pattern
        parse = up.urlparse(url)
        self.host = parse.scheme + '://' + parse.netloc
        self.first = parse.path
        self.loginUrl = self.host + self.infoProvider.getLoginScript()
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())

    def _loginRequired(self):
        print('is login required?')
        resp = self.opener.open(self.url)
        data = resp.read()
        if data.find(self.infoProvider.getPasswordLine()) != -1:
            print('yep')
            return True
        else:
            print('nada')
            return False

    def _login(self):
        (login, data) = self.infoProvider.getLoginPasswordOthers()
        data[login['login']] = self.login
        data[login['password']] = self.password
        urlvals = u.urlencode(data)
        req = self.opener.open(self.loginUrl, urlvals)
        if req.read().find(self.infoProvider.getPasswordLine()) != -1:
            return False
        else:
            return True

    def _getContent(self, url):
        print("<!-- Opening %s -->" % url)
        return self.opener.open(url).read()

    def download(self):
        first = self._getContent(self.host + self.first)
        urls = self.infoProvider.extractIndividualUrls(first, self.host, self.first)
        self.writer = EPubFanficWriter("books", self.infoProvider.getStoryName(), self.infoProvider.getAuthorName())
        for u, n in urls:
            text = self.infoProvider.getText(self._getContent(self.host + "/" + u))
            self.writer.writeChapter(n, text)
        self.writer.finalise()

if __name__ == '__main__':
    f = Downloader(sys.argv[1], 'sigizmund', '***************')
    if f._loginRequired():
        f._login()
    f.download()
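
To make the scraping in extractIndividualUrls easier to follow, here is what the grabUrl pattern pulls out; the sample line is invented but shaped like the chapter <option> rows the parser expects:

import re

grabUrl = re.compile('(\<option.+value=\")(.+?)\"\>(.+?)\<')

# invented sample of a chapter selector row
line = '<option value="chapter1.html">1. The Beginning</option>'
m = grabUrl.match(line)
print(m.group(2))   # chapter1.html
print(m.group(3))   # 1. The Beginning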

75
fictionalley.py Normal file

@@ -0,0 +1,75 @@
import os
import re
import sys
import shutil
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
class FictionAlley:
    def __init__(self):
        pass

    def extractIndividualUrls(self, data, host, contents):
        soup = bs.BeautifulStoneSoup(data)
        title = soup.find('title').string
        self.storyName = "-".join(title.split('-')[1:]).strip()

        authors = soup.findAll('a')
        # authorName is not scraped from the page here; callers are expected to set it (see __main__ below)
        print('Story "%s" by %s' % (self.storyName, self.authorName))

        links = soup.findAll('a', { 'class' : 'chapterlink' } )
        result = []
        for a in links:
            url = a['href']
            title = a.string
            result.append((url, title))
        return result

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName

    def getText(self, data, fetch = False):
        soup = bs.BeautifulStoneSoup(data)
        div = soup.find('div', {'id' : 'storytext'})
        if div is None:
            return '<html/>'
        return div.prettify()

    def getPrintableUrl(self, url):
        return url

    def getPasswordLine(self):
        return 'opaopapassword'

    def getLoginScript(self):
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        login = dict(login = 'name', password = 'pass')
        other = dict(submit = 'Log In', remember='yes')
        return (login, other)

if __name__ == '__main__':
    url = 'http://www.fictionalley.org/authors/drt/DA.html'
    data = u2.urlopen(url).read()
    host = up.urlparse(url).netloc
    fw = FictionAlley()
    fw.authorName = 'DrT'
    urls = fw.extractIndividualUrls(data, host, url)
    pp.pprint(urls)
    print(fw.getText(data))
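
getText above falls back to a '<html/>' stub when no storytext div is present; a tiny sketch of both paths against an invented snippet:

import fictionalley

# invented snippet shaped like a FictionAlley chapter page
sample = '<html><body><div id="storytext"><p>Once upon a time...</p></div></body></html>'
fa = fictionalley.FictionAlley()
print(fa.getText(sample))               # prettified storytext div
print(fa.getText('<html></html>'))      # no storytext div, so the '<html/>' stub comes back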

97
ficwad.py Normal file

@@ -0,0 +1,97 @@
import os
import re
import sys
import shutil
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
class FicWad:
    def __init__(self, url):
        self.url = url
        self.host = up.urlparse(url).netloc

    def requiresLogin(self, url):
        return False

    def performLogin(self, url):
        pass

    def setLogin(self, login):
        self.login = login

    def setPassword(self, password):
        self.password = password

    def extractIndividualUrls(self):
        data = u2.urlopen(self.url).read()
        soup = bs.BeautifulStoneSoup(data)

        title = soup.find('title').string
        self.storyName = title.split('::')[0].strip()

        author = soup.find('span', {'class' : 'author'})
        self.authorName = author.a.string

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        select = soup.find('select', { 'name' : 'goto' } )
        allOptions = select.findAll('option')
        result = []
        for o in allOptions:
            url = o['value']
            # if type(url) is unicode:
            #     url = url.encode('utf-8')
            title = o.string
            result.append((url, title))
        return result

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName

    def getText(self, url):
        print(type(url))
        if url.find('http://') == -1:
            url = 'http://' + self.host + '/' + url
        data = u2.urlopen(url).read()
        soup = bs.BeautifulStoneSoup(data)
        div = soup.find('div', {'id' : 'storytext'})
        if div is None:
            return '<html/>'
        return div.prettify()

    def getPrintableUrl(self, url):
        return url

    def getPasswordLine(self):
        return 'opaopapassword'

    def getLoginScript(self):
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        login = dict(login = 'name', password = 'pass')
        other = dict(submit = 'Log In', remember='yes')
        return (login, other)
if __name__ == '__main__':
    # the FicWad adapter fetches pages itself, so it only needs the story url
    url = 'http://www.ficwad.com/story/14536'
    fw = FicWad(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    print(fw.getText(urls[0][0]))
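
extractIndividualUrls above leans on three pieces of page markup: the <title>, the author <span>, and the goto <select>. A small sketch against an invented snippet shaped the same way:

import BeautifulSoup as bs

# invented snippet shaped like a FicWad story page
sample = '''<html><head><title>My Story :: FicWad</title></head>
<body><span class="author">by <a href="/a/1">someauthor</a></span>
<select name="goto">
  <option value="/story/14536">1. Prologue</option>
  <option value="/story/14537">2. Chapter One</option>
</select></body></html>'''

soup = bs.BeautifulStoneSoup(sample)
print(soup.find('title').string.split('::')[0].strip())      # My Story
print(soup.find('span', {'class' : 'author'}).a.string)      # someauthor
print([(o['value'], o.string) for o in soup.find('select', {'name' : 'goto'}).findAll('option')])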

136
output.py Normal file

@@ -0,0 +1,136 @@
import os
import re
import sys
import cgi
import uuid
import shutil
import base64
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
from constants import *
class FanficWriter:
    def __init__(self):
        pass

    def writeChapter(self, title, text):
        pass

    def finalise(self):
        pass

class HTMLWriter(FanficWriter):
    def __init__(self, base, name, author):
        pass

    def writeChapter(self, title, text):
        pass

    def finalise(self):
        pass

class EPubFanficWriter(FanficWriter):
    def __init__(self, base, name, author):
        self.chapters = []  # (title, fileName) pairs, filled in by writeChapter
        self.basePath = base
        self.name = name.replace(" ", "_")
        self.storyTitle = name
        self.directory = self.basePath + '/' + self.name
        self.authorName = author

        if os.path.exists(self.directory):
            shutil.rmtree(self.directory)

        os.mkdir(self.directory)
        os.mkdir(self.directory + '/META-INF')
        os.mkdir(self.directory + '/OEBPS')

        print >> open(self.directory + '/mimetype', 'w'), MIMETYPE
        print >> open(self.directory + '/META-INF/container.xml', 'w'), CONTAINER
        print >> open(self.directory + '/OEBPS/stylesheet.css', 'w'), CSS

    def _removeEntities(self, text):
        for e in entities:
            v = entities[e]
            text = text.replace(e, v)
        return text

    def writeChapter(self, title, text):
        fileName = base64.b64encode(title) + ".xhtml"
        filePath = self.directory + "/OEBPS/" + fileName
        f = open(filePath, 'w')

        text = self._removeEntities(text)

        self.soup = bs.BeautifulStoneSoup(text)

        # strip every attribute that is not in the whitelist
        allTags = self.soup.findAll(recursive=True)
        for t in allTags:
            for attr in t._getAttrMap().keys():
                if attr not in acceptable_attributes:
                    del t[attr]

        # drop tags whose only content is whitespace or a bare &nbsp;
        allPs = self.soup.findAll(recursive=True)
        for p in allPs:
            if p.string != None and (len(p.string.strip()) == 0 or p.string.strip() == '&nbsp;'):
                p.extract()

        # turn <br> and <hr> elements into <p> elements
        allBrs = self.soup.findAll(recursive=True, name = ["br", "hr"])
        for br in allBrs:
            if (br.string != None and len(br.string.strip()) != 0) or (br.contents != None):
                br.name = 'p'

        # cleanup(self.soup)

        text = self.soup.prettify()

        print >> f, XHTML_START % (title, title)
        print >> f, text
        print >> f, XHTML_END

        self.chapters.append((title, fileName))

    def finalise(self):
        ### writing table of contents -- ncx file
        tocFilePath = self.directory + "/OEBPS/toc.ncx"
        toc = open(tocFilePath, 'w')
        print >> toc, TOC_START % self.storyTitle

        ### writing content -- opf file
        opfFilePath = self.directory + "/OEBPS/content.opf"
        opf = open(opfFilePath, 'w')
        print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName)

        ids = []
        i = 0
        for t, f in self.chapters:
            chapterId = base64.b64encode(t)
            print >> toc, TOC_ITEM % (chapterId, i, cgi.escape(t), f)
            print >> opf, CONTENT_ITEM % (chapterId, f)
            ids.append(chapterId)
            i = i + 1

        print >> toc, TOC_END
        print >> opf, CONTENT_END_MANIFEST

        for chapterId in ids:
            print >> opf, CONTENT_ITEMREF % chapterId

        print >> opf, CONTENT_END

        # make sure the generated toc.ncx and content.opf are flushed to disk
        toc.close()
        opf.close()
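
The writer above leaves an exploded EPUB directory tree; this commit never packs it into a single .epub file. A sketch of that final step (not part of this code), following the usual convention of storing the mimetype entry first and uncompressed:

# not part of this commit -- a sketch of zipping the exploded directory into an .epub
import os
import zipfile

def pack_epub(directory, epub_path):
    z = zipfile.ZipFile(epub_path, 'w', zipfile.ZIP_DEFLATED)
    # the mimetype entry must come first and be stored uncompressed
    z.write(os.path.join(directory, 'mimetype'), 'mimetype', zipfile.ZIP_STORED)
    for root, dirs, files in os.walk(directory):
        for name in files:
            if name == 'mimetype':
                continue
            full = os.path.join(root, name)
            z.write(full, os.path.relpath(full, directory))
    z.close()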