Merging changes from trunk.

This commit is contained in:
Jim Miller 2012-02-24 18:42:52 -06:00
commit 6a83131a99
30 changed files with 324 additions and 108 deletions

View file

@ -1,6 +1,6 @@
# ffd-retief-hrd fanfictiondownloader
application: fanfictiondownloader
version: 4-3-2
application: ffd-retief-hrd
version: 4-3-3
runtime: python27
api_version: 1
threadsafe: true

View file

@ -15,6 +15,10 @@ from datetime import datetime
from PyQt4.Qt import (QApplication, QMenu, QToolButton)
from PyQt4.Qt import QPixmap, Qt
from PyQt4.QtCore import QBuffer
from calibre.ptempfile import PersistentTemporaryFile, PersistentTemporaryDirectory, remove_dir
from calibre.ebooks.metadata import MetaInformation, authors_to_string
from calibre.ebooks.metadata.meta import get_metadata
@ -30,6 +34,7 @@ from calibre_plugins.fanfictiondownloader_plugin.common_utils import (set_plugin
create_menu_action_unique, get_library_uuid)
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, writers, exceptions
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.htmlcleanup import stripHTML
from calibre_plugins.fanfictiondownloader_plugin.epubmerge import doMerge
from calibre_plugins.fanfictiondownloader_plugin.dcsource import get_dcsource
@ -93,6 +98,8 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
# are not found in the zip file will result in null QIcons.
icon = get_icon('images/icon.png')
#self.qaction.setText('FFDL')
# The qaction is automatically created from the action_spec defined
# above
self.qaction.setIcon(icon)
@ -408,7 +415,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
ffdlconfig = SafeConfigParser()
ffdlconfig.readfp(StringIO(get_resources("plugin-defaults.ini")))
ffdlconfig.readfp(StringIO(prefs['personal.ini']))
adapter = adapters.getAdapter(ffdlconfig,url)
adapter = adapters.getAdapter(ffdlconfig,url,fileform)
options['personal.ini'] = prefs['personal.ini']
@ -440,7 +447,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
book['author_sort'] = book['author'] = story.getMetadata("author", removeallentities=True)
book['publisher'] = story.getMetadata("site")
book['tags'] = writer.getTags()
book['comments'] = story.getMetadata("description") #, removeallentities=True) comments handles entities better.
book['comments'] = stripHTML(story.getMetadata("description")) #, removeallentities=True) comments handles entities better.
book['series'] = story.getMetadata("series")
# adapter.opener is the element with a threadlock. But del

View file

@ -110,7 +110,7 @@ def do_download_for_worker(book,options):
ffdlconfig.readfp(StringIO(get_resources("plugin-defaults.ini")))
ffdlconfig.readfp(StringIO(options['personal.ini']))
adapter = adapters.getAdapter(ffdlconfig,book['url'])
adapter = adapters.getAdapter(ffdlconfig,book['url'],options['fileform'])
adapter.is_adult = book['is_adult']
adapter.username = book['username']
adapter.password = book['password']

View file

@ -64,7 +64,7 @@ for x in imports():
#print x
__class_list.append(sys.modules[x].getClass())
def getAdapter(config,url):
def getAdapter(config,url,fileform=None):
## fix up leading protocol.
fixedurl = re.sub(r"(?i)^[htp]+[:/]+","http://",url.strip())
if not fixedurl.startswith("http"):
@ -89,6 +89,7 @@ def getAdapter(config,url):
fixedurl = fixedurl.replace("http://","http://www.")
if cls:
adapter = cls(config,fixedurl) # raises InvalidStoryURL
adapter.setSectionOrder(adapter.getSiteDomain(),fileform)
return adapter
# No adapter found.
raise exceptions.UnknownSite( url, [cls.getSiteDomain() for cls in __class_list] )

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
@ -133,7 +133,8 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
# sometimes a poorly formatted desc (<p> w/o </p>) leads
# to all labels being included.
svalue=svalue[:svalue.find('<span class="label">')]
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -220,7 +221,7 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return AdAstraFanficComSiteAdapter

View file

@ -24,7 +24,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
return ArchiveOfOurOwnOrgAdapter
@ -126,7 +126,8 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
a = metasoup.find('blockquote',{'class':'userstuff'})
if a != None:
self.story.setMetadata('description',a.text)
self.setDescription(url,a.text)
#self.story.setMetadata('description',a.text)
a = metasoup.find('dd',{'class':"rating tags"})
if a != None:
@ -213,10 +214,11 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
# grab the text for an individual chapter.
def getChapterText(self, url):
logging.debug('Getting chapter text from: %s' % url)
print('Getting chapter text from: %s' % url)
chapter=bs.BeautifulSoup('<div class="story"></div>')
soup = bs.BeautifulSoup(self._fetchUrl(url),selfClosingTags=('br','hr'))
data = self._fetchUrl(url)
soup = bs.BeautifulSoup(data,selfClosingTags=('br','hr'))
headnotes = soup.find('div', {'class' : "preface group"}).find('div', {'class' : "notes module"})
if headnotes != None:
@ -257,5 +259,5 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
if None == soup:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(chapter)
return self.utf8FromSoup(url,chapter)

View file

@ -24,7 +24,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
# By virtue of being recent and requiring both is_adult and user/pass,
# adapter_fanficcastletvnet.py is the best choice for learning to
@ -218,7 +218,8 @@ class CastleFansOrgAdapter(BaseSiteAdapter): # XXX
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -305,4 +306,4 @@ class CastleFansOrgAdapter(BaseSiteAdapter): # XXX
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)

View file

@ -24,7 +24,7 @@ import time
from .. import BeautifulSoup as bs
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class FanFictionNetSiteAdapter(BaseSiteAdapter):
@ -153,7 +153,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
if 'title_t' in var:
self.story.setMetadata('title', value)
if 'summary' in var:
self.story.setMetadata('description', value)
self.setDescription(url,value)
#self.story.setMetadata('description', value)
if 'datep' in var:
self.story.setMetadata('datePublished',makeDate(value, '%m-%d-%y'))
if 'dateu' in var:
@ -270,7 +271,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
logging.debug('div id=storytext not found. data:%s'%data)
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)
def getClass():
return FanFictionNetSiteAdapter

View file

@ -26,7 +26,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
@ -201,7 +201,8 @@ class FicBookNetAdapter(BaseSiteAdapter):
break
summary=soup.find('span', {'class' : 'urlize'})
self.story.setMetadata('description', summary.text)
self.setDescription(url,summary.text)
#self.story.setMetadata('description', summary.text)
# grab the text for an individual chapter.
def getChapterText(self, url):
@ -218,4 +219,4 @@ class FicBookNetAdapter(BaseSiteAdapter):
if None == chapter:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(chapter)
return self.utf8FromSoup(url,chapter)

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
@ -187,7 +187,8 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
for small in storydd.findAll('small'):
small.extract() ## removes the <small> tags, leaving only the summary.
self.story.setMetadata('description',stripHTML(storydd))
self.setDescription(url,storydd)
#self.story.setMetadata('description',stripHTML(storydd))
return
@ -223,7 +224,7 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
if not data or not text:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(text)
return self.utf8FromSoup(url,text)
def getClass():
return FictionAlleyOrgSiteAdapter

View file

@ -26,7 +26,7 @@ from .. import BeautifulSoup as bs
from .. import exceptions as exceptions
from ..htmlcleanup import stripHTML
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class FicwadComSiteAdapter(BaseSiteAdapter):
@ -124,7 +124,8 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
# description
storydiv = soup.find("div",{"id":"story"})
self.story.setMetadata('description', storydiv.find("blockquote",{'class':'summary'}).p.string)
self.setDescription(url,storydiv.find("blockquote",{'class':'summary'}).p.string)
#self.story.setMetadata('description', storydiv.find("blockquote",{'class':'summary'}).p.string)
# most of the meta data is here:
metap = storydiv.find("p",{"class":"meta"})
@ -209,7 +210,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return FicwadComSiteAdapter

View file

@ -26,7 +26,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
return FimFictionNetSiteAdapter
@ -141,7 +141,15 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
description_soup.find('a', {"class":"more"}).extract()
except:
pass
self.story.setMetadata('description', description_soup.text)
story_img = soup.find('img',{'class':'story_image'})
if self.getConfig('keep_summary_html') and \
self.getConfig('include_images') and \
story_img:
self.setDescription(self.url,"%s<br/>%s"%(story_img,description_soup.text))
else:
self.setDescription(self.url,description_soup.text)
#self.story.setMetadata('description', description_soup.text)
# Unfortunately, nowhere on the page is the year mentioned.
# Best effort to deal with this:
@ -171,5 +179,5 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
soup = bs.BeautifulSoup(self._fetchUrl(url),selfClosingTags=('br','hr')).find('div', {'id' : 'chapter_container'})
if soup == None:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(soup)
return self.utf8FromSoup(url,soup)

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
@ -125,7 +125,8 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
## Finding the metadata is a bit of a pain. Desc is the only thing this color.
desctable= soup.find('table',{'bgcolor':'#f0e8e8'})
self.story.setMetadata('description',stripHTML(desctable))
self.setDescription(url,desctable)
#self.story.setMetadata('description',stripHTML(desctable))
## Finding the metadata is a bit of a pain. Most of the meta
## data is in a center.table without a bgcolor.
@ -193,7 +194,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)
def getClass():
return HarryPotterFanFictionComSiteAdapter

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
@ -174,7 +174,8 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# Summary: ....
m = re.match(r".*?Summary: (.*)$",metastr)
if m:
self.story.setMetadata('description', m.group(1))
self.setDescription(url, m.group(1))
#self.story.setMetadata('description', m.group(1))
# completed
m = re.match(r".*?Status: Completed.*?",metastr)
@ -210,7 +211,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
del div['style']
del div['align']
anchor.name='div'
return utf8FromSoup(anchor)
return self.utf8FromSoup(url,anchor)
else:
logging.debug('Using kludgey text find for older mediaminer story.')
@ -226,7 +227,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
soup.findAll('table',{'class':'tbbrdr'}):
tag.extract() # remove tag from soup.
return utf8FromSoup(soup)
return self.utf8FromSoup(url,soup)
def getClass():

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
@ -131,7 +131,8 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
while not defaultGetattr(value,'class') == 'listbox':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -209,7 +210,7 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)
def getClass():
return PotionsAndSnitchesNetSiteAdapter

View file

@ -24,7 +24,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
# This function is called by the downloader in all adapter_*.py files
# in this dir to register the adapter class. So it needs to be
@ -227,7 +227,8 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
if part.startswith("Summary:"):
part = part[part.find(':')+1:]
self.story.setMetadata('description',part)
self.setDescription(url,part)
#self.story.setMetadata('description',part)
# want to get the next tr of the table.
#print("%s"%titlea.parent.parent.findNextSibling('tr'))
@ -295,4 +296,4 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
if None == story:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(story)
return self.utf8FromSoup(url,story)

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class TenhawkPresentsComSiteAdapter(BaseSiteAdapter):
@ -164,7 +164,8 @@ class TenhawkPresentsComSiteAdapter(BaseSiteAdapter):
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -238,7 +239,7 @@ class TenhawkPresentsComSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return TenhawkPresentsComSiteAdapter

View file

@ -22,7 +22,7 @@ import logging
from .. import BeautifulSoup as bs
from .. import exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class TestSiteAdapter(BaseSiteAdapter):
@ -191,7 +191,7 @@ horizontal rules
</div>
'''
soup = bs.BeautifulStoneSoup(text,selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
return utf8FromSoup(soup)
return self.utf8FromSoup(url,soup)
def getClass():
return TestSiteAdapter

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
@ -166,7 +166,7 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -245,7 +245,7 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return TheWritersCoffeeShopComSiteAdapter

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
@ -127,6 +127,8 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
else:
raise e
descurl = url
if "<h2>Story Not Found</h2>" in data:
raise exceptions.StoryDoesNotExist(url)
@ -154,12 +156,14 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
# going to pull part of the meta data from author list page.
logging.debug("**AUTHOR** URL: "+self.story.getMetadata('authorUrl'))
authordata = self._fetchUrl(self.story.getMetadata('authorUrl'))
descurl=self.story.getMetadata('authorUrl')
authorsoup = bs.BeautifulSoup(authordata)
# author can have several pages, scan until we find it.
while( not authorsoup.find('a', href=re.compile(r"^/Story-"+self.story.getMetadata('storyId'))) ):
nextpage = 'http://'+self.host+authorsoup.find('a', {'class':'arrowf'})['href']
logging.debug("**AUTHOR** nextpage URL: "+nextpage)
authordata = self._fetchUrl(nextpage)
descurl=nextpage
authorsoup = bs.BeautifulSoup(authordata)
except urllib2.HTTPError, e:
if e.code == 404:
@ -168,7 +172,8 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
raise e
storydiv = authorsoup.find('div', {'id':'st'+self.story.getMetadata('storyId'), 'class':re.compile(r"storylistitem")})
self.story.setMetadata('description',stripHTML(storydiv.find('div',{'class':'storydesc'})))
self.setDescription(descurl,storydiv.find('div',{'class':'storydesc'}))
#self.story.setMetadata('description',stripHTML(storydiv.find('div',{'class':'storydesc'})))
self.story.setMetadata('title',stripHTML(storydiv.find('a',{'class':'storylink'})))
verticaltable = soup.find('table', {'class':'verticaltable'})
@ -238,7 +243,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
div.find('h3').extract()
except:
pass
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)
def getClass():
return TwistingTheHellmouthSiteAdapter

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class TwilightedNetSiteAdapter(BaseSiteAdapter):
@ -162,7 +162,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -243,7 +243,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return TwilightedNetSiteAdapter

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class TwiwriteNetSiteAdapter(BaseSiteAdapter):
@ -169,7 +169,8 @@ class TwiwriteNetSiteAdapter(BaseSiteAdapter):
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -255,7 +256,7 @@ class TwiwriteNetSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return TwiwriteNetSiteAdapter

View file

@ -23,7 +23,7 @@ import urllib2
from .. import BeautifulSoup as bs
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class WhoficComSiteAdapter(BaseSiteAdapter):
@ -120,9 +120,10 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
# link instead to find the appropriate metadata.
a = soup.find('a', href=re.compile(r'reviews.php\?sid='+self.story.getMetadata('storyId')))
metadata = a.findParent('td')
metadatachunks = utf8FromSoup(metadata).split('<br />')
metadatachunks = self.utf8FromSoup(None,metadata).split('<br />')
# process metadata for this story.
self.story.setMetadata('description', metadatachunks[1])
self.setDescription(url,metadatachunks[1])
#self.story.setMetadata('description', metadatachunks[1])
# First line of the stuff with ' - ' separators
moremeta = metadatachunks[2]
@ -224,7 +225,7 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return WhoficComSiteAdapter

View file

@ -23,6 +23,9 @@ import urllib
import urllib2 as u2
import urlparse as up
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
try:
from google.appengine.api import apiproxy_stub_map
def urlfetch_timeout_hook(service, call, request, response):
@ -66,8 +69,9 @@ class BaseSiteAdapter(Configurable):
def __init__(self, config, url):
self.config = config
Configurable.__init__(self, config)
self.addConfigSection(self.getSiteDomain())
self.addConfigSection("overrides")
self.setSectionOrder(self.getSiteDomain())
# self.addConfigSection(self.getSiteDomain())
# self.addConfigSection("overrides")
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
@ -150,6 +154,12 @@ class BaseSiteAdapter(Configurable):
headers=headers)
return self._decode(self.opener.open(req).read())
def _fetchUrlRaw(self, url, parameters=None):
if parameters != None:
return self.opener.open(url,urllib.urlencode(parameters)).read()
else:
return self.opener.open(url).read()
# parameters is a dict()
def _fetchUrl(self, url, parameters=None):
if self.getConfig('slow_down_sleep_time'):
@ -159,10 +169,7 @@ class BaseSiteAdapter(Configurable):
for sleeptime in [0, 0.5, 4, 9]:
time.sleep(sleeptime)
try:
if parameters:
return self._decode(self.opener.open(url,urllib.urlencode(parameters)).read())
else:
return self._decode(self.opener.open(url).read())
return self._decode(self._fetchUrlRaw(url,parameters))
except Exception, e:
excpt=e
logging.warn("Caught an exception reading URL: %s Exception %s."%(unicode(url),unicode(e)))
@ -235,6 +242,49 @@ class BaseSiteAdapter(Configurable):
if self.getConfig('collect_series'):
self.story.setMetadata('series','%s [%s]'%(name, num))
def setDescription(self,url,svalue):
    # Store the story description ('description' metadata).
    #
    # With the 'keep_summary_html' option on, the description is kept
    # as cleaned HTML: a plain string value is first parsed into a
    # soup so utf8FromSoup can strip disallowed attributes (and grab
    # images, if configured).  Otherwise it is flattened to text.
    if self.getConfig('keep_summary_html'):
        if isinstance(svalue,(str,unicode)):
            svalue = bs.BeautifulSoup(svalue)
        self.story.setMetadata('description',self.utf8FromSoup(url,svalue))
    else:
        self.story.setMetadata('description',stripHTML(svalue))
# this gives us a unicode object, not just a string containing bytes.
# (I gave soup a unicode string, you'd think it could give it back...)
# Serialize *soup* to a unicode string of cleaned (X)HTML.
# Strips every tag attribute not in an allowed list, optionally
# downloads and rewrites <img src> (when 'include_images' is set),
# converts <u>/<center> to span/div for strict XHTML, and removes
# paired-but-empty tags.
# NOTE(review): indentation was lost in this paste; code text is kept
# verbatim.
def utf8FromSoup(self,url,soup):
acceptable_attributes = ['href','name']
#print("include_images:"+self.getConfig('include_images'))
if self.getConfig('include_images'):
# Keep image attributes and localize each image via
# Story.addImgUrl; the raw fetch method is passed in so the
# image request reuses this adapter's opener (cookies etc.).
acceptable_attributes.extend(('src','alt'))
for img in soup.findAll('img'):
img['src']=self.story.addImgUrl(self,url,img['src'],self._fetchUrlRaw)
# Strip disallowed attributes from the top-level tag itself...
for attr in soup._getAttrMap().keys():
if attr not in acceptable_attributes:
del soup[attr] ## strip all tag attributes except href and name
# ...and from every nested tag.
for t in soup.findAll(recursive=True):
for attr in t._getAttrMap().keys():
if attr not in acceptable_attributes:
del t[attr] ## strip all tag attributes except href and name
# these are not acceptable strict XHTML. But we do already have
# CSS classes of the same names defined in constants.py
# NOTE(review): ('u') and ('center') are plain strings, not 1-tuples,
# so `in` is a substring test here; it happens to match only the
# intended tag names, but a trailing comma would make that explicit.
if t.name in ('u'):
t['class']=t.name
t.name='span'
if t.name in ('center'):
t['class']=t.name
t.name='div'
# removes paired, but empty tags.
if t.string != None and len(t.string.strip()) == 0 :
t.extract()
# BeautifulSoup 3's __str__('utf8') returns utf-8 bytes; decode so
# callers always receive a unicode object.
return soup.__str__('utf8').decode('utf-8')
fullmon = {"January":"01", "February":"02", "March":"03", "April":"04", "May":"05",
"June":"06","July":"07", "August":"08", "September":"09", "October":"10",
"November":"11", "December":"12" }
@ -245,7 +295,9 @@ def makeDate(string,format):
# fudge english month names for people whose locale is set to
# non-english. All our current sites date in english, even if
# there's non-english content.
# there's non-english content. -- ficbook.net now makes that a
# lie. It has to do something even more complicated to get
# Russian month names correct everywhere.
do_abbrev = "%b" in format
if "%B" in format or do_abbrev:
@ -259,24 +311,3 @@ def makeDate(string,format):
return datetime.datetime.strptime(string,format)
acceptable_attributes = ['href','name']
# this gives us a unicode object, not just a string containing bytes.
# (I gave soup a unicode string, you'd think it could give it back...)
def utf8FromSoup(soup):
for t in soup.findAll(recursive=True):
for attr in t._getAttrMap().keys():
if attr not in acceptable_attributes:
del t[attr] ## strip all tag attributes except href and name
# these are not acceptable strict XHTML. But we do already have
# CSS classes of the same names defined in constants.py
if t.name in ('u'):
t['class']=t.name
t.name='span'
if t.name in ('center'):
t['class']=t.name
t.name='div'
# removes paired, but empty tags.
if t.string != None and len(t.string.strip()) == 0 :
t.extract()
return soup.__str__('utf8').decode('utf-8')

View file

@ -21,16 +21,21 @@ import ConfigParser
# inherit from Configurable. The config file(s) uses ini format:
# [sections] with key:value settings.
#
# There's a [defaults] section which is overridden by the writer's
# section [epub], which is overridden by the adapter's section for
# each site.
# writer does [defaults], [www.whofic.com], [epub], [www.whofic.com:epub], [overrides]
#
# Until a write is created, the adapter only has [defaults], [www.whofic.com], [overrides]
#
# [defaults]
# titlepage_entries: category,genre, status
# [epub]
# titlepage_entries: category,genre, status,datePublished,dateUpdated,dateCreated
# [www.whofic.com]
# titlepage_entries: category,genre, status,dateUpdated,rating
# [epub]
# titlepage_entries: category,genre, status,datePublished,dateUpdated,dateCreated
# [www.whofic.com:epub]
# titlepage_entries: category,genre, status,datePublished
# [overrides]
# titlepage_entries: category
class Configurable(object):
@ -38,6 +43,14 @@ class Configurable(object):
self.config = config
self.sectionslist = ['defaults']
def setSectionOrder(self,site,fileform=None):
    # Rebuild the config-section lookup order from scratch.
    #
    # Resulting priority (highest first):
    #   overrides, site:fileform, fileform, site, defaults
    # The fileform-specific sections are only present once a fileform
    # is known (i.e. after a writer has been chosen).
    self.sectionslist = ['defaults']
    sections = [site]
    if fileform:
        sections.append(fileform)
        sections.append("%s:%s" % (site, fileform))
    sections.append("overrides")
    for section in sections:
        self.addConfigSection(section)
def addConfigSection(self,section):
    # Prepend *section* so it outranks every previously added section.
    self.sectionslist[:0] = [section]

View file

@ -16,9 +16,27 @@
#
import os, re
import urlparse
from base64 import b64encode
from htmlcleanup import conditionalRemoveEntities, removeAllEntities
# Create convert_image method depending on which graphics lib we can
# load. Preferred: calibre, PIL, none
try:
from calibre.utils.magick.draw import minify_image
def convert_image(data,sizes,grayscale):
img = minify_image(data, minify_to=sizes)
if grayscale:
img.type = "GrayscaleType"
return img.export('JPG')
except:
# Problem: writer_epub assumes image is jpg.
def convert_image(data,sizes,grayscale):
return data
# The list comes from ffnet, the only multi-language site we support
# at the time of writing. Values are taken largely from pycountry,
# but with some corrections and guesses.
@ -72,6 +90,8 @@ class Story:
self.metadata = {'version':'4.3'}
self.replacements = []
self.chapters = [] # chapters will be tuples of (title,html)
self.imgurls = []
self.imgurldata = []
self.listables = {} # some items (extratags, category, warnings & genres) are also kept as lists.
def setMetadata(self, key, value):
@ -153,6 +173,57 @@ class Story:
def getChapters(self):
"Chapters will be tuples of (title,html)"
return self.chapters
# Register an image URL for this story and return the local path it
# will be stored under ("images/<b64-of-url>.jpg").
# *fetch* is passed in from the adapter so the image request reuses
# the adapter's opener (and any cookies it has collected) -- this is
# a base story-class method and has no opener of its own.
def addImgUrl(self,configurable,parenturl,url,fetch):
# Resolve relative image URLs against parenturl.
if url.startswith("http") :
imgurl = url
elif parenturl != None:
parsedUrl = urlparse.urlparse(parenturl)
if url.startswith("/") :
# Host-absolute path: keep scheme+netloc, replace the path.
imgurl = urlparse.urlunparse(
(parsedUrl.scheme,
parsedUrl.netloc,
url,
'','',''))
else:
# Relative path: appended directly to the parent's path.
# NOTE(review): no '/' is inserted and the parent's file name
# is not stripped -- confirm callers always pass a
# directory-like parenturl.
imgurl = urlparse.urlunparse(
(parsedUrl.scheme,
parsedUrl.netloc,
parsedUrl.path + url,
'','',''))
# NOTE(review): if url is not http and parenturl is None, imgurl is
# never bound and the b64encode below raises NameError -- verify
# callers never hit that combination.
# using b64 encode of the url means that the same image ends
# up with the same name both now, in different chapters, and
# later with new update chapters. Numbering them didn't do
# that.
newsrc = "images/%s.jpg"%(b64encode(imgurl))
# Only fetch and convert each distinct image once; imgurls and
# imgurldata are appended in lockstep (parallel lists).
if imgurl not in self.imgurls:
self.imgurls.append(imgurl)
parsedUrl = urlparse.urlparse(imgurl)
# newsrc = "images/%s.jpg"%(
# self.imgurls.index(imgurl))
sizes = [ int(x) for x in configurable.getConfigList('image_max_size') ]
data = convert_image(fetch(imgurl),
sizes,
configurable.getConfig('grayscale_images'))
#print("\nimgurl\nimage size:%d\n"%len(data))
self.imgurldata.append((newsrc,data))
# else:
# newsrc = "images/%s.jpg"%(
# self.imgurls.index(imgurl))
#print("===============\n%s\nimg url:%s\n============"%(newsrc,self.imgurls[-1]))
return newsrc
def getImgUrls(self):
retlist = []
for i, url in enumerate(self.imgurls):
parsedUrl = urlparse.urlparse(url)
retlist.append(self.imgurldata[i])
return retlist
def __str__(self):
return "Metadata: " +str(self.metadata) + "\nListables: " +str(self.listables) #+ "\nChapters: "+str(self.chapters)

View file

@ -39,10 +39,11 @@ class BaseStoryWriter(Configurable):
def __init__(self, config, adapter):
Configurable.__init__(self, config)
self.addConfigSection(adapter.getSiteDomain())
self.addConfigSection(self.getFormatName())
self.addConfigSection(adapter.getSiteDomain()+":"+self.getFormatName())
self.addConfigSection("overrides")
self.setSectionOrder(adapter.getSiteDomain(),self.getFormatName())
# self.addConfigSection(adapter.getSiteDomain())
# self.addConfigSection(self.getFormatName())
# self.addConfigSection(adapter.getSiteDomain()+":"+self.getFormatName())
# self.addConfigSection("overrides")
self.adapter = adapter
self.story = adapter.getStoryMetadataOnly() # only cache the metadata initially.
@ -144,7 +145,7 @@ class BaseStoryWriter(Configurable):
def _write(self, out, text):
out.write(text.encode('utf8'))
def writeTitlePage(self, out, START, ENTRY, END, WIDE_ENTRY=None):
def writeTitlePage(self, out, START, ENTRY, END, WIDE_ENTRY=None, NO_TITLE_ENTRY=None):
"""
Write the title page, but only include entries that there's
        metadata for. START, ENTRY and END are expected to already be
@ -171,6 +172,12 @@ class BaseStoryWriter(Configurable):
label=self.getConfig(entry+"_label")
else:
label=self.titleLabels[entry]
# If the label for the title entry is empty, use the
# 'no title' option if there is one.
if label == "" and NO_TITLE_ENTRY:
TEMPLATE= NO_TITLE_ENTRY
self._write(out,TEMPLATE.substitute({'label':label,
'value':self.story.getMetadata(entry)}))

View file

@ -20,6 +20,7 @@ import string
import StringIO
import zipfile
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
import urllib
## XML isn't as forgiving as HTML, so rather than generate as strings,
## use DOM to generate the XML files.
@ -57,6 +58,10 @@ class EpubWriter(BaseStoryWriter):
self.EPUB_TITLE_ENTRY = string.Template('''
<b>${label}:</b> ${value}<br />
''')
self.EPUB_NO_TITLE_ENTRY = string.Template('''
${value}<br />
''')
self.EPUB_TITLE_PAGE_END = string.Template('''
@ -84,6 +89,10 @@ class EpubWriter(BaseStoryWriter):
self.EPUB_TABLE_TITLE_WIDE_ENTRY = string.Template('''
<tr><td colspan="2"><b>${label}:</b> ${value}</td></tr>
''')
self.EPUB_TABLE_NO_TITLE_ENTRY = string.Template('''
<tr><td colspan="2">${label}${value}</td></tr>
''')
self.EPUB_TABLE_TITLE_PAGE_END = string.Template('''
@ -268,6 +277,24 @@ class EpubWriter(BaseStoryWriter):
title))
itemrefs.append("file%04d"%i)
if self.getConfig('include_images'):
#from calibre.utils.magick.draw import minify_image
imgcount=0
sizes = [ int(x) for x in self.getConfigList('image_max_size') ]
for (newsrc,data) in self.story.getImgUrls():
imgfile = "OEBPS/"+newsrc
# saveimg = minify_image(data, minify_to=sizes)
# if self.getConfig('grayscale_images'):
# saveimg.type = "GrayscaleType"
# outputepub.writestr(imgfile,saveimg.export('JPG'))
outputepub.writestr(imgfile,data)
items.append(("image%04d"%imgcount,
imgfile,
"image/jpeg",
None))
imgcount+=1
manifest = contentdom.createElement("manifest")
package.appendChild(manifest)
for item in items:
@ -346,11 +373,13 @@ class EpubWriter(BaseStoryWriter):
TITLE_PAGE_START = self.EPUB_TABLE_TITLE_PAGE_START
TITLE_ENTRY = self.EPUB_TABLE_TITLE_ENTRY
WIDE_TITLE_ENTRY = self.EPUB_TABLE_TITLE_WIDE_ENTRY
NO_TITLE_ENTRY = self.EPUB_TABLE_NO_TITLE_ENTRY
TITLE_PAGE_END = self.EPUB_TABLE_TITLE_PAGE_END
else:
TITLE_PAGE_START = self.EPUB_TITLE_PAGE_START
TITLE_ENTRY = self.EPUB_TITLE_ENTRY
WIDE_TITLE_ENTRY = self.EPUB_TITLE_ENTRY # same, only wide in tables.
NO_TITLE_ENTRY = self.EPUB_NO_TITLE_ENTRY
TITLE_PAGE_END = self.EPUB_TITLE_PAGE_END
titlepageIO = StringIO.StringIO()
@ -358,7 +387,8 @@ class EpubWriter(BaseStoryWriter):
START=TITLE_PAGE_START,
ENTRY=TITLE_ENTRY,
WIDE_ENTRY=WIDE_TITLE_ENTRY,
END=TITLE_PAGE_END)
END=TITLE_PAGE_END,
NO_TITLE_ENTRY=NO_TITLE_ENTRY)
if titlepageIO.getvalue(): # will be false if no title page.
outputepub.writestr("OEBPS/title_page.xhtml",titlepageIO.getvalue())
titlepageIO.close()
@ -384,7 +414,7 @@ class EpubWriter(BaseStoryWriter):
fullhtml = fullhtml.replace('</p>','</p>\n').replace('<br />','<br />\n')
outputepub.writestr("OEBPS/file%04d.xhtml"%(index+1),fullhtml.encode('utf-8'))
del fullhtml
# declares all the files created by Windows. otherwise, when
# it runs in appengine, windows unzips the files as 000 perms.
for zf in outputepub.filelist:

View file

@ -49,6 +49,10 @@ class MobiWriter(BaseStoryWriter):
self.MOBI_TITLE_ENTRY = string.Template('''
<b>${label}:</b> ${value}<br />
''')
self.MOBI_NO_TITLE_ENTRY = string.Template('''
${value}<br />
''')
self.MOBI_TITLE_PAGE_END = string.Template('''
@ -75,6 +79,10 @@ class MobiWriter(BaseStoryWriter):
self.MOBI_TABLE_TITLE_WIDE_ENTRY = string.Template('''
<tr><td colspan="2"><b>${label}:</b> ${value}</td></tr>
''')
self.MOBI_TABLE_NO_TITLE_WIDE_ENTRY = string.Template('''
<tr><td colspan="2">${value}</td></tr>
''')
self.MOBI_TABLE_TITLE_PAGE_END = string.Template('''
@ -129,11 +137,13 @@ class MobiWriter(BaseStoryWriter):
TITLE_PAGE_START = self.MOBI_TABLE_TITLE_PAGE_START
TITLE_ENTRY = self.MOBI_TABLE_TITLE_ENTRY
WIDE_TITLE_ENTRY = self.MOBI_TABLE_TITLE_WIDE_ENTRY
NO_TITLE_ENTRY = self.MOBI_TABLE_NO_TITLE_ENTRY
TITLE_PAGE_END = self.MOBI_TABLE_TITLE_PAGE_END
else:
TITLE_PAGE_START = self.MOBI_TITLE_PAGE_START
TITLE_ENTRY = self.MOBI_TITLE_ENTRY
WIDE_TITLE_ENTRY = self.MOBI_TITLE_ENTRY # same, only wide in tables.
NO_TITLE_ENTRY = self.MOBI_NO_TITLE_ENTRY
TITLE_PAGE_END = self.MOBI_TITLE_PAGE_END
titlepageIO = StringIO.StringIO()
@ -141,7 +151,8 @@ class MobiWriter(BaseStoryWriter):
START=TITLE_PAGE_START,
ENTRY=TITLE_ENTRY,
WIDE_ENTRY=WIDE_TITLE_ENTRY,
END=TITLE_PAGE_END)
END=TITLE_PAGE_END,
NO_TITLE_ENTRY=NO_TITLE_ENTRY)
if titlepageIO.getvalue(): # will be false if no title page.
files.append(titlepageIO.getvalue())
titlepageIO.close()

View file

@ -213,6 +213,23 @@ output_css:
.u {text-decoration: underline;}
.bold {font-weight: bold;}
## include images from img tags in the body and summary of
## stories
#include_images:false
## Resize images down to width, height, preserving aspect ratio.
## Nook size, with margin.
#image_max_size: 580, 725
## Change image to grayscale, if graphics library allows, to save
## space.
#grayscale_images: false
## If not set, the summary will have all html stripped for safety.
## Both this and include_images must be true to get images in the
## summary.
#keep_summary_html:false
[mobi]
## mobi TOC cannot be turned off right now.
#include_tocpage: true