Add replace_hr & never_make_cover options, allow empty fimfiction stories.

This commit is contained in:
Jim Miller 2012-02-29 12:44:12 -06:00
parent 6528b3c9a5
commit 189832a7a9
8 changed files with 50 additions and 18 deletions

View file

@ -27,7 +27,7 @@ class FanFictionDownLoaderBase(InterfaceActionBase):
description = 'UI plugin to download FanFiction stories from various sites.'
supported_platforms = ['windows', 'osx', 'linux']
author = 'Jim Miller'
version = (1, 5, 2)
version = (1, 5, 3)
minimum_calibre_version = (0, 8, 30)
#: This field defines the GUI plugin class that contains all the code

View file

@ -147,6 +147,11 @@ extratags: FanFiction
# (.*)Great(.*)=>\1Moderate\2
# .*-Centered=>
## Some readers don't show horizontal rule (<hr />) tags correctly.
## This replaces them all with a centered '* * *'. (Note centering
## doesn't work on some devices either.)
#replace_hr: false
## Each output format has a section that overrides [defaults]
[html]
@ -244,6 +249,10 @@ output_css:
## in chapters.
#make_firstimage_cover: false
## If set, the epub will never have a cover, even if include_images is on
## and the site has specific cover images.
#never_make_cover: false
## If set, and there isn't already a cover image from the adapter or
## from make_firstimage_cover, this image will be made the cover.
## It can be either a 'file:' or 'http:' url.

View file

@ -102,7 +102,7 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
for chapter in soup.findAll("a", {"class":"chapter_link"}):
chapterDates.append(chapter.span.extract().text.strip("()"))
self.chapterUrls.append((chapter.text.strip(), "http://"+self.getSiteDomain() + chapter['href']))
self.story.setMetadata('numChapters',len(self.chapterUrls))
for character in [character_icon['title'] for character_icon in soup.findAll("a", {"class":"character_icon"})]:
@ -157,14 +157,6 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
now = datetime.datetime.now()
# Get the date of creation from the first chapter
datePublished_text = chapterDates[0]
day, month = datePublished_text.split()
day = re.sub(r"[^\d.]+", '', day)
datePublished = makeDate("%s%s%s"%(now.year,month,day), "%Y%b%d")
if datePublished > now :
datePublished = datePublished.replace(year=now.year-1)
self.story.setMetadata("datePublished", datePublished)
dateUpdated_soup = bs.BeautifulSoup(data).find("div", {"class":"calendar"})
dateUpdated_soup.find('span').extract()
dateUpdated = makeDate("%s%s"%(now.year,dateUpdated_soup.text), "%Y%b%d")
@ -172,6 +164,18 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
dateUpdated = datePublished.replace(year=now.year-1)
self.story.setMetadata("dateUpdated", dateUpdated)
# Get the date of creation from the first chapter
if len(chapterDates) > 0:
datePublished_text = chapterDates[0]
day, month = datePublished_text.split()
day = re.sub(r"[^\d.]+", '', day)
datePublished = makeDate("%s%s%s"%(now.year,month,day), "%Y%b%d")
if datePublished > now :
datePublished = datePublished.replace(year=now.year-1)
self.story.setMetadata("datePublished", datePublished)
else:
self.story.setMetadata("datePublished", dateUpdated)
def getChapterText(self, url):
logging.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulSoup(self._fetchUrl(url),selfClosingTags=('br','hr')).find('div', {'id' : 'chapter_container'})

View file

@ -186,7 +186,7 @@ br breaks<br><br>
br breaks<br><br>
<hr>
horizontal rules
<hr>
<hr size=1 noshade>
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
</div>

View file

@ -272,8 +272,9 @@ class BaseSiteAdapter(Configurable):
self.story.setMetadata('description',stripHTML(svalue))
#print("\n\ndescription:\n"+self.story.getMetadata('description')+"\n\n")
# this gives us a unicode object, not just a string containing bytes.
# This gives us a unicode object, not just a string containing bytes.
# (I gave soup a unicode string, you'd think it could give it back...)
# Now also does a bunch of other common processing for us.
def utf8FromSoup(self,url,soup,fetch=None):
if not fetch:
fetch=self._fetchUrlRaw
@ -294,9 +295,9 @@ class BaseSiteAdapter(Configurable):
for attr in t._getAttrMap().keys():
if attr not in acceptable_attributes:
del t[attr] ## strip all tag attributes except href and name
# these are not acceptable strict XHTML. But we do already have
# CSS classes of the same names defined in constants.py
# CSS classes of the same names defined
if t.name in ('u'):
t['class']=t.name
t.name='span'
@ -307,9 +308,16 @@ class BaseSiteAdapter(Configurable):
if t.string != None and len(t.string.strip()) == 0 :
t.extract()
retval = soup.__str__('utf8').decode('utf-8')
if self.getConfig('replace_hr'):
# replacing a self-closing tag with a container tag in the
# soup is more difficult than it first appears. So cheat.
retval = retval.replace("<hr />","<div class='center'>* * *</div>")
# Don't want body tags in chapter html--writers add them.
# This is primarily for epub updates.
return re.sub(r"</?body>\r?\n?","",soup.__str__('utf8').decode('utf-8'))
return re.sub(r"</?body>\r?\n?","",retval)
fullmon = {"January":"01", "February":"02", "March":"03", "April":"04", "May":"05",
"June":"06","July":"07", "August":"08", "September":"09", "October":"10",

View file

@ -51,7 +51,7 @@ def get_update_data(inputio,
# Hellmouth, which uses chapter0.html.
if( item.getAttribute("media-type") == "application/xhtml+xml" ):
href=relpath+item.getAttribute("href")
print("---- item href:%s path part: %s"%(href,get_path_part(href)))
#print("---- item href:%s path part: %s"%(href,get_path_part(href)))
if re.match(r'.*/(file|chapter)\d+\.x?html',href):
if getsoups:
soup = bs.BeautifulSoup(epub.read(href).decode("utf-8"))

View file

@ -315,7 +315,7 @@ class Story:
return "failedtoload"
# explicit cover, make the first image.
if cover:
if cover and not configurable.getConfig('never_make_cover'):
if len(self.imgtuples) > 0 and 'cover' in self.imgtuples[0]['newsrc']:
# remove existing cover, if there is one.
del self.imgurls[0]
@ -327,7 +327,9 @@ class Story:
else:
self.imgurls.append(imgurl)
# First image, copy not link because calibre will replace with its cover.
if (len(self.imgurls)==1 and configurable.getConfig('make_firstimage_cover')):
if len(self.imgurls)==1 and \
configurable.getConfig('make_firstimage_cover') and \
not configurable.getConfig('never_make_cover'):
newsrc = "images/cover.%s"%ext
self.cover=newsrc
self.imgtuples.append({'newsrc':newsrc,'mime':mime,'data':data})

View file

@ -122,6 +122,11 @@ extratags: FanFiction
# (.*)Great(.*)=>\1Moderate\2
# .*-Centered=>
## Some readers don't show horizontal rule (<hr />) tags correctly.
## This replaces them all with a centered '* * *'. (Note centering
## doesn't work on some devices either.)
#replace_hr: false
## Each output format has a section that overrides [defaults]
[html]
@ -216,6 +221,10 @@ output_css:
## in chapters.
#make_firstimage_cover: false
## If set, the epub will never have a cover, even if include_images is on
## and the site has specific cover images.
#never_make_cover: false
## If set, and there isn't already a cover image from the adapter or
## from make_firstimage_cover, this image will be made the cover.
## It can be either a 'file:' or 'http:' url.