Add include_images support for html format.

Pass class & id attributes on html tags.
In Previous (rev:696) -
Fix for epub cover when cover_exclusion_regexp skips 1st img.
Don't add chapter number when only one chapter.
This commit is contained in:
Jim Miller 2012-10-14 11:23:20 -05:00
parent 60b7eae72e
commit 4d96632b67
8 changed files with 122 additions and 35 deletions

View file

@ -123,7 +123,7 @@ def do_download_for_worker(book,options):
# images only for epub and html, even if the user mistakenly turned it
# on elsewhere.
if options['fileform'] != "epub":
if options['fileform'] not in ("epub","html"):
configuration.set("overrides","include_images","false")
adapter = adapters.getAdapter(configuration,book['url'])

View file

@ -192,6 +192,7 @@ keep_summary_html:true
strip_chapter_numbers:false
## add_chapter_numbers can be true, false or toconly
## (Note: the number is not added when there's only one chapter.)
add_chapter_numbers:false
## (Two versions of chapter_title_strip_pattern are shown below. You
@ -222,6 +223,22 @@ chapter_title_add_pattern:${index}. ${title}
## Each output format has a section that overrides [defaults]
[html]
## include images from img tags in the body and summary of
## stories. Images will be converted to jpg for size if possible.
## include_images is *only* available in epub and html output formats.
## include_images is *not* available in the web service in any format.
#include_images:false
## Note that it's *highly* recommended to use zipfile output or
## story-unique destination directories to avoid overwriting images.
#output_filename: books/${author}/${title}/${title}-${siteabbrev}_${authorId}_${storyId}${formatext}
#zip_output: false
## This switch prevents FFDL from doing any processing on the images.
## Usually they would be converted to jpg, resized and optionally made
## grayscale.
no_image_processing: true
## output background color--only used by html and epub (and ignored in
## epub by many readers). Included below in output_css--will be
## ignored if not in output_css.
@ -324,7 +341,7 @@ output_css:
## include images from img tags in the body and summary of
## stories. Images will be converted to jpg for size if possible.
## include_images is *only* available in epub output format.
## include_images is *only* available in epub and html output formats.
## include_images is *not* available in the web service in any format.
#include_images:false

View file

@ -138,7 +138,7 @@ def main():
# images only for epub and html, even if the user mistakenly turned it
# on elsewhere.
if options.format != "epub":
if options.format not in ("epub","html"):
configuration.set("overrides","include_images","false")
if options.options:

View file

@ -304,7 +304,7 @@ class BaseSiteAdapter(Configurable):
if not fetch:
fetch=self._fetchUrlRaw
acceptable_attributes = ['href','name']
acceptable_attributes = ['href','name','class','id']
#print("include_images:"+self.getConfig('include_images'))
if self.getConfig('include_images'):
acceptable_attributes.extend(('src','alt','longdesc'))

View file

@ -20,6 +20,8 @@ import urlparse
import string
from math import floor
from functools import partial
import logging
import urlparse as up
import exceptions
from htmlcleanup import conditionalRemoveEntities, removeAllEntities
@ -52,7 +54,7 @@ try:
if export:
return (img.export('JPG'),'jpg','image/jpeg')
else:
print("image used unchanged")
logging.debug("image used unchanged")
return (data,'jpg','image/jpeg')
except:
@ -88,23 +90,34 @@ except:
img.save(outsio,'JPEG')
return (outsio.getvalue(),'jpg','image/jpeg')
else:
print("image used unchanged")
logging.debug("image used unchanged")
return (data,'jpg','image/jpeg')
except:
# No calibre or PIL, simple pass through with mimetype.
imagetypes = {
'jpg':'image/jpeg',
'jpeg':'image/jpeg',
'png':'image/png',
'gif':'image/gif',
'svg':'image/svg+xml',
}
def convert_image(url,data,sizes,grayscale):
ext=url[url.rfind('.')+1:].lower()
return (data,ext,imagetypes[ext])
return no_convert_image(url,data)
# Known image file extensions and the mimetype reported for each.
imagetypes = {
    'jpg':'image/jpeg',
    'jpeg':'image/jpeg',
    'png':'image/png',
    'gif':'image/gif',
    'svg':'image/svg+xml',
    }

## Pass-through that leaves the image bytes alone; also used for
## explicit no image processing.
def no_convert_image(url,data):
    """Return (data, ext, mimetype) for an image without converting it.

    url  -- address the image came from; only its path component is
            examined, so a query string cannot be mistaken for an
            extension.
    data -- image contents, returned unchanged.

    The extension is whatever follows the last '.' in the URL path,
    lower-cased.  When that is not a key of imagetypes, 'jpg' is
    assumed and a debug message is logged.
    """
    urlpath = up.urlparse(url).path
    # rpartition yields the whole path when there is no '.', which then
    # simply fails the lookup below.
    suffix = urlpath.rpartition('.')[2].lower()
    if suffix in imagetypes:
        return (data,suffix,imagetypes[suffix])

    logging.debug("no_convert_image url:%s - no known extension"%url)
    # doesn't have extension? use jpg.
    return (data,'jpg',imagetypes['jpg'])
def normalize_format_name(fmt):
if fmt:
@ -483,17 +496,22 @@ class Story(Configurable):
prefix='ffdl'
if imgurl not in self.imgurls:
parsedUrl = urlparse.urlparse(imgurl)
try:
sizes = [ int(x) for x in self.getConfigList('image_max_size') ]
if self.getConfig('no_image_processing'):
(data,ext,mime) = no_convert_image(imgurl,
fetch(imgurl))
else:
try:
sizes = [ int(x) for x in self.getConfigList('image_max_size') ]
except Exception, e:
raise exceptions.FailedToDownload("Failed to parse image_max_size from personal.ini:%s\nException: %s"%(self.getConfigList('image_max_size'),e))
(data,ext,mime) = convert_image(imgurl,
fetch(imgurl),
sizes,
self.getConfig('grayscale_images'))
except Exception, e:
raise exceptions.FailedToDownload("Failed to parse image_max_size from personal.ini:%s\nException: %s"%(self.getConfigList('image_max_size'),e))
try:
(data,ext,mime) = convert_image(imgurl,
fetch(imgurl),
sizes,
self.getConfig('grayscale_images'))
except Exception, e:
print("Failed to load or convert image, skipping:\n%s\nException: %s"%(imgurl,e))
logging.info("Failed to load or convert image, skipping:\n%s\nException: %s"%(imgurl,e))
return "failedtoload"
# explicit cover, make the first image.
@ -528,7 +546,7 @@ class Story(Configurable):
ext)
self.imgtuples.append({'newsrc':newsrc,'mime':mime,'data':data})
print("\nimgurl:%s\nnewsrc:%s\nimage size:%d\n"%(imgurl,newsrc,len(data)))
logging.debug("\nimgurl:%s\nnewsrc:%s\nimage size:%d\n"%(imgurl,newsrc,len(data)))
else:
newsrc = self.imgtuples[self.imgurls.index(imgurl)]['newsrc']

View file

@ -190,6 +190,8 @@ class BaseStoryWriter(Configurable):
if outfilename == None:
outfilename=self.getOutputFileName()
self.outfilename = outfilename
# minor cheat, tucking css into metadata.
if self.getConfig("output_css"):
self.story.setMetadata("output_css",
@ -203,8 +205,8 @@ class BaseStoryWriter(Configurable):
logging.info("Save directly to file: %s" % outfilename)
if self.getConfig('make_directories'):
path=""
dirs = os.path.dirname(outfilename).split('/')
for dir in dirs:
outputdirs = os.path.dirname(outfilename).split('/')
for dir in outputdirs:
path+=dir+"/"
if not os.path.exists(path):
os.mkdir(path) ## os.makedirs() doesn't work in 2.5.2?
@ -238,14 +240,14 @@ class BaseStoryWriter(Configurable):
# fetch once.
if self.getConfig('zip_output'):
out = StringIO.StringIO()
self.zipout = ZipFile(outstream, 'w', compression=ZIP_DEFLATED)
self.writeStoryImpl(out)
zipout = ZipFile(outstream, 'w', compression=ZIP_DEFLATED)
zipout.writestr(self.getBaseFileName(),out.getvalue())
self.zipout.writestr(self.getBaseFileName(),out.getvalue())
# declares all the files created by Windows. otherwise, when
# it runs in appengine, windows unzips the files as 000 perms.
for zf in zipout.filelist:
for zf in self.zipout.filelist:
zf.create_system = 0
zipout.close()
self.zipout.close()
out.close()
else:
self.writeStoryImpl(outstream)
@ -253,6 +255,27 @@ class BaseStoryWriter(Configurable):
if close:
outstream.close()
def writeFile(self, filename, data):
    """Write an auxiliary file (such as an image) next to the story output.

    filename -- path of the file relative to where the story is written.
    data     -- contents to store.

    When the zip_output setting is on, the data is added to self.zipout
    under the directory part of getBaseFileName().  Otherwise it is
    written to disk under the directory part of self.outfilename,
    creating the immediate parent directory if it does not exist yet.
    """
    logging.debug("writeFile:%s"%filename)

    if self.getConfig('zip_output'):
        outputdirs = os.path.dirname(self.getBaseFileName())
        if outputdirs:
            filename=outputdirs+'/'+filename
        self.zipout.writestr(filename,data)
        return

    outputdirs = os.path.dirname(self.outfilename)
    if outputdirs:
        filename=outputdirs+'/'+filename

    parentdir = os.path.dirname(filename)
    # An empty parentdir means the current directory: nothing to create,
    # and os.mkdir('') would raise.
    if parentdir and not os.path.exists(parentdir):
        os.mkdir(parentdir) ## os.makedirs() doesn't work in 2.5.2?

    outstream = open(filename,"wb")
    try:
        outstream.write(data)
    finally:
        # release the handle even when the write fails.
        outstream.close()
def writeStoryImpl(self, out):
    """Hook that writes the story to `out`; subclasses must override it.

    This base implementation does nothing.
    """
    return None

View file

@ -46,6 +46,10 @@ ${output_css}
<h1><a href="${storyUrl}">${title}</a> by ${authorHTML}</h1>
''')
self.HTML_COVER = string.Template('''
<img src="${coverimg}" alt="cover" />
''')
self.HTML_TITLE_PAGE_START = string.Template('''
<table class="full">
''')
@ -84,11 +88,16 @@ ${output_css}
def writeStoryImpl(self, out):
if self.hasConfig("cover_content"):
COVER = string.Template(self.getConfig("cover_content"))
else:
COVER = self.HTML_COVER
if self.hasConfig('file_start'):
FILE_START = string.Template(self.getConfig("file_start"))
else:
FILE_START = self.HTML_FILE_START
if self.hasConfig('file_end'):
FILE_END = string.Template(self.getConfig("file_end"))
else:
@ -96,6 +105,9 @@ ${output_css}
self._write(out,FILE_START.substitute(self.story.getAllMetadata()))
if self.getConfig('include_images') and self.story.cover:
self._write(out,COVER.substitute(dict(self.story.getAllMetadata().items()+{'coverimg':self.story.cover}.items())))
self.writeTitlePage(out,
self.HTML_TITLE_PAGE_START,
self.HTML_TITLE_ENTRY,
@ -125,3 +137,8 @@ ${output_css}
self._write(out,CHAPTER_END.substitute(vals))
self._write(out,FILE_END.substitute(self.story.getAllMetadata()))
if self.getConfig('include_images'):
for imgmap in self.story.getImgUrls():
self.writeFile(imgmap['newsrc'],imgmap['data'])

View file

@ -176,6 +176,7 @@ keep_summary_html:true
strip_chapter_numbers:false
## add_chapter_numbers can be true, false or toconly
## (Note: the number is not added when there's only one chapter.)
add_chapter_numbers:false
## (Two versions of chapter_title_strip_pattern are shown below. You
@ -206,6 +207,17 @@ chapter_title_add_pattern:${index}. ${title}
## Each output format has a section that overrides [defaults]
[html]
## include images from img tags in the body and summary of
## stories. Images will be converted to jpg for size if possible.
## include_images is *only* available in epub and html output formats.
## include_images is *not* available in the web service in any format.
#include_images:false
## This switch prevents FFDL from doing any processing on the images.
## Usually they would be converted to jpg, resized and optionally made
## grayscale.
no_image_processing: true
## output background color--only used by html and epub (and ignored in
## epub by many readers). Included below in output_css--will be
## ignored if not in output_css.
@ -305,7 +317,7 @@ output_css:
## include images from img tags in the body and summary of
## stories. Images will be converted to jpg for size if possible.
## include_images is *only* available in epub output format.
## include_images is *only* available in epub and html output formats.
#include_images:false
## If set, the first image found will be made the cover image. If