From bf651df38fabb8723bee3bdc10a39e2f7724161c Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Thu, 5 Dec 2013 13:15:46 -0600 Subject: [PATCH] Add prequels/sequels & fix groups site specific metadata for fimfiction.net. --- defaults.ini | 19 ++- .../adapters/adapter_fimfictionnet.py | 30 ++++- fanficdownloader/story.py | 112 ++++++++++-------- plugin-defaults.ini | 19 ++- 4 files changed, 127 insertions(+), 53 deletions(-) diff --git a/defaults.ini b/defaults.ini index da1fe27f..e31349c6 100644 --- a/defaults.ini +++ b/defaults.ini @@ -1260,13 +1260,30 @@ extracategories:My Little Pony: Friendship is Magic ## Extra metadata that this adapter knows about. See [dramione.org] ## for examples of how to use them. -extra_valid_entries:likes,dislikes,views,total_views,short_description,groups +extra_valid_entries:likes,dislikes,views,total_views,short_description,groups,groupsUrl,groupsHTML,prequel,prequelUrl,prequelHTML,sequels,sequelsUrl,sequelsHTML likes_label:Likes dislikes_label:Dislikes views_label:Highest Single Chapter Views total_views_label:Total Views short_description_label:Short Summary groups_label:Groups +groupsUrl_label:Groups URLs +groupsHTML_label:Groups +prequel_label:Prequel +prequelUrl_label:Prequel URL +prequelHTML_label:Prequel +sequels_label:Sequels +sequelsUrl_label:Sequel URLs +sequelsHTML_label:Sequels + +keep_in_order_sequels:true +keep_in_order_sequelsUrl:true +keep_in_order_groups:true +keep_in_order_groupsUrl:true + +## Assume entryUrl, apply to "<a class='%slink' href='%s'>%s</a>" to +## make entryHTML. +make_linkhtml_entries:prequel,sequels,groups ## Some sites do not require a login, but do require the user to ## confirm they are adult for adult content. 
In commandline version, diff --git a/fanficdownloader/adapters/adapter_fimfictionnet.py b/fanficdownloader/adapters/adapter_fimfictionnet.py index c24e9b2a..025aeeec 100644 --- a/fanficdownloader/adapters/adapter_fimfictionnet.py +++ b/fanficdownloader/adapters/adapter_fimfictionnet.py @@ -249,11 +249,33 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter): value = unicode(value) self.story.setMetadata(metakey, value) - rawGroupList = soup.find('ul', {'id':'story_group_list'}) - if rawGroupList is not None: - for groupName in rawGroupList.findAll('a', {'href':re.compile('^/group/')}): + #Sequel links and group links are each bundled into story_group_list containers. + #Rather than mess around examining the header text, which is outside the containers, + #one can tell the two link types apart by examining them directly. + allGroupLists = soup.findAll('ul', {'id':'story_group_list'}) + for groupList in allGroupLists: + for groupName in groupList.findAll('a', {'href':re.compile('^/group/')}): + self.story.addToList("groupsUrl", 'http://'+self.host+groupName["href"]) self.story.addToList("groups",stripHTML(groupName).replace(',', ';')) - + for sequel in groupList.findAll('a', {'class':'story_link'}): + self.story.addToList("sequelsUrl", 'http://'+self.host+sequel["href"]) + self.story.addToList("sequels", stripHTML(sequel).replace(',', ';')) + + #The link to the prequel is embedded in the description text, so erring + #on the side of caution and wrapping this whole thing in a try block. + #If anything goes wrong this probably wasn't a valid prequel link. 
+ try: + description = soup.find('div', {'class':'description'}) + firstHR = description.find("hr") + nextSib = firstHR.nextSibling + if "This story is a sequel to" in nextSib.string: + link = nextSib.nextSibling + if link.name == "a": + self.story.setMetadata("prequelUrl", 'http://'+self.host+link["href"]) + self.story.setMetadata("prequel", stripHTML(link)) + except: + pass + def hookForUpdates(self,chaptercount): if self.oldchapters and len(self.oldchapters) > self.newestChapterNum: print("Existing epub has %s chapters\nNewest chapter is %s. Discarding old chapters from there on."%(len(self.oldchapters), self.newestChapterNum+1)) diff --git a/fanficdownloader/story.py b/fanficdownloader/story.py index a63d1e4c..dfc0dc95 100644 --- a/fanficdownloader/story.py +++ b/fanficdownloader/story.py @@ -30,7 +30,7 @@ from configurable import Configurable # Create convert_image method depending on which graphics lib we can # load. Preferred: calibre, PIL, none - + imagetypes = { 'jpg':'image/jpeg', 'jpeg':'image/jpeg', @@ -48,14 +48,14 @@ try: export = False img = Image() img.load(data) - + owidth, oheight = img.size nwidth, nheight = sizes scaled, nwidth, nheight = fit_image(owidth, oheight, nwidth, nheight) if scaled: img.size = (nwidth, nheight) export = True - + if normalize_format_name(img.format) != imgtype: export = True @@ -65,7 +65,7 @@ try: canvas.compose(img) img = canvas export = True - + if grayscale and img.type != "GrayscaleType": img.type = "GrayscaleType" export = True @@ -75,7 +75,7 @@ try: else: logger.debug("image used unchanged") return (data,imgtype,imagetypes[imgtype]) - + except: # No calibre routines, try for PIL for CLI. 
@@ -87,14 +87,14 @@ except: removetrans,imgtype="jpg",background='#ffffff'): export = False img = Image.open(StringIO(data)) - + owidth, oheight = img.size nwidth, nheight = sizes scaled, nwidth, nheight = fit_image(owidth, oheight, nwidth, nheight) if scaled: img = img.resize((nwidth, nheight),Image.ANTIALIAS) export = True - + if normalize_format_name(img.format) != imgtype: if img.mode == "P": # convert pallete gifs to RGB so jpg save doesn't fail. @@ -119,7 +119,7 @@ except: else: logger.debug("image used unchanged") return (data,imgtype,imagetypes[imgtype]) - + except: # No calibre or PIL, simple pass through with mimetype. def convert_image(url,data,sizes,grayscale, @@ -129,16 +129,16 @@ except: ## also used for explicit no image processing. def no_convert_image(url,data): parsedUrl = up.urlparse(url) - + ext=parsedUrl.path[parsedUrl.path.rfind('.')+1:].lower() - + if ext not in imagetypes: logger.debug("no_convert_image url:%s - no known extension"%url) # doesn't have extension? use jpg. ext='jpg' - + return (data,ext,imagetypes[ext]) - + def normalize_format_name(fmt): if fmt: fmt = fmt.lower() @@ -222,7 +222,7 @@ langs = { } class Story(Configurable): - + def __init__(self, configuration): Configurable.__init__(self, configuration) try: @@ -234,7 +234,7 @@ class Story(Configurable): self.chapters = [] # chapters will be tuples of (title,html) self.imgurls = [] self.imgtuples = [] - + self.cover=None # *href* of new cover image--need to create html. self.oldcover=None # (oldcoverhtmlhref,oldcoverhtmltype,oldcoverhtmldata,oldcoverimghref,oldcoverimgtype,oldcoverimgdata) self.calibrebookmark=None # cheesy way to carry calibre bookmark file forward across update. 
@@ -251,7 +251,7 @@ class Story(Configurable): self.addToList(metadata,val) self.setReplace(self.getConfig('replace_metadata')) - + def setMetadata(self, key, value, condremoveentities=True): ## still keeps < < and & if condremoveentities: @@ -297,9 +297,9 @@ class Story(Configurable): # A way to explicitly include spaces in the # replacement string. The .ini parser eats any # trailing spaces. - replacement=replacement.replace('\s',' ') + replacement=replacement.replace('\s',' ') self.replacements.append([metakeys,regexp,replacement,condkey,condregexp]) - + def doReplacements(self,value,key): for (metakeys,regexp,replacement,condkey,condregexp) in self.replacements: if (metakeys == None or key in metakeys) \ @@ -309,11 +309,11 @@ class Story(Configurable): if condkey and condkey != key: # prevent infinite recursion. condval = self.getMetadata(condkey) doreplace = condval != None and condregexp.search(condval) - + if doreplace: value = regexp.sub(replacement,value) return value - + def getMetadataRaw(self,key): if self.isValidMetaEntry(key) and self.metadata.has_key(key): return self.metadata[key] @@ -326,7 +326,7 @@ class Story(Configurable): return value if self.isList(key): - join_string = self.getConfig("join_string_"+key,u", ").replace('\s',' ') + join_string = self.getConfig("join_string_"+key,u", ").replace('\s',' ') value = join_string.join(self.getList(key, removeallentities, doreplacements=True)) if doreplacements: value = self.doReplacements(value,key+"_LIST") @@ -351,7 +351,7 @@ class Story(Configurable): return value else: #if self.getConfig("default_value_"+key): return self.getConfig("default_value_"+key) - + def getAllMetadata(self, removeallentities=False, doreplacements=True, @@ -360,13 +360,13 @@ class Story(Configurable): All single value *and* list value metadata as strings (unless keeplists=True, then keep lists). 
''' allmetadata = {} - + # special handling for authors/authorUrls linkhtml="%s" if self.isList('author'): # more than one author, assume multiple authorUrl too. htmllist=[] for i, v in enumerate(self.getList('author')): - aurl = self.getList('authorUrl')[i] + aurl = self.getList('authorUrl')[i] auth = v # make sure doreplacements & removeallentities are honored. if doreplacements: @@ -375,9 +375,9 @@ class Story(Configurable): if removeallentities: aurl=removeAllEntities(aurl) auth=removeAllEntities(auth) - + htmllist.append(linkhtml%('author',aurl,auth)) - join_string = self.getConfig("join_string_authorHTML",u", ").replace('\s',' ') + join_string = self.getConfig("join_string_authorHTML",u", ").replace('\s',' ') self.setMetadata('authorHTML',join_string.join(htmllist)) else: self.setMetadata('authorHTML',linkhtml%('author',self.getMetadata('authorUrl', removeallentities, doreplacements), @@ -388,20 +388,38 @@ class Story(Configurable): self.getMetadata('series', removeallentities, doreplacements))) elif self.getMetadataRaw('series') != None: self.setMetadata('seriesHTML',self.getMetadataRaw('series')) - + + for k in self.getConfigList('make_linkhtml_entries'): + # Assuming list, because it has to be site specific and + # they are all lists. + htmllist=[] + for i, v in enumerate(self.getList(k)): + url = self.getList(k+'Url')[i] + # make sure doreplacements & removeallentities are honored. 
+ if doreplacements: + url=self.doReplacements(url,k+'Url') + v=self.doReplacements(v,k) + if removeallentities: + url=removeAllEntities(url) + v=removeAllEntities(v) + + htmllist.append(linkhtml%('author',url,v)) + join_string = self.getConfig("join_string_"+k+"HTML",u", ").replace('\s',' ') + self.setMetadata(k+'HTML',join_string.join(htmllist)) + for k in self.getValidMetaList(): if self.isList(k) and keeplists: allmetadata[k] = self.getList(k, removeallentities, doreplacements) else: allmetadata[k] = self.getMetadata(k, removeallentities, doreplacements) - + return allmetadata # just for less clutter in adapters. def extendList(self,listname,l): for v in l: self.addToList(listname,v.strip()) - + def addToList(self,listname,value): if value==None: return @@ -421,24 +439,24 @@ class Story(Configurable): return self.hasConfig("include_in_"+listname) or \ ( self.isValidMetaEntry(listname) and self.metadata.has_key(listname) \ and isinstance(self.metadata[listname],list) ) - + def getList(self,listname, removeallentities=False, doreplacements=True, includelist=[]): #print("getList(%s,%s)"%(listname,includelist)) retlist = [] - + if not self.isValidMetaEntry(listname): return retlist - + # includelist prevents infinite recursion of include_in_'s if self.hasConfig("include_in_"+listname) and listname not in includelist: for k in self.getConfigList("include_in_"+listname): retlist.extend(self.getList(k,removeallentities=False, doreplacements=doreplacements,includelist=includelist+[listname])) else: - + if not self.isList(listname): retlist = [self.getMetadata(listname,removeallentities=False, doreplacements=doreplacements)] @@ -458,7 +476,7 @@ class Story(Configurable): # ships=>[ ]*(/|&|&)[ ]*=>/ if listname == 'ships' and self.getConfig('sort_ships'): retlist = [ '/'.join(sorted(x.split('/'))) for x in retlist ] - + if retlist: if listname in ('author','authorUrl','authorId') or self.getConfig('keep_in_order_'+listname): # need to retain order for author & authorUrl 
so the @@ -473,9 +491,9 @@ class Story(Configurable): def getSubjectTags(self, removeallentities=False): # set to avoid duplicates subject tags. subjectset = set() - + tags_list = self.getConfigList("include_subject_tags") + self.getConfigList("extra_subject_tags") - + # metadata all go into dc:subject tags, but only if they are configured. for (name,value) in self.getAllMetadata(removeallentities=removeallentities,keeplists=True).iteritems(): if name in tags_list: @@ -491,7 +509,7 @@ class Story(Configurable): subjectset.remove('') return list(subjectset | set(self.getConfigList("extratags"))) - + def addChapter(self, url, title, html): if self.getConfig('strip_chapter_numbers') and \ self.getConfig('chapter_title_strip_pattern'): @@ -512,7 +530,7 @@ class Story(Configurable): html) ) else: retval = self.chapters - + return retval def formatFileName(self,template,allowunsafefilename=True): @@ -526,7 +544,7 @@ class Story(Configurable): pattern = re.compile(self.getConfig("output_filename_safepattern",r"[^a-zA-Z0-9_\. \[\]\(\)&'-]+")) for k in origvalues.keys(): values[k]=re.sub(pattern,'_', removeAllEntities(self.getMetadata(k))) - + return string.Template(template).substitute(values).encode('utf8') # pass fetch in from adapter in case we need the cookies collected @@ -537,7 +555,7 @@ class Story(Configurable): # isn't used anywhere. if cover and self.getConfig('never_make_cover'): return - + url = url.strip() # ran across an image with a space in the # src. Browser handled it, so we'd better, too. @@ -545,7 +563,7 @@ class Story(Configurable): # gets too big too fast and breaks things. if is_appengine: return - + if url.startswith("http") or url.startswith("file") or parenturl == None: imgurl = url else: @@ -598,7 +616,7 @@ class Story(Configurable): except Exception, e: logger.info("Failed to load or convert image, skipping:\n%s\nException: %s"%(imgurl,e)) return "failedtoload" - + # explicit cover, make the first image. 
if cover and not self.getConfig('never_make_cover'): if len(self.imgtuples) > 0 and 'cover' in self.imgtuples[0]['newsrc']: @@ -624,19 +642,19 @@ class Story(Configurable): self.cover=newsrc self.imgtuples.append({'newsrc':newsrc,'mime':mime,'data':data}) self.imgurls.append(imgurl) - + newsrc = "images/%s-%s.%s"%( prefix, self.imgurls.index(imgurl), ext) self.imgtuples.append({'newsrc':newsrc,'mime':mime,'data':data}) - + #logger.debug("\nimgurl:%s\nnewsrc:%s\nimage size:%d\n"%(imgurl,newsrc,len(data))) else: newsrc = self.imgtuples[self.imgurls.index(imgurl)]['newsrc'] - + #print("===============\n%s\nimg url:%s\n============"%(newsrc,self.imgurls[-1])) - + return newsrc def getImgUrls(self): @@ -645,9 +663,9 @@ class Story(Configurable): #parsedUrl = urlparse.urlparse(url) retlist.append(self.imgtuples[i]) return retlist - + def __str__(self): - return "Metadata: " +str(self.metadata) + return "Metadata: " +str(self.metadata) def commaGroups(s): groups = [] diff --git a/plugin-defaults.ini b/plugin-defaults.ini index 449d217a..40f02178 100644 --- a/plugin-defaults.ini +++ b/plugin-defaults.ini @@ -1242,13 +1242,30 @@ extracategories:My Little Pony: Friendship is Magic ## Extra metadata that this adapter knows about. See [dramione.org] ## for examples of how to use them. 
-extra_valid_entries:likes,dislikes,views,total_views,short_description,groups +extra_valid_entries:likes,dislikes,views,total_views,short_description,groups,groupsUrl,groupsHTML,prequel,prequelUrl,prequelHTML,sequels,sequelsUrl,sequelsHTML likes_label:Likes dislikes_label:Dislikes views_label:Highest Single Chapter Views total_views_label:Total Views short_description_label:Short Summary groups_label:Groups +groupsUrl_label:Groups URLs +groupsHTML_label:Groups +prequel_label:Prequel +prequelUrl_label:Prequel URL +prequelHTML_label:Prequel +sequels_label:Sequels +sequelsUrl_label:Sequel URLs +sequelsHTML_label:Sequels + +keep_in_order_sequels:true +keep_in_order_sequelsUrl:true +keep_in_order_groups:true +keep_in_order_groupsUrl:true + +## Assume entryUrl, apply to "<a class='%slink' href='%s'>%s</a>" to +## make entryHTML. +make_linkhtml_entries:prequel,sequels,groups ## Some sites do not require a login, but do require the user to ## confirm they are adult for adult content. In commandline version,