From 0bea4afd01e5d0630090382a41313c8462b644d5 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Fri, 24 Feb 2012 20:50:43 -0600 Subject: [PATCH] Update ga for image support changes. Change img file names. Possibility of dups between updates, but not bad overwrites or overlength names, --- .../adapters/adapter_archiveofourownorg.py | 2 +- .../adapters/adapter_gayauthorsorg.py | 7 ++-- fanficdownloader/adapters/adapter_test1.py | 1 - fanficdownloader/adapters/base_adapter.py | 3 +- fanficdownloader/story.py | 38 +++++++++++++++---- 5 files changed, 37 insertions(+), 14 deletions(-) diff --git a/fanficdownloader/adapters/adapter_archiveofourownorg.py b/fanficdownloader/adapters/adapter_archiveofourownorg.py index 93e45539..ede2a023 100644 --- a/fanficdownloader/adapters/adapter_archiveofourownorg.py +++ b/fanficdownloader/adapters/adapter_archiveofourownorg.py @@ -214,7 +214,7 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter): # grab the text for an individual chapter. def getChapterText(self, url): - print('Getting chapter text from: %s' % url) + logging.debug('Getting chapter text from: %s' % url) chapter=bs.BeautifulSoup('
') data = self._fetchUrl(url) diff --git a/fanficdownloader/adapters/adapter_gayauthorsorg.py b/fanficdownloader/adapters/adapter_gayauthorsorg.py index ab4984e4..66023de9 100644 --- a/fanficdownloader/adapters/adapter_gayauthorsorg.py +++ b/fanficdownloader/adapters/adapter_gayauthorsorg.py @@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate +from base_adapter import BaseSiteAdapter, makeDate def getClass(): return GayAuthorsAdapter @@ -162,7 +162,8 @@ class GayAuthorsAdapter(BaseSiteAdapter): self.story.setMetadata('rating',rating.text) summary = msoup.find('span', {'itemprop' : 'description'}) - self.story.setMetadata('description',summary.text) + self.setDescription(self.url,summary.text) + #self.story.setMetadata('description',summary.text) stats = msoup.find('dl',{'class':'info'}) @@ -200,4 +201,4 @@ class GayAuthorsAdapter(BaseSiteAdapter): if None == div: raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url) - return utf8FromSoup(div) + return self.utf8FromSoup(url,div) diff --git a/fanficdownloader/adapters/adapter_test1.py b/fanficdownloader/adapters/adapter_test1.py index c55b9ecc..eeeb585b 100644 --- a/fanficdownloader/adapters/adapter_test1.py +++ b/fanficdownloader/adapters/adapter_test1.py @@ -78,7 +78,6 @@ class TestSiteAdapter(BaseSiteAdapter): Some more longer description. "I suck at summaries!" "Better than it sounds!" 
"My first fic" ''') self.story.setMetadata('datePublished',makeDate("1975-03-15","%Y-%m-%d")) - self.story.setMetadata('dateCreated',datetime.datetime.now()) if self.story.getMetadata('storyId') == '669': self.story.setMetadata('dateUpdated',datetime.datetime.now()) else: diff --git a/fanficdownloader/adapters/base_adapter.py b/fanficdownloader/adapters/base_adapter.py index aeec423b..6f0e9d99 100644 --- a/fanficdownloader/adapters/base_adapter.py +++ b/fanficdownloader/adapters/base_adapter.py @@ -259,8 +259,9 @@ class BaseSiteAdapter(Configurable): acceptable_attributes = ['href','name'] #print("include_images:"+self.getConfig('include_images')) if self.getConfig('include_images'): - acceptable_attributes.extend(('src','alt')) + acceptable_attributes.extend(('src','alt','origsrc')) for img in soup.findAll('img'): + img['origsrc']=img['src'] img['src']=self.story.addImgUrl(self,url,img['src'],self._fetchUrlRaw) for attr in soup._getAttrMap().keys(): diff --git a/fanficdownloader/story.py b/fanficdownloader/story.py index a4095754..c618e7cc 100644 --- a/fanficdownloader/story.py +++ b/fanficdownloader/story.py @@ -17,7 +17,6 @@ import os, re import urlparse -from base64 import b64encode from htmlcleanup import conditionalRemoveEntities, removeAllEntities @@ -198,21 +197,44 @@ class Story: # up with the same name both now, in different chapters, and # later with new update chapters. Numbering them didn't do # that. - newsrc = "images/%s.jpg"%(b64encode(imgurl)) + # newsrc = "images/%s.jpg"%(b64encode(imgurl)) + # step = 20 + # if newsrc > step: + # i = step + # while i < len(newsrc): + # newsrc = newsrc[:i]+"/"+newsrc[i:] + # i += step + + # But, b64 names can get too big for zip (on windows, at + # least) to handle too quickly. + + # This version, prefixing the images with the creation + # timestamp, still allows for dup images to be detected and + # not dup'ed in a single download. 
And it prevents 0.jpg from + # earlier update being overwritten by the first image in newer + # chapter. It does not, however, prevent dup copies of the + # same image being d/l'ed and saved in different updates. A + # bit of corner case inefficiency I can live with rather than + # scanning all the pre-existing files on update. origsrc is + # being saved on img tags just in case, however. + prefix=self.getMetadataRaw('dateCreated').strftime("%Y%m%d%H%M%S") + if imgurl not in self.imgurls: self.imgurls.append(imgurl) parsedUrl = urlparse.urlparse(imgurl) - # newsrc = "images/%s.jpg"%( - # self.imgurls.index(imgurl)) + newsrc = "images/%s-%s.jpg"%( + prefix, + self.imgurls.index(imgurl)) sizes = [ int(x) for x in configurable.getConfigList('image_max_size') ] data = convert_image(fetch(imgurl), sizes, configurable.getConfig('grayscale_images')) - #print("\nimgurl\nimage size:%d\n"%len(data)) + print("\nimgurl:%s\nnewsrc:%s\nimage size:%d\n"%(imgurl,newsrc,len(data))) self.imgurldata.append((newsrc,data)) - # else: - # newsrc = "images/%s.jpg"%( - # self.imgurls.index(imgurl)) + else: + newsrc = "images/%s-%s.jpg"%( + prefix, + self.imgurls.index(imgurl)) #print("===============\n%s\nimg url:%s\n============"%(newsrc,self.imgurls[-1]))