Update ga for image support changes. Change img file names. Possibility of

dups between updates, but no bad overwrites or over-length names.
This commit is contained in:
Jim Miller 2012-02-24 20:50:43 -06:00
parent 6a83131a99
commit 0bea4afd01
5 changed files with 37 additions and 14 deletions

View file

@ -214,7 +214,7 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
# grab the text for an individual chapter.
def getChapterText(self, url):
print('Getting chapter text from: %s' % url)
logging.debug('Getting chapter text from: %s' % url)
chapter=bs.BeautifulSoup('<div class="story"></div>')
data = self._fetchUrl(url)

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Factory hook: return the adapter class this module provides."""
    adapter_class = GayAuthorsAdapter
    return adapter_class
@ -162,7 +162,8 @@ class GayAuthorsAdapter(BaseSiteAdapter):
self.story.setMetadata('rating',rating.text)
summary = msoup.find('span', {'itemprop' : 'description'})
self.story.setMetadata('description',summary.text)
self.setDescription(self.url,summary.text)
#self.story.setMetadata('description',summary.text)
stats = msoup.find('dl',{'class':'info'})
@ -200,4 +201,4 @@ class GayAuthorsAdapter(BaseSiteAdapter):
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)

View file

@ -78,7 +78,6 @@ class TestSiteAdapter(BaseSiteAdapter):
Some more longer description. "I suck at summaries!" "Better than it sounds!" "My first fic"
''')
self.story.setMetadata('datePublished',makeDate("1975-03-15","%Y-%m-%d"))
self.story.setMetadata('dateCreated',datetime.datetime.now())
if self.story.getMetadata('storyId') == '669':
self.story.setMetadata('dateUpdated',datetime.datetime.now())
else:

View file

@ -259,8 +259,9 @@ class BaseSiteAdapter(Configurable):
acceptable_attributes = ['href','name']
#print("include_images:"+self.getConfig('include_images'))
if self.getConfig('include_images'):
acceptable_attributes.extend(('src','alt'))
acceptable_attributes.extend(('src','alt','origsrc'))
for img in soup.findAll('img'):
img['origsrc']=img['src']
img['src']=self.story.addImgUrl(self,url,img['src'],self._fetchUrlRaw)
for attr in soup._getAttrMap().keys():

View file

@ -17,7 +17,6 @@
import os, re
import urlparse
from base64 import b64encode
from htmlcleanup import conditionalRemoveEntities, removeAllEntities
@ -198,21 +197,44 @@ class Story:
# up with the same name both now, in different chapters, and
# later with new update chapters. Numbering them didn't do
# that.
newsrc = "images/%s.jpg"%(b64encode(imgurl))
# newsrc = "images/%s.jpg"%(b64encode(imgurl))
# step = 20
# if newsrc > step:
# i = step
# while i < len(newsrc):
# newsrc = newsrc[:i]+"/"+newsrc[i:]
# i += step
# But, b64 names can get too big for zip (on windows, at
# least) to handle too quickly.
# This version, prefixing the images with the creation
# timestamp, still allows for dup images to be detected and
# not dup'ed in a single download. And it prevents 0.jpg from
# earlier update being overwritten by the first image in newer
# chapter. It does not, however, prevent dup copies of the
# same image being d/l'ed and saved in different updates. A
# bit of corner case inefficiency I can live with rather than
# scanning all the pre-existing files on update. oldsrc is
# being saved on img tags just in case, however.
prefix=self.getMetadataRaw('dateCreated').strftime("%Y%m%d%H%M%S")
if imgurl not in self.imgurls:
self.imgurls.append(imgurl)
parsedUrl = urlparse.urlparse(imgurl)
# newsrc = "images/%s.jpg"%(
# self.imgurls.index(imgurl))
newsrc = "images/%s-%s.jpg"%(
prefix,
self.imgurls.index(imgurl))
sizes = [ int(x) for x in configurable.getConfigList('image_max_size') ]
data = convert_image(fetch(imgurl),
sizes,
configurable.getConfig('grayscale_images'))
#print("\nimgurl\nimage size:%d\n"%len(data))
print("\nimgurl:%s\nnewsrc:%s\nimage size:%d\n"%(imgurl,newsrc,len(data)))
self.imgurldata.append((newsrc,data))
# else:
# newsrc = "images/%s.jpg"%(
# self.imgurls.index(imgurl))
else:
newsrc = "images/%s-%s.jpg"%(
prefix,
self.imgurls.index(imgurl))
#print("===============\n%s\nimg url:%s\n============"%(newsrc,self.imgurls[-1]))