Update ga for image support changes. Change img file names. Possibility of

dups between updates, but no bad overwrites or over-length names.
This commit is contained in:
Jim Miller 2012-02-24 20:50:43 -06:00
parent 6a83131a99
commit 0bea4afd01
5 changed files with 37 additions and 14 deletions

View file

@ -214,7 +214,7 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
# grab the text for an individual chapter.
def getChapterText(self, url):
print('Getting chapter text from: %s' % url)
logging.debug('Getting chapter text from: %s' % url)
chapter=bs.BeautifulSoup('<div class="story"></div>')
data = self._fetchUrl(url)

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Factory hook: return the adapter class this module provides."""
    adapter_class = GayAuthorsAdapter
    return adapter_class
@ -162,7 +162,8 @@ class GayAuthorsAdapter(BaseSiteAdapter):
self.story.setMetadata('rating',rating.text)
summary = msoup.find('span', {'itemprop' : 'description'})
self.story.setMetadata('description',summary.text)
self.setDescription(self.url,summary.text)
#self.story.setMetadata('description',summary.text)
stats = msoup.find('dl',{'class':'info'})
@ -200,4 +201,4 @@ class GayAuthorsAdapter(BaseSiteAdapter):
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)

View file

@ -78,7 +78,6 @@ class TestSiteAdapter(BaseSiteAdapter):
Some more longer description. "I suck at summaries!" "Better than it sounds!" "My first fic"
''')
self.story.setMetadata('datePublished',makeDate("1975-03-15","%Y-%m-%d"))
self.story.setMetadata('dateCreated',datetime.datetime.now())
if self.story.getMetadata('storyId') == '669':
self.story.setMetadata('dateUpdated',datetime.datetime.now())
else:

View file

@ -259,8 +259,9 @@ class BaseSiteAdapter(Configurable):
acceptable_attributes = ['href','name']
#print("include_images:"+self.getConfig('include_images'))
if self.getConfig('include_images'):
acceptable_attributes.extend(('src','alt'))
acceptable_attributes.extend(('src','alt','origsrc'))
for img in soup.findAll('img'):
img['origsrc']=img['src']
img['src']=self.story.addImgUrl(self,url,img['src'],self._fetchUrlRaw)
for attr in soup._getAttrMap().keys():

View file

@ -17,7 +17,6 @@
import os, re
import urlparse
from base64 import b64encode
from htmlcleanup import conditionalRemoveEntities, removeAllEntities
@ -198,21 +197,44 @@ class Story:
# up with the same name both now, in different chapters, and
# later with new update chapters. Numbering them didn't do
# that.
newsrc = "images/%s.jpg"%(b64encode(imgurl))
# newsrc = "images/%s.jpg"%(b64encode(imgurl))
# step = 20
# if newsrc > step:
# i = step
# while i < len(newsrc):
# newsrc = newsrc[:i]+"/"+newsrc[i:]
# i += step
# But, b64 names can get too big for zip (on windows, at
# least) to handle too quickly.
# This version, prefixing the images with the creation
# timestamp, still allows for dup images to be detected and
# not dup'ed in a single download. And it prevents 0.jpg from
# earlier update being overwritten by the first image in newer
# chapter. It does not, however, prevent dup copies of the
# same image being d/l'ed and saved in different updates. A
# bit of corner case inefficiency I can live with rather than
# scanning all the pre-existing files on update. oldsrc is
# being saved on img tags just in case, however.
prefix=self.getMetadataRaw('dateCreated').strftime("%Y%m%d%H%M%S")
if imgurl not in self.imgurls:
self.imgurls.append(imgurl)
parsedUrl = urlparse.urlparse(imgurl)
# newsrc = "images/%s.jpg"%(
# self.imgurls.index(imgurl))
newsrc = "images/%s-%s.jpg"%(
prefix,
self.imgurls.index(imgurl))
sizes = [ int(x) for x in configurable.getConfigList('image_max_size') ]
data = convert_image(fetch(imgurl),
sizes,
configurable.getConfig('grayscale_images'))
#print("\nimgurl\nimage size:%d\n"%len(data))
print("\nimgurl:%s\nnewsrc:%s\nimage size:%d\n"%(imgurl,newsrc,len(data)))
self.imgurldata.append((newsrc,data))
# else:
# newsrc = "images/%s.jpg"%(
# self.imgurls.index(imgurl))
else:
newsrc = "images/%s-%s.jpg"%(
prefix,
self.imgurls.index(imgurl))
#print("===============\n%s\nimg url:%s\n============"%(newsrc,self.imgurls[-1]))