diff --git a/fanficdownloader/adapters/adapter_adastrafanficcom.py b/fanficdownloader/adapters/adapter_adastrafanficcom.py
index 11f07739..161ff261 100644
--- a/fanficdownloader/adapters/adapter_adastrafanficcom.py
+++ b/fanficdownloader/adapters/adapter_adastrafanficcom.py
@@ -32,11 +32,11 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','aaff')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.story.addToList("category","Star Trek")
         self.is_adult=False
 
diff --git a/fanficdownloader/adapters/adapter_fictionalleyorg.py b/fanficdownloader/adapters/adapter_fictionalleyorg.py
index aa175312..14a03ec5 100644
--- a/fanficdownloader/adapters/adapter_fictionalleyorg.py
+++ b/fanficdownloader/adapters/adapter_fictionalleyorg.py
@@ -32,11 +32,11 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','fa')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.story.addToList("category","Harry Potter")
         self.is_adult=False
 
diff --git a/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py b/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py
index d1fafc18..126365b7 100644
--- a/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py
+++ b/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py
@@ -32,7 +32,11 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','hp')
-        self.decode = "Windows-1252" # Another site that lies to us.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.story.addToList("category","Harry Potter")
         self.is_adult=False
 
diff --git a/fanficdownloader/adapters/adapter_mediaminerorg.py b/fanficdownloader/adapters/adapter_mediaminerorg.py
index d35591ad..62aa50ce 100644
--- a/fanficdownloader/adapters/adapter_mediaminerorg.py
+++ b/fanficdownloader/adapters/adapter_mediaminerorg.py
@@ -32,11 +32,11 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','mm')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
 
         # get storyId from url--url validation guarantees query correct
         m = re.match(self.getSiteURLPattern(),url)
diff --git a/fanficdownloader/adapters/adapter_potionsandsnitchesnet.py b/fanficdownloader/adapters/adapter_potionsandsnitchesnet.py
index 7e2bfc9d..6cb4b415 100644
--- a/fanficdownloader/adapters/adapter_potionsandsnitchesnet.py
+++ b/fanficdownloader/adapters/adapter_potionsandsnitchesnet.py
@@ -32,11 +32,11 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','pns')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.story.addToList("category","Harry Potter")
 
         # get storyId from url--url validation guarantees query is only sid=1234
diff --git a/fanficdownloader/adapters/adapter_thewriterscoffeeshopcom.py b/fanficdownloader/adapters/adapter_thewriterscoffeeshopcom.py
index 1d67e8e1..1abe02c0 100644
--- a/fanficdownloader/adapters/adapter_thewriterscoffeeshopcom.py
+++ b/fanficdownloader/adapters/adapter_thewriterscoffeeshopcom.py
@@ -32,11 +32,11 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','twcs')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
         self.password = ""
         self.is_adult=False
diff --git a/fanficdownloader/adapters/adapter_twilightednet.py b/fanficdownloader/adapters/adapter_twilightednet.py
index c4ddb8e9..2a58adfc 100644
--- a/fanficdownloader/adapters/adapter_twilightednet.py
+++ b/fanficdownloader/adapters/adapter_twilightednet.py
@@ -32,11 +32,11 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','tw')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.story.addToList("category","Twilight")
         self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = "" diff --git a/fanficdownloader/adapters/adapter_twiwritenet.py b/fanficdownloader/adapters/adapter_twiwritenet.py index d1073610..5fe9c3da 100644 --- a/fanficdownloader/adapters/adapter_twiwritenet.py +++ b/fanficdownloader/adapters/adapter_twiwritenet.py @@ -32,11 +32,11 @@ class TwiwriteNetSiteAdapter(BaseSiteAdapter): def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) self.story.setMetadata('siteabbrev','twrt') - self.decode = "Windows-1252" # 1252 is a superset of - # iso-8859-1. Most sites that - # claim to be iso-8859-1 (and - # some that claim to be utf8) are - # really windows-1252. + self.decode = ["Windows-1252", + "utf8"] # 1252 is a superset of iso-8859-1. + # Most sites that claim to be + # iso-8859-1 (and some that claim to be + # utf8) are really windows-1252. self.story.addToList("category","Twilight") self.username = "NoneGiven" # if left empty, twiwrite.net doesn't return any message at all. self.password = "" diff --git a/fanficdownloader/adapters/adapter_whoficcom.py b/fanficdownloader/adapters/adapter_whoficcom.py index ba7d0a6a..517f40ea 100644 --- a/fanficdownloader/adapters/adapter_whoficcom.py +++ b/fanficdownloader/adapters/adapter_whoficcom.py @@ -30,12 +30,12 @@ class WhoficComSiteAdapter(BaseSiteAdapter): def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) self.story.setMetadata('siteabbrev','whof') - self.decode = "Windows-1252" # 1252 is a superset of - # iso-8859-1. Most sites that - # claim to be iso-8859-1 (and - # some that claim to be utf8) are - # really windows-1252. - + self.decode = ["Windows-1252", + "utf8"] # 1252 is a superset of iso-8859-1. + # Most sites that claim to be + # iso-8859-1 (and some that claim to be + # utf8) are really windows-1252. + @staticmethod def getSiteDomain(): return 'www.whofic.com' diff --git a/fanficdownloader/adapters/base_adapter.py b/fanficdownloader/adapters/base_adapter.py index 837e103e..0fdf1178 100644 --- a/fanficdownloader/adapters/base_adapter.py +++ b/fanficdownloader/adapters/base_adapter.py @@ -69,7 +69,13 @@ class BaseSiteAdapter(Configurable): self.story.setMetadata('site',self.getSiteDomain()) self.story.setMetadata('dateCreated',datetime.datetime.now()) self.chapterUrls = [] # tuples of (chapter title,chapter url) - self.decode = "utf8" + ## order of preference for decoding. + self.decode = ["utf8", + "Windows-1252"] # 1252 is a superset of + # iso-8859-1. Most sites that + # claim to be iso-8859-1 (and + # some that claim to be utf8) + # are really windows-1252. self._setURL(url) if not self.validateURL(): raise InvalidStoryURL(url, @@ -83,6 +89,16 @@ class BaseSiteAdapter(Configurable): self.path = self.parsedUrl.path self.story.setMetadata('storyUrl',self.url) + def _decode(self,data): + for code in self.decode: + try: + return data.decode(code) + except: + logging.debug("code failed:"+code) + pass + logging.info("Could not decode story, tried:%s Stripping non-ASCII."%self.decode) + return "".join([x for x in data if ord(x) < 128]) + # Assumes application/x-www-form-urlencoded. 
parameters, headers are dict()s def _postUrl(self, url, parameters={}, headers={}): if self.getConfig('slow_down_sleep_time'): @@ -97,7 +113,7 @@ class BaseSiteAdapter(Configurable): req = u2.Request(url, data=urllib.urlencode(parameters), headers=headers) - return self.opener.open(req).read().decode(self.decode) + return self._decode(self.opener.open(req).read()) # parameters is a dict() def _fetchUrl(self, url, parameters=None): @@ -109,10 +125,9 @@ class BaseSiteAdapter(Configurable): time.sleep(sleeptime) try: if parameters: - return self.opener.open(url,urllib.urlencode(parameters))\ - .read().decode(self.decode) + return self._decode(self.opener.open(url,urllib.urlencode(parameters)).read()) else: - return self.opener.open(url).read().decode(self.decode) + return self._decode(self.opener.open(url).read()) except Exception, e: excpt=e logging.warn("Caught an exception reading URL: %s Exception %s."%(unicode(url),unicode(e))) diff --git a/utils/remover.py b/utils/remover.py index 70af1b0f..ae33165f 100644 --- a/utils/remover.py +++ b/utils/remover.py @@ -80,7 +80,8 @@ class RemoveOrphanDataChunks(webapp.RequestHandler): deleted = 0 num = 0 - results = chunks.fetch(100) + step = 100 + results = chunks.fetch(step) for d in results: ## This is the only way to test for orphans I could find. try: @@ -90,7 +91,7 @@ class RemoveOrphanDataChunks(webapp.RequestHandler): d.delete() deleted += 1 num += 1 - if num == 0: + if num < step: memcache.delete('orphan_search_cursor') logging.warn('Orphan search reached end, starting over next time.') else:
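
Not part of the patch: a minimal standalone sketch (Python 2, to match the codebase) of the decode-with-fallback behaviour the new BaseSiteAdapter._decode() introduces -- try each codec in order of preference, then strip to 7-bit ASCII as a last resort. The decode_bytes name and its default codec tuple are illustrative assumptions, not code from the diff.

# Illustrative sketch only -- not part of the diff above.
import logging

def decode_bytes(data, codecs=("utf8", "Windows-1252")):
    """Return unicode for a raw byte string, trying codecs in order."""
    for code in codecs:
        try:
            return data.decode(code)
        except UnicodeDecodeError:
            logging.debug("codec failed: %s", code)
    # Last resort: drop anything outside 7-bit ASCII rather than fail.
    logging.info("Could not decode, tried: %s; stripping non-ASCII.", codecs)
    return u"".join([c for c in data if ord(c) < 128])

# '\x93Caf\xe9\x94' is not valid UTF-8, so the Windows-1252 fallback kicks in:
# decode_bytes('\x93Caf\xe9\x94') == u'\u201cCaf\xe9\u201d'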