Try both UTF-8 and Windows-1252 (order depends on adapter), then strip non-ASCII characters entirely if both fail.
2025-12-06 08:52:55 +01:00 · 2011-06-05 16:49:35 -05:00 · 2011-06-05 16:49:35 -05:00 · 520a295f88
commit 520a295f88
parent 951bc3b030
11 changed files with 69 additions and 49 deletions
--- a/fanficdownloader/adapters/adapter_adastrafanficcom.py
+++ b/fanficdownloader/adapters/adapter_adastrafanficcom.py
@ -32,11 +32,11 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','aaff')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1.  Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
        self.story.addToList("category","Star Trek")
        self.is_adult=False
        
--- a/fanficdownloader/adapters/adapter_fictionalleyorg.py
+++ b/fanficdownloader/adapters/adapter_fictionalleyorg.py
@ -32,11 +32,11 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','fa')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1.  Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
        self.story.addToList("category","Harry Potter")
        self.is_adult=False
        
--- a/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py
+++ b/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py
@ -32,7 +32,11 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','hp')
-        self.decode = "Windows-1252" # Another site that lies to us.  <rolls eyes>
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
        self.story.addToList("category","Harry Potter")
        self.is_adult=False
        
--- a/fanficdownloader/adapters/adapter_mediaminerorg.py
+++ b/fanficdownloader/adapters/adapter_mediaminerorg.py
@ -32,11 +32,11 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','mm')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1.  Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
        
        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(),url)
--- a/fanficdownloader/adapters/adapter_potionsandsnitchesnet.py
+++ b/fanficdownloader/adapters/adapter_potionsandsnitchesnet.py
@ -32,11 +32,11 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','pns')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1.  Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
        self.story.addToList("category","Harry Potter")
        
        # get storyId from url--url validation guarantees query is only sid=1234
--- a/fanficdownloader/adapters/adapter_thewriterscoffeeshopcom.py
+++ b/fanficdownloader/adapters/adapter_thewriterscoffeeshopcom.py
@ -32,11 +32,11 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','twcs')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1.  Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False
--- a/fanficdownloader/adapters/adapter_twilightednet.py
+++ b/fanficdownloader/adapters/adapter_twilightednet.py
@ -32,11 +32,11 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','tw')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1.  Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
        self.story.addToList("category","Twilight")
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
--- a/fanficdownloader/adapters/adapter_twiwritenet.py
+++ b/fanficdownloader/adapters/adapter_twiwritenet.py
@ -32,11 +32,11 @@ class TwiwriteNetSiteAdapter(BaseSiteAdapter):
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','twrt')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1.  Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
        self.story.addToList("category","Twilight")
        self.username = "NoneGiven" # if left empty, twiwrite.net doesn't return any message at all.
        self.password = ""
--- a/fanficdownloader/adapters/adapter_whoficcom.py
+++ b/fanficdownloader/adapters/adapter_whoficcom.py
@ -30,12 +30,12 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','whof')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1.  Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
-
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
+        
    @staticmethod
    def getSiteDomain():
        return 'www.whofic.com'
--- a/fanficdownloader/adapters/base_adapter.py
+++ b/fanficdownloader/adapters/base_adapter.py
@ -69,7 +69,13 @@ class BaseSiteAdapter(Configurable):
        self.story.setMetadata('site',self.getSiteDomain())
        self.story.setMetadata('dateCreated',datetime.datetime.now())
        self.chapterUrls = [] # tuples of (chapter title,chapter url)
-        self.decode = "utf8"
+        ## order of preference for decoding.
+        self.decode = ["utf8",
+                       "Windows-1252"] # 1252 is a superset of
+                                       # iso-8859-1.  Most sites that
+                                       # claim to be iso-8859-1 (and
+                                       # some that claim to be utf8)
+                                       # are really windows-1252.
        self._setURL(url)
        if not self.validateURL():
            raise InvalidStoryURL(url,
@ -83,6 +89,16 @@ class BaseSiteAdapter(Configurable):
        self.path = self.parsedUrl.path        
        self.story.setMetadata('storyUrl',self.url)

+    def _decode(self,data):
+        """Decode raw page data using this adapter's preferred encodings.
+
+        Each codec name in self.decode is tried in order and the first
+        successful decoding is returned.  If every codec fails, the
+        failure is logged and the data is returned with all characters
+        outside 7-bit ASCII removed.
+        """
+        for code in self.decode:
+            try:
+                return data.decode(code)
+            except (UnicodeError, LookupError):
+                # UnicodeError: data is not valid in this codec.
+                # LookupError: the codec name itself is unknown.
+                # Anything else (e.g. KeyboardInterrupt) must propagate
+                # instead of being silently swallowed.
+                logging.debug("code failed:"+code)
+        logging.info("Could not decode story, tried:%s Stripping non-ASCII."%self.decode)
+        # Last resort: keep only the characters in the 7-bit ASCII range.
+        return "".join([x for x in data if ord(x) < 128])
+
    # Assumes application/x-www-form-urlencoded.  parameters, headers are dict()s
    def _postUrl(self, url, parameters={}, headers={}):
        if self.getConfig('slow_down_sleep_time'):
@ -97,7 +113,7 @@ class BaseSiteAdapter(Configurable):
        req = u2.Request(url,
                         data=urllib.urlencode(parameters),
                         headers=headers)
-        return self.opener.open(req).read().decode(self.decode)
+        return self._decode(self.opener.open(req).read())

    # parameters is a dict()
    def _fetchUrl(self, url, parameters=None):
@ -109,10 +125,9 @@ class BaseSiteAdapter(Configurable):
            time.sleep(sleeptime)	
            try:
                if parameters:
-                    return self.opener.open(url,urllib.urlencode(parameters))\
-                        .read().decode(self.decode)
+                    return self._decode(self.opener.open(url,urllib.urlencode(parameters)).read())
                else:
-                    return self.opener.open(url).read().decode(self.decode)
+                    return self._decode(self.opener.open(url).read())
            except Exception, e:
                excpt=e
                logging.warn("Caught an exception reading URL: %s  Exception %s."%(unicode(url),unicode(e)))
--- a/utils/remover.py
+++ b/utils/remover.py
@ -80,7 +80,8 @@ class RemoveOrphanDataChunks(webapp.RequestHandler):

        deleted = 0
        num = 0
-        results = chunks.fetch(100)
+        step = 100
+        results = chunks.fetch(step)
        for d in results:
            ## This is the only way to test for orphans I could find.
            try:
@ -90,7 +91,7 @@ class RemoveOrphanDataChunks(webapp.RequestHandler):
                d.delete()
                deleted += 1
            num += 1
-        if num == 0:
+        if num < step:
            memcache.delete('orphan_search_cursor')
            logging.warn('Orphan search reached end, starting over next time.')
        else: