Try both UTF-8 and Windows-1252 (order depends on adapter), then strip non-ASCII characters entirely if both fail.
This commit is contained in:
Jim Miller 2011-06-05 16:49:35 -05:00
parent 951bc3b030
commit 520a295f88
11 changed files with 69 additions and 49 deletions

View file

@ -32,11 +32,11 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','aaff')
self.decode = "Windows-1252" # 1252 is a superset of
# iso-8859-1. Most sites that
# claim to be iso-8859-1 (and
# some that claim to be utf8) are
# really windows-1252.
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.story.addToList("category","Star Trek")
self.is_adult=False

View file

@ -32,11 +32,11 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','fa')
self.decode = "Windows-1252" # 1252 is a superset of
# iso-8859-1. Most sites that
# claim to be iso-8859-1 (and
# some that claim to be utf8) are
# really windows-1252.
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.story.addToList("category","Harry Potter")
self.is_adult=False

View file

@ -32,7 +32,11 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','hp')
self.decode = "Windows-1252" # Another site that lies to us. <rolls eyes>
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.story.addToList("category","Harry Potter")
self.is_adult=False

View file

@ -32,11 +32,11 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','mm')
self.decode = "Windows-1252" # 1252 is a superset of
# iso-8859-1. Most sites that
# claim to be iso-8859-1 (and
# some that claim to be utf8) are
# really windows-1252.
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
# get storyId from url--url validation guarantees query correct
m = re.match(self.getSiteURLPattern(),url)

View file

@ -32,11 +32,11 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','pns')
self.decode = "Windows-1252" # 1252 is a superset of
# iso-8859-1. Most sites that
# claim to be iso-8859-1 (and
# some that claim to be utf8) are
# really windows-1252.
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.story.addToList("category","Harry Potter")
# get storyId from url--url validation guarantees query is only sid=1234

View file

@ -32,11 +32,11 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','twcs')
self.decode = "Windows-1252" # 1252 is a superset of
# iso-8859-1. Most sites that
# claim to be iso-8859-1 (and
# some that claim to be utf8) are
# really windows-1252.
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
self.is_adult=False

View file

@ -32,11 +32,11 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','tw')
self.decode = "Windows-1252" # 1252 is a superset of
# iso-8859-1. Most sites that
# claim to be iso-8859-1 (and
# some that claim to be utf8) are
# really windows-1252.
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.story.addToList("category","Twilight")
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""

View file

@ -32,11 +32,11 @@ class TwiwriteNetSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','twrt')
self.decode = "Windows-1252" # 1252 is a superset of
# iso-8859-1. Most sites that
# claim to be iso-8859-1 (and
# some that claim to be utf8) are
# really windows-1252.
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.story.addToList("category","Twilight")
self.username = "NoneGiven" # if left empty, twiwrite.net doesn't return any message at all.
self.password = ""

View file

@ -30,12 +30,12 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','whof')
self.decode = "Windows-1252" # 1252 is a superset of
# iso-8859-1. Most sites that
# claim to be iso-8859-1 (and
# some that claim to be utf8) are
# really windows-1252.
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
@staticmethod
def getSiteDomain():
return 'www.whofic.com'

View file

@ -69,7 +69,13 @@ class BaseSiteAdapter(Configurable):
self.story.setMetadata('site',self.getSiteDomain())
self.story.setMetadata('dateCreated',datetime.datetime.now())
self.chapterUrls = [] # tuples of (chapter title,chapter url)
self.decode = "utf8"
## order of preference for decoding.
self.decode = ["utf8",
"Windows-1252"] # 1252 is a superset of
# iso-8859-1. Most sites that
# claim to be iso-8859-1 (and
# some that claim to be utf8)
# are really windows-1252.
self._setURL(url)
if not self.validateURL():
raise InvalidStoryURL(url,
@ -83,6 +89,16 @@ class BaseSiteAdapter(Configurable):
self.path = self.parsedUrl.path
self.story.setMetadata('storyUrl',self.url)
def _decode(self,data):
    """Decode raw response bytes to text using the adapter's codec preferences.

    Each codec named in ``self.decode`` is tried in order and the first
    successful decoding is returned.  If none of them succeeds, every
    byte outside the ASCII range is dropped so that text is still
    returned instead of an exception being raised.
    """
    codecs_to_try = self.decode
    # Tolerate a single codec name (the form self.decode had before it
    # became a list).  Iterating a bare string would try each character
    # as a codec name and always end up in the ASCII fallback.
    if isinstance(codecs_to_try, (type(""), type(u""))):
        codecs_to_try = [codecs_to_try]
    for code in codecs_to_try:
        try:
            return data.decode(code)
        except (UnicodeDecodeError, LookupError):
            # Undecodable bytes or an unknown codec name: try the next one.
            logging.debug("code failed:%s", code)
    logging.info("Could not decode story, tried:%s Stripping non-ASCII.",
                 self.decode)
    # Decode with errors ignored rather than filtering characters by hand,
    # so the fallback returns the same text type as the successful paths.
    return data.decode("ascii", "ignore")
# Assumes application/x-www-form-urlencoded. parameters, headers are dict()s
def _postUrl(self, url, parameters={}, headers={}):
if self.getConfig('slow_down_sleep_time'):
@ -97,7 +113,7 @@ class BaseSiteAdapter(Configurable):
req = u2.Request(url,
data=urllib.urlencode(parameters),
headers=headers)
return self.opener.open(req).read().decode(self.decode)
return self._decode(self.opener.open(req).read())
# parameters is a dict()
def _fetchUrl(self, url, parameters=None):
@ -109,10 +125,9 @@ class BaseSiteAdapter(Configurable):
time.sleep(sleeptime)
try:
if parameters:
return self.opener.open(url,urllib.urlencode(parameters))\
.read().decode(self.decode)
return self._decode(self.opener.open(url,urllib.urlencode(parameters)).read())
else:
return self.opener.open(url).read().decode(self.decode)
return self._decode(self.opener.open(url).read())
except Exception, e:
excpt=e
logging.warn("Caught an exception reading URL: %s Exception %s."%(unicode(url),unicode(e)))

View file

@ -80,7 +80,8 @@ class RemoveOrphanDataChunks(webapp.RequestHandler):
deleted = 0
num = 0
results = chunks.fetch(100)
step = 100
results = chunks.fetch(step)
for d in results:
## This is the only way to test for orphans I could find.
try:
@ -90,7 +91,7 @@ class RemoveOrphanDataChunks(webapp.RequestHandler):
d.delete()
deleted += 1
num += 1
if num == 0:
if num < step:
memcache.delete('orphan_search_cursor')
logging.warn('Orphan search reached end, starting over next time.')
else: