Try both utf8 and Windows-1252 (order depends on the adapter), then strip non-ASCII characters entirely if both fail.
parent 951bc3b030
commit 520a295f88
11 changed files with 69 additions and 49 deletions
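
In outline: each adapter now carries an ordered list of candidate encodings instead of a single codec name, and the base adapter tries each in turn, falling back to stripping non-ASCII bytes only when every codec fails. A minimal standalone sketch of that pattern (illustrative names, not the project's API; the real method is in the BaseSiteAdapter hunk below):

    # Minimal sketch of the fallback-decoding pattern, with assumed names.
    def decode_with_fallback(data, decode_order=("utf8", "Windows-1252")):
        for codec in decode_order:
            try:
                return data.decode(codec)
            except UnicodeDecodeError:
                continue            # try the next candidate encoding
        # Every codec failed: keep only the ASCII bytes so processing can continue.
        return data.decode("ascii", "ignore")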
@@ -32,11 +32,11 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','aaff')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.story.addToList("category","Star Trek")
         self.is_adult=False

@@ -32,11 +32,11 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','fa')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.story.addToList("category","Harry Potter")
         self.is_adult=False

@@ -32,7 +32,11 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','hp')
-        self.decode = "Windows-1252" # Another site that lies to us. <rolls eyes>
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.story.addToList("category","Harry Potter")
         self.is_adult=False

@@ -32,11 +32,11 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','mm')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.

         # get storyId from url--url validation guarantees query correct
         m = re.match(self.getSiteURLPattern(),url)

@@ -32,11 +32,11 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','pns')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.story.addToList("category","Harry Potter")

         # get storyId from url--url validation guarantees query is only sid=1234

@@ -32,11 +32,11 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','twcs')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
         self.password = ""
         self.is_adult=False

@@ -32,11 +32,11 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','tw')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.story.addToList("category","Twilight")
         self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
         self.password = ""

@@ -32,11 +32,11 @@ class TwiwriteNetSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','twrt')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
         self.story.addToList("category","Twilight")
         self.username = "NoneGiven" # if left empty, twiwrite.net doesn't return any message at all.
         self.password = ""

@@ -30,12 +30,12 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','whof')
-        self.decode = "Windows-1252" # 1252 is a superset of
-                                     # iso-8859-1. Most sites that
-                                     # claim to be iso-8859-1 (and
-                                     # some that claim to be utf8) are
-                                     # really windows-1252.
-
+        self.decode = ["Windows-1252",
+                       "utf8"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
+
     @staticmethod
     def getSiteDomain():
         return 'www.whofic.com'

@@ -69,7 +69,13 @@ class BaseSiteAdapter(Configurable):
         self.story.setMetadata('site',self.getSiteDomain())
         self.story.setMetadata('dateCreated',datetime.datetime.now())
         self.chapterUrls = [] # tuples of (chapter title,chapter url)
-        self.decode = "utf8"
+        ## order of preference for decoding.
+        self.decode = ["utf8",
+                       "Windows-1252"] # 1252 is a superset of
+                                       # iso-8859-1. Most sites that
+                                       # claim to be iso-8859-1 (and
+                                       # some that claim to be utf8)
+                                       # are really windows-1252.
         self._setURL(url)
         if not self.validateURL():
             raise InvalidStoryURL(url,

@@ -83,6 +89,16 @@ class BaseSiteAdapter(Configurable):
         self.path = self.parsedUrl.path
         self.story.setMetadata('storyUrl',self.url)

+    def _decode(self,data):
+        for code in self.decode:
+            try:
+                return data.decode(code)
+            except:
+                logging.debug("code failed:"+code)
+                pass
+        logging.info("Could not decode story, tried:%s Stripping non-ASCII."%self.decode)
+        return "".join([x for x in data if ord(x) < 128])
+
     # Assumes application/x-www-form-urlencoded. parameters, headers are dict()s
     def _postUrl(self, url, parameters={}, headers={}):
         if self.getConfig('slow_down_sleep_time'):

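For illustration only (standalone Python 3, sample bytes assumed), the default codec order behaves like this: genuine UTF-8 input decodes on the first attempt, while bytes such as 0x93/0x94 are invalid UTF-8 and fall through to Windows-1252:

    # Standalone illustration of the codec fallback order; not project code.
    samples = [
        b'caf\xc3\xa9',         # valid UTF-8
        b'\x93quoted\x94',      # Windows-1252 curly quotes, invalid as UTF-8
    ]
    for raw in samples:
        for codec in ("utf8", "Windows-1252"):
            try:
                print(codec, repr(raw.decode(codec)))
                break
            except UnicodeDecodeError:
                continue
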
@@ -97,7 +113,7 @@ class BaseSiteAdapter(Configurable):
         req = u2.Request(url,
                          data=urllib.urlencode(parameters),
                          headers=headers)
-        return self.opener.open(req).read().decode(self.decode)
+        return self._decode(self.opener.open(req).read())

     # parameters is a dict()
     def _fetchUrl(self, url, parameters=None):

@@ -109,10 +125,9 @@ class BaseSiteAdapter(Configurable):
             time.sleep(sleeptime)
         try:
             if parameters:
-                return self.opener.open(url,urllib.urlencode(parameters))\
-                    .read().decode(self.decode)
+                return self._decode(self.opener.open(url,urllib.urlencode(parameters)).read())
             else:
-                return self.opener.open(url).read().decode(self.decode)
+                return self._decode(self.opener.open(url).read())
         except Exception, e:
             excpt=e
             logging.warn("Caught an exception reading URL: %s Exception %s."%(unicode(url),unicode(e)))

@@ -80,7 +80,8 @@ class RemoveOrphanDataChunks(webapp.RequestHandler):

         deleted = 0
         num = 0
-        results = chunks.fetch(100)
+        step = 100
+        results = chunks.fetch(step)
         for d in results:
             ## This is the only way to test for orphans I could find.
             try:

@@ -90,7 +91,7 @@ class RemoveOrphanDataChunks(webapp.RequestHandler):
                 d.delete()
                 deleted += 1
             num += 1
-        if num == 0:
+        if num < step:
             memcache.delete('orphan_search_cursor')
             logging.warn('Orphan search reached end, starting over next time.')
         else:

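The orphan-sweep check now fires on any short batch rather than only on an empty one: fetching fewer than step rows means the scan reached the end of the data, so the saved cursor is cleared and the sweep restarts on the next run. A hedged standalone sketch of that pattern, with a plain list standing in for the datastore query:

    # Sketch of the short-batch end test; names and the list stand-in are assumptions.
    def sweep(rows, cursor=0, step=100):
        batch = rows[cursor:cursor + step]
        num = len(batch)
        for row in batch:
            pass                    # inspect/delete orphan chunks here
        if num < step:              # short batch: the scan reached the end
            return None             # clear the saved cursor; start over next time
        return cursor + num         # otherwise remember where to resume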