Fixes for encoding/make unicode issues.

This commit is contained in:
Jim Miller 2018-08-01 12:55:45 -05:00
parent 61c3af67e1
commit d43b90642f
5 changed files with 5 additions and 10 deletions

View file

@ -30,7 +30,6 @@ from .. import exceptions as exceptions
# py2 vs py3 transition
from ..six import text_type as unicode
from ..six import ensure_text
from ..six.moves.urllib.error import HTTPError
from .base_adapter import BaseSiteAdapter, makeDate
@ -338,7 +337,7 @@ class AdultFanFictionOrgAdapter(BaseSiteAdapter):
##There is also a double <br/>, so we have to fix that, then remove the leading and trailing '-:-'.
##They are always in the same order.
## EDIT 09/26/2016: Had some trouble with unicode errors... so I had to put in the decode/encode parts to fix it
liMetadata = ensure_text(lc2).replace('\n','').replace('\r','').replace('\t',' ').replace(' ',' ').replace(' ',' ').replace(' ',' ')
liMetadata = unicode(lc2).replace('\n','').replace('\r','').replace('\t',' ').replace(' ',' ').replace(' ',' ').replace(' ',' ')
liMetadata = stripHTML(liMetadata.replace(r'<br/>','-:-').replace('<!-- <br /-->','-:-'))
liMetadata = liMetadata.strip('-:-').strip('-:-').encode('utf-8')
for i, value in enumerate(liMetadata.decode('utf-8').split('-:-')):

View file

@ -24,7 +24,6 @@ import re
import sys
# py2 vs py3 transition
from ..six import text_type as unicode
from ..six import ensure_text
from ..six.moves.urllib.error import HTTPError
from .base_adapter import BaseSiteAdapter, makeDate
@ -134,7 +133,7 @@ class FireFlyFansNetSiteAdapter(BaseSiteAdapter):
# which is usually FireFly on this site, but I'm going to get them
# anyway.
category = soup.find('span', {'id': 'MainContent_txtItemDetails'})
category = stripHTML(ensure_text(category).replace(b"\xc2\xa0", ' '))
category = stripHTML(unicode(category).replace(u"\xc2\xa0", ' '))
metad = category.split(' ')
for meta in metad:
if ":" in meta:

View file

@ -28,7 +28,6 @@ from .. import exceptions as exceptions
# py2 vs py3 transition
from ..six import text_type as unicode
from ..six import ensure_text
from ..six.moves.urllib.error import HTTPError
from .base_adapter import BaseSiteAdapter, makeDate
@ -287,7 +286,7 @@ class LOTRgficComAdapter(BaseSiteAdapter):
#<br/>
#</p>
## we'll have to remove the non-breaking spaces to get this to work.
metad = ensure_text(metad).replace(b"\xc2\xa0",'').replace('\n','')
metad = unicode(metad).replace(u"\xc2\xa0",'').replace('\n','')
for txt in metad.split('<br/>'):
if 'Challenges:' in txt:
txt = txt.replace('Challenges:','').strip()

View file

@ -33,7 +33,6 @@ from .. import exceptions as exceptions
# py2 vs py3 transition
from ..six import text_type as unicode
from ..six import ensure_text
from ..six.moves.urllib.error import HTTPError
from .base_adapter import BaseSiteAdapter, makeDate
@ -192,7 +191,7 @@ class WWWArea52HKHNetAdapter(BaseSiteAdapter):
## I've seen a non-breaking space in some of the storyblocks
## so we are going to remove them.
series = stripHTML(ensure_text(series.renderContents()).replace(b"\xc2\xa0",'')).strip()
series = stripHTML(unicode(series.renderContents()).replace(u"\xc2\xa0",'')).strip()
if len(series) > 0:
self.story.setMetadata('series',series)

View file

@ -41,7 +41,6 @@ from .. import exceptions as exceptions
# py2 vs py3 transition
from ..six import text_type as unicode
from ..six import ensure_text
from ..six.moves.urllib.error import HTTPError
from ..six.moves.urllib.parse import quote
@ -148,7 +147,7 @@ class WWWUtopiastoriesComAdapter(BaseSiteAdapter):
for detail in soup.findAll('li'):
det = ensure_text(detail).replace(b"\xc2\xa0",'')
det = unicode(detail).replace(u"\xc2\xa0",'')
heading = stripHTML(det).split(' - ')[0]
text = stripHTML(det).replace(heading+' - ','')
if 'Author' in heading: