diff --git a/.gitignore b/.gitignore index 16929968..93f22dcf 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ # usually perl -pi.back -e edits. *.back +*.bak cleanup.sh FanFictionDownLoader.zip diff --git a/calibre-plugin/__init__.py b/calibre-plugin/__init__.py index 5e0a6753..941e45ad 100644 --- a/calibre-plugin/__init__.py +++ b/calibre-plugin/__init__.py @@ -33,7 +33,7 @@ except NameError: from calibre.customize import InterfaceActionBase # pulled out from FanFicFareBase for saving in prefs.py -__version__ = (2, 28, 0) +__version__ = (2, 37, 3) ## Apparently the name for this class doesn't matter--it was still ## 'demo' for the first few versions. diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index c4e9180d..cf000204 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -1,4 +1,4 @@ -# Copyright 2015 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2015 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -1916,31 +1916,6 @@ comments_label:Comments include_in_category:category,searchtags -[royalroadl.com] -extra_valid_entries:stars - -#add_to_extra_titlepage_entries:,stars - -## some sites include images that we don't ever want becoming the -## cover image. This lets you exclude them. -cover_exclusion_regexp:(imgur.com/dzOACJf.png|/forum/images/smilies/) - -## Clear FanFiction from defaults, site has fanfictions and original fiction. -extratags: - -## royalroadl.com stories sometimes have 'spoiler' blocks in -## posts. When viewed in a browser, the block is hidden until a button -## is clicked. eBook viewers can't handle that and the javascript is -## disabled. The remove_spoilers option, if uncommented, will remove -## spoiler blocks entirely. 
-#remove_spoilers:true - -## This option if uncommented, will put a box around the spoiler -## blocks with the original spoiler button text as a label using -## fieldset and legend HTML tags. For a simple box, see the -## add_to_output_css example for [base_xenforoforum:epub]. -#legend_spoilers:true - [samandjack.net] ## Some sites require login (or login for some rated stories) The ## program can prompt you, or you can save it in config. In @@ -2841,6 +2816,31 @@ extracategories:Queer as Folk website_encodings:Windows-1252,utf8 +[www.royalroad.com] +extra_valid_entries:stars + +#add_to_extra_titlepage_entries:,stars + +## some sites include images that we don't ever want becoming the +## cover image. This lets you exclude them. +cover_exclusion_regexp:(imgur.com/dzOACJf.png|/forum/images/smilies/) + +## Clear FanFiction from defaults, site has fanfictions and original fiction. +extratags: + +## royalroad.com stories sometimes have 'spoiler' blocks in +## posts. When viewed in a browser, the block is hidden until a button +## is clicked. eBook viewers can't handle that and the javascript is +## disabled. The remove_spoilers option, if uncommented, will remove +## spoiler blocks entirely. +#remove_spoilers:true + +## This option if uncommented, will put a box around the spoiler +## blocks with the original spoiler button text as a label using +## fieldset and legend HTML tags. For a simple box, see the +## add_to_output_css example for [base_xenforoforum:epub]. +#legend_spoilers:true + [www.scarvesandcoffee.net] ## Some sites do not require a login, but do require the user to ## confirm they are adult for adult content. 
In commandline version, diff --git a/fanficfare/HtmlTagStack.py b/fanficfare/HtmlTagStack.py index 3a9e703a..64a7953b 100644 --- a/fanficfare/HtmlTagStack.py +++ b/fanficfare/HtmlTagStack.py @@ -1,4 +1,21 @@ -# coding: utf-8 +# -*- coding: utf-8 -*- + +# Copyright 2018 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import absolute_import import re import codecs @@ -54,4 +71,4 @@ def flush(): del stack[:] def get_stack(): - return stack \ No newline at end of file + return stack diff --git a/fanficfare/__init__.py b/fanficfare/__init__.py index 9784e911..c6d2afca 100644 --- a/fanficfare/__init__.py +++ b/fanficfare/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015 Fanficdownloader team, 2016 FanFicFare team +# Copyright 2015 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +from __future__ import absolute_import try: # just a way to switch between web service and CLI/PI diff --git a/fanficfare/adapters/__init__.py b/fanficfare/adapters/__init__.py index 78e47c09..f4b36bbb 100644 --- a/fanficfare/adapters/__init__.py +++ b/fanficfare/adapters/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,168 +15,173 @@ # limitations under the License. # +from __future__ import absolute_import import os, re, sys, glob, types from os.path import dirname, basename, normpath import logging -import urlparse as up + +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.parse import urlparse logger = logging.getLogger(__name__) from .. import exceptions as exceptions -from ..configurable import Configuration +from .. import configurable as configurable ## must import each adapter here. 
-import adapter_test1 -import adapter_fanfictionnet -import adapter_fanficcastletvnet -import adapter_fictionalleyorg -import adapter_fictionpresscom -import adapter_ficwadcom -import adapter_fimfictionnet -import adapter_mediaminerorg -import adapter_potionsandsnitches -import adapter_tenhawkpresentscom -import adapter_adastrafanficcom -import adapter_tthfanficorg -import adapter_twilightednet -import adapter_whoficcom -import adapter_siyecouk -import adapter_archiveofourownorg -import adapter_ficbooknet -import adapter_nfacommunitycom -import adapter_midnightwhispers -import adapter_ksarchivecom -import adapter_archiveskyehawkecom -import adapter_squidgeorgpeja -import adapter_libraryofmoriacom -import adapter_wraithbaitcom -import adapter_dramioneorg -import adapter_ashwindersycophanthexcom -import adapter_chaossycophanthexcom -import adapter_erosnsapphosycophanthexcom -import adapter_lumossycophanthexcom -import adapter_occlumencysycophanthexcom -import adapter_phoenixsongnet -import adapter_walkingtheplankorg -import adapter_dokugacom -import adapter_iketernalnet -import adapter_storiesofardacom -import adapter_destinysgatewaycom -import adapter_ncisfictioncom -import adapter_fanfiktionde -import adapter_ponyfictionarchivenet -import adapter_ncisficcom -import adapter_nationallibrarynet -import adapter_themasquenet -import adapter_pretendercentrecom -import adapter_darksolaceorg -import adapter_finestoriescom -import adapter_hpfanficarchivecom -import adapter_twilightarchivescom -import adapter_nhamagicalworldsus -import adapter_hlfictionnet -import adapter_dracoandginnycom -import adapter_scarvesandcoffeenet -import adapter_thepetulantpoetesscom -import adapter_wolverineandroguecom -import adapter_merlinficdtwinscouk -import adapter_thehookupzonenet -import adapter_bloodtiesfancom -import adapter_qafficcom -import adapter_efpfanficnet -import adapter_potterficscom -import adapter_efictionestelielde -import adapter_imagineeficcom -import 
adapter_asr3slashzoneorg -import adapter_potterheadsanonymouscom -import adapter_fictionpadcom -import adapter_storiesonlinenet -import adapter_trekiverseorg -import adapter_literotica -import adapter_voracity2eficcom -import adapter_spikeluvercom -import adapter_bloodshedversecom -import adapter_nocturnallightnet -import adapter_fanfichu -import adapter_fictionmaniatv -import adapter_tolkienfanfiction -import adapter_themaplebookshelf -import adapter_fannation -import adapter_sheppardweircom -import adapter_samandjacknet -import adapter_csiforensicscom -import adapter_lotrfanfictioncom -import adapter_fhsarchivecom -import adapter_fanfictionjunkiesde -import adapter_tgstorytimecom -import adapter_itcouldhappennet -import adapter_forumsspacebattlescom -import adapter_forumssufficientvelocitycom -import adapter_forumquestionablequestingcom -import adapter_ninelivesarchivecom -import adapter_masseffect2in -import adapter_quotevcom -import adapter_mcstoriescom -import adapter_buffygilescom -import adapter_andromedawebcom -import adapter_artemisfowlcom -import adapter_naiceanilmenet -import adapter_deepinmysoulnet -import adapter_kiarepositorymujajinet -import adapter_adultfanfictionorg -import adapter_fictionhuntcom -import adapter_royalroadl -import adapter_chosentwofanficcom -import adapter_bdsmlibrarycom -import adapter_asexstoriescom -import adapter_gluttonyfictioncom -import adapter_valentchambercom -import adapter_looselugscom -import adapter_wwwgiantessworldnet -import adapter_lotrgficcom -import adapter_tomparisdormcom -import adapter_writingwhimsicalwanderingsnet -import adapter_sugarquillnet -import adapter_wwwarea52hkhnet -import adapter_starslibrarynet -import adapter_fanficauthorsnet -import adapter_fireflyfansnet -import adapter_fireflypopulliorg -import adapter_sebklainenet -import adapter_shriftweborgbfa -import adapter_trekfanfictionnet -import adapter_wuxiaworldcom -import adapter_wwwlushstoriescom -import adapter_wwwutopiastoriescom -import 
adapter_sinfuldreamscomunicornfic -import adapter_sinfuldreamscomwhisperedmuse -import adapter_sinfuldreamscomwickedtemptation -import adapter_asianfanficscom -import adapter_webnovelcom -import adapter_deandamagecom -import adapter_imrightbehindyoucom -import adapter_mttjustoncenet -import adapter_narutoficorg -import adapter_starskyhutcharchivenet -import adapter_swordborderlineangelcom -import adapter_tasteofpoisoninkubationnet -import adapter_thebrokenworldorg -import adapter_thedelphicexpansecom -import adapter_thundercatsfansorg -import adapter_unknowableroomorg -import adapter_www13hoursorg -import adapter_wwwaneroticstorycom -import adapter_gravitytalescom -import adapter_lcfanficcom -import adapter_noveltrovecom -import adapter_inkbunnynet -import adapter_alternatehistorycom -import adapter_wattpadcom -import adapter_lightnovelgatecom -import adapter_wwwnovelallcom -import adapter_wuxiaworldco -import adapter_harrypotterfanfictioncom +from . import base_efiction_adapter +from . import adapter_test1 +from . import adapter_fanfictionnet +from . import adapter_fanficcastletvnet +from . import adapter_fictionalleyorg +from . import adapter_fictionpresscom +from . import adapter_ficwadcom +from . import adapter_fimfictionnet +from . import adapter_mediaminerorg +from . import adapter_potionsandsnitches +from . import adapter_tenhawkpresentscom +from . import adapter_adastrafanficcom +from . import adapter_tthfanficorg +from . import adapter_twilightednet +from . import adapter_whoficcom +from . import adapter_siyecouk +from . import adapter_archiveofourownorg +from . import adapter_ficbooknet +from . import adapter_nfacommunitycom +from . import adapter_midnightwhispers +from . import adapter_ksarchivecom +from . import adapter_archiveskyehawkecom +from . import adapter_squidgeorgpeja +from . import adapter_libraryofmoriacom +from . import adapter_wraithbaitcom +from . import adapter_dramioneorg +from . import adapter_ashwindersycophanthexcom +from . 
import adapter_chaossycophanthexcom +from . import adapter_erosnsapphosycophanthexcom +from . import adapter_lumossycophanthexcom +from . import adapter_occlumencysycophanthexcom +from . import adapter_phoenixsongnet +from . import adapter_walkingtheplankorg +from . import adapter_dokugacom +from . import adapter_iketernalnet +from . import adapter_storiesofardacom +from . import adapter_destinysgatewaycom +from . import adapter_ncisfictioncom +from . import adapter_fanfiktionde +from . import adapter_ponyfictionarchivenet +from . import adapter_ncisficcom +from . import adapter_nationallibrarynet +from . import adapter_themasquenet +from . import adapter_pretendercentrecom +from . import adapter_darksolaceorg +from . import adapter_finestoriescom +from . import adapter_hpfanficarchivecom +from . import adapter_twilightarchivescom +from . import adapter_nhamagicalworldsus +from . import adapter_hlfictionnet +from . import adapter_dracoandginnycom +from . import adapter_scarvesandcoffeenet +from . import adapter_thepetulantpoetesscom +from . import adapter_wolverineandroguecom +from . import adapter_merlinficdtwinscouk +from . import adapter_thehookupzonenet +from . import adapter_bloodtiesfancom +from . import adapter_qafficcom +from . import adapter_efpfanficnet +from . import adapter_potterficscom +from . import adapter_efictionestelielde +from . import adapter_imagineeficcom +from . import adapter_asr3slashzoneorg +from . import adapter_potterheadsanonymouscom +from . import adapter_fictionpadcom +from . import adapter_storiesonlinenet +from . import adapter_trekiverseorg +from . import adapter_literotica +from . import adapter_voracity2eficcom +from . import adapter_spikeluvercom +from . import adapter_bloodshedversecom +from . import adapter_nocturnallightnet +from . import adapter_fanfichu +from . import adapter_fictionmaniatv +from . import adapter_tolkienfanfiction +from . import adapter_themaplebookshelf +from . import adapter_fannation +from . 
import adapter_sheppardweircom +from . import adapter_samandjacknet +from . import adapter_csiforensicscom +from . import adapter_lotrfanfictioncom +from . import adapter_fhsarchivecom +from . import adapter_fanfictionjunkiesde +from . import adapter_tgstorytimecom +from . import adapter_itcouldhappennet +from . import adapter_forumsspacebattlescom +from . import adapter_forumssufficientvelocitycom +from . import adapter_forumquestionablequestingcom +from . import adapter_ninelivesarchivecom +from . import adapter_masseffect2in +from . import adapter_quotevcom +from . import adapter_mcstoriescom +from . import adapter_buffygilescom +from . import adapter_andromedawebcom +from . import adapter_artemisfowlcom +from . import adapter_naiceanilmenet +from . import adapter_deepinmysoulnet +from . import adapter_kiarepositorymujajinet +from . import adapter_adultfanfictionorg +from . import adapter_fictionhuntcom +from . import adapter_royalroadl +from . import adapter_chosentwofanficcom +from . import adapter_bdsmlibrarycom +from . import adapter_asexstoriescom +from . import adapter_gluttonyfictioncom +from . import adapter_valentchambercom +from . import adapter_looselugscom +from . import adapter_wwwgiantessworldnet +from . import adapter_lotrgficcom +from . import adapter_tomparisdormcom +from . import adapter_writingwhimsicalwanderingsnet +from . import adapter_sugarquillnet +from . import adapter_wwwarea52hkhnet +from . import adapter_starslibrarynet +from . import adapter_fanficauthorsnet +from . import adapter_fireflyfansnet +from . import adapter_fireflypopulliorg +from . import adapter_sebklainenet +from . import adapter_shriftweborgbfa +from . import adapter_trekfanfictionnet +from . import adapter_wuxiaworldcom +from . import adapter_wwwlushstoriescom +from . import adapter_wwwutopiastoriescom +from . import adapter_sinfuldreamscomunicornfic +from . import adapter_sinfuldreamscomwhisperedmuse +from . import adapter_sinfuldreamscomwickedtemptation +from . 
import adapter_asianfanficscom +from . import adapter_webnovelcom +from . import adapter_deandamagecom +from . import adapter_imrightbehindyoucom +from . import adapter_mttjustoncenet +from . import adapter_narutoficorg +from . import adapter_starskyhutcharchivenet +from . import adapter_swordborderlineangelcom +from . import adapter_tasteofpoisoninkubationnet +from . import adapter_thebrokenworldorg +from . import adapter_thedelphicexpansecom +from . import adapter_thundercatsfansorg +from . import adapter_unknowableroomorg +from . import adapter_www13hoursorg +from . import adapter_wwwaneroticstorycom +from . import adapter_gravitytalescom +from . import adapter_lcfanficcom +from . import adapter_noveltrovecom +from . import adapter_inkbunnynet +from . import adapter_alternatehistorycom +from . import adapter_wattpadcom +from . import adapter_lightnovelgatecom +from . import adapter_wwwnovelallcom +from . import adapter_wuxiaworldco +from . import adapter_harrypotterfanfictioncom ## This bit of complexity allows adapters to be added by just adding ## importing. It eliminates the long if/else clauses we used to need @@ -187,9 +192,11 @@ __class_list = [] __domain_map = {} def imports(): + out = [] for name, val in globals().items(): if isinstance(val, types.ModuleType): - yield val.__name__ + out.append(val.__name__) + return out for x in imports(): if "fanficfare.adapters.adapter_" in x: @@ -223,7 +230,7 @@ def getNormalStoryURL(url): def getNormalStoryURLSite(url): # print("getNormalStoryURLSite:%s"%url) if not getNormalStoryURL.__dummyconfig: - getNormalStoryURL.__dummyconfig = Configuration(["test1.com"],"EPUB",lightweight=True) + getNormalStoryURL.__dummyconfig = configurable.Configuration(["test1.com"],"EPUB",lightweight=True) # pulling up an adapter is pretty low over-head. If # it fails, it's a bad url. 
try: @@ -297,7 +304,7 @@ def _get_class_for(url): if not "#post-" in fixedurl: fixedurl = re.sub(r"#.*$","",fixedurl) - parsedUrl = up.urlparse(fixedurl) + parsedUrl = urlparse(fixedurl) domain = parsedUrl.netloc.lower() if( domain != parsedUrl.netloc ): fixedurl = fixedurl.replace(parsedUrl.netloc,domain) diff --git a/fanficfare/adapters/adapter_adastrafanficcom.py b/fanficfare/adapters/adapter_adastrafanficcom.py index e679cda5..895ad39e 100644 --- a/fanficfare/adapters/adapter_adastrafanficcom.py +++ b/fanficfare/adapters/adapter_adastrafanficcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,20 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib -import urllib2 from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six import string_types as basestring +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class AdAstraFanficComSiteAdapter(BaseSiteAdapter): @@ -73,7 +76,7 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_adultfanfictionorg.py b/fanficfare/adapters/adapter_adultfanfictionorg.py index aabd1b20..4acb861b 100644 --- a/fanficfare/adapters/adapter_adultfanfictionorg.py +++ b/fanficfare/adapters/adapter_adultfanfictionorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -- coding: utf-8 -- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,19 +17,22 @@ ################################################################################ ### Written by GComyn ################################################################################ +from __future__ import absolute_import from __future__ import unicode_literals -import time import logging logger = logging.getLogger(__name__) import re import sys -import urllib2 from bs4 import UnicodeDammit from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate ################################################################################ @@ -199,7 +202,7 @@ class AdultFanFictionOrgAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist("Code: 404. {0}".format(url)) elif e.code == 410: @@ -232,7 +235,7 @@ class AdultFanFictionOrgAdapter(BaseSiteAdapter): # Find the chapters: chapters = soup.find('div',{'class':'dropdown-content'}) for i, chapter in enumerate(chapters.findAll('a')): - self.add_chapter(chapter,self.url+'&chapter='+str(i+1)) + self.add_chapter(chapter,self.url+'&chapter='+unicode(i+1)) # Find authorid and URL from... author url. @@ -265,7 +268,7 @@ class AdultFanFictionOrgAdapter(BaseSiteAdapter): logger.debug('Getting the author page: {0}'.format(author_Url)) try: adata = self._fetchUrl(author_Url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code in 404: raise exceptions.StoryDoesNotExist("Author Page: Code: 404. {0}".format(author_Url)) elif e.code == 410: @@ -299,11 +302,11 @@ class AdultFanFictionOrgAdapter(BaseSiteAdapter): while i == 0: ##We already have the first page, so if this is the first time through, skip getting the page if page != 1: - author_Url = '{0}&view=story&zone={1}&page={2}'.format(self.story.getMetadata('authorUrl'), self.zone, str(page)) + author_Url = '{0}&view=story&zone={1}&page={2}'.format(self.story.getMetadata('authorUrl'), self.zone, unicode(page)) logger.debug('Getting the author page: {0}'.format(author_Url)) try: adata = self._fetchUrl(author_Url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code in 404: raise exceptions.StoryDoesNotExist("Author Page: Code: 404. 
{0}".format(author_Url)) elif e.code == 410: @@ -334,7 +337,7 @@ class AdultFanFictionOrgAdapter(BaseSiteAdapter): ##There is also a double
, so we have to fix that, then remove the leading and trailing '-:-'. ##They are always in the same order. ## EDIT 09/26/2016: Had some trouble with unicode errors... so I had to put in the decode/encode parts to fix it - liMetadata = str(lc2).decode('utf-8').replace('\n','').replace('\r','').replace('\t',' ').replace(' ',' ').replace(' ',' ').replace(' ',' ') + liMetadata = unicode(lc2).replace('\n','').replace('\r','').replace('\t',' ').replace(' ',' ').replace(' ',' ').replace(' ',' ') liMetadata = stripHTML(liMetadata.replace(r'
','-:-').replace('','-:-')) liMetadata = liMetadata.strip('-:-').strip('-:-').encode('utf-8') for i, value in enumerate(liMetadata.decode('utf-8').split('-:-')): diff --git a/fanficfare/adapters/adapter_alternatehistorycom.py b/fanficfare/adapters/adapter_alternatehistorycom.py index 339e0afd..c3d13649 100644 --- a/fanficfare/adapters/adapter_alternatehistorycom.py +++ b/fanficfare/adapters/adapter_alternatehistorycom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,8 @@ # limitations under the License. # -from adapter_forumquestionablequestingcom import QuestionablequestingComAdapter +from __future__ import absolute_import +from .adapter_forumquestionablequestingcom import QuestionablequestingComAdapter def getClass(): return WWWAlternatehistoryComAdapter diff --git a/fanficfare/adapters/adapter_andromedawebcom.py b/fanficfare/adapters/adapter_andromedawebcom.py index 46d793d3..c0f6e223 100644 --- a/fanficfare/adapters/adapter_andromedawebcom.py +++ b/fanficfare/adapters/adapter_andromedawebcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,17 +18,18 @@ # ####### webpage. # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return AndromedaWebComAdapter # XXX @@ -125,7 +126,7 @@ class AndromedaWebComAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -159,7 +160,7 @@ class AndromedaWebComAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_archiveofourownorg.py b/fanficfare/adapters/adapter_archiveofourownorg.py index a5b46b18..4e8cbfa2 100644 --- a/fanficfare/adapters/adapter_archiveofourownorg.py +++ b/fanficfare/adapters/adapter_archiveofourownorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2014 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,20 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 import json from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return ArchiveOfOurOwnOrgAdapter @@ -155,7 +158,7 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter): if "This work could have adult content. 
If you proceed you have agreed that you are willing to see such content." in meta: raise exceptions.AdultCheckRequired(self.url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_archiveskyehawkecom.py b/fanficfare/adapters/adapter_archiveskyehawkecom.py index ea66db2f..459a0416 100644 --- a/fanficfare/adapters/adapter_archiveskyehawkecom.py +++ b/fanficfare/adapters/adapter_archiveskyehawkecom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,18 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): @@ -80,7 +81,7 @@ class ArchiveSkyeHawkeComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_artemisfowlcom.py b/fanficfare/adapters/adapter_artemisfowlcom.py index c3c2da67..a6dd1e5a 100644 --- a/fanficfare/adapters/adapter_artemisfowlcom.py +++ b/fanficfare/adapters/adapter_artemisfowlcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,17 +18,18 @@ # ####### webpage. # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return ArtemisFowlComAdapter # XXX @@ -125,7 +126,7 @@ class ArtemisFowlComAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -159,7 +160,7 @@ class ArtemisFowlComAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_asexstoriescom.py b/fanficfare/adapters/adapter_asexstoriescom.py index b965cc32..1699913e 100644 --- a/fanficfare/adapters/adapter_asexstoriescom.py +++ b/fanficfare/adapters/adapter_asexstoriescom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,13 +15,10 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 -import urlparse -import time import os from bs4.element import Comment @@ -29,7 +26,12 @@ from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions import sys -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return ASexStoriesComAdapter @@ -84,7 +86,7 @@ class ASexStoriesComAdapter(BaseSiteAdapter): soup1 = self.make_soup(data1) #strip comments from soup [comment.extract() for comment in soup1.find_all(text=lambda text:isinstance(text, Comment))] - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -133,7 +135,7 @@ class ASexStoriesComAdapter(BaseSiteAdapter): self.add_chapter(chapterTitle, chapterUrl) - rated = soup1.find('div',{'class':'story-info'}).findAll('div',{'story-info-bl5'})[0].find('img')['title'].replace('- Rate','').strip() + rated = soup1.find('div',{'class':'story-info'}).findAll('div',{'class':'story-info-bl5'})[0].find('img')['title'].replace('- Rate','').strip() self.story.setMetadata('rating',rated) self.story.setMetadata('dateUpdated', makeDate('01/01/2001', '%m/%d/%Y')) diff --git a/fanficfare/adapters/adapter_ashwindersycophanthexcom.py b/fanficfare/adapters/adapter_ashwindersycophanthexcom.py index 986977ad..699171bf 100644 --- a/fanficfare/adapters/adapter_ashwindersycophanthexcom.py +++ b/fanficfare/adapters/adapter_ashwindersycophanthexcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return AshwinderSycophantHexComAdapter @@ -115,7 +116,7 @@ class AshwinderSycophantHexComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_asianfanficscom.py b/fanficfare/adapters/adapter_asianfanficscom.py index 3fc39a87..2d65d2df 100644 --- a/fanficfare/adapters/adapter_asianfanficscom.py +++ b/fanficfare/adapters/adapter_asianfanficscom.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return AsianFanFicsComAdapter @@ -125,7 +127,7 @@ class AsianFanFicsComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_asr3slashzoneorg.py b/fanficfare/adapters/adapter_asr3slashzoneorg.py index 0faeeff6..91515376 100644 --- a/fanficfare/adapters/adapter_asr3slashzoneorg.py +++ b/fanficfare/adapters/adapter_asr3slashzoneorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return Asr3SlashzoneOrgAdapter @@ -85,7 +86,7 @@ class Asr3SlashzoneOrgAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -105,7 +106,7 @@ class Asr3SlashzoneOrgAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_bdsmlibrarycom.py b/fanficfare/adapters/adapter_bdsmlibrarycom.py index ede7e94f..824c821a 100644 --- a/fanficfare/adapters/adapter_bdsmlibrarycom.py +++ b/fanficfare/adapters/adapter_bdsmlibrarycom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ ### Fixed the removal of the extra tags from some of the stories and ### removed the attributes from the paragraph and span tags ########################################################################### +from __future__ import absolute_import ''' This works, but some of the stories have abysmal formatting, so it would probably need to be edited for reading. @@ -50,15 +51,17 @@ import logging logger = logging.getLogger(__name__) import re import urllib -import urllib2 import sys -import urlparse - from bs4 import Comment from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return BDSMLibraryComSiteAdapter @@ -110,7 +113,7 @@ class BDSMLibraryComSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(self.url) soup = self.make_soup(data) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -132,7 +135,7 @@ class BDSMLibraryComSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(self.url) soup = self.make_soup(data) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_bloodshedversecom.py b/fanficfare/adapters/adapter_bloodshedversecom.py index 7c561274..ace8eca2 100644 --- a/fanficfare/adapters/adapter_bloodshedversecom.py +++ b/fanficfare/adapters/adapter_bloodshedversecom.py @@ -1,7 +1,6 @@ +from __future__ import absolute_import from datetime import timedelta import re -import urllib2 -import urlparse import logging logger = logging.getLogger(__name__) @@ -9,7 +8,12 @@ logger = logging.getLogger(__name__) from bs4 import BeautifulSoup from ..htmlcleanup import stripHTML -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate from .. 
import exceptions @@ -47,7 +51,7 @@ class BloodshedverseComAdapter(BaseSiteAdapter): if exception: try: data = self._fetchUrl(url, parameters) - except urllib2.HTTPError: + except HTTPError: raise exception(self.url) # Just let self._fetchUrl throw the exception, don't catch and # customize it. diff --git a/fanficfare/adapters/adapter_bloodtiesfancom.py b/fanficfare/adapters/adapter_bloodtiesfancom.py index 31484159..a6bc2c3c 100644 --- a/fanficfare/adapters/adapter_bloodtiesfancom.py +++ b/fanficfare/adapters/adapter_bloodtiesfancom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,19 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - from bs4.element import Tag from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate # By virtue of being recent and requiring both is_adult and user/pass, # adapter_fanficcastletvnet.py is the best choice for learning to @@ -150,7 +152,7 @@ class BloodTiesFansComAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -184,7 +186,7 @@ class BloodTiesFansComAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_buffygilescom.py b/fanficfare/adapters/adapter_buffygilescom.py index 60727150..af50231c 100644 --- a/fanficfare/adapters/adapter_buffygilescom.py +++ b/fanficfare/adapters/adapter_buffygilescom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return BuffyGilesComAdapter @@ -123,7 +124,7 @@ class BuffyGilesComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -157,7 +158,7 @@ class BuffyGilesComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_chaossycophanthexcom.py b/fanficfare/adapters/adapter_chaossycophanthexcom.py index c77e23b6..bee50b0f 100644 --- a/fanficfare/adapters/adapter_chaossycophanthexcom.py +++ b/fanficfare/adapters/adapter_chaossycophanthexcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return ChaosSycophantHexComAdapter @@ -88,7 +89,7 @@ class ChaosSycophantHexComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_chosentwofanficcom.py b/fanficfare/adapters/adapter_chosentwofanficcom.py index d292b0f4..ac9bdd5c 100644 --- a/fanficfare/adapters/adapter_chosentwofanficcom.py +++ b/fanficfare/adapters/adapter_chosentwofanficcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,18 +16,21 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 import sys from bs4.element import Comment from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return ChosenTwoFanFicArchiveAdapter @@ -85,7 +88,7 @@ class ChosenTwoFanFicArchiveAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_csiforensicscom.py b/fanficfare/adapters/adapter_csiforensicscom.py index cba47bca..bab51511 100644 --- a/fanficfare/adapters/adapter_csiforensicscom.py +++ b/fanficfare/adapters/adapter_csiforensicscom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,18 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): @@ -87,7 +88,7 @@ class CSIForensicsComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_darksolaceorg.py b/fanficfare/adapters/adapter_darksolaceorg.py index 925caa4a..5e49a78c 100644 --- a/fanficfare/adapters/adapter_darksolaceorg.py +++ b/fanficfare/adapters/adapter_darksolaceorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. # +from __future__ import absolute_import from ..htmlcleanup import stripHTML # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class DarkSolaceOrgAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_deandamagecom.py b/fanficfare/adapters/adapter_deandamagecom.py index 27a83e6e..a10cfeb3 100644 --- a/fanficfare/adapters/adapter_deandamagecom.py +++ b/fanficfare/adapters/adapter_deandamagecom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. 
# +from __future__ import absolute_import from ..htmlcleanup import stripHTML # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class DeanDamageComSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_deepinmysoulnet.py b/fanficfare/adapters/adapter_deepinmysoulnet.py index a25a6e25..678b3ee1 100644 --- a/fanficfare/adapters/adapter_deepinmysoulnet.py +++ b/fanficfare/adapters/adapter_deepinmysoulnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return DeepInMySoulNetAdapter ## XXX @@ -123,7 +124,7 @@ class DeepInMySoulNetAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -157,7 +158,7 @@ class DeepInMySoulNetAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_destinysgatewaycom.py b/fanficfare/adapters/adapter_destinysgatewaycom.py index 28bde864..5b9c63df 100644 --- a/fanficfare/adapters/adapter_destinysgatewaycom.py +++ b/fanficfare/adapters/adapter_destinysgatewaycom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return DestinysGatewayComAdapter @@ -88,7 +89,7 @@ class DestinysGatewayComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -108,7 +109,7 @@ class DestinysGatewayComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_dokugacom.py b/fanficfare/adapters/adapter_dokugacom.py index 80913b88..f98eaa16 100644 --- a/fanficfare/adapters/adapter_dokugacom.py +++ b/fanficfare/adapters/adapter_dokugacom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,18 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return DokugaComAdapter @@ -126,7 +127,7 @@ class DokugaComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_dracoandginnycom.py b/fanficfare/adapters/adapter_dracoandginnycom.py index 436b09ab..b37cd90c 100644 --- a/fanficfare/adapters/adapter_dracoandginnycom.py +++ b/fanficfare/adapters/adapter_dracoandginnycom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return DracoAndGinnyComAdapter @@ -122,7 +123,7 @@ class DracoAndGinnyComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -147,7 +148,7 @@ class DracoAndGinnyComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_dramioneorg.py b/fanficfare/adapters/adapter_dramioneorg.py index 856f928b..ce7a0ae4 100644 --- a/fanficfare/adapters/adapter_dramioneorg.py +++ b/fanficfare/adapters/adapter_dramioneorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,19 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - from bs4.element import Tag from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return DramioneOrgAdapter @@ -122,7 +124,7 @@ class DramioneOrgAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_efictionestelielde.py b/fanficfare/adapters/adapter_efictionestelielde.py index 233eda25..81bacbde 100644 --- a/fanficfare/adapters/adapter_efictionestelielde.py +++ b/fanficfare/adapters/adapter_efictionestelielde.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return EfictionEstelielDeAdapter @@ -79,7 +80,7 @@ class EfictionEstelielDeAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_efpfanficnet.py b/fanficfare/adapters/adapter_efpfanficnet.py index 52cd53b3..a16d01e2 100644 --- a/fanficfare/adapters/adapter_efpfanficnet.py +++ b/fanficfare/adapters/adapter_efpfanficnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return EFPFanFicNet @@ -110,7 +111,7 @@ class EFPFanFicNet(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_erosnsapphosycophanthexcom.py b/fanficfare/adapters/adapter_erosnsapphosycophanthexcom.py index fd5789ae..ad341981 100644 --- a/fanficfare/adapters/adapter_erosnsapphosycophanthexcom.py +++ b/fanficfare/adapters/adapter_erosnsapphosycophanthexcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return ErosnSapphoSycophantHexComAdapter @@ -88,7 +89,7 @@ class ErosnSapphoSycophantHexComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -108,7 +109,7 @@ class ErosnSapphoSycophantHexComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_fanficauthorsnet.py b/fanficfare/adapters/adapter_fanficauthorsnet.py index a18b925f..83f2341c 100644 --- a/fanficfare/adapters/adapter_fanficauthorsnet.py +++ b/fanficfare/adapters/adapter_fanficauthorsnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -- coding: utf-8 -- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,19 +18,22 @@ ### Adapted by GComyn - November 26, 2016 ### #################################################################################################### +from __future__ import absolute_import from __future__ import unicode_literals -import time import logging logger = logging.getLogger(__name__) import re import sys -import urllib2 from bs4 import UnicodeDammit, Comment from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate #################################################################################################### def getClass(): @@ -157,7 +160,7 @@ class FanficAuthorsNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url+'index/', params, usecache=False) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist("Code: 404. {0}".format(url)) elif e.code == 410: diff --git a/fanficfare/adapters/adapter_fanficcastletvnet.py b/fanficfare/adapters/adapter_fanficcastletvnet.py index ee186272..dce86a4a 100644 --- a/fanficfare/adapters/adapter_fanficcastletvnet.py +++ b/fanficfare/adapters/adapter_fanficcastletvnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2014 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate # In general an 'adapter' needs to do these five things: @@ -138,7 +139,7 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -163,7 +164,7 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_fanfichu.py b/fanficfare/adapters/adapter_fanfichu.py index 7685a6d4..4d7ff372 100644 --- a/fanficfare/adapters/adapter_fanfichu.py +++ b/fanficfare/adapters/adapter_fanfichu.py @@ -1,6 +1,6 @@ -# coding=utf-8 +# -*- coding: utf-8 -*- -# Copyright 2014 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2014 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,11 +15,15 @@ # limitations under the License. # +from __future__ import absolute_import import re -import urllib2 -import urlparse +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six import ensure_text +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError -from base_adapter import BaseSiteAdapter, makeDate +from .base_adapter import BaseSiteAdapter, makeDate from .. 
import exceptions @@ -61,7 +65,7 @@ class FanficHuAdapter(BaseSiteAdapter): if exception: try: data = self._fetchUrl(url, parameters) - except urllib2.HTTPError: + except HTTPError: raise exception(self.url) # Just let self._fetchUrl throw the exception, don't catch and # customize it. @@ -84,7 +88,7 @@ class FanficHuAdapter(BaseSiteAdapter): def extractChapterUrlsAndMetadata(self): soup = self._customized_fetch_url(self.url + '&i=1') - if soup.title.string.encode(_SOURCE_CODE_ENCODING).strip(' :') == 'írta': + if ensure_text(soup.title.string).strip(u' :') == u'írta': raise exceptions.StoryDoesNotExist(self.url) chapter_options = soup.find('form', action='viewstory.php').select('option') @@ -140,46 +144,46 @@ class FanficHuAdapter(BaseSiteAdapter): while index < len(cells): cell = cells[index] - key = cell.b.string.encode(_SOURCE_CODE_ENCODING).strip(':') + key = ensure_text(cell.b.string).strip(u':') try: - value = cells[index+1].string.encode(_SOURCE_CODE_ENCODING) - except AttributeError: + value = ensure_text(cells[index+1].string) + except: value = None - if key == 'Kategória': + if key == u'Kategória': for anchor in cells[index+1]('a'): self.story.addToList('category', anchor.string) - elif key == 'Szereplõk': + elif key == u'Szereplõk': if cells[index+1].string: for name in cells[index+1].string.split(', '): self.story.addToList('character', name) - elif key == 'Korhatár': + elif key == u'Korhatár': if value != 'nem korhatáros': self.story.setMetadata('rating', value) - elif key == 'Figyelmeztetések': + elif key == u'Figyelmeztetések': for b_tag in cells[index+1]('b'): self.story.addToList('warnings', b_tag.string) - elif key == 'Jellemzõk': + elif key == u'Jellemzõk': for genre in cells[index+1].string.split(', '): self.story.addToList('genre', genre) - elif key == 'Fejezetek': + elif key == u'Fejezetek': self.story.setMetadata('numChapters', int(value)) - elif key == 'Megjelenés': + elif key == u'Megjelenés': self.story.setMetadata('datePublished', 
makeDate(value, self.DATE_FORMAT)) - elif key == 'Frissítés': + elif key == u'Frissítés': self.story.setMetadata('dateUpdated', makeDate(value, self.DATE_FORMAT)) - elif key == 'Szavak': + elif key == u'Szavak': self.story.setMetadata('numWords', value) - elif key == 'Befejezett': + elif key == u'Befejezett': self.story.setMetadata('status', 'Completed' if value == 'Nem' else 'In-Progress') index += 2 diff --git a/fanficfare/adapters/adapter_fanfictionjunkiesde.py b/fanficfare/adapters/adapter_fanfictionjunkiesde.py index c2418b47..d8c95dfc 100644 --- a/fanficfare/adapters/adapter_fanfictionjunkiesde.py +++ b/fanficfare/adapters/adapter_fanfictionjunkiesde.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate # By virtue of being recent and requiring both is_adult and user/pass, # adapter_fanficcastletvnet.py is the best choice for learning to @@ -147,7 +148,7 @@ class FanfictionJunkiesDeAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_fanfictionnet.py b/fanficfare/adapters/adapter_fanfictionnet.py index 9140ff6f..8fa6dc77 100644 --- a/fanficfare/adapters/adapter_fanfictionnet.py +++ b/fanficfare/adapters/adapter_fanfictionnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,21 @@ # limitations under the License. # +from __future__ import absolute_import from datetime import datetime import logging logger = logging.getLogger(__name__) import re -import urllib2 -from urllib import unquote_plus + +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + from .. 
import exceptions as exceptions from ..htmlcleanup import stripHTML -from base_adapter import BaseSiteAdapter, makeDate +from .base_adapter import BaseSiteAdapter, makeDate ffnetgenres=["Adventure", "Angst", "Crime", "Drama", "Family", "Fantasy", "Friendship", "General", "Horror", "Humor", "Hurt-Comfort", "Mystery", "Parody", "Poetry", "Romance", "Sci-Fi", @@ -100,7 +104,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): data = self._fetchUrl(url) #logger.debug("\n===================\n%s\n===================\n"%data) soup = self.make_soup(data) - except urllib2.HTTPError as e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(url) else: @@ -135,7 +139,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): and "This request takes too long to process, it is timed out by the server." not in newdata: logger.debug('=======Found newer chapter: %s' % tryurl) soup = self.make_soup(newdata) - except urllib2.HTTPError as e: + except HTTPError as e: if e.code == 503: raise e except Exception as e: diff --git a/fanficfare/adapters/adapter_fanfiktionde.py b/fanficfare/adapters/adapter_fanfiktionde.py index 70516081..ef50a9aa 100644 --- a/fanficfare/adapters/adapter_fanfiktionde.py +++ b/fanficfare/adapters/adapter_fanfiktionde.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,19 +15,21 @@ # limitations under the License. # +from __future__ import absolute_import import time import logging logger = logging.getLogger(__name__) import re import urllib -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return FanFiktionDeAdapter @@ -118,7 +120,7 @@ class FanFiktionDeAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -164,7 +166,7 @@ class FanFiktionDeAdapter(BaseSiteAdapter): self.story.extendList('genre',genres[:genres.index(' / ')].split(', ')) self.story.setMetadata('rating', genres[genres.index(' / ')+3:]) - self.story.addToList('category',stripHTML(soup.find('span',id='ffcbox-story-topic-1')).split(' / ')[2]) + self.story.addToList('category',stripHTML(soup.find('span',id='ffcbox-story-topic-1')).split('/')[2].strip()) try: self.story.setMetadata('native_status', head.find_all('span',{'class':'titled-icon'})[3]['title']) diff --git a/fanficfare/adapters/adapter_fannation.py b/fanficfare/adapters/adapter_fannation.py index 78a053ca..d34beaf8 100644 --- a/fanficfare/adapters/adapter_fannation.py +++ b/fanficfare/adapters/adapter_fannation.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2014 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,8 +16,9 @@ # # Software: eFiction +from __future__ import absolute_import import re -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class FanNationAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_fhsarchivecom.py b/fanficfare/adapters/adapter_fhsarchivecom.py index d2f9ff3d..1eed35e8 100644 --- a/fanficfare/adapters/adapter_fhsarchivecom.py +++ b/fanficfare/adapters/adapter_fhsarchivecom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2014 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,8 +16,9 @@ # # Software: eFiction +from __future__ import absolute_import import re -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class FHSArchiveComAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_ficbooknet.py b/fanficfare/adapters/adapter_ficbooknet.py index 14304ab7..afac38e7 100644 --- a/fanficfare/adapters/adapter_ficbooknet.py +++ b/fanficfare/adapters/adapter_ficbooknet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,19 +15,22 @@ # limitations under the License. # -import time +from __future__ import absolute_import import datetime import logging logger = logging.getLogger(__name__) import re -import urllib2 from .. import translit from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): @@ -77,7 +80,7 @@ class FicBookNetAdapter(BaseSiteAdapter): logger.debug("URL: "+url) try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_fictionalleyorg.py b/fanficfare/adapters/adapter_fictionalleyorg.py index bdbf323d..d7bee1df 100644 --- a/fanficfare/adapters/adapter_fictionalleyorg.py +++ b/fanficfare/adapters/adapter_fictionalleyorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,18 +15,19 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re import urllib -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class FictionAlleyOrgSiteAdapter(BaseSiteAdapter): @@ -80,7 +81,7 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter): try: data = self._postFetchWithIAmOld(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_fictionhuntcom.py b/fanficfare/adapters/adapter_fictionhuntcom.py index e3ae1bef..de1d203c 100644 --- a/fanficfare/adapters/adapter_fictionhuntcom.py +++ b/fanficfare/adapters/adapter_fictionhuntcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,15 +15,18 @@ # limitations under the License. # +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - from .. 
import exceptions as exceptions from ..htmlcleanup import stripHTML -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class FictionHuntComSiteAdapter(BaseSiteAdapter): @@ -68,7 +71,7 @@ class FictionHuntComSiteAdapter(BaseSiteAdapter): url = self.url try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.meta) else: diff --git a/fanficfare/adapters/adapter_fictionmaniatv.py b/fanficfare/adapters/adapter_fictionmaniatv.py index d6de271c..b3d64df9 100644 --- a/fanficfare/adapters/adapter_fictionmaniatv.py +++ b/fanficfare/adapters/adapter_fictionmaniatv.py @@ -1,8 +1,11 @@ +from __future__ import absolute_import import re -import urllib2 -import urlparse +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError -from base_adapter import BaseSiteAdapter, makeDate +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): @@ -44,7 +47,7 @@ class FictionManiaTVAdapter(BaseSiteAdapter): if exception: try: data = self._fetchUrl(url, parameters) - except urllib2.HTTPError: + except HTTPError: raise exception(self.url) # Just let self._fetchUrl throw the exception, don't catch and # customize it. 
diff --git a/fanficfare/adapters/adapter_fictionpadcom.py b/fanficfare/adapters/adapter_fictionpadcom.py index 2881bfe6..9d8dc4e4 100644 --- a/fanficfare/adapters/adapter_fictionpadcom.py +++ b/fanficfare/adapters/adapter_fictionpadcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,19 +15,21 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 -import time import json #from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class FictionPadSiteAdapter(BaseSiteAdapter): @@ -123,7 +125,7 @@ class FictionPadSiteAdapter(BaseSiteAdapter): data = data[:data.rindex(";")] data = data.replace('tables:','"tables":') tables = json.loads(data)['tables'] - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(url) else: diff --git a/fanficfare/adapters/adapter_fictionpresscom.py b/fanficfare/adapters/adapter_fictionpresscom.py index 19882975..132e291f 100644 --- a/fanficfare/adapters/adapter_fictionpresscom.py +++ b/fanficfare/adapters/adapter_fictionpresscom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,15 +15,16 @@ # limitations under the License. 
# -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 -import time + +# py2 vs py3 transition +from ..six import text_type as unicode ## They're from the same people and pretty much identical. -from adapter_fanfictionnet import FanFictionNetSiteAdapter +from .adapter_fanfictionnet import FanFictionNetSiteAdapter class FictionPressComSiteAdapter(FanFictionNetSiteAdapter): diff --git a/fanficfare/adapters/adapter_ficwadcom.py b/fanficfare/adapters/adapter_ficwadcom.py index 2f2fc834..aad2d0ea 100644 --- a/fanficfare/adapters/adapter_ficwadcom.py +++ b/fanficfare/adapters/adapter_ficwadcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,18 +15,19 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 -import time -import httplib, urllib from .. import exceptions as exceptions from ..htmlcleanup import stripHTML -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class FicwadComSiteAdapter(BaseSiteAdapter): @@ -96,7 +97,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter): if "

Featured Story

" in data: raise exceptions.StoryDoesNotExist(self.url) soup = self.make_soup(data) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -118,7 +119,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter): self._setURL(url) try: soup = self.make_soup(self._fetchUrl(url)) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_fimfictionnet.py b/fanficfare/adapters/adapter_fimfictionnet.py index 42f7233b..444e6b68 100644 --- a/fanficfare/adapters/adapter_fimfictionnet.py +++ b/fanficfare/adapters/adapter_fimfictionnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,19 +15,23 @@ # limitations under the License. # +from __future__ import absolute_import import time from datetime import date, datetime import logging logger = logging.getLogger(__name__) import re -import urllib2 -import cookielib as cl import json from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError +from ..six.moves import http_cookiejar as cl + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return FimFictionNetSiteAdapter @@ -97,7 +101,7 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter): data = self.do_fix_blockquotes(self._fetchUrl(self.url, usecache=(not self.is_adult))) soup = self.make_soup(data) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_finestoriescom.py b/fanficfare/adapters/adapter_finestoriescom.py index 59d47e6c..81be7f5a 100644 --- a/fanficfare/adapters/adapter_finestoriescom.py +++ b/fanficfare/adapters/adapter_finestoriescom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,9 +15,14 @@ # limitations under the License. 
# +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) -from adapter_storiesonlinenet import StoriesOnlineNetAdapter + +# py2 vs py3 transition +from ..six import text_type as unicode + +from .adapter_storiesonlinenet import StoriesOnlineNetAdapter def getClass(): return FineStoriesComAdapter diff --git a/fanficfare/adapters/adapter_fireflyfansnet.py b/fanficfare/adapters/adapter_fireflyfansnet.py index 005884f2..160815d7 100644 --- a/fanficfare/adapters/adapter_fireflyfansnet.py +++ b/fanficfare/adapters/adapter_fireflyfansnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,14 +17,16 @@ #################################################################################################### # Adapted by GComyn - December 10, 2016 #################################################################################################### +from __future__ import absolute_import ''' This adapter will download the stories from the www.fireflyfans.net forum pages ''' import logging import re import sys -import time -import urllib2 +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError -from base_adapter import BaseSiteAdapter, makeDate +from .base_adapter import BaseSiteAdapter, makeDate from .. 
import exceptions as exceptions from ..htmlcleanup import stripHTML @@ -79,7 +81,7 @@ class FireFlyFansNetSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -131,7 +133,7 @@ class FireFlyFansNetSiteAdapter(BaseSiteAdapter): # which is usualy FireFly on this site, but I'm going to get them # anyway.a category = soup.find('span', {'id': 'MainContent_txtItemDetails'}) - category = stripHTML(str(category).replace(b"\xc2\xa0", ' ')) + category = stripHTML(unicode(category).replace(u"\xa0", u' ')) metad = category.split(' ') for meta in metad: if ":" in meta: diff --git a/fanficfare/adapters/adapter_fireflypopulliorg.py b/fanficfare/adapters/adapter_fireflypopulliorg.py index 028862c9..41b45a9c 100644 --- a/fanficfare/adapters/adapter_fireflypopulliorg.py +++ b/fanficfare/adapters/adapter_fireflypopulliorg.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2017 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,13 +21,17 @@ ###================================================================================================= ### I have started to use lines of # on the line just before a function so they are easier to find. #################################################################################################### +from __future__ import absolute_import ''' This adapter scrapes the metadata and chapter text from stories on firefly.populli.org ''' import logging import re -import urllib2 import sys -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate from .. 
import exceptions as exceptions from ..htmlcleanup import stripHTML @@ -97,7 +101,7 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter): ''' try: page_data = self._fetchUrl(page) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('404 error: {}'.format(page)) else: @@ -125,7 +129,8 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter): if not title: raise exceptions.StoryDoesNotExist('Cannot find title on the page {}'.format(url)) - self.story.setMetadata('title', stripHTML(soup.find('h2'))) + rawtitle = stripHTML(soup.find('h2')) + self.story.setMetadata('title', rawtitle) # This site has the entire story on one page, so we will be using the normalized URL as # the chapterUrl and the Title as the chapter Title @@ -149,7 +154,7 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter): if ',' in mdata: self.story.setMetadata('coauthor', ', '.join(mdata.split(',')[1:]).strip()) mdata = mdata.split(',')[0] - + # print mdata # self.story.getMetadata('coauthor') # sys.exit() @@ -180,13 +185,16 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter): if stories: for story in stories: # There alot of nbsp's (non broken spaces) in here, so I'm going to remove them - # I'm also getting rid of the bold tags and the nextline characters to make it + # I'm also getting rid of the bold tags and the nextline characters to make it # easier to get the information below - story = repr(story).replace(b'\\xa0', '').replace(' ',' ').replace( + story = repr(story).replace(u'\\xa0', '').replace(' ',' ').replace( '','').replace('','').replace(r'\n','') story = self.make_soup(story).find('p') story_a = story.find('a') - title = self.story.getMetadata('title').split('-')[0].strip() + # not sure why this split is here, but it caused + # problems when title_chapter_range_pattern + # introduces a '-', so save rawtitle --JM + title = rawtitle.split('-')[0].strip() if story_a.get_text() == title: story_found = True break @@ -265,7 +273,7 
@@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter): else: ## This should catch anything else, and shouldn't ever really be gotten # to, but I'm going to have it print out in the debugger, just in case - logger.debug('Metadata not caught: %s' % str(meta)) + logger.debug('Metadata not caught: %s' % unicode(meta)) zzzzzzzz = 0 elif label == 'characters': self.story.setMetadata('characters', value) @@ -315,7 +323,7 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter): # the end of the section, which has alot of extraneous things, then adding my own div # wrapper, recreating the soup, then getting that div from the soup again, before sending to # the writers. - story = repr(story).replace(b'\\xa0', '').replace(' ',' ').replace(r'\n','').strip() + story = repr(story).replace(u'\\xa0', '').replace(' ',' ').replace(r'\n','').strip() story = story[12:] story = story[:story.find('

Please <')] story = '

' + story + '
' diff --git a/fanficfare/adapters/adapter_forumquestionablequestingcom.py b/fanficfare/adapters/adapter_forumquestionablequestingcom.py index 6c07d4cb..59c6d534 100644 --- a/fanficfare/adapters/adapter_forumquestionablequestingcom.py +++ b/fanficfare/adapters/adapter_forumquestionablequestingcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. # +from __future__ import absolute_import import re from ..htmlcleanup import stripHTML -from base_xenforoforum_adapter import BaseXenForoForumAdapter +from .base_xenforoforum_adapter import BaseXenForoForumAdapter def getClass(): return QuestionablequestingComAdapter diff --git a/fanficfare/adapters/adapter_forumsspacebattlescom.py b/fanficfare/adapters/adapter_forumsspacebattlescom.py index eaa90bcf..e9e2bc2b 100644 --- a/fanficfare/adapters/adapter_forumsspacebattlescom.py +++ b/fanficfare/adapters/adapter_forumsspacebattlescom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,9 +15,10 @@ # limitations under the License. 
# +from __future__ import absolute_import import re -from base_xenforoforum_adapter import BaseXenForoForumAdapter +from .base_xenforoforum_adapter import BaseXenForoForumAdapter def getClass(): return ForumsSpacebattlesComAdapter diff --git a/fanficfare/adapters/adapter_forumssufficientvelocitycom.py b/fanficfare/adapters/adapter_forumssufficientvelocitycom.py index 679f9df1..c9b09b56 100644 --- a/fanficfare/adapters/adapter_forumssufficientvelocitycom.py +++ b/fanficfare/adapters/adapter_forumssufficientvelocitycom.py @@ -15,9 +15,10 @@ # limitations under the License. # +from __future__ import absolute_import import re -from base_xenforoforum_adapter import BaseXenForoForumAdapter +from .base_xenforoforum_adapter import BaseXenForoForumAdapter def getClass(): return ForumsSufficientVelocityComAdapter diff --git a/fanficfare/adapters/adapter_gluttonyfictioncom.py b/fanficfare/adapters/adapter_gluttonyfictioncom.py index eef9410a..adfc6439 100644 --- a/fanficfare/adapters/adapter_gluttonyfictioncom.py +++ b/fanficfare/adapters/adapter_gluttonyfictioncom.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- -# Copyright 2015 FanFicFare team -# Copyright 2016 FanFicFare team +# Copyright 2018 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -21,7 +21,8 @@ ### Rewritten by: GComyn on November, 06, 2016 ### Original was adapter_fannation.py ################################################################################## -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class GluttonyFictionComAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_gravitytalescom.py b/fanficfare/adapters/adapter_gravitytalescom.py index 97183427..86f02a61 100644 --- a/fanficfare/adapters/adapter_gravitytalescom.py +++ b/fanficfare/adapters/adapter_gravitytalescom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2014 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,10 +18,10 @@ ## Adapted by GComyn on April 21, 2017 #################################################################################################### +from __future__ import absolute_import import logging import re import time -import urllib2 from datetime import datetime logger = logging.getLogger(__name__) @@ -37,7 +37,11 @@ except ImportError: # logger.warn('No version of feedparser module available, falling back to naive published and updated date') feedparser = None -from base_adapter import BaseSiteAdapter +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter from .. 
import exceptions as exceptions from ..htmlcleanup import stripHTML @@ -93,7 +97,7 @@ class GravityTalesComSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('Error 404: {0}'.format(self.url)) else: diff --git a/fanficfare/adapters/adapter_harrypotterfanfictioncom.py b/fanficfare/adapters/adapter_harrypotterfanfictioncom.py index 8fe607be..8a709e38 100644 --- a/fanficfare/adapters/adapter_harrypotterfanfictioncom.py +++ b/fanficfare/adapters/adapter_harrypotterfanfictioncom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,16 +15,18 @@ # limitations under the License. # +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter): @@ -69,7 +71,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_hlfictionnet.py b/fanficfare/adapters/adapter_hlfictionnet.py index 92686186..0292fb8a 100644 --- a/fanficfare/adapters/adapter_hlfictionnet.py +++ b/fanficfare/adapters/adapter_hlfictionnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return HLFictionNetAdapter @@ -78,7 +79,7 @@ class HLFictionNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_hpfanficarchivecom.py b/fanficfare/adapters/adapter_hpfanficarchivecom.py index 1cebac01..e0e597e2 100644 --- a/fanficfare/adapters/adapter_hpfanficarchivecom.py +++ b/fanficfare/adapters/adapter_hpfanficarchivecom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,19 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - from bs4.element import Comment from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return HPFanficArchiveComAdapter @@ -78,7 +80,7 @@ class HPFanficArchiveComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_iketernalnet.py b/fanficfare/adapters/adapter_iketernalnet.py index a0e5bd13..68000aeb 100644 --- a/fanficfare/adapters/adapter_iketernalnet.py +++ b/fanficfare/adapters/adapter_iketernalnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,16 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return IkEternalNetAdapter @@ -121,7 +123,7 @@ class IkEternalNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -159,7 +161,7 @@ class IkEternalNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_imagineeficcom.py b/fanficfare/adapters/adapter_imagineeficcom.py index 32ecf81e..c80b895d 100644 --- a/fanficfare/adapters/adapter_imagineeficcom.py +++ b/fanficfare/adapters/adapter_imagineeficcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return ImagineEFicComAdapter @@ -122,7 +123,7 @@ class ImagineEFicComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -147,7 +148,7 @@ class ImagineEFicComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_imrightbehindyoucom.py b/fanficfare/adapters/adapter_imrightbehindyoucom.py index f07f3a83..ba1c47db 100644 --- a/fanficfare/adapters/adapter_imrightbehindyoucom.py +++ b/fanficfare/adapters/adapter_imrightbehindyoucom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. 
# +from __future__ import absolute_import from ..htmlcleanup import stripHTML # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class ImRightBehindYouComSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_inkbunnynet.py b/fanficfare/adapters/adapter_inkbunnynet.py index d8a15eff..03f9f2e9 100644 --- a/fanficfare/adapters/adapter_inkbunnynet.py +++ b/fanficfare/adapters/adapter_inkbunnynet.py @@ -18,20 +18,23 @@ # Adapted by GComyn on April 24, 2017 # Updated by GComyn on June 11, 2018 +from __future__ import absolute_import import logging import re import sys -import urllib2 from datetime import datetime, timedelta -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate from .. import exceptions as exceptions from ..htmlcleanup import stripHTML +from ..dateutils import UNIX_EPOCHE -UNIX_EPOCHE = datetime.fromtimestamp(0) logger = logging.getLogger(__name__) - def getClass(): return InkBunnyNetSiteAdapter @@ -122,7 +125,7 @@ class InkBunnyNetSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('Error 404: {0}'.format(self.url)) else: diff --git a/fanficfare/adapters/adapter_itcouldhappennet.py b/fanficfare/adapters/adapter_itcouldhappennet.py index 7e6329c5..1bd03f15 100644 --- a/fanficfare/adapters/adapter_itcouldhappennet.py +++ b/fanficfare/adapters/adapter_itcouldhappennet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,8 +16,9 @@ # # Software: eFiction +from __future__ import absolute_import import re -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class ItCouldHappenNetSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_kiarepositorymujajinet.py b/fanficfare/adapters/adapter_kiarepositorymujajinet.py index df6f06b9..8a8ba785 100644 --- a/fanficfare/adapters/adapter_kiarepositorymujajinet.py +++ b/fanficfare/adapters/adapter_kiarepositorymujajinet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return KiaRepositoryMujajiNetAdapter ## XXX @@ -123,7 +124,7 @@ class KiaRepositoryMujajiNetAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -157,7 +158,7 @@ class KiaRepositoryMujajiNetAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_ksarchivecom.py b/fanficfare/adapters/adapter_ksarchivecom.py index ad7dbe80..299fa9f7 100644 --- a/fanficfare/adapters/adapter_ksarchivecom.py +++ b/fanficfare/adapters/adapter_ksarchivecom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate # Search for XXX comments--that's where things are most likely to need changing. 
@@ -101,7 +102,7 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -134,7 +135,7 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_lcfanficcom.py b/fanficfare/adapters/adapter_lcfanficcom.py index 14155e5a..6faadd8b 100644 --- a/fanficfare/adapters/adapter_lcfanficcom.py +++ b/fanficfare/adapters/adapter_lcfanficcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2014 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,15 +18,18 @@ ## Adapted by GComyn on April 22, 2017 #################################################################################################### +from __future__ import absolute_import import logging import json import re import sys # ## used for debug purposes -import time -import urllib2 import datetime -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate from .. 
import exceptions as exceptions from ..htmlcleanup import stripHTML @@ -96,7 +99,7 @@ class LCFanFicComSiteAdapter(BaseSiteAdapter): url = self.url try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('Error 404: {0}'.format(self.url)) else: diff --git a/fanficfare/adapters/adapter_libraryofmoriacom.py b/fanficfare/adapters/adapter_libraryofmoriacom.py index 08196cfb..adefe3b5 100644 --- a/fanficfare/adapters/adapter_libraryofmoriacom.py +++ b/fanficfare/adapters/adapter_libraryofmoriacom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,8 @@ # # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class LibraryOfMoriaComAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_lightnovelgatecom.py b/fanficfare/adapters/adapter_lightnovelgatecom.py index fe79e103..9c2fe519 100644 --- a/fanficfare/adapters/adapter_lightnovelgatecom.py +++ b/fanficfare/adapters/adapter_lightnovelgatecom.py @@ -19,12 +19,15 @@ ### Tested with Calibre #################################################################################################### +from __future__ import absolute_import import logging import re -import urllib2 -import urlparse +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError -from base_adapter import BaseSiteAdapter, makeDate +from .base_adapter import BaseSiteAdapter, makeDate from bs4 import Comment from ..htmlcleanup import removeEntities, stripHTML, fix_excess_space @@ -95,7 
+98,7 @@ class LightNovelGateSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('404 error: {}'.format(url)) else: diff --git a/fanficfare/adapters/adapter_literotica.py b/fanficfare/adapters/adapter_literotica.py index 8446ab84..e0ea5e3a 100644 --- a/fanficfare/adapters/adapter_literotica.py +++ b/fanficfare/adapters/adapter_literotica.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,18 +15,21 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 -import urlparse from bs4.element import Comment from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class LiteroticaSiteAdapter(BaseSiteAdapter): @@ -132,7 +135,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter): soup1 = self.make_soup(data1) #strip comments from soup [comment.extract() for comment in soup1.findAll(text=lambda text:isinstance(text, Comment))] - except urllib2.HTTPError, e: + except HTTPError as e: if e.code in [404, 410]: raise exceptions.StoryDoesNotExist(self.url) else: @@ -157,7 +160,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter): #strip comments from soup [comment.extract() for comment in soupAuth.findAll(text=lambda text:isinstance(text, Comment))] # logger.debug(soupAuth) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code in [404, 410]: raise exceptions.StoryDoesNotExist(authorurl) else: @@ -343,7 +346,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter): chapter_description = '

Description: %s


' % chapter_description fullhtml += self.getPageText(raw_page, url) if pages: - for page_no in xrange(2, len(page_nums) + 1): + for page_no in range(2, len(page_nums) + 1): page_url = url + "?page=%s" % page_no logger.debug("page_url= %s" % page_url) raw_page = self._fetchUrl(page_url) diff --git a/fanficfare/adapters/adapter_looselugscom.py b/fanficfare/adapters/adapter_looselugscom.py index 20b93af2..9d72db0c 100644 --- a/fanficfare/adapters/adapter_looselugscom.py +++ b/fanficfare/adapters/adapter_looselugscom.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- -# Copyright 2015 FanFicFare team -# Copyright 2016 FanFicFare team +# Copyright 2018 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,7 +21,8 @@ ### Rewritten by: GComyn on November, 06, 2016 ### Original was adapter_fannation.py ################################################################################## -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class LooseLugsComAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_lotrfanfictioncom.py b/fanficfare/adapters/adapter_lotrfanfictioncom.py index ad4f9c33..a57e7d83 100644 --- a/fanficfare/adapters/adapter_lotrfanfictioncom.py +++ b/fanficfare/adapters/adapter_lotrfanfictioncom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2014 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,7 +16,8 @@ # # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class TheLOTRFanFictionSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_lotrgficcom.py b/fanficfare/adapters/adapter_lotrgficcom.py index 6fb6392d..5d75e9fc 100644 --- a/fanficfare/adapters/adapter_lotrgficcom.py +++ b/fanficfare/adapters/adapter_lotrgficcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,17 +18,19 @@ ### Adapted by GComyn ### Completed on November, 22, 2016 ############################################################################## -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re import urllib -import urllib2 - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class LOTRgficComAdapter(BaseSiteAdapter): @@ -79,7 +81,7 @@ class LOTRgficComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -253,7 +255,7 @@ class LOTRgficComAdapter(BaseSiteAdapter): ## dedicated tag, so we have to split some hairs.. ## This may not work every time... but I tested it with 6 stories... mdata = metad[0] - while '
' not in str(mdata.nextSibling): + while '
' not in unicode(mdata.nextSibling): mdata = mdata.nextSibling self.setDescription(url,mdata.previousSibling.previousSibling.get_text()) @@ -284,7 +286,7 @@ class LOTRgficComAdapter(BaseSiteAdapter): #
#

## we'll have to remove the non-breaking spaces to get this to work. - metad = str(metad).replace(b"\xc2\xa0",'').replace('\n','') + metad = unicode(metad).replace(u"\xa0",'').replace('\n','') for txt in metad.split('
'): if 'Challenges:' in txt: txt = txt.replace('Challenges:','').strip() diff --git a/fanficfare/adapters/adapter_lumossycophanthexcom.py b/fanficfare/adapters/adapter_lumossycophanthexcom.py index f87410be..6b8f1c84 100644 --- a/fanficfare/adapters/adapter_lumossycophanthexcom.py +++ b/fanficfare/adapters/adapter_lumossycophanthexcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return LumosSycophantHexComAdapter @@ -88,7 +89,7 @@ class LumosSycophantHexComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index 2fb26965..99ae3f87 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,15 +15,21 @@ # limitations under the License. 
# +from __future__ import absolute_import import bs4 import datetime import logging import re -import urllib2 +from itertools import takewhile from ..htmlcleanup import removeEntities, stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError +from ..six.moves import zip as izip + +from .base_adapter import BaseSiteAdapter, makeDate _logger = logging.getLogger(__name__) @@ -114,7 +120,7 @@ class MassEffect2InAdapter(BaseSiteAdapter): try: startingChapter = self._makeChapter(self.url) - except urllib2.HTTPError, error: + except HTTPError as error: if error.code == 404: raise exceptions.StoryDoesNotExist(self.url) raise @@ -198,7 +204,7 @@ class MassEffect2InAdapter(BaseSiteAdapter): chapterTitle = re.sub(garbagePattern, u'', chapter.getHeading()[chapterTitleStart:]) self.add_chapter(chapterTitle, url) - except ParsingError, error: + except ParsingError as error: raise exceptions.FailedToDownload(u"Failed to download chapter `%s': %s" % (url, error)) # Some metadata are handled separately due to format conversions. @@ -700,7 +706,6 @@ def _getLargestCommonPrefix(*args): """Returns largest common prefix of all unicode arguments, ignoring case. 
:rtype : unicode """ - from itertools import takewhile, izip toLower = lambda xs: map(lambda x: x.lower(), xs) allSame = lambda xs: len(set(toLower(xs))) == 1 return u''.join([i[0] for i in takewhile(allSame, izip(*args))]) diff --git a/fanficfare/adapters/adapter_mcstoriescom.py b/fanficfare/adapters/adapter_mcstoriescom.py index 93572bf2..954d15bb 100644 --- a/fanficfare/adapters/adapter_mcstoriescom.py +++ b/fanficfare/adapters/adapter_mcstoriescom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,20 +15,22 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 -import urlparse -import time import os from bs4.element import Comment from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class MCStoriesComSiteAdapter(BaseSiteAdapter): @@ -86,7 +88,7 @@ class MCStoriesComSiteAdapter(BaseSiteAdapter): soup1 = self.make_soup(data1) #strip comments from soup [comment.extract() for comment in soup1.find_all(text=lambda text:isinstance(text, Comment))] - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_mediaminerorg.py b/fanficfare/adapters/adapter_mediaminerorg.py index 2bfb4bdc..291542f5 100644 --- a/fanficfare/adapters/adapter_mediaminerorg.py +++ b/fanficfare/adapters/adapter_mediaminerorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,19 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re import urllib -import urllib2 - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class MediaMinerOrgSiteAdapter(BaseSiteAdapter): @@ -108,7 +110,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) # w/o trailing / gets 'chapter list' page even for one-shots. 
- except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: logger.error("404 on %s"%url) raise exceptions.StoryDoesNotExist(self.url) @@ -120,7 +122,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter): ## title: ##

A, A' Fan Fiction ❯ Mmmmm

- titletext = stripHTML(soup.find("h1",{"id":"post-title"})) + titletext = unicode(stripHTML(soup.find("h1",{"id":"post-title"}))) titletext = titletext[titletext.index(u'❯')+2:] # print("title:(%s)"%titletext) self.story.setMetadata('title',titletext) diff --git a/fanficfare/adapters/adapter_merlinficdtwinscouk.py b/fanficfare/adapters/adapter_merlinficdtwinscouk.py index b34cdc40..2385ae0a 100644 --- a/fanficfare/adapters/adapter_merlinficdtwinscouk.py +++ b/fanficfare/adapters/adapter_merlinficdtwinscouk.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return MerlinFicDtwinsCoUk @@ -122,7 +123,7 @@ class MerlinFicDtwinsCoUk(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -147,7 +148,7 @@ class MerlinFicDtwinsCoUk(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_midnightwhispers.py b/fanficfare/adapters/adapter_midnightwhispers.py index fa889f8a..7c885752 100644 --- a/fanficfare/adapters/adapter_midnightwhispers.py +++ b/fanficfare/adapters/adapter_midnightwhispers.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate # Search for XXX comments--that's where things are most likely to need changing. 
@@ -106,7 +107,7 @@ class MidnightwhispersAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -139,7 +140,7 @@ class MidnightwhispersAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_mttjustoncenet.py b/fanficfare/adapters/adapter_mttjustoncenet.py index ad8c09ad..5349b40b 100644 --- a/fanficfare/adapters/adapter_mttjustoncenet.py +++ b/fanficfare/adapters/adapter_mttjustoncenet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. # +from __future__ import absolute_import from ..htmlcleanup import stripHTML # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class MTTJustOnceNetSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_naiceanilmenet.py b/fanficfare/adapters/adapter_naiceanilmenet.py index 3838f06f..758ef07f 100644 --- a/fanficfare/adapters/adapter_naiceanilmenet.py +++ b/fanficfare/adapters/adapter_naiceanilmenet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,7 +16,8 @@ # # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class NaiceaNilmeNetAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_narutoficorg.py b/fanficfare/adapters/adapter_narutoficorg.py index 35e3bb86..2d0bbd92 100644 --- a/fanficfare/adapters/adapter_narutoficorg.py +++ b/fanficfare/adapters/adapter_narutoficorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. # +from __future__ import absolute_import from ..htmlcleanup import stripHTML # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class NarutoFicOrgSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_nationallibrarynet.py b/fanficfare/adapters/adapter_nationallibrarynet.py index bdea9c85..b569b0c2 100644 --- a/fanficfare/adapters/adapter_nationallibrarynet.py +++ b/fanficfare/adapters/adapter_nationallibrarynet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return NationalLibraryNetAdapter @@ -83,7 +84,7 @@ class NationalLibraryNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_ncisficcom.py b/fanficfare/adapters/adapter_ncisficcom.py index 79ceccce..bcab3f51 100644 --- a/fanficfare/adapters/adapter_ncisficcom.py +++ b/fanficfare/adapters/adapter_ncisficcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return NCISFicComAdapter @@ -82,7 +83,7 @@ class NCISFicComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_ncisfictioncom.py b/fanficfare/adapters/adapter_ncisfictioncom.py index 974adbb7..aa90d1c3 100644 --- a/fanficfare/adapters/adapter_ncisfictioncom.py +++ b/fanficfare/adapters/adapter_ncisfictioncom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,8 @@ # # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class NCISFictionComAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_nfacommunitycom.py b/fanficfare/adapters/adapter_nfacommunitycom.py index 754948a6..e7da293b 100644 --- a/fanficfare/adapters/adapter_nfacommunitycom.py +++ b/fanficfare/adapters/adapter_nfacommunitycom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate # Search for XXX comments--that's where things are most likely to need changing. @@ -101,7 +102,7 @@ class NfaCommunityComAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -134,7 +135,7 @@ class NfaCommunityComAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_nhamagicalworldsus.py b/fanficfare/adapters/adapter_nhamagicalworldsus.py index bb78b5ac..6328e04c 100644 --- a/fanficfare/adapters/adapter_nhamagicalworldsus.py +++ b/fanficfare/adapters/adapter_nhamagicalworldsus.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,8 @@ # # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter # Class name has to be unique. Our convention is camel case the # sitename with Adapter at the end. www is skipped. 
diff --git a/fanficfare/adapters/adapter_ninelivesarchivecom.py b/fanficfare/adapters/adapter_ninelivesarchivecom.py index 08c5c08d..72907fe8 100644 --- a/fanficfare/adapters/adapter_ninelivesarchivecom.py +++ b/fanficfare/adapters/adapter_ninelivesarchivecom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2015 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,8 +16,9 @@ # # Software: eFiction +from __future__ import absolute_import import re -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class NineLivesAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_nocturnallightnet.py b/fanficfare/adapters/adapter_nocturnallightnet.py index db86d4cd..a0a63161 100644 --- a/fanficfare/adapters/adapter_nocturnallightnet.py +++ b/fanficfare/adapters/adapter_nocturnallightnet.py @@ -1,12 +1,15 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import re -import urllib2 -import urlparse - from bs4.element import Tag -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate from .. import exceptions @@ -47,7 +50,7 @@ class NocturnalLightNetAdapter(BaseSiteAdapter): if exception: try: data = self._fetchUrl(url, parameters) - except urllib2.HTTPError: + except HTTPError: raise exception(self.url) # Just let self._fetchUrl throw the exception, don't catch and # customize it. 
diff --git a/fanficfare/adapters/adapter_noveltrovecom.py b/fanficfare/adapters/adapter_noveltrovecom.py index 24ddf5c8..ff9e70d6 100644 --- a/fanficfare/adapters/adapter_noveltrovecom.py +++ b/fanficfare/adapters/adapter_noveltrovecom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2014 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,15 +18,18 @@ ## Adapted by GComyn on April 22, 2017 #################################################################################################### +from __future__ import absolute_import import logging import json import re import sys # ## used for debug purposes -import time -import urllib2 import datetime -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate from .. 
import exceptions as exceptions from ..htmlcleanup import stripHTML @@ -88,7 +91,7 @@ class NovelTroveComSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('Error 404: {0}'.format(self.url)) else: @@ -98,7 +101,7 @@ class NovelTroveComSiteAdapter(BaseSiteAdapter): soup = self.make_soup(data) # Now go hunting for all the meta data we can get - metablock = soup.find('div', {'class', 'title-infos'}) + metablock = soup.find('div', {'class': 'title-infos'}) ## Getting Title title = stripHTML(metablock.find('h1')) diff --git a/fanficfare/adapters/adapter_occlumencysycophanthexcom.py b/fanficfare/adapters/adapter_occlumencysycophanthexcom.py index ce49c8d5..14ff543e 100644 --- a/fanficfare/adapters/adapter_occlumencysycophanthexcom.py +++ b/fanficfare/adapters/adapter_occlumencysycophanthexcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return OcclumencySycophantHexComAdapter @@ -114,7 +115,7 @@ class OcclumencySycophantHexComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_phoenixsongnet.py b/fanficfare/adapters/adapter_phoenixsongnet.py index 6994cc38..22c37153 100644 --- a/fanficfare/adapters/adapter_phoenixsongnet.py +++ b/fanficfare/adapters/adapter_phoenixsongnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,19 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2, urllib, cookielib - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return PhoenixSongNetAdapter @@ -111,7 +113,7 @@ class PhoenixSongNetAdapter(BaseSiteAdapter): if self.getConfig('force_login'): self.performLogin(url) data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_ponyfictionarchivenet.py b/fanficfare/adapters/adapter_ponyfictionarchivenet.py index 54775eff..f3be33f6 100644 --- a/fanficfare/adapters/adapter_ponyfictionarchivenet.py +++ b/fanficfare/adapters/adapter_ponyfictionarchivenet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return PonyFictionArchiveNetAdapter @@ -92,7 +93,7 @@ class PonyFictionArchiveNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -113,7 +114,7 @@ class PonyFictionArchiveNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_potionsandsnitches.py b/fanficfare/adapters/adapter_potionsandsnitches.py index d25212a6..49fa91f3 100644 --- a/fanficfare/adapters/adapter_potionsandsnitches.py +++ b/fanficfare/adapters/adapter_potionsandsnitches.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,18 +16,19 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re import urllib -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class PotionsAndSnitchesOrgSiteAdapter(BaseSiteAdapter): @@ -65,7 +66,7 @@ class PotionsAndSnitchesOrgSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_potterficscom.py b/fanficfare/adapters/adapter_potterficscom.py index e3f789e4..582e3c83 100644 --- a/fanficfare/adapters/adapter_potterficscom.py +++ b/fanficfare/adapters/adapter_potterficscom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,19 @@ # limitations under the License. # +from __future__ import absolute_import import datetime import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter # This function is called by the downloader in all adapter_*.py files # in this dir to register the adapter class. 
So it needs to be @@ -144,7 +146,7 @@ class PotterFicsComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_potterheadsanonymouscom.py b/fanficfare/adapters/adapter_potterheadsanonymouscom.py index 4184e19d..97f53150 100644 --- a/fanficfare/adapters/adapter_potterheadsanonymouscom.py +++ b/fanficfare/adapters/adapter_potterheadsanonymouscom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return PotterHeadsAnonymousComAdapter @@ -122,7 +123,7 @@ class PotterHeadsAnonymousComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -156,7 +157,7 @@ class PotterHeadsAnonymousComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_pretendercentrecom.py b/fanficfare/adapters/adapter_pretendercentrecom.py index 0d644c34..789a0184 100644 --- a/fanficfare/adapters/adapter_pretendercentrecom.py +++ b/fanficfare/adapters/adapter_pretendercentrecom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return PretenderCenterComAdapter @@ -91,7 +92,7 @@ class PretenderCenterComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -111,7 +112,7 @@ class PretenderCenterComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_qafficcom.py b/fanficfare/adapters/adapter_qafficcom.py index 9d5d252f..7fa6df5f 100644 --- a/fanficfare/adapters/adapter_qafficcom.py +++ b/fanficfare/adapters/adapter_qafficcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return QafFicComAdapter @@ -88,7 +89,7 @@ class QafFicComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -108,7 +109,7 @@ class QafFicComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_quotevcom.py b/fanficfare/adapters/adapter_quotevcom.py index 282ba283..533dd08f 100644 --- a/fanficfare/adapters/adapter_quotevcom.py +++ b/fanficfare/adapters/adapter_quotevcom.py @@ -1,12 +1,16 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import re -import urlparse -import urllib2 import datetime from .. 
import exceptions -from base_adapter import BaseSiteAdapter +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter from ..htmlcleanup import stripHTML SITE_DOMAIN = 'quotev.com' @@ -51,7 +55,7 @@ class QuotevComAdapter(BaseSiteAdapter): def extractChapterUrlsAndMetadata(self): try: data = self._fetchUrl(self.url) - except urllib2.HTTPError as e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist("Code: %s: %s"%(e.code,self.url)) else: @@ -68,7 +72,6 @@ class QuotevComAdapter(BaseSiteAdapter): authdiv = soup.find('div', {'class':"quizAuthorList"}) if authdiv: - print("div:%s"%authdiv) for a in authdiv.find_all('a'): self.story.addToList('author', a.get_text()) self.story.addToList('authorId', a['href'].split('/')[-1]) diff --git a/fanficfare/adapters/adapter_royalroadl.py b/fanficfare/adapters/adapter_royalroadl.py index 0b5defd2..e5a031d2 100644 --- a/fanficfare/adapters/adapter_royalroadl.py +++ b/fanficfare/adapters/adapter_royalroadl.py @@ -15,17 +15,21 @@ # limitations under the License. # +from __future__ import absolute_import import contextlib from datetime import datetime -import httplib import logging import re -import urllib2 - from .. import exceptions as exceptions from ..dateutils import parse_relative_date_string from ..htmlcleanup import stripHTML -from base_adapter import BaseSiteAdapter + +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves import http_client as httplib +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter logger = logging.getLogger(__name__) @@ -89,18 +93,24 @@ class RoyalRoadAdapter(BaseSiteAdapter): @staticmethod # must be @staticmethod, don't remove it. def getSiteDomain(): # The site domain. Does have www here, if it uses it. 
- return 'royalroadl.com' + # changed from royalroadl.com + return 'www.royalroad.com' @classmethod def getAcceptDomains(cls): - return ['royalroadl.com','www.royalroadl.com'] + return ['royalroad.com','royalroadl.com','www.royalroadl.com'] + + @classmethod + def getConfigSections(cls): + "Only needs to be overriden if has additional ini sections." + return ['royalroadl.com',cls.getSiteDomain()] @classmethod def getSiteExampleURLs(cls): - return "https://royalroadl.com/fiction/3056" + return "https://www.royalroad.com/fiction/3056" def getSiteURLPattern(self): - return "https?"+re.escape("://")+r"(www\.|)royalroadl\.com/fiction/\d+(/.*)?$" + return "https?"+re.escape("://")+r"(www\.|)royalroadl?\.com/fiction/\d+(/.*)?$" def use_pagecache(self): ''' @@ -142,7 +152,7 @@ class RoyalRoadAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_samandjacknet.py b/fanficfare/adapters/adapter_samandjacknet.py index 8304bafb..7a6395f6 100644 --- a/fanficfare/adapters/adapter_samandjacknet.py +++ b/fanficfare/adapters/adapter_samandjacknet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,18 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate # By virtue of being recent and requiring both is_adult and user/pass, # adapter_fanficcastletvnet.py is the best choice for learning to @@ -151,7 +152,7 @@ class SamAndJackNetAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -184,7 +185,7 @@ class SamAndJackNetAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_scarvesandcoffeenet.py b/fanficfare/adapters/adapter_scarvesandcoffeenet.py index b2b1d9c1..79677349 100644 --- a/fanficfare/adapters/adapter_scarvesandcoffeenet.py +++ b/fanficfare/adapters/adapter_scarvesandcoffeenet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return ScarvesAndCoffeeNetAdapter @@ -96,7 +97,7 @@ class ScarvesAndCoffeeNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -116,7 +117,7 @@ class ScarvesAndCoffeeNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_sebklainenet.py b/fanficfare/adapters/adapter_sebklainenet.py index 5862e231..116d5f70 100644 --- a/fanficfare/adapters/adapter_sebklainenet.py +++ b/fanficfare/adapters/adapter_sebklainenet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -19,7 +19,8 @@ ################################################################################################### # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class SebklaineNeteOrgSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_sheppardweircom.py b/fanficfare/adapters/adapter_sheppardweircom.py index 6b1fa76a..1a959410 100644 --- a/fanficfare/adapters/adapter_sheppardweircom.py +++ b/fanficfare/adapters/adapter_sheppardweircom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,18 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate # By virtue of being recent and requiring both is_adult and user/pass, # adapter_fanficcastletvnet.py is the best choice for learning to @@ -146,7 +147,7 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_shriftweborgbfa.py b/fanficfare/adapters/adapter_shriftweborgbfa.py index b1fbaa4f..f662c4d7 100644 --- a/fanficfare/adapters/adapter_shriftweborgbfa.py +++ b/fanficfare/adapters/adapter_shriftweborgbfa.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2017 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,13 +21,17 @@ ###================================================================================================= ### I have started to use lines of # on the line just before a function so they are easier to find. #################################################################################################### +from __future__ import absolute_import ''' This adapter scrapes the metadata and chapter text from stories on archive.shriftweb.org ''' import logging import re -import urllib2 import sys -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate from .. 
import exceptions as exceptions from ..htmlcleanup import stripHTML @@ -97,7 +101,7 @@ class BFAArchiveShriftwebOrgSiteAdapter(BaseSiteAdapter): ''' try: page_data = self._fetchUrl(page) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('404 error: {}'.format(page)) else: @@ -125,7 +129,8 @@ class BFAArchiveShriftwebOrgSiteAdapter(BaseSiteAdapter): if not title: raise exceptions.StoryDoesNotExist('Cannot find title on the page {}'.format(url)) - self.story.setMetadata('title', stripHTML(title)) + rawtitle = stripHTML(title) + self.story.setMetadata('title', rawtitle) # This site has the entire story on one page, so we will be using the normalized URL as # the chapterUrl and the Title as the chapter Title @@ -178,7 +183,7 @@ class BFAArchiveShriftwebOrgSiteAdapter(BaseSiteAdapter): story = self.make_soup(story).find('div') story_a = story.find('a') ## some stories have special characters... need to fix them. - title = repr(self.story.getMetadata('title'))[2:-1].replace('&', '&') + title = repr(rawtitle)[2:-1].replace('&', '&') if title in story_a.get_text(): story_found = True break diff --git a/fanficfare/adapters/adapter_sinfuldreamscomunicornfic.py b/fanficfare/adapters/adapter_sinfuldreamscomunicornfic.py index 3fd9d1df..e2c56509 100644 --- a/fanficfare/adapters/adapter_sinfuldreamscomunicornfic.py +++ b/fanficfare/adapters/adapter_sinfuldreamscomunicornfic.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,7 +16,8 @@ # # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class SinfulDreamsComUnicornFic(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_sinfuldreamscomwhisperedmuse.py b/fanficfare/adapters/adapter_sinfuldreamscomwhisperedmuse.py index 5082fe61..595c4c66 100644 --- a/fanficfare/adapters/adapter_sinfuldreamscomwhisperedmuse.py +++ b/fanficfare/adapters/adapter_sinfuldreamscomwhisperedmuse.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,8 @@ # # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class SinfulDreamsComWhisperedMuse(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_sinfuldreamscomwickedtemptation.py b/fanficfare/adapters/adapter_sinfuldreamscomwickedtemptation.py index cebc3020..44cc733f 100644 --- a/fanficfare/adapters/adapter_sinfuldreamscomwickedtemptation.py +++ b/fanficfare/adapters/adapter_sinfuldreamscomwickedtemptation.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,7 +16,8 @@ # # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class SinfulDreamsComWickedTemptation(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_siyecouk.py b/fanficfare/adapters/adapter_siyecouk.py index 15628ff9..e08524a6 100644 --- a/fanficfare/adapters/adapter_siyecouk.py +++ b/fanficfare/adapters/adapter_siyecouk.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate # This function is called by the downloader in all adapter_*.py files # in this dir to register the adapter class. 
So it needs to be @@ -83,7 +84,7 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_spikeluvercom.py b/fanficfare/adapters/adapter_spikeluvercom.py index adf8d2eb..3d1493fb 100644 --- a/fanficfare/adapters/adapter_spikeluvercom.py +++ b/fanficfare/adapters/adapter_spikeluvercom.py @@ -1,12 +1,16 @@ # Software: eFiction +from __future__ import absolute_import import re -import urllib2 -import urlparse from bs4.element import Tag from ..htmlcleanup import stripHTML -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate from .. import exceptions @@ -51,7 +55,7 @@ class SpikeluverComAdapter(BaseSiteAdapter): if exception: try: data = self._fetchUrl(url, parameters) - except urllib2.HTTPError: + except HTTPError: raise exception(self.url) # Just let self._fetchUrl throw the exception, don't catch and # customize it. diff --git a/fanficfare/adapters/adapter_squidgeorgpeja.py b/fanficfare/adapters/adapter_squidgeorgpeja.py index c49acaf2..105866ed 100644 --- a/fanficfare/adapters/adapter_squidgeorgpeja.py +++ b/fanficfare/adapters/adapter_squidgeorgpeja.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): @@ -93,7 +94,7 @@ class SquidgeOrgPejaAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_starskyhutcharchivenet.py b/fanficfare/adapters/adapter_starskyhutcharchivenet.py index fe816265..9c4df433 100644 --- a/fanficfare/adapters/adapter_starskyhutcharchivenet.py +++ b/fanficfare/adapters/adapter_starskyhutcharchivenet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. # +from __future__ import absolute_import from ..htmlcleanup import stripHTML # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class StarskyHutchArchiveNetSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_starslibrarynet.py b/fanficfare/adapters/adapter_starslibrarynet.py index 678a2ccd..a5557eba 100644 --- a/fanficfare/adapters/adapter_starslibrarynet.py +++ b/fanficfare/adapters/adapter_starslibrarynet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. 
# +from __future__ import absolute_import from ..htmlcleanup import stripHTML # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class StarsLibraryNetAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_storiesofardacom.py b/fanficfare/adapters/adapter_storiesofardacom.py index e31def73..0a0de89b 100644 --- a/fanficfare/adapters/adapter_storiesofardacom.py +++ b/fanficfare/adapters/adapter_storiesofardacom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,18 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return StoriesOfArdaComAdapter @@ -77,7 +78,7 @@ class StoriesOfArdaComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_storiesonlinenet.py b/fanficfare/adapters/adapter_storiesonlinenet.py index 2f224769..f7b9e744 100644 --- a/fanficfare/adapters/adapter_storiesonlinenet.py +++ b/fanficfare/adapters/adapter_storiesonlinenet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,16 +15,19 @@ # limitations under the License. # +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - # from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return StoriesOnlineNetAdapter @@ -133,7 +136,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter): self.needToLogin = False try: data = self._fetchUrl(url+":i") - except urllib2.HTTPError, e: + except HTTPError as e: if e.code in (404, 410): raise exceptions.StoryDoesNotExist("Code: %s: %s"%(e.code,self.url)) elif e.code == 401: @@ -147,7 +150,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter): self.performLogin(url) try: data = self._fetchUrl(url+":i",usecache=False) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code in (404, 410): raise exceptions.StoryDoesNotExist("Code: %s: %s"%(e.code,self.url)) elif e.code == 401: @@ -263,7 +266,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter): page = page + 1 try: data = self._fetchUrl(self.story.getList('authorUrl')[0] + "/" + unicode(page)) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.FailedToDownload("Story not found in Author's list--change Listings Theme back to "+self.getTheme()) asoup = self.make_soup(data) diff --git a/fanficfare/adapters/adapter_sugarquillnet.py b/fanficfare/adapters/adapter_sugarquillnet.py index 65c43093..e64b629e 100644 --- a/fanficfare/adapters/adapter_sugarquillnet.py +++ b/fanficfare/adapters/adapter_sugarquillnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,18 +26,21 @@ ### take a long gime to process. I've removed as much of the extra ### formatting as I thought I could. 
############################################################################# -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 import sys from bs4.element import Comment from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return SugarQuillNetAdapter @@ -89,7 +92,7 @@ class SugarQuillNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(url) else: @@ -127,7 +130,7 @@ class SugarQuillNetAdapter(BaseSiteAdapter): logger.debug('Getting the author page: {0}'.format(author_Url)) try: adata = self._fetchUrl(author_Url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist("Author Page: Code: 404. {0}".format(author_Url)) elif e.code == 410: diff --git a/fanficfare/adapters/adapter_swordborderlineangelcom.py b/fanficfare/adapters/adapter_swordborderlineangelcom.py index 5f94da25..3b2cfd09 100644 --- a/fanficfare/adapters/adapter_swordborderlineangelcom.py +++ b/fanficfare/adapters/adapter_swordborderlineangelcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
@@ -16,9 +16,10 @@ # # Software: eFiction +from __future__ import absolute_import from ..htmlcleanup import stripHTML -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class SwordBorderlineAngelComSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_tasteofpoisoninkubationnet.py b/fanficfare/adapters/adapter_tasteofpoisoninkubationnet.py index 94b2f19c..3ad19ac4 100644 --- a/fanficfare/adapters/adapter_tasteofpoisoninkubationnet.py +++ b/fanficfare/adapters/adapter_tasteofpoisoninkubationnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. # +from __future__ import absolute_import from ..htmlcleanup import stripHTML # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class TasteOfPoisonInkubationNetAdapter(BaseEfictionAdapter): ''' This adapter will download stories from the diff --git a/fanficfare/adapters/adapter_tenhawkpresentscom.py b/fanficfare/adapters/adapter_tenhawkpresentscom.py index 89bc3d81..c65f23a1 100644 --- a/fanficfare/adapters/adapter_tenhawkpresentscom.py +++ b/fanficfare/adapters/adapter_tenhawkpresentscom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,18 +16,19 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re import urllib -import urllib2 - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class TenhawkPresentsComSiteAdapter(BaseSiteAdapter): @@ -111,7 +112,7 @@ class TenhawkPresentsComSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_test1.py b/fanficfare/adapters/adapter_test1.py index 745f2204..2a253ae0 100644 --- a/fanficfare/adapters/adapter_test1.py +++ b/fanficfare/adapters/adapter_test1.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ # limitations under the License. # +from __future__ import absolute_import import datetime import time import logging @@ -22,7 +23,10 @@ logger = logging.getLogger(__name__) from .. 
import exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import ensure_text + +from .base_adapter import BaseSiteAdapter, makeDate class TestSiteAdapter(BaseSiteAdapter): @@ -67,13 +71,13 @@ class TestSiteAdapter(BaseSiteAdapter): #print("addList:%s"%(nkey)) for val in self.get_config_list(sections,key): #print("addList:%s->%s"%(nkey,val)) - self.story.addToList(nkey,val.decode('utf-8').replace('{{storyId}}',idstr)) + self.story.addToList(nkey,ensure_text(val).replace('{{storyId}}',idstr)) else: # Special cases: if key in ['datePublished','dateUpdated']: self.story.setMetadata(key,makeDate(self.get_config(sections,key),"%Y-%m-%d")) else: - self.story.setMetadata(key,self.get_config(sections,key).decode('utf-8').replace('{{storyId}}',idstr)) + self.story.setMetadata(key,ensure_text(self.get_config(sections,key)).replace('{{storyId}}',idstr)) #print("set:%s->%s"%(key,self.story.getMetadata(key))) if self.has_config(sections,'chapter_urls'): @@ -112,7 +116,7 @@ class TestSiteAdapter(BaseSiteAdapter): raise exceptions.FailedToLogin(self.url,self.username) if idstr == '664': - self.story.setMetadata(u'title',"Test Story Title "+idstr+self.crazystring) + self.story.setMetadata(u'title',"Test Story Title "+idstr+self.crazystring+"  ") self.story.setMetadata('author','Test Author aa bare amp(&) quote(') amp(&)') else: self.story.setMetadata(u'title',"Test Story Title "+idstr) diff --git a/fanficfare/adapters/adapter_tgstorytimecom.py b/fanficfare/adapters/adapter_tgstorytimecom.py index ca3002e1..69bcd962 100644 --- a/fanficfare/adapters/adapter_tgstorytimecom.py +++ b/fanficfare/adapters/adapter_tgstorytimecom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,7 +16,8 @@ # # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class TGStorytimeComAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_thebrokenworldorg.py b/fanficfare/adapters/adapter_thebrokenworldorg.py index ea18fc08..d4133b51 100644 --- a/fanficfare/adapters/adapter_thebrokenworldorg.py +++ b/fanficfare/adapters/adapter_thebrokenworldorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. # +from __future__ import absolute_import from ..htmlcleanup import stripHTML # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class TheBrokenWorldOrgSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_thedelphicexpansecom.py b/fanficfare/adapters/adapter_thedelphicexpansecom.py index 4a063019..c634cbb5 100644 --- a/fanficfare/adapters/adapter_thedelphicexpansecom.py +++ b/fanficfare/adapters/adapter_thedelphicexpansecom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. 
# +from __future__ import absolute_import from ..htmlcleanup import stripHTML # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class TheDelphicExpanseComAdapter(BaseEfictionAdapter): ''' This adapter will download stories from the diff --git a/fanficfare/adapters/adapter_thehookupzonenet.py b/fanficfare/adapters/adapter_thehookupzonenet.py index 78563fc3..285ff594 100644 --- a/fanficfare/adapters/adapter_thehookupzonenet.py +++ b/fanficfare/adapters/adapter_thehookupzonenet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,8 @@ # # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class TheHookupZoneNetAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_themaplebookshelf.py b/fanficfare/adapters/adapter_themaplebookshelf.py index 186708e7..e9affbf9 100644 --- a/fanficfare/adapters/adapter_themaplebookshelf.py +++ b/fanficfare/adapters/adapter_themaplebookshelf.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2014 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,7 +16,8 @@ # # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class TheMapleBookshelfComSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_themasquenet.py b/fanficfare/adapters/adapter_themasquenet.py index 60f7482b..ee69babf 100644 --- a/fanficfare/adapters/adapter_themasquenet.py +++ b/fanficfare/adapters/adapter_themasquenet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return TheMasqueNetAdapter @@ -129,7 +130,7 @@ class TheMasqueNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -154,7 +155,7 @@ class TheMasqueNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_thepetulantpoetesscom.py b/fanficfare/adapters/adapter_thepetulantpoetesscom.py index a1a70f5e..e22b3ecb 100644 --- a/fanficfare/adapters/adapter_thepetulantpoetesscom.py +++ b/fanficfare/adapters/adapter_thepetulantpoetesscom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return ThePetulantPoetessComAdapter @@ -112,7 +113,7 @@ class ThePetulantPoetessComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_thundercatsfansorg.py b/fanficfare/adapters/adapter_thundercatsfansorg.py index 3b31a201..534fdddd 100644 --- a/fanficfare/adapters/adapter_thundercatsfansorg.py +++ b/fanficfare/adapters/adapter_thundercatsfansorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. # +from __future__ import absolute_import from ..htmlcleanup import stripHTML # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class ThundercatsFansOrgSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_tolkienfanfiction.py b/fanficfare/adapters/adapter_tolkienfanfiction.py index c53d65fd..f2230189 100644 --- a/fanficfare/adapters/adapter_tolkienfanfiction.py +++ b/fanficfare/adapters/adapter_tolkienfanfiction.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2014 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2014 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -14,6 +14,7 @@ # limitations under the License. # +from __future__ import absolute_import """ FFDL Adapter for TolkienFanFiction.com. @@ -58,16 +59,19 @@ import time import logging logger = logging.getLogger(__name__) import re -import urllib -import urllib2 -import urlparse import string from bs4.element import Comment from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.parse import urlencode +from ..six.moves.urllib.error import HTTPError +from ..six.moves import urllib + +from .base_adapter import BaseSiteAdapter, makeDate def _is_story_url(url): return "Story_Read_Head.php" in url @@ -130,7 +134,7 @@ class TolkienFanfictionAdapter(BaseSiteAdapter): chapterSoup = self.make_soup(chapterHtml) indexLink = chapterSoup.find("a", text="[Index]") self._normalizeURL('http://' + self.getSiteDomain() + '/' + indexLink.get('href')) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -140,7 +144,7 @@ class TolkienFanfictionAdapter(BaseSiteAdapter): try: indexHtml = _fix_broken_markup(self._fetchUrl(self.url)) soup = self.make_soup(indexHtml) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -192,7 +196,7 @@ class TolkienFanfictionAdapter(BaseSiteAdapter): logger.debug('Title as `str`: ' + unicode(title)) # For publication date we need to search try: - queryString = urllib.urlencode(( + queryString = urlencode(( ('type', 3), ('field', 1), # need translate here for the weird accented letters @@ -206,7 +210,7 @@ class TolkienFanfictionAdapter(BaseSiteAdapter): date = searchSoup.find(text="Updated:").nextSibling.string logger.debug("Last Updated: '%s'" % date) self.story.setMetadata('dateUpdated', makeDate(date, self.dateformat)) - except urllib2.HTTPError, e: 
+ except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_tomparisdormcom.py b/fanficfare/adapters/adapter_tomparisdormcom.py index 76c67881..8da180f6 100644 --- a/fanficfare/adapters/adapter_tomparisdormcom.py +++ b/fanficfare/adapters/adapter_tomparisdormcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,18 +16,21 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 import sys from bs4.element import Comment from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return TomParisDormComAdapter @@ -79,7 +82,7 @@ class TomParisDormComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(url) else: @@ -120,7 +123,7 @@ class TomParisDormComAdapter(BaseSiteAdapter): # Get the rest of the Metadata mdsoup = soup.find('div',{'id' : 'output'}) - mdstr = str(mdsoup).replace('\n','').replace('\r','').replace('\t',' ').replace(' ',' ').replace(' ',' ').replace(' ',' ') + mdstr = unicode(mdsoup).replace('\n','').replace('\r','').replace('\t',' ').replace(' ',' ').replace(' ',' ').replace(' ',' ') mdstr = stripHTML(mdstr.replace(r'
',r'-:-').replace('|','-:-')) mdstr = mdstr.replace(r'[Rev',r'-:-[Rev').replace(' -:- ','-:-').strip('-:-').strip('-:-') diff --git a/fanficfare/adapters/adapter_trekfanfictionnet.py b/fanficfare/adapters/adapter_trekfanfictionnet.py index 7f288d37..1304e4f2 100644 --- a/fanficfare/adapters/adapter_trekfanfictionnet.py +++ b/fanficfare/adapters/adapter_trekfanfictionnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,14 +22,17 @@ ###================================================================================================= ### I have started to use lines of # on the line just before a function so they are easier to find. #################################################################################################### +from __future__ import absolute_import ''' This will scrape the chapter text and metadata from stories on the site trekfanfiction.net ''' import logging import re -import urllib2 +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError -from base_adapter import BaseSiteAdapter, makeDate +from .base_adapter import BaseSiteAdapter, makeDate from .. 
import exceptions as exceptions from ..htmlcleanup import stripHTML @@ -91,7 +94,7 @@ class TrekFanFictionNetSiteAdapter(BaseSiteAdapter): ''' try: page_data = self._fetchUrl(page) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('404 error: {}'.format(page)) else: diff --git a/fanficfare/adapters/adapter_trekiverseorg.py b/fanficfare/adapters/adapter_trekiverseorg.py index e32a8499..7dd98a60 100644 --- a/fanficfare/adapters/adapter_trekiverseorg.py +++ b/fanficfare/adapters/adapter_trekiverseorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,18 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return TrekiverseOrgAdapter @@ -126,7 +127,7 @@ class TrekiverseOrgAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -151,7 +152,7 @@ class TrekiverseOrgAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_tthfanficorg.py b/fanficfare/adapters/adapter_tthfanficorg.py index 206a349e..bbed206f 100644 --- a/fanficfare/adapters/adapter_tthfanficorg.py +++ b/fanficfare/adapters/adapter_tthfanficorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,18 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 -import time - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter): @@ -151,7 +152,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter): data = self._fetchUrl(url) #print("data:%s"%data) soup = self.make_soup(data) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code in (404,410): raise exceptions.StoryDoesNotExist(url) else: @@ -201,7 +202,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter): #logger.info("authsoup:%s"%authorsoup) descurl=nextpage authorsoup = self.make_soup(authordata) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(url) else: @@ -238,7 +239,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter): stripHTML(a), stripHTML(autha)),'https://'+self.host+a['href']) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(url) else: diff --git a/fanficfare/adapters/adapter_twilightarchivescom.py b/fanficfare/adapters/adapter_twilightarchivescom.py index 7cabe5c7..fde1cb8c 100644 --- a/fanficfare/adapters/adapter_twilightarchivescom.py +++ b/fanficfare/adapters/adapter_twilightarchivescom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,18 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return TwilightArchivesComAdapter @@ -78,7 +79,7 @@ class TwilightArchivesComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_twilightednet.py b/fanficfare/adapters/adapter_twilightednet.py index 44f689a9..959a239e 100644 --- a/fanficfare/adapters/adapter_twilightednet.py +++ b/fanficfare/adapters/adapter_twilightednet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,18 +16,19 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re import urllib -import urllib2 - - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class TwilightedNetSiteAdapter(BaseSiteAdapter): @@ -101,7 +102,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_unknowableroomorg.py b/fanficfare/adapters/adapter_unknowableroomorg.py index eb033a46..494fc908 100644 --- a/fanficfare/adapters/adapter_unknowableroomorg.py +++ b/fanficfare/adapters/adapter_unknowableroomorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,14 +17,17 @@ #################################################################################################### ### Adapted by GComyn on December 19, 2016 #################################################################################################### +from __future__ import absolute_import ''' This adapter will download stories from the site unknowableroom.org ''' import logging import re -import time -import urllib2 import sys -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate from .. import exceptions as exceptions from ..htmlcleanup import stripHTML @@ -45,7 +48,7 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter): # 1252 is a superset of iso-8859-1. Most sites that claim to be iso-8859-1 (and some that # claim to be utf8) are really windows-1252. 
- self.decode = ["Windows-1252", "utf8", "iso-8859-1"] + self.decode = ["Windows-1252", "utf8", "iso-8859-1"] # Setting the adult status to false initially self.is_adult=False @@ -83,7 +86,7 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter): ''' try: page_data = self._fetchUrl(page) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('404 error: {}'.format(page)) else: @@ -119,10 +122,11 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter): self.story.setMetadata('authorId', author) self.story.setMetadata('authorUrl', 'http://'+self.getSiteDomain()) self.story.setMetadata('author', author) - + ## Title - self.story.setMetadata('title',stripHTML(soup.find('h1')).replace( - 'by '+self.story.getMetadata('author'), '').strip()) + rawtitle = stripHTML(soup.find('h1')).replace( + 'by '+self.story.getMetadata('author'), '').strip() + self.story.setMetadata('title',rawtitle) # Find the chapters: for chapter in soup.find('select').find_all('option', value=re.compile( @@ -145,12 +149,12 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter): story_found = False for story in asoup.find('ul', {'id':'fic_list'}).find_all('li'): - if self.story.getMetadata('title') == stripHTML(story.a): + if rawtitle == stripHTML(story.a): story_found = True break else: story_found = False - + if not story_found: raise exceptions.StoryDoesNotExist("Cannot find story '{}' on author's page '{}'".format( url, self.story.getMetadata('authorUrl'))) @@ -197,7 +201,7 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter): 'rd,', ',').replace('th,', ',').replace('.', '').strip() self.story.setMetadata('dateUpdated', makeDate(value, self.dateformat)) - # I'm going to add the disclaimer + # I'm going to add the disclaimer disclaimer = soup.find('strong', {'id':'disclaimer'}) if disclaimer: self.story.setMetadata('disclaimer', stripHTML(disclaimer).replace( diff --git a/fanficfare/adapters/adapter_valentchambercom.py 
b/fanficfare/adapters/adapter_valentchambercom.py index 4b3b6d2a..ba35317d 100644 --- a/fanficfare/adapters/adapter_valentchambercom.py +++ b/fanficfare/adapters/adapter_valentchambercom.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- -# Copyright 2015 FanFicFare team -# Copyright 2016 FanFicFare team +# Copyright 2018 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,7 +21,8 @@ ### Rewritten by: GComyn on November, 06, 2016 ### Original was adapter_fannation.py ################################################################################## -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class ValentChamberComAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_voracity2eficcom.py b/fanficfare/adapters/adapter_voracity2eficcom.py index 3ee014ed..8a47b2d5 100644 --- a/fanficfare/adapters/adapter_voracity2eficcom.py +++ b/fanficfare/adapters/adapter_voracity2eficcom.py @@ -1,11 +1,15 @@ # Software: eFiction +from __future__ import absolute_import import re -import urllib2 -import urlparse from bs4.element import Tag -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate from .. import exceptions @@ -80,7 +84,7 @@ class Voracity2EficComAdapter(BaseSiteAdapter): if exception: try: data = self._fetchUrl(url, parameters) - except urllib2.HTTPError: + except HTTPError: raise exception(self.url) # Just let self._fetchUrl throw the exception, don't catch and # customize it. 
diff --git a/fanficfare/adapters/adapter_walkingtheplankorg.py b/fanficfare/adapters/adapter_walkingtheplankorg.py index 037c87ac..73e36f82 100644 --- a/fanficfare/adapters/adapter_walkingtheplankorg.py +++ b/fanficfare/adapters/adapter_walkingtheplankorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return WalkingThePlankOrgAdapter @@ -86,7 +87,7 @@ class WalkingThePlankOrgAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_wattpadcom.py b/fanficfare/adapters/adapter_wattpadcom.py index 5e23a9ac..dfe16a24 100644 --- a/fanficfare/adapters/adapter_wattpadcom.py +++ b/fanficfare/adapters/adapter_wattpadcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. 
@@ -20,6 +20,9 @@ import json import logging import re +# py2 vs py3 transition +from ..six import text_type as unicode + from .base_adapter import BaseSiteAdapter, makeDate from .. import exceptions as exceptions logger = logging.getLogger(__name__) @@ -46,7 +49,7 @@ class WattpadComAdapter(BaseSiteAdapter): try: WattpadComAdapter.CATEGORY_DEFs = json.loads(self._fetchUrl(WattpadComAdapter.API_GETCATEGORIES)) except: - logger.debug('API_GETCATEGORIES failed.') + logger.warn('API_GETCATEGORIES failed.') WattpadComAdapter.CATEGORY_DEFs = [] @staticmethod @@ -88,7 +91,7 @@ class WattpadComAdapter(BaseSiteAdapter): def doExtractChapterUrlsAndMetadata(self, get_cover=True): try: storyInfo = json.loads(self._fetchUrl(WattpadComAdapter.API_STORYINFO % self.storyId)) - logger.debug('storyInfo: %s' % json.dumps(storyInfo)) + # logger.debug('storyInfo: %s' % json.dumps(storyInfo)) except Exception: raise exceptions.InvalidStoryURL(self.url, self.getSiteDomain(), self.getSiteExampleURLs()) @@ -122,8 +125,8 @@ class WattpadComAdapter(BaseSiteAdapter): # CATEGORIES try: - storyCategories = [WattpadComAdapter.CATEGORY_DEFs.get(str(c)) for c in storyInfo['categories'] if - WattpadComAdapter.CATEGORY_DEFs.has_key(str(c))] + storyCategories = [WattpadComAdapter.CATEGORY_DEFs.get(unicode(c)) for c in storyInfo['categories'] if + unicode(c) in WattpadComAdapter.CATEGORY_DEFs] self.story.setMetadata('category', storyCategories[0]) self.story.setMetadata('tags', storyInfo['tags']) diff --git a/fanficfare/adapters/adapter_webnovelcom.py b/fanficfare/adapters/adapter_webnovelcom.py index 89e10fa3..39b9744d 100644 --- a/fanficfare/adapters/adapter_webnovelcom.py +++ b/fanficfare/adapters/adapter_webnovelcom.py @@ -16,15 +16,18 @@ # # Adapted by GComyn on April 16, 2017 +from __future__ import absolute_import import cgi import difflib import json import logging import re import time -import urllib2 +# py2 vs py3 transition +from ..six import text_type as unicode +from 
..six.moves.urllib.error import HTTPError -from base_adapter import BaseSiteAdapter +from .base_adapter import BaseSiteAdapter from .. import exceptions as exceptions from ..htmlcleanup import stripHTML from ..dateutils import parse_relative_date_string @@ -100,7 +103,7 @@ class WWWWebNovelComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('Error 404: {0}'.format(self.url)) else: diff --git a/fanficfare/adapters/adapter_whoficcom.py b/fanficfare/adapters/adapter_whoficcom.py index af8a608e..983151e5 100644 --- a/fanficfare/adapters/adapter_whoficcom.py +++ b/fanficfare/adapters/adapter_whoficcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate class WhoficComSiteAdapter(BaseSiteAdapter): @@ -60,7 +61,7 @@ class WhoficComSiteAdapter(BaseSiteAdapter): # use BeautifulSoup HTML parser to make everything easier to find. 
try: soup = self.make_soup(self._fetchUrl(url)) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_wolverineandroguecom.py b/fanficfare/adapters/adapter_wolverineandroguecom.py index 4db5801c..ca10f248 100644 --- a/fanficfare/adapters/adapter_wolverineandroguecom.py +++ b/fanficfare/adapters/adapter_wolverineandroguecom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,18 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return WolverineAndRogueComAdapter @@ -78,7 +79,7 @@ class WolverineAndRogueComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_wraithbaitcom.py b/fanficfare/adapters/adapter_wraithbaitcom.py index 47e659a5..095602b8 100644 --- a/fanficfare/adapters/adapter_wraithbaitcom.py +++ b/fanficfare/adapters/adapter_wraithbaitcom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this 
file except in compliance with the License. @@ -15,17 +15,18 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): @@ -87,7 +88,7 @@ class WraithBaitComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_writingwhimsicalwanderingsnet.py b/fanficfare/adapters/adapter_writingwhimsicalwanderingsnet.py index 203bb0ad..5aa5205b 100644 --- a/fanficfare/adapters/adapter_writingwhimsicalwanderingsnet.py +++ b/fanficfare/adapters/adapter_writingwhimsicalwanderingsnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,18 +16,21 @@ # # Software: eFiction -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 import sys from bs4.element import Comment from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return WritingWhimsicalwanderingsNetAdapter @@ -84,7 +87,7 @@ class WritingWhimsicalwanderingsNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url+addurl) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(url) else: @@ -149,7 +152,7 @@ class WritingWhimsicalwanderingsNetAdapter(BaseSiteAdapter): ## I know I'm replacing alot of
's here, but I want to make sure that they are all ## the same, so we can split the string correctly. metad = soup.find('div',{'class':'listbox'}) - metad = str(metad.renderContents()).replace('\n',' ').replace('
','|||||||').replace('
','|||||||').replace('
','|||||||').strip() + metad = unicode(metad.renderContents()).replace('\n',' ').replace('
','|||||||').replace('
','|||||||').replace('
','|||||||').strip() while '||||||||' in metad: metad = metad.replace('||||||||','|||||||') metad = stripHTML(metad) diff --git a/fanficfare/adapters/adapter_wuxiaworldco.py b/fanficfare/adapters/adapter_wuxiaworldco.py index fbead952..a84f4ea4 100644 --- a/fanficfare/adapters/adapter_wuxiaworldco.py +++ b/fanficfare/adapters/adapter_wuxiaworldco.py @@ -16,12 +16,15 @@ # +from __future__ import absolute_import import logging import re -import urllib2 -import urlparse +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError -from base_adapter import BaseSiteAdapter, makeDate +from .base_adapter import BaseSiteAdapter, makeDate from fanficfare.htmlcleanup import stripHTML from .. import exceptions as exceptions @@ -67,7 +70,7 @@ class WuxiaWorldCoSiteAdapter(BaseSiteAdapter): logger.debug('URL: %s', self.url) try: data = self._fetchUrl(self.url) - except urllib2.HTTPError, exception: + except HTTPError as exception: if exception.code == 404: raise exceptions.StoryDoesNotExist('404 error: {}'.format(self.url)) raise exception diff --git a/fanficfare/adapters/adapter_wuxiaworldcom.py b/fanficfare/adapters/adapter_wuxiaworldcom.py index 8829b05e..9bb3263e 100644 --- a/fanficfare/adapters/adapter_wuxiaworldcom.py +++ b/fanficfare/adapters/adapter_wuxiaworldcom.py @@ -16,13 +16,16 @@ # Adapted by GComyn on December 14. 2016 +from __future__ import absolute_import import json import logging import re -import urllib2 -import urlparse +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError -from base_adapter import BaseSiteAdapter, makeDate +from .base_adapter import BaseSiteAdapter, makeDate from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions @@ -78,7 +81,7 @@ class WuxiaWorldComSiteAdapter(BaseSiteAdapter): logger.debug('URL: %s', self.url) try: data = self._fetchUrl(self.url) - except urllib2.HTTPError, exception: + except HTTPError as exception: if exception.code == 404: raise exceptions.StoryDoesNotExist('404 error: {}'.format(self.url)) raise exception diff --git a/fanficfare/adapters/adapter_www13hoursorg.py b/fanficfare/adapters/adapter_www13hoursorg.py index 857e2911..4c92cc4f 100644 --- a/fanficfare/adapters/adapter_www13hoursorg.py +++ b/fanficfare/adapters/adapter_www13hoursorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ # limitations under the License. # +from __future__ import absolute_import from ..htmlcleanup import stripHTML # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from .base_efiction_adapter import BaseEfictionAdapter class WWW13HoursOrgSiteAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_wwwaneroticstorycom.py b/fanficfare/adapters/adapter_wwwaneroticstorycom.py index 19f5453a..4e6414e7 100644 --- a/fanficfare/adapters/adapter_wwwaneroticstorycom.py +++ b/fanficfare/adapters/adapter_wwwaneroticstorycom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,19 @@ # limitations under the License. 
# +from __future__ import absolute_import import logging import os import re import sys -import time -import urllib2 -import urlparse - from bs4.element import Comment -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate from .. import exceptions as exceptions from ..htmlcleanup import stripHTML @@ -94,7 +96,7 @@ class WWWAnEroticStoryComAdapter(BaseSiteAdapter): #strip comments and scripts from soup [comment.extract() for comment in soup1.find_all(text=lambda text:isinstance(text, Comment))] [script.extract() for script in soup1.find_all('script')] - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/adapter_wwwarea52hkhnet.py b/fanficfare/adapters/adapter_wwwarea52hkhnet.py index efbc22e9..b83d06d4 100644 --- a/fanficfare/adapters/adapter_wwwarea52hkhnet.py +++ b/fanficfare/adapters/adapter_wwwarea52hkhnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,16 +24,18 @@ ### Fixed the Metadata processing to take into account that some of the ### stories have the authorinfo div, and to make it more systematic ############################################################################# -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 - from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return WWWArea52HKHNetAdapter @@ -96,7 +98,7 @@ class WWWArea52HKHNetAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: @@ -189,7 +191,7 @@ class WWWArea52HKHNetAdapter(BaseSiteAdapter): ## I've seen a non-breaking space in some of the storyblocks ## so we are going to remove them. - series = stripHTML(str(series.renderContents()).replace(b"\xc2\xa0",'')).strip() + series = stripHTML(unicode(series.renderContents()).replace(u"\xa0",'')).strip() if len(series) > 0: self.story.setMetadata('series',series) @@ -228,7 +230,7 @@ class WWWArea52HKHNetAdapter(BaseSiteAdapter): if not self.getConfig("keep_summary_html"): value = stripHTML(value).replace('Summary:','').strip() else: - value = str(value).replace('Summary:','').strip() + value = unicode(value).replace('Summary:','').strip() self.setDescription(url, value) # grab the text for an individual chapter. diff --git a/fanficfare/adapters/adapter_wwwgiantessworldnet.py b/fanficfare/adapters/adapter_wwwgiantessworldnet.py index a08ecb08..b44938c3 100644 --- a/fanficfare/adapters/adapter_wwwgiantessworldnet.py +++ b/fanficfare/adapters/adapter_wwwgiantessworldnet.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2016 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -18,7 +18,8 @@ ### Adapted by GComyn - November 18, 2016 ########################################################################### # Software: eFiction -from base_efiction_adapter import BaseEfictionAdapter +from __future__ import absolute_import +from .base_efiction_adapter import BaseEfictionAdapter class WWWGiantessworldNetAdapter(BaseEfictionAdapter): diff --git a/fanficfare/adapters/adapter_wwwlushstoriescom.py b/fanficfare/adapters/adapter_wwwlushstoriescom.py index e14da56f..36ac32fc 100644 --- a/fanficfare/adapters/adapter_wwwlushstoriescom.py +++ b/fanficfare/adapters/adapter_wwwlushstoriescom.py @@ -20,17 +20,21 @@ # Updated on January 07, 2017 - fixed metadata capturing after Jimm fixed the UnidecodeError problem #################################################################################################### -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 from ..htmlcleanup import stripHTML from .. import exceptions as exceptions from bs4 import Comment, BeautifulSoup -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError +from ..six.moves.urllib.parse import quote + +from .base_adapter import BaseSiteAdapter, makeDate #################################################################################################### def getClass(): @@ -57,12 +61,12 @@ class WWWLushStoriesComAdapter(BaseSiteAdapter): # XXX if '%' not in storyId: ## assume already escaped if contains %. Assume needs escaping if it doesn't. try: - storyId = urllib2.quote(storyId) + storyId = quote(storyId) except KeyError: ## string from calibre is utf8, but lushstories.com ## expects extended chars to be in latin1 / iso-8859-1 ## rather than utf8. 
- storyId = urllib2.quote(storyId.encode("iso-8859-1")) + storyId = quote(storyId.encode("iso-8859-1")) self.story.setMetadata('storyId',storyId) @@ -119,7 +123,7 @@ class WWWLushStoriesComAdapter(BaseSiteAdapter): # XXX ''' try: page_data = self._fetchUrl(page) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('404 error: {}'.format(page)) else: @@ -173,7 +177,7 @@ class WWWLushStoriesComAdapter(BaseSiteAdapter): # XXX authorurl = self.story.getMetadata('authorUrl') try: adata = self._fetchUrl(authorurl) - except (urllib2.HTTPError) as e: + except (HTTPError) as e: ## Can't get the author's page, so we use what is on the story page tags = soup.find('div',{'id':'storytags'}).find('a') if tags: @@ -207,7 +211,7 @@ class WWWLushStoriesComAdapter(BaseSiteAdapter): # XXX for story in asoup.findAll('div',{'class':'entrycontent'}): for link in story.find_all('a'): if '/stories/' in link['href']: - linkh = urllib2.quote(link['href'].encode('utf-8', 'ignore')) + linkh = quote(link['href'].encode('utf-8', 'ignore')) linkh = linkh.replace('%3A', ':') # print self.url # print linkh diff --git a/fanficfare/adapters/adapter_wwwnovelallcom.py b/fanficfare/adapters/adapter_wwwnovelallcom.py index 542adebb..d61e3593 100644 --- a/fanficfare/adapters/adapter_wwwnovelallcom.py +++ b/fanficfare/adapters/adapter_wwwnovelallcom.py @@ -19,13 +19,16 @@ ### Tested with Calibre #################################################################################################### +from __future__ import absolute_import import logging import re import json -import urllib2 -import urlparse +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib import parse as urlparse +from ..six.moves.urllib.error import HTTPError -from base_adapter import BaseSiteAdapter, makeDate +from .base_adapter import BaseSiteAdapter, makeDate from bs4 import Comment from ..htmlcleanup import removeEntities, stripHTML, 
fix_excess_space @@ -108,7 +111,7 @@ class WWWNovelAllComAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('404 error: {}'.format(url)) else: diff --git a/fanficfare/adapters/adapter_wwwutopiastoriescom.py b/fanficfare/adapters/adapter_wwwutopiastoriescom.py index 19db47f5..cad6b565 100644 --- a/fanficfare/adapters/adapter_wwwutopiastoriescom.py +++ b/fanficfare/adapters/adapter_wwwutopiastoriescom.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,23 +23,28 @@ ### Updated on December 18, 2016 ### Updated format as per linter, and added documentation #################################################################################################### +from __future__ import absolute_import ''' This site is much link fictionmania, in that there is only one chapter per story, so we only have the one url to get information from. We get the category from the author's page ''' -import time import logging logger = logging.getLogger(__name__) import re -import urllib2 import sys from bs4.element import Comment + from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError +from ..six.moves.urllib.parse import quote + +from .base_adapter import BaseSiteAdapter, makeDate def getClass(): return WWWUtopiastoriesComAdapter @@ -95,7 +100,7 @@ class WWWUtopiastoriesComAdapter(BaseSiteAdapter): ''' try: page_data = self._fetchUrl(page) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist('404 error: {}'.format(page)) else: @@ -142,12 +147,12 @@ class WWWUtopiastoriesComAdapter(BaseSiteAdapter): for detail in soup.findAll('li'): - det = str(detail).replace(b"\xc2\xa0",'') + det = unicode(detail).replace(u"\xa0",'') heading = stripHTML(det).split(' - ')[0] text = stripHTML(det).replace(heading+' - ','') if 'Author' in heading: a = detail.find('a') - if 'mailto' in str(a): + if 'mailto' in unicode(a): self.story.setMetadata('authorId','0000000000') self.story.setMetadata('authorUrl',self.url) self.story.setMetadata('author','Unknown') @@ -155,7 +160,7 @@ class WWWUtopiastoriesComAdapter(BaseSiteAdapter): else: self.story.setMetadata('authorId',a['href'].split('/')[2]) self.story.setMetadata('author',a.string) - self.story.setMetadata('authorUrl','http://'+self.host+urllib2.quote( + self.story.setMetadata('authorUrl','http://'+self.host+quote( a['href'].encode('UTF-8'))) elif 'Story Codes' in heading: self.story.setMetadata('eroticatags',text.replace('Story Codes - ','')) diff --git a/fanficfare/adapters/base_adapter.py b/fanficfare/adapters/base_adapter.py index 329eda94..6a0a9de6 100644 --- a/fanficfare/adapters/base_adapter.py +++ b/fanficfare/adapters/base_adapter.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the 
"License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,22 @@ # limitations under the License. # +from __future__ import absolute_import import re from datetime import datetime, timedelta from collections import defaultdict +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six import string_types as basestring +from ..six.moves.urllib.parse import urlparse + import logging -import urlparse as up from functools import partial import traceback import copy -import bs4 +from bs4 import BeautifulSoup from ..htmlcleanup import stripHTML from ..htmlheuristics import replace_br_with_p @@ -34,7 +39,7 @@ logger = logging.getLogger(__name__) from ..story import Story from ..configurable import Configurable -from ..htmlcleanup import removeEntities, removeAllEntities, stripHTML +from ..htmlcleanup import stripHTML from ..exceptions import InvalidStoryURL # quick convenience class @@ -46,7 +51,7 @@ class TimeKeeper(defaultdict): self[name] = self[name] + td def __unicode__(self): - keys = self.keys() + keys = list(self.keys()) keys.sort() return u"\n".join([ u"%s: %s"%(k,self[k]) for k in keys ]) import inspect @@ -125,7 +130,7 @@ class BaseSiteAdapter(Configurable): def _setURL(self,url): self.url = url - self.parsedUrl = up.urlparse(url) + self.parsedUrl = urlparse(url) self.host = self.parsedUrl.netloc self.path = self.parsedUrl.path self.story.setMetadata('storyUrl',self.url,condremoveentities=False) @@ -151,7 +156,7 @@ class BaseSiteAdapter(Configurable): self.ignore_chapter_url_list = [ self.normalize_chapterurl(u) for u in self.getConfig('ignore_chapter_url_list').splitlines() ] if self.normalize_chapterurl(url) not in self.ignore_chapter_url_list: meta = defaultdict(unicode,othermeta) # copy othermeta - meta.update({'title':stripHTML(title),'url':url}) # after other to make sure they are set + meta.update({'title':stripHTML(title,remove_all_entities=False),'url':url}) # after other to make sure they are set 
self.chapterUrls.append(meta) self.story.setMetadata('numChapters', self.num_chapters()) return True @@ -397,7 +402,7 @@ class BaseSiteAdapter(Configurable): if isinstance(svalue,basestring): # bs4/html5lib add html, header and body tags, which # we don't want. utf8FromSoup will strip the body tags for us. - svalue = bs4.BeautifulSoup(svalue,"html5lib").body + svalue = BeautifulSoup(svalue,"html5lib").body self.story.setMetadata('description',self.utf8FromSoup(url,svalue)) else: self.story.setMetadata('description',stripHTML(svalue)) @@ -410,17 +415,12 @@ class BaseSiteAdapter(Configurable): else: return (None,None) - # bs3 & bs4 are different here. - # will move to a bs3 vs bs4 block if there's lots of changes. + # bs3 & bs4 were different here. def get_attr_keys(self,soup): - if hasattr(soup, '_getAttrMap') and getattr(soup, '_getAttrMap') is not None: - # bs3 - #print "bs3 attrs:%s"%soup._getAttrMap().keys() - return soup._getAttrMap().keys() - elif hasattr(soup, 'attrs') and isinstance(soup.attrs,dict): + if hasattr(soup, 'attrs') and isinstance(soup.attrs,dict): #print "bs4 attrs:%s"%soup.attrs.keys() # bs4 - return soup.attrs.keys() + return list(soup.attrs.keys()) return [] # This gives us a unicode object, not just a string containing bytes. @@ -453,19 +453,24 @@ class BaseSiteAdapter(Configurable): #print("include_images:"+self.getConfig('include_images')) if self.getConfig('include_images'): + ## actually effects all tags' attrs, not just , but I'm okay with that. acceptable_attributes.extend(('src','alt','longdesc')) - try: - for img in soup.find_all('img'): + for img in soup.find_all('img'): + try: # some pre-existing epubs have img tags that had src stripped off. if img.has_attr('src'): (img['src'],img['longdesc'])=self.story.addImgUrl(url,img['src'],fetch, coverexclusion=self.getConfig('cover_exclusion_regexp')) - except AttributeError as ae: - logger.info("Parsing for img tags failed--probably poor input HTML. 
Skipping images.") + except AttributeError as ae: + logger.info("Parsing for img tags failed--probably poor input HTML. Skipping img(%s)"%img) + else: + ## remove all img tags entirely + for img in soup.find_all('img'): + img.extract() for attr in self.get_attr_keys(soup): if attr not in acceptable_attributes: - del soup[attr] ## strip all tag attributes except href and name + del soup[attr] ## strip all tag attributes except configured ## apply adapter's normalize_chapterurls to all links in ## chapter texts, if they match chapter URLs. While this will @@ -503,7 +508,7 @@ class BaseSiteAdapter(Configurable): if t.name=='script': t.extract() - except AttributeError, ae: + except AttributeError as ae: if "%s"%ae != "'NoneType' object has no attribute 'next_element'": logger.error("Error parsing HTML, probably poor input HTML. %s"%ae) @@ -546,8 +551,8 @@ class BaseSiteAdapter(Configurable): ## soup and re-soup because BS4/html5lib is more forgiving of ## incorrectly nested tags that way. - soup = bs4.BeautifulSoup(data,'html5lib') - soup = bs4.BeautifulSoup(unicode(soup),'html5lib') + soup = BeautifulSoup(data,'html5lib') + soup = BeautifulSoup(unicode(soup),'html5lib') for ns in soup.find_all('fff_hide_noscript'): ns.name = 'noscript' @@ -599,7 +604,8 @@ def makeDate(string,dateform): add_hours = True string = string.replace(u"AM",u"").replace(u"PM",u"").replace(u"am",u"").replace(u"pm",u"") - date = datetime.strptime(string.encode('utf-8'),dateform.encode('utf-8')) + # date = datetime.strptime(string.encode('utf-8'),dateform.encode('utf-8')) + date = datetime.strptime(string, dateform) if add_hours: date += timedelta(hours=12) diff --git a/fanficfare/adapters/base_efiction_adapter.py b/fanficfare/adapters/base_efiction_adapter.py index 69df5ce8..fd128617 100644 --- a/fanficfare/adapters/base_efiction_adapter.py +++ b/fanficfare/adapters/base_efiction_adapter.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014 Fanficdownloader team, 2017 FanFicFare team +# 
Copyright 2014 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,18 +16,21 @@ # # Software: eFiction -# import time -# import urllib +from __future__ import absolute_import + import logging logger = logging.getLogger(__name__) import re -import urllib2 import bs4 as bs from ..htmlcleanup import stripHTML from .. import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate """ This is a generic adapter for eFiction based archives (see @@ -216,7 +219,7 @@ class BaseEfictionAdapter(BaseSiteAdapter): """ try: html = self._fetchUrl(url) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: diff --git a/fanficfare/adapters/base_xenforoforum_adapter.py b/fanficfare/adapters/base_xenforoforum_adapter.py index 7e7b05b8..0773b7c0 100644 --- a/fanficfare/adapters/base_xenforoforum_adapter.py +++ b/fanficfare/adapters/base_xenforoforum_adapter.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017 FanFicFare team +# Copyright 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,20 @@ # limitations under the License. # -import time +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re -import urllib2 from xml.dom.minidom import parseString from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions -from base_adapter import BaseSiteAdapter, makeDate +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate logger = logging.getLogger(__name__) @@ -293,7 +296,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): (data,opened) = self._fetchUrlOpened(useurl) useurl = opened.geturl() logger.info("use useurl: "+useurl) - except urllib2.HTTPError, e: + except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) elif e.code == 403: @@ -504,7 +507,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): # assumed normalized to /posts/1234/ anchorid = "post-"+url.split('/')[-2] - logger.debug("anchorid: %s"%anchorid) + # logger.debug("anchorid: %s"%anchorid) souptag = topsoup.find('li',id=anchorid) else: logger.debug("post found in cache") @@ -524,7 +527,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): topsoup = souptag = self.make_soup(data) - if '#' in url: + if '#' in unicode(url): anchorid = url.split('#')[1] souptag = topsoup.find('li',id=anchorid) diff --git a/fanficfare/cli.py b/fanficfare/cli.py index fc4b4367..ea3f7153 100644 --- a/fanficfare/cli.py +++ b/fanficfare/cli.py @@ -15,28 +15,33 @@ # limitations under the License. 
# +from __future__ import absolute_import +from __future__ import print_function from optparse import OptionParser, SUPPRESS_HELP from os.path import expanduser, join, dirname from os import access, R_OK from subprocess import call -from StringIO import StringIO -import ConfigParser import getpass import logging import pprint import string import os, sys - import pickle -import cookielib as cl -version="2.28.0" +if sys.version_info < (2, 7): + sys.exit('This program requires Python 2.7 or newer.') +elif sys.version_info < (3, 0): + reload(sys) # Reload restores 'hidden' setdefaultencoding method + sys.setdefaultencoding("utf-8") + def pickle_load(f): + return pickle.load(f) +else: # > 3.0 + def pickle_load(f): + return pickle.load(f,encoding="bytes") + +version="2.37.3" os.environ['CURRENT_VERSION_ID']=version -if sys.version_info < (2, 5) or sys.version_info > (3,0): - print('This program requires Python 2.5 or newer. Python 3 is not supported.') - sys.exit(1) - if sys.version_info >= (2, 7): # suppresses default logger. Logging is setup in fanficfare/__init__.py so it works in calibre, too. 
rootlogger = logging.getLogger() @@ -53,12 +58,18 @@ try: from calibre_plugins.fanficfare_plugin.fanficfare.epubutils import ( get_dcsource_chaptercount, get_update_data, reset_orig_chapters_epub) from calibre_plugins.fanficfare_plugin.fanficfare.geturls import get_urls_from_page, get_urls_from_imap + from calibre_plugins.fanficfare_plugin.fanficfare.six import StringIO + from calibre_plugins.fanficfare_plugin.fanficfare.six.moves import configparser + from calibre_plugins.fanficfare_plugin.fanficfare.six.moves import http_cookiejar as cl except ImportError: from fanficfare import adapters, writers, exceptions from fanficfare.configurable import Configuration from fanficfare.epubutils import ( get_dcsource_chaptercount, get_update_data, reset_orig_chapters_epub) from fanficfare.geturls import get_urls_from_page, get_urls_from_imap + from fanficfare.six import StringIO + from fanficfare.six.moves import configparser + from fanficfare.six.moves import http_cookiejar as cl def write_story(config, adapter, writeformat, metaonly=False, outstream=None): @@ -160,13 +171,18 @@ def main(argv=None, options, args = parser.parse_args(argv) + if not options.debug: + logger.setLevel(logging.WARNING) + else: + import platform + logger.debug(" OS Version:%s"%platform.platform()) + logger.debug("Python Version:%s"%sys.version) + logger.debug(" FFF Version:%s"%version) + if options.version: print("Version: %s" % version) return - if not options.debug: - logger.setLevel(logging.WARNING) - list_only = any((options.imaplist, options.siteslist, options.list, @@ -179,9 +195,9 @@ def main(argv=None, if options.siteslist: for site, examples in adapters.getSiteExamples(): - print '\n#### %s\nExample URLs:' % site + print('\n#### %s\nExample URLs:' % site) for u in examples: - print ' * %s' % u + print(' * %s' % u) return if options.update and options.format != 'epub': @@ -203,14 +219,14 @@ def main(argv=None, passed_defaultsini, passed_personalini,options) retlist = 
get_urls_from_page(options.list, configuration) - print '\n'.join(retlist) + print('\n'.join(retlist)) if options.normalize: configuration = get_configuration(options.normalize, passed_defaultsini, passed_personalini,options) retlist = get_urls_from_page(options.normalize, configuration,normalize=True) - print '\n'.join(retlist) + print('\n'.join(retlist)) if options.downloadlist: configuration = get_configuration(options.downloadlist, @@ -233,32 +249,32 @@ def main(argv=None, if options.downloadimap: urls.extend(retlist) else: - print '\n'.join(retlist) + print('\n'.join(retlist)) # for passing in a file list if options.infile: with open(options.infile,"r") as infile: - #print "File exists and is readable" + #print("file exists and is readable") for url in infile: if '#' in url: url = url[:url.find('#')].strip() url = url.strip() if len(url) > 0: - #print "URL: (%s)"%url + #print("url: (%s)"%url) urls.append(url) if options.save_cache: try: with open('global_cache','rb') as jin: - options.pagecache = pickle.load(jin) # ,encoding="utf-8" + options.pagecache = pickle_load(jin) options.cookiejar = cl.LWPCookieJar() options.cookiejar.load('global_cookies') - except: - print("Didn't load global_cache") + except Exception as e: + print("Didn't load global_cache %s"%e) if not list_only: if len(urls) < 1: - print "No valid story URLs found" + print("No valid story URLs found") else: for url in urls: try: @@ -267,14 +283,14 @@ def main(argv=None, passed_defaultsini, passed_personalini) #print("pagecache:%s"%options.pagecache.keys()) - except Exception, e: + except Exception as e: if len(urls) == 1: raise - print "URL(%s) Failed: Exception (%s). Run URL individually for more detail."%(url,e) + print("URL(%s) Failed: Exception (%s). 
Run URL individually for more detail."%(url,e)) if options.save_cache: with open('global_cache','wb') as jout: - pickle.dump(options.pagecache,jout) + pickle.dump(options.pagecache,jout,protocol=2) options.cookiejar.save('global_cookies') # make rest a function and loop on it. @@ -296,9 +312,9 @@ def do_download(arg, try: url, chaptercount = get_dcsource_chaptercount(arg) if not url: - print 'No story URL found in epub to update.' + print('No story URL found in epub to update.') return - print 'Updating %s, URL: %s' % (arg, url) + print('Updating %s, URL: %s' % (arg, url)) output_filename = arg except Exception: # if there's an error reading the update file, maybe it's a URL? @@ -342,10 +358,10 @@ def do_download(arg, writer = writers.getWriter('epub', configuration, adapter) output_filename = writer.getOutputFileName() noturl, chaptercount = get_dcsource_chaptercount(output_filename) - print 'Updating %s, URL: %s' % (output_filename, url) - except Exception: + print('Updating %s, URL: %s' % (output_filename, url)) + except Exception as e: + print("Failed to read epub for update: (%s) Continuing with update=false"%e) options.update = False - pass # Check for include_images without no_image_processing. In absence of PIL, give warning. if adapter.getConfig('include_images') and not adapter.getConfig('no_image_processing'): @@ -359,38 +375,38 @@ def do_download(arg, try: import Image except ImportError: - print "You have include_images enabled, but Python Image Library(PIL) isn't found.\nImages will be included full size in original format.\nContinue? (y/n)?" + print("You have include_images enabled, but Python Image Library(PIL) isn't found.\nImages will be included full size in original format.\nContinue? 
(y/n)?") if options.interactive: if not sys.stdin.readline().strip().lower().startswith('y'): return else: # for non-interactive, default the response to yes and continue processing - print 'y' + print('y') # three tries, that's enough if both user/pass & is_adult needed, # or a couple tries of one or the other for x in range(0, 2): try: adapter.getStoryMetadataOnly() - except exceptions.FailedToLogin, f: + except exceptions.FailedToLogin as f: if not options.interactive: - print 'Login Failed on non-interactive process. Set username and password in personal.ini.' + print('Login Failed on non-interactive process. Set username and password in personal.ini.') return if f.passwdonly: - print 'Story requires a password.' + print('Story requires a password.') else: - print 'Login Failed, Need Username/Password.' + print('Login Failed, Need Username/Password.') sys.stdout.write('Username: ') adapter.username = sys.stdin.readline().strip() adapter.password = getpass.getpass(prompt='Password: ') # print('Login: `%s`, Password: `%s`' % (adapter.username, adapter.password)) except exceptions.AdultCheckRequired: if options.interactive: - print 'Please confirm you are an adult in your locale: (y/n)?' + print('Please confirm you are an adult in your locale: (y/n)?') if sys.stdin.readline().strip().lower().startswith('y'): adapter.is_adult = True else: - print 'Adult check required on non-interactive process. Set is_adult:true in personal.ini or pass -o "is_adult=true" to the command.' + print('Adult check required on non-interactive process. Set is_adult:true in personal.ini or pass -o "is_adult=true" to the command.') return if options.update and not options.force: @@ -399,11 +415,11 @@ def do_download(arg, urlchaptercount = adapter.getStoryMetadataOnly().getChapterCount() if chaptercount == urlchaptercount and not options.metaonly: - print '%s already contains %d chapters.' % (output_filename, chaptercount) + print('%s already contains %d chapters.' 
% (output_filename, chaptercount)) elif chaptercount > urlchaptercount: - print '%s contains %d chapters, more than source: %d.' % (output_filename, chaptercount, urlchaptercount) + print('%s contains %d chapters, more than source: %d.' % (output_filename, chaptercount, urlchaptercount)) elif chaptercount == 0: - print "%s doesn't contain any recognizable chapters, probably from a different source. Not updating." % output_filename + print("%s doesn't contain any recognizable chapters, probably from a different source. Not updating." % output_filename) else: # update now handled by pre-populating the old # images and chapters in the adapter rather than @@ -418,7 +434,7 @@ def do_download(arg, adapter.oldchaptersmap, adapter.oldchaptersdata) = (get_update_data(output_filename))[0:9] - print 'Do update - epub(%d) vs url(%d)' % (chaptercount, urlchaptercount) + print('Do update - epub(%d) vs url(%d)' % (chaptercount, urlchaptercount)) if not options.update and chaptercount == urlchaptercount and adapter.getConfig('do_update_hook'): adapter.hookForUpdates(chaptercount) @@ -452,8 +468,8 @@ def do_download(arg, metadata['output_filename'] = output_filename if options.jsonmeta: import json - print json.dumps(metadata, sort_keys=True, - indent=2, separators=(',', ':')) + print(json.dumps(metadata, sort_keys=True, + indent=2, separators=(',', ':'))) else: pprint.pprint(metadata) @@ -468,13 +484,13 @@ def do_download(arg, del adapter except exceptions.InvalidStoryURL as isu: - print isu + print(isu) except exceptions.StoryDoesNotExist as dne: - print dne + print(dne) except exceptions.UnknownSite as us: - print us + print(us) except exceptions.AccessDenied as ad: - print ad + print(ad) def get_configuration(url, passed_defaultsini, @@ -484,7 +500,7 @@ def get_configuration(url, output_filename=None): try: configuration = Configuration(adapters.getConfigSectionsFor(url), options.format) - except exceptions.UnknownSite, e: + except exceptions.UnknownSite as e: if options.list 
or options.normalize or options.downloadlist: # list for page doesn't have to be a supported site. configuration = Configuration(['unknown'], options.format) @@ -519,11 +535,12 @@ def get_configuration(url, if options.configfile: conflist.extend(options.configfile) - configuration.read(conflist) + logger.debug("confs list:%s"%conflist) + logger.debug("read confs:%s"%configuration.read(conflist)) try: configuration.add_section('overrides') - except ConfigParser.DuplicateSectionError: + except configparser.DuplicateSectionError: pass if options.force: diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index 9f601250..fb9425ac 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -15,19 +15,29 @@ # limitations under the License. # -import ConfigParser, re -import exceptions +from __future__ import absolute_import +import re import codecs -from ConfigParser import DEFAULTSECT, MissingSectionHeaderError, ParsingError + +# py2 vs py3 transition +from . import six +from .six.moves import configparser +from .six.moves.configparser import DEFAULTSECT, MissingSectionHeaderError, ParsingError +from .six.moves import urllib +from .six.moves.urllib.parse import urlencode +from .six.moves.urllib.request import (build_opener, HTTPCookieProcessor, Request) +from .six.moves.urllib.error import HTTPError +from .six.moves import http_cookiejar as cl +from .six import text_type as unicode +from .six import string_types as basestring +from .six import ensure_binary import time import logging import sys -import urllib -import urllib2 as u2 -import urlparse as up -import cookielib as cl -import pickle +# import pickle + +from . import exceptions try: from google.appengine.api import apiproxy_stub_map @@ -50,7 +60,7 @@ try: except ImportError: chardet = None -from gziphttp import GZipProcessor +from .gziphttp import GZipProcessor # All of the writers(epub,html,txt) and adapters(ffnet,twlt,etc) # inherit from Configurable. 
The config file(s) uses ini format: @@ -69,12 +79,21 @@ from gziphttp import GZipProcessor logger = logging.getLogger(__name__) -import adapters +# Work around for fact that py3 apparently doesn't allow/ignore +# recursive imports like py2 does. +try: + from . import adapters +except ImportError: + import sys + if "fanficfare.adapters" in sys.modules: + adapters = sys.modules["fanficfare.adapters"] + elif "calibre_plugins.fanficfare_plugin.fanficfare.adapters" in sys.modules: + adapters = sys.modules["calibre_plugins.fanficfare_plugin.fanficfare.adapters"] def re_compile(regex,line): try: return re.compile(regex,re.DOTALL) - except Exception, e: + except Exception as e: raise exceptions.RegularExpresssionFailed(e,regex,line) # fall back labels. @@ -474,20 +493,20 @@ def make_generate_cover_settings(param): for line in param.splitlines(): if "=>" in line: try: - (template,regexp,setting) = map( lambda x: x.strip(), line.split("=>") ) + (template,regexp,setting) = [ x.strip() for x in line.split("=>") ] re_compile(regexp,line) vlist.append((template,regexp,setting)) - except Exception, e: + except Exception as e: raise exceptions.PersonalIniFailed(e,line,param) return vlist -class Configuration(ConfigParser.SafeConfigParser): +class Configuration(configparser.SafeConfigParser): def __init__(self, sections, fileform, lightweight=False): site = sections[-1] # first section is site DN. - ConfigParser.SafeConfigParser.__init__(self) + configparser.SafeConfigParser.__init__(self) self.lightweight = lightweight self.use_pagecache = False # default to false for old adapters. 
@@ -530,7 +549,7 @@ class Configuration(ConfigParser.SafeConfigParser): self.override_sleep = None self.cookiejar = self.get_empty_cookiejar() - self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor()) + self.opener = build_opener(HTTPCookieProcessor(self.cookiejar),GZipProcessor()) self.pagecache = self.get_empty_pagecache() @@ -543,9 +562,9 @@ class Configuration(ConfigParser.SafeConfigParser): ## reconstructed completely because removing and re-adding ## a section would mess up the order. ## assumes _dict and _sections from ConfigParser parent. - self._sections = self._dict((section_url_f(k) if (domain in k and 'http' in k) else k, v) for k, v in self._sections.viewitems()) + self._sections = self._dict((section_url_f(k) if (domain in k and 'http' in k) else k, v) for k, v in six.viewitems(self._sections)) # logger.debug(self._sections.keys()) - except e: + except Exception as e: logger.warn("Failed to perform section_url_names: %s"%e) def addUrlConfigSection(self,url): @@ -636,17 +655,17 @@ class Configuration(ConfigParser.SafeConfigParser): val = self.get(section,key) if val and val.lower() == "false": val = False - #print "getConfig(%s)=[%s]%s" % (key,section,val) + #print("getConfig(%s)=[%s]%s" % (key,section,val)) break - except (ConfigParser.NoOptionError, ConfigParser.NoSectionError), e: + except (configparser.NoOptionError, configparser.NoSectionError) as e: pass for section in sections[::-1]: # 'martian smiley' [::-1] reverses list by slicing whole list with -1 step. try: val = val + self.get(section,"add_to_"+key) - #print "getConfig(add_to_%s)=[%s]%s" % (key,section,val) - except (ConfigParser.NoOptionError, ConfigParser.NoSectionError), e: + #print("getConfig(add_to_%s)=[%s]%s" % (key,section,val)) + except (configparser.NoOptionError, configparser.NoSectionError) as e: pass return val @@ -654,8 +673,9 @@ class Configuration(ConfigParser.SafeConfigParser): # split and strip each. 
def get_config_list(self, sections, key, default=[]): vlist = re.split(r'(? float(self.getConfig("chardet_confidence_limit",0.9)): logger.debug("using chardet detected encoding:%s(%s)"%(detected['encoding'],detected['confidence'])) code=detected['encoding'] @@ -964,8 +988,9 @@ class Configuration(ConfigParser.SafeConfigParser): return data.decode(code,errors='ignore') else: return data.decode(code) - except: + except Exception as e: logger.debug("code failed:"+code) + logger.debug(e) pass logger.info("Could not decode story, tried:%s Stripping non-ASCII."%decode) return "".join([x for x in data if ord(x) < 128]) @@ -999,15 +1024,15 @@ class Configuration(ConfigParser.SafeConfigParser): logger.debug("#####################################\npagecache(POST) MISS: %s"%safe_url(cachekey)) self.do_sleep(extrasleep) - ## u2.Request assumes POST when data!=None. Also assumes data + ## Request assumes POST when data!=None. Also assumes data ## is application/x-www-form-urlencoded. if 'Content-type' not in headers: headers['Content-type']='application/x-www-form-urlencoded' if 'Accept' not in headers: headers['Accept']="text/html,*/*" - req = u2.Request(url, - data=urllib.urlencode(parameters), - headers=headers) + req = Request(url, + data=ensure_binary(urlencode(parameters)), + headers=headers) ## Specific UA because too many sites are blocking the default python UA. self.opener.addheaders = [('User-Agent', self.getConfig('user_agent')), @@ -1015,6 +1040,8 @@ class Configuration(ConfigParser.SafeConfigParser): data = self._decode(self.opener.open(req,None,float(self.getConfig('connect_timeout',30.0))).read()) self._progressbar() + ## postURL saves data to the pagecache *after* _decode() while + ## fetchRaw saves it *before* _decode()--because raw. 
        self._set_to_pagecache(cachekey,data,url)

        return data

@@ -1072,11 +1099,17 @@ class Configuration(ConfigParser.SafeConfigParser):

         self.opener.addheaders = headers
         if parameters != None:
-            opened = self.opener.open(url.replace(' ','%20'),urllib.urlencode(parameters),float(self.getConfig('connect_timeout',30.0)))
+            opened = self.opener.open(url.replace(' ','%20'),
+                                      ensure_binary(urlencode(parameters)),
+                                      float(self.getConfig('connect_timeout',30.0)))
         else:
-            opened = self.opener.open(url.replace(' ','%20'),None,float(self.getConfig('connect_timeout',30.0)))
+            opened = self.opener.open(url.replace(' ','%20'),
+                                      None,
+                                      float(self.getConfig('connect_timeout',30.0)))
         self._progressbar()
         data = opened.read()
+        ## postURL saves data to the pagecache *after* _decode() while
+        ## fetchRaw saves it *before* _decode()--because raw.
         self._set_to_pagecache(cachekey,data,opened.url)

         return (data,opened)

@@ -1115,13 +1148,13 @@ class Configuration(ConfigParser.SafeConfigParser):
                                            extrasleep=extrasleep,
                                            referer=referer)
                 return (self._decode(data),opened)
-            except u2.HTTPError, he:
+            except HTTPError as he:
                 excpt=he
                 if he.code in (403,404,410):
                     logger.debug("Caught an exception reading URL: %s Exception %s."%(unicode(safe_url(url)),unicode(he)))
                     break # break out on 404
-            except Exception, e:
+            except Exception as e:
                 excpt=e
                 logger.debug("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))

         logger.debug("Giving up on %s" %safe_url(url))
diff --git a/fanficfare/dateutils.py b/fanficfare/dateutils.py
index be6e2ba1..b8217449 100644
--- a/fanficfare/dateutils.py
+++ b/fanficfare/dateutils.py
@@ -1,9 +1,33 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import absolute_import + from datetime import datetime, timedelta +# py2 vs py3 transition +from .six import text_type as unicode + import logging logger = logging.getLogger(__name__) -UNIX_EPOCHE = datetime.fromtimestamp(0) +## There's a windows / py3 bug that prevents using 0. +## So Jan 2, 1970 instead. +UNIX_EPOCHE = datetime.fromtimestamp(86400) ## Currently used by adapter_webnovelcom & adapter_wwwnovelallcom diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index 2b079e9d..48f8edc6 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -1,4 +1,4 @@ -# Copyright 2015 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2015 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -1950,31 +1950,6 @@ comments_label:Comments include_in_category:category,searchtags -[royalroadl.com] -extra_valid_entries:stars - -#add_to_extra_titlepage_entries:,stars - -## some sites include images that we don't ever want becoming the -## cover image. This lets you exclude them. -cover_exclusion_regexp:(imgur.com/dzOACJf.png|/forum/images/smilies/) - -## Clear FanFiction from defaults, site has fanfictions and original fiction. -extratags: - -## royalroadl.com stories sometimes have 'spoiler' blocks in -## posts. When viewed in a browser, the block is hidden until a button -## is clicked. eBook viewers can't handle that and the javascript is -## disabled. 
The remove_spoilers option, if uncommented, will remove -## spoiler blocks entirely. -#remove_spoilers:true - -## This option if uncommented, will put a box around the spoiler -## blocks with the original spoiler button text as a label using -## fieldset and legend HTML tags. For a simple box, see the -## add_to_output_css example for [base_xenforoforum:epub]. -#legend_spoilers:true - [samandjack.net] ## Some sites require login (or login for some rated stories) The ## program can prompt you, or you can save it in config. In @@ -2869,6 +2844,31 @@ extracategories:Queer as Folk website_encodings:Windows-1252,utf8 +[www.royalroad.com] +extra_valid_entries:stars + +#add_to_extra_titlepage_entries:,stars + +## some sites include images that we don't ever want becoming the +## cover image. This lets you exclude them. +cover_exclusion_regexp:(imgur.com/dzOACJf.png|/forum/images/smilies/) + +## Clear FanFiction from defaults, site has fanfictions and original fiction. +extratags: + +## royalroad.com stories sometimes have 'spoiler' blocks in +## posts. When viewed in a browser, the block is hidden until a button +## is clicked. eBook viewers can't handle that and the javascript is +## disabled. The remove_spoilers option, if uncommented, will remove +## spoiler blocks entirely. +#remove_spoilers:true + +## This option if uncommented, will put a box around the spoiler +## blocks with the original spoiler button text as a label using +## fieldset and legend HTML tags. For a simple box, see the +## add_to_output_css example for [base_xenforoforum:epub]. +#legend_spoilers:true + [www.scarvesandcoffee.net] ## Some sites do not require a login, but do require the user to ## confirm they are adult for adult content. 
In commandline version, diff --git a/fanficfare/epubutils.py b/fanficfare/epubutils.py index e51680c4..105a29c4 100644 --- a/fanficfare/epubutils.py +++ b/fanficfare/epubutils.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import __license__ = 'GPL v3' -__copyright__ = '2017, Jim Miller' +__copyright__ = '2018, Jim Miller' __docformat__ = 'restructuredtext en' import logging @@ -11,7 +12,11 @@ import re, os, traceback from collections import defaultdict from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED from xml.dom.minidom import parseString -from StringIO import StringIO + +# py2 vs py3 transition +from .six import text_type as unicode +from .six import string_types as basestring +from .six import BytesIO # StringIO under py2 import bs4 @@ -158,7 +163,7 @@ def get_update_data(inputio, chapurl = soup.find('meta',{'name':'chapterurl'}) if chapurl: if chapurl['content'] not in urlsoups: # keep first found if more than one. - #print("Found chapurl['content']:%s"%chapurl['content']) + # print("Found chapurl['content']:%s"%chapurl['content']) currenturl = chapurl['content'] urlsoups[chapurl['content']] = bodysoup else: @@ -188,7 +193,7 @@ def get_update_data(inputio, #for k in images.keys(): #print("\tlongdesc:%s\n\tData len:%s\n"%(k,len(images[k]))) - # print("datamaps:%s"%datamaps) + #print("datamaps:%s"%datamaps) return (source,filecount,soups,images,oldcover,calibrebookmark,logfile,urlsoups,datamaps) def get_path_part(n): @@ -274,7 +279,7 @@ def reset_orig_chapters_epub(inputio,outfile): inputepub = ZipFile(inputio, 'r') # works equally well with a path or a blob ## build zip in memory in case updating in place(CLI). - zipio = StringIO() + zipio = BytesIO() ## Write mimetype file, must be first and uncompressed. 
## Older versions of python(2.4/5) don't allow you to specify diff --git a/fanficfare/exceptions.py b/fanficfare/exceptions.py index ec4dc8f0..5cc3e140 100644 --- a/fanficfare/exceptions.py +++ b/fanficfare/exceptions.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,8 @@ # limitations under the License. # +from __future__ import absolute_import + ## A few exceptions for different things for adapters class FailedToDownload(Exception): diff --git a/fanficfare/geturls.py b/fanficfare/geturls.py index 711f927f..de0fff41 100644 --- a/fanficfare/geturls.py +++ b/fanficfare/geturls.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2015 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,22 +15,27 @@ # limitations under the License. # +from __future__ import absolute_import import collections import email import imaplib import re -import urllib2 as u2 -import urlparse + +# unicode in py2, str in py3 +from .six.moves.urllib.request import (build_opener, HTTPCookieProcessor) +from .six.moves.urllib.parse import (urlparse, urlunparse) +from .six import text_type as unicode +from .six import ensure_str import logging logger = logging.getLogger(__name__) from bs4 import BeautifulSoup -from gziphttp import GZipProcessor +from .gziphttp import GZipProcessor -import adapters -from configurable import Configuration -from exceptions import UnknownSite +from . 
import adapters +from .configurable import Configuration +from .exceptions import UnknownSite def get_urls_from_page(url,configuration=None,normalize=False): @@ -75,7 +80,7 @@ def get_urls_from_page(url,configuration=None,normalize=False): data = adapter._fetchUrl(url,usecache=False) except UnknownSite: # no adapter with anyurl=True, must be a random site. - opener = u2.build_opener(u2.HTTPCookieProcessor(),GZipProcessor()) + opener = build_opener(HTTPCookieProcessor(),GZipProcessor()) data = opener.open(url).read() # kludge because I don't see it on enough sites to be worth generalizing yet. @@ -112,18 +117,18 @@ def get_urls_from_html(data,url=None,configuration=None,normalize=False,restrict urls[adapter.story.getMetadata('storyUrl')] = [href] else: urls[adapter.story.getMetadata('storyUrl')].append(href) - except Exception, e: + except Exception as e: #logger.debug e pass # Simply return the longest URL with the assumption that it contains the # most user readable metadata, if not normalized - return urls.keys() if normalize else [max(value, key=len) for key, value in urls.items()] + return list(urls.keys()) if normalize else [max(value, key=len) for key, value in urls.items()] def get_urls_from_text(data,configuration=None,normalize=False,email=False): urls = collections.OrderedDict() try: - data = unicode(data) + data = ensure_str(data) except UnicodeDecodeError: data=data.decode('utf8') ## for when called outside calibre. 
@@ -143,7 +148,7 @@ def get_urls_from_text(data,configuration=None,normalize=False,email=False): # Simply return the longest URL with the assumption that it contains the # most user readable metadata, if not normalized - return urls.keys() if normalize else [max(value, key=len) for key, value in urls.items()] + return list(urls.keys()) if normalize else [max(value, key=len) for key, value in urls.items()] def form_url(parenturl,url): @@ -153,9 +158,9 @@ def form_url(parenturl,url): if "//" in url or parenturl == None: returl = url else: - parsedUrl = urlparse.urlparse(parenturl) + parsedUrl = urlparse(parenturl) if url.startswith("/") : - returl = urlparse.urlunparse( + returl = urlunparse( (parsedUrl.scheme, parsedUrl.netloc, url, @@ -166,7 +171,7 @@ def form_url(parenturl,url): toppath = parsedUrl.path else: toppath = parsedUrl.path[:parsedUrl.path.rindex('/')] - returl = urlparse.urlunparse( + returl = urlunparse( (parsedUrl.scheme, parsedUrl.netloc, toppath + '/' + url, @@ -192,7 +197,7 @@ def cleanup_url(href,email=False): def get_urls_from_imap(srv,user,passwd,folder,markread=True): - logger.debug("get_urls_from_imap srv:(%s)"%srv) + # logger.debug("get_urls_from_imap srv:(%s)"%srv) mail = imaplib.IMAP4_SSL(srv) mail.login(user, passwd) mail.list() @@ -215,33 +220,31 @@ def get_urls_from_imap(srv,user,passwd,folder,markread=True): result, data = mail.uid('fetch', email_uid, '(BODY.PEEK[])') #RFC822 - #logger.debug("result:%s"%result) - #logger.debug("data:%s"%data) + # logger.debug("result:%s"%result) + # logger.debug("data:%s"%data) raw_email = data[0][1] #raw_email = data[0][1] # here's the body, which is raw text of the whole email # including headers and alternate payloads - email_message = email.message_from_string(raw_email) + email_message = email.message_from_string(ensure_str(raw_email)) - #logger.debug "To:%s"%email_message['To'] - #logger.debug "From:%s"%email_message['From'] - #logger.debug "Subject:%s"%email_message['Subject'] - - # 
logger.debug("payload:%s"%email_message.get_payload()) + # logger.debug("To:%s"%email_message['To']) + # logger.debug("From:%s"%email_message['From']) + # logger.debug("Subject:%s"%email_message['Subject']) + # logger.debug("payload:%r"%email_message.get_payload(decode=True)) urllist=[] for part in email_message.walk(): try: - #logger.debug("part mime:%s"%part.get_content_type()) + # logger.debug("part mime:%s"%part.get_content_type()) if part.get_content_type() == 'text/plain': urllist.extend(get_urls_from_text(part.get_payload(decode=True),email=True)) if part.get_content_type() == 'text/html': urllist.extend(get_urls_from_html(part.get_payload(decode=True),email=True)) except Exception as e: logger.error("Failed to read email content: %s"%e,exc_info=True) - #logger.debug "urls:%s"%get_urls_from_text(get_first_text_block(email_message)) if urllist and markread: #obj.store(data[0].replace(' ',','),'+FLAGS','\Seen') diff --git a/fanficfare/gziphttp.py b/fanficfare/gziphttp.py index 92ebb641..07aeb471 100644 --- a/fanficfare/gziphttp.py +++ b/fanficfare/gziphttp.py @@ -1,10 +1,14 @@ +# -*- coding: utf-8 -*- + ## Borrowed from http://techknack.net/python-urllib2-handlers/ +from __future__ import absolute_import -import urllib2 from gzip import GzipFile -from StringIO import StringIO +from .six.moves.urllib.request import BaseHandler +from .six.moves.urllib.response import addinfourl +from .six import BytesIO -class GZipProcessor(urllib2.BaseHandler): +class GZipProcessor(BaseHandler): """A handler to add gzip capabilities to urllib2 requests """ def http_request(self, req): @@ -16,7 +20,7 @@ class GZipProcessor(urllib2.BaseHandler): #print("Content-Encoding:%s"%resp.headers.get("Content-Encoding")) if resp.headers.get("Content-Encoding") == "gzip": gz = GzipFile( - fileobj=StringIO(resp.read()), + fileobj=BytesIO(resp.read()), mode="r" ) # resp.read = gz.read @@ -24,7 +28,7 @@ class GZipProcessor(urllib2.BaseHandler): # resp.readline = gz.readline # resp.next = 
gz.next old_resp = resp - resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) + resp = addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg return resp https_response = http_response diff --git a/fanficfare/htmlcleanup.py b/fanficfare/htmlcleanup.py index 997cedcd..fb0258a0 100644 --- a/fanficfare/htmlcleanup.py +++ b/fanficfare/htmlcleanup.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,11 +15,18 @@ # limitations under the License. # +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re +# py2 vs py3 transition +from .six import text_type as unicode +from .six import string_types as basestring +from .six import ensure_text +from .six import unichr + def _unirepl(match): "Return the unicode string for a decimal number" if match.group(1).startswith('x'): @@ -55,49 +62,53 @@ def _replaceNotEntities(data): p = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);') return p.sub(r'&\1', data) -def stripHTML(soup): - if isinstance(soup,basestring) or hasattr(soup, 'bs3'): - return removeAllEntities(re.sub(r'<[^>]+>','',"%s" % soup)).strip() +def stripHTML(soup, remove_all_entities=True): + if isinstance(soup,basestring): + retval = removeEntities(re.sub(r'<[^>]+>','',"%s" % soup), + remove_all_entities=remove_all_entities).strip() else: # bs4 already converts all the entities to UTF8 chars. 
- return soup.get_text(strip=True) + retval = soup.get_text(strip=True) + # some change in the python3 branch started making   '\xc2\xa0' + # instead of ' ' + return ensure_text(retval).replace(u'\xc2\xa0',' ').strip() def conditionalRemoveEntities(value): if isinstance(value,basestring): return removeEntities(value).strip() else: return value - -def removeAllEntities(text): - # Remove < < and & - return removeEntities(text).replace('<', '<').replace('>', '>').replace('&', '&') -def removeEntities(text, space_only=False): +def removeAllEntities(text): + # Remove < < and & also + return removeEntities(text, remove_all_entities=True) + +def removeEntities(text, space_only=False, remove_all_entities=False): + # keeps &, < and > when remove_all_entities=False if text is None: return u"" - + if not isinstance(text,basestring): - return unicode(text) - + text = unicode(text) + try: - t = text.decode('utf-8') - except (UnicodeEncodeError,UnicodeDecodeError), e: + t = text + except (UnicodeEncodeError,UnicodeDecodeError) as e: try: - t = text.encode ('ascii', 'xmlcharrefreplace') - except (UnicodeEncodeError,UnicodeDecodeError), e: + t = text.encode ('ascii', 'xmlcharrefreplace') + except (UnicodeEncodeError,UnicodeDecodeError) as e: t = text - text = t + text = t # replace numeric versions of [&<>] with named versions, # then replace named versions with actual characters, text = re.sub(r'�*38;','&',text) text = re.sub(r'�*60;','<',text) text = re.sub(r'�*62;','>',text) - + # replace remaining � entities with unicode value, such as ' -> ' text = _replaceNumberEntities(text) # replace several named entities with character, such as — -> - - # see constants.py for the list. # reverse sort will put entities with ; before the same one without, when valid. 
for e in reversed(sorted(entities.keys())): v = entities[e] @@ -106,8 +117,8 @@ def removeEntities(text, space_only=False): continue try: text = text.replace(e, v) - except UnicodeDecodeError, ex: - # for the pound symbol in constants.py + except UnicodeDecodeError as ex: + # for the pound symbol text = text.replace(e, v.decode('utf-8')) # SGMLParser, and in turn, BeautifulStoneSoup doesn't parse @@ -118,9 +129,14 @@ def removeEntities(text, space_only=False): # this point, there should be *no* real entities left, so find # these not-entities and removing them here should be safe. text = _replaceNotEntities(text) - - # < < and & are the only html entities allowed in xhtml, put those back. - return text.replace('&', '&').replace('&lt', '<').replace('&gt', '>') + + if remove_all_entities: + text = text.replace('<', '<').replace('>', '>').replace('&', '&') + else: + # < > and & are the only html entities allowed in xhtml, put those back. + # They come out as < because _replaceNotEntities removes the ';'. + text = text.replace('&', '&').replace('&lt', '<').replace('&gt', '>') + return text ## Currently used(optionally) by adapter_lightnovelgatecom and ## adapter_wwwnovelallcom only. I hesitate to put the option in diff --git a/fanficfare/htmlheuristics.py b/fanficfare/htmlheuristics.py index 5f4a8ee4..179790fb 100644 --- a/fanficfare/htmlheuristics.py +++ b/fanficfare/htmlheuristics.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2013 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,13 +15,18 @@ # limitations under the License. 
# +from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re import codecs import bs4 as bs -import HtmlTagStack as stack +# py2 vs py3 transition +from .six import text_type as unicode +from .six.moves import range + +from . import HtmlTagStack as stack from . import exceptions as exceptions def logdebug(s): diff --git a/fanficfare/mobi.py b/fanficfare/mobi.py index 7a527154..2f59c72c 100644 --- a/fanficfare/mobi.py +++ b/fanficfare/mobi.py @@ -1,16 +1,24 @@ #!/usr/bin/python -# Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan - -import StringIO +# -*- coding: utf-8 -*- +# Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan +# Changes Copyright 2018 FanFicFare team +from __future__ import absolute_import + import struct import time import random import logging +# py2 vs py3 transition +from .six import text_type as unicode +from .six import string_types as basestring +from .six import ensure_binary +from .six import BytesIO # StringIO under py2 + logger = logging.getLogger(__name__) -from html import HtmlProcessor +from .mobihtml import HtmlProcessor # http://wiki.mobileread.com/wiki/MOBI # http://membres.lycos.fr/microfirst/palm/pdb.html @@ -41,7 +49,7 @@ class _SubEntry: def TocLink(self): return '%.80s' % (self._name, self.title) - + def Anchor(self): return '' % self._name @@ -57,12 +65,12 @@ class Converter: self._refresh_url = refresh_url def ConvertString(self, s): - out = StringIO.StringIO() + out = BytesIO() self._ConvertStringToFile(s, out) return out.getvalue() def ConvertStrings(self, html_strs): - out = StringIO.StringIO() + out = BytesIO() self._ConvertStringsToFile(html_strs, out) return out.getvalue() @@ -83,13 +91,15 @@ class Converter: toc_html = [] body_html = [] - PAGE_BREAK = '' + ## This gets broken by html5lib/bs4fixed being helpful, but we'll + ## fix it inside mobihtml.py + PAGE_BREAK = '' # pull out the title page, assumed first html_strs. 
htmltitle = html_strs[0] entrytitle = _SubEntry(1, htmltitle) title_html.append(entrytitle.Body()) - + title_html.append(PAGE_BREAK) toc_html.append('

Table of Contents


') @@ -99,11 +109,11 @@ class Converter: # give some space between bodies of work. body_html.append(PAGE_BREAK) - + body_html.append(entry.Anchor()) - + body_html.append(entry.Body()) - + # TODO: this title can get way too long with RSS feeds. Not sure how to fix # cheat slightly and use the
code to set filepos in references.
        header = '''
@@ -117,6 +127,11 @@ class Converter:
 ''' % time.ctime(time.time())
         footer = ''
+        # logger.debug("header:%s"%header)
+        # logger.debug("title_html:%s"%title_html)
+        # logger.debug("toc_html:%s"%toc_html)
+        # logger.debug("body_html:%s"%body_html)
+        # logger.debug("footer:%s"%footer)
         all_html = header + '\n'.join(title_html + toc_html + body_html) + footer
         #print "%s" % all_html.encode('utf8')
         return all_html

@@ -125,18 +140,18 @@ class Converter:
         try:
             tmp = self.MakeOneHTML(html_strs)
             self._ConvertStringToFile(tmp, out_file)
-        except Exception, e:
+        except Exception as e:
             logger.error('Error %s', e)
-            #logger.debug('Details: %s' % html_strs)
+            # logger.debug('Details: %s' % html_strs)

     def _ConvertStringToFile(self, html_data, out):
         html = HtmlProcessor(html_data)
-        data = html.CleanHtml()
+        data = ensure_binary(html.CleanHtml())

         # collect offsets of '' tags, use to make index list.
         # indexlist = [] # list of (offset,length) tuples.
         # not in current use.
-
+
         # j=0
         # lastj=0
         # while True:
@@ -152,27 +168,28 @@ class Converter:
         # if title:
         #     self._header.SetTitle(title)
         record_id = 1
+        # logger.debug("len(data):%s"%len(data))
         for start_pos in range(0, len(data), Record.MAX_SIZE):
             end = min(len(data), start_pos + Record.MAX_SIZE)
             record_data = data[start_pos:end]
             records.append(self._header.AddRecord(record_data, record_id))
-            #print "HTML Record %03d: (size:%d) [[%s ... %s]]" % ( record_id, len(record_data), record_data[:20], record_data[-20:] )
+            # logger.debug("HTML Record %03d: (size:%d) [[%s ... 
%s]]" % ( record_id, len(record_data), record_data[:20], record_data[-20:] )) record_id += 1 self._header.SetImageRecordIndex(record_id) records[0:0] = [self._header.MobiHeader()] header, rec_offset = self._header.PDBHeader(len(records)) - out.write(header) + out.write(ensure_binary(header)) for record in records: record.WriteHeader(out, rec_offset) - #print "rec_offset: %d len(record.data): %d" % (rec_offset,len(record.data)) + # logger.debug("rec_offset: %d len(record.data): %d" % (rec_offset,len(record.data))) rec_offset += (len(record.data)+1) # plus one for trailing null # Write to nuls for some reason - out.write('\0\0') + out.write(b'\0\0') for record in records: record.WriteData(out) - out.write('\0') + out.write(b'\0') # needs a trailing null, I believe it indicates zero length 'overlap'. # otherwise, the readers eat the last char of each html record. # Calibre writes another 6-7 bytes of stuff after that, but we seem @@ -203,7 +220,7 @@ class Record: self._id = Record._unique_id_seed def WriteData(self, out): - out.write(self.data) + out.write(ensure_binary(self.data)) def WriteHeader(self, out, rec_offset): attributes = 64 # dirty? 
@@ -212,7 +229,7 @@ class Record: attributes, 0, self._id) assert len(header) == Record.INDEX_LEN - out.write(header) + out.write(ensure_binary(header)) EXTH_HEADER_FIELDS = { 'author' : 100, @@ -245,6 +262,7 @@ class Header: def AddRecord(self, data, record_id): self.max_record_size = max(Record.MAX_SIZE, len(data)) self._record_count += 1 + # logger.debug("len(data):%s"%len(data)) self._length += len(data) return Record(data, record_id) @@ -268,12 +286,15 @@ class Header: return palmdoc_header def PDBHeader(self, num_records): + # logger.debug("num_records:%s"%num_records) HEADER_LEN = 32+2+2+9*4 RECORD_INDEX_HEADER_LEN = 6 RESOURCE_INDEX_LEN = 10 index_len = RECORD_INDEX_HEADER_LEN + num_records * Record.INDEX_LEN rec_offset = HEADER_LEN + index_len + 2 + # logger.debug("index_len:%s"%index_len) + # logger.debug("rec_offset:%s"%rec_offset) short_title = self._title[0:31] attributes = 0 @@ -284,11 +305,11 @@ class Header: modnum = 0 appinfo_offset = 0 sort_offset = 0 - type = 'BOOK' - creator = 'MOBI' + type = b'BOOK' + creator = b'MOBI' id_seed = 36 header = struct.pack('>32sHHII', - short_title, attributes, version, + ensure_binary(short_title), attributes, version, ctime, mtime) header += struct.pack('>IIII', backup_time, modnum, appinfo_offset, sort_offset) @@ -309,13 +330,15 @@ class Header: typeid = EXTH_HEADER_FIELDS[key] length_encoding_len = 8 r.append(struct.pack('>LL', typeid, len(value) + length_encoding_len,) + value) - content = ''.join(r) + content = b''.join(r) + # logger.debug("len(content):%s"%len(content)) # Pad to word boundary while len(content) % 4: - content += '\0' + content += b'\0' + # logger.debug("len(content):%s"%len(content)) TODO_mysterious = 12 - exth = 'EXTH' + struct.pack('>LL', len(content) + TODO_mysterious, len(data)) + content + exth = b'EXTH' + struct.pack('>LL', len(content) + TODO_mysterious, len(data)) + content return exth def SetImageRecordIndex(self, idx): @@ -333,13 +356,16 @@ class Header: text_encoding = 
encoding['UTF-8'] unique_id = random.randint(1, 1<<32) creator_version = 4 - reserved = '%c' % 0xff * 40 + reserved = b'%c' % 0xff * 40 nonbook_index = fs + # logger.debug("header_len:%s"%header_len) + # logger.debug("len(palmdoc_header):%s"%len(palmdoc_header)) + # logger.debug("len(exth_header):%s"%len(exth_header)) full_name_offset = header_len + len(palmdoc_header) + len(exth_header) # put full name after header language = languages['en-us'] unused = 0 mobi_header = struct.pack('>4sIIIII40sIIIIII', - 'MOBI', + b'MOBI', header_len, mobi_type, text_encoding, @@ -368,13 +394,13 @@ class Header: fs, unused, exth_flags) - mobi_header += '\0' * 112 # TODO: Why this much padding? + mobi_header += b'\0' * 112 # TODO: Why this much padding? # Set some magic offsets to be 0xFFFFFFF. for pos in (0x94, 0x98, 0xb0, 0xb8, 0xc0, 0xc8, 0xd0, 0xd8, 0xdc): mobi_header = self._ReplaceWord(mobi_header, pos, fs) # 16 bytes? - padding = '\0' * 48 * 4 # why? + padding = b'\0' * 48 * 4 # why? total_header = palmdoc_header + mobi_header + exth_header + self._title + padding return self.AddRecord(total_header, 0) diff --git a/fanficfare/html.py b/fanficfare/mobihtml.py similarity index 69% rename from fanficfare/html.py rename to fanficfare/mobihtml.py index c2bf4891..8a3f6713 100644 --- a/fanficfare/html.py +++ b/fanficfare/mobihtml.py @@ -1,15 +1,29 @@ #!/usr/bin/python +# -*- coding: utf-8 -*- + # Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan +# Changes Copyright 2018 FanFicFare team ## This module is used by mobi.py exclusively. 
+## Renamed Jul 2018 to avoid conflict with other 'html' packages +from __future__ import absolute_import import re import sys -import StringIO -import urllib +import logging +# py2 vs py3 transition +from .six.moves.urllib.parse import unquote +from .six import text_type as unicode +from .six import binary_type as bytes +from .six import ensure_binary + +# import bs4 +# BeautifulSoup = bs4.BeautifulSoup from bs4 import BeautifulSoup +logger = logging.getLogger(__name__) + class HtmlProcessor: WHITESPACE_RE = re.compile(r'\s') # Look for @@ -19,6 +33,14 @@ class HtmlProcessor: self.unfill = unfill # html = self._ProcessRawHtml(html) self._soup = BeautifulSoup(html,'html5lib') + # logger.debug(html) + ## mobi format wants to find this tag inside . + ## html5lib, on the other hand, moved it to . So we'll move + ## it back. + guide = self._soup.find('guide') + if guide: + self._soup.head.append(guide) + # logger.debug(self._soup) if self._soup.title.contents: self.title = self._soup.title.contents[0] else: @@ -33,7 +55,7 @@ class HtmlProcessor: def _StubInternalAnchors(self): '''Replace each internal anchor with a fixed-size filepos anchor. -\ + Looks for every anchor with and replaces that with . Stores anchors in self._anchor_references''' self._anchor_references = [] @@ -44,29 +66,35 @@ class HtmlProcessor: anchorlist.extend(self._soup.findAll('reference', href=re.compile('^#'))) for anchor in anchorlist: self._anchor_references.append((anchor_num, anchor['href'])) - del anchor['href'] anchor['filepos'] = '%.10d' % anchor_num + # logger.debug("Add anchor: %s %s"%((anchor_num, anchor))) + del anchor['href'] anchor_num += 1 def _ReplaceAnchorStubs(self): # TODO: Browsers allow extra whitespace in the href names. - # str() instead of unicode() rather than figure out how to fix - # ancient mobi.py code. 
- assembled_text = str(self._soup) + assembled_text = ensure_binary(unicode(self._soup)) + # html5lib/bs4 creates close tags for + assembled_text = assembled_text.replace(b'',b'') + assembled_text = assembled_text.replace(b'',b'') del self._soup # shouldn't touch this anymore for anchor_num, original_ref in self._anchor_references: - ref = urllib.unquote(original_ref[1:]) # remove leading '#' + ref = unquote(original_ref[1:]) # remove leading '#' # Find the position of ref in the utf-8 document. # TODO(chatham): Using regexes and looking for name= would be better. - newpos = assembled_text.rfind(ref.encode('utf-8')) + newpos = assembled_text.find(b'name="'+ensure_binary(ref)) # .encode('utf-8') if newpos == -1: - print >>sys.stderr, 'Could not find anchor "%s"' % original_ref + logger.warn('Could not find anchor "%s"' % original_ref) continue - newpos += len(ref) + 2 # don't point into the middle of the tag - old_filepos = 'filepos="%.10d"' % anchor_num - new_filepos = 'filepos="%.10d"' % newpos + # instead of somewhere slightly *after* the tag pointed to, + # let's go right in front of it instead by looking for the page + # break before it. 
+ newpos = assembled_text.rfind(b'<',0,newpos) + # logger.debug("Anchor Pos: %s %s '%s|%s'"%((anchor_num, newpos,assembled_text[newpos-15:newpos],assembled_text[newpos:newpos+15]))) + old_filepos = b'filepos="%.10d"' % anchor_num + new_filepos = b'filepos="%.10d"' % newpos assert assembled_text.find(old_filepos) != -1 assembled_text = assembled_text.replace(old_filepos, new_filepos, 1) return assembled_text diff --git a/fanficfare/six.py b/fanficfare/six.py new file mode 100644 index 00000000..9beb025a --- /dev/null +++ b/fanficfare/six.py @@ -0,0 +1,949 @@ +# Copyright (c) 2010-2018 Benjamin Peterson +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +"""Utilities for writing code that runs on Python 2 and 3""" + +from __future__ import absolute_import + +import functools +import itertools +import operator +import sys +import types + +__author__ = "Benjamin Peterson " +__version__ = "1.11.0fff" # for version included in fanficfare + +# Useful for very coarse version differentiation. +PY2 = sys.version_info[0] == 2 +PY3 = sys.version_info[0] == 3 +PY34 = sys.version_info[0:2] >= (3, 4) + +if PY3: + string_types = str, + integer_types = int, + class_types = type, + text_type = str + binary_type = bytes + + MAXSIZE = sys.maxsize +else: + string_types = basestring, + integer_types = (int, long) + class_types = (type, types.ClassType) + text_type = unicode + binary_type = str + + if sys.platform.startswith("java"): + # Jython always uses 32 bits. + MAXSIZE = int((1 << 31) - 1) + else: + # It's possible to have sizeof(long) != sizeof(Py_ssize_t). + class X(object): + + def __len__(self): + return 1 << 31 + try: + len(X()) + except OverflowError: + # 32-bit + MAXSIZE = int((1 << 31) - 1) + else: + # 64-bit + MAXSIZE = int((1 << 63) - 1) + del X + + +def _add_doc(func, doc): + """Add documentation to a function.""" + func.__doc__ = doc + + +def _import_module(name): + """Import module, returning the module after the last dot.""" + __import__(name) + return sys.modules[name] + + +class _LazyDescr(object): + + def __init__(self, name): + self.name = name + + def __get__(self, obj, tp): + result = self._resolve() + setattr(obj, self.name, result) # Invokes __set__. + try: + # This is a bit ugly, but it avoids running this again by + # removing this descriptor. 
+ delattr(obj.__class__, self.name) + except AttributeError: + pass + return result + + +class MovedModule(_LazyDescr): + + def __init__(self, name, old, new=None): + super(MovedModule, self).__init__(name) + if PY3: + if new is None: + new = name + self.mod = new + else: + self.mod = old + + def _resolve(self): + return _import_module(self.mod) + + def __getattr__(self, attr): + _module = self._resolve() + value = getattr(_module, attr) + setattr(self, attr, value) + return value + + +class _LazyModule(types.ModuleType): + + def __init__(self, name): + super(_LazyModule, self).__init__(name) + self.__doc__ = self.__class__.__doc__ + + def __dir__(self): + attrs = ["__doc__", "__name__"] + attrs += [attr.name for attr in self._moved_attributes] + return attrs + + # Subclasses should override this + _moved_attributes = [] + + +class MovedAttribute(_LazyDescr): + + def __init__(self, name, old_mod, new_mod, old_attr=None, new_attr=None): + super(MovedAttribute, self).__init__(name) + if PY3: + if new_mod is None: + new_mod = name + self.mod = new_mod + if new_attr is None: + if old_attr is None: + new_attr = name + else: + new_attr = old_attr + self.attr = new_attr + else: + self.mod = old_mod + if old_attr is None: + old_attr = name + self.attr = old_attr + + def _resolve(self): + module = _import_module(self.mod) + return getattr(module, self.attr) + + +class _SixMetaPathImporter(object): + + """ + A meta path importer to import six.moves and its submodules. + + This class implements a PEP302 finder and loader. It should be compatible + with Python 2.5 and all existing versions of Python3 + """ + + def __init__(self, six_module_name): + self.name = six_module_name + self.known_modules = {} + + def _add_module(self, mod, *fullnames): + for fullname in fullnames: + self.known_modules[self.name + "." + fullname] = mod + + def _get_module(self, fullname): + return self.known_modules[self.name + "." 
+ fullname] + + def find_module(self, fullname, path=None): + if fullname in self.known_modules: + return self + return None + + def __get_module(self, fullname): + try: + return self.known_modules[fullname] + except KeyError: + raise ImportError("This loader does not know module " + fullname) + + def load_module(self, fullname): + try: + # in case of a reload + return sys.modules[fullname] + except KeyError: + pass + mod = self.__get_module(fullname) + if isinstance(mod, MovedModule): + mod = mod._resolve() + else: + mod.__loader__ = self + sys.modules[fullname] = mod + return mod + + def is_package(self, fullname): + """ + Return true, if the named module is a package. + + We need this method to get correct spec objects with + Python 3.4 (see PEP451) + """ + return hasattr(self.__get_module(fullname), "__path__") + + def get_code(self, fullname): + """Return None + + Required, if is_package is implemented""" + self.__get_module(fullname) # eventually raises ImportError + return None + get_source = get_code # same as get_code + +_importer = _SixMetaPathImporter(__name__) + + +class _MovedItems(_LazyModule): + + """Lazy loading of moved objects""" + __path__ = [] # mark as package + + +_moved_attributes = [ + MovedAttribute("cStringIO", "cStringIO", "io", "StringIO"), + MovedAttribute("filter", "itertools", "builtins", "ifilter", "filter"), + MovedAttribute("filterfalse", "itertools", "itertools", "ifilterfalse", "filterfalse"), + MovedAttribute("input", "__builtin__", "builtins", "raw_input", "input"), + MovedAttribute("intern", "__builtin__", "sys"), + MovedAttribute("map", "itertools", "builtins", "imap", "map"), + MovedAttribute("getcwd", "os", "os", "getcwdu", "getcwd"), + MovedAttribute("getcwdb", "os", "os", "getcwd", "getcwdb"), + MovedAttribute("getoutput", "commands", "subprocess"), + MovedAttribute("range", "__builtin__", "builtins", "xrange", "range"), + MovedAttribute("reload_module", "__builtin__", "importlib" if PY34 else "imp", "reload"), + 
MovedAttribute("reduce", "__builtin__", "functools"), + MovedAttribute("shlex_quote", "pipes", "shlex", "quote"), + MovedAttribute("StringIO", "StringIO", "io"), + MovedAttribute("UserDict", "UserDict", "collections"), + MovedAttribute("UserList", "UserList", "collections"), + MovedAttribute("UserString", "UserString", "collections"), + MovedAttribute("xrange", "__builtin__", "builtins", "xrange", "range"), + MovedAttribute("zip", "itertools", "builtins", "izip", "zip"), + MovedAttribute("zip_longest", "itertools", "itertools", "izip_longest", "zip_longest"), + MovedModule("builtins", "__builtin__"), + MovedModule("configparser", "ConfigParser"), + MovedModule("copyreg", "copy_reg"), + MovedModule("dbm_gnu", "gdbm", "dbm.gnu"), + MovedModule("_dummy_thread", "dummy_thread", "_dummy_thread"), + MovedModule("http_cookiejar", "cookielib", "http.cookiejar"), + MovedModule("http_cookies", "Cookie", "http.cookies"), + MovedModule("html_entities", "htmlentitydefs", "html.entities"), + MovedModule("html_parser", "HTMLParser", "html.parser"), + MovedModule("http_client", "httplib", "http.client"), + MovedModule("email_mime_base", "email.MIMEBase", "email.mime.base"), + MovedModule("email_mime_image", "email.MIMEImage", "email.mime.image"), + MovedModule("email_mime_multipart", "email.MIMEMultipart", "email.mime.multipart"), + MovedModule("email_mime_nonmultipart", "email.MIMENonMultipart", "email.mime.nonmultipart"), + MovedModule("email_mime_text", "email.MIMEText", "email.mime.text"), + MovedModule("BaseHTTPServer", "BaseHTTPServer", "http.server"), + MovedModule("CGIHTTPServer", "CGIHTTPServer", "http.server"), + MovedModule("SimpleHTTPServer", "SimpleHTTPServer", "http.server"), + MovedModule("cPickle", "cPickle", "pickle"), + MovedModule("queue", "Queue"), + MovedModule("reprlib", "repr"), + MovedModule("socketserver", "SocketServer"), + MovedModule("_thread", "thread", "_thread"), + MovedModule("tkinter", "Tkinter"), + MovedModule("tkinter_dialog", "Dialog", 
"tkinter.dialog"), + MovedModule("tkinter_filedialog", "FileDialog", "tkinter.filedialog"), + MovedModule("tkinter_scrolledtext", "ScrolledText", "tkinter.scrolledtext"), + MovedModule("tkinter_simpledialog", "SimpleDialog", "tkinter.simpledialog"), + MovedModule("tkinter_tix", "Tix", "tkinter.tix"), + MovedModule("tkinter_ttk", "ttk", "tkinter.ttk"), + MovedModule("tkinter_constants", "Tkconstants", "tkinter.constants"), + MovedModule("tkinter_dnd", "Tkdnd", "tkinter.dnd"), + MovedModule("tkinter_colorchooser", "tkColorChooser", + "tkinter.colorchooser"), + MovedModule("tkinter_commondialog", "tkCommonDialog", + "tkinter.commondialog"), + MovedModule("tkinter_tkfiledialog", "tkFileDialog", "tkinter.filedialog"), + MovedModule("tkinter_font", "tkFont", "tkinter.font"), + MovedModule("tkinter_messagebox", "tkMessageBox", "tkinter.messagebox"), + MovedModule("tkinter_tksimpledialog", "tkSimpleDialog", + "tkinter.simpledialog"), + MovedModule("urllib_parse", __name__ + ".moves.urllib_parse", "urllib.parse"), + MovedModule("urllib_error", __name__ + ".moves.urllib_error", "urllib.error"), + MovedModule("urllib", __name__ + ".moves.urllib", __name__ + ".moves.urllib"), + MovedModule("urllib_robotparser", "robotparser", "urllib.robotparser"), + MovedModule("xmlrpc_client", "xmlrpclib", "xmlrpc.client"), + MovedModule("xmlrpc_server", "SimpleXMLRPCServer", "xmlrpc.server"), +] +# Add windows specific modules. +if sys.platform == "win32": + _moved_attributes += [ + MovedModule("winreg", "_winreg"), + ] + +for attr in _moved_attributes: + setattr(_MovedItems, attr.name, attr) + if isinstance(attr, MovedModule): + _importer._add_module(attr, "moves." 
+ attr.name) +del attr + +_MovedItems._moved_attributes = _moved_attributes + +moves = _MovedItems(__name__ + ".moves") +_importer._add_module(moves, "moves") + + +class Module_six_moves_urllib_parse(_LazyModule): + + """Lazy loading of moved objects in six.moves.urllib_parse""" + + +_urllib_parse_moved_attributes = [ + MovedAttribute("ParseResult", "urlparse", "urllib.parse"), + MovedAttribute("SplitResult", "urlparse", "urllib.parse"), + MovedAttribute("parse_qs", "urlparse", "urllib.parse"), + MovedAttribute("parse_qsl", "urlparse", "urllib.parse"), + MovedAttribute("urldefrag", "urlparse", "urllib.parse"), + MovedAttribute("urljoin", "urlparse", "urllib.parse"), + MovedAttribute("urlparse", "urlparse", "urllib.parse"), + MovedAttribute("urlsplit", "urlparse", "urllib.parse"), + MovedAttribute("urlunparse", "urlparse", "urllib.parse"), + MovedAttribute("urlunsplit", "urlparse", "urllib.parse"), + MovedAttribute("quote", "urllib", "urllib.parse"), + MovedAttribute("quote_plus", "urllib", "urllib.parse"), + MovedAttribute("unquote", "urllib", "urllib.parse"), + MovedAttribute("unquote_plus", "urllib", "urllib.parse"), + MovedAttribute("unquote_to_bytes", "urllib", "urllib.parse", "unquote", "unquote_to_bytes"), + MovedAttribute("urlencode", "urllib", "urllib.parse"), + MovedAttribute("splitquery", "urllib", "urllib.parse"), + MovedAttribute("splittag", "urllib", "urllib.parse"), + MovedAttribute("splituser", "urllib", "urllib.parse"), + MovedAttribute("splitvalue", "urllib", "urllib.parse"), + MovedAttribute("uses_fragment", "urlparse", "urllib.parse"), + MovedAttribute("uses_netloc", "urlparse", "urllib.parse"), + MovedAttribute("uses_params", "urlparse", "urllib.parse"), + MovedAttribute("uses_query", "urlparse", "urllib.parse"), + MovedAttribute("uses_relative", "urlparse", "urllib.parse"), +] +for attr in _urllib_parse_moved_attributes: + setattr(Module_six_moves_urllib_parse, attr.name, attr) +del attr + +Module_six_moves_urllib_parse._moved_attributes = 
_urllib_parse_moved_attributes + +_importer._add_module(Module_six_moves_urllib_parse(__name__ + ".moves.urllib_parse"), + "moves.urllib_parse", "moves.urllib.parse") + + +class Module_six_moves_urllib_error(_LazyModule): + + """Lazy loading of moved objects in six.moves.urllib_error""" + + +_urllib_error_moved_attributes = [ + MovedAttribute("URLError", "urllib2", "urllib.error"), + MovedAttribute("HTTPError", "urllib2", "urllib.error"), + MovedAttribute("ContentTooShortError", "urllib", "urllib.error"), +] +for attr in _urllib_error_moved_attributes: + setattr(Module_six_moves_urllib_error, attr.name, attr) +del attr + +Module_six_moves_urllib_error._moved_attributes = _urllib_error_moved_attributes + +_importer._add_module(Module_six_moves_urllib_error(__name__ + ".moves.urllib.error"), + "moves.urllib_error", "moves.urllib.error") + + +class Module_six_moves_urllib_request(_LazyModule): + + """Lazy loading of moved objects in six.moves.urllib_request""" + + +_urllib_request_moved_attributes = [ + MovedAttribute("urlopen", "urllib2", "urllib.request"), + MovedAttribute("install_opener", "urllib2", "urllib.request"), + MovedAttribute("build_opener", "urllib2", "urllib.request"), + MovedAttribute("pathname2url", "urllib", "urllib.request"), + MovedAttribute("url2pathname", "urllib", "urllib.request"), + MovedAttribute("getproxies", "urllib", "urllib.request"), + MovedAttribute("Request", "urllib2", "urllib.request"), + MovedAttribute("OpenerDirector", "urllib2", "urllib.request"), + MovedAttribute("HTTPDefaultErrorHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPRedirectHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPCookieProcessor", "urllib2", "urllib.request"), + MovedAttribute("ProxyHandler", "urllib2", "urllib.request"), + MovedAttribute("BaseHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPPasswordMgr", "urllib2", "urllib.request"), + MovedAttribute("HTTPPasswordMgrWithDefaultRealm", "urllib2", "urllib.request"), + 
MovedAttribute("AbstractBasicAuthHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPBasicAuthHandler", "urllib2", "urllib.request"), + MovedAttribute("ProxyBasicAuthHandler", "urllib2", "urllib.request"), + MovedAttribute("AbstractDigestAuthHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPDigestAuthHandler", "urllib2", "urllib.request"), + MovedAttribute("ProxyDigestAuthHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPSHandler", "urllib2", "urllib.request"), + MovedAttribute("FileHandler", "urllib2", "urllib.request"), + MovedAttribute("FTPHandler", "urllib2", "urllib.request"), + MovedAttribute("CacheFTPHandler", "urllib2", "urllib.request"), + MovedAttribute("UnknownHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPErrorProcessor", "urllib2", "urllib.request"), + MovedAttribute("urlretrieve", "urllib", "urllib.request"), + MovedAttribute("urlcleanup", "urllib", "urllib.request"), + MovedAttribute("URLopener", "urllib", "urllib.request"), + MovedAttribute("FancyURLopener", "urllib", "urllib.request"), + MovedAttribute("proxy_bypass", "urllib", "urllib.request"), + MovedAttribute("parse_http_list", "urllib2", "urllib.request"), + MovedAttribute("parse_keqv_list", "urllib2", "urllib.request"), +] +for attr in _urllib_request_moved_attributes: + setattr(Module_six_moves_urllib_request, attr.name, attr) +del attr + +Module_six_moves_urllib_request._moved_attributes = _urllib_request_moved_attributes + +_importer._add_module(Module_six_moves_urllib_request(__name__ + ".moves.urllib.request"), + "moves.urllib_request", "moves.urllib.request") + + +class Module_six_moves_urllib_response(_LazyModule): + + """Lazy loading of moved objects in six.moves.urllib_response""" + + +_urllib_response_moved_attributes = [ + MovedAttribute("addbase", "urllib", "urllib.response"), + MovedAttribute("addclosehook", "urllib", "urllib.response"), + MovedAttribute("addinfo", 
"urllib", "urllib.response"), + MovedAttribute("addinfourl", "urllib", "urllib.response"), +] +for attr in _urllib_response_moved_attributes: + setattr(Module_six_moves_urllib_response, attr.name, attr) +del attr + +Module_six_moves_urllib_response._moved_attributes = _urllib_response_moved_attributes + +_importer._add_module(Module_six_moves_urllib_response(__name__ + ".moves.urllib.response"), + "moves.urllib_response", "moves.urllib.response") + + +class Module_six_moves_urllib_robotparser(_LazyModule): + + """Lazy loading of moved objects in six.moves.urllib_robotparser""" + + +_urllib_robotparser_moved_attributes = [ + MovedAttribute("RobotFileParser", "robotparser", "urllib.robotparser"), +] +for attr in _urllib_robotparser_moved_attributes: + setattr(Module_six_moves_urllib_robotparser, attr.name, attr) +del attr + +Module_six_moves_urllib_robotparser._moved_attributes = _urllib_robotparser_moved_attributes + +_importer._add_module(Module_six_moves_urllib_robotparser(__name__ + ".moves.urllib.robotparser"), + "moves.urllib_robotparser", "moves.urllib.robotparser") + + +class Module_six_moves_urllib(types.ModuleType): + + """Create a six.moves.urllib namespace that resembles the Python 3 namespace""" + __path__ = [] # mark as package + parse = _importer._get_module("moves.urllib_parse") + error = _importer._get_module("moves.urllib_error") + request = _importer._get_module("moves.urllib_request") + response = _importer._get_module("moves.urllib_response") + robotparser = _importer._get_module("moves.urllib_robotparser") + + def __dir__(self): + return ['parse', 'error', 'request', 'response', 'robotparser'] + +_importer._add_module(Module_six_moves_urllib(__name__ + ".moves.urllib"), + "moves.urllib") + + +def add_move(move): + """Add an item to six.moves.""" + setattr(_MovedItems, move.name, move) + + +def remove_move(name): + """Remove item from six.moves.""" + try: + delattr(_MovedItems, name) + except AttributeError: + try: + del moves.__dict__[name] + 
except KeyError: + raise AttributeError("no such move, %r" % (name,)) + + +if PY3: + _meth_func = "__func__" + _meth_self = "__self__" + + _func_closure = "__closure__" + _func_code = "__code__" + _func_defaults = "__defaults__" + _func_globals = "__globals__" +else: + _meth_func = "im_func" + _meth_self = "im_self" + + _func_closure = "func_closure" + _func_code = "func_code" + _func_defaults = "func_defaults" + _func_globals = "func_globals" + + +try: + advance_iterator = next +except NameError: + def advance_iterator(it): + return it.next() +next = advance_iterator + + +try: + callable = callable +except NameError: + def callable(obj): + return any("__call__" in klass.__dict__ for klass in type(obj).__mro__) + + +if PY3: + def get_unbound_function(unbound): + return unbound + + create_bound_method = types.MethodType + + def create_unbound_method(func, cls): + return func + + Iterator = object +else: + def get_unbound_function(unbound): + return unbound.im_func + + def create_bound_method(func, obj): + return types.MethodType(func, obj, obj.__class__) + + def create_unbound_method(func, cls): + return types.MethodType(func, None, cls) + + class Iterator(object): + + def next(self): + return type(self).__next__(self) + + callable = callable +_add_doc(get_unbound_function, + """Get the function out of a possibly unbound function""") + + +get_method_function = operator.attrgetter(_meth_func) +get_method_self = operator.attrgetter(_meth_self) +get_function_closure = operator.attrgetter(_func_closure) +get_function_code = operator.attrgetter(_func_code) +get_function_defaults = operator.attrgetter(_func_defaults) +get_function_globals = operator.attrgetter(_func_globals) + + +if PY3: + def iterkeys(d, **kw): + return iter(d.keys(**kw)) + + def itervalues(d, **kw): + return iter(d.values(**kw)) + + def iteritems(d, **kw): + return iter(d.items(**kw)) + + def iterlists(d, **kw): + return iter(d.lists(**kw)) + + viewkeys = operator.methodcaller("keys") + + viewvalues = 
operator.methodcaller("values") + + viewitems = operator.methodcaller("items") +else: + def iterkeys(d, **kw): + return d.iterkeys(**kw) + + def itervalues(d, **kw): + return d.itervalues(**kw) + + def iteritems(d, **kw): + return d.iteritems(**kw) + + def iterlists(d, **kw): + return d.iterlists(**kw) + + viewkeys = operator.methodcaller("viewkeys") + + viewvalues = operator.methodcaller("viewvalues") + + viewitems = operator.methodcaller("viewitems") + +_add_doc(iterkeys, "Return an iterator over the keys of a dictionary.") +_add_doc(itervalues, "Return an iterator over the values of a dictionary.") +_add_doc(iteritems, + "Return an iterator over the (key, value) pairs of a dictionary.") +_add_doc(iterlists, + "Return an iterator over the (key, [values]) pairs of a dictionary.") + + +if PY3: + def b(s): + return s.encode("latin-1") + + def u(s): + return s + unichr = chr + import struct + int2byte = struct.Struct(">B").pack + del struct + byte2int = operator.itemgetter(0) + indexbytes = operator.getitem + iterbytes = iter + import io + StringIO = io.StringIO + BytesIO = io.BytesIO + _assertCountEqual = "assertCountEqual" + if sys.version_info[1] <= 1: + _assertRaisesRegex = "assertRaisesRegexp" + _assertRegex = "assertRegexpMatches" + else: + _assertRaisesRegex = "assertRaisesRegex" + _assertRegex = "assertRegex" +else: + def b(s): + return s + # Workaround for standalone backslash + + def u(s): + return unicode(s.replace(r'\\', r'\\\\'), "unicode_escape") + unichr = unichr + int2byte = chr + + def byte2int(bs): + return ord(bs[0]) + + def indexbytes(buf, i): + return ord(buf[i]) + iterbytes = functools.partial(itertools.imap, ord) + import StringIO + StringIO = BytesIO = StringIO.StringIO + _assertCountEqual = "assertItemsEqual" + _assertRaisesRegex = "assertRaisesRegexp" + _assertRegex = "assertRegexpMatches" +_add_doc(b, """Byte literal""") +_add_doc(u, """Text literal""") + + +def assertCountEqual(self, *args, **kwargs): + return getattr(self, 
_assertCountEqual)(*args, **kwargs) + + +def assertRaisesRegex(self, *args, **kwargs): + return getattr(self, _assertRaisesRegex)(*args, **kwargs) + + +def assertRegex(self, *args, **kwargs): + return getattr(self, _assertRegex)(*args, **kwargs) + + +if PY3: + exec_ = getattr(moves.builtins, "exec") + + def reraise(tp, value, tb=None): + try: + if value is None: + value = tp() + if value.__traceback__ is not tb: + raise value.with_traceback(tb) + raise value + finally: + value = None + tb = None + +else: + def exec_(_code_, _globs_=None, _locs_=None): + """Execute code in a namespace.""" + if _globs_ is None: + frame = sys._getframe(1) + _globs_ = frame.f_globals + if _locs_ is None: + _locs_ = frame.f_locals + del frame + elif _locs_ is None: + _locs_ = _globs_ + exec("""exec _code_ in _globs_, _locs_""") + + exec_("""def reraise(tp, value, tb=None): + try: + raise tp, value, tb + finally: + tb = None +""") + + +if sys.version_info[:2] == (3, 2): + exec_("""def raise_from(value, from_value): + try: + if from_value is None: + raise value + raise value from from_value + finally: + value = None +""") +elif sys.version_info[:2] > (3, 2): + exec_("""def raise_from(value, from_value): + try: + raise value from from_value + finally: + value = None +""") +else: + def raise_from(value, from_value): + raise value + + +print_ = getattr(moves.builtins, "print", None) +if print_ is None: + def print_(*args, **kwargs): + """The new-style print function for Python 2.4 and 2.5.""" + fp = kwargs.pop("file", sys.stdout) + if fp is None: + return + + def write(data): + if not isinstance(data, basestring): + data = str(data) + # If the file has an encoding, encode unicode with it. 
+ if (isinstance(fp, file) and + isinstance(data, unicode) and + fp.encoding is not None): + errors = getattr(fp, "errors", None) + if errors is None: + errors = "strict" + data = data.encode(fp.encoding, errors) + fp.write(data) + want_unicode = False + sep = kwargs.pop("sep", None) + if sep is not None: + if isinstance(sep, unicode): + want_unicode = True + elif not isinstance(sep, str): + raise TypeError("sep must be None or a string") + end = kwargs.pop("end", None) + if end is not None: + if isinstance(end, unicode): + want_unicode = True + elif not isinstance(end, str): + raise TypeError("end must be None or a string") + if kwargs: + raise TypeError("invalid keyword arguments to print()") + if not want_unicode: + for arg in args: + if isinstance(arg, unicode): + want_unicode = True + break + if want_unicode: + newline = unicode("\n") + space = unicode(" ") + else: + newline = "\n" + space = " " + if sep is None: + sep = space + if end is None: + end = newline + for i, arg in enumerate(args): + if i: + write(sep) + write(arg) + write(end) +if sys.version_info[:2] < (3, 3): + _print = print_ + + def print_(*args, **kwargs): + fp = kwargs.get("file", sys.stdout) + flush = kwargs.pop("flush", False) + _print(*args, **kwargs) + if flush and fp is not None: + fp.flush() + +_add_doc(reraise, """Reraise an exception.""") + +if sys.version_info[0:2] < (3, 4): + def wraps(wrapped, assigned=functools.WRAPPER_ASSIGNMENTS, + updated=functools.WRAPPER_UPDATES): + def wrapper(f): + f = functools.wraps(wrapped, assigned, updated)(f) + f.__wrapped__ = wrapped + return f + return wrapper +else: + wraps = functools.wraps + + +def with_metaclass(meta, *bases): + """Create a base class with a metaclass.""" + # This requires a bit of explanation: the basic idea is to make a dummy + # metaclass for one level of class instantiation that replaces itself with + # the actual metaclass. 
+ class metaclass(type): + + def __new__(cls, name, this_bases, d): + return meta(name, bases, d) + + @classmethod + def __prepare__(cls, name, this_bases): + return meta.__prepare__(name, bases) + return type.__new__(metaclass, 'temporary_class', (), {}) + + +def add_metaclass(metaclass): + """Class decorator for creating a class with a metaclass.""" + def wrapper(cls): + orig_vars = cls.__dict__.copy() + slots = orig_vars.get('__slots__') + if slots is not None: + if isinstance(slots, str): + slots = [slots] + for slots_var in slots: + orig_vars.pop(slots_var) + orig_vars.pop('__dict__', None) + orig_vars.pop('__weakref__', None) + return metaclass(cls.__name__, cls.__bases__, orig_vars) + return wrapper + + +def ensure_binary(s, encoding='utf-8', errors='strict'): + """Coerce **s** to six.binary_type. + + For Python 2: + - `unicode` -> encoded to `str` + - `str` -> `str` + + For Python 3: + - `str` -> encoded to `bytes` + - `bytes` -> `bytes` + """ + if isinstance(s, text_type): + return s.encode(encoding, errors) + elif isinstance(s, binary_type): + return s + else: + raise TypeError("not expecting type '%s'" % type(s)) + + +def ensure_str(s, encoding='utf-8', errors='strict'): + """Coerce *s* to `str`. + + For Python 2: + - `unicode` -> encoded to `str` + - `str` -> `str` + + For Python 3: + - `str` -> `str` + - `bytes` -> decoded to `str` + """ + if not isinstance(s, (text_type, binary_type)): + raise TypeError("not expecting type '%s'" % type(s)) + if PY2 and isinstance(s, text_type): + s = s.encode(encoding, errors) + elif PY3 and isinstance(s, binary_type): + s = s.decode(encoding, errors) + return s + + +def ensure_text(s, encoding='utf-8', errors='strict'): + """Coerce *s* to six.text_type. 
+ + For Python 2: + - `unicode` -> `unicode` + - `str` -> `unicode` + + For Python 3: + - `str` -> `str` + - `bytes` -> decoded to `str` + """ + if isinstance(s, binary_type): + return s.decode(encoding, errors) + elif isinstance(s, text_type): + return s + else: + raise TypeError("not expecting type '%s'" % type(s)) + + + +def python_2_unicode_compatible(klass): + """ + A decorator that defines __unicode__ and __str__ methods under Python 2. + Under Python 3 it does nothing. + + To support Python 2 and 3 with a single code base, define a __str__ method + returning text and apply this decorator to the class. + """ + if PY2: + if '__str__' not in klass.__dict__: + raise ValueError("@python_2_unicode_compatible cannot be applied " + "to %s because it doesn't define __str__()." % + klass.__name__) + klass.__unicode__ = klass.__str__ + klass.__str__ = lambda self: self.__unicode__().encode('utf-8') + return klass + + +# Complete the moves implementation. +# This code is at the end of this module to speed up module loading. +# Turn this module into a package. +__path__ = [] # required for PEP 302 and PEP 451 +__package__ = __name__ # see PEP 366 @ReservedAssignment +if globals().get("__spec__") is not None: + __spec__.submodule_search_locations = [] # PEP 451 @UndefinedVariable +# Remove other six meta path importers, since they cause problems. This can +# happen if six is removed from sys.modules and then reloaded. (Setuptools does +# this for some reason.) +if sys.meta_path: + for i, importer in enumerate(sys.meta_path): + # Here's some real nastiness: Another "instance" of the six module might + # be floating around. Therefore, we can't use isinstance() to check for + # the six meta path importer, since the other six instance will have + # inserted an importer with different class. 
+ if (type(importer).__name__ == "_SixMetaPathImporter" and + importer.name == __name__): + del sys.meta_path[i] + break + del i, importer +# Finally, add the importer to the meta path import hook. +sys.meta_path.append(_importer) diff --git a/fanficfare/story.py b/fanficfare/story.py index 95bd761e..2dc6c33b 100644 --- a/fanficfare/story.py +++ b/fanficfare/story.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2016 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,10 @@ # limitations under the License. # -import os, re +from __future__ import absolute_import +import os, re, sys import copy from collections import defaultdict -import urlparse import string import json import datetime @@ -26,14 +26,20 @@ from math import floor from functools import partial import logging logger = logging.getLogger(__name__) -import urlparse as up + +# py2 vs py3 transition +from . import six +from .six.moves.urllib.parse import (urlparse, urlunparse) +from .six import text_type as unicode +from .six import string_types as basestring +from .six.moves import map import bs4 -import exceptions -from htmlcleanup import conditionalRemoveEntities, removeEntities, removeAllEntities -from configurable import Configurable, re_compile -from htmlheuristics import was_run_marker +from . 
import exceptions +from .htmlcleanup import conditionalRemoveEntities, removeEntities, removeAllEntities +from .configurable import Configurable, re_compile +from .htmlheuristics import was_run_marker SPACE_REPLACE=u'\s' SPLIT_META=u'\,' @@ -51,12 +57,13 @@ imagetypes = { try: from calibre.utils.magick import Image - from StringIO import StringIO + from .six import BytesIO from gif import GifInfo, CHECK_IS_ANIMATED convtype = {'jpg':'JPG', 'png':'PNG'} def convert_image(url,data,sizes,grayscale, removetrans,imgtype="jpg",background='#ffffff'): + # logger.debug("calibre convert_image called") if url.lower().endswith('.svg'): raise exceptions.RejectImage("Calibre image processing chokes on SVG images.") @@ -68,7 +75,7 @@ try: nwidth, nheight = sizes scaled, nwidth, nheight = fit_image(owidth, oheight, nwidth, nheight) - if normalize_format_name(img.format)=="gif" and GifInfo(StringIO(data),CHECK_IS_ANIMATED).frameCount > 1: + if normalize_format_name(img.format)=="gif" and GifInfo(BytesIO(data),CHECK_IS_ANIMATED).frameCount > 1: raise exceptions.RejectImage("Animated gifs come out poorly--not going to use it.") if scaled: @@ -97,15 +104,16 @@ try: except: - # No calibre routines, try for PIL for CLI. + # No calibre routines, try for Pillow for CLI. try: - import Image - from StringIO import StringIO + from PIL import Image + from .six import BytesIO convtype = {'jpg':'JPEG', 'png':'PNG'} def convert_image(url,data,sizes,grayscale, removetrans,imgtype="jpg",background='#ffffff'): + # logger.debug("Pillow convert_image called") export = False - img = Image.open(StringIO(data)) + img = Image.open(BytesIO(data)) owidth, oheight = img.size nwidth, nheight = sizes @@ -132,7 +140,7 @@ except: export = True if export: - outsio = StringIO() + outsio = BytesIO() img.save(outsio,convtype[imgtype]) return (outsio.getvalue(),imgtype,imagetypes[imgtype]) else: @@ -143,11 +151,12 @@ except: # No calibre or PIL, simple pass through with mimetype. 
def convert_image(url,data,sizes,grayscale, removetrans,imgtype="jpg",background='#ffffff'): + # logger.debug("NO convert_image called") return no_convert_image(url,data) ## also used for explicit no image processing. def no_convert_image(url,data): - parsedUrl = up.urlparse(url) + parsedUrl = urlparse(url) ext=parsedUrl.path[parsedUrl.path.rfind('.')+1:].lower() @@ -332,7 +341,7 @@ class InExMatch: (self.keys,self.match) = line.split("!=") self.match = self.match.replace(SPACE_REPLACE,' ') self.negate = True - self.keys = map( lambda x: x.strip(), self.keys.split(",") ) + self.keys = [x.strip() for x in self.keys.split(",")] # For conditional, only one key def is_key(self,key): @@ -406,7 +415,7 @@ def make_replacements(replace): if "=>" in line: parts = line.split("=>") if len(parts) > 2: - metakeys = map( lambda x: x.strip(), parts[0].split(",") ) + metakeys = [x.strip() for x in parts[0].split(",")] (regexp,replacement)=parts[1:] else: (regexp,replacement)=parts @@ -436,6 +445,7 @@ class Story(Configurable): self.metadata = {'version':os.environ['CURRENT_VERSION_ID']} except: self.metadata = {'version':'unknown'} + self.metadata['python_version']=sys.version self.replacements = [] self.in_ex_cludes = {} self.chapters = [] # chapters will be dict containing(url,title,html,etc) @@ -608,8 +618,8 @@ class Story(Configurable): raise for val in retlist: - retlist = map(partial(self.do_in_ex_clude,'include_metadata_post',key=key),retlist) - retlist = map(partial(self.do_in_ex_clude,'exclude_metadata_post',key=key),retlist) + retlist = list(map(partial(self.do_in_ex_clude,'include_metadata_post',key=key),retlist)) + retlist = list(map(partial(self.do_in_ex_clude,'exclude_metadata_post',key=key),retlist)) if return_list: return retlist @@ -619,7 +629,7 @@ class Story(Configurable): # for saving an html-ified copy of metadata. 
def dump_html_metadata(self): lines=[] - for k,v in sorted(self.metadata.iteritems()): + for k,v in sorted(six.iteritems(self.metadata)): classes=['metadata'] if isinstance(v, (datetime.date, datetime.datetime, datetime.time)): classes.append("datetime") @@ -689,7 +699,7 @@ class Story(Configurable): return value def getMetadataRaw(self,key): - if self.isValidMetaEntry(key) and self.metadata.has_key(key): + if self.isValidMetaEntry(key) and key in self.metadata: return self.metadata[key] def getMetadata(self, key, @@ -711,7 +721,7 @@ class Story(Configurable): value = self.join_list(key,self.getList(key, removeallentities, doreplacements=True)) if doreplacements: value = self.doReplacements(value,key+"_LIST") - elif self.metadata.has_key(key): + elif key in self.metadata: value = self.metadata[key] if value: if key in ["numWords","numChapters"]+self.getConfigList("comma_entries",[]): @@ -868,7 +878,7 @@ class Story(Configurable): def isList(self,listname): 'Everything set with an include_in_* is considered a list.' return self.isListType(listname) or \ - ( self.isValidMetaEntry(listname) and self.metadata.has_key(listname) \ + ( self.isValidMetaEntry(listname) and listname in self.metadata \ and isinstance(self.metadata[listname],list) ) def getList(self,listname, @@ -948,9 +958,9 @@ class Story(Configurable): retlist = newretlist if removeallentities: - retlist = map(removeAllEntities,retlist) + retlist = list(map(removeAllEntities,retlist)) - retlist = filter( lambda x : x!=None and x!='' ,retlist) + retlist = [x for x in retlist if x!=None and x!=''] if listname == 'genre' and self.getConfig('add_genre_when_multi_category') and len(self.getList('category', removeallentities=False, @@ -984,7 +994,7 @@ class Story(Configurable): tags_list = self.getConfigList("include_subject_tags") + self.getConfigList("extra_subject_tags") # metadata all go into dc:subject tags, but only if they are configured. 
- for (name,value) in self.getAllMetadata(removeallentities=removeallentities,keeplists=True).iteritems(): + for (name,value) in six.iteritems(self.getAllMetadata(removeallentities=removeallentities,keeplists=True)): if name+'.SPLIT' in tags_list: flist=[] if isinstance(value,list): @@ -1011,7 +1021,6 @@ class Story(Configurable): def addChapter(self, chap, newchap=False): # logger.debug("addChapter(%s,%s)"%(chap,newchap)) chapter = defaultdict(unicode,chap) # default unknown to empty string - chapter['title'] = removeEntities(chapter['title']) chapter['html'] = removeEntities(chapter['html']) if self.getConfig('strip_chapter_numbers') and \ self.getConfig('chapter_title_strip_pattern'): @@ -1029,7 +1038,7 @@ class Story(Configurable): self.chapters.append(chapter) def getChapters(self,fortoc=False): - "Chapters will be dicts" + "Chapters will be defaultdicts(unicode)" retval = [] ## only add numbers if more than one chapter. Ditto (new) marks. @@ -1113,7 +1122,7 @@ class Story(Configurable): else: values = self.get_filename_safe_metadata() - return string.Template(template).substitute(values).encode('utf8') + return string.Template(template).substitute(values) #.encode('utf8') # pass fetch in from adapter in case we need the cookies collected # as well as it's a base_story class method. 
@@ -1139,15 +1148,15 @@ class Story(Configurable): if url.startswith("http") or url.startswith("file") or parenturl == None: imgurl = url else: - parsedUrl = urlparse.urlparse(parenturl) + parsedUrl = urlparse(parenturl) if url.startswith("//") : - imgurl = urlparse.urlunparse( + imgurl = urlunparse( (parsedUrl.scheme, '', url, '','','')) elif url.startswith("/") : - imgurl = urlparse.urlunparse( + imgurl = urlunparse( (parsedUrl.scheme, parsedUrl.netloc, url, @@ -1158,7 +1167,7 @@ class Story(Configurable): toppath = parsedUrl.path else: toppath = parsedUrl.path[:parsedUrl.path.rindex('/')+1] - imgurl = urlparse.urlunparse( + imgurl = urlunparse( (parsedUrl.scheme, parsedUrl.netloc, toppath + url, @@ -1177,14 +1186,14 @@ class Story(Configurable): if imgurl.endswith('failedtoload'): return ("failedtoload","failedtoload") - parsedUrl = urlparse.urlparse(imgurl) + parsedUrl = urlparse(imgurl) if self.getConfig('no_image_processing'): (data,ext,mime) = no_convert_image(imgurl, fetch(imgurl,referer=parenturl)) else: try: - sizes = [ int(x) for x in self.getConfigList('image_max_size') ] - except Exception, e: + sizes = [ int(x) for x in self.getConfigList('image_max_size',['580', '725']) ] + except Exception as e: raise exceptions.FailedToDownload("Failed to parse image_max_size from personal.ini:%s\nException: %s"%(self.getConfigList('image_max_size'),e)) grayscale = self.getConfig('grayscale_images') imgtype = self.getConfig('convert_images_to') @@ -1201,7 +1210,7 @@ class Story(Configurable): removetrans, imgtype, background="#"+self.getConfig('background_color')) - except Exception, e: + except Exception as e: logger.info("Failed to load or convert image, \nparent:%s\nskipping:%s\nException: %s"%(parenturl,imgurl,e)) return ("failedtoload","failedtoload") @@ -1250,7 +1259,7 @@ class Story(Configurable): def getImgUrls(self): retlist = [] for i, url in enumerate(self.imgurls): - #parsedUrl = urlparse.urlparse(url) + #parsedUrl = urlparse(url) 
retlist.append(self.imgtuples[i]) return retlist diff --git a/fanficfare/translit.py b/fanficfare/translit.py index 0efdc010..ec05d738 100644 --- a/fanficfare/translit.py +++ b/fanficfare/translit.py @@ -1,5 +1,11 @@ #-*-coding:utf-8-*- # Code taken from http://python.su/forum/viewtopic.php?pid=66946 +from __future__ import absolute_import + +# py2 vs py3 transition +from .six import text_type as unicode +from .six import ensure_text + import unicodedata def is_syllable(letter): syllables = ("A", "E", "I", "O", "U", "a", "e", "i", "o", "u") @@ -37,7 +43,7 @@ def romanize(letter): return func(filter(is_consonant, unid)) def translit(text): output = "" - for letter in text: + for letter in ensure_text(text): output += romanize(letter) return output #def main(): diff --git a/fanficfare/writers/__init__.py b/fanficfare/writers/__init__.py index c0e3863c..f1cb1325 100644 --- a/fanficfare/writers/__init__.py +++ b/fanficfare/writers/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,16 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -## This could (should?) use a dynamic loader like adapters, but for -## now, it's static, since there's so few of them. +from __future__ import absolute_import from ..exceptions import FailedToDownload -from writer_html import HTMLWriter -from writer_txt import TextWriter -from writer_epub import EpubWriter -from writer_mobi import MobiWriter +## This could (should?) use a dynamic loader like adapters, but for +## now, it's static, since there's so few of them. 
+from .writer_html import HTMLWriter +from .writer_txt import TextWriter +from .writer_epub import EpubWriter +from .writer_mobi import MobiWriter def getWriter(type,config,story): if type == "html": diff --git a/fanficfare/writers/base_writer.py b/fanficfare/writers/base_writer.py index bc3aeb34..eb69e1aa 100644 --- a/fanficfare/writers/base_writer.py +++ b/fanficfare/writers/base_writer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,16 +14,23 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from __future__ import absolute_import import re import os.path import datetime import string -import StringIO import zipfile from zipfile import ZipFile, ZIP_DEFLATED import logging +# py2 vs py3 transition +from .. import six +from ..six import text_type as unicode +from ..six import ensure_text +from ..six import ensure_binary +from ..six import BytesIO # StringIO under py2 + from ..configurable import Configurable from ..htmlcleanup import removeEntities, removeAllEntities, stripHTML @@ -41,10 +48,10 @@ class BaseStoryWriter(Configurable): def __init__(self, configuration, adapter): Configurable.__init__(self, configuration) - + self.adapter = adapter self.story = adapter.getStoryMetadataOnly() # only cache the metadata initially. 
- + self.story.setMetadata('formatname',self.getFormatName()) self.story.setMetadata('formatext',self.getFormatExt()) @@ -59,12 +66,12 @@ class BaseStoryWriter(Configurable): def getBaseFileName(self): return self.story.formatFileName(self.getConfig('output_filename'),self.getConfig('allow_unsafe_filename')) - + def getZipFileName(self): return self.story.formatFileName(self.getConfig('zip_filename'),self.getConfig('allow_unsafe_filename')) def _write(self, out, text): - out.write(text.encode('utf8')) + out.write(ensure_binary(text)) def writeTitlePage(self, out, START, ENTRY, END, WIDE_ENTRY=None, NO_TITLE_ENTRY=None): """ @@ -89,7 +96,7 @@ class BaseStoryWriter(Configurable): if self.hasConfig("titlepage_no_title_entry"): NO_TITLE_ENTRY = string.Template(self.getConfig("titlepage_no_title_entry")) - + self._write(out,START.substitute(self.story.getAllMetadata())) if WIDE_ENTRY==None: @@ -120,7 +127,7 @@ class BaseStoryWriter(Configurable): # 'no title' option if there is one. if label == "" and NO_TITLE_ENTRY: TEMPLATE= NO_TITLE_ENTRY - + self._write(out,TEMPLATE.substitute({'label':label, 'id':entry, 'value':self.story.getMetadata(entry)})) @@ -145,7 +152,7 @@ class BaseStoryWriter(Configurable): if self.hasConfig("tocpage_end"): END = string.Template(self.getConfig("tocpage_end")) - + self._write(out,START.substitute(self.story.getAllMetadata())) for index, chap in enumerate(self.story.getChapters(fortoc=True)): @@ -170,13 +177,13 @@ class BaseStoryWriter(Configurable): condremoveentities=False) else: self.story.setMetadata("output_css",'') - + if not outstream: close=True logger.info("Save directly to file: %s" % outfilename) if self.getConfig('make_directories'): path="" - outputdirs = os.path.dirname(outfilename).split('/') + outputdirs = os.path.dirname(ensure_text(outfilename)).split('/') for dir in outputdirs: path+=dir+"/" if not os.path.exists(path): @@ -191,7 +198,7 @@ class BaseStoryWriter(Configurable): if fileupdated > lastupdated: 
logger.warn("File(%s) Updated(%s) more recently than Story(%s) - Skipping" % (outfilename,fileupdated,lastupdated)) return - if not metaonly: + if not metaonly: self.story = self.adapter.getStory() # get full story # now, just # before writing. @@ -210,7 +217,7 @@ class BaseStoryWriter(Configurable): # above, it will only # fetch once. if self.getConfig('zip_output'): - out = StringIO.StringIO() + out = BytesIO() self.zipout = ZipFile(outstream, 'w', compression=ZIP_DEFLATED) self.writeStoryImpl(out) self.zipout.writestr(self.getBaseFileName(),out.getvalue()) @@ -228,7 +235,7 @@ class BaseStoryWriter(Configurable): def writeFile(self, filename, data): logger.debug("writeFile:%s"%filename) - + if self.getConfig('zip_output'): outputdirs = os.path.dirname(self.getBaseFileName()) if outputdirs: @@ -242,7 +249,7 @@ class BaseStoryWriter(Configurable): dir = os.path.dirname(filename) if not os.path.exists(dir): os.mkdir(dir) ## os.makedirs() doesn't work in 2.5.2? - + outstream = open(filename,"wb") outstream.write(data) outstream.close() diff --git a/fanficfare/writers/writer_epub.py b/fanficfare/writers/writer_epub.py index 049126d0..6907129c 100644 --- a/fanficfare/writers/writer_epub.py +++ b/fanficfare/writers/writer_epub.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,21 +15,27 @@ # limitations under the License. 
# +from __future__ import absolute_import import logging import string -import StringIO import zipfile from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED import urllib import re +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six import string_types as basestring +from ..six import ensure_binary +from ..six import BytesIO # StringIO under py2 + ## XML isn't as forgiving as HTML, so rather than generate as strings, ## use DOM to generate the XML files. from xml.dom.minidom import parse, parseString, getDOMImplementation import bs4 -from base_writer import * +from .base_writer import * from ..htmlcleanup import stripHTML,removeEntities from ..story import commaGroups @@ -232,7 +238,7 @@ div { margin: 0pt; padding: 0pt; } span = ''%entry idx = logfile.rindex(span)+len(span) values[entry] = logfile[idx:logfile.index('\n',idx)] - except Exception, e: + except Exception as e: #print("e:%s"%e) pass @@ -299,11 +305,11 @@ div { margin: 0pt; padding: 0pt; } def writeStoryImpl(self, out): ## Python 2.5 ZipFile is rather more primative than later - ## versions. It can operate on a file, or on a StringIO, but + ## versions. It can operate on a file, or on a BytesIO, but ## not on an open stream. OTOH, I suspect we would have had ## problems with closing and opening again to change the ## compression type anyway. - zipio = StringIO.StringIO() + zipio = BytesIO() ## mimetype must be first file and uncompressed. 
Python 2.5 ## ZipFile can't change compression type file-by-file, so we @@ -518,8 +524,8 @@ div { margin: 0pt; padding: 0pt; } COVER = string.Template(self.getConfig("cover_content")) else: COVER = self.EPUB_COVER - coverIO = StringIO.StringIO() - coverIO.write(COVER.substitute(dict(self.story.getAllMetadata().items()+{'coverimg':self.story.cover}.items()))) + coverIO = BytesIO() + self._write(coverIO,COVER.substitute(dict(list(self.story.getAllMetadata().items())+list({'coverimg':self.story.cover}.items())))) if self.getConfig("include_titlepage"): items.append(("title_page","OEBPS/title_page.xhtml","application/xhtml+xml","Title Page")) @@ -575,10 +581,10 @@ div { margin: 0pt; padding: 0pt; } # write content.opf to zip. contentxml = contentdom.toxml(encoding='utf-8') - # tweak for brain damaged Nook STR. Nook insists on name before content. - contentxml = contentxml.replace(''%coverimgid, - ''%coverimgid) + contentxml = contentxml.replace(ensure_binary(''%coverimgid), + ensure_binary(''%coverimgid)) + outputepub.writestr("content.opf",contentxml) contentdom.unlink() @@ -655,7 +661,7 @@ div { margin: 0pt; padding: 0pt; } outputepub.writestr("OEBPS/cover.xhtml",coverIO.getvalue()) coverIO.close() - titlepageIO = StringIO.StringIO() + titlepageIO = BytesIO() self.writeTitlePage(out=titlepageIO, START=TITLE_PAGE_START, ENTRY=TITLE_ENTRY, @@ -667,7 +673,7 @@ div { margin: 0pt; padding: 0pt; } titlepageIO.close() # write toc page. - tocpageIO = StringIO.StringIO() + tocpageIO = BytesIO() self.writeTOCPage(tocpageIO, self.EPUB_TOC_PAGE_START, self.EPUB_TOC_ENTRY, @@ -678,7 +684,7 @@ div { margin: 0pt; padding: 0pt; } if dologpage: # write log page. 
- logpageIO = StringIO.StringIO() + logpageIO = BytesIO() self.writeLogPage(logpageIO) outputepub.writestr("OEBPS/log_page.xhtml",logpageIO.getvalue()) logpageIO.close() diff --git a/fanficfare/writers/writer_html.py b/fanficfare/writers/writer_html.py index cd727b1c..04615394 100644 --- a/fanficfare/writers/writer_html.py +++ b/fanficfare/writers/writer_html.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2016 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,13 +15,16 @@ # limitations under the License. # +from __future__ import absolute_import import logging import string +# py2 vs py3 transition +from ..six import text_type as unicode + import bs4 -from base_writer import * - +from .base_writer import * class HTMLWriter(BaseStoryWriter): @staticmethod @@ -107,7 +110,7 @@ ${output_css} self._write(out,FILE_START.substitute(self.story.getAllMetadata())) if self.getConfig('include_images') and self.story.cover: - self._write(out,COVER.substitute(dict(self.story.getAllMetadata().items()+{'coverimg':self.story.cover}.items()))) + self._write(out,COVER.substitute(dict(list(self.story.getAllMetadata().items())+list({'coverimg':self.story.cover}.items())))) self.writeTitlePage(out, self.HTML_TITLE_PAGE_START, diff --git a/fanficfare/writers/writer_mobi.py b/fanficfare/writers/writer_mobi.py index f37ee2fe..bc86bb5d 100644 --- a/fanficfare/writers/writer_mobi.py +++ b/fanficfare/writers/writer_mobi.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,15 +15,18 @@ # limitations under the License. 
# +from __future__ import absolute_import import logging import string -import StringIO -from base_writer import * +from .base_writer import * from ..htmlcleanup import stripHTML from ..mobi import Converter from ..exceptions import FailedToWriteOutput +# py2 vs py3 transition +from ..six import BytesIO # StringIO under py2 + logger = logging.getLogger(__name__) class MobiWriter(BaseStoryWriter): @@ -128,7 +131,7 @@ ${value}
NO_TITLE_ENTRY = self.MOBI_NO_TITLE_ENTRY TITLE_PAGE_END = self.MOBI_TITLE_PAGE_END - titlepageIO = StringIO.StringIO() + titlepageIO = BytesIO() self.writeTitlePage(out=titlepageIO, START=TITLE_PAGE_START, ENTRY=TITLE_ENTRY, @@ -142,7 +145,7 @@ ${value}
## MOBI always has a TOC injected by mobi.py because there's ## no meta-data TOC. # # write toc page. - # tocpageIO = StringIO.StringIO() + # tocpageIO = BytesIO() # self.writeTOCPage(tocpageIO, # self.MOBI_TOC_PAGE_START, # self.MOBI_TOC_ENTRY, diff --git a/fanficfare/writers/writer_txt.py b/fanficfare/writers/writer_txt.py index b5c10647..9f3c9980 100644 --- a/fanficfare/writers/writer_txt.py +++ b/fanficfare/writers/writer_txt.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,11 +15,12 @@ # limitations under the License. # +from __future__ import absolute_import import logging import string from textwrap import wrap -from base_writer import * +from .base_writer import * from html2text import html2text diff --git a/included_dependencies/bs4/__init__.py b/included_dependencies/bs4/__init__.py index 46caac04..ac3c1720 100644 --- a/included_dependencies/bs4/__init__.py +++ b/included_dependencies/bs4/__init__.py @@ -21,14 +21,15 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ # found in the LICENSE file. __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.5.3" -__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson" +__version__ = "4.6.1" +__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson" __license__ = "MIT" __all__ = ['BeautifulSoup'] import os import re +import sys import traceback import warnings @@ -82,14 +83,46 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' - NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). 
This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, **kwargs): - """The Soup object is initialized as the 'root tag', and the - provided markup (which can be a string or a file-like object) - is fed into the underlying parser.""" + """Constructor. + + :param markup: A string or a file-like object representing + markup to be parsed. + + :param features: Desirable features of the parser to be used. This + may be the name of a specific parser ("lxml", "lxml-xml", + "html.parser", or "html5lib") or it may be the type of markup + to be used ("html", "html5", "xml"). It's recommended that you + name a specific parser, so that Beautiful Soup gives you the + same results across platforms and virtual environments. + + :param builder: A specific TreeBuilder to use instead of looking one + up based on `features`. You shouldn't need to use this. + + :param parse_only: A SoupStrainer. Only parts of the document + matching the SoupStrainer will be considered. 
This is useful + when parsing part of a document that would otherwise be too + large to fit into memory. + + :param from_encoding: A string indicating the encoding of the + document to be parsed. Pass this in if Beautiful Soup is + guessing wrongly about the document's encoding. + + :param exclude_encodings: A list of strings indicating + encodings known to be wrong. Pass this in if you don't know + the document's encoding but you know Beautiful Soup's guess is + wrong. + + :param kwargs: For backwards compatibility purposes, the + constructor accepts certain keyword arguments used in + Beautiful Soup 3. None of these arguments do anything in + Beautiful Soup 4 and there's no need to actually pass keyword + arguments into the constructor. + """ if 'convertEntities' in kwargs: warnings.warn( @@ -171,14 +204,35 @@ class BeautifulSoup(Tag): else: markup_type = "HTML" - caller = traceback.extract_stack()[0] - filename = caller[0] - line_number = caller[1] - warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( - filename=filename, - line_number=line_number, - parser=builder.NAME, - markup_type=markup_type)) + # This code adapted from warnings.py so that we get the same line + # of code as our warnings.warn() call gets, even if the answer is wrong + # (as it may be in a multithreading situation). + caller = None + try: + caller = sys._getframe(1) + except ValueError: + pass + if caller: + globals = caller.f_globals + line_number = caller.f_lineno + else: + globals = sys.__dict__ + line_number= 1 + filename = globals.get('__file__') + if filename: + fnl = filename.lower() + if fnl.endswith((".pyc", ".pyo")): + filename = filename[:-1] + if filename: + # If there is no filename at all, the user is most likely in a REPL, + # and the warning is not necessary. 
+ values = dict( + filename=filename, + line_number=line_number, + parser=builder.NAME, + markup_type=markup_type + ) + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) self.builder = builder self.is_xml = builder.is_xml @@ -215,8 +269,8 @@ class BeautifulSoup(Tag): markup = markup.encode("utf8") warnings.warn( '"%s" looks like a filename, not markup. You should' - 'probably open this file and pass the filehandle into' - 'Beautiful Soup.' % markup) + ' probably open this file and pass the filehandle into' + ' Beautiful Soup.' % markup) self._check_markup_is_url(markup) for (self.markup, self.original_encoding, self.declared_html_encoding, @@ -302,9 +356,10 @@ class BeautifulSoup(Tag): self.preserve_whitespace_tag_stack = [] self.pushTag(self) - def new_tag(self, name, namespace=None, nsprefix=None, **attrs): + def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs): """Create a new tag associated with this soup.""" - return Tag(None, self.builder, name, namespace, nsprefix, attrs) + kwattrs.update(attrs) + return Tag(None, self.builder, name, namespace, nsprefix, kwattrs) def new_string(self, s, subclass=NavigableString): """Create a new NavigableString associated with this soup.""" diff --git a/included_dependencies/bs4/builder/__init__.py b/included_dependencies/bs4/builder/__init__.py index 601979bf..21454e6f 100644 --- a/included_dependencies/bs4/builder/__init__.py +++ b/included_dependencies/bs4/builder/__init__.py @@ -93,7 +93,7 @@ class TreeBuilder(object): preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. - + # A value for these tag/attribute combinations is a space- or # comma-separated list of CDATA, rather than a single CDATA. 
cdata_list_attributes = {} @@ -125,7 +125,7 @@ class TreeBuilder(object): if self.empty_element_tags is None: return True return tag_name in self.empty_element_tags - + def feed(self, markup): raise NotImplementedError() @@ -232,9 +232,14 @@ class HTMLTreeBuilder(TreeBuilder): """ preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags - empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base']) - + empty_element_tags = set([ + # These are from HTML5. + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', + + # These are from earlier versions of HTML and are removed in HTML5. + 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' + ]) + # The HTML standard defines these attributes as containing a # space-separated list of values, not a single value. That is, # class="foo bar" means that the 'class' attribute has two values, diff --git a/included_dependencies/bs4/builder/_htmlparser.py b/included_dependencies/bs4/builder/_htmlparser.py index 823ca15a..ee6c685d 100644 --- a/included_dependencies/bs4/builder/_htmlparser.py +++ b/included_dependencies/bs4/builder/_htmlparser.py @@ -1,3 +1,4 @@ +# encoding: utf-8 """Use the HTMLParser library to parse HTML files that aren't too bad.""" # Use of this source code is governed by a BSD-style license that can be @@ -52,7 +53,42 @@ from bs4.builder import ( HTMLPARSER = 'html.parser' class BeautifulSoupHTMLParser(HTMLParser): - def handle_starttag(self, name, attrs): + + def __init__(self, *args, **kwargs): + HTMLParser.__init__(self, *args, **kwargs) + + # Keep a list of empty-element tags that were encountered + # without an explicit closing tag. If we encounter a closing tag + # of this type, we'll associate it with one of those entries. + # + # This isn't a stack because we don't care about the + # order. 
It's a list of closing tags we've already handled and + # will ignore, assuming they ever show up. + self.already_closed_empty_element = [] + + def error(self, msg): + """In Python 3, HTMLParser subclasses must implement error(), although this + requirement doesn't appear to be documented. + + In Python 2, HTMLParser implements error() as raising an exception. + + In any event, this method is called only on very strange markup and our best strategy + is to pretend it didn't happen and keep going. + """ + warnings.warn(msg) + + def handle_startendtag(self, name, attrs): + # This is only called when the markup looks like + # . + + # is_startend() tells handle_starttag not to close the tag + # just because its name matches a known empty-element tag. We + # know that this is an empty-element tag and we want to call + # handle_endtag ourselves. + tag = self.handle_starttag(name, attrs, handle_empty_element=False) + self.handle_endtag(name) + + def handle_starttag(self, name, attrs, handle_empty_element=True): # XXX namespace attr_dict = {} for key, value in attrs: @@ -62,10 +98,34 @@ class BeautifulSoupHTMLParser(HTMLParser): value = '' attr_dict[key] = value attrvalue = '""' - self.soup.handle_starttag(name, None, None, attr_dict) + #print "START", name + tag = self.soup.handle_starttag(name, None, None, attr_dict) + if tag and tag.is_empty_element and handle_empty_element: + # Unlike other parsers, html.parser doesn't send separate end tag + # events for empty-element tags. (It's handled in + # handle_startendtag, but only if the original markup looked like + # .) + # + # So we need to call handle_endtag() ourselves. Since we + # know the start event is identical to the end event, we + # don't want handle_endtag() to cross off any previous end + # events for tags of this name. 
+ self.handle_endtag(name, check_already_closed=False) - def handle_endtag(self, name): - self.soup.handle_endtag(name) + # But we might encounter an explicit closing tag for this tag + # later on. If so, we want to ignore it. + self.already_closed_empty_element.append(name) + + def handle_endtag(self, name, check_already_closed=True): + #print "END", name + if check_already_closed and name in self.already_closed_empty_element: + # This is a redundant end tag for an empty-element tag. + # We've already called handle_endtag() for it, so just + # check it off the list. + # print "ALREADY CLOSED", name + self.already_closed_empty_element.remove(name) + else: + self.soup.handle_endtag(name) def handle_data(self, data): self.soup.handle_data(data) @@ -81,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser): else: real_name = int(name) - try: - data = unichr(real_name) - except (ValueError, OverflowError), e: - data = u"\N{REPLACEMENT CHARACTER}" - + data = None + if real_name < 256: + # HTML numeric entities are supposed to reference Unicode + # code points, but sometimes they reference code points in + # some other encoding (ahem, Windows-1252). E.g. “ + # instead of É for LEFT DOUBLE QUOTATION MARK. This + # code tries to detect this situation and compensate. + for encoding in (self.soup.original_encoding, 'windows-1252'): + if not encoding: + continue + try: + data = bytearray([real_name]).decode(encoding) + except UnicodeDecodeError, e: + pass + if not data: + try: + data = unichr(real_name) + except (ValueError, OverflowError), e: + pass + data = data or u"\N{REPLACEMENT CHARACTER}" self.handle_data(data) def handle_entityref(self, name): @@ -93,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser): if character is not None: data = character else: - data = "&%s;" % name + # If this were XML, it would be ambiguous whether "&foo" + # was an character entity reference with a missing + # semicolon or the literal string "&foo". 
Since this is + # HTML, we have a complete list of all character entity references, + # and this one wasn't found, so assume it's the literal string "&foo". + data = "&%s" % name self.handle_data(data) def handle_comment(self, data): @@ -165,10 +245,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser.soup = self.soup try: parser.feed(markup) + parser.close() except HTMLParseError, e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e + parser.already_closed_empty_element = [] # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some # 3.2.3 code. This ensures they don't treat markup like

as a diff --git a/included_dependencies/bs4/builder/_lxml.py b/included_dependencies/bs4/builder/_lxml.py index d2ca2872..4a0f7de4 100644 --- a/included_dependencies/bs4/builder/_lxml.py +++ b/included_dependencies/bs4/builder/_lxml.py @@ -5,9 +5,13 @@ __all__ = [ 'LXMLTreeBuilder', ] +try: + from collections.abc import Callable # Python 3.6 +except ImportError , e: + from collections import Callable + from io import BytesIO from StringIO import StringIO -import collections from lxml import etree from bs4.element import ( Comment, @@ -58,7 +62,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): # Use the default parser. parser = self.default_parser(encoding) - if isinstance(parser, collections.Callable): + if isinstance(parser, Callable): # Instantiate the parser with default arguments parser = parser(target=self, strip_cdata=False, encoding=encoding) return parser @@ -147,11 +151,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): attrs = dict(attrs) nsprefix = None # Invert each namespace map as it comes in. - if len(self.nsmaps) > 1: - # There are no new namespaces for this tag, but - # non-default namespaces are in play, so we need a - # separate tag stack to know when they end. - self.nsmaps.append(None) + if len(nsmap) == 0 and len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. + self.nsmaps.append(None) elif len(nsmap) > 0: # A new namespace mapping has come into play. 
inverted_nsmap = dict((value, key) for key, value in nsmap.items()) diff --git a/included_dependencies/bs4/dammit.py b/included_dependencies/bs4/dammit.py index 7965565f..be46b394 100644 --- a/included_dependencies/bs4/dammit.py +++ b/included_dependencies/bs4/dammit.py @@ -46,9 +46,9 @@ except ImportError: pass xml_encoding_re = re.compile( - '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) + '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I) html_meta_re = re.compile( - '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) + '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) class EntitySubstitution(object): @@ -82,7 +82,7 @@ class EntitySubstitution(object): } BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")") AMPERSAND_OR_BRACKET = re.compile("([<>&])") diff --git a/included_dependencies/bs4/diagnose.py b/included_dependencies/bs4/diagnose.py index 8768332f..7a28c09a 100644 --- a/included_dependencies/bs4/diagnose.py +++ b/included_dependencies/bs4/diagnose.py @@ -37,7 +37,7 @@ def diagnose(data): name) if 'lxml' in basic_parsers: - basic_parsers.append(["lxml", "xml"]) + basic_parsers.append("lxml-xml") try: from lxml import etree print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) @@ -56,21 +56,27 @@ def diagnose(data): if hasattr(data, 'read'): data = data.read() - elif os.path.exists(data): - print '"%s" looks like a filename. Reading data from the file.' % data - with open(data) as fp: - data = fp.read() elif data.startswith("http:") or data.startswith("https:"): print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." return - print + else: + try: + if os.path.exists(data): + print '"%s" looks like a filename. Reading data from the file.' 
% data + with open(data) as fp: + data = fp.read() + except ValueError: + # This can happen on some platforms when the 'filename' is + # too long. Assume it's data and not a filename. + pass + print for parser in basic_parsers: print "Trying to parse your markup with %s" % parser success = False try: - soup = BeautifulSoup(data, parser) + soup = BeautifulSoup(data, features=parser) success = True except Exception, e: print "%s could not parse the markup." % parser diff --git a/included_dependencies/bs4/element.py b/included_dependencies/bs4/element.py index b100d18b..8383c3fc 100644 --- a/included_dependencies/bs4/element.py +++ b/included_dependencies/bs4/element.py @@ -2,7 +2,10 @@ # found in the LICENSE file. __license__ = "MIT" -import collections +try: + from collections.abc import Callable # Python 3.6 +except ImportError , e: + from collections import Callable import re import shlex import sys @@ -12,7 +15,7 @@ from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) -whitespace_re = re.compile("\s+") +whitespace_re = re.compile(r"\s+") def _alias(attr): """Alias one attribute name to another for backward compatibility""" @@ -69,7 +72,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): The value of the 'content' attribute will be one of these objects. 
""" - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) def __new__(cls, original_value): match = cls.CHARSET_RE.search(original_value) @@ -123,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution): return cls._substitute_if_appropriate( ns, EntitySubstitution.substitute_xml) +class Formatter(object): + """Contains information about how to format a parse tree.""" + + # By default, represent void elements as rather than + void_element_close_prefix = '/' + + def substitute_entities(self, *args, **kwargs): + """Transform certain characters into named entities.""" + raise NotImplementedError() + +class HTMLFormatter(Formatter): + """The default HTML formatter.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) + +class MinimalHTMLFormatter(Formatter): + """A minimal HTML formatter.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs) + +class HTML5Formatter(HTMLFormatter): + """An HTML formatter that omits the slash in a void tag.""" + void_element_close_prefix = None + +class XMLFormatter(Formatter): + """Substitute only the essential XML entities.""" + def substitute(self, *args, **kwargs): + return EntitySubstitution.substitute_xml(*args, **kwargs) + +class HTMLXMLFormatter(Formatter): + """Format XML using HTML rules.""" + def substitute(self, *args, **kwargs): + return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) + + class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -132,39 +170,48 @@ class PageElement(object): # # "html" - All Unicode characters with corresponding HTML entities # are converted to those entities on output. 
+ # "html5" - The same as "html", but empty void tags are represented as + # rather than # "minimal" - Bare ampersands and angle brackets are converted to # XML entities: & < > # None - The null formatter. Unicode characters are never # converted to entities. This is not recommended, but it's # faster than "minimal". - # A function - This function will be called on every string that + # A callable function - it will be called on every string that needs to undergo entity substitution. + # A Formatter instance - Formatter.substitute(string) will be called on every string that # needs to undergo entity substitution. # - # In an HTML document, the default "html" and "minimal" functions - # will leave the contents of ') # => <script> do_nasty_stuff() </script> @@ -782,7 +813,7 @@ class Filter(base.Filter): # characters, nor why we call unescape. I just know it's always been here. # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all # this will do is remove *more* than it otherwise would. 
- val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '', + val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '', unescape(attrs[attr])).lower() # remove replacement characters from unescaped characters val_unescaped = val_unescaped.replace("\ufffd", "") @@ -807,7 +838,7 @@ class Filter(base.Filter): ' ', unescape(attrs[attr])) if (token["name"] in self.svg_allow_local_href and - (namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*', + (namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])): del attrs[(namespaces['xlink'], 'href')] if (None, 'style') in attrs: @@ -837,16 +868,16 @@ class Filter(base.Filter): def sanitize_css(self, style): # disallow urls - style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) + style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) # gauntlet - if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): + if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' - if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): + if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return '' clean = [] - for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style): + for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style): if not value: continue if prop.lower() in self.allowed_css_properties: @@ -855,7 +886,7 @@ class Filter(base.Filter): 'padding']: for keyword in value.split(): if keyword not in self.allowed_css_keywords and \ - not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa + not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa break else: clean.append(prop + ': ' + value + ';') diff --git a/included_dependencies/html5lib/filters/whitespace.py 
b/included_dependencies/html5lib/filters/whitespace.py index 89210528..0d12584b 100644 --- a/included_dependencies/html5lib/filters/whitespace.py +++ b/included_dependencies/html5lib/filters/whitespace.py @@ -10,7 +10,7 @@ SPACES_REGEX = re.compile("[%s]+" % spaceCharacters) class Filter(base.Filter): - + """Collapses whitespace except in pre, textarea, and script elements""" spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) def __iter__(self): diff --git a/included_dependencies/html5lib/html5parser.py b/included_dependencies/html5lib/html5parser.py index 2abd63e4..9d39b9d4 100644 --- a/included_dependencies/html5lib/html5parser.py +++ b/included_dependencies/html5lib/html5parser.py @@ -1,12 +1,8 @@ from __future__ import absolute_import, division, unicode_literals -from six import with_metaclass, viewkeys, PY3 +from six import with_metaclass, viewkeys import types - -try: - from collections import OrderedDict -except ImportError: - from ordereddict import OrderedDict +from collections import OrderedDict from . import _inputstream from . import _tokenizer @@ -24,18 +20,53 @@ from .constants import ( adjustForeignAttributes as adjustForeignAttributesMap, adjustMathMLAttributes, adjustSVGAttributes, E, - ReparseException + _ReparseException ) def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs): - """Parse a string or file-like object into a tree""" + """Parse an HTML document as a string or file-like object into a tree + + :arg doc: the document to parse as a string or file-like object + + :arg treebuilder: the treebuilder to use when parsing + + :arg namespaceHTMLElements: whether or not to namespace HTML elements + + :returns: parsed tree + + Example: + + >>> from html5lib.html5parser import parse + >>> parse('

This is a doc

') + + + """ tb = treebuilders.getTreeBuilder(treebuilder) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) return p.parse(doc, **kwargs) def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs): + """Parse an HTML fragment as a string or file-like object into a tree + + :arg doc: the fragment to parse as a string or file-like object + + :arg container: the container context to parse the fragment in + + :arg treebuilder: the treebuilder to use when parsing + + :arg namespaceHTMLElements: whether or not to namespace HTML elements + + :returns: parsed tree + + Example: + + >>> from html5lib.html5libparser import parseFragment + >>> parseFragment('this is a fragment') + + + """ tb = treebuilders.getTreeBuilder(treebuilder) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) return p.parseFragment(doc, container=container, **kwargs) @@ -54,16 +85,30 @@ def method_decorator_metaclass(function): class HTMLParser(object): - """HTML parser. Generates a tree structure from a stream of (possibly - malformed) HTML""" + """HTML parser + + Generates a tree structure from a stream of (possibly malformed) HTML. + + """ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False): """ - strict - raise an exception when a parse error is encountered + :arg tree: a treebuilder class controlling the type of tree that will be + returned. 
Built in treebuilders can be accessed through + html5lib.treebuilders.getTreeBuilder(treeType) + + :arg strict: raise an exception when a parse error is encountered + + :arg namespaceHTMLElements: whether or not to namespace HTML elements + + :arg debug: whether or not to enable debug mode which logs things + + Example: + + >>> from html5lib.html5parser import HTMLParser + >>> parser = HTMLParser() # generates parser with etree builder + >>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict - tree - a treebuilder class controlling the type of tree that will be - returned. Built in treebuilders can be accessed through - html5lib.treebuilders.getTreeBuilder(treeType) """ # Raise an exception on the first error encountered @@ -87,7 +132,7 @@ class HTMLParser(object): try: self.mainLoop() - except ReparseException: + except _ReparseException: self.reset() self.mainLoop() @@ -127,9 +172,8 @@ class HTMLParser(object): @property def documentEncoding(self): - """The name of the character encoding - that was used to decode the input stream, - or :obj:`None` if that is not determined yet. + """Name of the character encoding that was used to decode the input stream, or + :obj:`None` if that is not determined yet """ if not hasattr(self, 'tokenizer'): @@ -223,14 +267,24 @@ class HTMLParser(object): def parse(self, stream, *args, **kwargs): """Parse a HTML document into a well-formed tree - stream - a filelike object or string containing the HTML to be parsed + :arg stream: a file-like object or string containing the HTML to be parsed - The optional encoding parameter must be a string that indicates - the encoding. If specified, that encoding will be used, - regardless of any BOM or later declaration (such as in a meta - element) + The optional encoding parameter must be a string that indicates + the encoding. If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element). 
+ + :arg scripting: treat noscript elements as if JavaScript was turned on + + :returns: parsed tree + + Example: + + >>> from html5lib.html5parser import HTMLParser + >>> parser = HTMLParser() + >>> parser.parse('

This is a doc

') + - scripting - treat noscript elements as if javascript was turned on """ self._parse(stream, False, None, *args, **kwargs) return self.tree.getDocument() @@ -238,17 +292,27 @@ class HTMLParser(object): def parseFragment(self, stream, *args, **kwargs): """Parse a HTML fragment into a well-formed tree fragment - container - name of the element we're setting the innerHTML property - if set to None, default to 'div' + :arg container: name of the element we're setting the innerHTML + property if set to None, default to 'div' - stream - a filelike object or string containing the HTML to be parsed + :arg stream: a file-like object or string containing the HTML to be parsed - The optional encoding parameter must be a string that indicates - the encoding. If specified, that encoding will be used, - regardless of any BOM or later declaration (such as in a meta - element) + The optional encoding parameter must be a string that indicates + the encoding. If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element) + + :arg scripting: treat noscript elements as if JavaScript was turned on + + :returns: parsed tree + + Example: + + >>> from html5lib.html5libparser import HTMLParser + >>> parser = HTMLParser() + >>> parser.parseFragment('this is a fragment') + - scripting - treat noscript elements as if javascript was turned on """ self._parse(stream, True, *args, **kwargs) return self.tree.getFragment() @@ -262,8 +326,7 @@ class HTMLParser(object): raise ParseError(E[errorcode] % datavars) def normalizeToken(self, token): - """ HTML5 specific normalizations to the token stream """ - + # HTML5 specific normalizations to the token stream if token["type"] == tokenTypes["StartTag"]: raw = token["data"] token["data"] = OrderedDict(raw) @@ -331,9 +394,7 @@ class HTMLParser(object): self.phase = new_phase def parseRCDataRawtext(self, token, contentType): - """Generic RCDATA/RAWTEXT Parsing algorithm - contentType - RCDATA or 
RAWTEXT - """ + # Generic RCDATA/RAWTEXT Parsing algorithm assert contentType in ("RAWTEXT", "RCDATA") self.tree.insertElement(token) @@ -2711,10 +2772,7 @@ def getPhases(debug): def adjust_attributes(token, replacements): - if PY3 or _utils.PY27: - needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) - else: - needs_adjustment = frozenset(token['data']) & frozenset(replacements) + needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) if needs_adjustment: token['data'] = OrderedDict((replacements.get(k, k), v) for k, v in token['data'].items()) diff --git a/included_dependencies/html5lib/serializer.py b/included_dependencies/html5lib/serializer.py index 103dd206..d6b7105d 100644 --- a/included_dependencies/html5lib/serializer.py +++ b/included_dependencies/html5lib/serializer.py @@ -68,10 +68,33 @@ def htmlentityreplace_errors(exc): else: return xmlcharrefreplace_errors(exc) + register_error("htmlentityreplace", htmlentityreplace_errors) def serialize(input, tree="etree", encoding=None, **serializer_opts): + """Serializes the input token stream using the specified treewalker + + :arg input: the token stream to serialize + + :arg tree: the treewalker to use + + :arg encoding: the encoding to use + + :arg serializer_opts: any options to pass to the + :py:class:`html5lib.serializer.HTMLSerializer` that gets created + + :returns: the tree serialized as a string + + Example: + + >>> from html5lib.html5parser import parse + >>> from html5lib.serializer import serialize + >>> token_stream = parse('

Hi!

') + >>> serialize(token_stream, omit_optional_tags=False) + '

Hi!

' + + """ # XXX: Should we cache this? walker = treewalkers.getTreeWalker(tree) s = HTMLSerializer(**serializer_opts) @@ -110,50 +133,83 @@ class HTMLSerializer(object): "strip_whitespace", "sanitize") def __init__(self, **kwargs): - """Initialize HTMLSerializer. + """Initialize HTMLSerializer - Keyword options (default given first unless specified) include: + :arg inject_meta_charset: Whether or not to inject the meta charset. - inject_meta_charset=True|False - Whether it insert a meta element to define the character set of the - document. - quote_attr_values="legacy"|"spec"|"always" - Whether to quote attribute values that don't require quoting - per legacy browser behaviour, when required by the standard, or always. - quote_char=u'"'|u"'" - Use given quote character for attribute quoting. Default is to - use double quote unless attribute value contains a double quote, - in which case single quotes are used instead. - escape_lt_in_attrs=False|True - Whether to escape < in attribute values. - escape_rcdata=False|True - Whether to escape characters that need to be escaped within normal - elements within rcdata elements such as style. - resolve_entities=True|False - Whether to resolve named character entities that appear in the - source tree. The XML predefined entities < > & " ' - are unaffected by this setting. - strip_whitespace=False|True - Whether to remove semantically meaningless whitespace. (This - compresses all whitespace to a single space except within pre.) - minimize_boolean_attributes=True|False - Shortens boolean attributes to give just the attribute value, - for example becomes . - use_trailing_solidus=False|True - Includes a close-tag slash at the end of the start tag of void - elements (empty elements whose end tag is forbidden). E.g.
. - space_before_trailing_solidus=True|False - Places a space immediately before the closing slash in a tag - using a trailing solidus. E.g.
. Requires use_trailing_solidus. - sanitize=False|True - Strip all unsafe or unknown constructs from output. - See `html5lib user documentation`_ - omit_optional_tags=True|False - Omit start/end tags that are optional. - alphabetical_attributes=False|True - Reorder attributes to be in alphabetical order. + Defaults to ``True``. + + :arg quote_attr_values: Whether to quote attribute values that don't + require quoting per legacy browser behavior (``"legacy"``), when + required by the standard (``"spec"``), or always (``"always"``). + + Defaults to ``"legacy"``. + + :arg quote_char: Use given quote character for attribute quoting. + + Defaults to ``"`` which will use double quotes unless attribute + value contains a double quote, in which case single quotes are + used. + + :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute + values. + + Defaults to ``False``. + + :arg escape_rcdata: Whether to escape characters that need to be + escaped within normal elements within rcdata elements such as + style. + + Defaults to ``False``. + + :arg resolve_entities: Whether to resolve named character entities that + appear in the source tree. The XML predefined entities < > + & " ' are unaffected by this setting. + + Defaults to ``True``. + + :arg strip_whitespace: Whether to remove semantically meaningless + whitespace. (This compresses all whitespace to a single space + except within ``pre``.) + + Defaults to ``False``. + + :arg minimize_boolean_attributes: Shortens boolean attributes to give + just the attribute value, for example:: + + + + becomes:: + + + + Defaults to ``True``. + + :arg use_trailing_solidus: Includes a close-tag slash at the end of the + start tag of void elements (empty elements whose end tag is + forbidden). E.g. ``
``. + + Defaults to ``False``. + + :arg space_before_trailing_solidus: Places a space immediately before + the closing slash in a tag using a trailing solidus. E.g. + ``
``. Requires ``use_trailing_solidus=True``. + + Defaults to ``True``. + + :arg sanitize: Strip all unsafe or unknown constructs from output. + See :py:class:`html5lib.filters.sanitizer.Filter`. + + Defaults to ``False``. + + :arg omit_optional_tags: Omit start/end tags that are optional. + + Defaults to ``True``. + + :arg alphabetical_attributes: Reorder attributes to be in alphabetical order. + + Defaults to ``False``. - .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation """ unexpected_args = frozenset(kwargs) - frozenset(self.options) if len(unexpected_args) > 0: @@ -317,6 +373,25 @@ class HTMLSerializer(object): self.serializeError(token["data"]) def render(self, treewalker, encoding=None): + """Serializes the stream from the treewalker into a string + + :arg treewalker: the treewalker to serialize + + :arg encoding: the string encoding to use + + :returns: the serialized tree + + Example: + + >>> from html5lib import parse, getTreeWalker + >>> from html5lib.serializer import HTMLSerializer + >>> token_stream = parse('Hi!') + >>> walker = getTreeWalker('etree') + >>> serializer = HTMLSerializer(omit_optional_tags=False) + >>> serializer.render(walker(token_stream)) + 'Hi!' + + """ if encoding: return b"".join(list(self.serialize(treewalker, encoding))) else: diff --git a/included_dependencies/html5lib/treeadapters/__init__.py b/included_dependencies/html5lib/treeadapters/__init__.py index 4f978466..dfeb0ba5 100644 --- a/included_dependencies/html5lib/treeadapters/__init__.py +++ b/included_dependencies/html5lib/treeadapters/__init__.py @@ -1,3 +1,21 @@ +"""Tree adapters let you convert from one tree structure to another + +Example: + +.. code-block:: python + + import html5lib + from html5lib.treeadapters import genshi + + doc = 'Hi!' 
+ treebuilder = html5lib.getTreeBuilder('etree') + parser = html5lib.HTMLParser(tree=treebuilder) + tree = parser.parse(doc) + TreeWalker = html5lib.getTreeWalker('etree') + + genshi_tree = genshi.to_genshi(TreeWalker(tree)) + +""" from __future__ import absolute_import, division, unicode_literals from . import sax diff --git a/included_dependencies/html5lib/treeadapters/genshi.py b/included_dependencies/html5lib/treeadapters/genshi.py index 04e316df..61d5fb6a 100644 --- a/included_dependencies/html5lib/treeadapters/genshi.py +++ b/included_dependencies/html5lib/treeadapters/genshi.py @@ -5,6 +5,13 @@ from genshi.core import START, END, TEXT, COMMENT, DOCTYPE def to_genshi(walker): + """Convert a tree to a genshi tree + + :arg walker: the treewalker to use to walk the tree to convert it + + :returns: generator of genshi nodes + + """ text = [] for token in walker: type = token["type"] diff --git a/included_dependencies/html5lib/treeadapters/sax.py b/included_dependencies/html5lib/treeadapters/sax.py index ad47df95..f4ccea5a 100644 --- a/included_dependencies/html5lib/treeadapters/sax.py +++ b/included_dependencies/html5lib/treeadapters/sax.py @@ -11,7 +11,13 @@ for prefix, localName, namespace in adjustForeignAttributes.values(): def to_sax(walker, handler): - """Call SAX-like content handler based on treewalker walker""" + """Call SAX-like content handler based on treewalker walker + + :arg walker: the treewalker to use to walk the tree to convert it + + :arg handler: SAX handler to use + + """ handler.startDocument() for prefix, namespace in prefix_mapping.items(): handler.startPrefixMapping(prefix, namespace) diff --git a/included_dependencies/html5lib/treebuilders/__init__.py b/included_dependencies/html5lib/treebuilders/__init__.py index e2328847..d44447ea 100644 --- a/included_dependencies/html5lib/treebuilders/__init__.py +++ b/included_dependencies/html5lib/treebuilders/__init__.py @@ -1,29 +1,32 @@ -"""A collection of modules for building different kinds 
of tree from -HTML documents. +"""A collection of modules for building different kinds of trees from HTML +documents. To create a treebuilder for a new type of tree, you need to do implement several things: -1) A set of classes for various types of elements: Document, Doctype, -Comment, Element. These must implement the interface of -_base.treebuilders.Node (although comment nodes have a different -signature for their constructor, see treebuilders.etree.Comment) -Textual content may also be implemented as another node type, or not, as -your tree implementation requires. +1. A set of classes for various types of elements: Document, Doctype, Comment, + Element. These must implement the interface of ``base.treebuilders.Node`` + (although comment nodes have a different signature for their constructor, + see ``treebuilders.etree.Comment``) Textual content may also be implemented + as another node type, or not, as your tree implementation requires. -2) A treebuilder object (called TreeBuilder by convention) that -inherits from treebuilders._base.TreeBuilder. This has 4 required attributes: -documentClass - the class to use for the bottommost node of a document -elementClass - the class to use for HTML Elements -commentClass - the class to use for comments -doctypeClass - the class to use for doctypes -It also has one required method: -getDocument - Returns the root node of the complete document tree +2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits + from ``treebuilders.base.TreeBuilder``. This has 4 required attributes: + + * ``documentClass`` - the class to use for the bottommost node of a document + * ``elementClass`` - the class to use for HTML Elements + * ``commentClass`` - the class to use for comments + * ``doctypeClass`` - the class to use for doctypes + + It also has one required method: + + * ``getDocument`` - Returns the root node of the complete document tree + +3. 
If you wish to run the unit tests, you must also create a ``testSerializer`` + method on your treebuilder which accepts a node and returns a string + containing Node and its children serialized according to the format used in + the unittests -3) If you wish to run the unit tests, you must also create a -testSerializer method on your treebuilder which accepts a node and -returns a string containing Node and its children serialized according -to the format used in the unittests """ from __future__ import absolute_import, division, unicode_literals @@ -34,23 +37,32 @@ treeBuilderCache = {} def getTreeBuilder(treeType, implementation=None, **kwargs): - """Get a TreeBuilder class for various types of tree with built-in support + """Get a TreeBuilder class for various types of trees with built-in support - treeType - the name of the tree type required (case-insensitive). Supported - values are: + :arg treeType: the name of the tree type required (case-insensitive). Supported + values are: - "dom" - A generic builder for DOM implementations, defaulting to - a xml.dom.minidom based implementation. - "etree" - A generic builder for tree implementations exposing an - ElementTree-like interface, defaulting to - xml.etree.cElementTree if available and - xml.etree.ElementTree if not. - "lxml" - A etree-based builder for lxml.etree, handling - limitations of lxml's implementation. + * "dom" - A generic builder for DOM implementations, defaulting to a + xml.dom.minidom based implementation. + * "etree" - A generic builder for tree implementations exposing an + ElementTree-like interface, defaulting to xml.etree.cElementTree if + available and xml.etree.ElementTree if not. + * "lxml" - A etree-based builder for lxml.etree, handling limitations + of lxml's implementation. - implementation - (Currently applies to the "etree" and "dom" tree types). A - module implementing the tree type e.g. 
- xml.etree.ElementTree or xml.etree.cElementTree.""" + :arg implementation: (Currently applies to the "etree" and "dom" tree + types). A module implementing the tree type e.g. xml.etree.ElementTree + or xml.etree.cElementTree. + + :arg kwargs: Any additional options to pass to the TreeBuilder when + creating it. + + Example: + + >>> from html5lib.treebuilders import getTreeBuilder + >>> builder = getTreeBuilder('etree') + + """ treeType = treeType.lower() if treeType not in treeBuilderCache: diff --git a/included_dependencies/html5lib/treebuilders/base.py b/included_dependencies/html5lib/treebuilders/base.py index a4b2792a..05d97ecc 100644 --- a/included_dependencies/html5lib/treebuilders/base.py +++ b/included_dependencies/html5lib/treebuilders/base.py @@ -21,22 +21,25 @@ listElementsMap = { class Node(object): + """Represents an item in the tree""" def __init__(self, name): - """Node representing an item in the tree. - name - The tag name associated with the node - parent - The parent of the current node (or None for the document node) - value - The value of the current node (applies to text nodes and - comments - attributes - a dict holding name, value pairs for attributes of the node - childNodes - a list of child nodes of the current node. This must - include all elements but not necessarily other node types - _flags - A list of miscellaneous flags that can be set on the node + """Creates a Node + + :arg name: The tag name associated with the node + """ + # The tag name associated with the node self.name = name + # The parent of the current node (or None for the document node) self.parent = None + # The value of the current node (applies to text nodes and comments) self.value = None + # A dict holding name -> value pairs for attributes of the node self.attributes = {} + # A list of child nodes of the current node. This must include all + # elements but not necessarily other node types. 
self.childNodes = [] + # A list of miscellaneous flags that can be set on the node. self._flags = [] def __str__(self): @@ -53,23 +56,41 @@ class Node(object): def appendChild(self, node): """Insert node as a child of the current node + + :arg node: the node to insert + """ raise NotImplementedError def insertText(self, data, insertBefore=None): """Insert data as text in the current node, positioned before the start of node insertBefore or to the end of the node's text. + + :arg data: the data to insert + + :arg insertBefore: True if you want to insert the text before the node + and False if you want to insert it after the node + """ raise NotImplementedError def insertBefore(self, node, refNode): """Insert node as a child of the current node, before refNode in the list of child nodes. Raises ValueError if refNode is not a child of - the current node""" + the current node + + :arg node: the node to insert + + :arg refNode: the child node to insert the node before + + """ raise NotImplementedError def removeChild(self, node): """Remove node from the children of the current node + + :arg node: the child node to remove + """ raise NotImplementedError @@ -77,6 +98,9 @@ class Node(object): """Move all the children of the current node to newParent. This is needed so that trees that don't store text as nodes move the text in the correct way + + :arg newParent: the node to move all this node's children to + """ # XXX - should this method be made more general? 
for child in self.childNodes: @@ -121,10 +145,12 @@ class ActiveFormattingElements(list): class TreeBuilder(object): """Base treebuilder implementation - documentClass - the class to use for the bottommost node of a document - elementClass - the class to use for HTML Elements - commentClass - the class to use for comments - doctypeClass - the class to use for doctypes + + * documentClass - the class to use for the bottommost node of a document + * elementClass - the class to use for HTML Elements + * commentClass - the class to use for comments + * doctypeClass - the class to use for doctypes + """ # pylint:disable=not-callable @@ -144,6 +170,11 @@ class TreeBuilder(object): fragmentClass = None def __init__(self, namespaceHTMLElements): + """Create a TreeBuilder + + :arg namespaceHTMLElements: whether or not to namespace HTML elements + + """ if namespaceHTMLElements: self.defaultNamespace = "http://www.w3.org/1999/xhtml" else: @@ -367,11 +398,11 @@ class TreeBuilder(object): self.generateImpliedEndTags(exclude) def getDocument(self): - "Return the final tree" + """Return the final tree""" return self.document def getFragment(self): - "Return the final fragment" + """Return the final fragment""" # assert self.innerHTML fragment = self.fragmentClass() self.openElements[0].reparentChildren(fragment) @@ -379,5 +410,8 @@ class TreeBuilder(object): def testSerializer(self, node): """Serialize the subtree of node in the format required by unit tests - node - the node from which to start serializing""" + + :arg node: the node from which to start serializing + + """ raise NotImplementedError diff --git a/included_dependencies/html5lib/treebuilders/etree_lxml.py b/included_dependencies/html5lib/treebuilders/etree_lxml.py index 908820c0..ca12a99c 100644 --- a/included_dependencies/html5lib/treebuilders/etree_lxml.py +++ b/included_dependencies/html5lib/treebuilders/etree_lxml.py @@ -309,7 +309,6 @@ class TreeBuilder(base.TreeBuilder): super(TreeBuilder, 
self).insertComment(data, parent) def insertRoot(self, token): - """Create the document root""" # Because of the way libxml2 works, it doesn't seem to be possible to # alter information like the doctype after the tree has been parsed. # Therefore we need to use the built-in parser to create our initial diff --git a/included_dependencies/html5lib/treewalkers/__init__.py b/included_dependencies/html5lib/treewalkers/__init__.py index 9e19a559..9bec2076 100644 --- a/included_dependencies/html5lib/treewalkers/__init__.py +++ b/included_dependencies/html5lib/treewalkers/__init__.py @@ -13,7 +13,7 @@ from __future__ import absolute_import, division, unicode_literals from .. import constants from .._utils import default_etree -__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"] +__all__ = ["getTreeWalker", "pprint"] treeWalkerCache = {} @@ -21,20 +21,25 @@ treeWalkerCache = {} def getTreeWalker(treeType, implementation=None, **kwargs): """Get a TreeWalker class for various types of tree with built-in support - Args: - treeType (str): the name of the tree type required (case-insensitive). - Supported values are: + :arg str treeType: the name of the tree type required (case-insensitive). + Supported values are: - - "dom": The xml.dom.minidom DOM implementation - - "etree": A generic walker for tree implementations exposing an - elementtree-like interface (known to work with - ElementTree, cElementTree and lxml.etree). - - "lxml": Optimized walker for lxml.etree - - "genshi": a Genshi stream + * "dom": The xml.dom.minidom DOM implementation + * "etree": A generic walker for tree implementations exposing an + elementtree-like interface (known to work with ElementTree, + cElementTree and lxml.etree). + * "lxml": Optimized walker for lxml.etree + * "genshi": a Genshi stream + + :arg implementation: A module implementing the tree type e.g. + xml.etree.ElementTree or cElementTree (Currently applies to the "etree" + tree type only). 
+ + :arg kwargs: keyword arguments passed to the etree walker--for other + walkers, this has no effect + + :returns: a TreeWalker class - Implementation: A module implementing the tree type e.g. - xml.etree.ElementTree or cElementTree (Currently applies to the - "etree" tree type only). """ treeType = treeType.lower() @@ -73,7 +78,13 @@ def concatenateCharacterTokens(tokens): def pprint(walker): - """Pretty printer for tree walkers""" + """Pretty printer for tree walkers + + Takes a TreeWalker instance and pretty prints the output of walking the tree. + + :arg walker: a TreeWalker instance + + """ output = [] indent = 0 for token in concatenateCharacterTokens(walker): diff --git a/included_dependencies/html5lib/treewalkers/base.py b/included_dependencies/html5lib/treewalkers/base.py index 36e1ba24..80c474c4 100644 --- a/included_dependencies/html5lib/treewalkers/base.py +++ b/included_dependencies/html5lib/treewalkers/base.py @@ -18,16 +18,48 @@ spaceCharacters = "".join(spaceCharacters) class TreeWalker(object): + """Walks a tree yielding tokens + + Tokens are dicts that all have a ``type`` field specifying the type of the + token. 
+ + """ def __init__(self, tree): + """Creates a TreeWalker + + :arg tree: the tree to walk + + """ self.tree = tree def __iter__(self): raise NotImplementedError def error(self, msg): + """Generates an error token with the given message + + :arg msg: the error message + + :returns: SerializeError token + + """ return {"type": "SerializeError", "data": msg} def emptyTag(self, namespace, name, attrs, hasChildren=False): + """Generates an EmptyTag token + + :arg namespace: the namespace of the token--can be ``None`` + + :arg name: the name of the element + + :arg attrs: the attributes of the element as a dict + + :arg hasChildren: whether or not to yield a SerializationError because + this tag shouldn't have children + + :returns: EmptyTag token + + """ yield {"type": "EmptyTag", "name": name, "namespace": namespace, "data": attrs} @@ -35,17 +67,61 @@ class TreeWalker(object): yield self.error("Void element has children") def startTag(self, namespace, name, attrs): + """Generates a StartTag token + + :arg namespace: the namespace of the token--can be ``None`` + + :arg name: the name of the element + + :arg attrs: the attributes of the element as a dict + + :returns: StartTag token + + """ return {"type": "StartTag", "name": name, "namespace": namespace, "data": attrs} def endTag(self, namespace, name): + """Generates an EndTag token + + :arg namespace: the namespace of the token--can be ``None`` + + :arg name: the name of the element + + :returns: EndTag token + + """ return {"type": "EndTag", "name": name, "namespace": namespace} def text(self, data): + """Generates SpaceCharacters and Characters tokens + + Depending on what's in the data, this generates one or more + ``SpaceCharacters`` and ``Characters`` tokens. 
+ + For example: + + >>> from html5lib.treewalkers.base import TreeWalker + >>> # Give it an empty tree just so it instantiates + >>> walker = TreeWalker([]) + >>> list(walker.text('')) + [] + >>> list(walker.text(' ')) + [{u'data': ' ', u'type': u'SpaceCharacters'}] + >>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE + [{u'data': ' ', u'type': u'SpaceCharacters'}, + {u'data': u'abc', u'type': u'Characters'}, + {u'data': u' ', u'type': u'SpaceCharacters'}] + + :arg data: the text data + + :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens + + """ data = data middle = data.lstrip(spaceCharacters) left = data[:len(data) - len(middle)] @@ -60,18 +136,44 @@ class TreeWalker(object): yield {"type": "SpaceCharacters", "data": right} def comment(self, data): + """Generates a Comment token + + :arg data: the comment + + :returns: Comment token + + """ return {"type": "Comment", "data": data} def doctype(self, name, publicId=None, systemId=None): + """Generates a Doctype token + + :arg name: + + :arg publicId: + + :arg systemId: + + :returns: the Doctype token + + """ return {"type": "Doctype", "name": name, "publicId": publicId, "systemId": systemId} def entity(self, name): + """Generates an Entity token + + :arg name: the entity name + + :returns: an Entity token + + """ return {"type": "Entity", "name": name} def unknown(self, nodeType): + """Handles unknown node types""" return self.error("Unknown node type: " + nodeType) diff --git a/included_dependencies/html5lib/treewalkers/etree.py b/included_dependencies/html5lib/treewalkers/etree.py index 8f30f078..d15a7eeb 100644 --- a/included_dependencies/html5lib/treewalkers/etree.py +++ b/included_dependencies/html5lib/treewalkers/etree.py @@ -1,13 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -try: - from collections import OrderedDict -except ImportError: - try: - from ordereddict import OrderedDict - except ImportError: - OrderedDict = dict - +from collections 
import OrderedDict import re from six import string_types diff --git a/included_dependencies/six.py b/included_dependencies/six.py index 190c0239..818a7fad 100644 --- a/included_dependencies/six.py +++ b/included_dependencies/six.py @@ -1,6 +1,4 @@ -"""Utilities for writing code that runs on Python 2 and 3""" - -# Copyright (c) 2010-2015 Benjamin Peterson +# Copyright (c) 2010-2018 Benjamin Peterson # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -20,6 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +"""Utilities for writing code that runs on Python 2 and 3""" + from __future__ import absolute_import import functools @@ -29,8 +29,7 @@ import sys import types __author__ = "Benjamin Peterson " -__version__ = "1.10.0" - +__version__ = "1.11.0fffinclib" # for version included in fanficfare # Useful for very coarse version differentiation. 
PY2 = sys.version_info[0] == 2 @@ -241,6 +240,7 @@ _moved_attributes = [ MovedAttribute("map", "itertools", "builtins", "imap", "map"), MovedAttribute("getcwd", "os", "os", "getcwdu", "getcwd"), MovedAttribute("getcwdb", "os", "os", "getcwd", "getcwdb"), + MovedAttribute("getoutput", "commands", "subprocess"), MovedAttribute("range", "__builtin__", "builtins", "xrange", "range"), MovedAttribute("reload_module", "__builtin__", "importlib" if PY34 else "imp", "reload"), MovedAttribute("reduce", "__builtin__", "functools"), @@ -262,10 +262,11 @@ _moved_attributes = [ MovedModule("html_entities", "htmlentitydefs", "html.entities"), MovedModule("html_parser", "HTMLParser", "html.parser"), MovedModule("http_client", "httplib", "http.client"), + MovedModule("email_mime_base", "email.MIMEBase", "email.mime.base"), + MovedModule("email_mime_image", "email.MIMEImage", "email.mime.image"), MovedModule("email_mime_multipart", "email.MIMEMultipart", "email.mime.multipart"), MovedModule("email_mime_nonmultipart", "email.MIMENonMultipart", "email.mime.nonmultipart"), MovedModule("email_mime_text", "email.MIMEText", "email.mime.text"), - MovedModule("email_mime_base", "email.MIMEBase", "email.mime.base"), MovedModule("BaseHTTPServer", "BaseHTTPServer", "http.server"), MovedModule("CGIHTTPServer", "CGIHTTPServer", "http.server"), MovedModule("SimpleHTTPServer", "SimpleHTTPServer", "http.server"), @@ -337,10 +338,12 @@ _urllib_parse_moved_attributes = [ MovedAttribute("quote_plus", "urllib", "urllib.parse"), MovedAttribute("unquote", "urllib", "urllib.parse"), MovedAttribute("unquote_plus", "urllib", "urllib.parse"), + MovedAttribute("unquote_to_bytes", "urllib", "urllib.parse", "unquote", "unquote_to_bytes"), MovedAttribute("urlencode", "urllib", "urllib.parse"), MovedAttribute("splitquery", "urllib", "urllib.parse"), MovedAttribute("splittag", "urllib", "urllib.parse"), MovedAttribute("splituser", "urllib", "urllib.parse"), + MovedAttribute("splitvalue", "urllib", "urllib.parse"), 
MovedAttribute("uses_fragment", "urlparse", "urllib.parse"), MovedAttribute("uses_netloc", "urlparse", "urllib.parse"), MovedAttribute("uses_params", "urlparse", "urllib.parse"), @@ -416,6 +419,8 @@ _urllib_request_moved_attributes = [ MovedAttribute("URLopener", "urllib", "urllib.request"), MovedAttribute("FancyURLopener", "urllib", "urllib.request"), MovedAttribute("proxy_bypass", "urllib", "urllib.request"), + MovedAttribute("parse_http_list", "urllib2", "urllib.request"), + MovedAttribute("parse_keqv_list", "urllib2", "urllib.request"), ] for attr in _urllib_request_moved_attributes: setattr(Module_six_moves_urllib_request, attr.name, attr) @@ -679,11 +684,15 @@ if PY3: exec_ = getattr(moves.builtins, "exec") def reraise(tp, value, tb=None): - if value is None: - value = tp() - if value.__traceback__ is not tb: - raise value.with_traceback(tb) - raise value + try: + if value is None: + value = tp() + if value.__traceback__ is not tb: + raise value.with_traceback(tb) + raise value + finally: + value = None + tb = None else: def exec_(_code_, _globs_=None, _locs_=None): @@ -699,19 +708,28 @@ else: exec("""exec _code_ in _globs_, _locs_""") exec_("""def reraise(tp, value, tb=None): - raise tp, value, tb + try: + raise tp, value, tb + finally: + tb = None """) if sys.version_info[:2] == (3, 2): exec_("""def raise_from(value, from_value): - if from_value is None: - raise value - raise value from from_value + try: + if from_value is None: + raise value + raise value from from_value + finally: + value = None """) elif sys.version_info[:2] > (3, 2): exec_("""def raise_from(value, from_value): - raise value from from_value + try: + raise value from from_value + finally: + value = None """) else: def raise_from(value, from_value): @@ -802,10 +820,14 @@ def with_metaclass(meta, *bases): # This requires a bit of explanation: the basic idea is to make a dummy # metaclass for one level of class instantiation that replaces itself with # the actual metaclass. 
- class metaclass(meta): + class metaclass(type): def __new__(cls, name, this_bases, d): return meta(name, bases, d) + + @classmethod + def __prepare__(cls, name, this_bases): + return meta.__prepare__(name, bases) return type.__new__(metaclass, 'temporary_class', (), {}) @@ -825,6 +847,65 @@ def add_metaclass(metaclass): return wrapper +def ensure_binary(s, encoding='utf-8', errors='strict'): + """Coerce **s** to six.binary_type. + + For Python 2: + - `unicode` -> encoded to `str` + - `str` -> `str` + + For Python 3: + - `str` -> encoded to `bytes` + - `bytes` -> `bytes` + """ + if isinstance(s, text_type): + return s.encode(encoding, errors) + elif isinstance(s, binary_type): + return s + else: + raise TypeError("not expecting type '%s'" % type(s)) + + +def ensure_str(s, encoding='utf-8', errors='strict'): + """Coerce *s* to `str`. + + For Python 2: + - `unicode` -> encoded to `str` + - `str` -> `str` + + For Python 3: + - `str` -> `str` + - `bytes` -> decoded to `str` + """ + if not isinstance(s, (text_type, binary_type)): + raise TypeError("not expecting type '%s'" % type(s)) + if PY2 and isinstance(s, text_type): + s = s.encode(encoding, errors) + elif PY3 and isinstance(s, binary_type): + s = s.decode(encoding, errors) + return s + + +def ensure_text(s, encoding='utf-8', errors='strict'): + """Coerce *s* to six.text_type. + + For Python 2: + - `unicode` -> `unicode` + - `str` -> `unicode` + + For Python 3: + - `str` -> `str` + - `bytes` -> decoded to `str` + """ + if isinstance(s, binary_type): + return s.decode(encoding, errors) + elif isinstance(s, text_type): + return s + else: + raise TypeError("not expecting type '%s'" % type(s)) + + + def python_2_unicode_compatible(klass): """ A decorator that defines __unicode__ and __str__ methods under Python 2. 
diff --git a/ini-order.py b/ini-order.py index be65928b..7c2cde75 100644 --- a/ini-order.py +++ b/ini-order.py @@ -1,16 +1,31 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright 2018, Jim Miller + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import re import sys +from io import open # so py2.7 has open with encoding param. argv = sys.argv[1:] -# infile = argv[0] -# outfile = argv[1] - sections = {} cursectname = "" cursectlines = [] -with open(argv[0],"r") as infile: +with open(argv[0],"r", encoding="utf8") as infile: for line in infile: if re.match(r"^\[([^\]]+)\]$",line): sections[cursectname] = cursectlines @@ -40,8 +55,8 @@ leadsects = [ followsects = [ ] -with open(argv[1],"w") as outfile: - kl = sections.keys() +with open(argv[1],"w", encoding="utf8") as outfile: + kl = list(sections.keys()) kl.sort() for k in leadsects: outfile.write("".join(sections[k])) diff --git a/makeplugin.py b/makeplugin.py index 62b61da7..72b69cbc 100644 --- a/makeplugin.py +++ b/makeplugin.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -# Copyright 2015, Jim Miller +# Copyright 2018, Jim Miller # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/makezip.py b/makezip.py index 0028a029..24e95d16 100644 --- a/makezip.py +++ b/makezip.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -# Copyright 2015, Jim Miller +# Copyright 2018, Jim Miller # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,11 +15,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import absolute_import import os, zipfile, sys from glob import glob +from six import text_type as unicode + def addFolderToZip(myZipFile,folder,exclude=[]): - folder = folder.encode('ascii') #convert path to ascii for ZipFile Method + # print("folder:"+folder) excludelist=[] for ex in exclude: excludelist.extend(glob(folder+"/"+ex)) @@ -27,7 +30,7 @@ def addFolderToZip(myZipFile,folder,exclude=[]): if file in excludelist: continue if os.path.isfile(file): - #print file + # print("folder file:"+file) myZipFile.write(file, file, zipfile.ZIP_DEFLATED) elif os.path.isdir(file): addFolderToZip(myZipFile,file,exclude=exclude) @@ -40,11 +43,10 @@ def createZipFile(filename,mode,files,exclude=[]): for file in files: if file in excludelist: continue - file = file.encode('ascii') #convert path to ascii for ZipFile Method + # print("file:"+file) if os.path.isfile(file): (filepath, filename) = os.path.split(file) - #print file - myZipFile.write( file, filename, zipfile.ZIP_DEFLATED ) + myZipFile.write( file, unicode(filename), zipfile.ZIP_DEFLATED ) if os.path.isdir(file): addFolderToZip(myZipFile,file,exclude=exclude) myZipFile.close() diff --git a/setup.py b/setup.py index 456ccd2b..ef03b3c5 100644 --- a/setup.py +++ b/setup.py @@ -13,15 +13,21 @@ from setuptools import setup, find_packages import codecs from os import path +package_name="FanFicFare" + +import sys +if sys.version_info < (2,7): + sys.exit(package_name+' requires Python 2.7 or newer.') + # Get the long description from the 
relevant file with codecs.open('DESCRIPTION.rst', encoding='utf-8') as f: long_description = f.read() - + setup( - name="FanFicFare", + name=package_name, # Versions should comply with PEP440. - version="2.28.0", + version="2.37.3", description='A tool for downloading fanfiction to eBook formats', long_description=long_description, @@ -42,7 +48,7 @@ setup( # 3 - Alpha # 4 - Beta # 5 - Production/Stable - 'Development Status :: 5 - Production/Stable', + 'Development Status :: 3 - Alpha', 'Environment :: Console', @@ -55,13 +61,9 @@ setup( # Specify the Python versions you support here. In particular, ensure # that you indicate whether you support Python 2, Python 3 or both. - # 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', - # 'Programming Language :: Python :: 3', - # 'Programming Language :: Python :: 3.2', - # 'Programming Language :: Python :: 3.3', - # 'Programming Language :: Python :: 3.4', + # Earlier py3 versions may work, but I've not tested them. + 'Programming Language :: Python :: 3.7', ], # What does your project relate to? @@ -79,7 +81,8 @@ setup( # your project is installed. For an analysis of "install_requires" vs pip's # requirements files see: # https://packaging.python.org/en/latest/requirements.html - install_requires=['beautifulsoup4','chardet','html5lib','html2text'], + install_requires=['beautifulsoup4','chardet','html5lib','html2text'], + # html5lib requires 'six', FFF includes its own copy as fanficfare.six # List additional groups of dependencies here (e.g. development # dependencies). 
You can install these using the following syntax, diff --git a/version_update.py b/version_update.py index 928052eb..ba870ed9 100644 --- a/version_update.py +++ b/version_update.py @@ -1,11 +1,26 @@ +#!/usr/bin/python # -*- coding: utf-8 -*- +# Copyright 2018, Jim Miller + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import codecs, sys, re from tempfile import mkstemp from os import rename, close, unlink -#print sys.argv[1:] +#print(sys.argv[1:]) ## Files that contain version numbers that will need to be updated. 
version_files = [ @@ -56,7 +71,7 @@ version="2.3.6" def do_loop(files, pattern, substring): global saved_version for source_file_path in files: - print "src:"+source_file_path + print("src:"+source_file_path) fh, target_file_path = mkstemp() with codecs.open(target_file_path, 'w', 'utf-8') as target_file: with codecs.open(source_file_path, 'r', 'utf-8') as source_file: @@ -78,7 +93,7 @@ if __name__ == '__main__': raise Exception() [int(x) for x in args] except: - print "Requires exactly 3 numeric args: major minor micro" + print("Requires exactly 3 numeric args: major minor micro") exit() main(args) -# print saved_version +# print(saved_version) diff --git a/webservice/app.yaml b/webservice/app.yaml index aa68d720..0ec2f729 100644 --- a/webservice/app.yaml +++ b/webservice/app.yaml @@ -1,6 +1,6 @@ # ffd-retief-hrd fanficfare application: fanficfare -version: 2-28-0 +version: 2-37-3 runtime: python27 api_version: 1 threadsafe: true diff --git a/webservice/index.html b/webservice/index.html index ddb24f49..7f42dcf8 100644 --- a/webservice/index.html +++ b/webservice/index.html @@ -31,7 +31,7 @@ If you have any problems with this application, please report them in the
FanFicFare Google Group. The - previous version + previous version is also available for you to use if necessary.