diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index 8e563a16..d374a099 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -405,6 +405,13 @@ user_agent:FFF/2.X ## non-intuitive. #description_limit:1000 +## Because some adapters can pull chapter URLs from human posts, the +## odds of errors in the chapter URLs can be higher for some +## sites/stories. You can set continue_on_chapter_error:true to +## continue on after failing to download a chapter and instead record +## an error message in the ebook for that chapter. +continue_on_chapter_error:false + [base_efiction] ## At the time of writing, eFiction Base adapters allow downloading ## the whole story in bulk using the 'Print' feature. If 'bulk_load' @@ -481,13 +488,6 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M ## the description. description_limit:500 -## Because base_xenforoforum adapters can pull chapter URLs from human -## posts, the odds of errors in the chapter URLs are vastly higher. -## You can set continue_on_chapter_error:true to continue on after -## failing to download a chapter and instead record an error message -## in the ebook for that chapter. -continue_on_chapter_error:false - ## When given a thread URL, use threadmarks as chapter links when ## there are at least this many threadmarks. A number of older ## threads have a single threadmark to an 'index' post. Set to 1 to diff --git a/fanficfare/adapters/adapter_test1.py b/fanficfare/adapters/adapter_test1.py index 5e51d211..9330c752 100644 --- a/fanficfare/adapters/adapter_test1.py +++ b/fanficfare/adapters/adapter_test1.py @@ -360,41 +360,28 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!" ## for chapter_urls setting. logger.debug('Getting chapter text from: %s' % url) - try: - origurl = url - (data,opened) = self._fetchUrlOpened(url,extrasleep=2.0) - url = opened.geturl() - if '#' in origurl and '#' not in url: - url = url + origurl[origurl.index('#'):] - logger.debug("chapter URL redirected to: %s"%url) + origurl = url + (data,opened) = self._fetchUrlOpened(url,extrasleep=2.0) + url = opened.geturl() + if '#' in origurl and '#' not in url: + url = url + origurl[origurl.index('#'):] + logger.debug("chapter URL redirected to: %s"%url) - soup = self.make_soup(data) + soup = self.make_soup(data) - if '#' in url: - anchorid = url.split('#')[1] - soup = soup.find('li',id=anchorid) + if '#' in url: + anchorid = url.split('#')[1] + soup = soup.find('li',id=anchorid) - bq = soup.find('blockquote') + bq = soup.find('blockquote') - bq.name='div' + bq.name='div' - for iframe in bq.find_all('iframe'): - iframe.extract() # calibre book reader & editor don't like iframes to youtube. + for iframe in bq.find_all('iframe'): + iframe.extract() # calibre book reader & editor don't like iframes to youtube. - for qdiv in bq.find_all('div',{'class':'quoteExpand'}): - qdiv.extract() # Remove
click to expand
- - except Exception as e: - if self.getConfig('continue_on_chapter_error'): - bq = self.make_soup("""
-

Error

-

FanFicFare failed to download this chapter. Because you have -continue_on_chapter_error set to true in your personal.ini, the download continued.

-

Chapter URL:
%s

-

Error:

%s

-
"""%(url,traceback.format_exc())) - else: - raise + for qdiv in bq.find_all('div',{'class':'quoteExpand'}): + qdiv.extract() # Remove
click to expand
return self.utf8FromSoup(url[:url.index('/',8)+1],bq) diff --git a/fanficfare/adapters/adapter_wuxiaworldcom.py b/fanficfare/adapters/adapter_wuxiaworldcom.py index e734f10b..fdc81aa1 100644 --- a/fanficfare/adapters/adapter_wuxiaworldcom.py +++ b/fanficfare/adapters/adapter_wuxiaworldcom.py @@ -28,7 +28,6 @@ import logging import re import urllib2 import urlparse -import traceback from base_adapter import BaseSiteAdapter, makeDate @@ -150,32 +149,14 @@ class WuxiaWorldComSiteAdapter(BaseSiteAdapter): def getChapterText(self, url): #logger.debug('Getting chapter text from: %s', url) - try: - data = self._fetchUrl(url) - soup = self.make_soup(data) - story = soup.find('div', {'itemprop':'articleBody'}) - if not story: - raise exceptions.FailedToDownload( - "Error downloading Chapter: %s! Missing required element!" % url) - #removing the Previous and next chapter links - for tag in story.find_all('a'): - tag.extract() - - except Exception as e: - if self.getConfig('continue_on_chapter_error'): - story = self.make_soup("""
-

Error

-

FanFicFare failed to download this chapter. Because you have -continue_on_chapter_error set to true, the download continued.

-

Chapter URL:
%s

-

-Authors on wuxiaworld.com create their own index pages, so it's not -uncommon for there to be 404 errors when there are links to chapters -that haven't been uploaded yet. -

-

Error:

%s

-
"""%(url,traceback.format_exc())) - else: - raise + data = self._fetchUrl(url) + soup = self.make_soup(data) + story = soup.find('div', {'itemprop':'articleBody'}) + if not story: + raise exceptions.FailedToDownload( + "Error downloading Chapter: %s! Missing required element!" % url) + #removing the Previous and next chapter links + for tag in story.find_all('a'): + tag.extract() return self.utf8FromSoup(url, story) diff --git a/fanficfare/adapters/base_adapter.py b/fanficfare/adapters/base_adapter.py index 1324e40a..3fdbb79b 100644 --- a/fanficfare/adapters/base_adapter.py +++ b/fanficfare/adapters/base_adapter.py @@ -25,7 +25,7 @@ import urllib2 as u2 import urlparse as up import cookielib as cl from functools import partial -import pickle +import traceback import bs4 @@ -178,12 +178,6 @@ class BaseSiteAdapter(Configurable): ''' return False - # def story_load(self,filename): - # d = pickle.load(self.story.metadata,filename) - # self.story.metadata = d['metadata'] - # self.chapterUrls = d['chapterlist'] - # self.story.metadataDone = True - def _setURL(self,url): self.url = url self.parsedUrl = up.urlparse(url) @@ -395,17 +389,30 @@ class BaseSiteAdapter(Configurable): self.oldchaptersdata[url]['chapterorigtitle'] != self.oldchaptersdata[url]['chaptertitle']) ) - if not data: - data = self.getChapterTextNum(url,index) - # if had to fetch and has existing chapters - newchap = bool(self.oldchapters or self.oldchaptersmap) + try: + if not data: + data = self.getChapterTextNum(url,index) + # if had to fetch and has existing chapters + newchap = bool(self.oldchapters or self.oldchaptersmap) - if index == 0 and self.getConfig('always_reload_first_chapter'): - data = self.getChapterTextNum(url,index) - # first chapter is rarely marked new - # anyway--only if it's replaced during an - # update. - newchap = False + if index == 0 and self.getConfig('always_reload_first_chapter'): + data = self.getChapterTextNum(url,index) + # first chapter is rarely marked new + # anyway--only if it's replaced during an + # update. + newchap = False + except Exception as e: + if self.getConfig('continue_on_chapter_error'): + data = self.make_soup("""
+

Error

+

FanFicFare failed to download this chapter. Because +continue_on_chapter_error is set to true, the download continued.

+

Chapter URL:
%s

+

Error:

%s

+
"""%(url,traceback.format_exc().replace("&","&").replace(">",">").replace("<","<"))) + title = title+"(FAILED)" + else: + raise self.story.addChapter(url, removeEntities(title), @@ -480,7 +487,7 @@ class BaseSiteAdapter(Configurable): def getSiteURLFragment(self): "Needs to be overriden in case of adapters that share a domain." return self.getSiteDomain() - + @classmethod def getConfigSection(cls): "Only needs to be overriden if != site domain." diff --git a/fanficfare/adapters/base_xenforoforum_adapter.py b/fanficfare/adapters/base_xenforoforum_adapter.py index 1782658a..acd7c41a 100644 --- a/fanficfare/adapters/base_xenforoforum_adapter.py +++ b/fanficfare/adapters/base_xenforoforum_adapter.py @@ -17,7 +17,6 @@ import time import logging -import traceback logger = logging.getLogger(__name__) import re import urllib2 @@ -340,76 +339,63 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): def getChapterTextNum(self, url, index): logger.debug('Getting chapter text from: %s index: %s' % (url,index)) - try: - origurl = url + origurl = url - # reader mode shows only threadmarked posts in threadmark - # order. don't use reader mode for /threads/ urls, or - # first post when always_include_first_post. - if ( self.reader and - self.getConfig("use_reader_mode",True) and - '/threads/' not in url and - (index > 0 or not self.getConfig('always_include_first_post')) ): - logger.debug("USE READER MODE") - # in case it changes: - posts_per_page = self.getConfig("reader_posts_per_page",10) + # reader mode shows only threadmarked posts in threadmark + # order. don't use reader mode for /threads/ urls, or + # first post when always_include_first_post. + if ( self.reader and + self.getConfig("use_reader_mode",True) and + '/threads/' not in url and + (index > 0 or not self.getConfig('always_include_first_post')) ): + logger.debug("USE READER MODE") + # in case it changes: + posts_per_page = self.getConfig("reader_posts_per_page",10) - # always_include_first_post with threadmarks added an - # extra first chapter, we should be past it. - if self.getConfig('always_include_first_post'): - index = index - 1 - reader_page_num = int((index+posts_per_page)/posts_per_page) - reader_url=self.getURLPrefix()+'/threads/'+self.story.getMetadata('storyId')+'/reader?page='+unicode(reader_page_num) - logger.debug("Reader URL to: %s"%reader_url) - data = self._fetchUrl(reader_url) - topsoup = souptag = self.make_soup(data) + # always_include_first_post with threadmarks added an + # extra first chapter, we should be past it. + if self.getConfig('always_include_first_post'): + index = index - 1 + reader_page_num = int((index+posts_per_page)/posts_per_page) + reader_url=self.getURLPrefix()+'/threads/'+self.story.getMetadata('storyId')+'/reader?page='+unicode(reader_page_num) + logger.debug("Reader URL to: %s"%reader_url) + data = self._fetchUrl(reader_url) + topsoup = souptag = self.make_soup(data) - # assumed normalized to /posts/1234/ - anchorid = "post-"+url.split('/')[-2] - logger.debug("anchorid: %s"%anchorid) + # assumed normalized to /posts/1234/ + anchorid = "post-"+url.split('/')[-2] + logger.debug("anchorid: %s"%anchorid) + souptag = topsoup.find('li',id=anchorid) + else: + logger.debug("DON'T USE READER MODE") + (data,opened) = self._fetchUrlOpened(url) + url = opened.geturl() + if '#' in origurl and '#' not in url: + url = url + origurl[origurl.index('#'):] + logger.debug("chapter URL redirected to: %s"%url) + + topsoup = souptag = self.make_soup(data) + + if '#' in url: + anchorid = url.split('#')[1] souptag = topsoup.find('li',id=anchorid) - else: - logger.debug("DON'T USE READER MODE") - (data,opened) = self._fetchUrlOpened(url) - url = opened.geturl() - if '#' in origurl and '#' not in url: - url = url + origurl[origurl.index('#'):] - logger.debug("chapter URL redirected to: %s"%url) - topsoup = souptag = self.make_soup(data) + self.handle_spoilers(topsoup,souptag) - if '#' in url: - anchorid = url.split('#')[1] - souptag = topsoup.find('li',id=anchorid) + bq = souptag.find('blockquote') - self.handle_spoilers(topsoup,souptag) + bq.name='div' - bq = souptag.find('blockquote') + for iframe in bq.find_all('iframe'): + iframe.extract() # calibre book reader & editor don't like iframes to youtube. - bq.name='div' + for qdiv in bq.find_all('div',{'class':'quoteExpand'}): + qdiv.extract() # Remove
click to expand
- for iframe in bq.find_all('iframe'): - iframe.extract() # calibre book reader & editor don't like iframes to youtube. - - for qdiv in bq.find_all('div',{'class':'quoteExpand'}): - qdiv.extract() # Remove
click to expand
- - ## img alt="[​IMG]" class="bbCodeImage LbImage lazyload - ## include lazy load images. - for img in bq.find_all('img',{'class':'lazyload'}): - img['src'] = img['data-src'] - - except Exception as e: - if self.getConfig('continue_on_chapter_error'): - bq = self.make_soup("""
-

Error

-

FanFicFare failed to download this chapter. Because you have -continue_on_chapter_error set to true in your personal.ini, the download continued.

-

Chapter URL:
%s

-

Error:

%s

-
"""%(url,traceback.format_exc())) - else: - raise + ## img alt="[​IMG]" class="bbCodeImage LbImage lazyload + ## include lazy load images. + for img in bq.find_all('img',{'class':'lazyload'}): + img['src'] = img['data-src'] # XenForo uses return self.utf8FromSoup(self.getURLPrefix()+'/',bq) diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index e50c6eac..9bca1b86 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -153,6 +153,7 @@ def get_valid_set_options(): 'titlepage_use_table':(None,None,boollist), 'use_ssl_unverified_context':(None,None,boollist), + 'continue_on_chapter_error':(None,None,boollist), 'add_chapter_numbers':(None,None,boollist+['toconly']), @@ -194,7 +195,6 @@ def get_valid_set_options(): 'internalize_text_links':(None,['epub','html'],boollist), 'capitalize_forumtags':(base_xenforo_list,None,boollist), - 'continue_on_chapter_error':(base_xenforo_list+['wuxiaworld.com'],None,boollist), 'minimum_threadmarks':(base_xenforo_list,None,None), 'first_post_title':(base_xenforo_list,None,None), 'always_include_first_post':(base_xenforo_list,None,boollist), diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index 50926b77..99af2ab3 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -409,6 +409,13 @@ user_agent:FFF/2.X ## non-intuitive. #description_limit:1000 +## Because some adapters can pull chapter URLs from human posts, the +## odds of errors in the chapter URLs can be higher for some +## sites/stories. You can set continue_on_chapter_error:true to +## continue on after failing to download a chapter and instead record +## an error message in the ebook for that chapter. +continue_on_chapter_error:false + ## The FFF CLI can fetch story URLs from unread emails when configured ## to read from your IMAP mail server. The example shows GMail, but ## other services that support IMAP can be used. GMail requires you @@ -519,13 +526,6 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M ## the description. description_limit:500 -## Because base_xenforoforum adapters can pull chapter URLs from human -## posts, the odds of errors in the chapter URLs are vastly higher. -## You can set continue_on_chapter_error:true to continue on after -## failing to download a chapter and instead record an error message -## in the ebook for that chapter. -continue_on_chapter_error:false - ## When given a thread URL, use threadmarks as chapter links when ## there are at least this many threadmarks. A number of older ## threads have a single threadmark to an 'index' post. Set to 1 to