From f5a627e0081072901962593be022d822454499c4 Mon Sep 17 00:00:00 2001 From: cryzed Date: Mon, 17 Apr 2017 19:45:04 +0200 Subject: [PATCH 1/3] Fix "got more than 100 headers"-issue --- fanficfare/adapters/adapter_royalroadl.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fanficfare/adapters/adapter_royalroadl.py b/fanficfare/adapters/adapter_royalroadl.py index ab765d12..1d611999 100644 --- a/fanficfare/adapters/adapter_royalroadl.py +++ b/fanficfare/adapters/adapter_royalroadl.py @@ -15,22 +15,27 @@ # limitations under the License. # -import time +from datetime import datetime +import httplib import logging -logger = logging.getLogger(__name__) import re import urllib2 -import cookielib as cl -from datetime import datetime -from ..htmlcleanup import stripHTML from .. import exceptions as exceptions +from ..htmlcleanup import stripHTML +from base_adapter import BaseSiteAdapter + +logger = logging.getLogger(__name__) +# Fix "http.client.HTTPException: got more than 100 headers" issue. RoyalRoadL's webserver seems to be misconfigured and +# sends more than 100 headers for some stories (probably Set-Cookie). This simply increases the maximum header limit to +# 1000 -- changing this state globally isn't an issue, since it should be backwards-compatible with all other adapters. +httplib._MAXHEADERS = 1000 -from base_adapter import BaseSiteAdapter, makeDate def getClass(): return RoyalRoadAdapter + # Class name has to be unique. Our convention is camel case the # sitename with Adapter at the end. www is skipped. 
class RoyalRoadAdapter(BaseSiteAdapter): From 9c6395b75955ab790b5443db111bd1cf2dce0b07 Mon Sep 17 00:00:00 2001 From: cryzed Date: Tue, 18 Apr 2017 18:47:50 +0200 Subject: [PATCH 2/3] Isolate change of httplib._MAXHEADERS to getChapterText() --- fanficfare/adapters/adapter_royalroadl.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fanficfare/adapters/adapter_royalroadl.py b/fanficfare/adapters/adapter_royalroadl.py index 1d611999..247d8702 100644 --- a/fanficfare/adapters/adapter_royalroadl.py +++ b/fanficfare/adapters/adapter_royalroadl.py @@ -15,6 +15,7 @@ # limitations under the License. # +import contextlib from datetime import datetime import httplib import logging @@ -26,16 +27,22 @@ from ..htmlcleanup import stripHTML from base_adapter import BaseSiteAdapter logger = logging.getLogger(__name__) -# Fix "http.client.HTTPException: got more than 100 headers" issue. RoyalRoadL's webserver seems to be misconfigured and -# sends more than 100 headers for some stories (probably Set-Cookie). This simply increases the maximum header limit to -# 1000 -- changing this state globally isn't an issue, since it should be backwards-compatible with all other adapters. -httplib._MAXHEADERS = 1000 def getClass(): return RoyalRoadAdapter +# Using a context manager for this guarantees that the original max headers value is restored, even when an uncaught +# exception is raised +@contextlib.contextmanager +def httplib_max_headers(number): + original_max_headers = httplib._MAXHEADERS + httplib._MAXHEADERS = number + yield + httplib._MAXHEADERS = original_max_headers + + # Class name has to be unique. Our convention is camel case the # sitename with Adapter at the end. www is skipped. class RoyalRoadAdapter(BaseSiteAdapter): @@ -162,13 +169,16 @@ class RoyalRoadAdapter(BaseSiteAdapter): self.setCoverImage(url,cover_url) # some content is show as tables, this will preserve them - # grab the text for an individual chapter. 
def getChapterText(self, url): logger.debug('Getting chapter text from: %s' % url) - soup = self.make_soup(self._fetchUrl(url)) + # Work around "http.client.HTTPException: got more than 100 headers" issue. RoyalRoadL's webserver seems to be + # misconfigured and sends more than 100 headers for some stories (probably Set-Cookie). This simply increases + # the maximum header limit to 1000 temporarily. Also see: https://github.com/JimmXinu/FanFicFare/pull/174 + with httplib_max_headers(1000): + soup = self.make_soup(self._fetchUrl(url)) div = soup.find('div',{'class':"chapter-inner chapter-content"}) From 9a6ad627713aa12112275acf8c13bfd6b29c0748 Mon Sep 17 00:00:00 2001 From: cryzed Date: Tue, 18 Apr 2017 22:50:11 +0200 Subject: [PATCH 3/3] Define a dummy httplib_max_headers context manager if a httplib module without the _MAXHEADERS attribute is used --- fanficfare/adapters/adapter_royalroadl.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fanficfare/adapters/adapter_royalroadl.py b/fanficfare/adapters/adapter_royalroadl.py index 247d8702..95be7f91 100644 --- a/fanficfare/adapters/adapter_royalroadl.py +++ b/fanficfare/adapters/adapter_royalroadl.py @@ -33,14 +33,21 @@ def getClass(): return RoyalRoadAdapter -# Using a context manager for this guarantees that the original max headers value is restored, even when an uncaught -# exception is raised -@contextlib.contextmanager -def httplib_max_headers(number): - original_max_headers = httplib._MAXHEADERS - httplib._MAXHEADERS = number - yield - httplib._MAXHEADERS = original_max_headers +# Work around "http.client.HTTPException: got more than 100 headers" issue. Using a context manager for this guarantees +# that the original max headers value is restored, even when an uncaught exception is raised. 
+@contextlib.contextmanager
+def httplib_max_headers(number):
+    # Google App Engine seems to vendor a modified httplib that lacks _MAXHEADERS (and avoids this issue entirely)
+    if not hasattr(httplib, '_MAXHEADERS'):
+        yield
+        return
+    original_max_headers = httplib._MAXHEADERS
+    httplib._MAXHEADERS = number
+    # The finally clause is required: an exception in the with-body is re-raised at the yield
+    try:
+        yield
+    finally:
+        httplib._MAXHEADERS = original_max_headers
 
 
 # Class name has to be unique. Our convention is camel case the