Add reformatting option fix_excess_space to lightnovelgate (#291)

This commit is contained in:
Dminti Snegirev 2018-04-12 18:14:23 +03:00 committed by Jim Miller
parent 8010d50ab8
commit 620f512d02
4 changed files with 56 additions and 3 deletions

View file

@ -1663,6 +1663,9 @@ extracategories:Lois & Clark: The New Adventures of Superman
## Clear FanFiction from defaults, site is original fiction.
extratags:
## Attempt to clean up the excess <p> and <br> tags found in the HTML of a great many stories.
fix_excess_space:true
[literotica.com]
extra_valid_entries:eroticatags,averrating
eroticatags_label:Erotica Tags

View file

@ -26,10 +26,23 @@ import urlparse
from base_adapter import BaseSiteAdapter, makeDate
from ..htmlcleanup import stripHTML
from bs4 import Comment
from ..htmlcleanup import removeEntities, stripHTML
from .. import exceptions as exceptions
logger = logging.getLogger(__name__)
# Known HTML element names, kept in alphabetical order with one group per
# initial letter.  getChapterText renames any tag whose name is not listed
# here to 'span'.
HTML_TAGS = (
    # a
    'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside',
    'audio',
    # b
    'b', 'base', 'basefont', 'bdi', 'bdo', 'big', 'blockquote', 'body', 'br',
    'button',
    # c
    'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
    # d
    'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl',
    'dt',
    # e
    'em', 'embed',
    # f
    'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame',
    'frameset',
    # h
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hr', 'html',
    # i
    'i', 'iframe', 'img', 'input', 'ins',
    # k
    'kbd',
    # l
    'label', 'legend', 'li', 'link',
    # m
    'main', 'map', 'mark', 'menu', 'menuitem', 'meta', 'meter',
    # n
    'nav', 'noframes', 'noscript',
    # o
    'object', 'ol', 'optgroup', 'option', 'output',
    # p
    'p', 'param', 'picture', 'pre', 'progress',
    # q
    'q',
    # r
    'rp', 'rt', 'ruby',
    # s
    's', 'samp', 'script', 'section', 'select', 'small', 'source', 'span',
    'strike', 'strong', 'style', 'sub', 'summary', 'sup', 'svg',
    # t
    'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th', 'thead',
    'time', 'title', 'tr', 'track', 'tt',
    # u
    'u', 'ul',
    # v
    'var', 'video',
    # w
    'wbr',
)
def getClass():
''' Initializing the class '''
@ -150,12 +163,44 @@ class LightNovelGateSiteAdapter(BaseSiteAdapter):
cdata.find('h2').extract()
self.setDescription(url, cdata)
def fixExcessSpace(self, data):
    ''' Collapse excess paragraph and line-break markup in chapter HTML.

    data is the chapter markup as text.  The result is the same markup in
    which every blank-line gap and every run of <p>, </p> and <br> tags
    (with the whitespace around them) has been replaced by a newline
    followed by a single <p>.
    '''
    # removeEntities goes first: it makes the extra space easier to strip
    # when p and br are merged below (presumably because entity-encoded
    # spaces become real whitespace -- confirm against htmlcleanup).
    flattened = removeEntities(data)

    # Some pages carry no <p> or <br/> at all, so two newlines separated
    # only by whitespace are promoted to a paragraph start.
    blank_line = re.compile(r"\n[ \s]*\n", re.UNICODE)
    flattened = blank_line.sub("\n<p>", flattened)

    # Every sequence of p and br tags is folded into one opening <p>.
    # The matching </p> is left for bs4 to supply when it parses the text.
    tag_run = re.compile(
        r"[ \s]*(</?p\b[^>]*>[ \s]*|<br\b[^>]*>[ \s]*)+", re.UNICODE)
    return tag_run.sub("\n<p>", flattened)
def getChapterText(self, url):
    ''' Download one chapter page and return its cleaned-up story markup.

    url is the address of the chapter page.  The return value is whatever
    self.utf8FromSoup produces for the div whose id is "vung_doc", after
    comments and links have been removed and non-standard tags renamed.

    Raises exceptions.FailedToDownload when that div is not on the page.
    '''
    data = self._fetchUrl(url)
    # Sometimes we get invalid characters; decoding with 'ignore' and
    # re-encoding drops whatever is not valid UTF-8.
    data = data.decode('utf-8', 'ignore').encode('utf-8')

    # fix_excess_space is looked up with True passed as the default.
    if self.getConfig('fix_excess_space', True):
        data = self.fixExcessSpace(data)

    soup = self.make_soup(data)
    story = soup.find('div', {'id': 'vung_doc'})
    if not story:
        raise exceptions.FailedToDownload(
            "Error downloading Chapter: %s! Missing required element!" % url)

    # Some of the comments we get are invalid.  Remove them all.
    for comment in story.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # We don't need links: their css is bad and most of the time they
    # do not work.
    for link in story.find_all('a'):
        link.extract()

    # Some tags have a non-standard name; turn those into plain spans.
    for tag in story.findAll(recursive=True):
        if tag.name not in HTML_TAGS:
            tag.name = 'span'

    return self.utf8FromSoup(url, story)

View file

@ -251,7 +251,8 @@ def get_valid_set_options():
'legend_spoilers':(base_xenforo_list,None,boollist),
'apocrypha_to_omake':(base_xenforo_list,None,boollist),
'replace_failed_smilies_with_alt_text':(base_xenforo_list,None,boollist),
'fix_pseudo_html': (['webnovel.com'], None, boollist)
'fix_pseudo_html': (['webnovel.com'], None, boollist),
'fix_excess_space': (['lightnovelgate.com'], ['epub', 'html'], boollist)
}
return dict(valdict)
@ -453,7 +454,8 @@ def get_valid_keywords():
'normalize_text_links',
'internalize_text_links',
'replace_failed_smilies_with_alt_text',
'fix_pseudo_html'
'fix_pseudo_html',
'fix_excess_space'
])
# *known* entry keywords -- or rather regexps for them.

View file

@ -1697,6 +1697,9 @@ extracategories:Lois & Clark: The New Adventures of Superman
## Clear FanFiction from defaults, site is original fiction.
extratags:
## Attempt to clean up the excess <p> and <br> tags found in the HTML of a great many stories.
fix_excess_space:true
[literotica.com]
extra_valid_entries:eroticatags,averrating
eroticatags_label:Erotica Tags