mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-04-28 01:42:34 +02:00
Add reformatting option fix_excess_space to lightnovelgate (#291)
This commit is contained in:
parent
8010d50ab8
commit
620f512d02
4 changed files with 56 additions and 3 deletions
|
|
@ -1663,6 +1663,9 @@ extracategories:Lois & Clark: The New Adventures of Superman
|
|||
## Clear FanFiction from defaults, site is original fiction.
|
||||
extratags:
|
||||
|
||||
## Attempt to remove the excess <p> and <br> tags found in the HTML of a great many stories
|
||||
fix_excess_space:true
|
||||
|
||||
[literotica.com]
|
||||
extra_valid_entries:eroticatags,averrating
|
||||
eroticatags_label:Erotica Tags
|
||||
|
|
|
|||
|
|
@ -26,10 +26,23 @@ import urlparse
|
|||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
from ..htmlcleanup import stripHTML
|
||||
from bs4 import Comment
|
||||
from ..htmlcleanup import removeEntities, stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# Known HTML element names, in alphabetical order.  getChapterText() checks
# each tag name in the chapter body against this tuple and renames any tag
# that is not listed here to 'span'.
HTML_TAGS = (
    'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside',
    'audio',
    'b', 'base', 'basefont', 'bdi', 'bdo', 'big', 'blockquote', 'body', 'br',
    'button',
    'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
    'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl',
    'dt',
    'em', 'embed',
    'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame',
    'frameset',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hr', 'html',
    'i', 'iframe', 'img', 'input', 'ins',
    'kbd',
    'label', 'legend', 'li', 'link',
    'main', 'map', 'mark', 'menu', 'menuitem', 'meta', 'meter',
    'nav', 'noframes', 'noscript',
    'object', 'ol', 'optgroup', 'option', 'output',
    'p', 'param', 'picture', 'pre', 'progress',
    'q',
    'rp', 'rt', 'ruby',
    's', 'samp', 'script', 'section', 'select', 'small', 'source', 'span',
    'strike', 'strong', 'style', 'sub', 'summary', 'sup', 'svg',
    'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th', 'thead',
    'time', 'title', 'tr', 'track', 'tt',
    'u', 'ul',
    'var', 'video',
    'wbr',
)
|
||||
|
||||
|
||||
def getClass():
|
||||
''' Initializing the class '''
|
||||
|
|
@ -150,12 +163,44 @@ class LightNovelGateSiteAdapter(BaseSiteAdapter):
|
|||
cdata.find('h2').extract()
|
||||
self.setDescription(url, cdata)
|
||||
|
||||
def fixExcessSpace(self, data):
    ''' Collapse excess paragraph/line-break markup in raw chapter HTML.

    The input is first passed through removeEntities(), which makes the
    whitespace around <p>/<br> tags easier to strip.  Two regex rewrites
    are then applied in order, and the rewritten markup is returned.
    '''
    markup = removeEntities(data)

    rewrites = (
        # Some chapters have no <p> or <br/> tags at all, so a blank line
        # (two newlines with only whitespace between them) becomes a <p>.
        (r"\n[ \s]*\n", "\n<p>"),
        # Any consecutive run of <p>, </p> and <br> tags, together with the
        # whitespace around them, is merged into one <p>.  Closing </p>
        # tags are left for the HTML parser to supply (per the original
        # author's note that bs4 creates them on its own).
        (r"[ \s]*(</?p\b[^>]*>[ \s]*|<br\b[^>]*>[ \s]*)+", "\n<p>"),
    )
    for pattern, replacement in rewrites:
        markup = re.sub(pattern, replacement, markup, flags=re.UNICODE)

    return markup
|
||||
|
||||
def getChapterText(self, url):
    ''' Fetch one chapter page and return its cleaned-up story markup.

    The page is downloaded, optionally run through fixExcessSpace() (when
    the 'fix_excess_space' option is enabled; it defaults to True here),
    parsed, and the <div id="vung_doc"> element is extracted.  Comments
    and links are removed from it and tags with unknown names are renamed
    to 'span' before the result is handed to utf8FromSoup().

    Raises exceptions.FailedToDownload when the page has no
    <div id="vung_doc"> element.
    '''
    data = self._fetchUrl(url)

    # Sometimes we get invalid characters: round-trip through UTF-8,
    # ignoring anything that fails to decode.
    data = data.decode('utf-8', 'ignore').encode('utf-8')

    if self.getConfig('fix_excess_space', True):
        data = self.fixExcessSpace(data)

    soup = self.make_soup(data)

    story = soup.find('div', {'id': 'vung_doc'})
    if not story:
        raise exceptions.FailedToDownload(
            "Error downloading Chapter: %s! Missing required element!" % url)

    # Some of the comments we get are invalid. Remove them all.
    # (Plain loops: extract() is called only for its side effect.)
    for comment in story.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # We don't need links. They have bad CSS and most of the time they
    # don't work.
    for link in story.find_all('a'):
        link.extract()

    # Some tags have non-standard names; turn those into plain spans.
    for tag in story.find_all(recursive=True):
        if tag.name not in HTML_TAGS:
            tag.name = 'span'

    return self.utf8FromSoup(url, story)
|
||||
|
|
|
|||
|
|
@ -251,7 +251,8 @@ def get_valid_set_options():
|
|||
'legend_spoilers':(base_xenforo_list,None,boollist),
|
||||
'apocrypha_to_omake':(base_xenforo_list,None,boollist),
|
||||
'replace_failed_smilies_with_alt_text':(base_xenforo_list,None,boollist),
|
||||
'fix_pseudo_html': (['webnovel.com'], None, boollist)
|
||||
'fix_pseudo_html': (['webnovel.com'], None, boollist),
|
||||
'fix_excess_space': (['lightnovelgate.com'], ['epub', 'html'], boollist)
|
||||
}
|
||||
|
||||
return dict(valdict)
|
||||
|
|
@ -453,7 +454,8 @@ def get_valid_keywords():
|
|||
'normalize_text_links',
|
||||
'internalize_text_links',
|
||||
'replace_failed_smilies_with_alt_text',
|
||||
'fix_pseudo_html'
|
||||
'fix_pseudo_html',
|
||||
'fix_excess_space'
|
||||
])
|
||||
|
||||
# *known* entry keywords -- or rather regexps for them.
|
||||
|
|
|
|||
|
|
@ -1697,6 +1697,9 @@ extracategories:Lois & Clark: The New Adventures of Superman
|
|||
## Clear FanFiction from defaults, site is original fiction.
|
||||
extratags:
|
||||
|
||||
## Attempt to remove the excess <p> and <br> tags found in the HTML of a great many stories
|
||||
fix_excess_space:true
|
||||
|
||||
[literotica.com]
|
||||
extra_valid_entries:eroticatags,averrating
|
||||
eroticatags_label:Erotica Tags
|
||||
|
|
|
|||
Loading…
Reference in a new issue