Mirror of https://github.com/JimmXinu/FanFicFare.git
Incorporating heuristic br->p tag processing as an optional feature.
commit df5a91daed
parent 2195ea5792
9 changed files with 149 additions and 16 deletions
@@ -303,7 +303,7 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
     def getChapterText(self, url):
         logger.debug('Getting chapter text from: %s' % url)
 
-        chapter=bs.BeautifulSoup('<div class="story"></div>')
+        chapter=bs.BeautifulSoup('<div class="story"></div>').find('div')
         data = self._fetchUrl(url)
         soup = bs.BeautifulSoup(data,selfClosingTags=('br','hr'))
 
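
What the added .find('div') buys (a minimal sketch, assuming the bundled BeautifulSoup 3 API and Python 2, as in the rest of the codebase): bs.BeautifulSoup(...) returns the whole parse tree, while .find('div') returns just the div Tag, so appended chapter content ends up under a bare top-level <div> that the new heuristic can later strip.

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<div class="story"></div>')
    print type(soup)        # the whole parse tree, not just the div
    chapter = soup.find('div')
    print type(chapter)     # Tag: just the <div class="story"> element
    chapter.append('chapter text')
    print chapter           # <div class="story">chapter text</div>
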
@@ -180,7 +180,7 @@ class FictionPadSiteAdapter(BaseSiteAdapter):
 
     def getChapterText(self, url):
         logger.debug('Getting chapter text from: %s' % url)
-        soup = bs.BeautifulSoup(self._fetchUrl(url))
+        soup = bs.BeautifulSoup("<div id='story'>"+self._fetchUrl(url)+"</div>")
         return self.utf8FromSoup(url,soup)
 
     def getClass():
 
@@ -166,10 +166,11 @@ class NickAndGregNetAdapter(BaseSiteAdapter):
 
         soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                      selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
 
-        div = soup.find('table', {'class' : 'tblborder6'})
-
-        if None == div:
-            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
-
+        # wrap a div around it.
+        divsoup = bs.BeautifulStoneSoup('<div class="story"></div>',
+                                        selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
+        div = divsoup.find('div')
+        div.append(soup.find('table', {'class' : 'tblborder6'}))
+
         return self.utf8FromSoup(url,div)
 
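
Why selfClosingTags keeps showing up (a sketch against the bundled BeautifulSoup 3): BeautifulStoneSoup has no built-in HTML tag table, so without the hint it turns <br>/<hr> into container tags and swallows the following text into them; this is what the inline "soup eats the br/hr tags" comment refers to.

    from BeautifulSoup import BeautifulStoneSoup

    html = '<div>one<br />two<hr />three</div>'
    print BeautifulStoneSoup(html)
    # br/hr come back as containers wrapping the following text
    print BeautifulStoneSoup(html, selfClosingTags=('br','hr'))
    # <div>one<br />two<hr />three</div>
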
@@ -214,5 +214,6 @@ class SimplyUndeniableComAdapter(BaseSiteAdapter):
 
         if None == div:
             raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
+        div.name='div'
 
         return self.utf8FromSoup(url,div)
 
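
The new name='div' lines (here and in the siye.co.uk and whofic.com hunks below) rely on BeautifulSoup 3 tags being renameable in place; a minimal sketch with hypothetical input showing that assignment re-labels the found element while keeping its attributes and contents:

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<span class="chapter">Once upon a time...</span>')
    story = soup.find('span')
    story.name = 'div'    # re-label the element in place
    print story           # <div class="chapter">Once upon a time...</div>
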
@@ -241,5 +241,6 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
 
         if None == story:
             raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
+        story.name='div'
 
         return self.utf8FromSoup(url,story)
 
@@ -323,7 +323,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
 </div>
 '''
         elif self.story.getMetadata('storyId') == '0':
-            text=u'''
+            text=u'''<div>
 <h3>45. Pronglet Returns to Hogwarts: Chapter 7</h3>
 <br />
 eyes… but I’m not convinced we should automatically<br />
 
@@ -332,6 +332,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
 <br /><br />
 “Sure, invite her along. Does she have children?”<br />
 <br />
+</div>
 '''
         else:
             if self.story.getMetadata('storyId') == '667':
 
@@ -228,7 +228,8 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
 
         if None == span:
             raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
 
+        span.name='div'
         return self.utf8FromSoup(url,span)
 
     def getClass():
 
@@ -26,6 +26,7 @@ from functools import partial
 
 from .. import BeautifulSoup as bs
 from ..htmlcleanup import stripHTML
+from ..htmlheuristics import replace_br_with_p
 
 logger = logging.getLogger(__name__)
 
@@ -354,13 +355,8 @@ class BaseSiteAdapter(Configurable):
             # removes paired, but empty tags.
             if t.string != None and len(t.string.strip()) == 0 :
                 t.extract()
 
-        retval = soup.__str__('utf8').decode('utf-8')
-
-        if self.getConfig('replace_hr'):
-            # replacing a self-closing tag with a container tag in the
-            # soup is more difficult than it first appears. So cheat.
-            retval = retval.replace("<hr />","<div class='center'>* * *</div>")
+        retval = soup.__str__('utf8').decode('utf-8')
 
         if self.getConfig('nook_img_fix'):
             # if the <img> tag doesn't have a div or a p around it,
 
@@ -371,7 +367,19 @@ class BaseSiteAdapter(Configurable):
 
         # Don't want body tags in chapter html--writers add them.
         # This is primarily for epub updates.
-        return re.sub(r"</?body>\r?\n?","",retval)
+        retval = re.sub(r"</?body>\r?\n?","",retval)
+
+        if self.getConfig("replace_br_with_p"):
+            # Apply heuristic processing to replace <br> paragraph
+            # breaks with <p> tags.
+            retval = replace_br_with_p(retval)
+
+        if self.getConfig('replace_hr'):
+            # replacing a self-closing tag with a container tag in the
+            # soup is more difficult than it first appears. So cheat.
+            retval = retval.replace("<hr />","<div class='center'>* * *</div>")
+
+        return retval
 
 def cachedfetch(realfetch,cache,url):
     if url in cache:
 
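
Note that replace_hr now runs after the new heuristic, so the <hr /> tags that replace_br_with_p itself emits get converted as well. How the option would presumably be switched on (a personal.ini sketch; only the replace_br_with_p option name is taken from the getConfig call above, the section layout follows the usual defaults.ini/personal.ini convention):

    [defaults]
    ## heuristic <br> -> <p> conversion is optional; leave it off globally
    replace_br_with_p:false

    [www.fictionpad.com]
    ## ...and turn it on only for sites that mark paragraphs with <br> runs
    replace_br_with_p:true
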
fanficdownloader/htmlheuristics.py (new file, 120 lines)
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2013 Fanficdownloader team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import logging
+logger = logging.getLogger(__name__)
+import re
+
+from . import exceptions as exceptions
+
+def replace_br_with_p(body):
+
+    # Change the surrounding div to a p and remove its attrs.  The top
+    # surrounding tag should now be a div in all cases, so just strip
+    # the first and last tags.
+    body = u'<p>'+body[body.index('>')+1:body.rindex("<")]+u'</p>'
+
+    # So many people add formatting to their hr tags, and epub does not
+    # allow that; we are supposed to use css.  This nukes the hr tag
+    # attributes.
+    body = re.sub(r'\s*<hr[^>]+>\s*', r'\n<hr />\n', body)
+
+    # Normalize br tags; need to look at BeautifulSoup to see if it will
+    # even return breaks that aren't properly formatted (<br />).
+    body = re.sub(r'\s*<br[^>]*>\s*', r'<br />', body)
+
+    # Remove breaks leading and trailing hr tags.
+    body = re.sub(r'\s*(<br\ \/>)*\s*<hr\ \/>\s*(<br\ \/>)*\s*', r'\n<hr />\n', body)
+    # Nuke breaks leading p tags that may be in the body; they are
+    # eventually treated as <p><br /></p>.
+    body = re.sub(r'\s*(<br\ \/>)+\s*<p', r'\n<p></p>\n<p', body)
+    # Nuke breaks trailing p tags that may be in the body; they are
+    # eventually treated as <p><br /></p>.
+    body = re.sub(r'</p>\s*(<br\ \/>)+\s*', r'</p>\n<p></p>\n', body)
+
+    # Because a leading or trailing non-break tag would break the
+    # following code, we have to mess around rather badly for a few
+    # lines: hide real square brackets, then rewrite br tags as
+    # bracketed placeholders.
+    body = body.replace(u'[',u'&squareBracketStart;')
+    body = body.replace(u']',u'&squareBracketEnd;')
+    body = body.replace(u'<br />',u'[br /]')
+
+    breaksRegexp = [
+        re.compile(r'([^\]])(\[br\ \/\])([^\[])'),
+        re.compile(r'([^\]])(\[br\ \/\]){2}([^\[])'),
+        re.compile(r'([^\]])(\[br\ \/\]){3}([^\[])'),
+        re.compile(r'([^\]])(\[br\ \/\]){4}([^\[])'),
+        re.compile(r'([^\]])(\[br\ \/\]){5}([^\[])'),
+        re.compile(r'([^\]])(\[br\ \/\]){6}([^\[])'),
+        re.compile(r'([^\]])(\[br\ \/\]){7}([^\[])'),
+        re.compile(r'([^\]])(\[br\ \/\]){8}([^\[])'),
+        re.compile(r'(\[br\ \/\]){9,}')]
+
+    breaksCount = [
+        len(breaksRegexp[0].findall(body)),
+        len(breaksRegexp[1].findall(body)),
+        len(breaksRegexp[2].findall(body)),
+        len(breaksRegexp[3].findall(body)),
+        len(breaksRegexp[4].findall(body)),
+        len(breaksRegexp[5].findall(body)),
+        len(breaksRegexp[6].findall(body)),
+        len(breaksRegexp[7].findall(body))]
+
+    breaksMax = 0
+    breaksMaxIndex = 0
+
+    for i in range(len(breaksCount)):
+        if breaksCount[i] > breaksMax:
+            breaksMax = breaksCount[i]
+            breaksMaxIndex = i
+
+    # Find all instances of consecutive breaks up to the run length used
+    # most often and replace those with closing/opening p tag pairs;
+    # runs with more consecutive breaks are replaced with a horizontal
+    # line.
+    for i in range(len(breaksCount)):
+        if i <= breaksMaxIndex:
+            body = breaksRegexp[i].sub(r'\1</p>\n<p>\3', body)
+        else:
+            body = breaksRegexp[i].sub(r'\1</p>\n<hr />\n<p>\3', body)
+
+    body = breaksRegexp[8].sub(r'</p>\n<hr />\n<p>', body)
+
+    # Revert the square brackets.
+    body = body.replace(u'[', u'<')
+    body = body.replace(u']', u'>')
+    body = body.replace(u'&squareBracketStart;', u'[')
+    body = body.replace(u'&squareBracketEnd;', u']')
+
+    # If for some reason a stray break makes its way inside a paragraph,
+    # replace it with an empty paragraph for the additional line spacing.
+    body = re.sub(r'<p>\s*(<br\ \/>)+', r'<p><br /></p>\n<p>', body)
+
+    # Change empty p tags to include a br to force spacing.
+    body = re.sub(r'<p>\s*</p>', r'<p><br/></p>', body)
+
+    # Clean up hr tags and add closing/opening p tag pairs around them.
+    body = re.sub(r'\s*<hr\ \/>\s*', r'</p>\n<hr />\n<p>', body)
+
+    # The previous regexp may cause trouble if the hr tag already had a
+    # p tag pair around it, so we need to repair that.  Repeated opening
+    # p tags are condensed to one.  As we added the extra leading opening
+    # p tags, we can safely assume that the last in such a chain must be
+    # the original; keep its attributes if they are there.
+    body = re.sub(r'\s*(<p[^>]*>\s*)+<p([^>]*)>\s*', r'\n<p\2>', body)
+    # Repeated closing p tags are condensed to one.
+    body = re.sub(r'\s*(<\/\s*p>\s*){2,}', r'</p>\n', body)
+
+    # Superfluous cleaning: remove whitespace trailing opening p tags.
+    # This does affect formatting.
+    body = re.sub(r'<p([^>]*)>\s*', r'<p\1>', body)
+    # Superfluous cleaning: remove whitespace leading closing p tags.
+    # This does not affect formatting.
+    body = re.sub(r'\s*</p>', r'</p>', body)
+
+    # re-wrap in div tag.
+    body = u'<div>\n' + body + u'\n</div>'
+
+    return body
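
A worked example of the heuristic end to end (hypothetical driver code; the import path follows the new module's location): two-break runs are the most common here, so they become paragraph breaks, while the longer four-break run is promoted to a horizontal rule.

    from fanficdownloader.htmlheuristics import replace_br_with_p

    chapter = ('<div>One<br /><br />Two<br /><br />Three'
               '<br /><br /><br /><br />The End</div>')
    print replace_br_with_p(chapter)
    # <div>
    # <p>One</p>
    # <p>Two</p>
    # <p>Three</p>
    # <hr />
    # <p>The End</p>
    # </div>
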