Incorporating heuristic br->p tag processing as an optional feature.

This commit is contained in:
Jim Miller 2013-10-26 15:52:40 -05:00
parent 2195ea5792
commit df5a91daed
9 changed files with 149 additions and 16 deletions

View file

@ -303,7 +303,7 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
chapter=bs.BeautifulSoup('<div class="story"></div>')
chapter=bs.BeautifulSoup('<div class="story"></div>').find('div')
data = self._fetchUrl(url)
soup = bs.BeautifulSoup(data,selfClosingTags=('br','hr'))

View file

@ -180,7 +180,7 @@ class FictionPadSiteAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulSoup(self._fetchUrl(url))
soup = bs.BeautifulSoup("<div id='story'>"+self._fetchUrl(url)+"</div>")
return self.utf8FromSoup(url,soup)
def getClass():

View file

@ -166,10 +166,11 @@ class NickAndGregNetAdapter(BaseSiteAdapter):
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
div = soup.find('table', {'class' : 'tblborder6'})
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
# wrap a div around it.
divsoup = bs.BeautifulStoneSoup('<div class="story"></div>',
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
div = divsoup.find('div')
div.append(soup.find('table', {'class' : 'tblborder6'}))
return self.utf8FromSoup(url,div)

View file

@ -214,5 +214,6 @@ class SimplyUndeniableComAdapter(BaseSiteAdapter):
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
div.name='div'
return self.utf8FromSoup(url,div)

View file

@ -241,5 +241,6 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
if None == story:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
story.name='div'
return self.utf8FromSoup(url,story)

View file

@ -323,7 +323,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
</div>
'''
elif self.story.getMetadata('storyId') == '0':
text=u'''
text=u'''<div>
<h3>45. Pronglet Returns to Hogwarts: Chapter 7</h3>
<br />
eyes but Im not convinced we should automatically<br />
@ -332,6 +332,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
<br /><br />
Sure, invite her along. Does she have children?<br />
<br />
</div>
'''
else:
if self.story.getMetadata('storyId') == '667':

View file

@ -228,7 +228,8 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
span.name='div'
return self.utf8FromSoup(url,span)
def getClass():

View file

@ -26,6 +26,7 @@ from functools import partial
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from ..htmlheuristics import replace_br_with_p
logger = logging.getLogger(__name__)
@ -354,13 +355,8 @@ class BaseSiteAdapter(Configurable):
# removes paired, but empty tags.
if t.string != None and len(t.string.strip()) == 0 :
t.extract()
retval = soup.__str__('utf8').decode('utf-8')
if self.getConfig('replace_hr'):
# replacing a self-closing tag with a container tag in the
# soup is more difficult than it first appears. So cheat.
retval = retval.replace("<hr />","<div class='center'>* * *</div>")
retval = soup.__str__('utf8').decode('utf-8')
if self.getConfig('nook_img_fix'):
# if the <img> tag doesn't have a div or a p around it,
@ -371,7 +367,19 @@ class BaseSiteAdapter(Configurable):
# Don't want body tags in chapter html--writers add them.
# This is primarily for epub updates.
return re.sub(r"</?body>\r?\n?","",retval)
retval = re.sub(r"</?body>\r?\n?","",retval)
if self.getConfig("replace_br_with_p"):
# Apply heuristic processing to replace <br> paragraph
# breaks with <p> tags.
retval = replace_br_with_p(self,retval)
if self.getConfig('replace_hr'):
# replacing a self-closing tag with a container tag in the
# soup is more difficult than it first appears. So cheat.
retval = retval.replace("<hr />","<div class='center'>* * *</div>")
return retval
def cachedfetch(realfetch,cache,url):
if url in cache:

View file

@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
logger = logging.getLogger(__name__)
import re
from . import exceptions as exceptions
def replace_br_with_p(body):
    """Heuristically convert <br />-separated text into <p> paragraphs.

    ``body`` is an HTML fragment whose outermost tag is assumed to be a
    single container element (normally a div).  The most common
    run-length of consecutive <br /> tags is treated as a paragraph
    break; strictly longer runs are rendered as horizontal rules.
    Returns the reworked fragment wrapped in a fresh <div>.

    Raises ValueError (from str.index/str.rindex) if ``body`` does not
    start and end with a tag.
    """
    # Change the surrounding container tag to a p and drop its attrs.
    # The top surrounding tag in all cases now should be a div, so just
    # strip the first and last tags.
    body = u'<p>'+body[body.index('>')+1:body.rindex("<")]+u'</p>'

    # So many people add formatting to their HR tags, and ePub does not
    # allow those (css is supposed to be used instead).  This nukes the
    # hr tag attributes.
    body = re.sub(r'\s*<hr[^>]+>\s*', r'\n<hr />\n', body)

    # Normalize every br variant (BeautifulSoup may not return breaks
    # that aren't properly formatted as <br />) and strip surrounding
    # whitespace.
    body = re.sub(r'\s*<br[^>]*>\s*', r'<br />', body)

    # Remove leading and trailing breaks around hr tags.
    body = re.sub(r'\s*(<br\ \/>)*\s*<hr\ \/>\s*(<br\ \/>)*\s*', r'\n<hr />\n', body)

    # Nuke breaks leading paragraphs that may be in the body; they are
    # eventually treated as <p><br /></p>.
    body = re.sub(r'\s*(<br\ \/>)+\s*<p', r'\n<p></p>\n<p', body)
    # Nuke breaks trailing paragraphs likewise.
    body = re.sub(r'</p>\s*(<br\ \/>)+\s*', r'</p>\n<p></p>\n', body)

    # A leading or trailing non-break tag would confuse the run-counting
    # below, so temporarily rewrite <br /> as [br /] -- after protecting
    # any literal square brackets already present.
    body = body.replace(u'[',u'&squareBracketStart;')
    body = body.replace(u']',u'&squareBracketEnd;')
    body = body.replace(u'<br />',u'[br /]')

    # Patterns matching runs of exactly 1..8 consecutive breaks, plus a
    # catch-all for runs of 9 or more.  The lookaround-style groups
    # ([^\]] / [^\[]) pin the run length exactly.
    breaks_regexp = [re.compile(r'([^\]])(\[br\ \/\]){%d}([^\[])' % n)
                     for n in range(1, 9)]
    breaks_regexp.append(re.compile(r'(\[br\ \/\]){9,}'))

    # How often each run length (1..8) occurs in the body.
    breaks_count = [len(rx.findall(body)) for rx in breaks_regexp[:8]]
    # Index of the most frequent run length; ties and the all-zero case
    # resolve to the smallest index, matching a strictly-greater scan.
    breaks_max_index = max(range(len(breaks_count)),
                           key=breaks_count.__getitem__)

    # Replace runs no longer than the most frequent run with inverted
    # p tag pairs (paragraph breaks); longer runs become a horizontal
    # line between paragraphs.
    for i, rx in enumerate(breaks_regexp[:8]):
        if i <= breaks_max_index:
            body = rx.sub(r'\1</p>\n<p>\3', body)
        else:
            body = rx.sub(r'\1</p>\n<hr />\n<p>\3', body)
    body = breaks_regexp[8].sub(r'</p>\n<hr />\n<p>', body)

    # Revert the square-bracket protection.
    body = body.replace(u'[', u'<')
    body = body.replace(u']', u'>')
    body = body.replace(u'&squareBracketStart;', u'[')
    body = body.replace(u'&squareBracketEnd;', u']')

    # If for some reason a break made its way inside a paragraph,
    # replace it with an empty paragraph for the additional line
    # spacing.
    body = re.sub(r'<p>\s*(<br\ \/>)+', r'<p><br /></p>\n<p>', body)

    # Change empty p tags to include a br to force spacing.
    body = re.sub(r'<p>\s*</p>', r'<p><br/></p>', body)

    # Clean up hr tags, and add inverted p tag pairs around them.
    body = re.sub(r'\s*<hr\ \/>\s*', r'</p>\n<hr />\n<p>', body)

    # The previous regexp may cause trouble if the hr tag already had a
    # p tag pair around it, so repair that here.  Repeated opening p
    # tags are condensed to one; since the extra leading opening p tags
    # were just added, the last in such a chain must be the original --
    # keep its attributes if they are there.
    body = re.sub(r'\s*(<p[^>]*>\s*)+<p([^>]*)>\s*', r'\n<p\2>', body)
    # Repeated closing p tags are condensed to one.
    body = re.sub(r'\s*(<\/\s*p>\s*){2,}', r'</p>\n', body)

    # Superfluous cleaning: remove whitespace trailing opening p tags.
    body = re.sub(r'<p([^>]*)>\s*', r'<p\1>', body)
    # Superfluous cleaning: remove whitespace leading closing p tags;
    # this does not affect formatting.
    body = re.sub(r'\s*</p>', r'</p>', body)

    # Re-wrap in a div tag.
    body = u'<div>\n' + body + u'\n</div>'

    return body