mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-15 21:32:28 +01:00
Work around an SGMLParser/BeautifulStoneSoup entity bug: the parser incorrectly inserts a semicolon (;) after text it mistakes for an entity, so "AT&T" becomes "AT&T;". Also update to the latest BeautifulSoup, which brings one minor, unrelated change.
This commit is contained in:
parent
d3abfbf5d6
commit
a32ccf7dac
2 changed files with 18 additions and 3 deletions
|
|
@ -1,5 +1,3 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""Beautiful Soup
|
||||
Elixir and Tonic
|
||||
"The Screen-Scraper's Friend"
|
||||
|
|
@ -81,7 +79,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
|
|||
from __future__ import generators
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "3.0.8.1"
|
||||
__version__ = "3.2.0"
|
||||
__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
|
||||
__license__ = "New-style BSD"
|
||||
|
||||
|
|
@ -533,6 +531,8 @@ class Tag(PageElement):
|
|||
self.name = name
|
||||
if attrs is None:
|
||||
attrs = []
|
||||
elif isinstance(attrs, dict):
|
||||
attrs = attrs.items()
|
||||
self.attrs = attrs
|
||||
self.contents = []
|
||||
self.setup(parent, previous)
|
||||
|
|
|
|||
|
|
@ -464,6 +464,12 @@ def replaceNumberEntities(data):
|
|||
p = re.compile(r'&#(x?)(\d+);')
|
||||
return p.sub(unirepl, data)
|
||||
|
||||
def replaceNotEntities(data):
    """Drop the trailing ';' from every '&name;' sequence found in data.

    A name is one ASCII letter followed by any run of letters, digits,
    '-' or '.'.  Each match keeps its '&' and name and loses only the
    semicolon; everything else in data is returned untouched.
    """
    # Deliberately narrower than \w or \S: per the original note, this is
    # the entityref expression used by SGMLParser in the stdlib sgmllib.py.
    not_entity = r'&([a-zA-Z][-.a-zA-Z0-9]*);'
    return re.sub(not_entity, r'&\1', data)
|
||||
|
||||
def removeEntities(text):
|
||||
# replace numeric versions of [&<>] with named versions.
|
||||
|
||||
|
|
@ -492,6 +498,15 @@ def removeEntities(text):
|
|||
except UnicodeDecodeError, ex:
|
||||
# for the pound symbol in constants.py
|
||||
text = text.replace(e, v.decode('utf-8'))
|
||||
|
||||
# SGMLParser, and in turn, BeautifulStoneSoup doesn't parse
|
||||
# entities terribly well and inserts (;) after something that
|
||||
# it thinks might be an entity. AT&T becomes AT&T; All of my
|
||||
# attempts to fix this by changing the input to
|
||||
# BeautifulStoneSoup break something else instead. But at
|
||||
# this point, there should be *no* real entities left, so finding
|
||||
# these not-entities and removing them here should be safe.
|
||||
text = replaceNotEntities(text)
|
||||
|
||||
# &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
|
||||
text = text.replace('&', '&').replace('&lt;', '<').replace('&gt;', '>')
|
||||
|
|
|
|||
Loading…
Reference in a new issue