Work around an SGMLParser/BeautifulStoneSoup entity bug: the parser incorrectly inserts a semicolon after text that merely looks like an entity, so "AT&T" becomes "AT&T;". Also update to the latest BeautifulSoup, which has one minor, unrelated change.

This commit is contained in:
retiefjimm 2010-12-19 12:21:55 -06:00
parent d3abfbf5d6
commit a32ccf7dac
2 changed files with 18 additions and 3 deletions

View file

@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
@ -81,7 +79,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
from __future__ import generators
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "3.0.8.1"
__version__ = "3.2.0"
__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
__license__ = "New-style BSD"
@ -533,6 +531,8 @@ class Tag(PageElement):
self.name = name
if attrs is None:
attrs = []
elif isinstance(attrs, dict):
attrs = attrs.items()
self.attrs = attrs
self.contents = []
self.setup(parent, previous)

View file

@ -464,6 +464,12 @@ def replaceNumberEntities(data):
p = re.compile(r'&#(x?)(\d+);')
return p.sub(unirepl, data)
# Matches '&name;' where name has the shape of an SGML entity name.  Not
# just \w or \S: the name part is the entityref regexp from sgmllib.py
# (SGMLParser), followed by the ';' terminator.  Compiled once at import
# time instead of on every call.
_NOT_ENTITY_RE = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')


def replaceNotEntities(data):
    """Drop the ';' from every '&name;' sequence found in data.

    The name must start with an ASCII letter and may continue with ASCII
    letters, digits, '-' or '.'.  Only the terminating ';' is removed; the
    '&' and the name are kept, so "AT&T;" becomes "AT&T".

    The match is purely syntactic: well-formed references such as "&lt;"
    lose their ';' as well.  Numeric references ("&#38;") and a bare '&'
    not directly followed by a letter are left untouched.

    Returns a new string; data itself is not modified.
    """
    return _NOT_ENTITY_RE.sub(r'&\1', data)
def removeEntities(text):
# replace numeric versions of [&<>] with named versions.
@ -492,6 +498,15 @@ def removeEntities(text):
except UnicodeDecodeError, ex:
# for the pound symbol in constants.py
text = text.replace(e, v.decode('utf-8'))
# SGMLParser, and in turn BeautifulStoneSoup, doesn't parse
# entities terribly well and inserts a (;) after anything that
# it thinks might be an entity: AT&T becomes AT&T;. All of my
# attempts to fix this by changing the input to
# BeautifulStoneSoup break something else instead. But at
# this point, there should be *no* real entities left, so finding
# these not-entities and removing the (;) here should be safe.
text = replaceNotEntities(text)
# &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')