Work around an SGMLParser/BeautifulStoneSoup entity bug: the parser incorrectly inserts a semicolon (;) after text it mistakes for an entity, so "AT&T" becomes "AT&T;". Also update to the latest BeautifulSoup, which has one minor, unrelated change.

2026-02-13 02:52:12 +01:00 · 2010-12-19 12:21:55 -06:00 · 2010-12-19 12:21:55 -06:00 · a32ccf7dac
commit a32ccf7dac
parent d3abfbf5d6
2 changed files with 18 additions and 3 deletions
--- a/fanficdownloader/BeautifulSoup.py
+++ b/fanficdownloader/BeautifulSoup.py
@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
 """Beautiful Soup
 Elixir and Tonic
 "The Screen-Scraper's Friend"
@ -81,7 +79,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
 from __future__ import generators

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "3.0.8.1"
+__version__ = "3.2.0"
 __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
 __license__ = "New-style BSD"

@ -533,6 +531,8 @@ class Tag(PageElement):
        self.name = name
        if attrs is None:
            attrs = []
+        elif isinstance(attrs, dict):
+            attrs = attrs.items()
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
--- a/fanficdownloader/output.py
+++ b/fanficdownloader/output.py
@ -464,6 +464,12 @@ def replaceNumberEntities(data):
 	p = re.compile(r'&#(x?)(\d+);')
 	return p.sub(unirepl, data)

+def replaceNotEntities(data):
+	# Not just \w or \S: the name pattern is the entityref regexp
+	# from SGMLParser in sgmllib.py (c:\Python25\lib\sgmllib.py or equiv).
+	p = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
+	return p.sub(r'&\1', data)
+
 def removeEntities(text):
 	# replace numeric versions of [&<>] with named versions.
 	
@ -492,6 +498,15 @@ def removeEntities(text):
 		except UnicodeDecodeError, ex:
 			# for the pound symbol in constants.py
 			text = text.replace(e, v.decode('utf-8'))
+
+	# SGMLParser, and in turn BeautifulStoneSoup, doesn't parse
+	# entities terribly well and inserts (;) after anything that
+	# it thinks might be an entity: "AT&T" becomes "AT&T;".  All of
+	# my attempts to fix this by changing the input to
+	# BeautifulStoneSoup broke something else instead.  But at
+	# this point there should be *no* real entities left, so finding
+	# these not-entities and stripping their (;) here should be safe.
+	text = replaceNotEntities(text)
 	
 	# &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
 	text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')