diff --git a/fanficdownloader/BeautifulSoup.py b/fanficdownloader/BeautifulSoup.py index 31ff0e5f..4b17b853 100644 --- a/fanficdownloader/BeautifulSoup.py +++ b/fanficdownloader/BeautifulSoup.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - """Beautiful Soup Elixir and Tonic "The Screen-Scraper's Friend" @@ -81,7 +79,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. from __future__ import generators __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "3.0.8.1" +__version__ = "3.2.0" __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson" __license__ = "New-style BSD" @@ -533,6 +531,8 @@ class Tag(PageElement): self.name = name if attrs is None: attrs = [] + elif isinstance(attrs, dict): + attrs = attrs.items() self.attrs = attrs self.contents = [] self.setup(parent, previous) diff --git a/fanficdownloader/output.py b/fanficdownloader/output.py index 46b3d7e9..6cbd4fdc 100644 --- a/fanficdownloader/output.py +++ b/fanficdownloader/output.py @@ -464,6 +464,12 @@ def replaceNumberEntities(data): p = re.compile(r'&#(x?)(\d+);') return p.sub(unirepl, data) +def replaceNotEntities(data): + # not just \w or \S. regexp from c:\Python25\lib\sgmllib.py + # (or equiv), SGMLParser, entityref + p = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);') + return p.sub(r'&\1', data) + def removeEntities(text): # replace numeric versions of [&<>] with named versions. @@ -492,6 +498,15 @@ def removeEntities(text): except UnicodeDecodeError, ex: # for the pound symbol in constants.py text = text.replace(e, v.decode('utf-8')) + + # SGMLParser, and in turn, BeautifulStoneSoup doesn't parse + # entities terribly well and inserts (;) after something that + # it thinks might be an entity. AT&T becomes AT&T; All of my + # attempts to fix this by changing the input to + # BeautifulStoneSoup break something else instead. But at + # this point, there should be *no* real entities left, so finding + # these not-entities and removing them here should be safe. 
+ text = replaceNotEntities(text) # &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back. text = text.replace('&', '&').replace('&lt;', '<').replace('&gt;', '>')