diff --git a/calibre-plugin/__init__.py b/calibre-plugin/__init__.py index a59cecfe..bd75cf1c 100644 --- a/calibre-plugin/__init__.py +++ b/calibre-plugin/__init__.py @@ -27,7 +27,7 @@ class FanFictionDownLoaderBase(InterfaceActionBase): description = 'UI plugin to download FanFiction stories from various sites.' supported_platforms = ['windows', 'osx', 'linux'] author = 'Jim Miller' - version = (1, 5, 15) + version = (1, 5, 16) minimum_calibre_version = (0, 8, 30) #: This field defines the GUI plugin class that contains all the code diff --git a/fanficdownloader/adapters/adapter_test1.py b/fanficdownloader/adapters/adapter_test1.py index 0496227b..c6557370 100644 --- a/fanficdownloader/adapters/adapter_test1.py +++ b/fanficdownloader/adapters/adapter_test1.py @@ -184,6 +184,7 @@ br breaks

Puella Magi Madoka Magica/魔法少女まどか★マギカ
br breaks

+Don't&#8212e;ver&#8212d;o&#8212;that&#8212a;gain, &#27861; &#xe9;
horizontal rules
diff --git a/fanficdownloader/htmlcleanup.py b/fanficdownloader/htmlcleanup.py index 2bf42803..4dfb306c 100644 --- a/fanficdownloader/htmlcleanup.py +++ b/fanficdownloader/htmlcleanup.py @@ -19,12 +19,21 @@ import re def _unirepl(match): "Return the unicode string for a decimal number" - if match.group(1)=='x': + if match.group(1).startswith('x'): radix=16 + s = match.group(1)[1:] else: radix=10 - value = int(match.group(2), radix) - return "%s%s"%(unichr(value),match.group(3)) + s = match.group(1) + try: + value = int(s, radix) + retval = "%s%s"%(unichr(value),match.group(2)) + except: + # This way, at least if there's more of entities out there + # that fail, it doesn't blow the entire download. + print "Numeric entity translation failed, skipping: &#x%s%s"%(match.group(1),match.group(2)) + retval = "" + return retval def _replaceNumberEntities(data): # The same brokenish entity parsing in SGMLParser that inserts ';' @@ -33,7 +42,8 @@ def _replaceNumberEntities(data): # "Don't&#8212;ever&#8212;do&#8212;that&#8212;again," becomes # "Don't&#8212e;ver&#8212d;o&#8212;that&#8212a;gain," # Also need to allow for 5 digit decimal entities &#27861; - p = re.compile(r'&#(x?)([0-9]{,5}|[0-9a-fA-F]{,4})([0-9a-fA-F]*?);') + # Last expression didn't allow for 2 digit hex correctly: &#xe9; + p = re.compile(r'&#(x[0-9a-fA-F]{,4}|[0-9]{,5})([0-9a-fA-F]*?);') return p.sub(_unirepl, data) def _replaceNotEntities(data):