From ba1df457abb0e03720189e9879ef1bf641a2a9f9 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Mon, 16 May 2011 19:26:58 +0000 Subject: [PATCH] =?UTF-8?q?fix=20for=20a=20charset=20euc=5Ftw=20reported?= =?UTF-8?q?=20by=20devon.mitchell1988@y=E2=80=8Bahoo.com?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/request/basic.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/request/basic.py b/lib/request/basic.py index 056d02595..842cda39a 100644 --- a/lib/request/basic.py +++ b/lib/request/basic.py @@ -97,7 +97,7 @@ def checkCharEncoding(encoding): return encoding # http://www.destructor.de/charsets/index.htm - translate = { 'windows-874': 'iso-8859-11', 'en_us': 'utf8', 'macintosh': 'iso-8859-1' } + translate = { 'windows-874': 'iso-8859-11', 'en_us': 'utf8', 'macintosh': 'iso-8859-1', 'euc_tw': 'big5_tw' } for delimiter in (';', ','): if delimiter in encoding: @@ -110,12 +110,16 @@ def checkCharEncoding(encoding): encoding = encoding.replace('5889', '8859') # iso-5889 -> iso-8859 elif '2313' in encoding: encoding = encoding.replace('2313', '2312') # gb2313 -> gb2312 + elif 'x-euc' in encoding: + encoding = encoding.replace('x-euc', 'euc') # x-euc-kr -> euc-kr # name adjustment for compatibility if encoding.startswith('8859'): encoding = 'iso-%s' % encoding elif encoding.startswith('cp-'): encoding = 'cp%s' % encoding[3:] + elif encoding.startswith('euc-'): + encoding = 'euc_%s' % encoding[4:] elif encoding.startswith('windows') and not encoding.startswith('windows-'): encoding = 'windows-%s' % encoding[7:] elif encoding.find('iso-88') > 0: @@ -185,7 +189,7 @@ def decodePage(page, contentEncoding, contentType): if contentType and any(map(lambda x: x in contentType.lower(), ('text/txt', 'text/raw', 'text/html', 'text/xml'))): # can't do for all responses because we need to support binary files too - kb.pageEncoding = kb.pageEncoding or getHeuristicCharEncoding(page) + kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page)) page = getUnicode(page, kb.pageEncoding) return page