From ba1df457abb0e03720189e9879ef1bf641a2a9f9 Mon Sep 17 00:00:00 2001
From: Miroslav Stampar <miroslav.stampar@gmail.com>
Date: Mon, 16 May 2011 19:26:58 +0000
Subject: [PATCH] =?UTF-8?q?fix=20for=20a=20charset=20euc=5Ftw=20reported?=
 =?UTF-8?q?=20by=20devon.mitchell1988@y=E2=80=8Bahoo.com?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lib/request/basic.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/request/basic.py b/lib/request/basic.py
index 056d02595..842cda39a 100644
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -97,7 +97,7 @@ def checkCharEncoding(encoding):
         return encoding
 
     # http://www.destructor.de/charsets/index.htm
-    translate = { 'windows-874': 'iso-8859-11', 'en_us': 'utf8', 'macintosh': 'iso-8859-1' }
+    translate = { 'windows-874': 'iso-8859-11', 'en_us': 'utf8', 'macintosh': 'iso-8859-1', 'euc_tw': 'big5_tw' }
 
     for delimiter in (';', ','):
         if delimiter in encoding:
@@ -110,12 +110,16 @@ def checkCharEncoding(encoding):
         encoding = encoding.replace('5889', '8859') # iso-5889 -> iso-8859
     elif '2313' in encoding:
         encoding = encoding.replace('2313', '2312') # gb2313 -> gb2312
+    elif 'x-euc' in encoding:
+        encoding = encoding.replace('x-euc', 'euc') # x-euc-kr -> euc-kr
 
     # name adjustment for compatibility
     if encoding.startswith('8859'):
         encoding = 'iso-%s' % encoding
     elif encoding.startswith('cp-'):
         encoding = 'cp%s' % encoding[3:]
+    elif encoding.startswith('euc-'):
+        encoding = 'euc_%s' % encoding[4:]
     elif encoding.startswith('windows') and not encoding.startswith('windows-'):
         encoding = 'windows-%s' % encoding[7:]
     elif encoding.find('iso-88') > 0:
@@ -185,7 +189,7 @@ def decodePage(page, contentEncoding, contentType):
 
     if contentType and any(map(lambda x: x in contentType.lower(), ('text/txt', 'text/raw', 'text/html', 'text/xml'))):
         # can't do for all responses because we need to support binary files too
-        kb.pageEncoding = kb.pageEncoding or getHeuristicCharEncoding(page)
+        kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
         page = getUnicode(page, kb.pageEncoding)
 
     return page