From aa81ed4033dd4244a77ab1c129606a73e705e961 Mon Sep 17 00:00:00 2001
From: Miroslav Stampar <miroslav.stampar@gmail.com>
Date: Tue, 4 Jan 2011 15:49:20 +0000
Subject: [PATCH] implementation of a feature suggested by pan@knownsec.com
 (use the charset from the meta http-equiv attribute when no charset is
 defined in the headers)

---
 doc/THANKS           |  4 ++--
 lib/core/settings.py |  2 ++
 lib/request/basic.py | 13 ++++++++++---
 3 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/doc/THANKS b/doc/THANKS
index 1f1733e00..c0c954b37 100644
--- a/doc/THANKS
+++ b/doc/THANKS
@@ -267,8 +267,8 @@ Simone Onofri <simone.onofri@gmail.com>
     Windows
 
 Shaohua Pan <pan@knownsec.com>
-    for reporting few bugs
-    for suggesting a feature
+    for reporting several bugs
+    for suggesting a few features
 
 Antonio Parata <s4tan@ictsc.it>
     for providing me with some ideas for the PHP backdoor
diff --git a/lib/core/settings.py b/lib/core/settings.py
index 5a755990f..a8935c3ee 100644
--- a/lib/core/settings.py
+++ b/lib/core/settings.py
@@ -169,3 +169,5 @@ ERROR_PARSING_REGEXES = (
                             r"<li>Error Type:<br>(?P<result>.+?)</li>", 
                             r"error '[0-9a-f]{8}'((<[^>]+>)|\s)+(?P<result>[^<>]+)"
                         )
+
+META_CHARSET_REGEX = r'<meta[^>]+charset=["\']?(?P<result>[^"\'\s;>/]+)'  # tolerant of quote style, attribute order, ">" vs "/>" and HTML5 <meta charset=...>
diff --git a/lib/request/basic.py b/lib/request/basic.py
index ebc5691c6..9e23c5fde 100644
--- a/lib/request/basic.py
+++ b/lib/request/basic.py
@@ -15,6 +15,7 @@ import StringIO
 import zlib
 
 from lib.core.common import extractErrorMessage
+from lib.core.common import extractRegexResult
 from lib.core.common import getCompiledRegex
 from lib.core.common import getUnicode
 from lib.core.common import isWindowsDriveLetterPath
@@ -23,6 +24,7 @@ from lib.core.common import sanitizeAsciiString
 from lib.core.data import conf
 from lib.core.data import kb
 from lib.core.data import logger
+from lib.core.settings import META_CHARSET_REGEX
 from lib.parse.headers import headersParser
 from lib.parse.html import htmlParser
 
@@ -127,12 +129,17 @@ def decodePage(page, contentEncoding, contentType):
 
         page = data.read()
 
+    charset = None
+
     # http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
     if contentType and (contentType.find('charset=') != -1):
-        charset = checkCharEncoding(contentType.split('charset=')[-1])
+        charset = contentType.split('charset=')[-1]
+    elif extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE):
+        charset = extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE)
 
-        if charset:
-            kb.pageEncoding = charset
+    charset = checkCharEncoding(charset)
+    if charset:
+        kb.pageEncoding = charset
 
     return getUnicode(page)