From aa81ed4033dd4244a77ab1c129606a73e705e961 Mon Sep 17 00:00:00 2001 From: Miroslav Stampar Date: Tue, 4 Jan 2011 15:49:20 +0000 Subject: [PATCH] implementation of a feature suggested by pan@knownsec.com (usage of charset type from http-equiv attribute in case when charset is not defined in headers) --- doc/THANKS | 4 ++-- lib/core/settings.py | 2 ++ lib/request/basic.py | 13 ++++++++++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/doc/THANKS b/doc/THANKS index 1f1733e00..c0c954b37 100644 --- a/doc/THANKS +++ b/doc/THANKS @@ -267,8 +267,8 @@ Simone Onofri Windows Shaohua Pan - for reporting few bugs - for suggesting a feature + for reporting several bugs + for suggesting a few features Antonio Parata for providing me with some ideas for the PHP backdoor diff --git a/lib/core/settings.py b/lib/core/settings.py index 5a755990f..a8935c3ee 100644 --- a/lib/core/settings.py +++ b/lib/core/settings.py @@ -169,3 +169,5 @@ ERROR_PARSING_REGEXES = ( r"
  • Error Type:
    (?P.+?)
  • ", r"error '[0-9a-f]{8}'((<[^>]+>)|\s)+(?P[^<>]+)" ) + +META_CHARSET_REGEX = r'[^"]+)" />' diff --git a/lib/request/basic.py b/lib/request/basic.py index ebc5691c6..9e23c5fde 100644 --- a/lib/request/basic.py +++ b/lib/request/basic.py @@ -15,6 +15,7 @@ import StringIO import zlib from lib.core.common import extractErrorMessage +from lib.core.common import extractRegexResult from lib.core.common import getCompiledRegex from lib.core.common import getUnicode from lib.core.common import isWindowsDriveLetterPath @@ -23,6 +24,7 @@ from lib.core.common import sanitizeAsciiString from lib.core.data import conf from lib.core.data import kb from lib.core.data import logger +from lib.core.settings import META_CHARSET_REGEX from lib.parse.headers import headersParser from lib.parse.html import htmlParser @@ -127,12 +129,17 @@ def decodePage(page, contentEncoding, contentType): page = data.read() + charset = None + # http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode if contentType and (contentType.find('charset=') != -1): - charset = checkCharEncoding(contentType.split('charset=')[-1]) + charset = contentType.split('charset=')[-1] + elif extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE): + charset = extractRegexResult(META_CHARSET_REGEX, page, re.DOTALL | re.IGNORECASE) - if charset: - kb.pageEncoding = charset + charset = checkCharEncoding(charset) + if charset: + kb.pageEncoding = charset return getUnicode(page)