From 95297b58e000016f1fe8de2b64dc30286de46a25 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Sat, 9 Jan 2021 16:01:57 -0600 Subject: [PATCH] CLI Only working with ffnet *only* reading from Chrome browser cache. --- fanficfare/adapters/adapter_fanfictionnet.py | 55 +++- fanficfare/chromagnon/LICENSE | 24 ++ fanficfare/chromagnon/SuperFastHash.py | 89 ++++++ fanficfare/chromagnon/__init__.py | 0 fanficfare/chromagnon/cacheAddress.py | 92 ++++++ fanficfare/chromagnon/cacheAddressTest.py | 64 ++++ fanficfare/chromagnon/cacheBlock.py | 86 ++++++ fanficfare/chromagnon/cacheData.py | 130 ++++++++ fanficfare/chromagnon/cacheEntry.py | 140 +++++++++ fanficfare/chromagnon/cacheParse.py | 293 +++++++++++++++++++ fanficfare/chromagnon/classicalOutput.py | 45 +++ fanficfare/chromagnon/columnOutput.py | 49 ++++ fanficfare/chromagnon/csvOutput.py | 44 +++ fanficfare/chromagnon/downloadParse.py | 106 +++++++ fanficfare/chromagnon/historyParse.py | 178 +++++++++++ fanficfare/chromagnon/jsonOutput.py | 42 +++ fanficfare/chromagnon/visitedLinks.py | 97 ++++++ 17 files changed, 1526 insertions(+), 8 deletions(-) create mode 100644 fanficfare/chromagnon/LICENSE create mode 100644 fanficfare/chromagnon/SuperFastHash.py create mode 100644 fanficfare/chromagnon/__init__.py create mode 100644 fanficfare/chromagnon/cacheAddress.py create mode 100644 fanficfare/chromagnon/cacheAddressTest.py create mode 100644 fanficfare/chromagnon/cacheBlock.py create mode 100644 fanficfare/chromagnon/cacheData.py create mode 100644 fanficfare/chromagnon/cacheEntry.py create mode 100644 fanficfare/chromagnon/cacheParse.py create mode 100644 fanficfare/chromagnon/classicalOutput.py create mode 100644 fanficfare/chromagnon/columnOutput.py create mode 100644 fanficfare/chromagnon/csvOutput.py create mode 100644 fanficfare/chromagnon/downloadParse.py create mode 100644 fanficfare/chromagnon/historyParse.py create mode 100644 fanficfare/chromagnon/jsonOutput.py create mode 100644 
fanficfare/chromagnon/visitedLinks.py diff --git a/fanficfare/adapters/adapter_fanfictionnet.py b/fanficfare/adapters/adapter_fanfictionnet.py index 66ead432..be51a5e9 100644 --- a/fanficfare/adapters/adapter_fanfictionnet.py +++ b/fanficfare/adapters/adapter_fanfictionnet.py @@ -25,6 +25,7 @@ import re from ..six import text_type as unicode from ..six.moves.urllib.error import HTTPError +from ..chromagnon.cacheParse import ChromeCache from .. import exceptions as exceptions from ..htmlcleanup import stripHTML @@ -60,6 +61,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): ## accept m(mobile)url, but use www. self.origurl = self.origurl.replace("https://m.","https://www.") + self.chromagnon_cache = None @staticmethod def getSiteDomain(): return 'www.fanfiction.net' @@ -75,14 +77,50 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): def getSiteURLPattern(self): return r"https?://(www|m)?\.fanfiction\.net/s/\d+(/\d+)?(/|/[^/]+)?/?$" + def _postUrl(self, url, + parameters={}, + headers={}, + extrasleep=None, + usecache=True): + logger.debug("_postUrl") + raise NotImplementedError + + def _fetchUrlRawOpened(self, url, + parameters=None, + extrasleep=None, + usecache=True, + referer=None): + logger.debug("_fetchUrlRawOpened") + raise NotImplementedError + + def _fetchUrlOpened(self, url, + parameters=None, + usecache=True, + extrasleep=None, + referer=None): + logger.debug("_fetchUrlOpened") + raise NotImplementedError + + def _fetchUrlRaw(self, url, + parameters=None, + extrasleep=None, + usecache=True, + referer=None): + ## This should be the one called for images. + logger.debug("_fetchUrlRaw") + raise NotImplementedError + def _fetchUrl(self,url,parameters=None,extrasleep=1.0,usecache=True): - ## ffnet(and, I assume, fpcom) tends to fail more if hit too - ## fast. This is in additional to what ever the - ## slow_down_sleep_time setting is. 
- return BaseSiteAdapter._fetchUrl(self,url, - parameters=parameters, - extrasleep=extrasleep, - usecache=usecache) + + if self.chromagnon_cache is None: + logger.debug("Start making self.chromagnon_cache") + self.chromagnon_cache = ChromeCache(self.getConfig("chrome_cache_path")) + logger.debug("Done making self.chromagnon_cache") + data = self.chromagnon_cache.get_cached_file(url) + logger.debug("%s:len(%s)"%(url,len(data))) + if data is None: + raise HTTPError(404,"Not found in Chrome Cache") + return self.configuration._decode(data) def use_pagecache(self): ''' @@ -103,8 +141,9 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): # use BeautifulSoup HTML parser to make everything easier to find. try: data = self._fetchUrl(url) - #logger.debug("\n===================\n%s\n===================\n"%data) + # logger.debug("\n===================\n%s\n===================\n"%data) soup = self.make_soup(data) + # logger.debug("\n===================\n%s\n===================\n"%soup) except HTTPError as e: if e.code == 404: raise exceptions.StoryDoesNotExist(url) diff --git a/fanficfare/chromagnon/LICENSE b/fanficfare/chromagnon/LICENSE new file mode 100644 index 00000000..0b1ea634 --- /dev/null +++ b/fanficfare/chromagnon/LICENSE @@ -0,0 +1,24 @@ +Copyright (c) 2012, Jean-Rémy Bancel +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the Chromagon Project nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/fanficfare/chromagnon/SuperFastHash.py b/fanficfare/chromagnon/SuperFastHash.py new file mode 100644 index 00000000..f6953500 --- /dev/null +++ b/fanficfare/chromagnon/SuperFastHash.py @@ -0,0 +1,89 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2012, Jean-Rémy Bancel +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the Chromagon Project nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +Python implementation of SuperFastHash algorithm +Maybe it is better to use c_uint32 to limit the size of variables to 32bits +instead of using 0xFFFFFFFF mask. +""" + +from __future__ import absolute_import +from __future__ import print_function +import binascii +import sys + +def get16bits(data): + """Returns the first 16bits of a string""" + return int(binascii.hexlify(data[1::-1]), 16) + +def superFastHash(data): + hash = length = len(data) + if length == 0: + return 0 + + rem = length & 3 + length >>= 2 + + while length > 0: + hash += get16bits(data) & 0xFFFFFFFF + tmp = (get16bits(data[2:])<< 11) ^ hash + hash = ((hash << 16) & 0xFFFFFFFF) ^ tmp + data = data[4:] + hash += hash >> 11 + hash = hash & 0xFFFFFFFF + length -= 1 + + if rem == 3: + hash += get16bits (data) + hash ^= (hash << 16) & 0xFFFFFFFF + hash ^= (int(binascii.hexlify(data[2]), 16) << 18) & 0xFFFFFFFF + hash += hash >> 11 + elif rem == 2: + hash += get16bits (data) + hash ^= (hash << 11) & 0xFFFFFFFF + hash += hash >> 17 + elif rem == 1: + hash += int(binascii.hexlify(data[0]), 16) + hash ^= (hash << 10) & 0xFFFFFFFF + hash += hash >> 1 + + hash = hash & 0xFFFFFFFF + hash ^= (hash << 3) & 0xFFFFFFFF + hash += hash >> 5 + hash = 
hash & 0xFFFFFFFF + hash ^= (hash << 4) & 0xFFFFFFFF + hash += hash >> 17 + hash = hash & 0xFFFFFFFF + hash ^= (hash << 25) & 0xFFFFFFFF + hash += hash >> 6 + hash = hash & 0xFFFFFFFF + + return hash + +if __name__ == "__main__": + print("%08x"%superFastHash(sys.argv[1])) diff --git a/fanficfare/chromagnon/__init__.py b/fanficfare/chromagnon/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/fanficfare/chromagnon/cacheAddress.py b/fanficfare/chromagnon/cacheAddress.py new file mode 100644 index 00000000..54410f59 --- /dev/null +++ b/fanficfare/chromagnon/cacheAddress.py @@ -0,0 +1,92 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2012, Jean-Rémy Bancel +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the Chromagon Project nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +Chrome Cache Address +See /net/disk_cache/addr.h for design details +""" + +class CacheAddressError(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return repr(self.value) + +class CacheAddress(): + """ + Object representing a Chrome Cache Address + """ + SEPARATE_FILE = 0 + RANKING_BLOCK = 1 + BLOCK_256 = 2 + BLOCK_1024 = 3 + BLOCK_4096 = 4 + + typeArray = [("Separate file", 0), + ("Ranking block file", 36), + ("256 bytes block file", 256), + ("1k bytes block file", 1024), + ("4k bytes block file", 4096)] + + def __init__(self, uint_32, path): + """ + Parse the 32 bits of the uint_32 + """ + if uint_32 == 0: + raise CacheAddressError("Null Address") + + #XXX Is self.binary useful ?? 
+ self.addr = uint_32 + self.path = path + + # Checking that the MSB is set + self.binary = bin(uint_32) + if len(self.binary) != 34: + raise CacheAddressError("Uninitialized Address") + + self.blockType = int(self.binary[3:6], 2) + + # If it is an address of a separate file + if self.blockType == CacheAddress.SEPARATE_FILE: + self.fileSelector = "f_%06x" % int(self.binary[6:], 2) + elif self.blockType == CacheAddress.RANKING_BLOCK: + self.fileSelector = "data_" + str(int(self.binary[10:18], 2)) + else: + self.entrySize = CacheAddress.typeArray[self.blockType][1] + self.contiguousBlock = int(self.binary[8:10], 2) + self.fileSelector = "data_" + str(int(self.binary[10:18], 2)) + self.blockNumber = int(self.binary[18:], 2) + + def __str__(self): + string = hex(self.addr) + " (" + if self.blockType >= CacheAddress.BLOCK_256: + string += str(self.contiguousBlock) +\ + " contiguous blocks in " + string += CacheAddress.typeArray[self.blockType][0] +\ + " : " + self.fileSelector + ")" + return string diff --git a/fanficfare/chromagnon/cacheAddressTest.py b/fanficfare/chromagnon/cacheAddressTest.py new file mode 100644 index 00000000..7d9c8d76 --- /dev/null +++ b/fanficfare/chromagnon/cacheAddressTest.py @@ -0,0 +1,64 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2012, Jean-Rémy Bancel +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of the Chromagon Project nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +import unittest + +from . 
import cacheAddress + +class CacheAddressTest(unittest.TestCase): + + def testFileType(self): + """Parse Block Type From Address""" + address = cacheAddress.CacheAddress(0x8000002A) + self.assertEqual(address.blockType, + cacheAddress.CacheAddress.SEPARATE_FILE) + address = cacheAddress.CacheAddress(0x9DFF0000) + self.assertEqual(address.blockType, + cacheAddress.CacheAddress.RANKING_BLOCK) + address = cacheAddress.CacheAddress(0xA0010003) + self.assertEqual(address.blockType, + cacheAddress.CacheAddress.BLOCK_256) + address = cacheAddress.CacheAddress(0xBDFF0108) + self.assertEqual(address.blockType, + cacheAddress.CacheAddress.BLOCK_1024) + address = cacheAddress.CacheAddress(0xCDFF0108) + self.assertEqual(address.blockType, + cacheAddress.CacheAddress.BLOCK_4096) + + def testFilename(self): + """Parse Filename from Address""" + address = cacheAddress.CacheAddress(0x8000002A) + self.assertEqual(address.fileSelector, + "f_0002A") + address = cacheAddress.CacheAddress(0xA001135C) + self.assertEqual(address.fileSelector, + "data_1") + +if __name__ == "__main__": + unittest.main() diff --git a/fanficfare/chromagnon/cacheBlock.py b/fanficfare/chromagnon/cacheBlock.py new file mode 100644 index 00000000..c7adf917 --- /dev/null +++ b/fanficfare/chromagnon/cacheBlock.py @@ -0,0 +1,86 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2012, Jean-Rémy Bancel +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of the Chromagon Project nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +Parse the header of a Chrome Cache File +See http://www.chromium.org/developers/design-documents/network-stack/disk-cache +for design details +""" +from __future__ import absolute_import +import struct +from six.moves import range + +class CacheBlock(): + """ + Object representing a block of the cache. It can be the index file or any + other block type : 256B, 1024B, 4096B, Ranking Block. + See /net/disk_cache/disk_format.h for details. 
+ """ + + INDEX_MAGIC = 0xC103CAC3 + BLOCK_MAGIC = 0xC104CAC3 + INDEX = 0 + BLOCK = 1 + + def __init__(self, filename): + """ + Parse the header of a cache file + """ + header = open(filename, 'rb') + + # Read Magic Number + magic = struct.unpack('I', header.read(4))[0] + if magic == CacheBlock.BLOCK_MAGIC: + self.type = CacheBlock.BLOCK + header.seek(2, 1) + self.version = struct.unpack('h', header.read(2))[0] + self.header = struct.unpack('h', header.read(2))[0] + self.nextFile = struct.unpack('h', header.read(2))[0] + self.blockSize = struct.unpack('I', header.read(4))[0] + self.entryCount = struct.unpack('I', header.read(4))[0] + self.entryMax = struct.unpack('I', header.read(4))[0] + self.empty = [] + for _ in range(4): + self.empty.append(struct.unpack('I', header.read(4))[0]) + self.position = [] + for _ in range(4): + self.position.append(struct.unpack('I', header.read(4))[0]) + elif magic == CacheBlock.INDEX_MAGIC: + self.type = CacheBlock.INDEX + header.seek(2, 1) + self.version = struct.unpack('h', header.read(2))[0] + self.entryCount = struct.unpack('I', header.read(4))[0] + self.byteCount = struct.unpack('I', header.read(4))[0] + self.lastFileCreated = "f_%06x" % \ + struct.unpack('I', header.read(4))[0] + header.seek(4*2, 1) + self.tableSize = struct.unpack('I', header.read(4))[0] + else: + header.close() + raise Exception("Invalid Chrome Cache File") + header.close() diff --git a/fanficfare/chromagnon/cacheData.py b/fanficfare/chromagnon/cacheData.py new file mode 100644 index 00000000..c56e2620 --- /dev/null +++ b/fanficfare/chromagnon/cacheData.py @@ -0,0 +1,130 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2012, Jean-Rémy Bancel +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the Chromagon Project nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +Stores the data fetched in the cache. +Parse the HTTP header if asked. +""" + +from __future__ import absolute_import +import re +import shutil +import struct +import os + +from . import cacheAddress +from six.moves import range + +class CacheData(): + """ + Retrieve data at the given address + Can save it to a separate file for export + """ + + HTTP_HEADER = 0 + UNKNOWN = 1 + + def __init__(self, address, size, isHTTPHeader=False): + """ + It is a lazy evaluation object : the file is open only if it is + needed. 
It can parse the HTTP header if asked to do so. + See net/http/http_util.cc LocateStartOfStatusLine and + LocateEndOfHeaders for details. + """ + self.size = size + self.address = address + self.type = CacheData.UNKNOWN + + if isHTTPHeader and\ + self.address.blockType != cacheAddress.CacheAddress.SEPARATE_FILE: + # Getting raw data + string = b"" + block = open(os.path.join(self.address.path,self.address.fileSelector), 'rb') + block.seek(8192 + self.address.blockNumber*self.address.entrySize) + for _ in range(self.size): + string += struct.unpack('c', block.read(1))[0] + block.close() + + # Finding the beginning of the request + start = re.search(b"HTTP", string) + if start == None: + return + else: + string = string[start.start():] + + # Finding the end (some null characters : verified by experience) + end = re.search(b"\x00\x00", string) + if end == None: + return + else: + string = string[:end.end()-2] + + # Creating the dictionary of headers + self.headers = {} + for line in string.split(b'\0'): + stripped = line.split(b':') + self.headers[stripped[0].lower()] = \ + b':'.join(stripped[1:]).strip() + self.type = CacheData.HTTP_HEADER + + def save(self, filename=None): + """Save the data to the specified filename""" + if self.address.blockType == cacheAddress.CacheAddress.SEPARATE_FILE: + shutil.copy(os.path.join(self.address.path,self.address.fileSelector), + filename) + else: + output = open(filename, 'wb') + block = open(os.path.join(self.address.path,self.address.fileSelector), 'rb') + block.seek(8192 + self.address.blockNumber*self.address.entrySize) + output.write(block.read(self.size)) + block.close() + output.close() + + def data(self): + """Returns a string representing the data""" + if self.address.blockType == cacheAddress.CacheAddress.SEPARATE_FILE: + with open(os.path.join(self.address.path,self.address.fileSelector), 'rb') as infile: + data = infile.read() + else: + block = open(os.path.join(self.address.path,self.address.fileSelector), 'rb') + 
block.seek(8192 + self.address.blockNumber*self.address.entrySize) + data = block.read(self.size).decode('utf-8') + block.close() + return data + + def __str__(self): + """ + Display the type of cacheData + """ + if self.type == CacheData.HTTP_HEADER: + if 'content-type' in self.headers: + return "HTTP Header %s" % self.headers['content-type'] + else: + return "HTTP Header" + else: + return "Data" diff --git a/fanficfare/chromagnon/cacheEntry.py b/fanficfare/chromagnon/cacheEntry.py new file mode 100644 index 00000000..480a4213 --- /dev/null +++ b/fanficfare/chromagnon/cacheEntry.py @@ -0,0 +1,140 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2012, Jean-Rémy Bancel +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the Chromagon Project nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +Chrome Cache Entry +See http://www.chromium.org/developers/design-documents/network-stack/disk-cache +for design details +""" + +from __future__ import absolute_import +import datetime +import struct +import os + +from . import cacheAddress +from . import cacheData +from six.moves import range + +class CacheEntry(): + """ + See /net/disk_cache/disk_format.h for details. + """ + STATE = ["Normal", + "Evicted (data were deleted)", + "Doomed (shit happened)"] + + def __init__(self, address): + """ + Parse a Chrome Cache Entry at the given address + """ + self.httpHeader = None + block = open(os.path.join(address.path,address.fileSelector), 'rb') + + # Going to the right entry + block.seek(8192 + address.blockNumber*address.entrySize) + + # Parsing basic fields + self.hash = struct.unpack('I', block.read(4))[0] + self.next = struct.unpack('I', block.read(4))[0] + self.rankingNode = struct.unpack('I', block.read(4))[0] + self.usageCounter = struct.unpack('I', block.read(4))[0] + self.reuseCounter = struct.unpack('I', block.read(4))[0] + self.state = struct.unpack('I', block.read(4))[0] + self.creationTime = datetime.datetime(1601, 1, 1) + \ + datetime.timedelta(microseconds=\ + struct.unpack('Q', block.read(8))[0]) + self.keyLength = struct.unpack('I', block.read(4))[0] + self.keyAddress = struct.unpack('I', block.read(4))[0] + + + dataSize = [] + for _ in range(4): + dataSize.append(struct.unpack('I', block.read(4))[0]) + + self.data = [] + for index in 
range(4): + addr = struct.unpack('I', block.read(4))[0] + try: + addr = cacheAddress.CacheAddress(addr, address.path) + self.data.append(cacheData.CacheData(addr, dataSize[index], + True)) + except cacheAddress.CacheAddressError: + pass + + # Find the HTTP header if there is one + for data in self.data: + if data.type == cacheData.CacheData.HTTP_HEADER: + self.httpHeader = data + break + + self.flags = struct.unpack('I', block.read(4))[0] + + # Skipping pad + block.seek(5*4, 1) + + # Reading local key + if self.keyAddress == 0: + self.key = block.read(self.keyLength).decode('ascii') + # Key stored elsewhere + else: + addr = cacheAddress.CacheAddress(self.keyAddress, address.path) + + # It is probably an HTTP header + self.key = cacheData.CacheData(addr, self.keyLength, True) + + block.close() + + def keyToStr(self): + """ + Since the key can be a string or a CacheData object, this function is an + utility to display the content of the key whatever type is it. + """ + if self.keyAddress == 0: + return self.key + else: + return self.key.data() + + def __str__(self): + string = "Hash: 0x%08x" % self.hash + '\n' + if self.next != 0: + string += "Next: 0x%08x" % self.next + '\n' + string += "Usage Counter: %d" % self.usageCounter + '\n'\ + "Reuse Counter: %d" % self.reuseCounter + '\n'\ + "Creation Time: %s" % self.creationTime + '\n' + if self.keyAddress != 0: + string += "Key Address: 0x%08x" % self.keyAddress + '\n' + string += "Key: %s" % self.key + '\n' + if self.flags != 0: + string += "Flags: 0x%08x" % self.flags + '\n' + string += "State: %s" % CacheEntry.STATE[self.state] + for data in self.data: + string += "\nData (%d bytes) at 0x%08x : %s" % (data.size, + data.address.addr, + data) + return string diff --git a/fanficfare/chromagnon/cacheParse.py b/fanficfare/chromagnon/cacheParse.py new file mode 100644 index 00000000..a21d6cc3 --- /dev/null +++ b/fanficfare/chromagnon/cacheParse.py @@ -0,0 +1,293 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright 
(c) 2012, Jean-Rémy Bancel +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the Chromagon Project nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +Parse the Chrome Cache File +See http://www.chromium.org/developers/design-documents/network-stack/disk-cache +for design details +""" + +from __future__ import absolute_import +from __future__ import print_function +import gzip +import os +import struct +import sys +import re +import brotli + +from . import csvOutput +from . 
import SuperFastHash + +from .cacheAddress import CacheAddress +from .cacheBlock import CacheBlock +from .cacheData import CacheData +from .cacheEntry import CacheEntry +from six.moves import range + + +def parse(path, urls=None): + """ + Reads the whole cache and store the collected data in a table + or find out if the given list of urls is in the cache. If yes it + return a list of the corresponding entries. + """ + # Verifying that the path end with / (What happen on windows?) + path = os.path.abspath(path) + cacheBlock = CacheBlock(os.path.join(path, "index")) + + # Checking type + if cacheBlock.type != CacheBlock.INDEX: + raise Exception("Invalid Index File") + + index = open(os.path.join(path, "index"), 'rb') + + # Skipping Header + index.seek(92*4) + + cache = [] + # If no url is specified, parse the whole cache + if urls == None: + for key in range(cacheBlock.tableSize): + raw = struct.unpack('I', index.read(4))[0] + if raw != 0: + entry = CacheEntry(CacheAddress(raw, path=path)) + # Checking if there is a next item in the bucket because + # such entries are not stored in the Index File so they will + # be ignored during iterative lookup in the hash table + while entry.next != 0: + cache.append(entry) + entry = CacheEntry(CacheAddress(entry.next, path=path)) + cache.append(entry) + else: + # Find the entry for each url + for url in urls: + url = bytes(url,'utf8') + # Compute the key and seeking to it + hash = SuperFastHash.superFastHash(url) + key = hash & (cacheBlock.tableSize - 1) + index.seek(92*4 + key*4) + + addr = struct.unpack('I', index.read(4))[0] + # Checking if the address is initialized (i.e. 
used) + if addr & 0x80000000 == 0: + print("%s is not in the cache" % url, file=sys.stderr) + + # Follow the chained list in the bucket + else: + entry = CacheEntry(CacheAddress(addr, path=path)) + while entry.hash != hash and entry.next != 0: + entry = CacheEntry(CacheAddress(entry.next, path=path)) + if entry.hash == hash: + cache.append(entry) + index.close() + return cache + +class ChromeCache(object): + def __init__(self,path): + self.cache = parse(path) + self.hash_cache = {} + for entry in self.cache: + key = entry.keyToStr() + self.hash_cache[key] = entry + normkey = re.sub(r'^(https://www.fanfiction.net/s/\d+/\d+/).+$',r'\1',key) + ## either overwrites (no harm), or adds new. + self.hash_cache[normkey] = entry + + def get_cached_file(self,url): + if url in self.hash_cache: + entry = self.hash_cache[url] + for i in range(len(entry.data)): + if entry.data[i].type == CacheData.UNKNOWN: + # Extracting data into a file + data = entry.data[i].data() + + # print("content-encoding:%s"%entry.httpHeader.headers.get(b'content-encoding','')) + if entry.httpHeader != None and \ + b'content-encoding' in entry.httpHeader.headers: + if entry.httpHeader.headers[b'content-encoding'] == b"gzip": + data = gzip.decompress(data) + elif entry.httpHeader.headers[b'content-encoding'] == b"br": + data = brotli.decompress(data) + return data + return None + +def exportToHTML(cache, outpath): + """ + Export the cache in html + """ + + # Checking that the directory exists and is writable + if not os.path.exists(outpath): + os.makedirs(outpath) + outpath = os.path.abspath(outpath) + + index = open(os.path.join(outpath,"index.html"), 'w') + index.write("
    ") + + for entry in cache: + # Adding a link in the index + if entry.keyLength > 100: + entry_name = entry.keyToStr()[:100] + "..." + else: + entry_name = entry.keyToStr() + index.write('
  • %s
  • '%(entry.hash, entry_name)) + # We handle the special case where entry_name ends with a slash + page_basename = entry_name.split('/')[-2] if entry_name.endswith('/') else entry_name.split('/')[-1] + + # Creating the entry page + page = open(os.path.join(outpath,"%08x.html"%entry.hash), 'w') + page.write(""" + + + + + """) + + # Details of the entry + page.write("Hash: 0x%08x
    "%entry.hash) + page.write("Usage Counter: %d
    "%entry.usageCounter) + page.write("Reuse Counter: %d
    "%entry.reuseCounter) + page.write("Creation Time: %s
    "%entry.creationTime) + page.write("Key: %s
    "%entry.keyToStr()) + page.write("State: %s
    "%CacheEntry.STATE[entry.state]) + + page.write("
    ") + ## entry.data normally 2 or 1 + ## 2 for headers and data, 1 for headers only. + if len(entry.data) == 0: + page.write("No data associated with this entry :-(") + for i in range(len(entry.data)): + if entry.data[i].type == CacheData.UNKNOWN: + # Extracting data into a file + name = hex(entry.hash) + "_" + str(i) + entry.data[i].save(os.path.join(outpath,name)) + + # print("content-encoding:%s"%entry.httpHeader.headers.get(b'content-encoding','')) + if entry.httpHeader != None and \ + b'content-encoding' in entry.httpHeader.headers: + if entry.httpHeader.headers[b'content-encoding'] == b"gzip": + # XXX Highly inefficient !!!!! + try: + input = gzip.open(os.path.join(outpath, name), 'rb') + output = open(os.path.join(outpath, name + "u"), 'wb') + output.write(input.read()) + input.close() + output.close() + page.write('%s'%(name, page_basename)) + # print("gunzip'ed:%s"%name) + except IOError: + page.write("Something wrong happened while unzipping") + elif entry.httpHeader.headers[b'content-encoding'] == b"br": + try: + with open(os.path.join(outpath,name), 'rb') as input: + with open(os.path.join(outpath,name + "u"), 'wb') as output: + output.write(brotli.decompress(input.read())) + page.write('%s'%(name, page_basename)) + # print("unbrotli'ed:%s"%name) + except IOError: + page.write("Something wrong happened while unzipping") + brotli + else: + page.write('%s'%(name , + entry.keyToStr().split('/')[-1])) + + + # If it is a picture, display it + if entry.httpHeader != None: + if b'content-type' in entry.httpHeader.headers and\ + b"image" in entry.httpHeader.headers[b'content-type']: + page.write('
    '%(name)) + # HTTP Header + else: + page.write("HTTP Header
    ") + for key, value in entry.data[i].headers.items(): + page.write("%s: %s
    "%(key, value)) + page.write("
    ") + page.write("") + page.close() + + index.write("
") + index.close() + +def exportTol2t(cache): + """ + Export the cache in CSV log2timeline compliant format + """ + + output = [] + output.append(["date", + "time", + "timezone", + "MACB", + "source", + "sourcetype", + "type", + "user", + "host", + "short", + "desc", + "version", + "filename", + "inode", + "notes", + "format", + "extra"]) + + for entry in cache: + date = entry.creationTime.date().strftime("%m/%d/%Y") + time = entry.creationTime.time() + # TODO get timezone + timezone = 0 + short = entry.keyToStr() + descr = "Hash: 0x%08x" % entry.hash + descr += " Usage Counter: %d" % entry.usageCounter + if entry.httpHeader != None: + if 'content-type' in entry.httpHeader.headers: + descr += " MIME: %s" % entry.httpHeader.headers['content-type'] + + output.append([date, + time, + timezone, + "MACB", + "WEBCACHE", + "Chrome Cache", + "Cache Entry", + "-", + "-", + short, + descr, + "2", + "-", + "-", + "-", + "-", + "-", + ]) + + csvOutput.csvOutput(output) diff --git a/fanficfare/chromagnon/classicalOutput.py b/fanficfare/chromagnon/classicalOutput.py new file mode 100644 index 00000000..c8310888 --- /dev/null +++ b/fanficfare/chromagnon/classicalOutput.py @@ -0,0 +1,45 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2012, Jean-Rémy Bancel +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
def classicalOutput(queryResult, separator="\t"):
    """
    Display the data separated by the specified separator

    queryResult: iterable of rows, each row being an iterable of elements
    separator: text written after every element (including the last
               element of a row)

    One line is written to sys.stdout per row. Nothing is returned.
    """

    for line in queryResult:
        for element in line:
            # sys.stdout.write() only accepts text and raises TypeError on
            # numbers, dates, None... Formatting with %s leaves text
            # elements untouched and converts everything else.
            sys.stdout.write("%s" % (element,))
            sys.stdout.write(separator)
        sys.stdout.write('\n')
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the Chromagon Project nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
def columnOutput(queryResult, separator=' '):
    """
    Display the data in columns

    queryResult: sequence of rows; every row must have as many elements as
                 the first one
    separator: text printed between two columns

    Each column is left-justified and padded to the width of its longest
    element. One line is printed per row. Nothing is returned.
    """
    if len(queryResult) == 0:
        return

    # Finding width of columns
    size = [max(len(str(line[i])) for line in queryResult)
            for i in range(len(queryResult[0]))]
    # Generating format string. The separator is joined between the column
    # formats rather than appended and sliced off afterwards: slicing with
    # [:-len(separator)] leaves an empty format when the separator is ''.
    # '%' is doubled so a separator containing it is printed literally.
    string = separator.replace('%', '%%').join("%%-%ds" % x for x in size)
    for line in queryResult:
        print(string % tuple(line))
def csvOutput(queryResult, separator=',', quote='"'):
    """
    Display the data according to csv format

    queryResult: iterable of rows to write to sys.stdout
    separator: field delimiter
    quote: character used to quote fields; fields are only quoted when
           they need to be (csv.QUOTE_MINIMAL)
    """
    writer = csv.writer(sys.stdout,
                        quoting=csv.QUOTE_MINIMAL,
                        quotechar=quote,
                        delimiter=separator)
    writer.writerows(queryResult)
def parse(filename, urlLength):
    """
    Read every row of the downloads table of a Chrome history file.

    filename: path to the history file
    urlLength: maximum url length to display

    Returns a list of DownloadEntry, one per row of the table. The program
    is terminated through sys.exit() if the database cannot be opened;
    errors raised while querying are propagated to the caller.
    """

    # Connecting to the DB
    try:
        history = sqlite3.connect(filename)
    except sqlite3.Error as error:
        print("==> Error while opening the history file !")
        # Exceptions have no .message attribute on Python 3
        print("==> Details :", error)
        sys.exit("==> Exiting...")

    try:
        # Retrieving all useful data
        result = history.execute("SELECT id, full_path, url, start_time, "
                                 "received_bytes, total_bytes, state "
                                 "FROM downloads;")
        return [DownloadEntry(line, urlLength) for line in result]
    finally:
        # Release the database file whatever happens above
        history.close()


class DownloadEntry(object):
    """Object to store download entries"""
    # Short column code -> name of the attribute holding the value
    COLUMN_STR = {'st': "startTime",
                  'p': "path",
                  'u': "url",
                  'rb': "receivedBytes",
                  'tb': "totalBytes",
                  'pt': "percentReceived",
                  's': "state"}
    # Label of each known value of the state column, indexed by value
    STATE_STR = ["In Progress",
                 "Complete",
                 "Cancelled",
                 "Removing",
                 "Interrupted"]

    def __init__(self, item, urlLength):
        """
        Parse raw input

        item: row as selected by parse() (id, full_path, url, start_time,
              received_bytes, total_bytes, state)
        urlLength: maximum url length to display, 0 or less to keep the
                   whole url
        """
        self.path = item[1]
        if len(item[2]) > urlLength and urlLength > 0:
            self.url = item[2][0:urlLength - 3] + "..."
        else:
            self.url = item[2]
        # start_time is a number of microseconds since 1601-01-01
        self.startTime = datetime.datetime(1601, 1, 1) + \
                         datetime.timedelta(microseconds=item[3])
        self.receivedBytes = item[4]
        self.totalBytes = item[5]
        state = item[6]
        if isinstance(state, int) and 0 <= state < len(DownloadEntry.STATE_STR):
            self.state = DownloadEntry.STATE_STR[state]
        else:
            # A value outside the table must neither raise IndexError nor,
            # when negative, silently pick a label from the end of the list
            self.state = "Unknown (%s)" % (state,)
        if int(item[5]) == 0:
            self.percentReceived = "0%"
        else:
            self.percentReceived = "%d%%" % \
                                   int(float(item[4])/float(item[5])*100)

    def columnToStr(self, column):
        """Returns column content specified by argument"""
        return six.text_type(getattr(self, DownloadEntry.COLUMN_STR[column]))
def _toMicroseconds(delta):
    """Return a timedelta as an exact integer number of microseconds"""
    return (delta.days * 86400 + delta.seconds) * 1000000 + delta.microseconds


def parse(filename, start, end, checkCache, cachePath, urlLength):
    """
    Read the visits recorded in a Chrome history file for a time window.

    filename: path to the history file
    start: beginning of the time window
    end: end of the time window
    checkCache: check if each page in the history is in the cache
    cachePath: path to cache directory
    urlLength: maximum url length to display

    Returns a list of HistoryEntry ordered by visit time. Both bounds of
    the window are exclusive. The program is terminated through sys.exit()
    if the database cannot be opened.
    """

    # Connecting to the DB
    try:
        history = sqlite3.connect(filename)
    except sqlite3.Error as error:
        print("==> Error while opening the history file !")
        # Exceptions have no .message attribute on Python 3
        print("==> Details :", error)
        sys.exit("==> Exiting...")

    # Timestamps are stored as microseconds since this date
    reference = datetime.datetime(1601, 1, 1)

    try:
        # Retrieving all useful data. The bounds are computed with integer
        # arithmetic (going through float seconds drops microseconds for
        # present-day dates) and bound as parameters instead of being
        # formatted into the statement.
        result = history.execute("SELECT visits.visit_time, "
                                 "visits.from_visit, "
                                 "visits.transition, "
                                 "urls.url, "
                                 "urls.title, "
                                 "urls.visit_count, "
                                 "urls.typed_count, "
                                 "urls.last_visit_time "
                                 "FROM urls,visits "
                                 "WHERE urls.id=visits.url "
                                 "AND visits.visit_time>? "
                                 "AND visits.visit_time<? "
                                 "ORDER BY visits.visit_time;",
                                 (_toMicroseconds(start - reference),
                                  _toMicroseconds(end - reference))).fetchall()
    finally:
        # Rows are fetched, the database file can be released
        history.close()

    # Parsing cache
    cache = None
    if checkCache:
        # Collect the keys once so every HistoryEntry does a set lookup
        # rather than scanning the whole cache
        cache = frozenset(entry.keyToStr()
                          for entry in cacheParse.parse(cachePath))

    return [HistoryEntry(line, cache, urlLength) for line in result]


class Transition(object):
    """Object representing transition between history pages"""

    # Label of the core transition type, indexed by its value
    CORE_STRING = ["Link",
                   "Typed",
                   "Auto Bookmark",
                   "Auto Subframe",
                   "Manual Subframe",
                   "Generated",
                   "Start Page",
                   "Form Submit",
                   "Reload",
                   "Keyword",
                   # was misspelled "Keywork Generated"
                   "Keyword Generated"]
    # (bit mask, label) of every qualifier flag
    QUALIFIER_STRING = [(0x01000000, "Forward or Back Button"),
                        (0x02000000, "Address Bar"),
                        (0x04000000, "Home Page"),
                        (0x10000000, "Beginning of Chain"),
                        (0x20000000, "End of Chain"),
                        (0x40000000, "Client Redirection"),
                        (0x80000000, "Server Redirection")]

    def __init__(self, transition):
        """
        Parsing the transition according to
        content/common/page_transition_types.h
        """
        # Low byte is the core type, the remaining bits are qualifier flags
        self.core = transition & 0xFF
        self.qualifier = transition & 0xFFFFFF00

    def __str__(self):
        if self.core < len(Transition.CORE_STRING):
            string = Transition.CORE_STRING[self.core]
        else:
            # A core type missing from the table must not raise IndexError
            string = "Unknown (%d)" % self.core
        for mask, description in Transition.QUALIFIER_STRING:
            if self.qualifier & mask != 0:
                string += ", %s"%description
        return string


class HistoryEntry(object):
    """Object to store database entries"""
    # Short column code -> name of the attribute holding the value
    COLUMN_STR = {'vt': "visitTime",
                  'fv': "fromVisit",
                  'tr': "transition",
                  'u': "url",
                  'tl': "title",
                  'vc': "visitCount",
                  'tc': "typedCount",
                  'lv': "lastVisitTime",
                  'cc': "inCache"}

    def __init__(self, item, cache, urlLength):
        """
        Parse raw input

        item: row as selected by parse() (visit_time, from_visit,
              transition, url, title, visit_count, typed_count,
              last_visit_time)
        cache: None to skip the cache check, otherwise either a set or
               frozenset of cache keys, or an iterable of objects whose
               keyToStr() method returns their key
        urlLength: maximum url length to display, 0 or less to keep the
                   whole url
        """
        reference = datetime.datetime(1601, 1, 1)
        fullUrl = item[3]

        self.visitTime = reference + datetime.timedelta(microseconds=item[0])
        self.fromVisit = item[1]
        self.transition = Transition(item[2])
        if len(fullUrl) > urlLength and urlLength > 0:
            self.url = fullUrl[0:urlLength - 3] + "..."
        else:
            self.url = fullUrl
        self.title = item[4]
        self.visitCount = item[5]
        self.typedCount = item[6]
        self.lastVisitTime = reference + \
                             datetime.timedelta(microseconds=item[7])

        # Searching in the cache if there is a copy of the page. The full
        # url is used: the url shortened for display ends with "..." and
        # can never be equal to a cache key.
        self.inCache = False
        if cache is not None:
            if isinstance(cache, (set, frozenset)):
                self.inCache = fullUrl in cache
            else:
                self.inCache = any(entry.keyToStr() == fullUrl
                                   for entry in cache)

    def toStr(self):
        """Returns the columns read from the database as a list of text"""
        return [six.text_type(value) for value in (self.visitTime,
                                                   self.fromVisit,
                                                   self.transition,
                                                   self.url,
                                                   self.title,
                                                   self.visitCount,
                                                   self.typedCount,
                                                   self.lastVisitTime)]

    def columnToStr(self, column):
        """Returns column content specified by argument"""
        return six.text_type(getattr(self, HistoryEntry.COLUMN_STR[column]))
def jsonOutput(queryResult, separator=''):
    """
    Display the data separated in JSON

    queryResult: data to print, encoded as a single JSON document
    separator: not used, accepted so every output function can be called
               the same way
    """
    encoded = json.dumps(queryResult)
    print(encoded)
VISITED_LINKS_MAGIC = 0x6b6e4c56

# Size in bytes of what precedes the hash table: four 4 byte integers
# (magic, version, table length, used items) and the 8 byte salt
_VISITED_LINKS_HEADER_SIZE = 24


def isVisited(path, urls):
    """
    Return the list of urls given in parameter with a boolean information
    about its presence in the given visited links file

    path: path to the visited links file
    urls: iterable of urls (text or bytes) to look for

    Returns a list of (url, visited) tuples in the order of urls.
    Raises Exception("Invalid file") if the file does not start with the
    expected magic number.
    """
    # Imported here because this module only imports the md5 module, which
    # no longer exists on Python 3
    import hashlib

    output = []

    # The with statement closes the file even when the checks below raise
    with open(path, 'rb') as f:
        # Checking file type
        magic = struct.unpack('I', f.read(4))[0]
        if magic != VISITED_LINKS_MAGIC:
            raise Exception("Invalid file")

        # Reading header values; only the table length is needed
        _version, length, _usedItems = struct.unpack('III', f.read(12))

        # Reading salt
        salt = f.read(8)

        tableEnd = length*8 + _VISITED_LINKS_HEADER_SIZE

        for url in urls:
            if length == 0:
                # Empty table: nothing was visited and there is no slot to
                # probe (the modulo below would divide by zero)
                output.append((url, False))
                continue

            rawUrl = url if isinstance(url, bytes) else url.encode('utf-8')
            digest = hashlib.md5(salt + rawUrl).digest()

            # The fingerprint is the first 8 bytes of the digest read as a
            # little-endian unsigned integer
            fingerprint = struct.unpack('<Q', digest[:8])[0]
            key = fingerprint % length
            start = key*8 + _VISITED_LINKS_HEADER_SIZE

            # The hash table uses open addressing
            f.seek(start, 0)
            while True:
                # Slots are read unsigned: read as signed, a fingerprint
                # with its top bit set would never compare equal
                finger = struct.unpack('Q', f.read(8))[0]
                if finger == 0:
                    output.append((url, False))
                    break
                if finger == fingerprint:
                    output.append((url, True))
                    break
                # Wrap around at the end of the table
                if f.tell() >= tableEnd:
                    f.seek(_VISITED_LINKS_HEADER_SIZE)
                # Back to the first probed slot: the table is full and the
                # url is not in it
                if f.tell() == start:
                    output.append((url, False))
                    break
    return output