CLI only: works with ffnet, and *only* by reading from the Chrome browser cache.

This commit is contained in:
Jim Miller 2021-01-09 16:01:57 -06:00
parent 10a7cf8aa7
commit 95297b58e0
17 changed files with 1526 additions and 8 deletions

View file

@ -25,6 +25,7 @@ import re
from ..six import text_type as unicode
from ..six.moves.urllib.error import HTTPError
from ..chromagnon.cacheParse import ChromeCache
from .. import exceptions as exceptions
from ..htmlcleanup import stripHTML
@ -60,6 +61,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
## accept m(mobile)url, but use www.
self.origurl = self.origurl.replace("https://m.","https://www.")
self.chromagnon_cache = None
@staticmethod
def getSiteDomain():
return 'www.fanfiction.net'
@ -75,14 +77,50 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?://(www|m)?\.fanfiction\.net/s/\d+(/\d+)?(/|/[^/]+)?/?$"
    def _postUrl(self, url,
                 parameters={},
                 headers={},
                 extrasleep=None,
                 usecache=True):
        """
        Disabled in Chrome-cache-only mode: the adapter can only replay
        responses already present in the browser cache, so nothing can be
        POSTed.  Always raises NotImplementedError.
        """
        # NOTE(review): mutable default args; harmless here since the stub
        # never mutates them.
        logger.debug("_postUrl")
        raise NotImplementedError
    def _fetchUrlRawOpened(self, url,
                           parameters=None,
                           extrasleep=None,
                           usecache=True,
                           referer=None):
        """
        Disabled in Chrome-cache-only mode -- no live network fetches.
        Always raises NotImplementedError.
        """
        logger.debug("_fetchUrlRawOpened")
        raise NotImplementedError
    def _fetchUrlOpened(self, url,
                        parameters=None,
                        usecache=True,
                        extrasleep=None,
                        referer=None):
        """
        Disabled in Chrome-cache-only mode -- no live network fetches.
        Always raises NotImplementedError.
        """
        logger.debug("_fetchUrlOpened")
        raise NotImplementedError
    def _fetchUrlRaw(self, url,
                     parameters=None,
                     extrasleep=None,
                     usecache=True,
                     referer=None):
        """
        Disabled in Chrome-cache-only mode -- no live network fetches.
        Always raises NotImplementedError.
        """
        ## This should be the one called for images.
        logger.debug("_fetchUrlRaw")
        raise NotImplementedError
def _fetchUrl(self,url,parameters=None,extrasleep=1.0,usecache=True):
## ffnet(and, I assume, fpcom) tends to fail more if hit too
## fast. This is in additional to what ever the
## slow_down_sleep_time setting is.
return BaseSiteAdapter._fetchUrl(self,url,
parameters=parameters,
extrasleep=extrasleep,
usecache=usecache)
if self.chromagnon_cache is None:
logger.debug("Start making self.chromagnon_cache")
self.chromagnon_cache = ChromeCache(self.getConfig("chrome_cache_path"))
logger.debug("Done making self.chromagnon_cache")
data = self.chromagnon_cache.get_cached_file(url)
logger.debug("%s:len(%s)"%(url,len(data)))
if data is None:
raise HTTPError(404,"Not found in Chrome Cache")
return self.configuration._decode(data)
def use_pagecache(self):
'''
@ -103,8 +141,9 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
# use BeautifulSoup HTML parser to make everything easier to find.
try:
data = self._fetchUrl(url)
#logger.debug("\n===================\n%s\n===================\n"%data)
# logger.debug("\n===================\n%s\n===================\n"%data)
soup = self.make_soup(data)
# logger.debug("\n===================\n%s\n===================\n"%soup)
except HTTPError as e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(url)

View file

@ -0,0 +1,24 @@
Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Chromagon Project nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View file

@ -0,0 +1,89 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Python implementation of SuperFastHash algorithm
Maybe it is better to use c_uint32 to limit the size of variables to 32bits
instead of using 0xFFFFFFFF mask.
"""
from __future__ import absolute_import
from __future__ import print_function
import binascii
import sys
def get16bits(data):
    """Return the first two bytes of *data* as a little-endian 16 bit int."""
    return int(binascii.hexlify(data[1::-1]), 16)


def superFastHash(data):
    """
    Return Paul Hsieh's SuperFastHash of the byte string *data* as a
    32 bit unsigned integer (the hash Chrome uses for its cache index).

    *data* must be a byte string.  Single-byte accesses below use slices
    (data[i:i+1]) rather than indexing: on Python 3 indexing a bytes
    object yields an int, and binascii.hexlify(int) raised TypeError for
    any input whose length was not a multiple of 4.
    """
    hash = length = len(data)
    if length == 0:
        return 0

    rem = length & 3          # 0-3 trailing bytes
    length >>= 2              # number of whole 32 bit words

    # Main loop: consume four bytes per round.
    while length > 0:
        hash += get16bits(data) & 0xFFFFFFFF
        tmp = (get16bits(data[2:]) << 11) ^ hash
        hash = ((hash << 16) & 0xFFFFFFFF) ^ tmp
        data = data[4:]
        hash += hash >> 11
        hash = hash & 0xFFFFFFFF
        length -= 1

    # Tail: mix in the 1-3 remaining bytes.
    if rem == 3:
        hash += get16bits(data)
        hash ^= (hash << 16) & 0xFFFFFFFF
        # data[2:3] (not data[2]): keep a bytes object for hexlify.
        hash ^= (int(binascii.hexlify(data[2:3]), 16) << 18) & 0xFFFFFFFF
        hash += hash >> 11
    elif rem == 2:
        hash += get16bits(data)
        hash ^= (hash << 11) & 0xFFFFFFFF
        hash += hash >> 17
    elif rem == 1:
        hash += int(binascii.hexlify(data[0:1]), 16)
        hash ^= (hash << 10) & 0xFFFFFFFF
        hash += hash >> 1

    # Final avalanche of the remaining bits.
    hash = hash & 0xFFFFFFFF
    hash ^= (hash << 3) & 0xFFFFFFFF
    hash += hash >> 5
    hash = hash & 0xFFFFFFFF
    hash ^= (hash << 4) & 0xFFFFFFFF
    hash += hash >> 17
    hash = hash & 0xFFFFFFFF
    hash ^= (hash << 25) & 0xFFFFFFFF
    hash += hash >> 6
    hash = hash & 0xFFFFFFFF
    return hash
if __name__ == "__main__":
    # The hash operates on byte strings; encode the command-line argument
    # so this also works on Python 3, where sys.argv holds str.
    print("%08x" % superFastHash(sys.argv[1].encode('utf-8')))

View file

View file

@ -0,0 +1,92 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Chrome Cache Address
See /net/disk_cache/addr.h for design details
"""
class CacheAddressError(Exception):
    """Raised for invalid (null or uninitialized) cache addresses."""

    def __init__(self, value):
        # Keep the offending value around for display by __str__.
        self.value = value

    def __str__(self):
        return "%r" % (self.value,)
class CacheAddress():
    """
    Object representing a Chrome Cache Address.

    The 32 bit address encodes where an entry lives: either a dedicated
    external file ("f_######") or a slot inside one of the block files
    ("data_#").  See net/disk_cache/addr.h in Chromium for the layout.
    """
    SEPARATE_FILE = 0
    RANKING_BLOCK = 1
    BLOCK_256 = 2
    BLOCK_1024 = 3
    BLOCK_4096 = 4

    # (label, entry size in bytes), indexed by block type.
    typeArray = [("Separate file", 0),
                 ("Ranking block file", 36),
                 ("256 bytes block file", 256),
                 ("1k bytes block file", 1024),
                 ("4k bytes block file", 4096)]

    def __init__(self, uint_32, path):
        """
        Decode the 32 bit address *uint_32*; *path* is the cache directory.

        Raises CacheAddressError for a null or uninitialized address.
        """
        if uint_32 == 0:
            raise CacheAddressError("Null Address")
        #XXX Is self.binary useful ??
        self.addr = uint_32
        self.path = path
        self.binary = bin(uint_32)
        # The most significant bit flags an initialized address.
        if not (uint_32 & 0x80000000):
            raise CacheAddressError("Uninitialized Address")
        # Bits 30-28 carry the block type.
        self.blockType = (uint_32 >> 28) & 0x7
        if self.blockType == CacheAddress.SEPARATE_FILE:
            # Bits 27-0: number of the external "f_" file.
            self.fileSelector = "f_%06x" % (uint_32 & 0x0FFFFFFF)
        elif self.blockType == CacheAddress.RANKING_BLOCK:
            # Bits 23-16: number of the "data_" block file.
            self.fileSelector = "data_" + str((uint_32 >> 16) & 0xFF)
        else:
            self.entrySize = CacheAddress.typeArray[self.blockType][1]
            # Bits 25-24: how many contiguous blocks the entry spans.
            self.contiguousBlock = (uint_32 >> 24) & 0x3
            self.fileSelector = "data_" + str((uint_32 >> 16) & 0xFF)
            # Bits 15-0: index of the first block inside the file.
            self.blockNumber = uint_32 & 0xFFFF

    def __str__(self):
        """Human readable form: address, block count and backing file."""
        parts = [hex(self.addr), " ("]
        if self.blockType >= CacheAddress.BLOCK_256:
            parts.append(str(self.contiguousBlock))
            parts.append(" contiguous blocks in ")
        parts.append(CacheAddress.typeArray[self.blockType][0])
        parts.append(" : " + self.fileSelector + ")")
        return "".join(parts)

View file

@ -0,0 +1,64 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from __future__ import absolute_import
import unittest
from . import cacheAddress
class CacheAddressTest(unittest.TestCase):
    """Unit tests for cacheAddress.CacheAddress.

    CacheAddress requires the cache directory path as a second argument,
    so every construction below passes a dummy path (the attributes under
    test do not touch the filesystem).
    """

    def testFileType(self):
        """Parse Block Type From Address"""
        address = cacheAddress.CacheAddress(0x8000002A, "dummy")
        self.assertEqual(address.blockType,
                         cacheAddress.CacheAddress.SEPARATE_FILE)
        address = cacheAddress.CacheAddress(0x9DFF0000, "dummy")
        self.assertEqual(address.blockType,
                         cacheAddress.CacheAddress.RANKING_BLOCK)
        address = cacheAddress.CacheAddress(0xA0010003, "dummy")
        self.assertEqual(address.blockType,
                         cacheAddress.CacheAddress.BLOCK_256)
        address = cacheAddress.CacheAddress(0xBDFF0108, "dummy")
        self.assertEqual(address.blockType,
                         cacheAddress.CacheAddress.BLOCK_1024)
        address = cacheAddress.CacheAddress(0xCDFF0108, "dummy")
        self.assertEqual(address.blockType,
                         cacheAddress.CacheAddress.BLOCK_4096)

    def testFilename(self):
        """Parse Filename from Address"""
        # "f_%06x" produces lower-case hex padded to six digits, so the
        # expected selector is "f_00002a" (not "f_0002A").
        address = cacheAddress.CacheAddress(0x8000002A, "dummy")
        self.assertEqual(address.fileSelector,
                         "f_00002a")
        address = cacheAddress.CacheAddress(0xA001135C, "dummy")
        self.assertEqual(address.fileSelector,
                         "data_1")
# Allow running this test module directly.
if __name__ == "__main__":
    unittest.main()

View file

@ -0,0 +1,86 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Parse the header of a Chrome Cache File
See http://www.chromium.org/developers/design-documents/network-stack/disk-cache
for design details
"""
from __future__ import absolute_import
import struct
from six.moves import range
class CacheBlock():
    """
    A block of the Chrome cache: either the index file or one of the
    block files (256B, 1024B, 4096B, Ranking Block).

    Only the header is parsed; see /net/disk_cache/disk_format.h for the
    on-disk layout.
    """
    INDEX_MAGIC = 0xC103CAC3
    BLOCK_MAGIC = 0xC104CAC3
    INDEX = 0
    BLOCK = 1

    def __init__(self, filename):
        """
        Parse the header of the cache file *filename*.

        Raises Exception when the magic number matches neither an index
        nor a block file.
        """
        with open(filename, 'rb') as header:
            # The first four bytes identify the file kind.
            magic = struct.unpack('I', header.read(4))[0]
            if magic == CacheBlock.BLOCK_MAGIC:
                self.type = CacheBlock.BLOCK
                header.seek(2, 1)  # skip the minor version
                self.version = struct.unpack('h', header.read(2))[0]
                self.header = struct.unpack('h', header.read(2))[0]
                self.nextFile = struct.unpack('h', header.read(2))[0]
                self.blockSize = struct.unpack('I', header.read(4))[0]
                self.entryCount = struct.unpack('I', header.read(4))[0]
                self.entryMax = struct.unpack('I', header.read(4))[0]
                self.empty = [struct.unpack('I', header.read(4))[0]
                              for _ in range(4)]
                self.position = [struct.unpack('I', header.read(4))[0]
                                 for _ in range(4)]
            elif magic == CacheBlock.INDEX_MAGIC:
                self.type = CacheBlock.INDEX
                header.seek(2, 1)  # skip the minor version
                self.version = struct.unpack('h', header.read(2))[0]
                self.entryCount = struct.unpack('I', header.read(4))[0]
                self.byteCount = struct.unpack('I', header.read(4))[0]
                self.lastFileCreated = "f_%06x" % \
                    struct.unpack('I', header.read(4))[0]
                header.seek(4*2, 1)  # skip this_id and stats
                self.tableSize = struct.unpack('I', header.read(4))[0]
            else:
                # The 'with' block closes the file even on this path.
                raise Exception("Invalid Chrome Cache File")

View file

@ -0,0 +1,130 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Stores the data fetched in the cache.
Parse the HTTP header if asked.
"""
from __future__ import absolute_import
import re
import shutil
import struct
import os
from . import cacheAddress
from six.moves import range
class CacheData():
    """
    Data pointed to by a cache address.

    Lazy object: the backing file is opened only when content is needed.
    If *isHTTPHeader* is true, the constructor tries to parse the payload
    as an HTTP header block (see net/http/http_util.cc,
    LocateStartOfStatusLine / LocateEndOfHeaders in Chromium).
    """
    # Values for self.type
    HTTP_HEADER = 0
    UNKNOWN = 1

    def __init__(self, address, size, isHTTPHeader=False):
        """
        address: CacheAddress locating the payload.
        size: payload size in bytes.
        isHTTPHeader: attempt HTTP header parsing when True (only for
                      block-file addresses, not separate files).
        """
        self.size = size
        self.address = address
        self.type = CacheData.UNKNOWN

        if isHTTPHeader and\
           self.address.blockType != cacheAddress.CacheAddress.SEPARATE_FILE:
            # Read the raw payload in one shot.  (The original read it one
            # byte at a time through struct.unpack('c'), which was O(n)
            # reads and raised struct.error on a truncated file.)
            with open(os.path.join(self.address.path,
                                   self.address.fileSelector), 'rb') as block:
                # Block files have an 8192 byte header before the slots.
                block.seek(8192 + self.address.blockNumber*self.address.entrySize)
                string = block.read(self.size)
            # Finding the beginning of the request
            start = re.search(b"HTTP", string)
            if start is None:
                return
            string = string[start.start():]
            # Finding the end (some null characters : verified by experience)
            end = re.search(b"\x00\x00", string)
            if end is None:
                return
            string = string[:end.end()-2]
            # Creating the dictionary of headers; keys are lower-cased
            # *bytes*, values are bytes.
            self.headers = {}
            for line in string.split(b'\0'):
                stripped = line.split(b':')
                self.headers[stripped[0].lower()] = \
                    b':'.join(stripped[1:]).strip()
            self.type = CacheData.HTTP_HEADER

    def save(self, filename=None):
        """Save the data to the specified filename."""
        if self.address.blockType == cacheAddress.CacheAddress.SEPARATE_FILE:
            shutil.copy(os.path.join(self.address.path,
                                     self.address.fileSelector),
                        filename)
        else:
            with open(os.path.join(self.address.path,
                                   self.address.fileSelector), 'rb') as block:
                block.seek(8192 + self.address.blockNumber*self.address.entrySize)
                payload = block.read(self.size)
            with open(filename, 'wb') as output:
                output.write(payload)

    def data(self):
        """Return the payload.

        NOTE(review): returns raw bytes for SEPARATE_FILE entries but a
        utf-8 decoded str for block-file entries.  Callers doing gzip or
        brotli decompression expect bytes -- confirm before relying on the
        block-file branch for binary payloads.
        """
        if self.address.blockType == cacheAddress.CacheAddress.SEPARATE_FILE:
            with open(os.path.join(self.address.path,
                                   self.address.fileSelector), 'rb') as infile:
                data = infile.read()
        else:
            with open(os.path.join(self.address.path,
                                   self.address.fileSelector), 'rb') as block:
                block.seek(8192 + self.address.blockNumber*self.address.entrySize)
                data = block.read(self.size).decode('utf-8')
        return data

    def __str__(self):
        """
        Display the type of cacheData
        """
        if self.type == CacheData.HTTP_HEADER:
            # Header keys are bytes (see __init__); the previous str key
            # 'content-type' could never match.
            if b'content-type' in self.headers:
                return "HTTP Header %s" % self.headers[b'content-type']
            else:
                return "HTTP Header"
        else:
            return "Data"

View file

@ -0,0 +1,140 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Chrome Cache Entry
See http://www.chromium.org/developers/design-documents/network-stack/disk-cache
for design details
"""
from __future__ import absolute_import
import datetime
import struct
import os
from . import cacheAddress
from . import cacheData
from six.moves import range
class CacheEntry():
    """
    One entry of the Chrome disk cache: a key (url) plus up to four data
    streams.  See /net/disk_cache/disk_format.h for details.
    """
    # Human readable labels indexed by the on-disk 'state' field.
    STATE = ["Normal",
             "Evicted (data were deleted)",
             "Doomed (shit happened)"]

    def __init__(self, address):
        """
        Parse a Chrome Cache Entry at the given address

        address: CacheAddress locating the entry inside a block file.
        """
        self.httpHeader = None
        block = open(os.path.join(address.path,address.fileSelector), 'rb')

        # Going to the right entry
        # (block files have an 8192 byte header followed by fixed-size slots)
        block.seek(8192 + address.blockNumber*address.entrySize)

        # Parsing basic fields
        self.hash = struct.unpack('I', block.read(4))[0]
        self.next = struct.unpack('I', block.read(4))[0]
        self.rankingNode = struct.unpack('I', block.read(4))[0]
        self.usageCounter = struct.unpack('I', block.read(4))[0]
        self.reuseCounter = struct.unpack('I', block.read(4))[0]
        self.state = struct.unpack('I', block.read(4))[0]
        # Chrome stores times as microseconds since the Windows epoch
        # (1601-01-01).
        self.creationTime = datetime.datetime(1601, 1, 1) + \
                            datetime.timedelta(microseconds=\
                                struct.unpack('Q', block.read(8))[0])
        self.keyLength = struct.unpack('I', block.read(4))[0]
        self.keyAddress = struct.unpack('I', block.read(4))[0]

        # Four data stream sizes, then four data stream addresses.
        dataSize = []
        for _ in range(4):
            dataSize.append(struct.unpack('I', block.read(4))[0])

        self.data = []
        for index in range(4):
            addr = struct.unpack('I', block.read(4))[0]
            try:
                addr = cacheAddress.CacheAddress(addr, address.path)
                self.data.append(cacheData.CacheData(addr, dataSize[index],
                                                     True))
            except cacheAddress.CacheAddressError:
                # Null/uninitialized stream address: stream not present.
                pass

        # Find the HTTP header if there is one
        for data in self.data:
            if data.type == cacheData.CacheData.HTTP_HEADER:
                self.httpHeader = data
                break

        self.flags = struct.unpack('I', block.read(4))[0]

        # Skipping pad
        block.seek(5*4, 1)

        # Reading local key
        if self.keyAddress == 0:
            self.key = block.read(self.keyLength).decode('ascii')
        # Key stored elsewhere
        else:
            addr = cacheAddress.CacheAddress(self.keyAddress, address.path)
            # It is probably an HTTP header
            self.key = cacheData.CacheData(addr, self.keyLength, True)
        block.close()

    def keyToStr(self):
        """
        Since the key can be a string or a CacheData object, this function is an
        utility to display the content of the key whatever type is it.
        """
        if self.keyAddress == 0:
            return self.key
        else:
            return self.key.data()

    def __str__(self):
        """Multi-line human readable dump of the entry."""
        string = "Hash: 0x%08x" % self.hash + '\n'
        if self.next != 0:
            string += "Next: 0x%08x" % self.next + '\n'
        # NOTE: the '\n' "Reuse..." pairs below are adjacent string
        # literals: they concatenate at parse time, before % is applied.
        string += "Usage Counter: %d" % self.usageCounter + '\n'\
                  "Reuse Counter: %d" % self.reuseCounter + '\n'\
                  "Creation Time: %s" % self.creationTime + '\n'
        if self.keyAddress != 0:
            string += "Key Address: 0x%08x" % self.keyAddress + '\n'
        string += "Key: %s" % self.key + '\n'
        if self.flags != 0:
            string += "Flags: 0x%08x" % self.flags + '\n'
        string += "State: %s" % CacheEntry.STATE[self.state]
        for data in self.data:
            string += "\nData (%d bytes) at 0x%08x : %s" % (data.size,
                                                            data.address.addr,
                                                            data)
        return string

View file

@ -0,0 +1,293 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Parse the Chrome Cache File
See http://www.chromium.org/developers/design-documents/network-stack/disk-cache
for design details
"""
from __future__ import absolute_import
from __future__ import print_function
import gzip
import os
import struct
import sys
import re
import brotli
from . import csvOutput
from . import SuperFastHash
from .cacheAddress import CacheAddress
from .cacheBlock import CacheBlock
from .cacheData import CacheData
from .cacheEntry import CacheEntry
from six.moves import range
def parse(path, urls=None):
    """
    Read the whole cache, or only the given urls, and return the entries.

    path: directory of the Chrome cache (must contain the "index" file).
    urls: optional iterable of url strings to look up; when None the whole
          hash table is walked.

    Returns a list of CacheEntry objects.
    Raises Exception when the index file is not a valid Chrome index.
    """
    path = os.path.abspath(path)
    cacheBlock = CacheBlock(os.path.join(path, "index"))

    # Checking type
    if cacheBlock.type != CacheBlock.INDEX:
        raise Exception("Invalid Index File")

    cache = []
    # 'with' guarantees the index file is closed even if entry parsing
    # raises part way through (the original leaked the handle then).
    with open(os.path.join(path, "index"), 'rb') as index:
        # Skipping the header: the hash table starts at offset 92*4.
        index.seek(92*4)

        if urls is None:
            # No url specified: parse the whole cache.
            for key in range(cacheBlock.tableSize):
                raw = struct.unpack('I', index.read(4))[0]
                if raw != 0:
                    entry = CacheEntry(CacheAddress(raw, path=path))
                    # Checking if there is a next item in the bucket because
                    # such entries are not stored in the Index File so they will
                    # be ignored during iterative lookup in the hash table
                    while entry.next != 0:
                        cache.append(entry)
                        entry = CacheEntry(CacheAddress(entry.next, path=path))
                    cache.append(entry)
        else:
            # Find the entry for each url
            for url in urls:
                url = bytes(url, 'utf8')
                # Compute the key and seek to its bucket.
                hash = SuperFastHash.superFastHash(url)
                key = hash & (cacheBlock.tableSize - 1)
                index.seek(92*4 + key*4)

                addr = struct.unpack('I', index.read(4))[0]
                # Checking if the address is initialized (i.e. used)
                if addr & 0x80000000 == 0:
                    print("%s is not in the cache" % url, file=sys.stderr)
                else:
                    # Follow the chained list in the bucket
                    entry = CacheEntry(CacheAddress(addr, path=path))
                    while entry.hash != hash and entry.next != 0:
                        entry = CacheEntry(CacheAddress(entry.next, path=path))
                    if entry.hash == hash:
                        cache.append(entry)
    return cache
class ChromeCache(object):
    """
    Lookup table over a parsed Chrome disk cache, keyed by url.
    """
    def __init__(self, path):
        """
        path: the Chrome cache directory (containing the 'index' file).
        """
        self.cache = parse(path)
        self.hash_cache = {}
        for entry in self.cache:
            key = entry.keyToStr()
            self.hash_cache[key] = entry
            # Also index ffnet story urls normalized down to
            # .../s/<storyid>/<chapter>/ so lookups without the title slug
            # hit too.  (Dots are now escaped; the original pattern used
            # bare '.' which matched any character.)
            normkey = re.sub(r'^(https://www\.fanfiction\.net/s/\d+/\d+/).+$',
                             r'\1', key)
            ## either overwrites (no harm), or adds new.
            self.hash_cache[normkey] = entry

    def get_cached_file(self, url):
        """
        Return the (decompressed) cached body for *url*, or None when the
        url is not cached or its entry has no payload stream.
        """
        if url not in self.hash_cache:
            return None
        entry = self.hash_cache[url]
        for stream in entry.data:
            # UNKNOWN marks the payload stream; HTTP_HEADER streams are
            # response metadata.
            if stream.type != CacheData.UNKNOWN:
                continue
            data = stream.data()
            if entry.httpHeader is not None and \
               b'content-encoding' in entry.httpHeader.headers:
                encoding = entry.httpHeader.headers[b'content-encoding']
                if encoding == b"gzip":
                    data = gzip.decompress(data)
                elif encoding == b"br":
                    data = brotli.decompress(data)
            return data
        return None
def exportToHTML(cache, outpath):
    """
    Export the cache in html.

    Writes an index.html linking one page per entry into *outpath*, and
    saves (decompressing where possible) each entry's payload alongside.
    """
    # Checking that the directory exists and is writable
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    outpath = os.path.abspath(outpath)

    index = open(os.path.join(outpath,"index.html"), 'w')
    index.write("<UL>")

    for entry in cache:
        # Adding a link in the index
        if entry.keyLength > 100:
            entry_name = entry.keyToStr()[:100] + "..."
        else:
            entry_name = entry.keyToStr()
        index.write('<LI><a href="%08x.html">%s</a></LI>'%(entry.hash, entry_name))

        # We handle the special case where entry_name ends with a slash
        page_basename = entry_name.split('/')[-2] if entry_name.endswith('/') else entry_name.split('/')[-1]

        # Creating the entry page
        page = open(os.path.join(outpath,"%08x.html"%entry.hash), 'w')
        page.write("""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
</head>
<body>""")

        # Details of the entry
        page.write("<b>Hash</b>: 0x%08x<br />"%entry.hash)
        page.write("<b>Usage Counter</b>: %d<br />"%entry.usageCounter)
        page.write("<b>Reuse Counter</b>: %d<br />"%entry.reuseCounter)
        page.write("<b>Creation Time</b>: %s<br />"%entry.creationTime)
        page.write("<b>Key</b>: %s<br>"%entry.keyToStr())
        page.write("<b>State</b>: %s<br>"%CacheEntry.STATE[entry.state])
        page.write("<hr>")

        ## entry.data normally 2 or 1
        ## 2 for headers and data, 1 for headers only.
        if len(entry.data) == 0:
            page.write("No data associated with this entry :-(")
        for i in range(len(entry.data)):
            if entry.data[i].type == CacheData.UNKNOWN:
                # Extracting data into a file
                name = hex(entry.hash) + "_" + str(i)
                entry.data[i].save(os.path.join(outpath,name))

                if entry.httpHeader is not None and \
                   b'content-encoding' in entry.httpHeader.headers:
                    if entry.httpHeader.headers[b'content-encoding'] == b"gzip":
                        # XXX Highly inefficient !!!!!
                        try:
                            # 'infile' rather than 'input' -- avoid
                            # shadowing the builtin.
                            infile = gzip.open(os.path.join(outpath, name), 'rb')
                            output = open(os.path.join(outpath, name + "u"), 'wb')
                            output.write(infile.read())
                            infile.close()
                            output.close()
                            page.write('<a href="%su">%s</a>'%(name, page_basename))
                        except IOError:
                            page.write("Something wrong happened while unzipping")
                    elif entry.httpHeader.headers[b'content-encoding'] == b"br":
                        try:
                            with open(os.path.join(outpath,name), 'rb') as infile:
                                with open(os.path.join(outpath,name + "u"), 'wb') as output:
                                    output.write(brotli.decompress(infile.read()))
                            page.write('<a href="%su">%s</a>'%(name, page_basename))
                        except IOError:
                            page.write("Something wrong happened while unzipping")
                        # (a stray no-op 'brotli' expression statement that
                        # sat here has been removed)
                else:
                    page.write('<a href="%s">%s</a>'%(name ,
                               entry.keyToStr().split('/')[-1]))

                # If it is a picture, display it
                if entry.httpHeader is not None:
                    if b'content-type' in entry.httpHeader.headers and\
                       b"image" in entry.httpHeader.headers[b'content-type']:
                        page.write('<br /><img src="%s">'%(name))
            # HTTP Header
            else:
                page.write("<u>HTTP Header</u><br />")
                for key, value in entry.data[i].headers.items():
                    page.write("<b>%s</b>: %s<br />"%(key, value))
            page.write("<hr>")
        page.write("</body></html>")
        page.close()

    index.write("</UL>")
    index.close()
def exportTol2t(cache):
    """
    Export the cache in CSV log2timeline compliant format.

    cache: iterable of cache entries; each entry is expected to provide
           creationTime (datetime), keyToStr(), hash, usageCounter and
           httpHeader (possibly None) attributes.
    Writes the CSV to stdout via csvOutput.csvOutput; returns None.
    """
    output = []
    # log2timeline CSV header row
    output.append(["date",
                   "time",
                   "timezone",
                   "MACB",
                   "source",
                   "sourcetype",
                   "type",
                   "user",
                   "host",
                   "short",
                   "desc",
                   "version",
                   "filename",
                   "inode",
                   "notes",
                   "format",
                   "extra"])
    for entry in cache:
        date = entry.creationTime.date().strftime("%m/%d/%Y")
        time = entry.creationTime.time()
        # TODO get timezone
        timezone = 0
        short = entry.keyToStr()
        descr = "Hash: 0x%08x" % entry.hash
        descr += " Usage Counter: %d" % entry.usageCounter
        if entry.httpHeader is not None:
            # Header keys are bytes elsewhere in this module (e.g.
            # b'content-encoding'), so a str key would never match here.
            if b'content-type' in entry.httpHeader.headers:
                mime = entry.httpHeader.headers[b'content-type']
                if isinstance(mime, bytes):
                    # avoid "b'text/html'" style output on Python 3
                    mime = mime.decode('ascii', 'replace')
                descr += " MIME: %s" % mime
        output.append([date,
                       time,
                       timezone,
                       "MACB",
                       "WEBCACHE",
                       "Chrome Cache",
                       "Cache Entry",
                       "-",
                       "-",
                       short,
                       descr,
                       "2",
                       "-",
                       "-",
                       "-",
                       "-",
                       "-",
                       ])
    csvOutput.csvOutput(output)

View file

@ -0,0 +1,45 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Classical Output Module
"""
from __future__ import absolute_import
import sys
def classicalOutput(queryResult, separator="\t"):
    """
    Display the data separated by the specified separator.

    queryResult: iterable of rows; each row is an iterable of cells.
                 Cells may be non-string values (callers in this project
                 pass ints and datetimes), so each cell is coerced with
                 str() before writing -- sys.stdout.write() would raise
                 TypeError on non-string input otherwise.
    separator: string written after every cell (including the last one
               of each row, matching the historical output format).
    """
    for line in queryResult:
        for element in line:
            sys.stdout.write(str(element))
            sys.stdout.write(separator)
        sys.stdout.write('\n')

View file

@ -0,0 +1,49 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Column Output Module
"""
from __future__ import print_function
from six.moves import range
def columnOutput(queryResult, separator=' '):
    """
    Display the data in aligned, left-justified columns.

    queryResult: list of rows; every row must have as many cells as the
                 first row. Cells are rendered with str().
    separator: string placed between columns (not after the last one).
    """
    if not queryResult:
        return
    # Width of each column: the widest rendered cell in that column.
    widths = [max(len(str(row[col])) for row in queryResult)
              for col in range(len(queryResult[0]))]
    # One "%-<width>s" specifier per column, joined by the separator.
    fmt = separator.join("%%-%ds" % width for width in widths)
    for row in queryResult:
        print(fmt % tuple(row))

View file

@ -0,0 +1,44 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
CSV Output Module
"""
from __future__ import absolute_import
import csv
import sys
def csvOutput(queryResult, separator=',', quote='"'):
    """
    Write the rows of queryResult to stdout in CSV format.

    separator: field delimiter; quote: quoting character. Fields are
    quoted only when needed (csv.QUOTE_MINIMAL).
    """
    writer = csv.writer(sys.stdout,
                        delimiter=separator,
                        quotechar=quote,
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerows(queryResult)

View file

@ -0,0 +1,106 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Parse the Chrome Download Table History File
Its a SQLite3 table
"""
from __future__ import absolute_import
from __future__ import print_function
import datetime
import sqlite3
import sys
import six
def parse(filename, urlLength):
    """
    Parse the Chrome downloads table and return a list of DownloadEntry.

    filename: path to the history file (SQLite database)
    urlLength: maximum url length to display (0 or less: no truncation)
    """
    # Connecting to the DB
    try:
        history = sqlite3.connect(filename)
    except sqlite3.Error as error:
        print("==> Error while opening the history file !")
        # Python 3 exceptions have no ``.message`` attribute; printing
        # the exception itself works on both Python 2 and 3.
        print("==> Details :", error)
        sys.exit("==> Exiting...")
    # Retrieving all useful data
    result = history.execute("SELECT id, \
                              full_path, \
                              url, \
                              start_time, \
                              received_bytes, \
                              total_bytes, \
                              state \
                              FROM downloads;")
    return [DownloadEntry(line, urlLength) for line in result]
class DownloadEntry(object):
    """Object to store download entries"""

    # Short column codes -> attribute names (used by columnToStr).
    COLUMN_STR = {
        'st': "startTime",
        'p': "path",
        'u': "url",
        'rb': "receivedBytes",
        'tb': "totalBytes",
        'pt': "percentReceived",
        's': "state",
    }

    # Download states as stored in the ``state`` column.
    STATE_STR = [
        "In Progress",
        "Complete",
        "Cancelled",
        "Removing",
        "Interrupted",
    ]

    def __init__(self, item, urlLength):
        """Parse one raw row from the ``downloads`` table."""
        self.path = item[1]
        raw_url = item[2]
        # Truncate overly long urls with an ellipsis when requested.
        if urlLength > 0 and len(raw_url) > urlLength:
            self.url = raw_url[0:urlLength - 3] + "..."
        else:
            self.url = raw_url
        # Chrome stores timestamps as microseconds since 1601-01-01.
        self.startTime = (datetime.datetime(1601, 1, 1)
                          + datetime.timedelta(microseconds=item[3]))
        self.receivedBytes = item[4]
        self.totalBytes = item[5]
        self.state = DownloadEntry.STATE_STR[item[6]]
        if int(item[5]) == 0:
            self.percentReceived = "0%"
        else:
            self.percentReceived = "%d%%" % int(
                float(item[4]) / float(item[5]) * 100)

    def columnToStr(self, column):
        """Returns column content specified by argument"""
        return six.text_type(getattr(self, DownloadEntry.COLUMN_STR[column]))

View file

@ -0,0 +1,178 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Parse the Chrome History File
Its a SQLite3 file
"""
from __future__ import absolute_import
from __future__ import print_function
import datetime
import re
import sqlite3
import sys
from . import cacheParse
import six
def parse(filename, start, end, checkCache, cachePath, urlLength):
    """
    Parse the Chrome history database and return a list of HistoryEntry.

    filename: path to the history file (SQLite database)
    start: beginning of the time window (datetime)
    end: end of the time window (datetime)
    checkCache: check if each page in the history is in the cache
    cachePath: path to cache directory
    urlLength: maximum url length to display (0 or less: no truncation)
    """
    # Connecting to the DB
    try:
        history = sqlite3.connect(filename)
    except sqlite3.Error as error:
        print("==> Error while opening the history file !")
        # Python 3 exceptions have no ``.message`` attribute; printing
        # the exception itself works on both Python 2 and 3.
        print("==> Details :", error)
        sys.exit("==> Exiting...")

    # Chrome stores timestamps as microseconds since 1601-01-01.
    reference = datetime.datetime(1601, 1, 1)

    # Retrieving all useful data inside the requested time window.
    # The bounds are passed as SQL parameters rather than interpolated
    # into the statement text.
    result = history.execute("SELECT visits.visit_time, \
                              visits.from_visit, \
                              visits.transition, \
                              urls.url, \
                              urls.title, \
                              urls.visit_count, \
                              urls.typed_count, \
                              urls.last_visit_time \
                              FROM urls,visits \
                              WHERE urls.id=visits.url \
                              AND visits.visit_time>? \
                              AND visits.visit_time<? \
                              ORDER BY visits.visit_time;",
                             (int((start - reference).total_seconds() * 1000000),
                              int((end - reference).total_seconds() * 1000000)))
    # Parsing cache
    cache = None
    if checkCache:
        cache = cacheParse.parse(cachePath)
    return [HistoryEntry(line, cache, urlLength) for line in result]
class Transition():
    """Object representing transition between history pages"""

    # Core transition types, indexed by the low byte of the transition
    # value (content/common/page_transition_types.h).
    CORE_STRING = ["Link",
                   "Typed",
                   "Auto Bookmark",
                   "Auto Subframe",
                   "Manual Subframe",
                   "Generated",
                   "Start Page",
                   "Form Submit",
                   "Reload",
                   "Keyword",
                   "Keyword Generated"]

    # Qualifier flags stored in the high bytes of the transition value.
    QUALIFIER_STRING = [(0x01000000, "Forward or Back Button"),
                        (0x02000000, "Address Bar"),
                        (0x04000000, "Home Page"),
                        (0x10000000, "Beginning of Chain"),
                        (0x20000000, "End of Chain"),
                        (0x40000000, "Client Redirection"),
                        (0x80000000, "Server Redirection")]

    def __init__(self, transition):
        """
        Parsing the transition according to
        content/common/page_transition_types.h
        """
        self.core = transition & 0xFF             # low byte: core type
        self.qualifier = transition & 0xFFFFFF00  # high bytes: qualifier flags

    def __str__(self):
        string = Transition.CORE_STRING[self.core]
        for mask, description in Transition.QUALIFIER_STRING:
            if self.qualifier & mask != 0:
                string += ", %s" % description
        return string
class HistoryEntry(object):
    """Object to store database entries"""

    # Short column codes -> attribute names (used by columnToStr).
    COLUMN_STR = {'vt': "visitTime",
                  'fv': "fromVisit",
                  'tr': "transition",
                  'u': "url",
                  'tl': "title",
                  'vc': "visitCount",
                  'tc': "typedCount",
                  'lv': "lastVisitTime",
                  'cc': "inCache"}

    def __init__(self, item, cache, urlLength):
        """Parse one raw row of the urls/visits join query."""
        # Chrome stores timestamps as microseconds since 1601-01-01.
        self.visitTime = datetime.datetime(1601, 1, 1) + \
                         datetime.timedelta(microseconds=item[0])
        self.fromVisit = item[1]
        self.transition = Transition(item[2])
        if len(item[3]) > urlLength and urlLength > 0:
            self.url = item[3][0:urlLength - 3] + "..."
        else:
            self.url = item[3]
        self.title = item[4]
        self.visitCount = item[5]
        self.typedCount = item[6]
        self.lastVisitTime = datetime.datetime(1601, 1, 1) + \
                             datetime.timedelta(microseconds=item[7])
        # Searching in the cache if there is a copy of the page.
        # TODO use a hash table to search instead of heavy exhaustive search
        # NOTE(review): the lookup compares against the possibly-truncated
        # url, so truncated urls may never match a cache key -- confirm.
        self.inCache = False
        if cache is not None:
            # Loop variable renamed: reusing ``item`` here would shadow
            # the constructor argument of the same name.
            for cached in cache:
                if cached.keyToStr() == self.url:
                    self.inCache = True
                    break

    def toStr(self):
        """Return the entry's columns (minus inCache) as text values."""
        return [six.text_type(self.visitTime),
                six.text_type(self.fromVisit),
                six.text_type(self.transition),
                six.text_type(self.url),
                six.text_type(self.title),
                six.text_type(self.visitCount),
                six.text_type(self.typedCount),
                six.text_type(self.lastVisitTime)]

    def columnToStr(self, column):
        """Returns column content specified by argument"""
        return six.text_type(self.__getattribute__(HistoryEntry.COLUMN_STR[column]))

View file

@ -0,0 +1,42 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
JSON Output Module
"""
from __future__ import absolute_import
from __future__ import print_function
import json
def jsonOutput(queryResult, separator=''):
    """
    Display the data serialized as JSON on stdout.

    separator is accepted only to match the other output modules'
    signatures; it is not used for JSON.
    """
    print(json.dumps(queryResult))

View file

@ -0,0 +1,97 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Parse the Chrome Visited Links
Reverse engineered from
chrome/common/visitedlink_common.*
chrome/browser/visitedlink/visitedlink_*
"""
from __future__ import absolute_import
import md5
import struct
import sys
from six.moves import range
# First four bytes of a valid "Visited Links" file.
VISITED_LINKS_MAGIC = 0x6b6e4c56

def isVisited(path, urls):
    """
    Return the list of urls given in parameter with a boolean information
    about its presence in the given visited links file.

    path: path to a Chrome "Visited Links" file
    urls: iterable of urls (str or bytes) to look up
    Returns a list of (url, bool) tuples; raises Exception when the file
    magic number does not match.
    """
    # Local import: ``hashlib`` replaces the Python-2-only ``md5`` module.
    import hashlib

    output = []
    with open(path, 'rb') as f:
        # Checking file type
        magic = struct.unpack('I', f.read(4))[0]
        if magic != VISITED_LINKS_MAGIC:
            raise Exception("Invalid file")
        # Reading header values (the reads also advance the file offset
        # to the salt, even though version/usedItems are unused here).
        version = struct.unpack('I', f.read(4))[0]
        length = struct.unpack('I', f.read(4))[0]
        usedItems = struct.unpack('I', f.read(4))[0]
        # Reading salt: 8 raw bytes (kept as bytes for hashing).
        salt = f.read(8)
        for url in urls:
            fingerprint_hash = hashlib.md5()
            fingerprint_hash.update(salt)
            fingerprint_hash.update(
                url if isinstance(url, bytes) else url.encode('utf-8'))
            digest = fingerprint_hash.hexdigest()
            # Inverting the result
            # Why Chrome MD5 computation gives a reverse digest ?
            fingerprint = 0
            for i in range(0, 16, 2):
                # ``//`` keeps the shift amount an int under Python 3.
                fingerprint += int(digest[i:i + 2], 16) << (i // 2) * 8
            key = fingerprint % length
            # The hash table uses open addressing
            f.seek(key * 8 + 24, 0)
            while True:
                finger = struct.unpack('q', f.read(8))[0]
                if finger == 0:
                    output.append((url, False))
                    break
                if finger == fingerprint:
                    output.append((url, True))
                    break
                if f.tell() >= length * 8 + 24:
                    f.seek(24)
                if f.tell() == key * 8 + 24:
                    output.append((url, False))
                    break
    return output