mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-08 12:36:11 +02:00
CLI Only working with ffnet *only* reading from Chrome browser cache.
This commit is contained in:
parent
10a7cf8aa7
commit
95297b58e0
17 changed files with 1526 additions and 8 deletions
|
|
@ -25,6 +25,7 @@ import re
|
|||
from ..six import text_type as unicode
|
||||
from ..six.moves.urllib.error import HTTPError
|
||||
|
||||
from ..chromagnon.cacheParse import ChromeCache
|
||||
|
||||
from .. import exceptions as exceptions
|
||||
from ..htmlcleanup import stripHTML
|
||||
|
|
@ -60,6 +61,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
## accept m(mobile)url, but use www.
|
||||
self.origurl = self.origurl.replace("https://m.","https://www.")
|
||||
|
||||
self.chromagnon_cache = None
|
||||
@staticmethod
def getSiteDomain():
    # Canonical host for this adapter; __init__ rewrites mobile
    # "https://m." story URLs to this "https://www." host before use.
    return 'www.fanfiction.net'
|
||||
|
|
@ -75,14 +77,50 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
def getSiteURLPattern(self):
    """Return the regex of accepted story-URL forms.

    Accepts http(s), an optional "www." or "m." subdomain, and
    /s/<storyid>[/<chapter>][/<title-slug>][/].

    Fix: the original pattern r"(www|m)?\." made the subdomain optional
    but left the dot mandatory, so "https://.fanfiction.net/s/1" matched
    while "https://fanfiction.net/s/1" did not.  Grouping the dot with
    the subdomain fixes both, and remains backward compatible with every
    www./m. URL the old pattern accepted.
    """
    return r"https?://((www|m)\.)?fanfiction\.net/s/\d+(/\d+)?(/|/[^/]+)?/?$"
|
||||
|
||||
def _postUrl(self, url, parameters={}, headers={}, extrasleep=None, usecache=True):
    """POST requests cannot be served from the Chrome browser cache.

    Present only to satisfy the BaseSiteAdapter fetch interface;
    always raises NotImplementedError.
    """
    logger.debug("_postUrl")
    raise NotImplementedError
|
||||
|
||||
def _fetchUrlRawOpened(self, url, parameters=None, extrasleep=None, usecache=True, referer=None):
    """Raw fetch returning an opened response is not supported here.

    Present only to satisfy the BaseSiteAdapter fetch interface;
    always raises NotImplementedError.
    """
    logger.debug("_fetchUrlRawOpened")
    raise NotImplementedError
|
||||
|
||||
def _fetchUrlOpened(self, url, parameters=None, usecache=True, extrasleep=None, referer=None):
    """Fetch returning an opened response is not supported here.

    Present only to satisfy the BaseSiteAdapter fetch interface;
    always raises NotImplementedError.
    """
    logger.debug("_fetchUrlOpened")
    raise NotImplementedError
|
||||
|
||||
def _fetchUrlRaw(self, url, parameters=None, extrasleep=None, usecache=True, referer=None):
    """Raw (undecoded) fetch; this is the variant normally used for images.

    Not supported when reading only from the Chrome browser cache;
    always raises NotImplementedError.
    """
    logger.debug("_fetchUrlRaw")
    raise NotImplementedError
|
||||
|
||||
def _fetchUrl(self,url,parameters=None,extrasleep=1.0,usecache=True):
    """Fetch *url* from the local Chrome browser cache instead of the network.

    parameters/extrasleep/usecache are accepted for interface
    compatibility with BaseSiteAdapter._fetchUrl but are ignored here —
    nothing hits the network, so no throttling or cache layering applies.

    Returns the cached body decoded via the configuration's decoder.
    Raises HTTPError (code 404) when the URL is not in the cache.

    Fixes vs. the original:
    - The old network `return BaseSiteAdapter._fetchUrl(...)` made the
      entire cache-lookup path below it unreachable dead code; per this
      change's intent ("*only* reading from Chrome browser cache") the
      network call is removed.
    - `len(data)` was logged before the `data is None` check, so a cache
      miss raised TypeError instead of HTTPError.
    - HTTPError's constructor is (url, code, msg, hdrs, fp); the old
      HTTPError(404, "...") put 404 in the url slot, so callers'
      `e.code == 404` checks never matched.
    """
    if self.chromagnon_cache is None:
        # Lazily build the cache index once; construction walks the
        # configured chrome_cache_path.
        logger.debug("Start making self.chromagnon_cache")
        self.chromagnon_cache = ChromeCache(self.getConfig("chrome_cache_path"))
        logger.debug("Done making self.chromagnon_cache")
    data = self.chromagnon_cache.get_cached_file(url)
    if data is None:
        raise HTTPError(url, 404, "Not found in Chrome Cache", None, None)
    logger.debug("%s:len(%s)"%(url,len(data)))
    return self.configuration._decode(data)
|
||||
|
||||
def use_pagecache(self):
|
||||
'''
|
||||
|
|
@ -103,8 +141,9 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
#logger.debug("\n===================\n%s\n===================\n"%data)
|
||||
# logger.debug("\n===================\n%s\n===================\n"%data)
|
||||
soup = self.make_soup(data)
|
||||
# logger.debug("\n===================\n%s\n===================\n"%soup)
|
||||
except HTTPError as e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(url)
|
||||
|
|
|
|||
24
fanficfare/chromagnon/LICENSE
Normal file
24
fanficfare/chromagnon/LICENSE
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Chromagon Project nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
89
fanficfare/chromagnon/SuperFastHash.py
Normal file
89
fanficfare/chromagnon/SuperFastHash.py
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
Python implementation of SuperFastHash algorithm
|
||||
Maybe it is better to use c_uint32 to limit the size of variables to 32bits
|
||||
instead of using 0xFFFFFFFF mask.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import print_function
|
||||
import binascii
|
||||
import sys
|
||||
|
||||
def get16bits(data):
    """Return the first two bytes of *data* as a little-endian 16-bit int."""
    # data[1::-1] is data[0:2] reversed, so the low-order byte comes first.
    return int(binascii.hexlify(data[1::-1]), 16)


def superFastHash(data):
    """Compute Paul Hsieh's SuperFastHash of a byte string.

    *data* must be bytes (a py2 str also works).  Returns an unsigned
    32-bit integer.  Python ints are unbounded, so the running value is
    masked with 0xFFFFFFFF wherever C code would overflow.

    Bug fixed for Python 3: indexing a bytes object (data[2], data[0])
    yields an int, which binascii.hexlify() rejects with TypeError; the
    one-byte slices data[2:3] / data[0:1] keep the bytes type and behave
    identically on Python 2.
    """
    hash = length = len(data)
    if length == 0:
        return 0

    rem = length & 3     # 0-3 leftover bytes after the 4-byte groups
    length >>= 2         # number of whole 4-byte groups

    # Main loop: mix in 4 bytes per round as two 16-bit halves.
    while length > 0:
        hash += get16bits(data) & 0xFFFFFFFF
        tmp = (get16bits(data[2:]) << 11) ^ hash
        hash = ((hash << 16) & 0xFFFFFFFF) ^ tmp
        data = data[4:]
        hash += hash >> 11
        hash = hash & 0xFFFFFFFF
        length -= 1

    # Mix in the 1-3 remaining bytes.
    if rem == 3:
        hash += get16bits(data)
        hash ^= (hash << 16) & 0xFFFFFFFF
        hash ^= (int(binascii.hexlify(data[2:3]), 16) << 18) & 0xFFFFFFFF
        hash += hash >> 11
    elif rem == 2:
        hash += get16bits(data)
        hash ^= (hash << 11) & 0xFFFFFFFF
        hash += hash >> 17
    elif rem == 1:
        hash += int(binascii.hexlify(data[0:1]), 16)
        hash ^= (hash << 10) & 0xFFFFFFFF
        hash += hash >> 1

    # Final avalanche of the last 127 bits.
    hash = hash & 0xFFFFFFFF
    hash ^= (hash << 3) & 0xFFFFFFFF
    hash += hash >> 5
    hash = hash & 0xFFFFFFFF
    hash ^= (hash << 4) & 0xFFFFFFFF
    hash += hash >> 17
    hash = hash & 0xFFFFFFFF
    hash ^= (hash << 25) & 0xFFFFFFFF
    hash += hash >> 6
    hash = hash & 0xFFFFFFFF

    return hash
|
||||
|
||||
if __name__ == "__main__":
    # CLI convenience: hash the first argument and print it as 8 hex digits.
    # sys.argv[1] is str; the hash needs bytes on Python 3, so encode it.
    # (On Python 2, str is already bytes; encoding ASCII input is a no-op.)
    print("%08x"%superFastHash(sys.argv[1].encode("utf-8")))
|
||||
0
fanficfare/chromagnon/__init__.py
Normal file
0
fanficfare/chromagnon/__init__.py
Normal file
92
fanficfare/chromagnon/cacheAddress.py
Normal file
92
fanficfare/chromagnon/cacheAddress.py
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
Chrome Cache Address
|
||||
See /net/disk_cache/addr.h for design details
|
||||
"""
|
||||
|
||||
class CacheAddressError(Exception):
    """Raised when a 32-bit cache address is null or not initialized."""

    def __init__(self, value):
        # Keep the offending value/message for display via __str__.
        self.value = value

    def __str__(self):
        return repr(self.value)
|
||||
|
||||
class CacheAddress():
    """
    Object representing a Chrome Cache Address.

    The address is a 32-bit word: bit 31 flags "initialized", bits 30-28
    hold the block type, and the remaining bits select the file and block.
    """
    SEPARATE_FILE = 0
    RANKING_BLOCK = 1
    BLOCK_256 = 2
    BLOCK_1024 = 3
    BLOCK_4096 = 4

    # (human-readable name, entry size in bytes) indexed by block type.
    typeArray = [("Separate file", 0),
                 ("Ranking block file", 36),
                 ("256 bytes block file", 256),
                 ("1k bytes block file", 1024),
                 ("4k bytes block file", 4096)]

    def __init__(self, uint_32, path):
        """
        Decode the 32-bit address *uint_32*; *path* is the cache directory.
        """
        if uint_32 == 0:
            raise CacheAddressError("Null Address")

        self.addr = uint_32
        self.path = path

        #XXX Is self.binary useful ?? -- kept for compatibility; the fields
        # below are decoded with shifts/masks instead of slicing this string.
        self.binary = bin(uint_32)

        # bin() of an initialized (MSB-set) 32-bit value is '0b' + 32 digits.
        if len(self.binary) != 34:
            raise CacheAddressError("Uninitialized Address")

        # Bits 30-28: file type.
        self.blockType = (uint_32 >> 28) & 0x7

        if self.blockType == CacheAddress.SEPARATE_FILE:
            # Bits 27-0: external ("f_######") file number.
            self.fileSelector = "f_%06x" % (uint_32 & 0x0FFFFFFF)
        elif self.blockType == CacheAddress.RANKING_BLOCK:
            # Bits 23-16: block-file number.
            self.fileSelector = "data_" + str((uint_32 >> 16) & 0xFF)
        else:
            self.entrySize = CacheAddress.typeArray[self.blockType][1]
            self.contiguousBlock = (uint_32 >> 24) & 0x3       # bits 25-24
            self.fileSelector = "data_" + str((uint_32 >> 16) & 0xFF)
            self.blockNumber = uint_32 & 0xFFFF                # bits 15-0

    def __str__(self):
        desc = hex(self.addr) + " ("
        if self.blockType >= CacheAddress.BLOCK_256:
            desc += str(self.contiguousBlock) + " contiguous blocks in "
        desc += CacheAddress.typeArray[self.blockType][0] + \
                " : " + self.fileSelector + ")"
        return desc
|
||||
64
fanficfare/chromagnon/cacheAddressTest.py
Normal file
64
fanficfare/chromagnon/cacheAddressTest.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from __future__ import absolute_import
|
||||
import unittest
|
||||
|
||||
from . import cacheAddress
|
||||
|
||||
class CacheAddressTest(unittest.TestCase):
    """Unit tests for cacheAddress.CacheAddress.

    Fixed to match the current CacheAddress API: the constructor now
    requires a cache-directory *path* argument (the old one-argument
    calls raised TypeError), and separate-file selectors are formatted
    with "f_%06x" (six lowercase hex digits), so the old expected value
    "f_0002A" could never match.
    """

    def testFileType(self):
        """Parse Block Type From Address"""
        address = cacheAddress.CacheAddress(0x8000002A, "")
        self.assertEqual(address.blockType,
                         cacheAddress.CacheAddress.SEPARATE_FILE)
        address = cacheAddress.CacheAddress(0x9DFF0000, "")
        self.assertEqual(address.blockType,
                         cacheAddress.CacheAddress.RANKING_BLOCK)
        address = cacheAddress.CacheAddress(0xA0010003, "")
        self.assertEqual(address.blockType,
                         cacheAddress.CacheAddress.BLOCK_256)
        address = cacheAddress.CacheAddress(0xBDFF0108, "")
        self.assertEqual(address.blockType,
                         cacheAddress.CacheAddress.BLOCK_1024)
        address = cacheAddress.CacheAddress(0xCDFF0108, "")
        self.assertEqual(address.blockType,
                         cacheAddress.CacheAddress.BLOCK_4096)

    def testFilename(self):
        """Parse Filename from Address"""
        address = cacheAddress.CacheAddress(0x8000002A, "")
        self.assertEqual(address.fileSelector,
                         "f_00002a")
        address = cacheAddress.CacheAddress(0xA001135C, "")
        self.assertEqual(address.fileSelector,
                         "data_1")
||||
|
||||
# Allow running this test module directly: python cacheAddressTest.py
if __name__ == "__main__":
    unittest.main()
|
||||
86
fanficfare/chromagnon/cacheBlock.py
Normal file
86
fanficfare/chromagnon/cacheBlock.py
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
Parse the header of a Chrome Cache File
|
||||
See http://www.chromium.org/developers/design-documents/network-stack/disk-cache
|
||||
for design details
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
import struct
|
||||
from six.moves import range
|
||||
|
||||
class CacheBlock():
    """
    Object representing a block of the cache. It can be the index file or any
    other block type : 256B, 1024B, 4096B, Ranking Block.
    See /net/disk_cache/disk_format.h for details.
    """

    INDEX_MAGIC = 0xC103CAC3
    BLOCK_MAGIC = 0xC104CAC3
    INDEX = 0
    BLOCK = 1

    def __init__(self, filename):
        """
        Parse the header of the cache file *filename*.

        Sets self.type to INDEX or BLOCK and fills the corresponding
        header fields; raises Exception for an unrecognized magic number.
        """
        with open(filename, 'rb') as header:
            # Small local readers keep the field parsing below terse.
            read_u32 = lambda: struct.unpack('I', header.read(4))[0]
            read_i16 = lambda: struct.unpack('h', header.read(2))[0]

            magic = read_u32()
            if magic == CacheBlock.BLOCK_MAGIC:
                self.type = CacheBlock.BLOCK
                header.seek(2, 1)          # skip 2 unparsed bytes after magic
                self.version = read_i16()
                self.header = read_i16()
                self.nextFile = read_i16()
                self.blockSize = read_u32()
                self.entryCount = read_u32()
                self.entryMax = read_u32()
                self.empty = [read_u32() for _ in range(4)]
                self.position = [read_u32() for _ in range(4)]
            elif magic == CacheBlock.INDEX_MAGIC:
                self.type = CacheBlock.INDEX
                header.seek(2, 1)          # skip 2 unparsed bytes after magic
                self.version = read_i16()
                self.entryCount = read_u32()
                self.byteCount = read_u32()
                self.lastFileCreated = "f_%06x" % read_u32()
                header.seek(4 * 2, 1)      # skip 2 unparsed 32-bit fields
                self.tableSize = read_u32()
            else:
                raise Exception("Invalid Chrome Cache File")
|
||||
130
fanficfare/chromagnon/cacheData.py
Normal file
130
fanficfare/chromagnon/cacheData.py
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
Stores the data fetched in the cache.
|
||||
Parse the HTTP header if asked.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import re
|
||||
import shutil
|
||||
import struct
|
||||
import os
|
||||
|
||||
from . import cacheAddress
|
||||
from six.moves import range
|
||||
|
||||
class CacheData():
    """
    Retrieve data at the given address.
    Can save it to a separate file for export.
    """

    HTTP_HEADER = 0
    UNKNOWN = 1

    def __init__(self, address, size, isHTTPHeader=False):
        """
        It is a lazy evaluation object : the file is open only if it is
        needed. It can parse the HTTP header if asked to do so.
        See net/http/http_util.cc LocateStartOfStatusLine and
        LocateEndOfHeaders for details.

        address -- cacheAddress.CacheAddress locating the data
        size -- number of bytes stored at that address
        isHTTPHeader -- try to parse a block-file entry as an HTTP header

        Fixes vs. the original: raw bytes are read in one block.read()
        call (the old code unpacked one byte at a time with struct and
        concatenated with +=, which is quadratic; note read() also
        tolerates a short file where struct.unpack would raise), file
        handles are closed via `with`, and `== None` is `is None`.
        """
        self.size = size
        self.address = address
        self.type = CacheData.UNKNOWN

        if isHTTPHeader and\
           self.address.blockType != cacheAddress.CacheAddress.SEPARATE_FILE:
            # Getting raw data in a single read.
            with open(os.path.join(self.address.path,
                                   self.address.fileSelector), 'rb') as block:
                # 8192 bytes of block-file header precede the entries.
                block.seek(8192 + self.address.blockNumber * self.address.entrySize)
                string = block.read(self.size)

            # Finding the beginning of the request
            start = re.search(b"HTTP", string)
            if start is None:
                return
            string = string[start.start():]

            # Finding the end (some null characters : verified by experience)
            end = re.search(b"\x00\x00", string)
            if end is None:
                return
            string = string[:end.end() - 2]

            # Creating the dictionary of headers.
            # NOTE(review): keys end up as bytes here, while __str__ looks
            # up the str key 'content-type' — on Python 3 that lookup can
            # never hit.  Left as-is to preserve behavior; confirm intent.
            self.headers = {}
            for line in string.split(b'\0'):
                stripped = line.split(b':')
                self.headers[stripped[0].lower()] = \
                    b':'.join(stripped[1:]).strip()
            self.type = CacheData.HTTP_HEADER

    def save(self, filename=None):
        """Save the data to the specified filename"""
        if self.address.blockType == cacheAddress.CacheAddress.SEPARATE_FILE:
            shutil.copy(os.path.join(self.address.path, self.address.fileSelector),
                        filename)
        else:
            with open(filename, 'wb') as output:
                with open(os.path.join(self.address.path,
                                       self.address.fileSelector), 'rb') as block:
                    block.seek(8192 + self.address.blockNumber * self.address.entrySize)
                    output.write(block.read(self.size))

    def data(self):
        """Returns a string representing the data"""
        if self.address.blockType == cacheAddress.CacheAddress.SEPARATE_FILE:
            # NOTE(review): separate files are returned as raw bytes while
            # block data is utf-8 decoded below — callers must handle both.
            with open(os.path.join(self.address.path,
                                   self.address.fileSelector), 'rb') as infile:
                data = infile.read()
        else:
            with open(os.path.join(self.address.path,
                                   self.address.fileSelector), 'rb') as block:
                block.seek(8192 + self.address.blockNumber * self.address.entrySize)
                data = block.read(self.size).decode('utf-8')
        return data

    def __str__(self):
        """
        Display the type of cacheData
        """
        if self.type == CacheData.HTTP_HEADER:
            if 'content-type' in self.headers:
                return "HTTP Header %s" % self.headers['content-type']
            else:
                return "HTTP Header"
        else:
            return "Data"
|
||||
140
fanficfare/chromagnon/cacheEntry.py
Normal file
140
fanficfare/chromagnon/cacheEntry.py
Normal file
|
|
@ -0,0 +1,140 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
Chrome Cache Entry
|
||||
See http://www.chromium.org/developers/design-documents/network-stack/disk-cache
|
||||
for design details
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import datetime
|
||||
import struct
|
||||
import os
|
||||
|
||||
from . import cacheAddress
|
||||
from . import cacheData
|
||||
from six.moves import range
|
||||
|
||||
class CacheEntry():
    """
    One parsed entry of a Chrome block cache file.
    See /net/disk_cache/disk_format.h for details.
    """
    # Human-readable names indexed by the on-disk `state` field.
    STATE = ["Normal",
             "Evicted (data were deleted)",
             "Doomed (shit happened)"]

    def __init__(self, address):
        """
        Parse a Chrome Cache Entry at the given address
        (a cacheAddress.CacheAddress inside a block file).
        """
        self.httpHeader = None
        block = open(os.path.join(address.path, address.fileSelector), 'rb')

        # Going to the right entry; 8192 bytes of block-file header
        # precede the fixed-size entries.
        block.seek(8192 + address.blockNumber*address.entrySize)

        # Parsing basic fields (fixed order, little by little — do not
        # reorder these reads).
        self.hash = struct.unpack('I', block.read(4))[0]
        self.next = struct.unpack('I', block.read(4))[0]
        self.rankingNode = struct.unpack('I', block.read(4))[0]
        self.usageCounter = struct.unpack('I', block.read(4))[0]
        self.reuseCounter = struct.unpack('I', block.read(4))[0]
        self.state = struct.unpack('I', block.read(4))[0]
        # Stored as microseconds since 1601-01-01 (the base used here).
        self.creationTime = datetime.datetime(1601, 1, 1) + \
                            datetime.timedelta(microseconds=\
                                struct.unpack('Q', block.read(8))[0])
        self.keyLength = struct.unpack('I', block.read(4))[0]
        self.keyAddress = struct.unpack('I', block.read(4))[0]

        # Four data-stream sizes, then four data-stream addresses.
        dataSize = []
        for _ in range(4):
            dataSize.append(struct.unpack('I', block.read(4))[0])

        self.data = []
        for index in range(4):
            addr = struct.unpack('I', block.read(4))[0]
            try:
                addr = cacheAddress.CacheAddress(addr, address.path)
                self.data.append(cacheData.CacheData(addr, dataSize[index],
                                                     True))
            except cacheAddress.CacheAddressError:
                # Null/uninitialized stream addresses are simply skipped.
                pass

        # Find the HTTP header if there is one
        for data in self.data:
            if data.type == cacheData.CacheData.HTTP_HEADER:
                self.httpHeader = data
                break

        self.flags = struct.unpack('I', block.read(4))[0]

        # Skipping pad
        block.seek(5*4, 1)

        # Reading local key (stored inline after the fixed fields)
        if self.keyAddress == 0:
            self.key = block.read(self.keyLength).decode('ascii')
        # Key stored elsewhere
        else:
            addr = cacheAddress.CacheAddress(self.keyAddress, address.path)

            # It is probably an HTTP header
            self.key = cacheData.CacheData(addr, self.keyLength, True)

        block.close()

    def keyToStr(self):
        """
        Since the key can be a string or a CacheData object, this function is an
        utility to display the content of the key whatever type is it.
        """
        if self.keyAddress == 0:
            return self.key
        else:
            return self.key.data()

    def __str__(self):
        # Multi-line human-readable dump of every parsed field; optional
        # fields (next, keyAddress, flags) are shown only when non-zero.
        string = "Hash: 0x%08x" % self.hash + '\n'
        if self.next != 0:
            string += "Next: 0x%08x" % self.next + '\n'
        string += "Usage Counter: %d" % self.usageCounter + '\n'\
                  "Reuse Counter: %d" % self.reuseCounter + '\n'\
                  "Creation Time: %s" % self.creationTime + '\n'
        if self.keyAddress != 0:
            string += "Key Address: 0x%08x" % self.keyAddress + '\n'
        string += "Key: %s" % self.key + '\n'
        if self.flags != 0:
            string += "Flags: 0x%08x" % self.flags + '\n'
        string += "State: %s" % CacheEntry.STATE[self.state]
        for data in self.data:
            string += "\nData (%d bytes) at 0x%08x : %s" % (data.size,
                                                            data.address.addr,
                                                            data)
        return string
|
||||
293
fanficfare/chromagnon/cacheParse.py
Normal file
293
fanficfare/chromagnon/cacheParse.py
Normal file
|
|
@ -0,0 +1,293 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
Parse the Chrome Cache File
|
||||
See http://www.chromium.org/developers/design-documents/network-stack/disk-cache
|
||||
for design details
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import print_function
|
||||
import gzip
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
import re
|
||||
import brotli
|
||||
|
||||
from . import csvOutput
|
||||
from . import SuperFastHash
|
||||
|
||||
from .cacheAddress import CacheAddress
|
||||
from .cacheBlock import CacheBlock
|
||||
from .cacheData import CacheData
|
||||
from .cacheEntry import CacheEntry
|
||||
from six.moves import range
|
||||
|
||||
|
||||
def parse(path, urls=None):
    """
    Parse a Chrome disk cache directory.

    path: directory containing the cache ("index" plus data/block files).
    urls: optional list of URL strings. When None, every entry in the
          cache is returned; otherwise only the entries whose key matches
          one of the urls (misses are reported on stderr).

    Returns a list of CacheEntry objects.
    Raises Exception if the index file is not a valid Chrome cache index.
    """
    path = os.path.abspath(path)
    cacheBlock = CacheBlock(os.path.join(path, "index"))

    # Refuse anything that is not an index file.
    if cacheBlock.type != CacheBlock.INDEX:
        raise Exception("Invalid Index File")

    cache = []
    # 'with' guarantees the index file is closed even when a corrupt
    # entry raises part-way through (the old code leaked the handle).
    with open(os.path.join(path, "index"), 'rb') as index:
        # Skip the 92*4-byte index header; the hash table follows it.
        index.seek(92*4)

        if urls is None:
            # Walk the whole hash table.
            for key in range(cacheBlock.tableSize):
                raw = struct.unpack('I', index.read(4))[0]
                if raw != 0:
                    entry = CacheEntry(CacheAddress(raw, path=path))
                    # Entries chained off a bucket are not listed in the
                    # index file itself, so follow the chain explicitly or
                    # they would be missed.
                    while entry.next != 0:
                        cache.append(entry)
                        entry = CacheEntry(CacheAddress(entry.next, path=path))
                    cache.append(entry)
        else:
            # Targeted lookup: hash each url straight to its bucket.
            for url in urls:
                url = bytes(url, 'utf8')
                # 'url_hash' (was 'hash') avoids shadowing the builtin.
                url_hash = SuperFastHash.superFastHash(url)
                key = url_hash & (cacheBlock.tableSize - 1)
                index.seek(92*4 + key*4)

                addr = struct.unpack('I', index.read(4))[0]
                # The high bit marks the bucket as initialized (in use).
                if addr & 0x80000000 == 0:
                    print("%s is not in the cache" % url, file=sys.stderr)
                else:
                    # Follow the chained list in the bucket.
                    entry = CacheEntry(CacheAddress(addr, path=path))
                    while entry.hash != url_hash and entry.next != 0:
                        entry = CacheEntry(CacheAddress(entry.next, path=path))
                    if entry.hash == url_hash:
                        cache.append(entry)
    return cache
|
||||
|
||||
class ChromeCache(object):
    """
    Lookup wrapper around a parsed Chrome disk cache.

    Builds a dict from entry key (URL) to entry so lookups are O(1)
    instead of a linear scan of the whole cache per request.
    """

    def __init__(self, path):
        # path: Chrome cache directory, handed straight to parse().
        self.cache = parse(path)
        self.hash_cache = {}
        for entry in self.cache:
            key = entry.keyToStr()
            self.hash_cache[key] = entry
            # Also index ffnet chapter URLs without their trailing title
            # slug, so both URL forms resolve to the same entry.  The dots
            # are now escaped: an unescaped '.' matched any character.
            normkey = re.sub(r'^(https://www\.fanfiction\.net/s/\d+/\d+/).+$',
                             r'\1', key)
            ## either overwrites (no harm), or adds new.
            self.hash_cache[normkey] = entry

    def get_cached_file(self, url):
        """
        Return the decoded body bytes for url, or None when the url is
        not cached or the entry has no body stream.
        """
        if url not in self.hash_cache:
            return None
        entry = self.hash_cache[url]
        for stream in entry.data:
            # UNKNOWN marks the body stream (as opposed to HTTP_HEADER).
            if stream.type == CacheData.UNKNOWN:
                data = stream.data()
                # Undo the transfer encoding recorded in the cached headers.
                if entry.httpHeader is not None and \
                        b'content-encoding' in entry.httpHeader.headers:
                    encoding = entry.httpHeader.headers[b'content-encoding']
                    if encoding == b"gzip":
                        data = gzip.decompress(data)
                    elif encoding == b"br":
                        data = brotli.decompress(data)
                return data
        return None
|
||||
|
||||
def exportToHTML(cache, outpath):
    """
    Export the cache in html: writes an index.html linking to one page
    per entry, saving each entry's body stream (decompressed when it is
    gzip or brotli encoded) alongside the pages in outpath.
    """

    # Checking that the directory exists and is writable
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    outpath = os.path.abspath(outpath)

    index = open(os.path.join(outpath,"index.html"), 'w')
    index.write("<UL>")

    for entry in cache:
        # Adding a link in the index (long keys truncated for readability)
        if entry.keyLength > 100:
            entry_name = entry.keyToStr()[:100] + "..."
        else:
            entry_name = entry.keyToStr()
        index.write('<LI><a href="%08x.html">%s</a></LI>'%(entry.hash, entry_name))
        # We handle the special case where entry_name ends with a slash
        page_basename = entry_name.split('/')[-2] if entry_name.endswith('/') else entry_name.split('/')[-1]

        # Creating the entry page (named after the entry's hash)
        page = open(os.path.join(outpath,"%08x.html"%entry.hash), 'w')
        page.write("""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
</head>
<body>""")

        # Details of the entry
        page.write("<b>Hash</b>: 0x%08x<br />"%entry.hash)
        page.write("<b>Usage Counter</b>: %d<br />"%entry.usageCounter)
        page.write("<b>Reuse Counter</b>: %d<br />"%entry.reuseCounter)
        page.write("<b>Creation Time</b>: %s<br />"%entry.creationTime)
        page.write("<b>Key</b>: %s<br>"%entry.keyToStr())
        page.write("<b>State</b>: %s<br>"%CacheEntry.STATE[entry.state])

        page.write("<hr>")
        ## entry.data normally 2 or 1
        ## 2 for headers and data, 1 for headers only.
        if len(entry.data) == 0:
            page.write("No data associated with this entry :-(")
        for i in range(len(entry.data)):
            if entry.data[i].type == CacheData.UNKNOWN:
                # Extracting data into a file
                name = hex(entry.hash) + "_" + str(i)
                entry.data[i].save(os.path.join(outpath,name))

                # print("content-encoding:%s"%entry.httpHeader.headers.get(b'content-encoding',''))
                # Header keys/values are bytes, hence the b'' lookups.
                if entry.httpHeader != None and \
                   b'content-encoding' in entry.httpHeader.headers:
                    if entry.httpHeader.headers[b'content-encoding'] == b"gzip":
                        # XXX Highly inefficient !!!!!
                        try:
                            input = gzip.open(os.path.join(outpath, name), 'rb')
                            output = open(os.path.join(outpath, name + "u"), 'wb')
                            output.write(input.read())
                            input.close()
                            output.close()
                            # "u" suffix = uncompressed copy of the stream
                            page.write('<a href="%su">%s</a>'%(name, page_basename))
                            # print("gunzip'ed:%s"%name)
                        except IOError:
                            page.write("Something wrong happened while unzipping")
                    elif entry.httpHeader.headers[b'content-encoding'] == b"br":
                        try:
                            with open(os.path.join(outpath,name), 'rb') as input:
                                with open(os.path.join(outpath,name + "u"), 'wb') as output:
                                    output.write(brotli.decompress(input.read()))
                            page.write('<a href="%su">%s</a>'%(name, page_basename))
                            # print("unbrotli'ed:%s"%name)
                        except IOError:
                            page.write("Something wrong happened while unzipping")
                        # NOTE(review): the bare 'brotli' below is a no-op
                        # expression statement — looks like leftover debris;
                        # candidate for removal.
                        brotli
                else:
                    # No content-encoding: link the raw saved stream.
                    page.write('<a href="%s">%s</a>'%(name ,
                               entry.keyToStr().split('/')[-1]))


                # If it is a picture, display it
                if entry.httpHeader != None:
                    if b'content-type' in entry.httpHeader.headers and\
                       b"image" in entry.httpHeader.headers[b'content-type']:
                        page.write('<br /><img src="%s">'%(name))
            # HTTP Header
            else:
                page.write("<u>HTTP Header</u><br />")
                for key, value in entry.data[i].headers.items():
                    page.write("<b>%s</b>: %s<br />"%(key, value))
            page.write("<hr>")
        page.write("</body></html>")
        page.close()

    index.write("</UL>")
    index.close()
|
||||
|
||||
def exportTol2t(cache):
    """
    Export the cache in CSV log2timeline compliant format,
    written to stdout via csvOutput.
    """
    output = []
    # log2timeline v2 header row.
    output.append(["date",
                   "time",
                   "timezone",
                   "MACB",
                   "source",
                   "sourcetype",
                   "type",
                   "user",
                   "host",
                   "short",
                   "desc",
                   "version",
                   "filename",
                   "inode",
                   "notes",
                   "format",
                   "extra"])

    for entry in cache:
        date = entry.creationTime.date().strftime("%m/%d/%Y")
        time = entry.creationTime.time()
        # TODO get timezone
        timezone = 0
        short = entry.keyToStr()
        descr = "Hash: 0x%08x" % entry.hash
        descr += " Usage Counter: %d" % entry.usageCounter
        # Header names are bytes (see the b'content-encoding' lookups in
        # the other exporters); the old str key 'content-type' could
        # never match, so the MIME type was silently never emitted.
        if entry.httpHeader is not None:
            if b'content-type' in entry.httpHeader.headers:
                descr += " MIME: %s" % entry.httpHeader.headers[b'content-type']

        output.append([date,
                       time,
                       timezone,
                       "MACB",
                       "WEBCACHE",
                       "Chrome Cache",
                       "Cache Entry",
                       "-",
                       "-",
                       short,
                       descr,
                       "2",
                       "-",
                       "-",
                       "-",
                       "-",
                       "-",
                       ])

    csvOutput.csvOutput(output)
|
||||
45
fanficfare/chromagnon/classicalOutput.py
Normal file
45
fanficfare/chromagnon/classicalOutput.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
Classical Output Module
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import sys
|
||||
|
||||
def classicalOutput(queryResult, separator="\t"):
    """
    Write each row of queryResult to stdout: every field is followed by
    *separator* (including the last one), one row per line.
    """
    write = sys.stdout.write
    for row in queryResult:
        write(''.join(field + separator for field in row))
        write('\n')
|
||||
49
fanficfare/chromagnon/columnOutput.py
Normal file
49
fanficfare/chromagnon/columnOutput.py
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
Column Output Module
|
||||
"""
|
||||
|
||||
from __future__ import print_function
|
||||
from six.moves import range
|
||||
def columnOutput(queryResult, separator=' '):
    """
    Print the rows of queryResult as left-justified, aligned columns.

    queryResult: list of equal-length rows; values are rendered via str().
    separator:   string placed between columns (not after the last one).
    """
    if not queryResult:
        return

    # Width of each column = widest rendered value in that column.
    widths = [max(len(str(row[col])) for row in queryResult)
              for col in range(len(queryResult[0]))]
    # Join the per-column formats with the separator.  The old code
    # concatenated "fmt+sep" per column and sliced with
    # [:-len(separator)], which emptied the whole format string when
    # separator was '' (s[:-0] == s[:0]).
    fmt = separator.join("%%-%ds" % width for width in widths)
    for line in queryResult:
        print(fmt % tuple(line))
|
||||
44
fanficfare/chromagnon/csvOutput.py
Normal file
44
fanficfare/chromagnon/csvOutput.py
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
CSV Output Module
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import csv
|
||||
import sys
|
||||
|
||||
def csvOutput(queryResult, separator=',', quote='"'):
    """
    Write queryResult to stdout as CSV rows.

    separator: field delimiter.
    quote:     quoting character (minimal quoting is applied).
    """
    writer = csv.writer(sys.stdout, delimiter=separator, quotechar=quote,
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerows(queryResult)
|
||||
106
fanficfare/chromagnon/downloadParse.py
Normal file
106
fanficfare/chromagnon/downloadParse.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
Parse the Chrome Download Table History File
|
||||
Its a SQLite3 table
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import print_function
|
||||
import datetime
|
||||
import sqlite3
|
||||
import sys
|
||||
import six
|
||||
|
||||
def parse(filename, urlLength):
    """
    Parse Chrome's download history (a SQLite3 database).

    filename:  path to the history file
    urlLength: maximum url length to display (<= 0 disables truncation)

    Returns a list of DownloadEntry objects.
    """
    # Connecting to the DB
    try:
        history = sqlite3.connect(filename)
    except sqlite3.Error as error:
        print("==> Error while opening the history file !")
        # sqlite3.Error has no .message attribute on Python 3;
        # str(error) is portable across 2 and 3.
        print("==> Details :", error)
        sys.exit("==> Exiting...")

    try:
        # Retrieving all useful data
        result = history.execute("SELECT id, \
                full_path, \
                url, \
                start_time, \
                received_bytes, \
                total_bytes, \
                state \
                FROM downloads;")
        output = [DownloadEntry(line, urlLength) for line in result]
    finally:
        # Always release the connection (the old code never closed it).
        history.close()
    return output
|
||||
|
||||
class DownloadEntry(object):
    """One row of Chrome's downloads table, decoded for display."""

    # Short column code accepted by columnToStr -> attribute name.
    COLUMN_STR = {'st': "startTime",
                  'p': "path",
                  'u': "url",
                  'rb': "receivedBytes",
                  'tb': "totalBytes",
                  'pt': "percentReceived",
                  's': "state"}
    # Index in this list == the integer 'state' column value.
    STATE_STR = ["In Progress",
                 "Complete",
                 "Cancelled",
                 "Removing",
                 "Interrupted"]

    def __init__(self, item, urlLength):
        """item: raw SQL row; urlLength: truncate url display to this length (>0)."""
        self.path = item[1]
        raw_url = item[2]
        # Truncate over-long urls, reserving three chars for the ellipsis.
        if urlLength > 0 and len(raw_url) > urlLength:
            self.url = raw_url[0:urlLength - 3] + "..."
        else:
            self.url = raw_url
        # Chrome timestamps are microseconds since 1601-01-01.
        self.startTime = (datetime.datetime(1601, 1, 1)
                          + datetime.timedelta(microseconds=item[3]))
        self.receivedBytes = item[4]
        self.totalBytes = item[5]
        self.state = DownloadEntry.STATE_STR[item[6]]
        total = int(item[5])
        if total == 0:
            self.percentReceived = "0%"
        else:
            self.percentReceived = "%d%%" % int(float(item[4]) / float(total) * 100)

    def columnToStr(self, column):
        """Return the content of the column named by its short code."""
        return six.text_type(getattr(self, DownloadEntry.COLUMN_STR[column]))
|
||||
178
fanficfare/chromagnon/historyParse.py
Normal file
178
fanficfare/chromagnon/historyParse.py
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
Parse the Chrome History File
|
||||
Its a SQLite3 file
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import print_function
|
||||
import datetime
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
from . import cacheParse
|
||||
import six
|
||||
|
||||
def parse(filename, start, end, checkCache, cachePath, urlLength):
    """
    Parse Chrome's visit history (a SQLite3 database).

    filename:   path to the history file
    start:      beginning of the time window (datetime)
    end:        end of the time window (datetime)
    checkCache: check if each page in the history is in the cache
    cachePath:  path to cache directory (used only when checkCache is true)
    urlLength:  maximum url length to display

    Returns a list of HistoryEntry objects ordered by visit time.
    """
    # Connecting to the DB
    try:
        history = sqlite3.connect(filename)
    except sqlite3.Error as error:
        print("==> Error while opening the history file !")
        # sqlite3.Error has no .message attribute on Python 3;
        # str(error) is portable across 2 and 3.
        print("==> Details :", error)
        sys.exit("==> Exiting...")

    # Chrome stores times as microseconds since this epoch.
    reference = datetime.datetime(1601, 1, 1)

    try:
        # Retrieving all useful data within the (start, end) window.
        result = history.execute("SELECT visits.visit_time, \
                visits.from_visit, \
                visits.transition, \
                urls.url, \
                urls.title, \
                urls.visit_count, \
                urls.typed_count, \
                urls.last_visit_time \
                FROM urls,visits \
                WHERE urls.id=visits.url\
                AND visits.visit_time>%d\
                AND visits.visit_time<%d\
                ORDER BY visits.visit_time;" %
                (int((start - reference).total_seconds() * 1000000),
                 int((end - reference).total_seconds() * 1000000)))

        # Parsing cache (only when the caller asked for cache cross-check)
        cache = None
        if checkCache:
            cache = cacheParse.parse(cachePath)

        output = [HistoryEntry(line, cache, urlLength) for line in result]
    finally:
        # Always release the connection (the old code never closed it).
        history.close()
    return output
|
||||
|
||||
class Transition():
    """Object representing transition between history pages"""

    # Index == the low byte of the raw transition value.
    CORE_STRING = ["Link",
                   "Typed",
                   "Auto Bookmark",
                   "Auto Subframe",
                   "Manual Subframe",
                   "Generated",
                   "Start Page",
                   "Form Submit",
                   "Reload",
                   "Keyword",
                   "Keyword Generated"]  # fixed typo: was "Keywork Generated"
    # (bitmask, description) pairs for the qualifier bits.
    QUALIFIER_STRING = [(0x01000000, "Forward or Back Button"),
                        (0x02000000, "Address Bar"),
                        (0x04000000, "Home Page"),
                        (0x10000000, "Beginning of Chain"),
                        (0x20000000, "End of Chain"),
                        (0x40000000, "Client Redirection"),
                        (0x80000000, "Server Redirection")]

    def __init__(self, transition):
        """
        Parsing the transition according to
        content/common/page_transition_types.h
        """
        # Low byte selects the core type; the high bits are qualifier flags.
        self.core = transition & 0xFF
        self.qualifier = transition & 0xFFFFFF00

    def __str__(self):
        """Core type name followed by every qualifier that is set."""
        string = Transition.CORE_STRING[self.core]
        for mask, description in Transition.QUALIFIER_STRING:
            if self.qualifier & mask != 0:
                string += ", %s" % description
        return string
|
||||
|
||||
class HistoryEntry(object):
    """One visit row from Chrome's history database, decoded for display."""

    # Short column code accepted by columnToStr -> attribute name.
    COLUMN_STR = {'vt': "visitTime",
                  'fv': "fromVisit",
                  'tr': "transition",
                  'u': "url",
                  'tl': "title",
                  'vc': "visitCount",
                  'tc': "typedCount",
                  'lv': "lastVisitTime",
                  'cc': "inCache"}

    def __init__(self, item, cache, urlLength):
        """item: raw SQL row; cache: parsed cache entry list or None."""
        # Chrome timestamps are microseconds since 1601-01-01.
        epoch = datetime.datetime(1601, 1, 1)
        self.visitTime = epoch + datetime.timedelta(microseconds=item[0])
        self.fromVisit = item[1]
        self.transition = Transition(item[2])
        raw_url = item[3]
        # Truncate over-long urls, reserving three chars for the ellipsis.
        if urlLength > 0 and len(raw_url) > urlLength:
            self.url = raw_url[0:urlLength - 3] + "..."
        else:
            self.url = raw_url
        self.title = item[4]
        self.visitCount = item[5]
        self.typedCount = item[6]
        self.lastVisitTime = epoch + datetime.timedelta(microseconds=item[7])

        # Flag whether a copy of the page is present in the parsed cache.
        # TODO use a hash table to search instead of heavy exhaustive search
        self.inCache = False
        if cache is not None:
            self.inCache = any(cached.keyToStr() == self.url
                               for cached in cache)

    def toStr(self):
        """All displayable fields, as text, in canonical column order."""
        return [six.text_type(value) for value in
                (self.visitTime,
                 self.fromVisit,
                 self.transition,
                 self.url,
                 self.title,
                 self.visitCount,
                 self.typedCount,
                 self.lastVisitTime)]

    def columnToStr(self, column):
        """Return the content of the column named by its short code."""
        return six.text_type(getattr(self, HistoryEntry.COLUMN_STR[column]))
|
||||
42
fanficfare/chromagnon/jsonOutput.py
Normal file
42
fanficfare/chromagnon/jsonOutput.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
JSON Output Module
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import print_function
|
||||
import json
|
||||
|
||||
def jsonOutput(queryResult, separator=''):
    """
    Print queryResult to stdout as a single JSON document.

    separator is accepted for interface parity with the other output
    modules but is not used.
    """
    print(json.dumps(queryResult))
|
||||
---- diff: new file fanficfare/chromagnon/visitedLinks.py (97 lines added) ----
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the Chromagon Project nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""
|
||||
Parse the Chrome Visited Links
|
||||
Reverse engineered from
|
||||
chrome/common/visitedlink_common.*
|
||||
chrome/browser/visitedlink/visitedlink_*
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import md5
|
||||
import struct
|
||||
import sys
|
||||
from six.moves import range
|
||||
|
||||
VISITED_LINKS_MAGIC = 0x6b6e4c56  # little-endian ASCII "VLnk"


def isVisited(path, urls):
    """
    Check URLs against a Chrome "Visited Links" hash-table file.

    :param path: path to Chrome's ``Visited Links`` file.
    :param urls: iterable of URLs (str or bytes) to look up.
    :return: list of ``(url, bool)`` tuples, one per input URL, in
        input order; ``True`` when the URL's fingerprint is present.
    :raises Exception: if the file does not start with the expected magic.
    """
    # hashlib replaces the Python-2-only ``md5`` module, so this
    # function works on both Python 2 and 3.
    import hashlib

    output = []

    # ``with`` guarantees the handle is closed even if parsing raises;
    # the previous version leaked it on every error path.
    with open(path, 'rb') as f:
        # Header layout: magic, version, table length (in slots),
        # used-slot count -- four native uint32 values.
        magic = struct.unpack('I', f.read(4))[0]
        if magic != VISITED_LINKS_MAGIC:
            raise Exception("Invalid file")

        version = struct.unpack('I', f.read(4))[0]    # read to advance; unused
        length = struct.unpack('I', f.read(4))[0]
        usedItems = struct.unpack('I', f.read(4))[0]  # read to advance; unused

        # 8-byte salt mixed into every fingerprint.
        salt = f.read(8)

        for url in urls:
            # Hash over bytes; accept str callers transparently.
            if isinstance(url, bytes):
                url_bytes = url
            else:
                url_bytes = url.encode('utf-8')

            digest = hashlib.md5(salt + url_bytes).hexdigest()

            # Chrome's fingerprint is the first 8 digest bytes taken
            # little-endian, hence the byte-order inversion here.
            fingerprint = 0
            for i in range(0, 16, 2):
                # ``//`` keeps the shift amount an integer; the old
                # ``(i/2)*8`` produced a float shift under Python 3.
                fingerprint += int(digest[i:i + 2], 16) << (i // 2) * 8
            key = fingerprint % length

            # Open addressing: probe forward from the home slot,
            # wrapping at the end of the table (data starts at byte 24).
            # An empty (zero) slot means the URL is absent.
            f.seek(key * 8 + 24, 0)
            while True:
                # 'Q' (unsigned) matches Chrome's uint64 fingerprints;
                # the old signed 'q' could never match an entry whose
                # high bit was set.
                finger = struct.unpack('Q', f.read(8))[0]
                if finger == 0:
                    output.append((url, False))
                    break
                if finger == fingerprint:
                    output.append((url, True))
                    break
                if f.tell() >= length * 8 + 24:
                    f.seek(24)
                if f.tell() == key * 8 + 24:
                    # Probed every slot without finding the fingerprint.
                    output.append((url, False))
                    break
    return output
|
||||
---- end of diff ----