# FanFicFare/fanficfare/browsercache/chromagnon/cacheParse.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Parse the Chrome Cache File
See http://www.chromium.org/developers/design-documents/network-stack/disk-cache
for design details
"""
import gzip
import os
import struct
import sys
#import csvOutput
from . import SuperFastHash
from .cacheAddress import CacheAddress
from .cacheBlock import CacheBlock
from .cacheData import CacheData
from .cacheEntry import CacheEntry


def parse(path, urls=None):
"""
Reads the whole cache and store the collected data in a table
or find out if the given list of urls is in the cache. If yes it
return a list of the corresponding entries.
"""
    # Make sure the path ends with a separator ('/' also works on Windows)
path = os.path.abspath(path) + '/'
cacheBlock = CacheBlock(path + "index")
# Checking type
if cacheBlock.type != CacheBlock.INDEX:
raise Exception("Invalid Index File")
index = open(path + "index", 'rb')
    # Skip the index header, 92 32-bit words (368 bytes); the bucket table follows
index.seek(92*4)
cache = []
# If no url is specified, parse the whole cache
    if urls is None:
for key in range(cacheBlock.tableSize):
raw = struct.unpack('I', index.read(4))[0]
if raw != 0:
entry = CacheEntry(CacheAddress(raw, path=path))
                # Walk the collision chain in the bucket: chained entries are
                # reachable only through each entry's 'next' pointer, not
                # through the index table itself, so collect them here
while entry.next != 0:
cache.append(entry)
entry = CacheEntry(CacheAddress(entry.next, path=path))
cache.append(entry)
else:
# Find the entry for each url
for url in urls:
            # Compute the bucket key from the hash and seek to it
hash = SuperFastHash.superFastHash(url)
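            # The table size is a power of two, so masking with
            # (tableSize - 1) keeps the hash's low-order bits as the bucket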
key = hash & (cacheBlock.tableSize - 1)
index.seek(92*4 + key*4)
addr = struct.unpack('I', index.read(4))[0]
            # The top bit (0x80000000) of an address is its 'initialized'
            # flag; if it is clear, the bucket is unused
if addr & 0x80000000 == 0:
print("%s is not in the cache" % url)
# Follow the chained list in the bucket
else:
entry = CacheEntry(CacheAddress(addr, path=path))
while entry.hash != hash and entry.next != 0:
entry = CacheEntry(CacheAddress(entry.next, path=path))
if entry.hash == hash:
cache.append(entry)
return cache
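

# A minimal usage sketch (the cache path is a placeholder; it should point
# at a Chrome disk cache directory containing an "index" file):
#
#   entries = parse("/home/user/.cache/chromium/Default/Cache")
#   hits = parse("/home/user/.cache/chromium/Default/Cache",
#                urls=["http://example.com/"])
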
def exportToHTML(cache, outpath):
"""
Export the cache in html
"""
    # Create the output directory if it does not exist yet
if not os.path.exists(outpath):
os.makedirs(outpath)
outpath = os.path.abspath(outpath) + '/'
index = open(outpath + "index.html", 'w')
index.write("<UL>")
for entry in cache:
# Adding a link in the index
if entry.keyLength > 100:
entry_name = entry.keyToStr()[:100] + "..."
else:
entry_name = entry.keyToStr()
index.write('<LI><a href="%08x">%s</a></LI>'%(entry.hash, entry_name))
# We handle the special case where entry_name ends with a slash
page_basename = entry_name.split('/')[-2] if entry_name.endswith('/') else entry_name.split('/')[-1]
# Creating the entry page
page = open(outpath + "%08x"%entry.hash, 'w')
page.write("""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
</head>
<body>""")
# Details of the entry
page.write("<b>Hash</b>: 0x%08x<br />"%entry.hash)
page.write("<b>Usage Counter</b>: %d<br />"%entry.usageCounter)
page.write("<b>Reuse Counter</b>: %d<br />"%entry.reuseCounter)
page.write("<b>Creation Time</b>: %s<br />"%entry.creationTime)
page.write("<b>Key</b>: %s<br>"%entry.keyToStr())
page.write("<b>State</b>: %s<br>"%CacheEntry.STATE[entry.state])
page.write("<hr>")
if len(entry.data) == 0:
page.write("No data associated with this entry :-(")
for i in range(len(entry.data)):
if entry.data[i].type == CacheData.UNKNOWN:
# Extracting data into a file
name = hex(entry.hash) + "_" + str(i)
entry.data[i].save(outpath + name)
                if entry.httpHeader is not None and \
                   entry.httpHeader.headers.get('content-encoding') == "gzip":
# XXX Highly inefficient !!!!!
                    try:
                        gzinput = gzip.open(outpath + name, 'rb')
                        # gzip yields bytes, so the unpacked copy must be
                        # written in binary mode as well
                        output = open(outpath + name + "u", 'wb')
                        output.write(gzinput.read())
                        gzinput.close()
                        output.close()
page.write('<a href="%su">%s</a>'%(name, page_basename))
except IOError:
page.write("Something wrong happened while unzipping")
else:
page.write('<a href="%s">%s</a>'%(name ,
entry.keyToStr().split('/')[-1]))
# If it is a picture, display it
                if entry.httpHeader is not None:
                    if 'content-type' in entry.httpHeader.headers and \
                       "image" in entry.httpHeader.headers['content-type']:
page.write('<br /><img src="%s">'%(name))
# HTTP Header
else:
page.write("<u>HTTP Header</u><br />")
for key, value in entry.data[i].headers.items():
page.write("<b>%s</b>: %s<br />"%(key, value))
page.write("<hr>")
page.write("</body></html>")
page.close()
index.write("</UL>")
index.close()
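

# A minimal usage sketch (paths are placeholders):
#
#   entries = parse("/home/user/.cache/chromium/Default/Cache")
#   exportToHTML(entries, "/tmp/cache_report")
#   # ...then open /tmp/cache_report/index.html in a browser
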
def exportTol2t(cache):
"""
Export the cache in CSV log2timeline compliant format
"""
output = []
output.append(["date",
"time",
"timezone",
"MACB",
"source",
"sourcetype",
"type",
"user",
"host",
"short",
"desc",
"version",
"filename",
"inode",
"notes",
"format",
"extra"])
for entry in cache:
date = entry.creationTime.date().strftime("%m/%d/%Y")
time = entry.creationTime.time()
# TODO get timezone
timezone = 0
short = entry.keyToStr()
descr = "Hash: 0x%08x" % entry.hash
descr += " Usage Counter: %d" % entry.usageCounter
        if entry.httpHeader is not None:
            if 'content-type' in entry.httpHeader.headers:
                descr += " MIME: %s" % entry.httpHeader.headers['content-type']
output.append([date,
time,
timezone,
"MACB",
"WEBCACHE",
"Chrome Cache",
"Cache Entry",
"-",
"-",
short,
descr,
"2",
"-",
"-",
"-",
"-",
"-",
])
    # csvOutput.csvOutput(output)
    return output
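

# A minimal sketch of writing the rows out with the standard csv module,
# since the csvOutput helper above is commented out (the output filename
# is a placeholder):
#
#   import csv
#   rows = exportTol2t(entries)
#   with open("cache_l2t.csv", "w", newline="") as f:
#       csv.writer(f).writerows(rows)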