From 8f198bae217fa767225dad6b8b3417b3f35c982f Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Wed, 28 Oct 2015 18:06:19 -0500
Subject: [PATCH] Allow logging in to sites, to view hidden things

---
 .gitignore        |  2 ++
 epub.py           |  2 +-
 fetch.py          | 45 ++++++++++++++++++---------------------------
 leech.py          | 20 +++++++++++++++-----
 requirements.txt  |  1 +
 sites/__init__.py | 10 +++++++---
 sites/xenforo.py  | 13 +++++++++++--
 7 files changed, 55 insertions(+), 38 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8d0268a..dadd55b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 *.epub
 *.mobi
 leech.db
+leech.cookies
+leech.json
 venv/
 
 # Byte-compiled / optimized / DLL files
diff --git a/epub.py b/epub.py
index 7710ff4..fc90852 100644
--- a/epub.py
+++ b/epub.py
@@ -121,7 +121,7 @@
 
     epub.close()
 
-    return True
+    return filename
 
 if __name__ == '__main__':
     make_epub('test.epub', [('Chapter 1', 'test/a.html'), ('Chapter 2', 'test/b.html')], {})
diff --git a/fetch.py b/fetch.py
index 5cb9621..dc87f5e 100644
--- a/fetch.py
+++ b/fetch.py
@@ -1,10 +1,9 @@
 #!/usr/bin/python
 
-import gzip
 import sqlite3
+import http.cookiejar
 
-from io import BytesIO
-from urllib.request import Request, urlopen
+import requests
 
 __version__ = 1
 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
@@ -20,7 +19,7 @@ class Fetch:
         if it doesn't already exist. (":memory:" will store everything
         in-memory, if you only need to use this as a temporary thing).
         """
-        store = sqlite3.connect(storepath)
+        store = sqlite3.connect(storepath + '.db')
         self.store = store
         c = store.cursor()
         c.execute("""CREATE TABLE IF NOT EXISTS cache (url TEXT, content BLOB, time TEXT, PRIMARY KEY (url))""")
@@ -29,6 +28,18 @@ class Fetch:
 
         self.cachetime = cachetime
 
+        lwp_cookiejar = http.cookiejar.LWPCookieJar()
+        try:
+            lwp_cookiejar.load(storepath + '.cookies', ignore_discard=True)
+        except Exception as e:
+            pass
+
+        self.session = requests.Session()
+        self.session.cookies = lwp_cookiejar
+        self.session.headers.update({
+            'User-agent': USER_AGENT
+        })
+
     def __call__(self, url, **kw):
         return self.get(url, **kw)
 
@@ -44,9 +55,9 @@ class Fetch:
             c.close()
             if row:
                 return row[0]
-        data = _fetch(url, **kw)
-        self.__set(url, data)
-        return data
+        data = self.session.get(url, **kw)
+        self.__set(url, data.text)
+        return data.text
 
     def __set(self, url, value):
         """Add a value to the store, at the current time
@@ -58,23 +69,3 @@ class Fetch:
         c.execute("""REPLACE INTO cache VALUES (?, ?, CURRENT_TIMESTAMP)""", (url, value,))
         self.store.commit()
         c.close()
-
-
-def _fetch(url, data=None, ungzip=True):
-    """A generic URL-fetcher, which handles gzipped content, returns a string"""
-    request = Request(url)
-    request.add_header('Accept-encoding', 'gzip')
-    request.add_header('User-agent', USER_AGENT)
-    try:
-        f = urlopen(request, data)
-    except Exception as e:
-        return None
-    data = f.read()
-    if ungzip and f.headers.get('content-encoding', '') == 'gzip':
-        data = gzip.GzipFile(fileobj=BytesIO(data), mode='r').read()
-    try:
-        data = data.decode()
-    except UnicodeDecodeError:
-        data = data.decode('latin1')
-    f.close()
-    return data
\ No newline at end of file
diff --git a/leech.py b/leech.py
index e27d2ed..6787771 100755
--- a/leech.py
+++ b/leech.py
@@ -3,12 +3,13 @@
 import argparse
 import importlib
 import os
+import json
 
 import sites
 import epub
 from fetch import Fetch
 
-fetch = Fetch("leech.db")
+fetch = Fetch("leech")
 
 
 html_template = '''
@@ -23,14 +24,21 @@
 '''
 
 
-def leech(url, filename=None):
+def leech(url, filename=None, cache=True):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
     site = sites.get(url)
     if not site:
         raise Exception("No site handler found")
 
-    handler = site(fetch)
+    handler = site(fetch, cache=cache)
+
+    with open('leech.json') as store_file:
+        store = json.load(store_file)
+    login = store.get('logins', {}).get(site.__name__, False)
+    if login:
+        handler.login(login)
+
     story = handler.extract(url)
     if not story:
         raise Exception("Couldn't extract story")
@@ -46,7 +54,7 @@
 
     filename = filename or story['title'] + '.epub'
 
-    epub.make_epub(filename, html, metadata)
+    filename = epub.make_epub(filename, html, metadata)
 
     return filename
 
@@ -54,7 +62,9 @@
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch")
    parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)")
+    parser.add_argument('--no-cache', dest='cache', action='store_false')
+    parser.set_defaults(cache=True)
     args = parser.parse_args()
-    filename = leech(args.url, filename=args.filename)
+    filename = leech(args.url, filename=args.filename, cache=args.cache)
     print("File created:", filename)
diff --git a/requirements.txt b/requirements.txt
index 94bda8d..0b2104c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 beautifulsoup4==4.4.1
 html5lib==0.999
+requests==2.8.1
 six==1.6.1
diff --git a/sites/__init__.py b/sites/__init__.py
index b5e44b6..0f537a4 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -7,9 +7,10 @@ class Site:
     """A Site handles checking whether a URL might represent a site,
     and then extracting the content of a story from said site.
     """
-    def __init__(self, fetch):
+    def __init__(self, fetch, cache=True):
         super().__init__()
         self.fetch = fetch
+        self.cache = cache
 
     @staticmethod
     def matches(url):
@@ -18,8 +19,11 @@ class Site:
     def extract(self, url):
         raise NotImplementedError()
 
-    def _soup(self, url, method='html5lib'):
-        page = self.fetch(url)
+    def login(self, login_details):
+        raise NotImplementedError()
+
+    def _soup(self, url, method='html5lib', **kw):
+        page = self.fetch(url, cached=self.cache, **kw)
         if not page:
             raise SiteException("Couldn't fetch", url)
         return BeautifulSoup(page, method)
diff --git a/sites/xenforo.py b/sites/xenforo.py
index 6a96679..f4413fb 100644
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -13,14 +13,23 @@ class XenForo(Site):
     def matches(cls, url):
         return re.match(r'^https?://%s/threads/.*\d+/?.*' % cls.domain, url)
 
+    def login(self, login_details):
+        # Todo: handle non-https?
+        post = {
+            'login': login_details[0],
+            'password': login_details[1],
+        }
+        self.fetch.session.post('https://%s/login/login' % self.domain, data=post)
+        print("Logged in as", login_details[0])
+
     def extract(self, url):
        soup = self._soup(url)
         base = soup.head.base.get('href')
 
         story = {}
-        story['title'] = str(soup.find('h1').string)
-        story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
+        story['title'] = soup.find('h1').get_text()
+        story['author'] = soup.find('p', id='pageDescription').find('a', class_='username').get_text()
 
         marks = self._chapter_list(url)
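
A note for anyone reviewing or trying this out: leech() looks credentials up with store.get('logins', {}).get(site.__name__, False), and XenForo.login() reads login_details[0] and login_details[1], so leech.json wants a two-element username/password pair keyed by the handler class name. A minimal sketch of the file (the site name and credentials are illustrative placeholders, not something the patch ships):

    {
        "logins": {
            "XenForo": ["my_username", "my_password"]
        }
    }

Also worth noting: leech() now opens leech.json unconditionally, so the file has to exist (even as just {}) before any story can be fetched.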
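
The new --no-cache flag is threaded from argparse through leech() into Site._soup(), which passes cached=self.cache on to the fetcher. That is handy right after a login, since a previously cached anonymous copy of a page would otherwise mask the logged-in view; it does assume Fetch.get() accepts a cached keyword, whose signature isn't shown in this diff. A hypothetical invocation (the URL is illustrative):

    python3 leech.py 'https://forums.example.com/threads/some-story.12345/' --no-cache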
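
One more observation: Fetch.__init__ loads leech.cookies into the LWPCookieJar, and .gitignore now lists that file, but nothing shown here ever saves the jar, so login cookies only live as long as the process. If persistence across runs is intended, a small helper on Fetch could mirror the load() call; this is a sketch, not part of the patch, and it assumes __init__ also stashes self.storepath = storepath:

    def save_cookies(self):
        # Sketch only: LWPCookieJar.save() is the counterpart of the load()
        # in __init__; ignore_discard keeps session cookies that would
        # otherwise be dropped on save.
        self.session.cookies.save(self.storepath + '.cookies', ignore_discard=True)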