From 8f198bae217fa767225dad6b8b3417b3f35c982f Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Wed, 28 Oct 2015 18:06:19 -0500
Subject: [PATCH] Allow logging in to sites, to view hidden things

---
 .gitignore        |  2 ++
 epub.py           |  2 +-
 fetch.py          | 45 ++++++++++++++++++---------------------------
 leech.py          | 20 +++++++++++++++-----
 requirements.txt  |  1 +
 sites/__init__.py | 10 +++++++---
 sites/xenforo.py  | 13 +++++++++++--
 7 files changed, 55 insertions(+), 38 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8d0268a..dadd55b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 *.epub
 *.mobi
 leech.db
+leech.cookies
+leech.json
 venv/
 
 # Byte-compiled / optimized / DLL files
diff --git a/epub.py b/epub.py
index 7710ff4..fc90852 100644
--- a/epub.py
+++ b/epub.py
@@ -121,7 +121,7 @@
 
     epub.close()
 
-    return True
+    return filename
 
 if __name__ == '__main__':
     make_epub('test.epub', [('Chapter 1', 'test/a.html'), ('Chapter 2', 'test/b.html')], {})
diff --git a/fetch.py b/fetch.py
index 5cb9621..dc87f5e 100644
--- a/fetch.py
+++ b/fetch.py
@@ -1,10 +1,9 @@
 #!/usr/bin/python
 
-import gzip
 import sqlite3
+import http.cookiejar
 
-from io import BytesIO
-from urllib.request import Request, urlopen
+import requests
 
 __version__ = 1
 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
@@ -20,7 +19,7 @@ class Fetch:
         if it doesn't already exist. (":memory:" will store everything
         in-memory, if you only need to use this as a temporary thing).
         """
-        store = sqlite3.connect(storepath)
+        store = sqlite3.connect(storepath + '.db')
         self.store = store
         c = store.cursor()
         c.execute("""CREATE TABLE IF NOT EXISTS cache (url TEXT, content BLOB, time TEXT, PRIMARY KEY (url))""")
@@ -29,6 +28,18 @@ class Fetch:
 
         self.cachetime = cachetime
 
+        lwp_cookiejar = http.cookiejar.LWPCookieJar()
+        try:
+            lwp_cookiejar.load(storepath + '.cookies', ignore_discard=True)
+        except Exception as e:
+            pass
+
+        self.session = requests.Session()
+        self.session.cookies = lwp_cookiejar
+        self.session.headers.update({
+            'User-agent': USER_AGENT
+        })
+
     def __call__(self, url, **kw):
         return self.get(url, **kw)
 
@@ -44,9 +55,9 @@ class Fetch:
             c.close()
             if row:
                 return row[0]
-        data = _fetch(url, **kw)
-        self.__set(url, data)
-        return data
+        data = self.session.get(url, **kw)
+        self.__set(url, data.text)
+        return data.text
 
     def __set(self, url, value):
         """Add a value to the store, at the current time
@@ -58,23 +69,3 @@ class Fetch:
         c.execute("""REPLACE INTO cache VALUES (?, ?, CURRENT_TIMESTAMP)""", (url, value,))
         self.store.commit()
         c.close()
-
-
-def _fetch(url, data=None, ungzip=True):
-    """A generic URL-fetcher, which handles gzipped content, returns a string"""
-    request = Request(url)
-    request.add_header('Accept-encoding', 'gzip')
-    request.add_header('User-agent', USER_AGENT)
-    try:
-        f = urlopen(request, data)
-    except Exception as e:
-        return None
-    data = f.read()
-    if ungzip and f.headers.get('content-encoding', '') == 'gzip':
-        data = gzip.GzipFile(fileobj=BytesIO(data), mode='r').read()
-    try:
-        data = data.decode()
-    except UnicodeDecodeError:
-        data = data.decode('latin1')
-    f.close()
-    return data
\ No newline at end of file
diff --git a/leech.py b/leech.py
index e27d2ed..6787771 100755
--- a/leech.py
+++ b/leech.py
@@ -3,12 +3,13 @@
 import argparse
 import importlib
 import os
+import json
 
 import sites
 import epub
 from fetch import Fetch
 
-fetch = Fetch("leech.db")
+fetch = Fetch("leech")
 
 
 html_template = '''
@@ -23,14 +24,21 @@
 '''
 
 
-def leech(url, filename=None):
+def leech(url, filename=None, cache=True):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
     site = sites.get(url)
     if not site:
         raise Exception("No site handler found")
 
-    handler = site(fetch)
+    handler = site(fetch, cache=cache)
+
+    with open('leech.json') as store_file:
+        store = json.load(store_file)
+    login = store.get('logins', {}).get(site.__name__, False)
+    if login:
+        handler.login(login)
+
     story = handler.extract(url)
     if not story:
         raise Exception("Couldn't extract story")
@@ -46,7 +54,7 @@
 
     filename = filename or story['title'] + '.epub'
 
-    epub.make_epub(filename, html, metadata)
+    filename = epub.make_epub(filename, html, metadata)
 
     return filename
 
@@ -54,7 +62,9 @@
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch")
    parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)")
+    parser.add_argument('--no-cache', dest='cache', action='store_false')
+    parser.set_defaults(cache=True)
     args = parser.parse_args()
-    filename = leech(args.url, filename=args.filename)
+    filename = leech(args.url, filename=args.filename, cache=args.cache)
     print("File created:", filename)
diff --git a/requirements.txt b/requirements.txt
index 94bda8d..0b2104c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 beautifulsoup4==4.4.1
 html5lib==0.999
+requests==2.8.1
 six==1.6.1
diff --git a/sites/__init__.py b/sites/__init__.py
index b5e44b6..0f537a4 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -7,9 +7,10 @@ class Site:
     """A Site handles checking whether a URL might represent a site,
     and then extracting the content of a story from said site.
     """
-    def __init__(self, fetch):
+    def __init__(self, fetch, cache=True):
         super().__init__()
         self.fetch = fetch
+        self.cache = cache
 
     @staticmethod
     def matches(url):
@@ -18,8 +19,11 @@ class Site:
     def extract(self, url):
         raise NotImplementedError()
 
-    def _soup(self, url, method='html5lib'):
-        page = self.fetch(url)
+    def login(self, login_details):
+        raise NotImplementedError()
+
+    def _soup(self, url, method='html5lib', **kw):
+        page = self.fetch(url, cached=self.cache, **kw)
         if not page:
             raise SiteException("Couldn't fetch", url)
         return BeautifulSoup(page, method)
diff --git a/sites/xenforo.py b/sites/xenforo.py
index 6a96679..f4413fb 100644
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -13,14 +13,23 @@ class XenForo(Site):
     def matches(cls, url):
         return re.match(r'^https?://%s/threads/.*\d+/?.*' % cls.domain, url)
 
+    def login(self, login_details):
+        # Todo: handle non-https?
+        post = {
+            'login': login_details[0],
+            'password': login_details[1],
+        }
+        self.fetch.session.post('https://%s/login/login' % self.domain, data=post)
+        print("Logged in as", login_details[0])
+
     def extract(self, url):
        soup = self._soup(url)
         base = soup.head.base.get('href')
 
         story = {}
-        story['title'] = str(soup.find('h1').string)
-        story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
+        story['title'] = soup.find('h1').get_text()
+        story['author'] = soup.find('p', id='pageDescription').find('a', class_='username').get_text()
 
         marks = self._chapter_list(url)
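
A note for anyone reviewing or trying this out: leech() looks credentials up with store.get('logins', {}).get(site.__name__, False), and XenForo.login() reads login_details[0] and login_details[1], so leech.json wants a two-element username/password pair keyed by the handler class name. A minimal sketch of the file (the site name and credentials are illustrative placeholders, not something the patch ships):

    {
        "logins": {
            "XenForo": ["my_username", "my_password"]
        }
    }

Also worth noting: leech() now opens leech.json unconditionally, so the file has to exist (even as just {}) before any story can be fetched.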
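
The new --no-cache flag is threaded from argparse through leech() into Site._soup(), which passes cached=self.cache on to the fetcher. That is handy right after a login, since a previously cached anonymous copy of a page would otherwise mask the logged-in view; it does assume Fetch.get() accepts a cached keyword, whose signature isn't shown in this diff. A hypothetical invocation (the URL is illustrative):

    python3 leech.py 'https://forums.example.com/threads/some-story.12345/' --no-cache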
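
One more observation: Fetch.__init__ loads leech.cookies into the LWPCookieJar, and .gitignore now lists that file, but nothing shown here ever saves the jar, so login cookies only live as long as the process. If persistence across runs is intended, a small helper on Fetch could mirror the load() call; this is a sketch, not part of the patch, and it assumes __init__ also stashes self.storepath = storepath:

    def save_cookies(self):
        # Sketch only: LWPCookieJar.save() is the counterpart of the load()
        # in __init__; ignore_discard keeps session cookies that would
        # otherwise be dropped on save.
        self.session.cookies.save(self.storepath + '.cookies', ignore_discard=True)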