Mirror of https://github.com/kemayo/leech

Allow logging in to sites, to view hidden things

commit 8f198bae21 (parent 150d0f63d5)
David Lynch, 2015-10-28 18:06:19 -05:00
7 changed files with 55 additions and 38 deletions

.gitignore

@@ -1,6 +1,8 @@
 *.epub
 *.mobi
 leech.db
+leech.cookies
+leech.json
 venv/

 # Byte-compiled / optimized / DLL files

@@ -121,7 +121,7 @@ def make_epub(filename, html_files, meta):
     epub.close()
-    return True
+    return filename

 if __name__ == '__main__':
     make_epub('test.epub', [('Chapter 1', 'test/a.html'), ('Chapter 2', 'test/b.html')], {})
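Callers can now report where the book actually landed instead of getting a bare success flag; a minimal sketch, assuming it runs in the same module as make_epub and reusing the arguments from the __main__ block above:

    # make_epub() used to return True; it now returns the filename it wrote,
    # so the caller can print or reuse the real output path.
    out = make_epub('test.epub', [('Chapter 1', 'test/a.html'), ('Chapter 2', 'test/b.html')], {})
    print("Wrote", out)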

@@ -1,10 +1,9 @@
 #!/usr/bin/python

-import gzip
 import sqlite3
+import http.cookiejar

-from io import BytesIO
-from urllib.request import Request, urlopen
+import requests

 __version__ = 1
 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__

@@ -20,7 +19,7 @@ class Fetch:
         if it doesn't already exist. (":memory:" will store everything
         in-memory, if you only need to use this as a temporary thing).
         """
-        store = sqlite3.connect(storepath)
+        store = sqlite3.connect(storepath + '.db')
         self.store = store
         c = store.cursor()
         c.execute("""CREATE TABLE IF NOT EXISTS cache (url TEXT, content BLOB, time TEXT, PRIMARY KEY (url))""")

@@ -29,6 +28,18 @@ class Fetch:
         self.cachetime = cachetime

+        lwp_cookiejar = http.cookiejar.LWPCookieJar()
+        try:
+            lwp_cookiejar.load(storepath + '.cookies', ignore_discard=True)
+        except Exception as e:
+            pass
+
+        self.session = requests.Session()
+        self.session.cookies = lwp_cookiejar
+        self.session.headers.update({
+            'User-agent': USER_AGENT
+        })
+
     def __call__(self, url, **kw):
         return self.get(url, **kw)
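The constructor now loads cookies from storepath + '.cookies', but this diff doesn't include the code that writes that file back out. A minimal sketch of the saving side using the standard-library LWPCookieJar API; the save_cookies name and the free-function shape are assumptions, not part of the commit:

    def save_cookies(session, storepath):
        # Hypothetical helper: persist the session's LWPCookieJar to the same
        # storepath + '.cookies' file that Fetch.__init__ loads from.
        # ignore_discard=True mirrors the load call, so short-lived login
        # cookies survive between runs too.
        session.cookies.save(storepath + '.cookies', ignore_discard=True)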
@@ -44,9 +55,9 @@ class Fetch:
         c.close()
         if row:
             return row[0]
-        data = _fetch(url, **kw)
-        self.__set(url, data)
-        return data
+        data = self.session.get(url, **kw)
+        self.__set(url, data.text)
+        return data.text

     def __set(self, url, value):
         """Add a value to the store, at the current time

@@ -58,23 +69,3 @@ class Fetch:
         c.execute("""REPLACE INTO cache VALUES (?, ?, CURRENT_TIMESTAMP)""", (url, value,))
         self.store.commit()
         c.close()
-
-
-def _fetch(url, data=None, ungzip=True):
-    """A generic URL-fetcher, which handles gzipped content, returns a string"""
-    request = Request(url)
-    request.add_header('Accept-encoding', 'gzip')
-    request.add_header('User-agent', USER_AGENT)
-    try:
-        f = urlopen(request, data)
-    except Exception as e:
-        return None
-    data = f.read()
-    if ungzip and f.headers.get('content-encoding', '') == 'gzip':
-        data = gzip.GzipFile(fileobj=BytesIO(data), mode='r').read()
-    try:
-        data = data.decode()
-    except UnicodeDecodeError:
-        data = data.decode('latin1')
-    f.close()
-    return data
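With both changes in place, a single store name now drives the sqlite cache and the cookie jar, and every uncached request goes through the shared requests.Session. A rough usage sketch; the URL is only a placeholder:

    from fetch import Fetch

    # "leech" expands to leech.db (the sqlite cache) and leech.cookies
    # (the LWPCookieJar attached to the shared requests.Session).
    fetch = Fetch("leech")

    # Cookies set by an earlier login POST on fetch.session are sent
    # automatically on subsequent page requests.
    page = fetch("https://example.com/threads/some-story.1234/")  # placeholder URL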

@@ -3,12 +3,13 @@
 import argparse
 import importlib
 import os
+import json

 import sites
 import epub
 from fetch import Fetch

-fetch = Fetch("leech.db")
+fetch = Fetch("leech")

 html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <html xmlns="http://www.w3.org/1999/xhtml">

@@ -23,14 +24,21 @@ html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 '''

-def leech(url, filename=None):
+def leech(url, filename=None, cache=True):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
     site = sites.get(url)
     if not site:
         raise Exception("No site handler found")
-    handler = site(fetch)
+    handler = site(fetch, cache=cache)
+
+    with open('leech.json') as store_file:
+        store = json.load(store_file)
+        login = store.get('logins', {}).get(site.__name__, False)
+        if login:
+            handler.login(login)

     story = handler.extract(url)
     if not story:
         raise Exception("Couldn't extract story")
@@ -46,7 +54,7 @@ def leech(url, filename=None):
     filename = filename or story['title'] + '.epub'

-    epub.make_epub(filename, html, metadata)
+    filename = epub.make_epub(filename, html, metadata)

     return filename

@@ -54,7 +62,9 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch")
     parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)")
+    parser.add_argument('--no-cache', dest='cache', action='store_false')
+    parser.set_defaults(cache=True)
     args = parser.parse_args()

-    filename = leech(args.url, filename=args.filename)
+    filename = leech(args.url, filename=args.filename, cache=args.cache)
     print("File created:", filename)

@@ -1,3 +1,4 @@
 beautifulsoup4==4.4.1
 html5lib==0.999
+requests==2.8.1
 six==1.6.1

@@ -7,9 +7,10 @@ class Site:
     """A Site handles checking whether a URL might represent a site, and then
     extracting the content of a story from said site.
     """
-    def __init__(self, fetch):
+    def __init__(self, fetch, cache=True):
         super().__init__()
         self.fetch = fetch
+        self.cache = cache

     @staticmethod
     def matches(url):

@@ -18,8 +19,11 @@ class Site:
     def extract(self, url):
         raise NotImplementedError()

-    def _soup(self, url, method='html5lib'):
-        page = self.fetch(url)
+    def login(self, login_details):
+        raise NotImplementedError()
+
+    def _soup(self, url, method='html5lib', **kw):
+        page = self.fetch(url, cached=self.cache, **kw)
         if not page:
             raise SiteException("Couldn't fetch", url)
         return BeautifulSoup(page, method)
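Taken together, these hunks define the handler contract: matches() gates URLs, login() is opt-in (the base class just raises), and extract() reaches pages through _soup(), which now honours the cache flag passed to __init__. A hypothetical minimal subclass, purely to illustrate that contract; the class name, URLs, and form fields are invented:

    class ExampleSite(Site):
        # Illustrative handler only, not part of this commit.

        @staticmethod
        def matches(url):
            return url.startswith('https://example.com/story/')

        def login(self, login_details):
            # login_details arrives from leech.json as a [username, password] pair
            self.fetch.session.post('https://example.com/login', data={
                'user': login_details[0],
                'pass': login_details[1],
            })

        def extract(self, url):
            soup = self._soup(url)  # respects self.cache via the fetch layer
            # The full story-dict shape isn't visible in this diff; 'title' is
            # at least what leech() uses to build the epub filename.
            return {'title': soup.find('h1').get_text()}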

@@ -13,14 +13,23 @@ class XenForo(Site):
     def matches(cls, url):
         return re.match(r'^https?://%s/threads/.*\d+/?.*' % cls.domain, url)

+    def login(self, login_details):
+        # Todo: handle non-https?
+        post = {
+            'login': login_details[0],
+            'password': login_details[1],
+        }
+        self.fetch.session.post('https://%s/login/login' % self.domain, data=post)
+        print("Logged in as", login_details[0])
+
     def extract(self, url):
         soup = self._soup(url)
         base = soup.head.base.get('href')

         story = {}
-        story['title'] = str(soup.find('h1').string)
-        story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
+        story['title'] = soup.find('h1').get_text()
+        story['author'] = soup.find('p', id='pageDescription').find('a', class_='username').get_text()

         marks = self._chapter_list(url)
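XenForo.login() fires the POST and assumes it worked. If bad credentials should fail loudly instead of producing a silently unauthenticated crawl, a hedged variant could inspect the response; the redirect-on-success behaviour is typical of XenForo boards but isn't something this diff verifies:

    def login(self, login_details):
        post = {
            'login': login_details[0],
            'password': login_details[1],
        }
        response = self.fetch.session.post('https://%s/login/login' % self.domain, data=post)
        # A 4xx/5xx status means the login endpoint rejected the request;
        # raising here beats continuing and scraping the logged-out view.
        response.raise_for_status()
        print("Logged in as", login_details[0])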