mirror of https://github.com/kemayo/leech (synced 2025-12-07 08:53:30 +01:00)

Allow logging in to sites, to view hidden things

commit 8f198bae21, parent 150d0f63d5
7 changed files with 55 additions and 38 deletions
.gitignore (vendored), 2 changes

@@ -1,6 +1,8 @@
 *.epub
 *.mobi
 leech.db
+leech.cookies
+leech.json
 venv/

 # Byte-compiled / optimized / DLL files
epub.py, 2 changes

@@ -121,7 +121,7 @@ def make_epub(filename, html_files, meta):

     epub.close()

-    return True
+    return filename

 if __name__ == '__main__':
     make_epub('test.epub', [('Chapter 1', 'test/a.html'), ('Chapter 2', 'test/b.html')], {})
fetch.py, 45 changes

@@ -1,10 +1,9 @@
 #!/usr/bin/python

-import gzip
 import sqlite3
+import http.cookiejar

-from io import BytesIO
-from urllib.request import Request, urlopen
+import requests

 __version__ = 1
 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__

@@ -20,7 +19,7 @@ class Fetch:
         if it doesn't already exist. (":memory:" will store everything
         in-memory, if you only need to use this as a temporary thing).
         """
-        store = sqlite3.connect(storepath)
+        store = sqlite3.connect(storepath + '.db')
         self.store = store
         c = store.cursor()
         c.execute("""CREATE TABLE IF NOT EXISTS cache (url TEXT, content BLOB, time TEXT, PRIMARY KEY (url))""")

@@ -29,6 +28,18 @@ class Fetch:
         self.cachetime = cachetime

+        lwp_cookiejar = http.cookiejar.LWPCookieJar()
+        try:
+            lwp_cookiejar.load(storepath + '.cookies', ignore_discard=True)
+        except Exception as e:
+            pass
+
+        self.session = requests.Session()
+        self.session.cookies = lwp_cookiejar
+        self.session.headers.update({
+            'User-agent': USER_AGENT
+        })
+
     def __call__(self, url, **kw):
         return self.get(url, **kw)

@@ -44,9 +55,9 @@ class Fetch:
         c.close()
         if row:
             return row[0]
-        data = _fetch(url, **kw)
-        self.__set(url, data)
-        return data
+        data = self.session.get(url, **kw)
+        self.__set(url, data.text)
+        return data.text

     def __set(self, url, value):
         """Add a value to the store, at the current time

@@ -58,23 +69,3 @@ class Fetch:
         c.execute("""REPLACE INTO cache VALUES (?, ?, CURRENT_TIMESTAMP)""", (url, value,))
         self.store.commit()
         c.close()
-
-
-def _fetch(url, data=None, ungzip=True):
-    """A generic URL-fetcher, which handles gzipped content, returns a string"""
-    request = Request(url)
-    request.add_header('Accept-encoding', 'gzip')
-    request.add_header('User-agent', USER_AGENT)
-    try:
-        f = urlopen(request, data)
-    except Exception as e:
-        return None
-    data = f.read()
-    if ungzip and f.headers.get('content-encoding', '') == 'gzip':
-        data = gzip.GzipFile(fileobj=BytesIO(data), mode='r').read()
-    try:
-        data = data.decode()
-    except UnicodeDecodeError:
-        data = data.decode('latin1')
-    f.close()
-    return data
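For context on the session handling above: Fetch now wraps a requests.Session whose cookie jar is a standard-library LWPCookieJar loaded from `<storepath>.cookies`. The hunks here only show the jar being loaded; writing it back out would use the jar's save() method. A minimal standalone sketch of that load/save round trip, with an illustrative 'example.cookies' path and test URL that are not part of this commit:

import http.cookiejar
import requests

jar = http.cookiejar.LWPCookieJar()
try:
    # Reuse cookies from a previous run; a missing file is fine on first use.
    jar.load('example.cookies', ignore_discard=True)
except OSError:
    pass

session = requests.Session()
session.cookies = jar
session.headers.update({'User-agent': 'Leech/1 +http://davidlynch.org'})

# Cookies the server sets on this request accumulate in the jar...
session.get('https://httpbin.org/cookies/set?name=value')

# ...and can be written back out so the next run starts already "logged in".
jar.save('example.cookies', ignore_discard=True)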
leech.py, 20 changes

@@ -3,12 +3,13 @@
 import argparse
 import importlib
 import os
+import json

 import sites
 import epub
 from fetch import Fetch

-fetch = Fetch("leech.db")
+fetch = Fetch("leech")

 html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <html xmlns="http://www.w3.org/1999/xhtml">

@@ -23,14 +24,21 @@ html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 '''


-def leech(url, filename=None):
+def leech(url, filename=None, cache=True):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
     site = sites.get(url)
     if not site:
         raise Exception("No site handler found")

-    handler = site(fetch)
+    handler = site(fetch, cache=cache)
+
+    with open('leech.json') as store_file:
+        store = json.load(store_file)
+        login = store.get('logins', {}).get(site.__name__, False)
+        if login:
+            handler.login(login)

     story = handler.extract(url)
     if not story:
         raise Exception("Couldn't extract story")

@@ -46,7 +54,7 @@ def leech(url, filename=None):

     filename = filename or story['title'] + '.epub'

-    epub.make_epub(filename, html, metadata)
+    filename = epub.make_epub(filename, html, metadata)

     return filename

@@ -54,7 +62,9 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch")
    parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)")
+    parser.add_argument('--no-cache', dest='cache', action='store_false')
+    parser.set_defaults(cache=True)
     args = parser.parse_args()

-    filename = leech(args.url, filename=args.filename)
+    filename = leech(args.url, filename=args.filename, cache=args.cache)
     print("File created:", filename)
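The new login flow in leech() above reads a leech.json from the working directory and looks up credentials under "logins", keyed by the handler class's __name__, passing the value straight to handler.login(). A hypothetical way to produce such a file; the "ExampleForum" key and the credentials are placeholders, not names taken from this repository:

import json

# "ExampleForum" stands in for the __name__ of a real site handler class;
# the two-item list maps to login_details[0] (username) and
# login_details[1] (password) in XenForo.login further down.
with open('leech.json', 'w') as store_file:
    json.dump({'logins': {'ExampleForum': ['myusername', 'mypassword']}}, store_file, indent=4)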
@@ -1,3 +1,4 @@
 beautifulsoup4==4.4.1
 html5lib==0.999
+requests==2.8.1
 six==1.6.1
@@ -7,9 +7,10 @@ class Site:
     """A Site handles checking whether a URL might represent a site, and then
     extracting the content of a story from said site.
     """
-    def __init__(self, fetch):
+    def __init__(self, fetch, cache=True):
         super().__init__()
         self.fetch = fetch
+        self.cache = cache

     @staticmethod
     def matches(url):

@@ -18,8 +19,11 @@ class Site:
     def extract(self, url):
         raise NotImplementedError()

-    def _soup(self, url, method='html5lib'):
-        page = self.fetch(url)
+    def login(self, login_details):
+        raise NotImplementedError()
+
+    def _soup(self, url, method='html5lib', **kw):
+        page = self.fetch(url, cached=self.cache, **kw)
         if not page:
             raise SiteException("Couldn't fetch", url)
         return BeautifulSoup(page, method)
@@ -13,14 +13,23 @@ class XenForo(Site):
     def matches(cls, url):
         return re.match(r'^https?://%s/threads/.*\d+/?.*' % cls.domain, url)

+    def login(self, login_details):
+        # Todo: handle non-https?
+        post = {
+            'login': login_details[0],
+            'password': login_details[1],
+        }
+        self.fetch.session.post('https://%s/login/login' % self.domain, data=post)
+        print("Logged in as", login_details[0])
+
     def extract(self, url):
         soup = self._soup(url)

         base = soup.head.base.get('href')

         story = {}
-        story['title'] = str(soup.find('h1').string)
-        story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
+        story['title'] = soup.find('h1').get_text()
+        story['author'] = soup.find('p', id='pageDescription').find('a', class_='username').get_text()

         marks = self._chapter_list(url)
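Taken together, the intended flow appears to be: the shared Fetch session carries whatever cookies XenForo.login's POST sets, so later _soup()/extract() requests see logged-in content. A hypothetical wiring of the pieces, with placeholder URL and credentials:

from fetch import Fetch
import sites

fetch = Fetch("leech")   # backed by leech.db for the cache and leech.cookies for the jar
url = "https://forums.example.com/threads/some-story.12345/"

site = sites.get(url)                        # picks a handler class whose matches() accepts the URL
handler = site(fetch, cache=False)           # equivalent to passing --no-cache
handler.login(("myusername", "mypassword"))  # POSTs to /login/login on the handler's domain
story = handler.extract(url)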