diff --git a/.gitignore b/.gitignore index fe27ec7..55fa808 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ *.epub *.mobi -*.json +/*.json leech.db leech.sqlite leech.cookies diff --git a/README.markdown b/README.markdown index 84e69ae..0bfcc13 100644 --- a/README.markdown +++ b/README.markdown @@ -33,6 +33,8 @@ Supports * ArchiveOfOurOwn * Yes, it has its own built-in EPUB export, but the formatting is horrible * Various XenForo-based sites: SpaceBattles and SufficientVelocity, most notably + * RoyalRoad + * Fiction.live (Anonkun) * DeviantArt galleries/collections * Sta.sh * Completely arbitrary sites, with a bit more work (see below) diff --git a/examples/deathworlders.json b/examples/deathworlders.json new file mode 100644 index 0000000..f6ede53 --- /dev/null +++ b/examples/deathworlders.json @@ -0,0 +1,7 @@ +{ + "url": "http://hfy-archive.org/book/deathworlders/chapter-01-kevin-jenkins-experience", + "title": "Deathworlders", + "author": "Philip Richard Johnson, AKA Hambone", + "chapter_selector": "#block-book-navigation .menu a", + "content_selector": "article .node-content .field-name-body .field-item" +} diff --git a/examples/heretical-edge.json b/examples/heretical-edge.json new file mode 100644 index 0000000..f266957 --- /dev/null +++ b/examples/heretical-edge.json @@ -0,0 +1,8 @@ +{ + "url": "https://ceruleanscrawling.wordpress.com/table-of-contents/", + "title": "Heretical Edge", + "author": "Cerulean", + "chapter_selector": "article .entry-content > p > a", + "content_selector": "article .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical1.json b/examples/practical1.json new file mode 100644 index 0000000..00e1d20 --- /dev/null +++ b/examples/practical1.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 1", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(1) > li 
> a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical2.json b/examples/practical2.json new file mode 100644 index 0000000..2dfd4c9 --- /dev/null +++ b/examples/practical2.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 2", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(2) > li > ul > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical3.json b/examples/practical3.json new file mode 100644 index 0000000..cc883fb --- /dev/null +++ b/examples/practical3.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 3", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(3) > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/sagaofsoul.json b/examples/sagaofsoul.json new file mode 100644 index 0000000..27bab61 --- /dev/null +++ b/examples/sagaofsoul.json @@ -0,0 +1,8 @@ +{ + "url": "http://www.sagaofsoul.com/story.html", + "title": "Saga of Soul", + "author": "Ouri Maler", + "chapter_selector": "#mainbody li a", + "content_selector": "#mainbody", + "filter_selector": "script, noscript" +} diff --git a/leech.py b/leech.py index b73c26e..280bab8 100755 --- a/leech.py +++ b/leech.py @@ -4,6 +4,8 @@ import argparse import sys import json import http.cookiejar +import logging +import sqlite3 import sites import ebook @@ -14,6 +16,8 @@ import requests_cache __version__ = 1 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__ +logger = logging.getLogger(__name__) + def leech(url, session, filename=None, args=None): # we have: a page, which could be absolutely any part of a 
story, or not a story at all @@ -22,7 +26,7 @@ def leech(url, session, filename=None, args=None): if not site: raise Exception("No site handler found") - print("Handler", site, url) + logger.info("Handler: %s (%s)", site, url) handler = site(session, args=args) @@ -48,13 +52,27 @@ if __name__ == '__main__': parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)") parser.add_argument('--no-cache', dest='cache', action='store_false') parser.add_argument('--flush', dest='flush', action='store_true') - parser.set_defaults(cache=True, flush=False) + parser.add_argument('-v', '--verbose', help="verbose output", action='store_true', dest='verbose') + parser.set_defaults(cache=True, flush=False, verbose=False) args, extra_args = parser.parse_known_args() + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig( + level=logging.INFO, + format="[%(name)s] %(message)s" + ) + if args.flush: requests_cache.install_cache('leech') requests_cache.clear() - print("Flushed cache") + + conn = sqlite3.connect('leech.sqlite') + conn.execute("VACUUM") + conn.close() + + logger.info("Flushed cache") sys.exit() if not args.url: @@ -76,4 +94,4 @@ if __name__ == '__main__': }) filename = leech(args.url, filename=args.filename, session=session, args=extra_args) - print("File created:", filename) + logger.info("File created: %s", filename) diff --git a/sites/__init__.py b/sites/__init__.py index 70ab656..8933b1d 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -3,9 +3,13 @@ import glob import os import argparse import uuid +import time +import logging import attr from bs4 import BeautifulSoup +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) _sites = [] @@ -96,9 +100,16 @@ class Site: def _add_arguments(self, parser): pass - def _soup(self, url, method='html5lib', **kw): + def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw): page = self.session.get(url, 
**kw) if not page: + if retry and retry > 0: + delay = retry_delay + if 'Retry-After' in page.headers: + delay = int(page.headers['Retry-After']) + logger.warning("Load failed: waiting %s to retry (%s)", delay, page) + time.sleep(delay) + return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw) raise SiteException("Couldn't fetch", url) return BeautifulSoup(page.text, method) diff --git a/sites/ao3.py b/sites/ao3.py index 4523ae6..5303c80 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -1,9 +1,15 @@ #!/usr/bin/python +import logging import datetime import re +import urllib +import requests_cache +from bs4 import BeautifulSoup from . import register, Site, Section, Chapter +logger = logging.getLogger(__name__) + @register class ArchiveOfOurOwn(Site): @@ -15,12 +21,32 @@ class ArchiveOfOurOwn(Site): if match: return match.group(1) + '/' + def login(self, login_details): + with requests_cache.disabled(): + login = self.session.get('http://archiveofourown.org/login') + soup = BeautifulSoup(login.text, 'html5lib') + form = soup.find(id='new_user_session') + post = { + 'user_session[login]': login_details[0], + 'user_session[password]': login_details[1], + # standard fields: + 'user_session[remember_me]': '1', + 'utf8': form.find(attrs={'name': 'utf8'})['value'], + 'authenticity_token': form.find(attrs={'name': 'authenticity_token'})['value'], + 'commit': 'Log In', + } + # I feel the session *should* handle this cookies bit for me. But + # it doesn't. And I don't know why. 
+ self.session.post('https://archiveofourown.org/user_sessions', data=post, cookies=login.cookies) + logger.info("Logged in as %s", login_details[0]) + def extract(self, url): workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1) return self._extract_work(workid) def _extract_work(self, workid): - soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid)) + nav_url = 'http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid) + soup = self._soup(nav_url) metadata = soup.select('#main h2.heading a') story = Section( @@ -31,9 +57,7 @@ class ArchiveOfOurOwn(Site): for chapter in soup.select('#main ol[role="navigation"] li'): link = chapter.find('a') - chapter_url = str(link.get('href')) - if chapter_url.startswith('/works/'): - chapter_url = 'http://archiveofourown.org' + chapter_url + chapter_url = urllib.parse.urljoin(nav_url, str(link.get('href'))) chapter_url += '?view_adult=true' updated = datetime.datetime.strptime( @@ -46,7 +70,7 @@ class ArchiveOfOurOwn(Site): return story def _chapter(self, url): - print("Extracting chapter from", url) + logger.info("Extracting chapter @ %s", url) soup = self._soup(url) content = soup.find('div', role='article') @@ -79,7 +103,8 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn): story = Section( title=soup.select('#main h2.heading')[0].string, - author=soup.select('#main dl.series.meta a[rel="author"]')[0].string + author=soup.select('#main dl.series.meta a[rel="author"]')[0].string, + url='http://archiveofourown.org/series/{}'.format(seriesid) ) for work in soup.select('#main ul.series li.work'): diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 1463f14..1989bc6 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -1,11 +1,15 @@ #!/usr/bin/python +import logging import attr import datetime import json import os.path +import urllib from . 
import register, Site, Section, Chapter +logger = logging.getLogger(__name__) + """ Example JSON: { @@ -47,19 +51,19 @@ class Arbitrary(Site): story = Section( title=definition.title, - author=definition.author + author=definition.author, + url=url ) if definition.chapter_selector: soup = self._soup(definition.url) for chapter in soup.select(definition.chapter_selector): - chapter_url = str(chapter.get('href')) + chapter_url = urllib.parse.urljoin(definition.url, str(chapter.get('href'))) story.add(Chapter( title=chapter.string, contents=self._chapter(chapter_url, definition), # TODO: better date detection - date=datetime.datetime.now(), - url=url + date=datetime.datetime.now() )) else: story.add(Chapter( @@ -74,7 +78,7 @@ class Arbitrary(Site): def _chapter(self, url, definition): # TODO: refactor so this can meaningfully handle multiple matches on content_selector. # Probably by changing it so that this returns a Chapter / Section. - print("Extracting chapter from", url) + logger.info("Extracting chapter @ %s", url) soup = self._soup(url) content = soup.select(definition.content_selector)[0] diff --git a/sites/deviantart.py b/sites/deviantart.py index bb2775a..df30e92 100644 --- a/sites/deviantart.py +++ b/sites/deviantart.py @@ -1,10 +1,13 @@ #!/usr/bin/python +import logging import re from . import register, Section from .stash import Stash +logger = logging.getLogger(__name__) + @register class DeviantArt(Stash): @@ -41,6 +44,6 @@ class DeviantArt(Stash): if thumb['href'] is not '#': story.add(self._chapter(thumb['href'])) except Exception as e: - print(e) + logger.exception("Couldn't extract chapters from thumbs") return story diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py index c3a6792..7f86aed 100644 --- a/sites/fanfictionnet.py +++ b/sites/fanfictionnet.py @@ -1,9 +1,12 @@ #!/usr/bin/python +import logging import datetime import re from . 
import register, Site, SiteException, Section, Chapter +logger = logging.getLogger(__name__) + @register class FanFictionNet(Site): @@ -11,9 +14,9 @@ class FanFictionNet(Site): @staticmethod def matches(url): # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights - match = re.match(r'^(https?://www\.fanfiction\.net/s/\d+)/?.*', url) + match = re.match(r'^https?://(?:www|m)\.fanfiction\.net/s/(\d+)/?.*', url) if match: - return match.group(1) + '/' + return 'https://www.fanfiction.net/s/' + match.group(1) + '/' def extract(self, url): soup = self._soup(url) @@ -59,7 +62,7 @@ class FanFictionNet(Site): return story def _chapter(self, url): - print("Extracting chapter from", url) + logger.info("Fetching chapter @ %s", url) soup = self._soup(url) content = soup.find(id="content_wrapper_inner") @@ -74,7 +77,7 @@ class FanFictionNet(Site): for tag in text.find_all(True): tag.attrs = None except Exception as e: - print("Trouble cleaning attributes", e) + logger.exception("Trouble cleaning attributes") return text.prettify() @@ -84,6 +87,6 @@ class FictionPress(FanFictionNet): @staticmethod def matches(url): # e.g. https://www.fictionpress.com/s/2961893/1/Mother-of-Learning - match = re.match(r'^(https?://www\.fictionpress\.com/s/\d+)/?.*', url) + match = re.match(r'^https?://(?:www|m)\.fictionpress\.com/s/(\d+)/?.*', url) if match: - return match.group(1) + '/' + return 'https://www.fictionpress.com/s/' + match.group(1) + '/' diff --git a/sites/fictionlive.py b/sites/fictionlive.py index 6d588ba..a7d8fae 100644 --- a/sites/fictionlive.py +++ b/sites/fictionlive.py @@ -1,10 +1,13 @@ #!/usr/bin/python +import logging import itertools import datetime import re from . import register, Site, Section, Chapter +logger = logging.getLogger(__name__) + @register class FictionLive(Site): @@ -40,7 +43,7 @@ class FictionLive(Site): # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998 # i.e. 
format is [current timestamp] / [next timestamp - 1] chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1) - print("Extracting chapter from", chapter_url) + logger.info("Extracting chapter \"%s\" @ %s", currc['title'], chapter_url) data = self.session.get(chapter_url).json() html = [] diff --git a/sites/royalroad.py b/sites/royalroad.py new file mode 100644 index 0000000..794fdd2 --- /dev/null +++ b/sites/royalroad.py @@ -0,0 +1,58 @@ +#!/usr/bin/python + +import http.client +import logging +import datetime +import re +import urllib +from . import register, Site, Section, Chapter + +logger = logging.getLogger(__name__) + + +@register +class RoyalRoad(Site): + """Royal Road: a place where people write novels, mostly seeming to be light-novel in tone.""" + @staticmethod + def matches(url): + # e.g. https://royalroadl.com/fiction/6752/lament-of-the-fallen + match = re.match(r'^(https?://royalroadl\.com/fiction/\d+)/?.*', url) + if match: + return match.group(1) + '/' + + def extract(self, url): + workid = re.match(r'^https?://royalroadl\.com/fiction/(\d+)/?.*', url).group(1) + soup = self._soup('https://royalroadl.com/fiction/{}'.format(workid)) + # should have gotten redirected, for a valid title + + original_maxheaders = http.client._MAXHEADERS + http.client._MAXHEADERS = 1000 + + story = Section( + title=soup.find('h1', property='name').string.strip(), + author=soup.find('meta', property='books:author').get('content').strip(), + url=soup.find('meta', property='og:url').get('content').strip() + ) + + for chapter in soup.select('#chapters tbody tr[data-url]'): + chapter_url = str(urllib.parse.urljoin(story.url, str(chapter.get('data-url')))) + + updated = datetime.datetime.fromtimestamp( + int(chapter.find('time').get('unixtime')), + ) + + story.add(Chapter(title=chapter.find('a', href=True).string.strip(), contents=self._chapter(chapter_url), date=updated)) + + http.client._MAXHEADERS = original_maxheaders + 
+ return story + + def _chapter(self, url): + logger.info("Extracting chapter @ %s", url) + soup = self._soup(url) + content = soup.find('div', class_='chapter-content') + + # TODO: this could be more robust, and I don't know if there's post-chapter notes anywhere as well. + author_note = soup.find('div', class_='author-note-portlet') + + return (author_note and (author_note.prettify() + '
') or '') + content.prettify() diff --git a/sites/stash.py b/sites/stash.py index e7487b6..9c77b83 100644 --- a/sites/stash.py +++ b/sites/stash.py @@ -1,9 +1,12 @@ #!/usr/bin/python +import logging import datetime import re from . import register, Site, SiteException, Section, Chapter +logger = logging.getLogger(__name__) + @register class Stash(Site): @@ -35,12 +38,12 @@ class Stash(Site): if thumb['href'] is not '#': story.add(self._chapter(thumb['href'])) except Exception as e: - print(e) + logger.exception("Couldn't extract chapters from thumbs") return story def _chapter(self, url): - print("Extracting chapter from", url) + logger.info("Fetching chapter @ %s", url) soup = self._soup(url) content = soup.find(class_="journal-wrapper") diff --git a/sites/xenforo.py b/sites/xenforo.py index cb0e8a1..aa530eb 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -2,8 +2,11 @@ import datetime import re +import logging from . import register, Site, SiteException, Section, Chapter +logger = logging.getLogger(__name__) + class XenForo(Site): """XenForo is forum software that powers a number of fiction-related forums.""" @@ -23,7 +26,7 @@ class XenForo(Site): 'password': login_details[1], } self.session.post('https://%s/login/login' % self.domain, data=post) - print("Logged in as", login_details[0]) + logger.info("Logged in as %s", login_details[0]) def extract(self, url): soup = self._soup(url) @@ -47,7 +50,7 @@ class XenForo(Site): if not href.startswith('http'): href = base + href title = str(mark.string).strip() - print("Fetching chapter", title, href) + logger.info("Fetching chapter \"%s\" @ %s", title, href) chapter = Chapter(title=title, contents="") contents, post_date = self._chapter(href, idx) chapter.contents = contents @@ -63,7 +66,7 @@ class XenForo(Site): try: return self._chapter_list_threadmarks(url) except SiteException as e: - print("Tried threadmarks", e.args) + logger.debug("Tried threadmarks (%r)", e.args) return self._chapter_list_index(url) def 
_chapter_list_threadmarks(self, url): @@ -71,7 +74,10 @@ class XenForo(Site): threadmarks_link = soup.find(class_="threadmarksTrigger", href=True) if not threadmarks_link: - threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0] + try: + threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0] + except IndexError: + pass if not threadmarks_link: raise SiteException("No threadmarks")