From dc0d2162fbee2bb26598f93ed7a6e6cea1b8391a Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sun, 22 Oct 2017 17:06:40 -0500 Subject: [PATCH 01/15] Arbitrary handler had misplaced url arg --- sites/arbitrary.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 1463f14..9a56bf6 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -47,7 +47,8 @@ class Arbitrary(Site): story = Section( title=definition.title, - author=definition.author + author=definition.author, + url=url ) if definition.chapter_selector: @@ -58,8 +59,7 @@ class Arbitrary(Site): title=chapter.string, contents=self._chapter(chapter_url, definition), # TODO: better date detection - date=datetime.datetime.now(), - url=url + date=datetime.datetime.now() )) else: story.add(Chapter( From 257ab69394939510e6497b1eb139e93ae420b585 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sun, 22 Oct 2017 17:31:10 -0500 Subject: [PATCH 02/15] Arbitrary handler: canonicalize URLs --- sites/arbitrary.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 9a56bf6..195faee 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -4,6 +4,7 @@ import attr import datetime import json import os.path +import urllib from . 
import register, Site, Section, Chapter """ @@ -54,7 +55,7 @@ class Arbitrary(Site): if definition.chapter_selector: soup = self._soup(definition.url) for chapter in soup.select(definition.chapter_selector): - chapter_url = str(chapter.get('href')) + chapter_url = urllib.parse.urljoin(definition.url, str(chapter.get('href'))) story.add(Chapter( title=chapter.string, contents=self._chapter(chapter_url, definition), From df8e67d3e102d8f4640bbf757248accd9f95520c Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sun, 22 Oct 2017 17:33:43 -0500 Subject: [PATCH 03/15] Include some examples for the arbitrary handler --- .gitignore | 2 +- examples/deathworlders.json | 7 +++++++ examples/heretical-edge.json | 8 ++++++++ examples/practical1.json | 8 ++++++++ examples/practical2.json | 8 ++++++++ examples/practical3.json | 8 ++++++++ 6 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 examples/deathworlders.json create mode 100644 examples/heretical-edge.json create mode 100644 examples/practical1.json create mode 100644 examples/practical2.json create mode 100644 examples/practical3.json diff --git a/.gitignore b/.gitignore index fe27ec7..55fa808 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ *.epub *.mobi -*.json +/*.json leech.db leech.sqlite leech.cookies diff --git a/examples/deathworlders.json b/examples/deathworlders.json new file mode 100644 index 0000000..f6ede53 --- /dev/null +++ b/examples/deathworlders.json @@ -0,0 +1,7 @@ +{ + "url": "http://hfy-archive.org/book/deathworlders/chapter-01-kevin-jenkins-experience", + "title": "Deathworlders", + "author": "Philip Richard Johnson, AKA Hambone", + "chapter_selector": "#block-book-navigation .menu a", + "content_selector": "article .node-content .field-name-body .field-item" +} diff --git a/examples/heretical-edge.json b/examples/heretical-edge.json new file mode 100644 index 0000000..f266957 --- /dev/null +++ b/examples/heretical-edge.json @@ -0,0 +1,8 @@ +{ + "url": 
"https://ceruleanscrawling.wordpress.com/table-of-contents/", + "title": "Heretical Edge", + "author": "Cerulean", + "chapter_selector": "article .entry-content > p > a", + "content_selector": "article .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical1.json b/examples/practical1.json new file mode 100644 index 0000000..00e1d20 --- /dev/null +++ b/examples/practical1.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 1", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(1) > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical2.json b/examples/practical2.json new file mode 100644 index 0000000..2dfd4c9 --- /dev/null +++ b/examples/practical2.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 2", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(2) > li > ul > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical3.json b/examples/practical3.json new file mode 100644 index 0000000..cc883fb --- /dev/null +++ b/examples/practical3.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 3", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(3) > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} From 27b677a44421c098731b4af929b8df9de238ec56 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sun, 29 Oct 2017 19:50:19 -0500 Subject: [PATCH 04/15] Fix no-threadmarks autodetect --- sites/xenforo.py | 5 ++++- 1 file 
changed, 4 insertions(+), 1 deletion(-) diff --git a/sites/xenforo.py b/sites/xenforo.py index cb0e8a1..9c94401 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -71,7 +71,10 @@ class XenForo(Site): threadmarks_link = soup.find(class_="threadmarksTrigger", href=True) if not threadmarks_link: - threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0] + try: + threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0] + except IndexError: + pass if not threadmarks_link: raise SiteException("No threadmarks") From f1ac7c8bdae09403989ffed581df35465fe93197 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Tue, 31 Oct 2017 00:27:54 -0500 Subject: [PATCH 05/15] Retry failed site-requests --- sites/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sites/__init__.py b/sites/__init__.py index 70ab656..24161be 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -3,6 +3,7 @@ import glob import os import argparse import uuid +import time import attr from bs4 import BeautifulSoup @@ -96,9 +97,16 @@ class Site: def _add_arguments(self, parser): pass - def _soup(self, url, method='html5lib', **kw): + def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw): page = self.session.get(url, **kw) if not page: + if retry and retry > 0: + delay = retry_delay + if page.headers['Retry-After']: + delay = int(page.headers['Retry-After']) + print("Load failed: waiting {}s to retry ({})".format(delay, page)) + time.sleep(delay) + return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw) raise SiteException("Couldn't fetch", url) return BeautifulSoup(page.text, method) From 6d52c72c991e7055b8505857122fc1eb5203905b Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sat, 4 Nov 2017 00:09:09 -0500 Subject: [PATCH 06/15] Use logging instead of print Fixes #10 --- leech.py | 20 ++++++++++++++++---- sites/__init__.py | 5 ++++- sites/ao3.py | 5 ++++- sites/arbitrary.py | 5 ++++- 
sites/deviantart.py | 5 ++++- sites/fanfictionnet.py | 7 +++++-- sites/fictionlive.py | 5 ++++- sites/stash.py | 7 +++++-- sites/xenforo.py | 9 ++++++--- 9 files changed, 52 insertions(+), 16 deletions(-) diff --git a/leech.py b/leech.py index b73c26e..9cc1be5 100755 --- a/leech.py +++ b/leech.py @@ -4,6 +4,7 @@ import argparse import sys import json import http.cookiejar +import logging import sites import ebook @@ -14,6 +15,8 @@ import requests_cache __version__ = 1 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__ +logger = logging.getLogger(__name__) + def leech(url, session, filename=None, args=None): # we have: a page, which could be absolutely any part of a story, or not a story at all @@ -22,7 +25,7 @@ def leech(url, session, filename=None, args=None): if not site: raise Exception("No site handler found") - print("Handler", site, url) + logger.info("Handler: %s (%s)", site, url) handler = site(session, args=args) @@ -48,13 +51,22 @@ if __name__ == '__main__': parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)") parser.add_argument('--no-cache', dest='cache', action='store_false') parser.add_argument('--flush', dest='flush', action='store_true') - parser.set_defaults(cache=True, flush=False) + parser.add_argument('-v', '--verbose', help="verbose output", action='store_true', dest='verbose') + parser.set_defaults(cache=True, flush=False, verbose=False) args, extra_args = parser.parse_known_args() + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig( + level=logging.INFO, + format="[%(name)s] %(message)s" + ) + if args.flush: requests_cache.install_cache('leech') requests_cache.clear() - print("Flushed cache") + logger.info("Flushed cache") sys.exit() if not args.url: @@ -76,4 +88,4 @@ if __name__ == '__main__': }) filename = leech(args.url, filename=args.filename, session=session, args=extra_args) - print("File created:", filename) + logger.info("File created: 
%s", filename) diff --git a/sites/__init__.py b/sites/__init__.py index 24161be..9a91f4f 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -4,9 +4,12 @@ import os import argparse import uuid import time +import logging import attr from bs4 import BeautifulSoup +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) _sites = [] @@ -104,7 +107,7 @@ class Site: delay = retry_delay if page.headers['Retry-After']: delay = int(page.headers['Retry-After']) - print("Load failed: waiting {}s to retry ({})".format(delay, page)) + logger.warning("Load failed: waiting %s to retry (%s)", delay, page) time.sleep(delay) return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw) raise SiteException("Couldn't fetch", url) diff --git a/sites/ao3.py b/sites/ao3.py index 4523ae6..bce4e61 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -1,9 +1,12 @@ #!/usr/bin/python +import logging import datetime import re from . import register, Site, Section, Chapter +logger = logging.getLogger(__name__) + @register class ArchiveOfOurOwn(Site): @@ -46,7 +49,7 @@ class ArchiveOfOurOwn(Site): return story def _chapter(self, url): - print("Extracting chapter from", url) + logger.info("Extracting chapter @ %s", url) soup = self._soup(url) content = soup.find('div', role='article') diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 195faee..1989bc6 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -1,5 +1,6 @@ #!/usr/bin/python +import logging import attr import datetime import json @@ -7,6 +8,8 @@ import os.path import urllib from . import register, Site, Section, Chapter +logger = logging.getLogger(__name__) + """ Example JSON: { @@ -75,7 +78,7 @@ class Arbitrary(Site): def _chapter(self, url, definition): # TODO: refactor so this can meaningfully handle multiple matches on content_selector. # Probably by changing it so that this returns a Chapter / Section. 
- print("Extracting chapter from", url) + logger.info("Extracting chapter @ %s", url) soup = self._soup(url) content = soup.select(definition.content_selector)[0] diff --git a/sites/deviantart.py b/sites/deviantart.py index bb2775a..df30e92 100644 --- a/sites/deviantart.py +++ b/sites/deviantart.py @@ -1,10 +1,13 @@ #!/usr/bin/python +import logging import re from . import register, Section from .stash import Stash +logger = logging.getLogger(__name__) + @register class DeviantArt(Stash): @@ -41,6 +44,6 @@ class DeviantArt(Stash): if thumb['href'] is not '#': story.add(self._chapter(thumb['href'])) except Exception as e: - print(e) + logger.exception("Couldn't extract chapters from thumbs") return story diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py index c3a6792..0da64ae 100644 --- a/sites/fanfictionnet.py +++ b/sites/fanfictionnet.py @@ -1,9 +1,12 @@ #!/usr/bin/python +import logging import datetime import re from . import register, Site, SiteException, Section, Chapter +logger = logging.getLogger(__name__) + @register class FanFictionNet(Site): @@ -59,7 +62,7 @@ class FanFictionNet(Site): return story def _chapter(self, url): - print("Extracting chapter from", url) + logger.info("Fetching chapter @ %s", url) soup = self._soup(url) content = soup.find(id="content_wrapper_inner") @@ -74,7 +77,7 @@ class FanFictionNet(Site): for tag in text.find_all(True): tag.attrs = None except Exception as e: - print("Trouble cleaning attributes", e) + logger.exception("Trouble cleaning attributes") return text.prettify() diff --git a/sites/fictionlive.py b/sites/fictionlive.py index 6d588ba..a7d8fae 100644 --- a/sites/fictionlive.py +++ b/sites/fictionlive.py @@ -1,10 +1,13 @@ #!/usr/bin/python +import logging import itertools import datetime import re from . 
import register, Site, Section, Chapter +logger = logging.getLogger(__name__) + @register class FictionLive(Site): @@ -40,7 +43,7 @@ class FictionLive(Site): # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998 # i.e. format is [current timestamp] / [next timestamp - 1] chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1) - print("Extracting chapter from", chapter_url) + logger.info("Extracting chapter \"%s\" @ %s", currc['title'], chapter_url) data = self.session.get(chapter_url).json() html = [] diff --git a/sites/stash.py b/sites/stash.py index e7487b6..9c77b83 100644 --- a/sites/stash.py +++ b/sites/stash.py @@ -1,9 +1,12 @@ #!/usr/bin/python +import logging import datetime import re from . import register, Site, SiteException, Section, Chapter +logger = logging.getLogger(__name__) + @register class Stash(Site): @@ -35,12 +38,12 @@ class Stash(Site): if thumb['href'] is not '#': story.add(self._chapter(thumb['href'])) except Exception as e: - print(e) + logger.exception("Couldn't extract chapters from thumbs") return story def _chapter(self, url): - print("Extracting chapter from", url) + logger.info("Fetching chapter @ %s", url) soup = self._soup(url) content = soup.find(class_="journal-wrapper") diff --git a/sites/xenforo.py b/sites/xenforo.py index 9c94401..aa530eb 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -2,8 +2,11 @@ import datetime import re +import logging from . 
import register, Site, SiteException, Section, Chapter +logger = logging.getLogger(__name__) + class XenForo(Site): """XenForo is forum software that powers a number of fiction-related forums.""" @@ -23,7 +26,7 @@ class XenForo(Site): 'password': login_details[1], } self.session.post('https://%s/login/login' % self.domain, data=post) - print("Logged in as", login_details[0]) + logger.info("Logged in as %s", login_details[0]) def extract(self, url): soup = self._soup(url) @@ -47,7 +50,7 @@ class XenForo(Site): if not href.startswith('http'): href = base + href title = str(mark.string).strip() - print("Fetching chapter", title, href) + logger.info("Fetching chapter \"%s\" @ %s", title, href) chapter = Chapter(title=title, contents="") contents, post_date = self._chapter(href, idx) chapter.contents = contents @@ -63,7 +66,7 @@ class XenForo(Site): try: return self._chapter_list_threadmarks(url) except SiteException as e: - print("Tried threadmarks", e.args) + logger.debug("Tried threadmarks (%r)", e.args) return self._chapter_list_index(url) def _chapter_list_threadmarks(self, url): From 7bb6da382c5e7479cee4d68846c227f3865b8d2d Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sat, 4 Nov 2017 00:30:59 -0500 Subject: [PATCH 07/15] Oh hey, another missing Section URL --- sites/ao3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sites/ao3.py b/sites/ao3.py index bce4e61..b314579 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -82,7 +82,8 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn): story = Section( title=soup.select('#main h2.heading')[0].string, - author=soup.select('#main dl.series.meta a[rel="author"]')[0].string + author=soup.select('#main dl.series.meta a[rel="author"]')[0].string, + url='http://archiveofourown.org/series/{}'.format(seriesid) ) for work in soup.select('#main ul.series li.work'): From e099f47e66a2a529d354cd06304995cb69f97a24 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Fri, 17 Nov 2017 21:37:13 -0600 Subject: [PATCH 
08/15] Support: RoyalRoad --- README.markdown | 2 ++ sites/royalroad.py | 59 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 sites/royalroad.py diff --git a/README.markdown b/README.markdown index 84e69ae..0bfcc13 100644 --- a/README.markdown +++ b/README.markdown @@ -33,6 +33,8 @@ Supports * ArchiveOfOurOwn * Yes, it has its own built-in EPUB export, but the formatting is horrible * Various XenForo-based sites: SpaceBattles and SufficientVelocity, most notably + * RoyalRoad + * Fiction.live (Anonkun) * DeviantArt galleries/collections * Sta.sh * Completely arbitrary sites, with a bit more work (see below) diff --git a/sites/royalroad.py b/sites/royalroad.py new file mode 100644 index 0000000..6a64a41 --- /dev/null +++ b/sites/royalroad.py @@ -0,0 +1,59 @@ +#!/usr/bin/python + +import http.client +import logging +import datetime +import re +import urllib +from . import register, Site, Section, Chapter + +logger = logging.getLogger(__name__) + + +@register +class RoyalRoad(Site): + """Royal Road: a place where people write novels, mostly seeming to be light-novel in tone.""" + @staticmethod + def matches(url): + # e.g. 
https://royalroadl.com/fiction/6752/lament-of-the-fallen + match = re.match(r'^(https?://royalroadl\.com/fiction/\d+)/?.*', url) + if match: + return match.group(1) + '/' + + def extract(self, url): + workid = re.match(r'^https?://royalroadl\.com/fiction/(\d+)/?.*', url).group(1) + soup = self._soup('https://royalroadl.com/fiction/{}'.format(workid)) + # should have gotten redirected, for a valid title + + original_maxheaders = http.client._MAXHEADERS + http.client._MAXHEADERS = 1000 + + metadata = soup.select('#main h2.heading a') + story = Section( + title=soup.find('h1', property='name').string.strip(), + author=soup.find('meta', property='books:author').get('content').strip(), + url=soup.find('meta', property='og:url').get('content').strip() + ) + + for chapter in soup.select('#chapters tbody tr[data-url]'): + chapter_url = str(urllib.parse.urljoin(story.url, str(chapter.get('data-url')))) + + updated = datetime.datetime.fromtimestamp( + int(chapter.find('time').get('unixtime')), + ) + + story.add(Chapter(title=chapter.find('a', href=True).string.strip(), contents=self._chapter(chapter_url), date=updated)) + + http.client._MAXHEADERS = original_maxheaders + + return story + + def _chapter(self, url): + logger.info("Extracting chapter @ %s", url) + soup = self._soup(url) + content = soup.find('div', class_='chapter-content') + + # TODO: this could be more robust, and I don't know if there's post-chapter notes anywhere as well. + author_note = soup.find('div', class_='author-note-portlet') + + return (author_note and (author_note.prettify() + '
') or '') + content.prettify() From e9dab9ab7dade5813c1364acece24f68b6545f7b Mon Sep 17 00:00:00 2001 From: David Lynch Date: Fri, 17 Nov 2017 22:57:54 -0600 Subject: [PATCH 09/15] Fix linting on royalroad --- sites/royalroad.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sites/royalroad.py b/sites/royalroad.py index 6a64a41..794fdd2 100644 --- a/sites/royalroad.py +++ b/sites/royalroad.py @@ -28,7 +28,6 @@ class RoyalRoad(Site): original_maxheaders = http.client._MAXHEADERS http.client._MAXHEADERS = 1000 - metadata = soup.select('#main h2.heading a') story = Section( title=soup.find('h1', property='name').string.strip(), author=soup.find('meta', property='books:author').get('content').strip(), From fb588793489cac5a0b48f7809c87012a677f0d3f Mon Sep 17 00:00:00 2001 From: David Lynch Date: Tue, 5 Dec 2017 21:34:40 -0600 Subject: [PATCH 10/15] New example --- examples/sagaofsoul.json | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 examples/sagaofsoul.json diff --git a/examples/sagaofsoul.json b/examples/sagaofsoul.json new file mode 100644 index 0000000..27bab61 --- /dev/null +++ b/examples/sagaofsoul.json @@ -0,0 +1,8 @@ +{ + "url": "http://www.sagaofsoul.com/story.html", + "title": "Saga of Soul", + "author": "Ouri Maler", + "chapter_selector": "#mainbody li a", + "content_selector": "#mainbody", + "filter_selector": "script, noscript" +} From f8d494283c8d435c70d5900576c21d7c539af483 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Fri, 19 Jan 2018 13:19:45 -0600 Subject: [PATCH 11/15] Proper URL normalization for AO3 chapters --- sites/ao3.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sites/ao3.py b/sites/ao3.py index b314579..957ac68 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -3,6 +3,7 @@ import logging import datetime import re +import urllib from . 
import register, Site, Section, Chapter logger = logging.getLogger(__name__) @@ -23,7 +24,8 @@ class ArchiveOfOurOwn(Site): return self._extract_work(workid) def _extract_work(self, workid): - soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid)) + nav_url = 'http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid) + soup = self._soup(nav_url) metadata = soup.select('#main h2.heading a') story = Section( @@ -34,9 +36,7 @@ class ArchiveOfOurOwn(Site): for chapter in soup.select('#main ol[role="navigation"] li'): link = chapter.find('a') - chapter_url = str(link.get('href')) - if chapter_url.startswith('/works/'): - chapter_url = 'http://archiveofourown.org' + chapter_url + chapter_url = urllib.parse.urljoin(nav_url, str(link.get('href'))) chapter_url += '?view_adult=true' updated = datetime.datetime.strptime( From 2042f813d06200a913d4ce3da5688b62360cfe0e Mon Sep 17 00:00:00 2001 From: David Lynch Date: Fri, 19 Jan 2018 14:15:43 -0600 Subject: [PATCH 12/15] Allow AO3 logins for member-only stories --- sites/ao3.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sites/ao3.py b/sites/ao3.py index 957ac68..5303c80 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -4,6 +4,8 @@ import logging import datetime import re import urllib +import requests_cache +from bs4 import BeautifulSoup from . 
import register, Site, Section, Chapter logger = logging.getLogger(__name__) @@ -19,6 +21,25 @@ class ArchiveOfOurOwn(Site): if match: return match.group(1) + '/' + def login(self, login_details): + with requests_cache.disabled(): + login = self.session.get('http://archiveofourown.org/login') + soup = BeautifulSoup(login.text, 'html5lib') + form = soup.find(id='new_user_session') + post = { + 'user_session[login]': login_details[0], + 'user_session[password]': login_details[1], + # standard fields: + 'user_session[remember_me]': '1', + 'utf8': form.find(attrs={'name': 'utf8'})['value'], + 'authenticity_token': form.find(attrs={'name': 'authenticity_token'})['value'], + 'commit': 'Log In', + } + # I feel the session *should* handle this cookies bit for me. But + # it doesn't. And I don't know why. + self.session.post('https://archiveofourown.org/user_sessions', data=post, cookies=login.cookies) + logger.info("Logged in as %s", login_details[0]) + def extract(self, url): workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1) return self._extract_work(workid) From b8123e0b267396a9ddcae6af3d86d4dee3ae50e8 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Fri, 19 Jan 2018 14:21:05 -0600 Subject: [PATCH 13/15] Explicitly VACUUM the cache on flush --- leech.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/leech.py b/leech.py index 9cc1be5..280bab8 100755 --- a/leech.py +++ b/leech.py @@ -5,6 +5,7 @@ import sys import json import http.cookiejar import logging +import sqlite3 import sites import ebook @@ -66,6 +67,11 @@ if __name__ == '__main__': if args.flush: requests_cache.install_cache('leech') requests_cache.clear() + + conn = sqlite3.connect('leech.sqlite') + conn.execute("VACUUM") + conn.close() + logger.info("Flushed cache") sys.exit() From 7d2c1647e2eeea332478156bec7393535fd63428 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Wed, 28 Feb 2018 20:54:37 -0600 Subject: [PATCH 14/15] Safer check on retry-after --- 
sites/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sites/__init__.py b/sites/__init__.py index 9a91f4f..8933b1d 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -105,7 +105,7 @@ class Site: if not page: if retry and retry > 0: delay = retry_delay - if page.headers['Retry-After']: + if 'Retry-After' in page.headers: delay = int(page.headers['Retry-After']) logger.warning("Load failed: waiting %s to retry (%s)", delay, page) time.sleep(delay) From 868ef4b1576793e71b4cd6a23d8874202b588302 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Fri, 30 Mar 2018 15:18:57 -0500 Subject: [PATCH 15/15] Handle mobile links for FFN --- sites/fanfictionnet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py index 0da64ae..7f86aed 100644 --- a/sites/fanfictionnet.py +++ b/sites/fanfictionnet.py @@ -14,9 +14,9 @@ class FanFictionNet(Site): @staticmethod def matches(url): # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights - match = re.match(r'^(https?://www\.fanfiction\.net/s/\d+)/?.*', url) + match = re.match(r'^https?://(?:www|m)\.fanfiction\.net/s/(\d+)/?.*', url) if match: - return match.group(1) + '/' + return 'https://www.fanfiction.net/s/' + match.group(1) + '/' def extract(self, url): soup = self._soup(url) @@ -87,6 +87,6 @@ class FictionPress(FanFictionNet): @staticmethod def matches(url): # e.g. https://www.fictionpress.com/s/2961893/1/Mother-of-Learning - match = re.match(r'^(https?://www\.fictionpress\.com/s/\d+)/?.*', url) + match = re.match(r'^https?://(?:www|m)\.fictionpress\.com/s/(\d+)/?.*', url) if match: - return match.group(1) + '/' + return 'https://www.fictionpress.com/s/' + match.group(1) + '/'