mirror of https://github.com/kemayo/leech
synced 2025-12-06 08:22:56 +01:00

parent f1ac7c8bda
commit 6d52c72c99

9 changed files with 52 additions and 16 deletions
leech.py (20 changes)
@@ -4,6 +4,7 @@ import argparse
 import sys
 import json
 import http.cookiejar
+import logging
 
 import sites
 import ebook
@@ -14,6 +15,8 @@ import requests_cache
 __version__ = 1
 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
 
+logger = logging.getLogger(__name__)
+
 
 def leech(url, session, filename=None, args=None):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
@@ -22,7 +25,7 @@ def leech(url, session, filename=None, args=None):
     if not site:
         raise Exception("No site handler found")
 
-    print("Handler", site, url)
+    logger.info("Handler: %s (%s)", site, url)
 
     handler = site(session, args=args)
 
@@ -48,13 +51,22 @@ if __name__ == '__main__':
     parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)")
     parser.add_argument('--no-cache', dest='cache', action='store_false')
     parser.add_argument('--flush', dest='flush', action='store_true')
-    parser.set_defaults(cache=True, flush=False)
+    parser.add_argument('-v', '--verbose', help="verbose output", action='store_true', dest='verbose')
+    parser.set_defaults(cache=True, flush=False, verbose=False)
     args, extra_args = parser.parse_known_args()
 
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[%(name)s] %(message)s"
+        )
+
     if args.flush:
         requests_cache.install_cache('leech')
         requests_cache.clear()
-        print("Flushed cache")
+        logger.info("Flushed cache")
         sys.exit()
 
     if not args.url:
@@ -76,4 +88,4 @@ if __name__ == '__main__':
     })
 
     filename = leech(args.url, filename=args.filename, session=session, args=extra_args)
-    print("File created:", filename)
+    logger.info("File created: %s", filename)

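Note: the leech.py changes above follow the standard Python logging split: the entry point calls logging.basicConfig once (DEBUG when --verbose, otherwise INFO with a short format), and modules log through logging.getLogger(__name__). A minimal sketch of that pattern, not taken from the repository (the demo_fetch function and the verbose flag stand in for the real code):

# Sketch of the pattern this commit adopts: configure the root logger once in
# the entry point, log through a module-level logger everywhere else.
import logging

logger = logging.getLogger(__name__)


def demo_fetch(url):
    # Library/module code never calls basicConfig; it only emits records.
    logger.info("Handler: %s", url)


if __name__ == '__main__':
    verbose = False  # stand-in for args.verbose
    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO, format="[%(name)s] %(message)s")
    demo_fetch("http://example.com")
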
@@ -4,9 +4,12 @@ import os
 import argparse
 import uuid
 import time
+import logging
 import attr
 from bs4 import BeautifulSoup
 
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
 _sites = []
 
 

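Note: attaching a NullHandler here follows the usual convention for library-style packages: if the application never configures logging, records sent to this logger are handled by the no-op handler instead of triggering Python's "last resort" stderr output. A small illustrative sketch (the logger name below is made up, not the real package name):

# NullHandler convention for library code.
import logging

lib_logger = logging.getLogger("sites_demo")  # illustrative name only
lib_logger.addHandler(logging.NullHandler())

# With no other configuration, this produces no output; once the application
# calls logging.basicConfig(), records propagate to the root handler as usual.
lib_logger.warning("quiet until the application configures logging")
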
@@ -104,7 +107,7 @@ class Site:
             delay = retry_delay
             if page.headers['Retry-After']:
                 delay = int(page.headers['Retry-After'])
-            print("Load failed: waiting {}s to retry ({})".format(delay, page))
+            logger.warning("Load failed: waiting %s to retry (%s)", delay, page)
             time.sleep(delay)
             return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
         raise SiteException("Couldn't fetch", url)

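Note: this hunk only swaps print for logger.warning in the retry path; the retry logic itself honours the server's Retry-After header before retrying. An illustrative sketch of that pattern (not the project's Site._soup; it assumes the requests library and uses headers.get() so a missing header falls back to the default delay):

# Retry a request while honouring an HTTP Retry-After header.
import time
import logging

import requests

logger = logging.getLogger(__name__)


def fetch_with_retry(url, retries=3, retry_delay=10):
    for attempt in range(retries + 1):
        response = requests.get(url)
        if response.ok:
            return response
        # .get() avoids a KeyError when the server sends no Retry-After header.
        delay = int(response.headers.get('Retry-After', retry_delay))
        logger.warning("Load failed: waiting %ss to retry (%s)", delay, response)
        if attempt < retries:
            time.sleep(delay)
    raise RuntimeError("Couldn't fetch " + url)
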
@@ -1,9 +1,12 @@
 #!/usr/bin/python
 
+import logging
 import datetime
 import re
 from . import register, Site, Section, Chapter
 
+logger = logging.getLogger(__name__)
+
 
 @register
 class ArchiveOfOurOwn(Site):
@@ -46,7 +49,7 @@ class ArchiveOfOurOwn(Site):
         return story
 
     def _chapter(self, url):
-        print("Extracting chapter from", url)
+        logger.info("Extracting chapter @ %s", url)
         soup = self._soup(url)
         content = soup.find('div', role='article')
 

@@ -1,5 +1,6 @@
 #!/usr/bin/python
 
+import logging
 import attr
 import datetime
 import json
@@ -7,6 +8,8 @@ import os.path
 import urllib
 from . import register, Site, Section, Chapter
 
+logger = logging.getLogger(__name__)
+
 """
 Example JSON:
 {
@@ -75,7 +78,7 @@ class Arbitrary(Site):
     def _chapter(self, url, definition):
         # TODO: refactor so this can meaningfully handle multiple matches on content_selector.
         # Probably by changing it so that this returns a Chapter / Section.
-        print("Extracting chapter from", url)
+        logger.info("Extracting chapter @ %s", url)
         soup = self._soup(url)
         content = soup.select(definition.content_selector)[0]
 

@@ -1,10 +1,13 @@
 #!/usr/bin/python
 
+import logging
 import re
 
 from . import register, Section
 from .stash import Stash
 
+logger = logging.getLogger(__name__)
+
 
 @register
 class DeviantArt(Stash):
@@ -41,6 +44,6 @@ class DeviantArt(Stash):
                 if thumb['href'] is not '#':
                     story.add(self._chapter(thumb['href']))
             except Exception as e:
-                print(e)
+                logger.exception("Couldn't extract chapters from thumbs")
 
         return story

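Note: replacing print(e) with logger.exception records the message at ERROR level and appends the full traceback of the active exception, which print(e) discards. A short self-contained sketch of the behaviour (the parsing example is illustrative, not from the repository):

# logger.exception must be called from inside an except block; the active
# exception's traceback is added to the log record automatically.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    int("not a number")
except Exception:
    logger.exception("Couldn't parse value")
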
@@ -1,9 +1,12 @@
 #!/usr/bin/python
 
+import logging
 import datetime
 import re
 from . import register, Site, SiteException, Section, Chapter
 
+logger = logging.getLogger(__name__)
+
 
 @register
 class FanFictionNet(Site):
@@ -59,7 +62,7 @@ class FanFictionNet(Site):
         return story
 
     def _chapter(self, url):
-        print("Extracting chapter from", url)
+        logger.info("Fetching chapter @ %s", url)
         soup = self._soup(url)
 
         content = soup.find(id="content_wrapper_inner")
@@ -74,7 +77,7 @@ class FanFictionNet(Site):
             for tag in text.find_all(True):
                 tag.attrs = None
         except Exception as e:
-            print("Trouble cleaning attributes", e)
+            logger.exception("Trouble cleaning attributes")
 
         return text.prettify()
 

@@ -1,10 +1,13 @@
 #!/usr/bin/python
 
+import logging
 import itertools
 import datetime
 import re
 from . import register, Site, Section, Chapter
 
+logger = logging.getLogger(__name__)
+
 
 @register
 class FictionLive(Site):
@@ -40,7 +43,7 @@ class FictionLive(Site):
             # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998
             # i.e. format is [current timestamp] / [next timestamp - 1]
             chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1)
-            print("Extracting chapter from", chapter_url)
+            logger.info("Extracting chapter \"%s\" @ %s", currc['title'], chapter_url)
            data = self.session.get(chapter_url).json()
             html = []
 

@@ -1,9 +1,12 @@
 #!/usr/bin/python
 
+import logging
 import datetime
 import re
 from . import register, Site, SiteException, Section, Chapter
 
+logger = logging.getLogger(__name__)
+
 
 @register
 class Stash(Site):
@@ -35,12 +38,12 @@ class Stash(Site):
                 if thumb['href'] is not '#':
                     story.add(self._chapter(thumb['href']))
             except Exception as e:
-                print(e)
+                logger.exception("Couldn't extract chapters from thumbs")
 
         return story
 
     def _chapter(self, url):
-        print("Extracting chapter from", url)
+        logger.info("Fetching chapter @ %s", url)
         soup = self._soup(url)
 
         content = soup.find(class_="journal-wrapper")

@@ -2,8 +2,11 @@
 
 import datetime
 import re
+import logging
 from . import register, Site, SiteException, Section, Chapter
 
+logger = logging.getLogger(__name__)
+
 
 class XenForo(Site):
     """XenForo is forum software that powers a number of fiction-related forums."""
@@ -23,7 +26,7 @@ class XenForo(Site):
             'password': login_details[1],
         }
         self.session.post('https://%s/login/login' % self.domain, data=post)
-        print("Logged in as", login_details[0])
+        logger.info("Logged in as %s", login_details[0])
 
     def extract(self, url):
         soup = self._soup(url)
@@ -47,7 +50,7 @@ class XenForo(Site):
             if not href.startswith('http'):
                 href = base + href
             title = str(mark.string).strip()
-            print("Fetching chapter", title, href)
+            logger.info("Fetching chapter \"%s\" @ %s", title, href)
             chapter = Chapter(title=title, contents="")
             contents, post_date = self._chapter(href, idx)
             chapter.contents = contents
@@ -63,7 +66,7 @@ class XenForo(Site):
         try:
             return self._chapter_list_threadmarks(url)
         except SiteException as e:
-            print("Tried threadmarks", e.args)
+            logger.debug("Tried threadmarks (%r)", e.args)
             return self._chapter_list_index(url)
 
     def _chapter_list_threadmarks(self, url):