Mirror of https://github.com/kemayo/leech (synced 2025-12-14 04:14:35 +01:00)
Change sites strategy to use classes and inheritance
commit 2aba80be24
parent 1795c717e9
7 changed files with 238 additions and 243 deletions
leech.py (26 changes)

@@ -4,6 +4,7 @@ import argparse
 import importlib
 import os
 
+import sites
 import epub
 from fetch import Fetch
 
@@ -25,11 +26,12 @@ html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 def leech(url, filename=None):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
-    site = _get_site(url)
+    site = sites.get(url)
     if not site:
         raise Exception("No site handler found")
 
-    story = site.extract(url, fetch)
+    handler = site(fetch)
+    story = handler.extract(url)
     if not story:
         raise Exception("Couldn't extract story")
 
@@ -48,26 +50,7 @@ def leech(url, filename=None):
 
     return filename
 
-_sites = []
-
-
-def _get_site(url):
-    for site in _sites:
-        if site.match(url):
-            return site
-
-
-def _load_sites():
-    dirname = os.path.join(os.path.dirname(__file__), 'sites')
-    for f in os.listdir(dirname):
-        if not f.endswith('.py'):
-            continue
-        mod = importlib.import_module('sites.' + f.replace('.py', ''))
-        _sites.append(mod)
-
-
 if __name__ == '__main__':
-    _load_sites()
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch")
     parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)")
@@ -75,4 +58,3 @@ if __name__ == '__main__':
 
     filename = leech(args.url, filename=args.filename)
     print("File created:", filename)
-
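For orientation (an editorial sketch, not part of the commit): leech.py no longer scans the sites/ directory with importlib. Instead, sites.get(url) returns the first registered handler class whose matches() accepts the URL, the caller instantiates it with a fetch callable, and then asks it to extract the story. A minimal sketch of that flow follows; the urllib stand-in and the example URL are assumptions for illustration, leech.py itself builds its fetch from fetch.Fetch.

    import urllib.request

    import sites


    def fetch(url):
        # stand-in for leech's fetch callable; anything that returns page HTML works here
        return urllib.request.urlopen(url).read()

    url = "https://www.fanfiction.net/s/4109686/3/Taking-Sights"
    site_class = sites.get(url)   # first registered Site subclass whose matches() accepts the URL
    if not site_class:
        raise Exception("No site handler found")

    handler = site_class(fetch)   # Site.__init__ stores the callable for later _soup() calls
    story = handler.extract(url)  # dict with 'title', 'author' and a list of 'chapters'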
sites/__init__.py (new file, 38 lines)

@@ -0,0 +1,38 @@
+
+from bs4 import BeautifulSoup
+
+_sites = []
+
+class Site:
+    """A Site handles checking whether a URL might represent a site, and then
+    extracting the content of a story from said site.
+    """
+    def __init__(self, fetch):
+        super().__init__()
+        self.fetch = fetch
+
+    @staticmethod
+    def matches(url):
+        raise NotImplementedError()
+
+    def extract(self, url):
+        raise NotImplementedError()
+
+    def _soup(self, url, method='html5lib'):
+        page = self.fetch(url)
+        return BeautifulSoup(page, method)
+
+class SiteException(Exception):
+    pass
+
+def register(site_class):
+    _sites.append(site_class)
+    return site_class
+
+def get(url):
+    for site_class in _sites:
+        if site_class.matches(url):
+            return site_class
+
+# And now, the things that will use this:
+from . import spacebattles, fanfictionnet, deviantart, stash
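The new sites/__init__.py above defines the whole plugin protocol: subclass Site, implement matches() and extract(), and apply @register so that get() can find the class. As a sketch only (ExampleSite, its URL pattern, and the element ids are hypothetical, not part of this commit), a new handler placed in the sites/ package would look roughly like this:

    import re

    from . import register, Site, SiteException


    @register
    class ExampleSite(Site):
        """Hypothetical handler, shown only to illustrate the protocol."""
        @staticmethod
        def matches(url):
            return re.match(r'^https?://example\.com/story/\d+', url)

        def extract(self, url):
            soup = self._soup(url)  # base-class helper: fetch the page and parse it
            content = soup.find(id="story")
            if not content:
                raise SiteException("No content")
            return {
                'title': str(soup.title.string),
                'author': 'unknown',
                'chapters': [('Chapter 1', content.prettify())],
            }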
sites/deviantart.py

@@ -1,44 +1,42 @@
 #!/usr/bin/python
 
 import re
-from bs4 import BeautifulSoup
 
-from .stash import _extract_chapter
+from .stash import Stash
 
-
-def match(url):
-    # Need a collection page
-    return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="output")
-    if not content:
-        return
-
-    story = {}
-    chapters = []
-
-    story['title'] = str(content.find(class_="folder-title").string)
-
-    if "gallery" in url:
-        story['author'] = str(content.select('h1 a.u')[0].string)
-    else:
-        authors = set(str(author.string) for author in content.select('.stream .details a.u'))
-        story['author'] = ', '.join(authors)
-
-    thumbs = content.select(".stream a.thumb")
-    if not thumbs:
-        return
-    for thumb in thumbs:
-        try:
-            if thumb['href'] is not '#':
-                chapters.append(_extract_chapter(thumb['href'], fetch))
-        except Exception as e:
-            print(e)
-
-    story['chapters'] = chapters
-
-    return story
+class DeviantArt(Stash):
+    @staticmethod
+    def matches(url):
+        # Need a collection page
+        return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
+
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="output")
+        if not content:
+            return
+
+        story = {}
+        chapters = []
+
+        story['title'] = str(content.find(class_="folder-title").string)
+
+        if "gallery" in url:
+            story['author'] = str(content.select('h1 a.u')[0].string)
+        else:
+            authors = set(str(author.string) for author in content.select('.stream .details a.u'))
+            story['author'] = ', '.join(authors)
+
+        thumbs = content.select(".stream a.thumb")
+        if not thumbs:
+            return
+        for thumb in thumbs:
+            try:
+                if thumb['href'] is not '#':
+                    chapters.append(self._chapter(thumb['href']))
+            except Exception as e:
+                print(e)
+
+        story['chapters'] = chapters
+
+        return story
sites/fanfictionnet.py

@@ -1,64 +1,64 @@
 #!/usr/bin/python
 
 import re
-from bs4 import BeautifulSoup
+from . import register, Site, SiteException
 
 
-def match(url):
-    ## e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
-    return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
-
-    story = {}
-    chapters = []
-
-    metadata = content.find(id='profile_top')
-    story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
-    story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
-
-    chapter_select = content.find(id="chap_select")
-    if chapter_select:
-        base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
-        if not base_url:
-            return
-        base_url = base_url.group(0)
-
-        # beautiful soup doesn't handle ffn's unclosed option tags at all well here
-        options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
-        for option in options:
-            chapters.append(_extract_chapter(base_url + option[0], option[1], fetch))
-    else:
-        chapters.append(_extract_chapter(url, story['title'], fetch))
-
-    story['chapters'] = chapters
-
-    return story
-
-
-def _extract_chapter(url, title, fetch):
-    print("Extracting chapter from", url)
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
-
-    text = content.find(id="storytext")
-
-    # clean up some invalid xhtml attributes
-    # TODO: be more selective about this somehow
-    try:
-        for tag in text.find_all(True):
-            tag.attrs = None
-    except Exception as e:
-        print("Trouble cleaning attributes", e)
-
-    return (title, text.prettify())
+@register
+class FanFictionNet(Site):
+    """FFN: it has a lot of stuff"""
+    @staticmethod
+    def matches(url):
+        # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
+        return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
+
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="content_wrapper_inner")
+        if not content:
+            raise SiteException("No content")
+
+        story = {}
+        chapters = []
+
+        metadata = content.find(id='profile_top')
+        story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
+        story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+
+        chapter_select = content.find(id="chap_select")
+        if chapter_select:
+            base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
+            if not base_url:
+                raise SiteException("Can't find base URL for chapters")
+            base_url = base_url.group(0)
+
+            # beautiful soup doesn't handle ffn's unclosed option tags at all well here
+            options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
+            for option in options:
+                chapters.append((option[1], self._chapter(base_url + option[0])))
+        else:
+            chapters.append((story['title'], self._extract_chapter(url)))
+
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        soup = self._soup(url)
+
+        content = soup.find(id="content_wrapper_inner")
+        if not content:
+            raise SiteException("No chapter content")
+
+        text = content.find(id="storytext")
+
+        # clean up some invalid xhtml attributes
+        # TODO: be more selective about this somehow
+        try:
+            for tag in text.find_all(True):
+                tag.attrs = None
+        except Exception as e:
+            print("Trouble cleaning attributes", e)
+
+        return text.prettify()
sites/spacebattles.py

@@ -1,58 +1,97 @@
 #!/usr/bin/python
 
 import re
-from bs4 import BeautifulSoup
+
+from . import register, Site, SiteException
 
 
-def match(url):
-    return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/threads/.*\d+/?.*', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    base = soup.head.base.get('href')
-
-    story = {}
-    story['title'] = str(soup.find('h1').string)
-    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
-
-    threadmarks_link = soup.find(class_="threadmarksTrigger")
-    if not threadmarks_link:
-        print("No threadmarks")
-        return
-
-    page = fetch(base + threadmarks_link.get('href'))
-    soup = BeautifulSoup(page, 'html5lib')
-
-    marks = soup.select('li.primaryContent.memberListItem')
-    if not marks:
-        print("No marks on threadmarks page")
-        return
-
-    chapters = []
-    for mark in marks:
-        href = mark.a.get('href')
-        print("Extracting chapter from", href)
-        match = re.match(r'posts/(\d+)/?', href)
-        if not match:
-            match = re.match(r'.+#post-(\d+)$', href)
-        if not match:
-            print("Unparseable threadmark href", href)
-        chapter_postid = match and match.group(1)
-        chapter_page = fetch(base + href)
-        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
-
-        if chapter_postid:
-            post = chapter_soup.find('li', id='post-'+chapter_postid)
-        else:
-            # just the first one in the thread, then
-            post = chapter_soup.find('li', class_='message')
-        post = post.find('blockquote', class_='messageText')
-        post.name = 'div'
-
-        chapters.append((str(mark.a.string), post.prettify()))
-
-    story['chapters'] = chapters
-
-    return story
+@register
+class SpaceBattles(Site):
+    """SpaceBattles is a forum..."""
+    @staticmethod
+    def matches(url):
+        return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/threads/.*\d+/?.*', url)
+
+    def extract(self, url):
+        soup = self._soup(url)
+
+        base = soup.head.base.get('href')
+
+        story = {}
+        story['title'] = str(soup.find('h1').string)
+        story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
+
+        marks = self._chapter_list(url)
+
+        chapters = []
+        for mark in marks:
+            href = mark.get('href')
+            if '/members' in href:
+                continue
+            if not href.startswith('http'):
+                href = base + href
+            chapters.append((str(mark.string), self._chapter(href)))
+
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter_list(self, url):
+        soup = self._soup(url)
+
+        threadmarks_link = soup.find(class_="threadmarksTrigger")
+        if not threadmarks_link:
+            raise SiteException("No threadmarks")
+
+        base = soup.head.base.get('href')
+        soup = self._soup(base + threadmarks_link.get('href'))
+
+        marks = soup.select('li.primaryContent.memberListItem a')
+        if not marks:
+            raise SiteException("No marks on threadmarks page")
+
+        return marks
+
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        match = re.match(r'posts/(\d+)/?', url)
+        if not match:
+            match = re.match(r'.+#post-(\d+)$', url)
+        if not match:
+            print("Unparseable threadmark href", url)
+        chapter_postid = match and match.group(1)
+        chapter_soup = self._soup(url, 'html5lib')
+
+        if chapter_postid:
+            post = chapter_soup.find('li', id='post-'+chapter_postid)
+        else:
+            # just the first one in the thread, then
+            post = chapter_soup.find('li', class_='message')
+
+        return self._clean_chapter(post)
+
+    def _clean_chapter(self, post):
+        post = post.find('blockquote', class_='messageText')
+        post.name = 'div'
+        # mostly, we want to remove colors because the Kindle is terrible at them
+        for tag in post.find_all(style=True):
+            del(tag['style'])
+        return post.prettify()
+
+
+@register
+class SpaceBattlesIndex(SpaceBattles):
+    """A spacebattles thread with an index post"""
+    @staticmethod
+    def match(url):
+        return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/posts/\d+/?.*', url)
+
+    def _chapter_list(self, url):
+        soup = self._soup(url)
+
+        post = post = soup.find('li', id='post-'+postid)
+        links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
+        if not links:
+            raise SiteException("No links in index?")
+
+        return links
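The inheritance payoff is visible above: SpaceBattlesIndex overrides only the URL check and _chapter_list(), while extract(), _chapter() and _clean_chapter() are inherited from SpaceBattles, and the standalone index-post module that used to duplicate that logic is deleted below. Schematically (an editorial summary, not part of the diff):

    # Override relationship introduced by this commit (illustrative summary):
    #   SpaceBattles.extract()
    #       -> self._chapter_list(url)    SpaceBattles: threadmark links
    #                                     SpaceBattlesIndex: links found in an index post
    #       -> self._chapter(href)        shared: fetch the post and locate its content
    #       -> self._clean_chapter(post)  shared: strip styles, return prettified HTML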
Deleted file (the old standalone SpaceBattles index-post handler, replaced by SpaceBattlesIndex above)

@@ -1,61 +0,0 @@
-#!/usr/bin/python
-
-import re
-from bs4 import BeautifulSoup
-
-
-def match(url):
-    return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/posts/\d+/?.*', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    base = soup.head.base.get('href')
-
-    match = re.match(r'.+/posts/(\d+)/?', url)
-    if not match:
-        print("Unparseable post URL", url)
-        return
-    postid = match.group(1)
-
-    story = {}
-    story['title'] = str(soup.find('h1').string)
-    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
-
-    post = post = soup.find('li', id='post-'+postid)
-    links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
-    if not links:
-        print("No links in index?")
-
-    chapters = []
-    for link in links:
-        href = link.get('href')
-        if '/members/' in href:
-            # skip links to users
-            continue
-        if not href.startswith('http'):
-            href = base + href
-        print("Extracting chapter from", href)
-        match = re.match(r'.+#post-(\d+)$', href)
-        if not match:
-            match = re.match(r'.+/posts/(\d+)/?$', href)
-        if not match:
-            print("Unparseable index link href", href)
-        chapter_postid = match and match.group(1)
-        chapter_page = fetch(href)
-        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
-
-        if chapter_postid:
-            post = chapter_soup.find('li', id='post-'+chapter_postid)
-        else:
-            # just the first one in the thread, then
-            post = chapter_soup.find('li', class_='message')
-        post = post.find('blockquote', class_='messageText')
-        post.name = 'div'
-
-        chapters.append((str(link.string), post.prettify()))
-
-    story['chapters'] = chapters
-
-    return story
sites/stash.py

@@ -1,62 +1,61 @@
 #!/usr/bin/python
 
 import re
-from bs4 import BeautifulSoup
+
+from . import register, Site, SiteException
 
 
-def match(url):
-    # Need a stack page
-    return re.match(r'^https?://sta\.sh/2.+/?.*', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="stash-body")
-    if not content:
-        return
-
-    story = {}
-    chapters = []
-
-    # metadata = content.find(id='profile_top')
-    story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
-    story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
-
-    thumbs = content.select(".stash-folder-stream .thumb")
-    if not thumbs:
-        return
-    for thumb in thumbs:
-        try:
-            if thumb['href'] is not '#':
-                chapters.append(_extract_chapter(thumb['href'], fetch))
-        except Exception as e:
-            print(e)
-
-    story['chapters'] = chapters
-
-    return story
-
-
-def _extract_chapter(url, fetch):
-    print("Extracting chapter from", url)
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    content = soup.find(class_="journal-wrapper")
-    if not content:
-        raise Exception("No content")
-
-    title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)
-
-    text = content.find(class_="text")
-
-    # clean up some invalid xhtml attributes
-    # TODO: be more selective about this somehow
-    try:
-        for tag in text.find_all(True):
-            tag.attrs = None
-    except Exception as e:
-        raise Exception("Trouble cleaning attributes", e)
-
-    return (title, text.prettify())
+@register
+class Stash(Site):
+    @staticmethod
+    def matches(url):
+        # Need a stack page
+        return re.match(r'^https?://sta\.sh/2.+/?.*', url)
+
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="stash-body")
+        if not content:
+            return
+
+        story = {}
+        chapters = []
+
+        # metadata = content.find(id='profile_top')
+        story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
+        story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+
+        thumbs = content.select(".stash-folder-stream .thumb")
+        if not thumbs:
+            return
+        for thumb in thumbs:
+            try:
+                if thumb['href'] is not '#':
+                    chapters.append(self._chapter(thumb['href']))
+            except Exception as e:
+                print(e)
+
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        soup = self._soup(url)
+
+        content = soup.find(class_="journal-wrapper")
+        if not content:
+            raise SiteException("No content")
+
+        title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)
+
+        text = content.find(class_="text")
+
+        # clean up some invalid xhtml attributes
+        # TODO: be more selective about this somehow
+        try:
+            for tag in text.find_all(True):
+                tag.attrs = None
+        except Exception as e:
+            raise SiteException("Trouble cleaning attributes", e)
+
+        return (title, text.prettify())