Mirror of https://github.com/kemayo/leech

Change sites strategy to use classes and inheritance

David Lynch 2015-09-14 00:38:02 -05:00
parent 1795c717e9
commit 2aba80be24
7 changed files with 238 additions and 243 deletions

leech.py

@@ -4,6 +4,7 @@ import argparse
 import importlib
 import os
+import sites
 import epub
 from fetch import Fetch
@@ -25,11 +26,12 @@ html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 def leech(url, filename=None):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
-    site = _get_site(url)
+    site = sites.get(url)
     if not site:
         raise Exception("No site handler found")
 
-    story = site.extract(url, fetch)
+    handler = site(fetch)
+    story = handler.extract(url)
     if not story:
         raise Exception("Couldn't extract story")
@@ -48,26 +50,7 @@ def leech(url, filename=None):
     return filename
 
 
-_sites = []
-
-
-def _get_site(url):
-    for site in _sites:
-        if site.match(url):
-            return site
-
-
-def _load_sites():
-    dirname = os.path.join(os.path.dirname(__file__), 'sites')
-    for f in os.listdir(dirname):
-        if not f.endswith('.py'):
-            continue
-        mod = importlib.import_module('sites.' + f.replace('.py', ''))
-        _sites.append(mod)
-
-
 if __name__ == '__main__':
-    _load_sites()
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch")
     parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)")

@@ -75,4 +58,3 @@ if __name__ == '__main__':
     filename = leech(args.url, filename=args.filename)
     print("File created:", filename)

sites/__init__.py (new file, 38 lines)

@@ -0,0 +1,38 @@
+from bs4 import BeautifulSoup
+
+_sites = []
+
+class Site:
+    """A Site handles checking whether a URL might represent a site, and then
+    extracting the content of a story from said site.
+    """
+    def __init__(self, fetch):
+        super().__init__()
+        self.fetch = fetch
+
+    @staticmethod
+    def matches(url):
+        raise NotImplementedError()
+
+    def extract(self, url):
+        raise NotImplementedError()
+
+    def _soup(self, url, method='html5lib'):
+        page = self.fetch(url)
+        return BeautifulSoup(page, method)
+
+class SiteException(Exception):
+    pass
+
+def register(site_class):
+    _sites.append(site_class)
+    return site_class
+
+def get(url):
+    for site_class in _sites:
+        if site_class.matches(url):
+            return site_class
+
+# And now, the things that will use this:
+from . import spacebattles, fanfictionnet, deviantart, stash
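
For orientation, here is how a handler plugs into this registry once the commit lands. This is a minimal sketch, not part of the commit: ExampleSite, the example.com URL, and the selectors are made up; register, Site, SiteException, sites.get, and the site(fetch) / extract(url) call sequence are the names this diff actually introduces.

    # sites/example.py -- hypothetical module, for illustration only
    import re

    from . import register, Site, SiteException


    @register                  # register() appends the class to _sites and returns it
    class ExampleSite(Site):
        """Made-up handler for http://example.com/story/<id> pages."""
        @staticmethod
        def matches(url):
            return re.match(r'^https?://example\.com/story/\d+', url)

        def extract(self, url):
            soup = self._soup(url)             # fetch + parse, provided by Site
            title = soup.find('h1')
            if not title:
                raise SiteException("No title found")
            return {
                'title': str(title.string),
                'author': 'unknown',
                'chapters': [(str(title.string), soup.body.prettify())],
            }

    # Caller side, mirroring the new leech.py flow:
    #   site = sites.get(url)       # first registered class whose matches() hits
    #   handler = site(fetch)       # instantiate with a fetch callable
    #   story = handler.extract(url)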

sites/deviantart.py

@@ -1,44 +1,42 @@
 #!/usr/bin/python
 
 import re
-from bs4 import BeautifulSoup
 
-from .stash import _extract_chapter
+from . import register
+from .stash import Stash
 
 
-def match(url):
-    # Need a collection page
-    return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="output")
-    if not content:
-        return
-
-    story = {}
-    chapters = []
-
-    if "gallery" in url:
-        story['author'] = str(content.select('h1 a.u')[0].string)
-    else:
-        authors = set(str(author.string) for author in content.select('.stream .details a.u'))
-        story['author'] = ', '.join(authors)
-
-    story['title'] = str(content.find(class_="folder-title").string)
-
-    thumbs = content.select(".stream a.thumb")
-    if not thumbs:
-        return
-    for thumb in thumbs:
-        try:
-            if thumb['href'] is not '#':
-                chapters.append(_extract_chapter(thumb['href'], fetch))
-        except Exception as e:
-            print(e)
-
-    story['chapters'] = chapters
-
-    return story
+@register
+class DeviantArt(Stash):
+    @staticmethod
+    def matches(url):
+        # Need a collection page
+        return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
+
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="output")
+        if not content:
+            return
+
+        story = {}
+        chapters = []
+
+        if "gallery" in url:
+            story['author'] = str(content.select('h1 a.u')[0].string)
+        else:
+            authors = set(str(author.string) for author in content.select('.stream .details a.u'))
+            story['author'] = ', '.join(authors)
+
+        story['title'] = str(content.find(class_="folder-title").string)
+
+        thumbs = content.select(".stream a.thumb")
+        if not thumbs:
+            return
+        for thumb in thumbs:
+            try:
+                if thumb['href'] is not '#':
+                    chapters.append(self._chapter(thumb['href']))
+            except Exception as e:
+                print(e)
+
+        story['chapters'] = chapters
+
+        return story

sites/fanfictionnet.py

@@ -1,64 +1,64 @@
 #!/usr/bin/python
 
 import re
-from bs4 import BeautifulSoup
+
+from . import register, Site, SiteException
 
 
-def match(url):
-    ## e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
-    return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
-
-    story = {}
-    chapters = []
-
-    metadata = content.find(id='profile_top')
-    story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
-    story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
-
-    chapter_select = content.find(id="chap_select")
-    if chapter_select:
-        base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
-        if not base_url:
-            return
-        base_url = base_url.group(0)
-
-        # beautiful soup doesn't handle ffn's unclosed option tags at all well here
-        options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
-        for option in options:
-            chapters.append(_extract_chapter(base_url + option[0], option[1], fetch))
-    else:
-        chapters.append(_extract_chapter(url, story['title'], fetch))
-
-    story['chapters'] = chapters
-
-    return story
-
-
-def _extract_chapter(url, title, fetch):
-    print("Extracting chapter from", url)
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
-
-    text = content.find(id="storytext")
-
-    # clean up some invalid xhtml attributes
-    # TODO: be more selective about this somehow
-    try:
-        for tag in text.find_all(True):
-            tag.attrs = None
-    except Exception as e:
-        print("Trouble cleaning attributes", e)
-
-    return (title, text.prettify())
+@register
+class FanFictionNet(Site):
+    """FFN: it has a lot of stuff"""
+    @staticmethod
+    def matches(url):
+        # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
+        return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
+
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="content_wrapper_inner")
+        if not content:
+            raise SiteException("No content")
+
+        story = {}
+        chapters = []
+
+        metadata = content.find(id='profile_top')
+        story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
+        story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+
+        chapter_select = content.find(id="chap_select")
+        if chapter_select:
+            base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
+            if not base_url:
+                raise SiteException("Can't find base URL for chapters")
+            base_url = base_url.group(0)
+
+            # beautiful soup doesn't handle ffn's unclosed option tags at all well here
+            options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
+            for option in options:
+                chapters.append((option[1], self._chapter(base_url + option[0])))
+        else:
+            chapters.append((story['title'], self._extract_chapter(url)))
+
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        soup = self._soup(url)
+        content = soup.find(id="content_wrapper_inner")
+        if not content:
+            raise SiteException("No chapter content")
+
+        text = content.find(id="storytext")
+
+        # clean up some invalid xhtml attributes
+        # TODO: be more selective about this somehow
+        try:
+            for tag in text.find_all(True):
+                tag.attrs = None
+        except Exception as e:
+            print("Trouble cleaning attributes", e)
+
+        return text.prettify()

sites/spacebattles.py

@@ -1,58 +1,97 @@
 #!/usr/bin/python
 
 import re
-from bs4 import BeautifulSoup
+
+from . import register, Site, SiteException
 
 
-def match(url):
-    return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/threads/.*\d+/?.*', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    base = soup.head.base.get('href')
-
-    threadmarks_link = soup.find(class_="threadmarksTrigger")
-    if not threadmarks_link:
-        print("No threadmarks")
-        return
-
-    story = {}
-    story['title'] = str(soup.find('h1').string)
-    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
-
-    page = fetch(base + threadmarks_link.get('href'))
-    soup = BeautifulSoup(page, 'html5lib')
-
-    marks = soup.select('li.primaryContent.memberListItem')
-    if not marks:
-        print("No marks on threadmarks page")
-        return
-
-    chapters = []
-    for mark in marks:
-        href = mark.a.get('href')
-        print("Extracting chapter from", href)
-        match = re.match(r'posts/(\d+)/?', href)
-        if not match:
-            match = re.match(r'.+#post-(\d+)$', href)
-        if not match:
-            print("Unparseable threadmark href", href)
-        chapter_postid = match and match.group(1)
-        chapter_page = fetch(base + href)
-        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
-        if chapter_postid:
-            post = chapter_soup.find('li', id='post-'+chapter_postid)
-        else:
-            # just the first one in the thread, then
-            post = chapter_soup.find('li', class_='message')
-        post = post.find('blockquote', class_='messageText')
-        post.name = 'div'
-        chapters.append((str(mark.a.string), post.prettify()))
-
-    story['chapters'] = chapters
-
-    return story
+@register
+class SpaceBattles(Site):
+    """SpaceBattles is a forum..."""
+    @staticmethod
+    def matches(url):
+        return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/threads/.*\d+/?.*', url)
+
+    def extract(self, url):
+        soup = self._soup(url)
+
+        story = {}
+        story['title'] = str(soup.find('h1').string)
+        story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
+
+        base = soup.head.base.get('href')
+
+        marks = self._chapter_list(url)
+
+        chapters = []
+        for mark in marks:
+            href = mark.get('href')
+            if '/members' in href:
+                continue
+            if not href.startswith('http'):
+                href = base + href
+            chapters.append((str(mark.string), self._chapter(href)))
+
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter_list(self, url):
+        soup = self._soup(url)
+
+        threadmarks_link = soup.find(class_="threadmarksTrigger")
+        if not threadmarks_link:
+            raise SiteException("No threadmarks")
+
+        base = soup.head.base.get('href')
+        soup = self._soup(base + threadmarks_link.get('href'))
+
+        marks = soup.select('li.primaryContent.memberListItem a')
+        if not marks:
+            raise SiteException("No marks on threadmarks page")
+
+        return marks
+
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        match = re.match(r'posts/(\d+)/?', url)
+        if not match:
+            match = re.match(r'.+#post-(\d+)$', url)
+        if not match:
+            print("Unparseable threadmark href", url)
+        chapter_postid = match and match.group(1)
+
+        chapter_soup = self._soup(url, 'html5lib')
+
+        if chapter_postid:
+            post = chapter_soup.find('li', id='post-'+chapter_postid)
+        else:
+            # just the first one in the thread, then
+            post = chapter_soup.find('li', class_='message')
+
+        return self._clean_chapter(post)
+
+    def _clean_chapter(self, post):
+        post = post.find('blockquote', class_='messageText')
+        post.name = 'div'
+        # mostly, we want to remove colors because the Kindle is terrible at them
+        for tag in post.find_all(style=True):
+            del(tag['style'])
+        return post.prettify()
+
+
+@register
+class SpaceBattlesIndex(SpaceBattles):
+    """A spacebattles thread with an index post"""
+    @staticmethod
+    def match(url):
+        return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/posts/\d+/?.*', url)
+
+    def _chapter_list(self, url):
+        soup = self._soup(url)
+
+        post = post = soup.find('li', id='post-'+postid)
+        links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
+        if not links:
+            raise SiteException("No links in index?")
+
+        return links

(deleted file: the old module-level handler for SpaceBattles index posts, superseded by SpaceBattlesIndex above)

@@ -1,61 +0,0 @@
-#!/usr/bin/python
-
-import re
-from bs4 import BeautifulSoup
-
-
-def match(url):
-    return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/posts/\d+/?.*', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    base = soup.head.base.get('href')
-
-    match = re.match(r'.+/posts/(\d+)/?', url)
-    if not match:
-        print("Unparseable post URL", url)
-        return
-    postid = match.group(1)
-
-    story = {}
-    story['title'] = str(soup.find('h1').string)
-    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
-
-    post = post = soup.find('li', id='post-'+postid)
-    links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
-    if not links:
-        print("No links in index?")
-
-    chapters = []
-    for link in links:
-        href = link.get('href')
-        if '/members/' in href:
-            # skip links to users
-            continue
-        if not href.startswith('http'):
-            href = base + href
-        print("Extracting chapter from", href)
-        match = re.match(r'.+#post-(\d+)$', href)
-        if not match:
-            match = re.match(r'.+/posts/(\d+)/?$', href)
-        if not match:
-            print("Unparseable index link href", href)
-        chapter_postid = match and match.group(1)
-        chapter_page = fetch(href)
-        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
-        if chapter_postid:
-            post = chapter_soup.find('li', id='post-'+chapter_postid)
-        else:
-            # just the first one in the thread, then
-            post = chapter_soup.find('li', class_='message')
-        post = post.find('blockquote', class_='messageText')
-        post.name = 'div'
-        chapters.append((str(link.string), post.prettify()))
-
-    story['chapters'] = chapters
-
-    return story

sites/stash.py

@@ -1,62 +1,61 @@
 #!/usr/bin/python
 
 import re
-from bs4 import BeautifulSoup
+
+from . import register, Site, SiteException
 
 
-def match(url):
-    # Need a stack page
-    return re.match(r'^https?://sta\.sh/2.+/?.*', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="stash-body")
-    if not content:
-        return
-
-    story = {}
-    chapters = []
-
-    # metadata = content.find(id='profile_top')
-    story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
-    story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
-
-    thumbs = content.select(".stash-folder-stream .thumb")
-    if not thumbs:
-        return
-    for thumb in thumbs:
-        try:
-            if thumb['href'] is not '#':
-                chapters.append(_extract_chapter(thumb['href'], fetch))
-        except Exception as e:
-            print(e)
-
-    story['chapters'] = chapters
-
-    return story
-
-
-def _extract_chapter(url, fetch):
-    print("Extracting chapter from", url)
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(class_="journal-wrapper")
-    if not content:
-        raise Exception("No content")
-
-    title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)
-
-    text = content.find(class_="text")
-
-    # clean up some invalid xhtml attributes
-    # TODO: be more selective about this somehow
-    try:
-        for tag in text.find_all(True):
-            tag.attrs = None
-    except Exception as e:
-        raise Exception("Trouble cleaning attributes", e)
-
-    return (title, text.prettify())
+@register
+class Stash(Site):
+    @staticmethod
+    def matches(url):
+        # Need a stack page
+        return re.match(r'^https?://sta\.sh/2.+/?.*', url)
+
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="stash-body")
+        if not content:
+            return
+
+        story = {}
+        chapters = []
+
+        # metadata = content.find(id='profile_top')
+        story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
+        story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+
+        thumbs = content.select(".stash-folder-stream .thumb")
+        if not thumbs:
+            return
+        for thumb in thumbs:
+            try:
+                if thumb['href'] is not '#':
+                    chapters.append(self._chapter(thumb['href']))
+            except Exception as e:
+                print(e)
+
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        soup = self._soup(url)
+        content = soup.find(class_="journal-wrapper")
+        if not content:
+            raise SiteException("No content")
+
+        title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)
+
+        text = content.find(class_="text")
+
+        # clean up some invalid xhtml attributes
+        # TODO: be more selective about this somehow
+        try:
+            for tag in text.find_all(True):
+                tag.attrs = None
+        except Exception as e:
+            raise SiteException("Trouble cleaning attributes", e)
+
+        return (title, text.prettify())
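
The inheritance half of the new strategy is visible in the DeviantArt/Stash pair: DeviantArt overrides matches() and extract() for its own listing markup, while chapter extraction comes from Stash unchanged. A quick check of that reuse, assuming the package is importable and DeviantArt is registered as reconstructed above (illustration only, not part of the commit):

    from sites.stash import Stash
    from sites.deviantart import DeviantArt

    print(issubclass(DeviantArt, Stash))          # True
    print(DeviantArt._chapter is Stash._chapter)  # True: one shared chapter scraper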