From 2aba80be24ecaa07399dd6a1410c9f177cac16bd Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Mon, 14 Sep 2015 00:38:02 -0500
Subject: [PATCH] Change sites strategy to use classes and inheritance
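
The sites package now exposes a small plugin API: subclass Site, implement
matches() and extract(), and apply the @register decorator. sites.get(url)
returns the first registered class whose matches() accepts the URL, and
leech.py instantiates it with a fetch callable. As a rough sketch of the
shape a handler takes (Example and its URL pattern are hypothetical, not
part of this patch):

    import re
    from . import register, Site

    @register
    class Example(Site):
        @staticmethod
        def matches(url):
            # hypothetical pattern, for illustration only
            return re.match(r'^https?://example\.com/story/\d+', url)

        def extract(self, url):
            # self._soup() comes from Site: fetch the URL, parse it with html5lib
            soup = self._soup(url)
            # a story is a dict carrying a list of (title, html) chapter tuples
            return {
                'title': str(soup.find('h1').string),
                'author': 'unknown',
                'chapters': [('Chapter 1', soup.find('div').prettify())],
            }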
---
 leech.py                        |  26 ++------
 sites/__init__.py               |  38 ++++++++++++
 sites/deviantart.py             |  62 ++++++++++---------
 sites/fanfictionnet.py          |  94 ++++++++++++++---------------
 sites/spacebattles.py           | 103 ++++++++++++++++++++++----------
 sites/spacebattles_indexpost.py |  61 -------------------
 sites/stash.py                  |  97 +++++++++++++++--------------
 7 files changed, 238 insertions(+), 243 deletions(-)
 create mode 100644 sites/__init__.py
 delete mode 100644 sites/spacebattles_indexpost.py

diff --git a/leech.py b/leech.py
index 7693030..e27d2ed 100755
--- a/leech.py
+++ b/leech.py
@@ -4,6 +4,7 @@
 import argparse
 import importlib
 import os
+import sites
 
 import epub
 from fetch import Fetch
@@ -25,11 +26,12 @@ html_template = '''
 def leech(url, filename=None):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
-    site = _get_site(url)
+    site = sites.get(url)
     if not site:
         raise Exception("No site handler found")
 
-    story = site.extract(url, fetch)
+    handler = site(fetch)
+    story = handler.extract(url)
     if not story:
         raise Exception("Couldn't extract story")
 
@@ -48,26 +50,7 @@ def leech(url, filename=None):
     return filename
 
-_sites = []
-
-
-def _get_site(url):
-    for site in _sites:
-        if site.match(url):
-            return site
-
-
-def _load_sites():
-    dirname = os.path.join(os.path.dirname(__file__), 'sites')
-    for f in os.listdir(dirname):
-        if not f.endswith('.py'):
-            continue
-        mod = importlib.import_module('sites.' + f.replace('.py', ''))
-        _sites.append(mod)
-
-
 if __name__ == '__main__':
-    _load_sites()
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch")
     parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)")
 
@@ -75,4 +58,3 @@ if __name__ == '__main__':
 
     filename = leech(args.url, filename=args.filename)
     print("File created:", filename)
-
diff --git a/sites/__init__.py b/sites/__init__.py
new file mode 100644
index 0000000..f83c2e3
--- /dev/null
+++ b/sites/__init__.py
@@ -0,0 +1,38 @@
+
+from bs4 import BeautifulSoup
+
+_sites = []
+
+class Site:
+    """A Site handles checking whether a URL might represent a site, and then
+    extracting the content of a story from said site.
+    """
+    def __init__(self, fetch):
+        super().__init__()
+        self.fetch = fetch
+
+    @staticmethod
+    def matches(url):
+        raise NotImplementedError()
+
+    def extract(self, url):
+        raise NotImplementedError()
+
+    def _soup(self, url, method='html5lib'):
+        page = self.fetch(url)
+        return BeautifulSoup(page, method)
+
+class SiteException(Exception):
+    pass
+
+def register(site_class):
+    _sites.append(site_class)
+    return site_class
+
+def get(url):
+    for site_class in _sites:
+        if site_class.matches(url):
+            return site_class
+
+# And now, the things that will use this:
+from . import spacebattles, fanfictionnet, deviantart, stash
diff --git a/sites/deviantart.py b/sites/deviantart.py
index 01d024e..97d15e8 100644
--- a/sites/deviantart.py
+++ b/sites/deviantart.py
@@ -1,44 +1,44 @@
 #!/usr/bin/python
 
 import re
 
-from bs4 import BeautifulSoup
-from .stash import _extract_chapter
+from . import register
+from .stash import Stash
 
 
+@register
+class DeviantArt(Stash):
+    @staticmethod
+    def matches(url):
+        # Need a collection page
+        return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
 
-def match(url):
-    # Need a collection page
-    return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="output")
+        if not content:
+            return
 
+        story = {}
+        chapters = []
 
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="output")
-    if not content:
-        return
+        if "gallery" in url:
+            story['author'] = str(content.select('h1 a.u')[0].string)
+        else:
+            authors = set(str(author.string) for author in content.select('.stream .details a.u'))
+            story['author'] = ', '.join(authors)
 
-    story = {}
-    chapters = []
+        story['title'] = str(content.find(class_="folder-title").string)
 
-    if "gallery" in url:
-        story['author'] = str(content.select('h1 a.u')[0].string)
-    else:
-        authors = set(str(author.string) for author in content.select('.stream .details a.u'))
-        story['author'] = ', '.join(authors)
+        thumbs = content.select(".stream a.thumb")
+        if not thumbs:
+            return
+        for thumb in thumbs:
+            try:
+                if thumb['href'] != '#':
+                    chapters.append(self._chapter(thumb['href']))
+            except Exception as e:
+                print(e)
 
-    story['title'] = str(content.find(class_="folder-title").string)
+        story['chapters'] = chapters
 
-    thumbs = content.select(".stream a.thumb")
-    if not thumbs:
-        return
-    for thumb in thumbs:
-        try:
-            if thumb['href'] is not '#':
-                chapters.append(_extract_chapter(thumb['href'], fetch))
-        except Exception as e:
-            print(e)
-
-    story['chapters'] = chapters
-
-    return story
+        return story
diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py
index f6b3994..19d4044 100644
--- a/sites/fanfictionnet.py
+++ b/sites/fanfictionnet.py
@@ -1,64 +1,64 @@
 #!/usr/bin/python
 
 import re
 
-from bs4 import BeautifulSoup
+from . import register, Site, SiteException
 
 
-def match(url):
-    ## e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
-    return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
+@register
+class FanFictionNet(Site):
+    """FFN: it has a lot of stuff"""
+    @staticmethod
+    def matches(url):
+        # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
+        return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
 
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="content_wrapper_inner")
+        if not content:
+            raise SiteException("No content")
 
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
+        story = {}
+        chapters = []
 
-    story = {}
-    chapters = []
+        metadata = content.find(id='profile_top')
+        story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
+        story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
 
-    metadata = content.find(id='profile_top')
-    story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
-    story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+        chapter_select = content.find(id="chap_select")
+        if chapter_select:
+            base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
+            if not base_url:
+                raise SiteException("Can't find base URL for chapters")
+            base_url = base_url.group(0)
 
-    chapter_select = content.find(id="chap_select")
-    if chapter_select:
-        base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
-        if not base_url:
-            return
-        base_url = base_url.group(0)
+            # beautiful soup doesn't handle ffn's unclosed option tags at all well here
+            options = re.findall(r'<option value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
+            for option in options:
+                chapters.append((option[1], self._chapter(base_url + option[0])))
+        else:
+            chapters.append((story['title'], self._chapter(url)))
 
-        # beautiful soup doesn't handle ffn's unclosed option tags at all well here
-        options = re.findall(r'<option value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
-        for option in options:
-            chapters.append(_extract_chapter(base_url + option[0], option[1], fetch))
-    else:
-        chapters.append(_extract_chapter(url, story['title'], fetch))
+        story['chapters'] = chapters
 
-    story['chapters'] = chapters
+        return story
 
-    return story
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        soup = self._soup(url)
 
+        content = soup.find(id="content_wrapper_inner")
+        if not content:
+            raise SiteException("No chapter content")
 
-def _extract_chapter(url, title, fetch):
-    print("Extracting chapter from", url)
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
+        text = content.find(id="storytext")
 
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
+        # clean up some invalid xhtml attributes
+        # TODO: be more selective about this somehow
+        try:
+            for tag in text.find_all(True):
+                tag.attrs = None
+        except Exception as e:
+            print("Trouble cleaning attributes", e)
 
-    text = content.find(id="storytext")
-
-    # clean up some invalid xhtml attributes
-    # TODO: be more selective about this somehow
-    try:
-        for tag in text.find_all(True):
-            tag.attrs = None
-    except Exception as e:
-        print("Trouble cleaning attributes", e)
-
-    return (title, text.prettify())
+        return text.prettify()
diff --git a/sites/spacebattles.py b/sites/spacebattles.py
index b482ca1..653eb9a 100644
--- a/sites/spacebattles.py
+++ b/sites/spacebattles.py
@@ -1,58 +1,97 @@
 #!/usr/bin/python
 
 import re
 
-from bs4 import BeautifulSoup
+from . import register, Site, SiteException
 
 
-def match(url):
-    return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/threads/.*\d+/?.*', url)
+@register
+class SpaceBattles(Site):
+    """SpaceBattles is a forum..."""
 
+    @staticmethod
+    def matches(url):
+        return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/threads/.*\d+/?.*', url)
 
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
+    def extract(self, url):
+        soup = self._soup(url)
 
-    base = soup.head.base.get('href')
+        base = soup.head.base.get('href')
 
-    story = {}
-    story['title'] = str(soup.find('h1').string)
-    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
+        story = {}
+        story['title'] = str(soup.find('h1').string)
+        story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
 
-    threadmarks_link = soup.find(class_="threadmarksTrigger")
-    if not threadmarks_link:
-        print("No threadmarks")
-        return
+        marks = self._chapter_list(url)
 
-    page = fetch(base + threadmarks_link.get('href'))
-    soup = BeautifulSoup(page, 'html5lib')
+        chapters = []
+        for mark in marks:
+            href = mark.get('href')
+            if '/members' in href:
+                continue
+            if not href.startswith('http'):
+                href = base + href
+            chapters.append((str(mark.string), self._chapter(href)))
 
-    marks = soup.select('li.primaryContent.memberListItem')
-    if not marks:
-        print("No marks on threadmarks page")
-        return
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter_list(self, url):
+        soup = self._soup(url)
+
+        threadmarks_link = soup.find(class_="threadmarksTrigger")
+        if not threadmarks_link:
+            raise SiteException("No threadmarks")
+
+        base = soup.head.base.get('href')
+        soup = self._soup(base + threadmarks_link.get('href'))
+
+        marks = soup.select('li.primaryContent.memberListItem a')
+        if not marks:
+            raise SiteException("No marks on threadmarks page")
+
+        return marks
 
-    chapters = []
-    for mark in marks:
-        href = mark.a.get('href')
-        print("Extracting chapter from", href)
-        match = re.match(r'posts/(\d+)/?', href)
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        match = re.match(r'.+/posts/(\d+)/?$', url)
         if not match:
-            match = re.match(r'.+#post-(\d+)$', href)
+            match = re.match(r'.+#post-(\d+)$', url)
         if not match:
-            print("Unparseable threadmark href", href)
+            print("Unparseable threadmark href", url)
         chapter_postid = match and match.group(1)
 
-        chapter_page = fetch(base + href)
-        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
+        chapter_soup = self._soup(url, 'html5lib')
 
         if chapter_postid:
             post = chapter_soup.find('li', id='post-'+chapter_postid)
         else:
             # just the first one in the thread, then
             post = chapter_soup.find('li', class_='message')
+
+        return self._clean_chapter(post)
+
+    def _clean_chapter(self, post):
         post = post.find('blockquote', class_='messageText')
         post.name = 'div'
+        # mostly, we want to remove colors because the Kindle is terrible at them
+        for tag in post.find_all(style=True):
+            del(tag['style'])
+        return post.prettify()
 
-        chapters.append((str(mark.a.string), post.prettify()))
 
-    story['chapters'] = chapters
+@register
+class SpaceBattlesIndex(SpaceBattles):
+    """A spacebattles thread with an index post"""
+    @staticmethod
+    def matches(url):
+        return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/posts/\d+/?.*', url)
 
-    return story
+    def _chapter_list(self, url):
+        soup = self._soup(url)
+
+        post = soup.find('li', id='post-' + re.match(r'.+/posts/(\d+)/?', url).group(1))
+        links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
+        if not links:
+            raise SiteException("No links in index?")
+
+        return links
diff --git a/sites/spacebattles_indexpost.py b/sites/spacebattles_indexpost.py
deleted file mode 100644
index c2f4679..0000000
--- a/sites/spacebattles_indexpost.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/python
-
-import re
-from bs4 import BeautifulSoup
-
-
-def match(url):
-    return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/posts/\d+/?.*', url)
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    base = soup.head.base.get('href')
-
-    match = re.match(r'.+/posts/(\d+)/?', url)
-    if not match:
-        print("Unparseable post URL", url)
-        return
-    postid = match.group(1)
-
-    story = {}
-    story['title'] = str(soup.find('h1').string)
-    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
-
-    post = post = soup.find('li', id='post-'+postid)
-    links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
-    if not links:
-        print("No links in index?")
-
-    chapters = []
-    for link in links:
-        href = link.get('href')
-        if '/members/' in href:
-            # skip links to users
-            continue
-        if not href.startswith('http'):
-            href = base + href
-        print("Extracting chapter from", href)
-        match = re.match(r'.+#post-(\d+)$', href)
-        if not match:
-            match = re.match(r'.+/posts/(\d+)/?$', href)
-        if not match:
-            print("Unparseable index link href", href)
-        chapter_postid = match and match.group(1)
-        chapter_page = fetch(href)
-        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
-
-        if chapter_postid:
-            post = chapter_soup.find('li', id='post-'+chapter_postid)
-        else:
-            # just the first one in the thread, then
-            post = chapter_soup.find('li', class_='message')
-        post = post.find('blockquote', class_='messageText')
-        post.name = 'div'
-
-        chapters.append((str(link.string), post.prettify()))
-
-    story['chapters'] = chapters
-
-    return story
diff --git a/sites/stash.py b/sites/stash.py
index b83139f..d414ac4 100644
--- a/sites/stash.py
+++ b/sites/stash.py
@@ -1,62 +1,61 @@
 #!/usr/bin/python
 
 import re
 
-from bs4 import BeautifulSoup
+from . import register, Site, SiteException
 
 
-def match(url):
-    # Need a stack page
-    return re.match(r'^https?://sta\.sh/2.+/?.*', url)
+@register
+class Stash(Site):
+    @staticmethod
+    def matches(url):
+        # Need a stack page
+        return re.match(r'^https?://sta\.sh/2.+/?.*', url)
 
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="stash-body")
+        if not content:
+            return
 
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="stash-body")
-    if not content:
-        return
+        story = {}
+        chapters = []
 
-    story = {}
-    chapters = []
+        # metadata = content.find(id='profile_top')
+        story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
+        story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
 
-    # metadata = content.find(id='profile_top')
-    story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
-    story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+        thumbs = content.select(".stash-folder-stream .thumb")
+        if not thumbs:
+            return
+        for thumb in thumbs:
+            try:
+                if thumb['href'] != '#':
+                    chapters.append(self._chapter(thumb['href']))
+            except Exception as e:
+                print(e)
 
-    thumbs = content.select(".stash-folder-stream .thumb")
-    if not thumbs:
-        return
-    for thumb in thumbs:
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        soup = self._soup(url)
+
+        content = soup.find(class_="journal-wrapper")
+        if not content:
+            raise SiteException("No content")
+
+        title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)
+
+        text = content.find(class_="text")
+
+        # clean up some invalid xhtml attributes
+        # TODO: be more selective about this somehow
         try:
-            if thumb['href'] is not '#':
-                chapters.append(_extract_chapter(thumb['href'], fetch))
+            for tag in text.find_all(True):
+                tag.attrs = None
         except Exception as e:
-            print(e)
+            raise SiteException("Trouble cleaning attributes", e)
 
-    story['chapters'] = chapters
-
-    return story
-
-
-def _extract_chapter(url, fetch):
-    print("Extracting chapter from", url)
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    content = soup.find(class_="journal-wrapper")
-    if not content:
-        raise Exception("No content")
-
-    title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)
-
-    text = content.find(class_="text")
-
-    # clean up some invalid xhtml attributes
-    # TODO: be more selective about this somehow
-    try:
-        for tag in text.find_all(True):
-            tag.attrs = None
-    except Exception as e:
-        raise Exception("Trouble cleaning attributes", e)
-
-    return (title, text.prettify())
+        return (title, text.prettify())