Mirror of https://github.com/kemayo/leech

Change sites strategy to use classes and inheritance

David Lynch 2015-09-14 00:38:02 -05:00
parent 1795c717e9
commit 2aba80be24
7 changed files with 238 additions and 243 deletions

leech.py

@@ -4,6 +4,7 @@ import argparse
 import importlib
 import os
+import sites
 import epub
 from fetch import Fetch
@@ -25,11 +26,12 @@ html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 def leech(url, filename=None):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
-    site = _get_site(url)
+    site = sites.get(url)
     if not site:
         raise Exception("No site handler found")
 
-    story = site.extract(url, fetch)
+    handler = site(fetch)
+    story = handler.extract(url)
     if not story:
         raise Exception("Couldn't extract story")
@@ -48,26 +50,7 @@ def leech(url, filename=None):
     return filename
 
 
-_sites = []
-
-
-def _get_site(url):
-    for site in _sites:
-        if site.match(url):
-            return site
-
-
-def _load_sites():
-    dirname = os.path.join(os.path.dirname(__file__), 'sites')
-    for f in os.listdir(dirname):
-        if not f.endswith('.py'):
-            continue
-        mod = importlib.import_module('sites.' + f.replace('.py', ''))
-        _sites.append(mod)
-
-
 if __name__ == '__main__':
-    _load_sites()
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch")
     parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)")

@@ -75,4 +58,3 @@ if __name__ == '__main__':
     filename = leech(args.url, filename=args.filename)
     print("File created:", filename)

sites/__init__.py (new file, 38 lines)

@@ -0,0 +1,38 @@
+from bs4 import BeautifulSoup
+
+_sites = []
+
+class Site:
+    """A Site handles checking whether a URL might represent a site, and then
+    extracting the content of a story from said site.
+    """
+    def __init__(self, fetch):
+        super().__init__()
+        self.fetch = fetch
+
+    @staticmethod
+    def matches(url):
+        raise NotImplementedError()
+
+    def extract(self, url):
+        raise NotImplementedError()
+
+    def _soup(self, url, method='html5lib'):
+        page = self.fetch(url)
+        return BeautifulSoup(page, method)
+
+class SiteException(Exception):
+    pass
+
+def register(site_class):
+    _sites.append(site_class)
+    return site_class
+
+def get(url):
+    for site_class in _sites:
+        if site_class.matches(url):
+            return site_class
+
+# And now, the things that will use this:
+from . import spacebattles, fanfictionnet, deviantart, stash
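
For orientation, here is how a handler plugs into this registry once the commit lands. This is a minimal sketch, not part of the commit: ExampleSite, the example.com URL, and the selectors are made up; register, Site, SiteException, sites.get, and the site(fetch) / extract(url) call sequence are the names this diff actually introduces.

    # sites/example.py -- hypothetical module, for illustration only
    import re

    from . import register, Site, SiteException


    @register                  # register() appends the class to _sites and returns it
    class ExampleSite(Site):
        """Made-up handler for http://example.com/story/<id> pages."""
        @staticmethod
        def matches(url):
            return re.match(r'^https?://example\.com/story/\d+', url)

        def extract(self, url):
            soup = self._soup(url)             # fetch + parse, provided by Site
            title = soup.find('h1')
            if not title:
                raise SiteException("No title found")
            return {
                'title': str(title.string),
                'author': 'unknown',
                'chapters': [(str(title.string), soup.body.prettify())],
            }

    # Caller side, mirroring the new leech.py flow:
    #   site = sites.get(url)       # first registered class whose matches() hits
    #   handler = site(fetch)       # instantiate with a fetch callable
    #   story = handler.extract(url)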

sites/deviantart.py

@@ -1,44 +1,42 @@
 #!/usr/bin/python
 
 import re
-from bs4 import BeautifulSoup
 
-from .stash import _extract_chapter
+from . import register
+from .stash import Stash
 
 
-def match(url):
-    # Need a collection page
-    return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="output")
-    if not content:
-        return
-
-    story = {}
-    chapters = []
-
-    if "gallery" in url:
-        story['author'] = str(content.select('h1 a.u')[0].string)
-    else:
-        authors = set(str(author.string) for author in content.select('.stream .details a.u'))
-        story['author'] = ', '.join(authors)
-
-    story['title'] = str(content.find(class_="folder-title").string)
-
-    thumbs = content.select(".stream a.thumb")
-    if not thumbs:
-        return
-    for thumb in thumbs:
-        try:
-            if thumb['href'] is not '#':
-                chapters.append(_extract_chapter(thumb['href'], fetch))
-        except Exception as e:
-            print(e)
-
-    story['chapters'] = chapters
-
-    return story
+@register
+class DeviantArt(Stash):
+    @staticmethod
+    def matches(url):
+        # Need a collection page
+        return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
+
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="output")
+        if not content:
+            return
+
+        story = {}
+        chapters = []
+
+        if "gallery" in url:
+            story['author'] = str(content.select('h1 a.u')[0].string)
+        else:
+            authors = set(str(author.string) for author in content.select('.stream .details a.u'))
+            story['author'] = ', '.join(authors)
+
+        story['title'] = str(content.find(class_="folder-title").string)
+
+        thumbs = content.select(".stream a.thumb")
+        if not thumbs:
+            return
+        for thumb in thumbs:
+            try:
+                if thumb['href'] is not '#':
+                    chapters.append(self._chapter(thumb['href']))
+            except Exception as e:
+                print(e)
+
+        story['chapters'] = chapters
+
+        return story

sites/fanfictionnet.py

@@ -1,64 +1,64 @@
 #!/usr/bin/python
 
 import re
-from bs4 import BeautifulSoup
+
+from . import register, Site, SiteException
 
 
-def match(url):
-    ## e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
-    return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
-
-    story = {}
-    chapters = []
-
-    metadata = content.find(id='profile_top')
-    story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
-    story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
-
-    chapter_select = content.find(id="chap_select")
-    if chapter_select:
-        base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
-        if not base_url:
-            return
-        base_url = base_url.group(0)
-
-        # beautiful soup doesn't handle ffn's unclosed option tags at all well here
-        options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
-        for option in options:
-            chapters.append(_extract_chapter(base_url + option[0], option[1], fetch))
-    else:
-        chapters.append(_extract_chapter(url, story['title'], fetch))
-
-    story['chapters'] = chapters
-
-    return story
-
-
-def _extract_chapter(url, title, fetch):
-    print("Extracting chapter from", url)
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
-
-    text = content.find(id="storytext")
-
-    # clean up some invalid xhtml attributes
-    # TODO: be more selective about this somehow
-    try:
-        for tag in text.find_all(True):
-            tag.attrs = None
-    except Exception as e:
-        print("Trouble cleaning attributes", e)
-
-    return (title, text.prettify())
+@register
+class FanFictionNet(Site):
+    """FFN: it has a lot of stuff"""
+    @staticmethod
+    def matches(url):
+        # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
+        return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
+
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="content_wrapper_inner")
+        if not content:
+            raise SiteException("No content")
+
+        story = {}
+        chapters = []
+
+        metadata = content.find(id='profile_top')
+        story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
+        story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+
+        chapter_select = content.find(id="chap_select")
+        if chapter_select:
+            base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
+            if not base_url:
+                raise SiteException("Can't find base URL for chapters")
+            base_url = base_url.group(0)
+
+            # beautiful soup doesn't handle ffn's unclosed option tags at all well here
+            options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
+            for option in options:
+                chapters.append((option[1], self._chapter(base_url + option[0])))
+        else:
+            chapters.append((story['title'], self._extract_chapter(url)))
+
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        soup = self._soup(url)
+        content = soup.find(id="content_wrapper_inner")
+        if not content:
+            raise SiteException("No chapter content")
+
+        text = content.find(id="storytext")
+
+        # clean up some invalid xhtml attributes
+        # TODO: be more selective about this somehow
+        try:
+            for tag in text.find_all(True):
+                tag.attrs = None
+        except Exception as e:
+            print("Trouble cleaning attributes", e)
+
+        return text.prettify()

sites/spacebattles.py

@@ -1,58 +1,97 @@
 #!/usr/bin/python
 
 import re
-from bs4 import BeautifulSoup
+
+from . import register, Site, SiteException
 
 
-def match(url):
-    return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/threads/.*\d+/?.*', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    base = soup.head.base.get('href')
-
-    threadmarks_link = soup.find(class_="threadmarksTrigger")
-    if not threadmarks_link:
-        print("No threadmarks")
-        return
-
-    story = {}
-    story['title'] = str(soup.find('h1').string)
-    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
-
-    page = fetch(base + threadmarks_link.get('href'))
-    soup = BeautifulSoup(page, 'html5lib')
-
-    marks = soup.select('li.primaryContent.memberListItem')
-    if not marks:
-        print("No marks on threadmarks page")
-        return
-
-    chapters = []
-    for mark in marks:
-        href = mark.a.get('href')
-        print("Extracting chapter from", href)
-        match = re.match(r'posts/(\d+)/?', href)
-        if not match:
-            match = re.match(r'.+#post-(\d+)$', href)
-        if not match:
-            print("Unparseable threadmark href", href)
-        chapter_postid = match and match.group(1)
-        chapter_page = fetch(base + href)
-        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
-        if chapter_postid:
-            post = chapter_soup.find('li', id='post-'+chapter_postid)
-        else:
-            # just the first one in the thread, then
-            post = chapter_soup.find('li', class_='message')
-        post = post.find('blockquote', class_='messageText')
-        post.name = 'div'
-        chapters.append((str(mark.a.string), post.prettify()))
-
-    story['chapters'] = chapters
-
-    return story
+@register
+class SpaceBattles(Site):
+    """SpaceBattles is a forum..."""
+    @staticmethod
+    def matches(url):
+        return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/threads/.*\d+/?.*', url)
+
+    def extract(self, url):
+        soup = self._soup(url)
+
+        story = {}
+        story['title'] = str(soup.find('h1').string)
+        story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
+
+        base = soup.head.base.get('href')
+
+        marks = self._chapter_list(url)
+
+        chapters = []
+        for mark in marks:
+            href = mark.get('href')
+            if '/members' in href:
+                continue
+            if not href.startswith('http'):
+                href = base + href
+            chapters.append((str(mark.string), self._chapter(href)))
+
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter_list(self, url):
+        soup = self._soup(url)
+
+        threadmarks_link = soup.find(class_="threadmarksTrigger")
+        if not threadmarks_link:
+            raise SiteException("No threadmarks")
+
+        base = soup.head.base.get('href')
+        soup = self._soup(base + threadmarks_link.get('href'))
+
+        marks = soup.select('li.primaryContent.memberListItem a')
+        if not marks:
+            raise SiteException("No marks on threadmarks page")
+
+        return marks
+
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        match = re.match(r'posts/(\d+)/?', url)
+        if not match:
+            match = re.match(r'.+#post-(\d+)$', url)
+        if not match:
+            print("Unparseable threadmark href", url)
+        chapter_postid = match and match.group(1)
+
+        chapter_soup = self._soup(url, 'html5lib')
+
+        if chapter_postid:
+            post = chapter_soup.find('li', id='post-'+chapter_postid)
+        else:
+            # just the first one in the thread, then
+            post = chapter_soup.find('li', class_='message')
+
+        return self._clean_chapter(post)
+
+    def _clean_chapter(self, post):
+        post = post.find('blockquote', class_='messageText')
+        post.name = 'div'
+        # mostly, we want to remove colors because the Kindle is terrible at them
+        for tag in post.find_all(style=True):
+            del(tag['style'])
+        return post.prettify()
+
+
+@register
+class SpaceBattlesIndex(SpaceBattles):
+    """A spacebattles thread with an index post"""
+    @staticmethod
+    def match(url):
+        return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/posts/\d+/?.*', url)
+
+    def _chapter_list(self, url):
+        soup = self._soup(url)
+
+        post = post = soup.find('li', id='post-'+postid)
+        links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
+        if not links:
+            raise SiteException("No links in index?")
+
+        return links

(deleted file: the old module-level handler for SpaceBattles index posts, superseded by SpaceBattlesIndex above)

@@ -1,61 +0,0 @@
-#!/usr/bin/python
-
-import re
-from bs4 import BeautifulSoup
-
-
-def match(url):
-    return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/posts/\d+/?.*', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    base = soup.head.base.get('href')
-
-    match = re.match(r'.+/posts/(\d+)/?', url)
-    if not match:
-        print("Unparseable post URL", url)
-        return
-    postid = match.group(1)
-
-    story = {}
-    story['title'] = str(soup.find('h1').string)
-    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
-
-    post = post = soup.find('li', id='post-'+postid)
-    links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
-    if not links:
-        print("No links in index?")
-
-    chapters = []
-    for link in links:
-        href = link.get('href')
-        if '/members/' in href:
-            # skip links to users
-            continue
-        if not href.startswith('http'):
-            href = base + href
-        print("Extracting chapter from", href)
-        match = re.match(r'.+#post-(\d+)$', href)
-        if not match:
-            match = re.match(r'.+/posts/(\d+)/?$', href)
-        if not match:
-            print("Unparseable index link href", href)
-        chapter_postid = match and match.group(1)
-        chapter_page = fetch(href)
-        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
-        if chapter_postid:
-            post = chapter_soup.find('li', id='post-'+chapter_postid)
-        else:
-            # just the first one in the thread, then
-            post = chapter_soup.find('li', class_='message')
-        post = post.find('blockquote', class_='messageText')
-        post.name = 'div'
-        chapters.append((str(link.string), post.prettify()))
-
-    story['chapters'] = chapters
-
-    return story

sites/stash.py

@@ -1,62 +1,61 @@
 #!/usr/bin/python
 
 import re
-from bs4 import BeautifulSoup
+
+from . import register, Site, SiteException
 
 
-def match(url):
-    # Need a stack page
-    return re.match(r'^https?://sta\.sh/2.+/?.*', url)
-
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="stash-body")
-    if not content:
-        return
-
-    story = {}
-    chapters = []
-
-    # metadata = content.find(id='profile_top')
-    story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
-    story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
-
-    thumbs = content.select(".stash-folder-stream .thumb")
-    if not thumbs:
-        return
-    for thumb in thumbs:
-        try:
-            if thumb['href'] is not '#':
-                chapters.append(_extract_chapter(thumb['href'], fetch))
-        except Exception as e:
-            print(e)
-
-    story['chapters'] = chapters
-
-    return story
-
-
-def _extract_chapter(url, fetch):
-    print("Extracting chapter from", url)
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(class_="journal-wrapper")
-    if not content:
-        raise Exception("No content")
-
-    title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)
-
-    text = content.find(class_="text")
-
-    # clean up some invalid xhtml attributes
-    # TODO: be more selective about this somehow
-    try:
-        for tag in text.find_all(True):
-            tag.attrs = None
-    except Exception as e:
-        raise Exception("Trouble cleaning attributes", e)
-
-    return (title, text.prettify())
+@register
+class Stash(Site):
+    @staticmethod
+    def matches(url):
+        # Need a stack page
+        return re.match(r'^https?://sta\.sh/2.+/?.*', url)
+
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="stash-body")
+        if not content:
+            return
+
+        story = {}
+        chapters = []
+
+        # metadata = content.find(id='profile_top')
+        story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
+        story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+
+        thumbs = content.select(".stash-folder-stream .thumb")
+        if not thumbs:
+            return
+        for thumb in thumbs:
+            try:
+                if thumb['href'] is not '#':
+                    chapters.append(self._chapter(thumb['href']))
+            except Exception as e:
+                print(e)
+
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        soup = self._soup(url)
+        content = soup.find(class_="journal-wrapper")
+        if not content:
+            raise SiteException("No content")
+
+        title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)
+
+        text = content.find(class_="text")
+
+        # clean up some invalid xhtml attributes
+        # TODO: be more selective about this somehow
+        try:
+            for tag in text.find_all(True):
+                tag.attrs = None
+        except Exception as e:
+            raise SiteException("Trouble cleaning attributes", e)
+
+        return (title, text.prettify())
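
The inheritance half of the new strategy is visible in the DeviantArt/Stash pair: DeviantArt overrides matches() and extract() for its own listing markup, while chapter extraction comes from Stash unchanged. A quick check of that reuse, assuming the package is importable and DeviantArt is registered as reconstructed above (illustration only, not part of the commit):

    from sites.stash import Stash
    from sites.deviantart import DeviantArt

    print(issubclass(DeviantArt, Stash))          # True
    print(DeviantArt._chapter is Stash._chapter)  # True: one shared chapter scraper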