From 2aba80be24ecaa07399dd6a1410c9f177cac16bd Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Mon, 14 Sep 2015 00:38:02 -0500
Subject: [PATCH] Change sites strategy to use classes and inheritance
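
The sites package now exposes a small plugin API: subclass Site, implement
matches() and extract(), and apply the @register decorator. sites.get(url)
returns the first registered class whose matches() accepts the URL, and
leech.py instantiates it with a fetch callable. As a rough sketch of the
shape a handler takes (Example and its URL pattern are hypothetical, not
part of this patch):

    import re
    from . import register, Site

    @register
    class Example(Site):
        @staticmethod
        def matches(url):
            # hypothetical pattern, for illustration only
            return re.match(r'^https?://example\.com/story/\d+', url)

        def extract(self, url):
            # self._soup() comes from Site: fetch the URL, parse it with html5lib
            soup = self._soup(url)
            # a story is a dict carrying a list of (title, html) chapter tuples
            return {
                'title': str(soup.find('h1').string),
                'author': 'unknown',
                'chapters': [('Chapter 1', soup.find('div').prettify())],
            }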
---
 leech.py                        |  26 ++------
 sites/__init__.py               |  38 ++++++++++++
 sites/deviantart.py             |  62 ++++++++++---------
 sites/fanfictionnet.py          |  94 ++++++++++++++---------------
 sites/spacebattles.py           | 103 ++++++++++++++++++++++----------
 sites/spacebattles_indexpost.py |  61 -------------------
 sites/stash.py                  |  97 +++++++++++++++--------------
 7 files changed, 238 insertions(+), 243 deletions(-)
 create mode 100644 sites/__init__.py
 delete mode 100644 sites/spacebattles_indexpost.py

diff --git a/leech.py b/leech.py
index 7693030..e27d2ed 100755
--- a/leech.py
+++ b/leech.py
@@ -4,6 +4,7 @@
 import argparse
 import importlib
 import os
+import sites
 
 import epub
 from fetch import Fetch
@@ -25,11 +26,12 @@ html_template = '''
 def leech(url, filename=None):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
-    site = _get_site(url)
+    site = sites.get(url)
     if not site:
         raise Exception("No site handler found")
 
-    story = site.extract(url, fetch)
+    handler = site(fetch)
+    story = handler.extract(url)
     if not story:
         raise Exception("Couldn't extract story")
 
@@ -48,26 +50,7 @@ def leech(url, filename=None):
     return filename
 
-_sites = []
-
-
-def _get_site(url):
-    for site in _sites:
-        if site.match(url):
-            return site
-
-
-def _load_sites():
-    dirname = os.path.join(os.path.dirname(__file__), 'sites')
-    for f in os.listdir(dirname):
-        if not f.endswith('.py'):
-            continue
-        mod = importlib.import_module('sites.' + f.replace('.py', ''))
-        _sites.append(mod)
-
-
 if __name__ == '__main__':
-    _load_sites()
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch")
     parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)")
 
@@ -75,4 +58,3 @@ if __name__ == '__main__':
 
     filename = leech(args.url, filename=args.filename)
     print("File created:", filename)
-
diff --git a/sites/__init__.py b/sites/__init__.py
new file mode 100644
index 0000000..f83c2e3
--- /dev/null
+++ b/sites/__init__.py
@@ -0,0 +1,38 @@
+
+from bs4 import BeautifulSoup
+
+_sites = []
+
+class Site:
+    """A Site handles checking whether a URL might represent a site, and then
+    extracting the content of a story from said site.
+    """
+    def __init__(self, fetch):
+        super().__init__()
+        self.fetch = fetch
+
+    @staticmethod
+    def matches(url):
+        raise NotImplementedError()
+
+    def extract(self, url):
+        raise NotImplementedError()
+
+    def _soup(self, url, method='html5lib'):
+        page = self.fetch(url)
+        return BeautifulSoup(page, method)
+
+class SiteException(Exception):
+    pass
+
+def register(site_class):
+    _sites.append(site_class)
+    return site_class
+
+def get(url):
+    for site_class in _sites:
+        if site_class.matches(url):
+            return site_class
+
+# And now, the things that will use this:
+from . import spacebattles, fanfictionnet, deviantart, stash
diff --git a/sites/deviantart.py b/sites/deviantart.py
index 01d024e..97d15e8 100644
--- a/sites/deviantart.py
+++ b/sites/deviantart.py
@@ -1,44 +1,44 @@
 #!/usr/bin/python
 
 import re
 
-from bs4 import BeautifulSoup
-from .stash import _extract_chapter
+from . import register
+from .stash import Stash
 
 
+@register
+class DeviantArt(Stash):
+    @staticmethod
+    def matches(url):
+        # Need a collection page
+        return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
 
-def match(url):
-    # Need a collection page
-    return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="output")
+        if not content:
+            return
 
+        story = {}
+        chapters = []
 
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="output")
-    if not content:
-        return
+        if "gallery" in url:
+            story['author'] = str(content.select('h1 a.u')[0].string)
+        else:
+            authors = set(str(author.string) for author in content.select('.stream .details a.u'))
+            story['author'] = ', '.join(authors)
 
-    story = {}
-    chapters = []
+        story['title'] = str(content.find(class_="folder-title").string)
 
-    if "gallery" in url:
-        story['author'] = str(content.select('h1 a.u')[0].string)
-    else:
-        authors = set(str(author.string) for author in content.select('.stream .details a.u'))
-        story['author'] = ', '.join(authors)
+        thumbs = content.select(".stream a.thumb")
+        if not thumbs:
+            return
+        for thumb in thumbs:
+            try:
+                if thumb['href'] != '#':
+                    chapters.append(self._chapter(thumb['href']))
+            except Exception as e:
+                print(e)
 
-    story['title'] = str(content.find(class_="folder-title").string)
+        story['chapters'] = chapters
 
-    thumbs = content.select(".stream a.thumb")
-    if not thumbs:
-        return
-    for thumb in thumbs:
-        try:
-            if thumb['href'] is not '#':
-                chapters.append(_extract_chapter(thumb['href'], fetch))
-        except Exception as e:
-            print(e)
-
-    story['chapters'] = chapters
-
-    return story
+        return story
diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py
index f6b3994..19d4044 100644
--- a/sites/fanfictionnet.py
+++ b/sites/fanfictionnet.py
@@ -1,64 +1,64 @@
 #!/usr/bin/python
 
 import re
 
-from bs4 import BeautifulSoup
+from . import register, Site, SiteException
 
 
-def match(url):
-    ## e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
-    return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
+@register
+class FanFictionNet(Site):
+    """FFN: it has a lot of stuff"""
+    @staticmethod
+    def matches(url):
+        # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
+        return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
 
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="content_wrapper_inner")
+        if not content:
+            raise SiteException("No content")
 
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
+        story = {}
+        chapters = []
 
-    story = {}
-    chapters = []
+        metadata = content.find(id='profile_top')
+        story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
+        story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
 
-    metadata = content.find(id='profile_top')
-    story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
-    story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+        chapter_select = content.find(id="chap_select")
+        if chapter_select:
+            base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
+            if not base_url:
+                raise SiteException("Can't find base URL for chapters")
+            base_url = base_url.group(0)
 
-    chapter_select = content.find(id="chap_select")
-    if chapter_select:
-        base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
-        if not base_url:
-            return
-        base_url = base_url.group(0)
+            # beautiful soup doesn't handle ffn's unclosed option tags at all well here
+            options = re.findall(r'<option value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
+            for option in options:
+                chapters.append((option[1], self._chapter(base_url + option[0])))
+        else:
+            chapters.append((story['title'], self._chapter(url)))
 
-        # beautiful soup doesn't handle ffn's unclosed option tags at all well here
-        options = re.findall(r'<option value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
-        for option in options:
-            chapters.append(_extract_chapter(base_url + option[0], option[1], fetch))
-    else:
-        chapters.append(_extract_chapter(url, story['title'], fetch))
+        story['chapters'] = chapters
 
-    story['chapters'] = chapters
+        return story
 
-    return story
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        soup = self._soup(url)
 
+        content = soup.find(id="content_wrapper_inner")
+        if not content:
+            raise SiteException("No chapter content")
 
-def _extract_chapter(url, title, fetch):
-    print("Extracting chapter from", url)
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
+        text = content.find(id="storytext")
 
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
+        # clean up some invalid xhtml attributes
+        # TODO: be more selective about this somehow
+        try:
+            for tag in text.find_all(True):
+                tag.attrs = None
+        except Exception as e:
+            print("Trouble cleaning attributes", e)
 
-    text = content.find(id="storytext")
-
-    # clean up some invalid xhtml attributes
-    # TODO: be more selective about this somehow
-    try:
-        for tag in text.find_all(True):
-            tag.attrs = None
-    except Exception as e:
-        print("Trouble cleaning attributes", e)
-
-    return (title, text.prettify())
+        return text.prettify()
diff --git a/sites/spacebattles.py b/sites/spacebattles.py
index b482ca1..653eb9a 100644
--- a/sites/spacebattles.py
+++ b/sites/spacebattles.py
@@ -1,58 +1,97 @@
 #!/usr/bin/python
 
 import re
 
-from bs4 import BeautifulSoup
+from . import register, Site, SiteException
 
 
-def match(url):
-    return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/threads/.*\d+/?.*', url)
+@register
+class SpaceBattles(Site):
+    """SpaceBattles is a forum..."""
 
+    @staticmethod
+    def matches(url):
+        return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/threads/.*\d+/?.*', url)
 
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
+    def extract(self, url):
+        soup = self._soup(url)
 
-    base = soup.head.base.get('href')
+        base = soup.head.base.get('href')
 
-    story = {}
-    story['title'] = str(soup.find('h1').string)
-    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
+        story = {}
+        story['title'] = str(soup.find('h1').string)
+        story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
 
-    threadmarks_link = soup.find(class_="threadmarksTrigger")
-    if not threadmarks_link:
-        print("No threadmarks")
-        return
+        marks = self._chapter_list(url)
 
-    page = fetch(base + threadmarks_link.get('href'))
-    soup = BeautifulSoup(page, 'html5lib')
+        chapters = []
+        for mark in marks:
+            href = mark.get('href')
+            if '/members' in href:
+                continue
+            if not href.startswith('http'):
+                href = base + href
+            chapters.append((str(mark.string), self._chapter(href)))
 
-    marks = soup.select('li.primaryContent.memberListItem')
-    if not marks:
-        print("No marks on threadmarks page")
-        return
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter_list(self, url):
+        soup = self._soup(url)
+
+        threadmarks_link = soup.find(class_="threadmarksTrigger")
+        if not threadmarks_link:
+            raise SiteException("No threadmarks")
+
+        base = soup.head.base.get('href')
+        soup = self._soup(base + threadmarks_link.get('href'))
+
+        marks = soup.select('li.primaryContent.memberListItem a')
+        if not marks:
+            raise SiteException("No marks on threadmarks page")
+
+        return marks
 
-    chapters = []
-    for mark in marks:
-        href = mark.a.get('href')
-        print("Extracting chapter from", href)
-        match = re.match(r'posts/(\d+)/?', href)
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        match = re.match(r'.+/posts/(\d+)/?$', url)
         if not match:
-            match = re.match(r'.+#post-(\d+)$', href)
+            match = re.match(r'.+#post-(\d+)$', url)
         if not match:
-            print("Unparseable threadmark href", href)
+            print("Unparseable threadmark href", url)
         chapter_postid = match and match.group(1)
 
-        chapter_page = fetch(base + href)
-        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
+        chapter_soup = self._soup(url, 'html5lib')
 
         if chapter_postid:
             post = chapter_soup.find('li', id='post-'+chapter_postid)
         else:
             # just the first one in the thread, then
             post = chapter_soup.find('li', class_='message')
+
+        return self._clean_chapter(post)
+
+    def _clean_chapter(self, post):
         post = post.find('blockquote', class_='messageText')
         post.name = 'div'
+        # mostly, we want to remove colors because the Kindle is terrible at them
+        for tag in post.find_all(style=True):
+            del(tag['style'])
+        return post.prettify()
 
-        chapters.append((str(mark.a.string), post.prettify()))
 
-    story['chapters'] = chapters
+@register
+class SpaceBattlesIndex(SpaceBattles):
+    """A spacebattles thread with an index post"""
+    @staticmethod
+    def matches(url):
+        return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/posts/\d+/?.*', url)
 
-    return story
+    def _chapter_list(self, url):
+        soup = self._soup(url)
+
+        post = soup.find('li', id='post-' + re.match(r'.+/posts/(\d+)/?', url).group(1))
+        links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
+        if not links:
+            raise SiteException("No links in index?")
+
+        return links
diff --git a/sites/spacebattles_indexpost.py b/sites/spacebattles_indexpost.py
deleted file mode 100644
index c2f4679..0000000
--- a/sites/spacebattles_indexpost.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/python
-
-import re
-from bs4 import BeautifulSoup
-
-
-def match(url):
-    return re.match(r'^https?://forums.(?:spacebattles|sufficientvelocity).com/posts/\d+/?.*', url)
-
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    base = soup.head.base.get('href')
-
-    match = re.match(r'.+/posts/(\d+)/?', url)
-    if not match:
-        print("Unparseable post URL", url)
-        return
-    postid = match.group(1)
-
-    story = {}
-    story['title'] = str(soup.find('h1').string)
-    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
-
-    post = post = soup.find('li', id='post-'+postid)
-    links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
-    if not links:
-        print("No links in index?")
-
-    chapters = []
-    for link in links:
-        href = link.get('href')
-        if '/members/' in href:
-            # skip links to users
-            continue
-        if not href.startswith('http'):
-            href = base + href
-        print("Extracting chapter from", href)
-        match = re.match(r'.+#post-(\d+)$', href)
-        if not match:
-            match = re.match(r'.+/posts/(\d+)/?$', href)
-        if not match:
-            print("Unparseable index link href", href)
-        chapter_postid = match and match.group(1)
-        chapter_page = fetch(href)
-        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
-
-        if chapter_postid:
-            post = chapter_soup.find('li', id='post-'+chapter_postid)
-        else:
-            # just the first one in the thread, then
-            post = chapter_soup.find('li', class_='message')
-        post = post.find('blockquote', class_='messageText')
-        post.name = 'div'
-
-        chapters.append((str(link.string), post.prettify()))
-
-    story['chapters'] = chapters
-
-    return story
diff --git a/sites/stash.py b/sites/stash.py
index b83139f..d414ac4 100644
--- a/sites/stash.py
+++ b/sites/stash.py
@@ -1,62 +1,61 @@
 #!/usr/bin/python
 
 import re
 
-from bs4 import BeautifulSoup
+from . import register, Site, SiteException
 
 
-def match(url):
-    # Need a stack page
-    return re.match(r'^https?://sta\.sh/2.+/?.*', url)
+@register
+class Stash(Site):
+    @staticmethod
+    def matches(url):
+        # Need a stack page
+        return re.match(r'^https?://sta\.sh/2.+/?.*', url)
 
+    def extract(self, url):
+        soup = self._soup(url)
+        content = soup.find(id="stash-body")
+        if not content:
+            return
 
-def extract(url, fetch):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="stash-body")
-    if not content:
-        return
+        story = {}
+        chapters = []
 
-    story = {}
-    chapters = []
+        # metadata = content.find(id='profile_top')
+        story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
+        story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
 
-    # metadata = content.find(id='profile_top')
-    story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
-    story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+        thumbs = content.select(".stash-folder-stream .thumb")
+        if not thumbs:
+            return
+        for thumb in thumbs:
+            try:
+                if thumb['href'] != '#':
+                    chapters.append(self._chapter(thumb['href']))
+            except Exception as e:
+                print(e)
 
-    thumbs = content.select(".stash-folder-stream .thumb")
-    if not thumbs:
-        return
-    for thumb in thumbs:
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        soup = self._soup(url)
+
+        content = soup.find(class_="journal-wrapper")
+        if not content:
+            raise SiteException("No content")
+
+        title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)
+
+        text = content.find(class_="text")
+
+        # clean up some invalid xhtml attributes
+        # TODO: be more selective about this somehow
         try:
-            if thumb['href'] is not '#':
-                chapters.append(_extract_chapter(thumb['href'], fetch))
+            for tag in text.find_all(True):
+                tag.attrs = None
         except Exception as e:
-            print(e)
+            raise SiteException("Trouble cleaning attributes", e)
 
-    story['chapters'] = chapters
-
-    return story
-
-
-def _extract_chapter(url, fetch):
-    print("Extracting chapter from", url)
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    content = soup.find(class_="journal-wrapper")
-    if not content:
-        raise Exception("No content")
-
-    title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)
-
-    text = content.find(class_="text")
-
-    # clean up some invalid xhtml attributes
-    # TODO: be more selective about this somehow
-    try:
-        for tag in text.find_all(True):
-            tag.attrs = None
-    except Exception as e:
-        raise Exception("Trouble cleaning attributes", e)
-
-    return (title, text.prettify())
+        return (title, text.prettify())