mirror of https://github.com/kemayo/leech

Split out per-site story extractors

David Lynch 2013-12-24 02:21:23 -06:00
parent a571e2c17b
commit 91b7ed3917
2 changed files with 81 additions and 51 deletions
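
The change below gives leech.py a small plugin contract: every module dropped into sites/ is imported by _load_sites(), and the first one whose match(url) returns something truthy handles the page via extract(url, fetch). As a rough sketch of what that contract asks of a new site module, going only by the calls visible in this diff (the module name, URL pattern, and placeholder values are made up for illustration):

#!/usr/bin/python
# sites/examplearchive.py -- hypothetical module name, shown only to illustrate the contract
import re

def match(url):
    # leech.py only checks truthiness, so returning the re.match result directly is enough
    return re.match(r'^https?://stories\.example\.com/\d+', url)

def extract(url, fetch):
    # leech.py consumes at least story['title'] (and, going by the FFN extractor, 'author');
    # chapters are (title, html) tuples
    page = fetch(url)  # placeholder: a real module would parse `page` here
    return {
        'title': 'Example Story',
        'author': 'Example Author',
        'chapters': [('Chapter 1', '<p>Chapter text would go here.</p>')],
    }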

leech.py

@@ -1,7 +1,7 @@
 #!/usr/bin/python
-import re
-from bs4 import BeautifulSoup
+import importlib
+import os
 import epub
 from fetch import Fetch
@@ -21,7 +21,13 @@ html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 '''
 def leech(url):
-    story = _extract(url)
+    # we have: a page, which could be absolutely any part of a story, or not a story at all
+    # check a bunch of things which are completely ff.n specific, to get text from it
+    site = _get_site(url)
+    if not site:
+        return
+    story = site.extract(url, fetch)
     metadata = {
         'title': story['title'],
@@ -34,56 +40,23 @@ def leech(url):
     epub.make_epub(story['title'] + '.epub', html, metadata)
-def _extract(url):
-    # we have: a page, which could be absolutely any part of a story, or not a story at all
-    # check a bunch of things which are completely ff.n specific, to get text from it
-    page = fetch(url)
-    soup = BeautifulSoup(page)
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
-    story = {}
-    chapters = []
-    metadata = content.find(id='profile_top')
-    story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
-    story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
-    chapter_select = content.find(id="chap_select")
-    if chapter_select:
-        base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
-        if not base_url:
-            return
-        base_url = base_url.group(0)
-        # beautiful soup doesn't handle ffn's unclosed option tags at all well here
-        options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
-        for option in options:
-            chapters.append(_extract_chapter(base_url + option[0], option[1]))
-    else:
-        chapters.append(_extract_chapter(url, story['title']))
-    story['chapters'] = chapters
-    return story
-def _extract_chapter(url, title):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
-    text = content.find(id="storytext")
-    # clean up some invalid xhtml attributes
-    # TODO: be more selective about this somehow
-    for tag in text.find_all(True):
-        tag.attrs = None
-    return (title, text.prettify())
+_sites = []
+def _get_site(url):
+    for site in _sites:
+        if site.match(url):
+            return site
+def _load_sites():
+    dirname = os.path.join(os.path.dirname(__file__), 'sites')
+    for f in os.listdir(dirname):
+        if not f.endswith('.py'):
+            continue
+        mod = importlib.import_module('sites.' + f.replace('.py', ''))
+        _sites.append(mod)
 if __name__ == '__main__':
-    leech('https://www.fanfiction.net/s/4510497/1/Neon-Genesis-Evangelion-Redux')
+    _load_sites()
+    leech('https://www.fanfiction.net/s/4109686/3/Taking-Sights')
+    pass

sites/fanfictionnet.py (new file, 57 additions)

@@ -0,0 +1,57 @@
+#!/usr/bin/python
+import re
+from bs4 import BeautifulSoup
+def match(url):
+    ## e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
+    return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
+def extract(url, fetch):
+    page = fetch(url)
+    soup = BeautifulSoup(page)
+    content = soup.find(id="content_wrapper_inner")
+    if not content:
+        return
+    story = {}
+    chapters = []
+    metadata = content.find(id='profile_top')
+    story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
+    story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+    chapter_select = content.find(id="chap_select")
+    if chapter_select:
+        base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
+        if not base_url:
+            return
+        base_url = base_url.group(0)
+        # beautiful soup doesn't handle ffn's unclosed option tags at all well here
+        options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
+        for option in options:
+            chapters.append(_extract_chapter(base_url + option[0], option[1], fetch))
+    else:
+        chapters.append(_extract_chapter(url, story['title'], fetch))
+    story['chapters'] = chapters
+    return story
+def _extract_chapter(url, title, fetch):
+    page = fetch(url)
+    soup = BeautifulSoup(page, 'html5lib')
+    content = soup.find(id="content_wrapper_inner")
+    if not content:
+        return
+    text = content.find(id="storytext")
+    # clean up some invalid xhtml attributes
+    # TODO: be more selective about this somehow
+    for tag in text.find_all(True):
+        tag.attrs = None
+    return (title, text.prettify())
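
Since extract() takes fetch as a parameter, a site module can also be exercised without the rest of leech.py. A quick sanity check of just the URL matching might look like the lines below, assuming it is run from the repository root so that sites/ is importable (the second URL is an arbitrary non-story example):

import sites.fanfictionnet as ffn

# match() returns a truthy re.match object for story URLs and None for anything else
assert ffn.match('https://www.fanfiction.net/s/4109686/3/Taking-Sights')
assert not ffn.match('https://www.fanfiction.net/u/12345/SomeAuthor')

Keeping fetch injected rather than imported also means whatever caching or throttling fetch.py does stays in one place instead of being wired into every site module.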