diff --git a/leech.py b/leech.py
index c093696..c0d2572 100644
--- a/leech.py
+++ b/leech.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python
 
-import re
-from bs4 import BeautifulSoup
+import importlib
+import os
 
 import epub
 from fetch import Fetch
@@ -21,7 +21,13 @@ html_template = '''
 '''
 
 def leech(url):
-    story = _extract(url)
+    # we have: a page, which could be absolutely any part of a story, or not a story at all
+    # check a bunch of things which are completely ff.n specific, to get text from it
+    site = _get_site(url)
+    if not site:
+        return
+
+    story = site.extract(url, fetch)
 
     metadata = {
         'title': story['title'],
@@ -34,56 +40,23 @@ def leech(url):
     epub.make_epub(story['title'] + '.epub', html, metadata)
 
-def _extract(url):
-    # we have: a page, which could be absolutely any part of a story, or not a story at all
-    # check a bunch of things which are completely ff.n specific, to get text from it
-    page = fetch(url)
-    soup = BeautifulSoup(page)
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
+_sites = []
 
-    story = {}
-    chapters = []
+def _get_site(url):
+    for site in _sites:
+        if site.match(url):
+            return site
 
-    metadata = content.find(id='profile_top')
-    story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
-    story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+def _load_sites():
+    dirname = os.path.join(os.path.dirname(__file__), 'sites')
+    for f in os.listdir(dirname):
+        if not f.endswith('.py'):
+            continue
+        mod = importlib.import_module('sites.' + f.replace('.py', ''))
+        _sites.append(mod)
 
-    chapter_select = content.find(id="chap_select")
-    if chapter_select:
-        base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
-        if not base_url:
-            return
-        base_url = base_url.group(0)
-
-        # beautiful soup doesn't handle ffn's unclosed option tags at all well here
-        options = re.findall(r'<option value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
-        for option in options:
-            chapters.append(_extract_chapter(base_url + option[0], option[1]))
-    else:
-        chapters.append(_extract_chapter(url, story['title']))
-
-    story['chapters'] = chapters
-
-    return story
-
-def _extract_chapter(url, title):
-    page = fetch(url)
-    soup = BeautifulSoup(page, 'html5lib')
-
-    content = soup.find(id="content_wrapper_inner")
-    if not content:
-        return
-
-    text = content.find(id="storytext")
-
-    # clean up some invalid xhtml attributes
-    # TODO: be more selective about this somehow
-    for tag in text.find_all(True):
-        tag.attrs = None
-
-    return (title, text.prettify())
 
 
 if __name__ == '__main__':
-    leech('https://www.fanfiction.net/s/4510497/1/Neon-Genesis-Evangelion-Redux')
+    _load_sites()
+    leech('https://www.fanfiction.net/s/4109686/3/Taking-Sights')
+    pass
diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py
new file mode 100644
index 0000000..76be5ad
--- /dev/null
+++ b/sites/fanfictionnet.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python
+
+import re
+from bs4 import BeautifulSoup
+
+def match(url):
+    # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
+    return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
+
+def extract(url, fetch):
+    page = fetch(url)
+    soup = BeautifulSoup(page)
+    content = soup.find(id="content_wrapper_inner")
+    if not content:
+        return
+
+    story = {}
+    chapters = []
+
+    metadata = content.find(id='profile_top')
+    story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
+    story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+
+    chapter_select = content.find(id="chap_select")
+    if chapter_select:
+        base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
+        if not base_url:
+            return
+        base_url = base_url.group(0)
+
+        # beautiful soup doesn't handle ffn's unclosed option tags at all well here
+        options = re.findall(r'<option value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
+        for option in options:
+            chapters.append(_extract_chapter(base_url + option[0], option[1], fetch))
+    else:
+        chapters.append(_extract_chapter(url, story['title'], fetch))
+
+    story['chapters'] = chapters
+
+    return story
+
+def _extract_chapter(url, title, fetch):
+    page = fetch(url)
+    soup = BeautifulSoup(page, 'html5lib')
+
+    content = soup.find(id="content_wrapper_inner")
+    if not content:
+        return
+
+    text = content.find(id="storytext")
+
+    # clean up some invalid xhtml attributes
+    # TODO: be more selective about this somehow
+    for tag in text.find_all(True):
+        tag.attrs = None
+
+    return (title, text.prettify())
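
A note on the plugin contract this diff introduces: a module in sites/ only has to expose match(url) and extract(url, fetch), where extract returns a dict with 'title', 'author', and 'chapters' (a list of (title, html) tuples). As a rough sketch of what a second site module could look like (the module name, URL pattern, and element ids below are invented for illustration, not taken from this diff):

    #!/usr/bin/python

    # sites/example.py -- hypothetical; shows the match/extract contract
    # that _load_sites() and _get_site() in leech.py rely on.

    import re
    from bs4 import BeautifulSoup

    def match(url):
        # invented URL shape; a real module pins this to one site
        return re.match(r'^https?://example\.com/story/\d+', url)

    def extract(url, fetch):
        page = fetch(url)
        soup = BeautifulSoup(page, 'html5lib')

        title = soup.find('h1')
        body = soup.find(id='story-body')
        if not title or not body:
            return

        # leech() reads story['title'], story['author'], story['chapters']
        return {
            'title': str(title.string),
            'author': 'Unknown',
            'chapters': [(str(title.string), body.prettify())],
        }

Dropping such a file into sites/ is all _load_sites() needs; there is no registration step beyond the directory scan.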
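
A second note, on the findall() in extract(): ffn serves its chapter <select> with unclosed <option> tags, which BeautifulSoup's default parser mangles, so the chapter list is scraped out of the raw markup with a regex instead. A quick illustration of what the pattern recovers (the sample markup is invented, shaped like ffn's):

    import re

    # unclosed <option> tags, as ffn serves them
    chapter_select = ('<select id="chap_select">'
                      '<option value=1 selected>1. The Beginning'
                      '<option value=2>2. The Middle</select>')

    options = re.findall(r'<option value="?(\d+)"?[^>]*>([^<]+)', chapter_select)
    print(options)  # [('1', '1. The Beginning'), ('2', '2. The Middle')]

Each (value, label) pair then becomes a chapter URL suffix and a title for _extract_chapter().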