1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00

Use a namedtuple for chapters

This commit is contained in:
David Lynch 2016-09-23 13:11:52 -05:00
parent 574cea3fc8
commit 24fa9aa22d
7 changed files with 23 additions and 17 deletions

View file

@@ -90,7 +90,7 @@ def leech(url, session, filename=None, args=None):
if not story: if not story:
raise Exception("Couldn't extract story") raise Exception("Couldn't extract story")
dates = [c[2] for c in story['chapters'] if c[2]] dates = [c.date for c in story['chapters'] if c.date]
metadata = { metadata = {
'title': story['title'], 'title': story['title'],
'author': story['author'], 'author': story['author'],
@@ -105,11 +105,11 @@ def leech(url, session, filename=None, args=None):
html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata))) html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
for i, (chapter_title, chapter_html, chapter_date) in enumerate(story['chapters']): for i, chapter in enumerate(story['chapters']):
html.append(( html.append((
chapter_title, chapter.title,
'chapter%d.html' % (i + 1), 'chapter%d.html' % (i + 1),
html_template.format(title=chapter_title, text=chapter_html) html_template.format(title=chapter.title, text=chapter.contents)
)) ))
if 'footnotes' in story and story['footnotes']: if 'footnotes' in story and story['footnotes']:

View file

@@ -2,11 +2,15 @@
import glob import glob
import os import os
import argparse import argparse
import collections
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
_sites = [] _sites = []
Chapter = collections.namedtuple('Chapter', ['title', 'contents', 'date'])
class Site: class Site:
"""A Site handles checking whether a URL might represent a site, and then """A Site handles checking whether a URL might represent a site, and then
extracting the content of a story from said site. extracting the content of a story from said site.
@@ -30,7 +34,7 @@ class Site:
story (dict) containing keys: story (dict) containing keys:
title (string) title (string)
author (string) author (string)
chapters (list): list of tuples, in form (title, HTML, datetime) chapters (list): list of Chapters (namedtuple, defined above)
""" """
raise NotImplementedError() raise NotImplementedError()

View file

@@ -2,7 +2,7 @@
import datetime import datetime
import re import re
from . import register, Site, SiteException from . import register, Site, SiteException, Chapter
@register @register
@@ -37,7 +37,7 @@ class ArchiveOfOurOwn(Site):
"(%Y-%m-%d)" "(%Y-%m-%d)"
) )
chapters.append((link.string, self._chapter(chapter_url), updated)) chapters.append(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
if not chapters: if not chapters:
raise SiteException("No content") raise SiteException("No content")

View file

@@ -2,9 +2,10 @@
import re import re
from . import register, Site, SiteException from . import register
from .stash import Stash from .stash import Stash
@register @register
class DeviantArt(Stash): class DeviantArt(Stash):
@staticmethod @staticmethod

View file

@@ -2,7 +2,7 @@
import datetime import datetime
import re import re
from . import register, Site, SiteException from . import register, Site, SiteException, Chapter
@register @register
@@ -45,11 +45,11 @@ class FanFictionNet(Site):
# beautiful soup doesn't handle ffn's unclosed option tags at all well here # beautiful soup doesn't handle ffn's unclosed option tags at all well here
options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select)) options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
for option in options: for option in options:
chapters.append((option[1], self._chapter(base_url + option[0]), False)) chapters.append(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
chapters[-1] = (chapters[-1][0], chapters[-1][1], updated) chapters[-1] = Chapter(title=chapters[-1].title, contents=chapters[-1].contents, date=updated)
chapters[0] = (chapters[0][0], chapters[0][1], published) chapters[0] = Chapter(title=chapters[0].title, contents=chapters[0].contents, date=published)
else: else:
chapters.append((story['title'], self._extract_chapter(url), published)) chapters.append(Chapter(title=story['title'], contents=self._extract_chapter(url), date=published))
story['chapters'] = chapters story['chapters'] = chapters

View file

@@ -2,7 +2,7 @@
import datetime import datetime
import re import re
from . import register, Site, SiteException from . import register, Site, SiteException, Chapter
@register @register
@@ -59,7 +59,7 @@ class Stash(Site):
except Exception as e: except Exception as e:
raise SiteException("Trouble cleaning attributes", e) raise SiteException("Trouble cleaning attributes", e)
return (title, text.prettify(), self._date(soup)) return Chapter(title=title, contents=text.prettify(), date=self._date(soup))
def _date(self, soup): def _date(self, soup):
maybe_date = soup.find('div', class_="dev-metainfo-details").find('span', ts=True) maybe_date = soup.find('div', class_="dev-metainfo-details").find('span', ts=True)

View file

@@ -2,7 +2,7 @@
import datetime import datetime
import re import re
from . import register, Site, SiteException from . import register, Site, SiteException, Chapter
class XenForo(Site): class XenForo(Site):
@@ -41,7 +41,8 @@ class XenForo(Site):
if not href.startswith('http'): if not href.startswith('http'):
href = base + href href = base + href
print("Fetching chapter", mark.string, href) print("Fetching chapter", mark.string, href)
chapters.append((str(mark.string),) + self._chapter(href, idx)) contents, post_date = self._chapter(href, idx)
chapters.append(Chapter(title=str(mark.string), contents=contents, date=post_date))
story['chapters'] = chapters story['chapters'] = chapters
story['footnotes'] = '\n\n'.join(self.footnotes) story['footnotes'] = '\n\n'.join(self.footnotes)