mirror of
https://github.com/kemayo/leech
synced 2025-12-06 08:22:56 +01:00
Use a namedtuple for chapters
This commit is contained in:
parent
574cea3fc8
commit
24fa9aa22d
7 changed files with 23 additions and 17 deletions
8
leech.py
8
leech.py
|
|
@ -90,7 +90,7 @@ def leech(url, session, filename=None, args=None):
|
||||||
if not story:
|
if not story:
|
||||||
raise Exception("Couldn't extract story")
|
raise Exception("Couldn't extract story")
|
||||||
|
|
||||||
dates = [c[2] for c in story['chapters'] if c[2]]
|
dates = [c.date for c in story['chapters'] if c.date]
|
||||||
metadata = {
|
metadata = {
|
||||||
'title': story['title'],
|
'title': story['title'],
|
||||||
'author': story['author'],
|
'author': story['author'],
|
||||||
|
|
@ -105,11 +105,11 @@ def leech(url, session, filename=None, args=None):
|
||||||
|
|
||||||
html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
|
html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
|
||||||
|
|
||||||
for i, (chapter_title, chapter_html, chapter_date) in enumerate(story['chapters']):
|
for i, chapter in enumerate(story['chapters']):
|
||||||
html.append((
|
html.append((
|
||||||
chapter_title,
|
chapter.title,
|
||||||
'chapter%d.html' % (i + 1),
|
'chapter%d.html' % (i + 1),
|
||||||
html_template.format(title=chapter_title, text=chapter_html)
|
html_template.format(title=chapter.title, text=chapter.contents)
|
||||||
))
|
))
|
||||||
|
|
||||||
if 'footnotes' in story and story['footnotes']:
|
if 'footnotes' in story and story['footnotes']:
|
||||||
|
|
|
||||||
|
|
@ -2,11 +2,15 @@
|
||||||
import glob
|
import glob
|
||||||
import os
|
import os
|
||||||
import argparse
|
import argparse
|
||||||
|
import collections
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
_sites = []
|
_sites = []
|
||||||
|
|
||||||
|
|
||||||
|
Chapter = collections.namedtuple('Chapter', ['title', 'contents', 'date'])
|
||||||
|
|
||||||
|
|
||||||
class Site:
|
class Site:
|
||||||
"""A Site handles checking whether a URL might represent a site, and then
|
"""A Site handles checking whether a URL might represent a site, and then
|
||||||
extracting the content of a story from said site.
|
extracting the content of a story from said site.
|
||||||
|
|
@ -30,7 +34,7 @@ class Site:
|
||||||
story (dict) containing keys:
|
story (dict) containing keys:
|
||||||
title (string)
|
title (string)
|
||||||
author (string)
|
author (string)
|
||||||
chapters (list): list of tuples, in form (title, HTML, datetime)
|
chapters (list): list of Chapters (namedtuple, defined above)
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import re
|
import re
|
||||||
from . import register, Site, SiteException
|
from . import register, Site, SiteException, Chapter
|
||||||
|
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
|
@ -37,7 +37,7 @@ class ArchiveOfOurOwn(Site):
|
||||||
"(%Y-%m-%d)"
|
"(%Y-%m-%d)"
|
||||||
)
|
)
|
||||||
|
|
||||||
chapters.append((link.string, self._chapter(chapter_url), updated))
|
chapters.append(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
|
||||||
|
|
||||||
if not chapters:
|
if not chapters:
|
||||||
raise SiteException("No content")
|
raise SiteException("No content")
|
||||||
|
|
|
||||||
|
|
@ -2,9 +2,10 @@
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from . import register, Site, SiteException
|
from . import register
|
||||||
from .stash import Stash
|
from .stash import Stash
|
||||||
|
|
||||||
|
|
||||||
@register
|
@register
|
||||||
class DeviantArt(Stash):
|
class DeviantArt(Stash):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import re
|
import re
|
||||||
from . import register, Site, SiteException
|
from . import register, Site, SiteException, Chapter
|
||||||
|
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
|
@ -45,11 +45,11 @@ class FanFictionNet(Site):
|
||||||
# beautiful soup doesn't handle ffn's unclosed option tags at all well here
|
# beautiful soup doesn't handle ffn's unclosed option tags at all well here
|
||||||
options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
|
options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
|
||||||
for option in options:
|
for option in options:
|
||||||
chapters.append((option[1], self._chapter(base_url + option[0]), False))
|
chapters.append(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
|
||||||
chapters[-1] = (chapters[-1][0], chapters[-1][1], updated)
|
chapters[-1] = Chapter(title=chapters[-1].title, contents=chapters[-1].contents, date=updated)
|
||||||
chapters[0] = (chapters[0][0], chapters[0][1], published)
|
chapters[0] = Chapter(title=chapters[0].title, contents=chapters[0].contents, date=published)
|
||||||
else:
|
else:
|
||||||
chapters.append((story['title'], self._extract_chapter(url), published))
|
chapters.append(Chapter(title=story['title'], contents=self._extract_chapter(url), date=published))
|
||||||
|
|
||||||
story['chapters'] = chapters
|
story['chapters'] = chapters
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import re
|
import re
|
||||||
from . import register, Site, SiteException
|
from . import register, Site, SiteException, Chapter
|
||||||
|
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
|
@ -59,7 +59,7 @@ class Stash(Site):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise SiteException("Trouble cleaning attributes", e)
|
raise SiteException("Trouble cleaning attributes", e)
|
||||||
|
|
||||||
return (title, text.prettify(), self._date(soup))
|
return Chapter(title=title, contents=text.prettify(), date=self._date(soup))
|
||||||
|
|
||||||
def _date(self, soup):
|
def _date(self, soup):
|
||||||
maybe_date = soup.find('div', class_="dev-metainfo-details").find('span', ts=True)
|
maybe_date = soup.find('div', class_="dev-metainfo-details").find('span', ts=True)
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import re
|
import re
|
||||||
from . import register, Site, SiteException
|
from . import register, Site, SiteException, Chapter
|
||||||
|
|
||||||
|
|
||||||
class XenForo(Site):
|
class XenForo(Site):
|
||||||
|
|
@ -41,7 +41,8 @@ class XenForo(Site):
|
||||||
if not href.startswith('http'):
|
if not href.startswith('http'):
|
||||||
href = base + href
|
href = base + href
|
||||||
print("Fetching chapter", mark.string, href)
|
print("Fetching chapter", mark.string, href)
|
||||||
chapters.append((str(mark.string),) + self._chapter(href, idx))
|
contents, post_date = self._chapter(href, idx)
|
||||||
|
chapters.append(Chapter(title=str(mark.string), contents=contents, date=post_date))
|
||||||
|
|
||||||
story['chapters'] = chapters
|
story['chapters'] = chapters
|
||||||
story['footnotes'] = '\n\n'.join(self.footnotes)
|
story['footnotes'] = '\n\n'.join(self.footnotes)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue