mirror of
https://github.com/kemayo/leech
synced 2026-05-08 21:11:13 +02:00
Stories are now made of nested sections/chapters
This is prep-work for improving epub TOC generation a bit.
This commit is contained in:
parent
7addf4c3d1
commit
e6343cb1c9
8 changed files with 139 additions and 97 deletions
13
epub.py
13
epub.py
|
|
@ -90,11 +90,10 @@ def make_epub(filename, html_files, meta, extra_files=False):
|
||||||
|
|
||||||
# Write each HTML file to the ebook, collect information for the index
|
# Write each HTML file to the ebook, collect information for the index
|
||||||
for i, html in enumerate(html_files):
|
for i, html in enumerate(html_files):
|
||||||
basename = os.path.basename(html[1])
|
|
||||||
file_id = 'file_%d' % (i + 1)
|
file_id = 'file_%d' % (i + 1)
|
||||||
etree.SubElement(manifest, 'item', {
|
etree.SubElement(manifest, 'item', {
|
||||||
'id': file_id,
|
'id': file_id,
|
||||||
'href': basename,
|
'href': html[1],
|
||||||
'media-type': "application/xhtml+xml",
|
'media-type': "application/xhtml+xml",
|
||||||
})
|
})
|
||||||
itemref = etree.SubElement(spine, 'itemref', idref=file_id)
|
itemref = etree.SubElement(spine, 'itemref', idref=file_id)
|
||||||
|
|
@ -103,21 +102,21 @@ def make_epub(filename, html_files, meta, extra_files=False):
|
||||||
'id': file_id,
|
'id': file_id,
|
||||||
})
|
})
|
||||||
etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = html[0]
|
etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = html[0]
|
||||||
etree.SubElement(point, 'content', src=basename)
|
etree.SubElement(point, 'content', src=html[1])
|
||||||
|
|
||||||
if 'cover.html' == basename:
|
if 'cover.html' == os.path.basename(html[1]):
|
||||||
etree.SubElement(guide, 'reference', {
|
etree.SubElement(guide, 'reference', {
|
||||||
'type': 'cover',
|
'type': 'cover',
|
||||||
'title': 'Cover',
|
'title': 'Cover',
|
||||||
'href': basename,
|
'href': html[1],
|
||||||
})
|
})
|
||||||
itemref.set('linear', 'no')
|
itemref.set('linear', 'no')
|
||||||
|
|
||||||
# and add the actual html to the zip
|
# and add the actual html to the zip
|
||||||
if html[2]:
|
if html[2]:
|
||||||
epub.writestr('OEBPS/' + basename, html[2])
|
epub.writestr('OEBPS/' + html[1], html[2])
|
||||||
else:
|
else:
|
||||||
epub.write(html[1], 'OEBPS/' + basename)
|
epub.write(html[1], 'OEBPS/' + html[1])
|
||||||
|
|
||||||
if extra_files:
|
if extra_files:
|
||||||
for i, data in enumerate(extra_files):
|
for i, data in enumerate(extra_files):
|
||||||
|
|
|
||||||
41
leech.py
41
leech.py
|
|
@ -5,6 +5,7 @@ import sys
|
||||||
import json
|
import json
|
||||||
import datetime
|
import datetime
|
||||||
import http.cookiejar
|
import http.cookiejar
|
||||||
|
import collections
|
||||||
|
|
||||||
import sites
|
import sites
|
||||||
import epub
|
import epub
|
||||||
|
|
@ -90,10 +91,10 @@ def leech(url, session, filename=None, args=None):
|
||||||
if not story:
|
if not story:
|
||||||
raise Exception("Couldn't extract story")
|
raise Exception("Couldn't extract story")
|
||||||
|
|
||||||
dates = [c.date for c in story['chapters'] if c.date]
|
dates = list(story.dates())
|
||||||
metadata = {
|
metadata = {
|
||||||
'title': story['title'],
|
'title': story.title,
|
||||||
'author': story['author'],
|
'author': story.author,
|
||||||
'unique_id': url,
|
'unique_id': url,
|
||||||
'started': min(dates),
|
'started': min(dates),
|
||||||
'updated': max(dates),
|
'updated': max(dates),
|
||||||
|
|
@ -101,28 +102,40 @@ def leech(url, session, filename=None, args=None):
|
||||||
|
|
||||||
# The cover is static, and the only change comes from the image which we generate
|
# The cover is static, and the only change comes from the image which we generate
|
||||||
html = [('Cover', 'cover.html', cover_template)]
|
html = [('Cover', 'cover.html', cover_template)]
|
||||||
cover_image = ('images/cover.png', cover.make_cover(story['title'], story['author']).read(), 'image/png')
|
cover_image = ('images/cover.png', cover.make_cover(story.title, story.author).read(), 'image/png')
|
||||||
|
|
||||||
html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
|
html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
|
||||||
|
|
||||||
for i, chapter in enumerate(story['chapters']):
|
html.extend(chapter_html(story))
|
||||||
html.append((
|
|
||||||
chapter.title,
|
|
||||||
'chapter%d.html' % (i + 1),
|
|
||||||
html_template.format(title=chapter.title, text=chapter.contents)
|
|
||||||
))
|
|
||||||
|
|
||||||
if 'footnotes' in story and story['footnotes']:
|
|
||||||
html.append(("Footnotes", 'footnotes.html', html_template.format(title="Footnotes", text=story['footnotes'])))
|
|
||||||
|
|
||||||
css = ('Styles/base.css', session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')
|
css = ('Styles/base.css', session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')
|
||||||
|
|
||||||
filename = filename or story['title'] + '.epub'
|
filename = filename or story.title + '.epub'
|
||||||
|
|
||||||
|
# print([c[0:-1] for c in html])
|
||||||
filename = epub.make_epub(filename, html, metadata, extra_files=(css, cover_image))
|
filename = epub.make_epub(filename, html, metadata, extra_files=(css, cover_image))
|
||||||
|
|
||||||
return filename
|
return filename
|
||||||
|
|
||||||
|
|
||||||
|
def chapter_html(story, titleprefix=None):
    """Flatten a (possibly nested) story into a list of page tuples.

    Each returned tuple is ``(title, filename, html)``. Entries of *story*
    that expose ``__iter__`` are treated as nested sections and are expanded
    recursively, with the nested section's own title used as the prefix for
    its chapter titles. Every other entry is rendered as a chapter page
    through the module-level ``html_template``.

    If ``story.footnotes`` is non-empty, a final "Footnotes" page is appended
    whose text is the footnotes joined by blank lines.
    """
    pages = []
    for position, entry in enumerate(story, start=1):
        if not hasattr(entry, '__iter__'):
            # Plain chapter: optionally prefix its title.
            if titleprefix:
                heading = '{}: {}'.format(titleprefix, entry.title)
            else:
                heading = entry.title
            pages.append((
                heading,
                '{}/chapter{}.html'.format(story.id, position),
                html_template.format(title=heading, text=entry.contents),
            ))
            continue
        # Iterable entry: this is a nested section, so recurse into it.
        pages.extend(chapter_html(entry, titleprefix=entry.title))

    if story.footnotes:
        footnote_text = '\n\n'.join(story.footnotes)
        pages.append((
            "Footnotes",
            '{}/footnotes.html'.format(story.id),
            html_template.format(title="Footnotes", text=footnote_text),
        ))
    return pages
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('url', help="url of a story to fetch", nargs='?')
|
parser.add_argument('url', help="url of a story to fetch", nargs='?')
|
||||||
|
|
|
||||||
|
|
@ -2,13 +2,57 @@
|
||||||
import glob
|
import glob
|
||||||
import os
|
import os
|
||||||
import argparse
|
import argparse
|
||||||
import collections
|
import uuid
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
_sites = []
|
_sites = []
|
||||||
|
|
||||||
|
|
||||||
Chapter = collections.namedtuple('Chapter', ['title', 'contents', 'date'])
|
class Chapter:
    """A single chapter of a story.

    Holds a title, the chapter contents, an optional date (``False`` when
    not supplied) and an identifier. When no truthy ``chapterid`` is given,
    a random UUID4 string is generated for the id.
    """

    def __init__(self, title, contents, date=False, chapterid=None):
        # Any falsy chapterid (None, '') falls back to a fresh UUID string.
        self.id = chapterid if chapterid else str(uuid.uuid4())
        self.title = title
        self.contents = contents
        self.date = date
|
||||||
|
|
||||||
|
|
||||||
|
class Section:
    """An ordered container of chapters and nested sections.

    Carries a title, an author, an identifier (a random UUID4 string when no
    truthy ``sectionid`` is given), a ``contents`` list and a ``footnotes``
    list. Iteration, indexing, item assignment and ``len()`` are all
    delegated to ``contents``.
    """

    def __init__(self, title, author, sectionid=None):
        # Any falsy sectionid (None, '') falls back to a fresh UUID string.
        self.id = sectionid if sectionid else str(uuid.uuid4())
        self.title = title
        self.author = author
        # Will contain a mix of Sections and Chapters
        self.contents = []
        self.footnotes = []

    def __iter__(self):
        return iter(self.contents)

    def __getitem__(self, index):
        return self.contents[index]

    def __setitem__(self, index, value):
        self.contents[index] = value

    def __len__(self):
        return len(self.contents)

    def add(self, value, index=None):
        """Append *value*, or insert it at *index* when one is given."""
        if index is None:
            self.contents.append(value)
        else:
            self.contents.insert(index, value)

    def dates(self):
        """Yield every truthy ``date`` found in this section, depth-first.

        Entries exposing ``__iter__`` are treated as nested sections and
        their own ``dates()`` are yielded in place.
        """
        for entry in self.contents:
            if not hasattr(entry, '__iter__'):
                if entry.date:
                    yield entry.date
                continue
            yield from entry.dates()
|
||||||
|
|
||||||
|
|
||||||
class Site:
|
class Site:
|
||||||
|
|
@ -59,22 +103,24 @@ class Site:
|
||||||
soup = BeautifulSoup("", 'html5lib')
|
soup = BeautifulSoup("", 'html5lib')
|
||||||
return soup.new_tag(*args, **kw)
|
return soup.new_tag(*args, **kw)
|
||||||
|
|
||||||
def _footnote(self, contents, backlink_href=''):
|
def _footnote(self, contents, chapterid):
|
||||||
"""Register a footnote and return a link to that footnote"""
|
"""Register a footnote and return a link to that footnote"""
|
||||||
|
|
||||||
|
# TODO: This embeds knowledge of what the generated filenames will be. Work out a better way.
|
||||||
|
|
||||||
idx = len(self.footnotes) + 1
|
idx = len(self.footnotes) + 1
|
||||||
|
|
||||||
# epub spec footnotes are all about epub:type on the footnote and the link
|
# epub spec footnotes are all about epub:type on the footnote and the link
|
||||||
# http://www.idpf.org/accessibility/guidelines/content/semantics/epub-type.php
|
# http://www.idpf.org/accessibility/guidelines/content/semantics/epub-type.php
|
||||||
contents.name = 'div'
|
contents.name = 'div'
|
||||||
contents.attrs['id'] = "footnote%d" % idx
|
contents.attrs['id'] = "footnote{}".format(idx)
|
||||||
contents.attrs['epub:type'] = 'rearnote'
|
contents.attrs['epub:type'] = 'rearnote'
|
||||||
|
|
||||||
# a backlink is essential for Kindle to think of this as a footnote
|
# a backlink is essential for Kindle to think of this as a footnote
|
||||||
# otherwise it doesn't get the inline-popup treatment
|
# otherwise it doesn't get the inline-popup treatment
|
||||||
# http://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf
|
# http://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf
|
||||||
# section 3.9.10
|
# section 3.9.10
|
||||||
backlink = self._new_tag('a', href="%s#noteback%d" % (backlink_href, idx))
|
backlink = self._new_tag('a', href="chapter{}.html#noteback{}".format(chapterid, idx))
|
||||||
backlink.string = '^'
|
backlink.string = '^'
|
||||||
contents.insert(0, backlink)
|
contents.insert(0, backlink)
|
||||||
|
|
||||||
|
|
@ -84,8 +130,8 @@ class Site:
|
||||||
# epub annotations.
|
# epub annotations.
|
||||||
spoiler_link = self._new_tag('a')
|
spoiler_link = self._new_tag('a')
|
||||||
spoiler_link.attrs = {
|
spoiler_link.attrs = {
|
||||||
'id': 'noteback%d' % idx,
|
'id': 'noteback{}'.format(idx),
|
||||||
'href': "footnotes.html#footnote%d" % idx,
|
'href': "footnotes.html#footnote{}".format(idx),
|
||||||
'epub:type': 'noteref',
|
'epub:type': 'noteref',
|
||||||
}
|
}
|
||||||
spoiler_link.string = str(idx)
|
spoiler_link.string = str(idx)
|
||||||
|
|
|
||||||
37
sites/ao3.py
37
sites/ao3.py
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import re
|
import re
|
||||||
from . import register, Site, SiteException, Chapter
|
from . import register, Site, SiteException, Section, Chapter
|
||||||
|
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
|
@ -21,12 +21,11 @@ class ArchiveOfOurOwn(Site):
|
||||||
soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))
|
soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))
|
||||||
|
|
||||||
metadata = soup.select('#main h2.heading a')
|
metadata = soup.select('#main h2.heading a')
|
||||||
story = {
|
story = Section(
|
||||||
'title': metadata[0].string,
|
title=metadata[0].string,
|
||||||
'author': metadata[1].string,
|
author=metadata[1].string
|
||||||
}
|
)
|
||||||
|
|
||||||
chapters = []
|
|
||||||
for chapter in soup.select('#main ol[role="navigation"] li'):
|
for chapter in soup.select('#main ol[role="navigation"] li'):
|
||||||
link = chapter.find('a')
|
link = chapter.find('a')
|
||||||
chapter_url = str(link.get('href'))
|
chapter_url = str(link.get('href'))
|
||||||
|
|
@ -39,12 +38,7 @@ class ArchiveOfOurOwn(Site):
|
||||||
"(%Y-%m-%d)"
|
"(%Y-%m-%d)"
|
||||||
)
|
)
|
||||||
|
|
||||||
chapters.append(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
|
story.add(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
|
||||||
|
|
||||||
if not chapters:
|
|
||||||
raise SiteException("No content")
|
|
||||||
|
|
||||||
story['chapters'] = chapters
|
|
||||||
|
|
||||||
return story
|
return story
|
||||||
|
|
||||||
|
|
@ -63,7 +57,7 @@ class ArchiveOfOurOwn(Site):
|
||||||
class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
|
class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def matches(url):
|
def matches(url):
|
||||||
# e.g. http://archiveofourown.org/works/5683105/chapters/13092007
|
# e.g. http://archiveofourown.org/series/5683105/
|
||||||
return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url)
|
return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url)
|
||||||
|
|
||||||
def extract(self, url):
|
def extract(self, url):
|
||||||
|
|
@ -71,23 +65,16 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
|
||||||
|
|
||||||
soup = self._soup('http://archiveofourown.org/series/{}?view_adult=true'.format(seriesid))
|
soup = self._soup('http://archiveofourown.org/series/{}?view_adult=true'.format(seriesid))
|
||||||
|
|
||||||
story = {
|
story = Section(
|
||||||
'title': soup.select('#main h2.heading')[0].string,
|
title=soup.select('#main h2.heading')[0].string,
|
||||||
'author': soup.select('#main dl.series.meta a[rel="author"]')[0].string,
|
author=soup.select('#main dl.series.meta a[rel="author"]')[0].string
|
||||||
}
|
)
|
||||||
|
|
||||||
chapters = []
|
|
||||||
for work in soup.select('#main ul.series li.work'):
|
for work in soup.select('#main ul.series li.work'):
|
||||||
workid = work.get('id').replace('work_', '')
|
workid = work.get('id').replace('work_', '')
|
||||||
substory = self._extract_work(workid)
|
substory = self._extract_work(workid)
|
||||||
|
|
||||||
# TODO: improve epub-writer to be able to generate a toc.ncx with nested headings
|
# TODO: improve epub-writer to be able to generate a toc.ncx with nested headings
|
||||||
# In the meantime, append the story title to the chapter titles.
|
story.add(substory)
|
||||||
chapters.extend((
|
|
||||||
Chapter(title="{}: {}".format(substory['title'], c.title), contents=c.contents, date=c.date)
|
|
||||||
for c in substory['chapters']
|
|
||||||
))
|
|
||||||
|
|
||||||
story['chapters'] = chapters
|
|
||||||
|
|
||||||
return story
|
return story
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from . import register
|
from . import register, Section
|
||||||
from .stash import Stash
|
from .stash import Stash
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -19,16 +19,16 @@ class DeviantArt(Stash):
|
||||||
if not content:
|
if not content:
|
||||||
return
|
return
|
||||||
|
|
||||||
story = {}
|
|
||||||
chapters = []
|
|
||||||
|
|
||||||
if "gallery" in url:
|
if "gallery" in url:
|
||||||
story['author'] = str(content.select('h1 a.u')[0].string)
|
author = str(content.select('h1 a.u')[0].string)
|
||||||
else:
|
else:
|
||||||
authors = set(str(author.string) for author in content.select('.stream .details a.u'))
|
authors = set(str(author.string) for author in content.select('.stream .details a.u'))
|
||||||
story['author'] = ', '.join(authors)
|
author = ', '.join(authors)
|
||||||
|
|
||||||
story['title'] = str(content.find(class_="folder-title").string)
|
story = Section(
|
||||||
|
title=str(content.find(class_="folder-title").string),
|
||||||
|
author=author
|
||||||
|
)
|
||||||
|
|
||||||
thumbs = content.select(".stream a.thumb")
|
thumbs = content.select(".stream a.thumb")
|
||||||
if not thumbs:
|
if not thumbs:
|
||||||
|
|
@ -36,10 +36,8 @@ class DeviantArt(Stash):
|
||||||
for thumb in thumbs:
|
for thumb in thumbs:
|
||||||
try:
|
try:
|
||||||
if thumb['href'] is not '#':
|
if thumb['href'] is not '#':
|
||||||
chapters.append(self._chapter(thumb['href']))
|
story.add(self._chapter(thumb['href']))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
|
|
||||||
story['chapters'] = chapters
|
|
||||||
|
|
||||||
return story
|
return story
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import re
|
import re
|
||||||
from . import register, Site, SiteException, Chapter
|
from . import register, Site, SiteException, Section, Chapter
|
||||||
|
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
|
@ -19,12 +19,12 @@ class FanFictionNet(Site):
|
||||||
if not content:
|
if not content:
|
||||||
raise SiteException("No content")
|
raise SiteException("No content")
|
||||||
|
|
||||||
story = {}
|
|
||||||
chapters = []
|
|
||||||
|
|
||||||
metadata = content.find(id='profile_top')
|
metadata = content.find(id='profile_top')
|
||||||
story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
|
|
||||||
story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
|
story = Section(
|
||||||
|
title=str(metadata.find('b', class_="xcontrast_txt").string),
|
||||||
|
author=str(metadata.find('a', class_="xcontrast_txt").string)
|
||||||
|
)
|
||||||
|
|
||||||
dates = content.find_all('span', attrs={'data-xutime': True})
|
dates = content.find_all('span', attrs={'data-xutime': True})
|
||||||
published = False
|
published = False
|
||||||
|
|
@ -45,13 +45,13 @@ class FanFictionNet(Site):
|
||||||
# beautiful soup doesn't handle ffn's unclosed option tags at all well here
|
# beautiful soup doesn't handle ffn's unclosed option tags at all well here
|
||||||
options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
|
options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
|
||||||
for option in options:
|
for option in options:
|
||||||
chapters.append(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
|
story.add(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
|
||||||
chapters[-1] = Chapter(title=chapters[-1].title, contents=chapters[-1].contents, date=updated)
|
|
||||||
chapters[0] = Chapter(title=chapters[0].title, contents=chapters[0].contents, date=published)
|
|
||||||
else:
|
|
||||||
chapters.append(Chapter(title=story['title'], contents=self._extract_chapter(url), date=published))
|
|
||||||
|
|
||||||
story['chapters'] = chapters
|
# fix up the dates
|
||||||
|
story[-1].date = updated
|
||||||
|
story[0].date = published
|
||||||
|
else:
|
||||||
|
story.add(Chapter(title=story['title'], contents=self._extract_chapter(url), date=published))
|
||||||
|
|
||||||
return story
|
return story
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import re
|
import re
|
||||||
from . import register, Site, SiteException, Chapter
|
from . import register, Site, SiteException, Section, Chapter
|
||||||
|
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
|
@ -18,12 +18,11 @@ class Stash(Site):
|
||||||
if not content:
|
if not content:
|
||||||
return
|
return
|
||||||
|
|
||||||
story = {}
|
|
||||||
chapters = []
|
|
||||||
|
|
||||||
# metadata = content.find(id='profile_top')
|
# metadata = content.find(id='profile_top')
|
||||||
story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
|
story = Section(
|
||||||
story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
|
title=str(soup.find(class_="stash-folder-name").h2.string),
|
||||||
|
author=str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
|
||||||
|
)
|
||||||
|
|
||||||
thumbs = content.select(".stash-folder-stream .thumb")
|
thumbs = content.select(".stash-folder-stream .thumb")
|
||||||
if not thumbs:
|
if not thumbs:
|
||||||
|
|
@ -31,12 +30,10 @@ class Stash(Site):
|
||||||
for thumb in thumbs:
|
for thumb in thumbs:
|
||||||
try:
|
try:
|
||||||
if thumb['href'] is not '#':
|
if thumb['href'] is not '#':
|
||||||
chapters.append(self._chapter(thumb['href']))
|
story.add(self._chapter(thumb['href']))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
|
|
||||||
story['chapters'] = chapters
|
|
||||||
|
|
||||||
return story
|
return story
|
||||||
|
|
||||||
def _chapter(self, url):
|
def _chapter(self, url):
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import re
|
import re
|
||||||
from . import register, Site, SiteException, Chapter
|
from . import register, Site, SiteException, Section, Chapter
|
||||||
|
|
||||||
|
|
||||||
class XenForo(Site):
|
class XenForo(Site):
|
||||||
|
|
@ -28,25 +28,27 @@ class XenForo(Site):
|
||||||
|
|
||||||
base = soup.head.base.get('href')
|
base = soup.head.base.get('href')
|
||||||
|
|
||||||
story = {}
|
story = Section(
|
||||||
story['title'] = soup.find('h1').get_text()
|
title=soup.find('h1').get_text(),
|
||||||
story['author'] = soup.find('p', id='pageDescription').find('a', class_='username').get_text()
|
author=soup.find('p', id='pageDescription').find('a', class_='username').get_text()
|
||||||
|
)
|
||||||
|
|
||||||
marks = [mark for mark in self._chapter_list(url) if '/members' not in mark.get('href')]
|
marks = [mark for mark in self._chapter_list(url) if '/members' not in mark.get('href')]
|
||||||
marks = marks[self.options.offset:self.options.limit]
|
marks = marks[self.options.offset:self.options.limit]
|
||||||
|
|
||||||
chapters = []
|
|
||||||
for idx, mark in enumerate(marks, 1):
|
for idx, mark in enumerate(marks, 1):
|
||||||
href = mark.get('href')
|
href = mark.get('href')
|
||||||
if not href.startswith('http'):
|
if not href.startswith('http'):
|
||||||
href = base + href
|
href = base + href
|
||||||
title = str(mark.string).strip()
|
title = str(mark.string).strip()
|
||||||
print("Fetching chapter", title, href)
|
print("Fetching chapter", title, href)
|
||||||
contents, post_date = self._chapter(href, idx)
|
chapter = Chapter(title=title, contents="")
|
||||||
chapters.append(Chapter(title=title, contents=contents, date=post_date))
|
contents, post_date = self._chapter(href, chapter.id)
|
||||||
|
chapter.contents = contents
|
||||||
|
chapter.date = post_date
|
||||||
|
story.add(chapter)
|
||||||
|
|
||||||
story['chapters'] = chapters
|
story.footnotes = self.footnotes
|
||||||
story['footnotes'] = '\n\n'.join(self.footnotes)
|
|
||||||
self.footnotes = []
|
self.footnotes = []
|
||||||
|
|
||||||
return story
|
return story
|
||||||
|
|
@ -90,10 +92,10 @@ class XenForo(Site):
|
||||||
|
|
||||||
return links
|
return links
|
||||||
|
|
||||||
def _chapter(self, url, chapter_number):
|
def _chapter(self, url, chapterid):
|
||||||
post = self._post_from_url(url)
|
post = self._post_from_url(url)
|
||||||
|
|
||||||
return self._clean_chapter(post, chapter_number), self._post_date(post)
|
return self._clean_chapter(post, chapterid), self._post_date(post)
|
||||||
|
|
||||||
def _post_from_url(self, url):
|
def _post_from_url(self, url):
|
||||||
# URLs refer to specific posts, so get just that one
|
# URLs refer to specific posts, so get just that one
|
||||||
|
|
@ -115,7 +117,7 @@ class XenForo(Site):
|
||||||
# just the first one in the thread, then
|
# just the first one in the thread, then
|
||||||
return soup.find('li', class_='message')
|
return soup.find('li', class_='message')
|
||||||
|
|
||||||
def _clean_chapter(self, post, chapter_number):
|
def _clean_chapter(self, post, chapterid):
|
||||||
post = post.find('blockquote', class_='messageText')
|
post = post.find('blockquote', class_='messageText')
|
||||||
post.name = 'div'
|
post.name = 'div'
|
||||||
# mostly, we want to remove colors because the Kindle is terrible at them
|
# mostly, we want to remove colors because the Kindle is terrible at them
|
||||||
|
|
@ -130,7 +132,7 @@ class XenForo(Site):
|
||||||
for idx, spoiler in enumerate(post.find_all(class_='ToggleTriggerAnchor')):
|
for idx, spoiler in enumerate(post.find_all(class_='ToggleTriggerAnchor')):
|
||||||
spoiler_title = spoiler.find(class_='SpoilerTitle')
|
spoiler_title = spoiler.find(class_='SpoilerTitle')
|
||||||
if self.options.spoilers:
|
if self.options.spoilers:
|
||||||
link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), 'chapter%d.html' % chapter_number)
|
link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
|
||||||
if spoiler_title:
|
if spoiler_title:
|
||||||
link.string = spoiler_title.get_text()
|
link.string = spoiler_title.get_text()
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue