
Adjust Arbitrary so it can handle non-chapter works

David Lynch 2017-04-29 20:59:04 -05:00
parent 17664125f3
commit 529b85c7a6


@@ -1,5 +1,6 @@
 #!/usr/bin/python
+import attr
 import datetime
 import json
 import os.path
@@ -24,8 +25,20 @@ difference would be whether a `--toc` arg was given.)
 """
 
+@attr.s
+class SiteDefinition:
+    url = attr.ib()
+    title = attr.ib()
+    author = attr.ib()
+    content_selector = attr.ib()
+    # If this is present, it looks for chapters linked from `url`. If not, it assumes `url` points to a chapter.
+    chapter_selector = attr.ib(default=False)
+    # If this is present, it's used to filter out content that matches the selector
+    filter_selector = attr.ib(default=False)
+
+
 @register
-class ArbitraryIndex(Site):
+class Arbitrary(Site):
     """A way to describe an arbitrary side for a one-off fetch
     """
 
     @staticmethod
@@ -36,16 +49,16 @@ class ArbitraryIndex(Site):
     def extract(self, url):
         with open(url) as definition_file:
-            definition = json.load(definition_file)
-        soup = self._soup(definition['url'])
+            definition = SiteDefinition(**json.load(definition_file))
 
         story = Section(
-            title=definition['title'],
-            author=definition['author']
+            title=definition.title,
+            author=definition.author
         )
 
-        for chapter in soup.select(definition['chapter_selector']):
+        if definition.chapter_selector:
+            soup = self._soup(definition.url)
+            for chapter in soup.select(definition.chapter_selector):
                 chapter_url = str(chapter.get('href'))
                 story.add(Chapter(
                     title=chapter.string,
@@ -53,16 +66,25 @@ class ArbitraryIndex(Site):
                     # TODO: better date detection
                     date=datetime.datetime.now()
                 ))
+        else:
+            story.add(Chapter(
+                title=definition.title,
+                contents=self._chapter(definition.url, definition),
+                # TODO: better date detection
+                date=datetime.datetime.now()
+            ))
 
         return story
 
     def _chapter(self, url, definition):
+        # TODO: refactor so this can meaningfully handle multiple matches on content_selector.
+        # Probably by changing it so that this returns a Chapter / Section.
         print("Extracting chapter from", url)
         soup = self._soup(url)
 
-        content = soup.select(definition['content_selector'])[0]
+        content = soup.select(definition.content_selector)[0]
 
-        if 'filter_selector' in definition:
-            for filtered in content.select(definition['filter_selector']):
+        if definition.filter_selector:
+            for filtered in content.select(definition.filter_selector):
                 filtered.decompose()
 
         return content.prettify()
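
For context: after this change, a definition file for a non-chapter (single-page) work simply omits `chapter_selector`, and `extract()` falls through to the new `else:` branch, wrapping the page at `url` as the story's only chapter. A minimal illustrative definition file, with a placeholder URL and selectors rather than values taken from the repository:

    {
        "url": "https://example.com/standalone-story",
        "title": "Example Standalone Story",
        "author": "Example Author",
        "content_selector": "#content"
    }

Adding a `chapter_selector` (for example `"#toc a"`) to the same file restores the original behaviour of fetching each linked chapter, and the optional `filter_selector` still removes matching elements from whatever pages are fetched.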