mirror of
https://github.com/kemayo/leech
synced 2025-12-06 16:33:16 +01:00
Adjust Arbitrary so it can handle non-chapter works
This commit is contained in:
parent
17664125f3
commit
529b85c7a6
1 changed files with 35 additions and 13 deletions
|
|
@ -1,5 +1,6 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import attr
|
||||
import datetime
|
||||
import json
|
||||
import os.path
|
||||
|
|
@ -24,8 +25,20 @@ difference would be whether a `--toc` arg was given.)
|
|||
"""
|
||||
|
||||
|
||||
@attr.s
|
||||
class SiteDefinition:
|
||||
url = attr.ib()
|
||||
title = attr.ib()
|
||||
author = attr.ib()
|
||||
content_selector = attr.ib()
|
||||
# If this is present, it looks for chapters linked from `url`. If not, it assumes `url` points to a chapter.
|
||||
chapter_selector = attr.ib(default=False)
|
||||
# If this is present, it's used to filter out content that matches the selector
|
||||
filter_selector = attr.ib(default=False)
|
||||
|
||||
|
||||
@register
|
||||
class ArbitraryIndex(Site):
|
||||
class Arbitrary(Site):
|
||||
"""A way to describe an arbitrary side for a one-off fetch
|
||||
"""
|
||||
@staticmethod
|
||||
|
|
@ -36,20 +49,27 @@ class ArbitraryIndex(Site):
|
|||
|
||||
def extract(self, url):
|
||||
with open(url) as definition_file:
|
||||
definition = json.load(definition_file)
|
||||
|
||||
soup = self._soup(definition['url'])
|
||||
definition = SiteDefinition(**json.load(definition_file))
|
||||
|
||||
story = Section(
|
||||
title=definition['title'],
|
||||
author=definition['author']
|
||||
title=definition.title,
|
||||
author=definition.author
|
||||
)
|
||||
|
||||
for chapter in soup.select(definition['chapter_selector']):
|
||||
chapter_url = str(chapter.get('href'))
|
||||
if definition.chapter_selector:
|
||||
soup = self._soup(definition.url)
|
||||
for chapter in soup.select(definition.chapter_selector):
|
||||
chapter_url = str(chapter.get('href'))
|
||||
story.add(Chapter(
|
||||
title=chapter.string,
|
||||
contents=self._chapter(chapter_url, definition),
|
||||
# TODO: better date detection
|
||||
date=datetime.datetime.now()
|
||||
))
|
||||
else:
|
||||
story.add(Chapter(
|
||||
title=chapter.string,
|
||||
contents=self._chapter(chapter_url, definition),
|
||||
title=definition.title,
|
||||
contents=self._chapter(definition.url, definition),
|
||||
# TODO: better date detection
|
||||
date=datetime.datetime.now()
|
||||
))
|
||||
|
|
@ -57,12 +77,14 @@ class ArbitraryIndex(Site):
|
|||
return story
|
||||
|
||||
def _chapter(self, url, definition):
|
||||
# TODO: refactor so this can meaningfully handle multiple matches on content_selector.
|
||||
# Probably by changing it so that this returns a Chapter / Section.
|
||||
print("Extracting chapter from", url)
|
||||
soup = self._soup(url)
|
||||
content = soup.select(definition['content_selector'])[0]
|
||||
content = soup.select(definition.content_selector)[0]
|
||||
|
||||
if 'filter_selector' in definition:
|
||||
for filtered in content.select(definition['filter_selector']):
|
||||
if definition.filter_selector:
|
||||
for filtered in content.select(definition.filter_selector):
|
||||
filtered.decompose()
|
||||
|
||||
return content.prettify()
|
||||
|
|
|
|||
Loading…
Reference in a new issue