mirror of https://github.com/kemayo/leech
synced 2026-05-07 03:52:17 +02:00
Assorted tweaks to make type checking a bit happier
This commit is contained in:
parent 82cf246593
commit e93590e13c
4 changed files with 27 additions and 28 deletions
leech.py | 8

@@ -45,9 +45,9 @@ def configure_logging(verbose):
     )


-def create_session(cache):
+def create_session(cache) -> requests_cache.CachedSession | requests.Session:
     if cache:
-        session = requests_cache.CachedSession('leech', expire_after=4 * 3600, use_temp=True)
+        session = requests_cache.CachedSession('leech', expire_after=4 * 3600, use_temp=True, backend='sqlite')
         logger.debug("CachedSession at %s", session.cache.db_path)
     else:
         session = requests.Session()

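Note: the explicit backend='sqlite' is what makes the logged session.cache.db_path well-typed, since db_path only exists on requests-cache's SQLite backend, and the new union return type has to be narrowed by callers. A minimal standalone sketch of both points (simplified body, hypothetical usage):

    import requests
    import requests_cache

    def create_session(cache: bool) -> requests_cache.CachedSession | requests.Session:
        # Simplified version of the function above, for illustration only.
        if cache:
            return requests_cache.CachedSession(
                'leech', expire_after=4 * 3600, use_temp=True, backend='sqlite')
        return requests.Session()

    session = create_session(cache=True)
    if isinstance(session, requests_cache.CachedSession):
        # isinstance narrowing is what lets a type checker accept
        # cache-specific attributes such as session.cache.db_path.
        print('sqlite cache at', session.cache.db_path)
    session.get('https://example.com')  # valid on either branch of the union
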
@@ -59,7 +59,7 @@ def create_session(cache):
             logger.debug("No leech.cookies present in %s", directory)
             continue
         try:
-            lwp_cookiejar.load(directory / 'leech.cookies', ignore_discard=True)
+            lwp_cookiejar.load(str(directory / 'leech.cookies'), ignore_discard=True)
         except Exception:
             # This file is very much optional, so this log isn't really necessary
             logger.exception("Couldn't load cookies from leech.cookies in %s", dirs.user_data_path)

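Note: the str(...) wrapper exists because the standard library stubs have historically annotated FileCookieJar.load()'s filename parameter as a plain str, so passing a pathlib.Path upsets the checker even where it works at runtime. A standalone sketch (the cookie path is hypothetical):

    from http.cookiejar import LWPCookieJar
    from pathlib import Path

    jar = LWPCookieJar()
    cookie_file = Path.home() / 'leech.cookies'  # hypothetical location
    if cookie_file.exists():
        # str() matches the parameter type in the stubs.
        jar.load(str(cookie_file), ignore_discard=True)
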
@@ -200,7 +200,7 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, user_age
         options, login = create_options(site, site_options, other_flags)
         if UA := user_agent or options.get('user_agent'):
             logger.debug('USER_AGENT overridden to "%s"', UA)
-            session.headers.update( {'USER_AGENT': UA})
+            session.headers.update({'USER_AGENT': UA})
         site_output_dir = Path(output_dir or options.get('output_dir', os.getcwd())).expanduser().resolve()
         if not os.path.exists(site_output_dir):
             logger.warning("output directory doesn't exist: %s", site_output_dir)

sites/__init__.py

@@ -7,9 +7,10 @@ import uuid
 import datetime
 import time
 import logging
-import urllib
 import re
 import hashlib
+import requests
+from urllib import parse as urlparse
 from attrs import define, field, Factory
 from bs4 import BeautifulSoup

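Note: beyond satisfying the checker, `from urllib import parse as urlparse` is the safer spelling: `import urllib` alone does not import the parse submodule, so `urllib.parse.urlparse(...)` only works when some other module has already imported urllib.parse as a side effect. For example:

    from urllib import parse as urlparse

    # The submodule is now bound explicitly rather than by side effect.
    print(urlparse.urlparse('https://example.com/a/b?x=1').path)  # /a/b
    print(urlparse.urljoin('https://example.com/a/', 'b'))        # https://example.com/a/b
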
@@ -34,7 +35,7 @@ class Image:
         if self.url.startswith("data:image") and 'base64' in self.url:
             head, base64data = self.url.split(',')
             return str(head.split(';')[0].split('/')[1])
-        path = urllib.parse.urlparse(self.url).path
+        path = urlparse.urlparse(self.url).path
         return os.path.splitext(path)[1]

@@ -42,7 +43,7 @@ class Image:
 class Chapter:
     title: str
     contents: str
-    date: datetime.datetime = False
+    date: datetime.datetime | None = None
     images: dict = Factory(dict)

@@ -93,7 +94,7 @@ class Site:
     """A Site handles checking whether a URL might represent a site, and then
     extracting the content of a story from said site.
     """
-    session: object = field()
+    session: requests.Session = field()
     footnotes: list = field(factory=list, init=False)
     options: dict = Factory(
         lambda site: site.get_default_options(),

@@ -102,9 +103,7 @@ class Site:

     @classmethod
     def site_key(cls):
-        if hasattr(cls, '_key'):
-            return cls._key
-        return cls.__name__
+        return getattr(cls, '_key', cls.__name__)

     @staticmethod
     def get_site_specific_option_defs():

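Note: getattr with a default collapses the hasattr branch into a single expression and avoids the "attribute may be undefined" complaint. Behavior is unchanged for subclasses that set _key; a small sketch (the subclass name is hypothetical):

    class Site:
        @classmethod
        def site_key(cls):
            return getattr(cls, '_key', cls.__name__)

    class Example(Site):  # hypothetical subclass
        _key = 'ExampleForum'

    assert Site.site_key() == 'Site'
    assert Example.site_key() == 'ExampleForum'
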
@@ -176,8 +175,8 @@ class Site:
             options[option.name] = option_value
         return options

-    @staticmethod
-    def matches(url):
+    @classmethod
+    def matches(cls, url):
         raise NotImplementedError()

     def extract(self, url):

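Note: turning matches into a classmethod gives overrides access to cls, so per-site data such as a class-level URL pattern is reachable without hard-coding the class name. A sketch of what an override can now look like (the subclass and regex are hypothetical):

    import re

    class Site:
        @classmethod
        def matches(cls, url):
            raise NotImplementedError()

    class ExampleSite(Site):  # hypothetical subclass
        _url_re = re.compile(r'^https?://example\.com/story/\d+')

        @classmethod
        def matches(cls, url):
            # cls lets overrides reach class attributes like _url_re.
            match = cls._url_re.match(url)
            return match.group(0) if match else None

    print(ExampleSite.matches('https://example.com/story/42'))
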
@@ -196,7 +195,7 @@ class Site:
     def login(self, login_details):
         raise NotImplementedError()

-    def _soup(self, url, method=False, delay=0, retry=3, retry_delay=10, **kw):
+    def _soup(self, url, method=None, delay=0, retry=3, retry_delay=10, **kw) -> tuple[BeautifulSoup, str]:
         if not method:
             method = self.options.get('parser', 'lxml')
         if url.startswith('http://') or url.startswith('https://'):

@@ -221,7 +220,7 @@ class Site:
             text = url
             fallback_base = ''
         soup = BeautifulSoup(text, method)
-        return soup, (soup.head and soup.head.base) and soup.head.base.get('href') or fallback_base
+        return soup, str((soup.head and soup.head.base) and soup.head.base.get('href') or fallback_base)

     def _form_in_soup(self, soup):
         if soup.name == 'form':

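Note: the str(...) here pairs with the new tuple[BeautifulSoup, str] annotation on _soup: BeautifulSoup's Tag.get() is typed as possibly returning a list of strings or None, so without the wrapper the function could not promise a plain str. A standalone illustration:

    from bs4 import BeautifulSoup

    html = '<html><head><base href="https://example.com/"></head><body></body></html>'
    soup = BeautifulSoup(html, 'html.parser')
    base = (soup.head and soup.head.base) and soup.head.base.get('href') or ''
    # get() may be typed as str | list[str] | None; str() pins the type down.
    print(str(base))  # https://example.com/
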
@@ -265,7 +264,7 @@ class Site:
         return soup.new_tag(*args, **kw)

     def _join_url(self, *args, **kwargs):
-        return urllib.parse.urljoin(*args, **kwargs)
+        return urlparse.urljoin(*args, **kwargs)

     def _footnote(self, contents, chapterid):
         """Register a footnote and return a link to that footnote"""

@@ -302,7 +301,7 @@ class Site:

         return spoiler_link

-    def _clean(self, contents, base=False):
+    def _clean(self, contents, base:str|None=None):
         """Clean up story content to be more ebook-friendly

         TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is

@@ -339,7 +338,7 @@ class Site:
         # Call this on a story after it's fully extracted to clean up things
         for chapter in story:
             if hasattr(chapter, '__iter__'):
-                self._finalize(chapter, story)
+                self._finalize(chapter)
             else:
                 self._process_images(chapter)

@@ -380,9 +379,9 @@ class SiteSpecificOption:
     name: str
     flag_pattern: str
     type: object = None
-    default: bool = False
-    help: str = None
-    choices: tuple = None
+    default: object = False
+    help: str|None = None
+    choices: tuple|None = None
     exposed: bool = True
     click_kwargs: frozenset = field(converter=lambda kwargs: frozenset(kwargs.items()), default={})

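Note: `help: str = None` and `choices: tuple = None` were self-contradictory annotations (a None default on a non-optional type), which is exactly what strict checkers flag; the `str|None` / `tuple|None` forms make the default legal. A minimal attrs sketch of the corrected pattern (the class is a hypothetical analogue, not the real SiteSpecificOption):

    from attrs import define

    @define
    class Option:  # hypothetical minimal analogue
        name: str
        help: str | None = None      # default now agrees with the annotation
        choices: tuple | None = None

    Option(name='offset')  # optional fields can stay unset
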
sites/arbitrary.py

@@ -31,15 +31,15 @@ class SiteDefinition:
     author: str
     content_selector: str
     # If present, find something within `content` to use a chapter title; if not found, the link text to it will be used
-    content_title_selector: str = False
+    content_title_selector: str|None = None
     # If present, find a specific element in the `content` to be the chapter text
-    content_text_selector: str = False
+    content_text_selector: str|None = None
     # If present, it looks for chapters linked from `url`. If not, it assumes `url` points to a chapter.
-    chapter_selector: str = False
+    chapter_selector: str|None = None
     # If present, use to find a link to the next content page (only used if not using chapter_selector)
-    next_selector: str = False
+    next_selector: str|None = None
     # If present, use to filter out content that matches the selector
-    filter_selector: str = False
+    filter_selector: str|None = None
     cover_url: str = ''

@@ -110,7 +110,7 @@ class Arbitrary(Site):

         return story

-    def _chapter(self, url, definition, title=False):
+    def _chapter(self, url, definition, title=None):
         logger.info("Extracting chapter @ %s", url)
         soup, base = self._soup(url)

@@ -24,7 +24,7 @@ class XenForo2(XenForo):
             tags=tags
         )

-    def _posts_from_page(self, soup, postid=False):
+    def _posts_from_page(self, soup, postid=None):
         if postid:
             return soup.find('article', id='js-post-' + postid)
         return soup.select('article.message--post')