diff --git a/leech.py b/leech.py
index dd9e157..9953cbf 100755
--- a/leech.py
+++ b/leech.py
@@ -45,9 +45,9 @@ def configure_logging(verbose):
     )
 
 
-def create_session(cache):
+def create_session(cache) -> requests_cache.CachedSession | requests.Session:
     if cache:
-        session = requests_cache.CachedSession('leech', expire_after=4 * 3600, use_temp=True)
+        session = requests_cache.CachedSession('leech', expire_after=4 * 3600, use_temp=True, backend='sqlite')
         logger.debug("CachedSession at %s", session.cache.db_path)
     else:
         session = requests.Session()
@@ -59,7 +59,7 @@ def create_session(cache):
             logger.debug("No leech.cookies present in %s", directory)
             continue
         try:
-            lwp_cookiejar.load(directory / 'leech.cookies', ignore_discard=True)
+            lwp_cookiejar.load(str(directory / 'leech.cookies'), ignore_discard=True)
         except Exception:
             # This file is very much optional, so this log isn't really necessary
             logger.exception("Couldn't load cookies from leech.cookies in %s", dirs.user_data_path)
@@ -200,7 +200,7 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, user_age
         options, login = create_options(site, site_options, other_flags)
         if UA := user_agent or options.get('user_agent'):
             logger.debug('USER_AGENT overridden to "%s"', UA)
-            session.headers.update( {'USER_AGENT': UA})
+            session.headers.update({'User-Agent': UA})
         site_output_dir = Path(output_dir or options.get('output_dir', os.getcwd())).expanduser().resolve()
         if not os.path.exists(site_output_dir):
            logger.warning("output directory doesn't exist: %s", site_output_dir)
diff --git a/sites/__init__.py b/sites/__init__.py
index 8a2c14f..ede923e 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -7,9 +7,10 @@ import uuid
 import datetime
 import time
 import logging
-import urllib
 import re
 import hashlib
+import requests
+from urllib import parse as urlparse
 
 from attrs import define, field, Factory
 from bs4 import BeautifulSoup
@@ -34,7 +35,7 @@ class Image:
         if self.url.startswith("data:image") and 'base64' in self.url:
             head, base64data = self.url.split(',')
             return str(head.split(';')[0].split('/')[1])
-        path = urllib.parse.urlparse(self.url).path
+        path = urlparse.urlparse(self.url).path
         return os.path.splitext(path)[1]
 
 
@@ -42,7 +43,7 @@ class Chapter:
     title: str
     contents: str
-    date: datetime.datetime = False
+    date: datetime.datetime | None = None
     images: dict = Factory(dict)
 
 
@@ -93,7 +94,7 @@ class Site:
     """A Site handles checking whether a URL might represent a site,
    and then extracting the content of a story from said site.
""" - session: object = field() + session: requests.Session = field() footnotes: list = field(factory=list, init=False) options: dict = Factory( lambda site: site.get_default_options(), @@ -102,9 +103,7 @@ class Site: @classmethod def site_key(cls): - if hasattr(cls, '_key'): - return cls._key - return cls.__name__ + return getattr(cls, '_key', cls.__name__) @staticmethod def get_site_specific_option_defs(): @@ -176,8 +175,8 @@ class Site: options[option.name] = option_value return options - @staticmethod - def matches(url): + @classmethod + def matches(cls, url): raise NotImplementedError() def extract(self, url): @@ -196,7 +195,7 @@ class Site: def login(self, login_details): raise NotImplementedError() - def _soup(self, url, method=False, delay=0, retry=3, retry_delay=10, **kw): + def _soup(self, url, method=None, delay=0, retry=3, retry_delay=10, **kw) -> tuple[BeautifulSoup, str]: if not method: method = self.options.get('parser', 'lxml') if url.startswith('http://') or url.startswith('https://'): @@ -221,7 +220,7 @@ class Site: text = url fallback_base = '' soup = BeautifulSoup(text, method) - return soup, (soup.head and soup.head.base) and soup.head.base.get('href') or fallback_base + return soup, str((soup.head and soup.head.base) and soup.head.base.get('href') or fallback_base) def _form_in_soup(self, soup): if soup.name == 'form': @@ -265,7 +264,7 @@ class Site: return soup.new_tag(*args, **kw) def _join_url(self, *args, **kwargs): - return urllib.parse.urljoin(*args, **kwargs) + return urlparse.urljoin(*args, **kwargs) def _footnote(self, contents, chapterid): """Register a footnote and return a link to that footnote""" @@ -302,7 +301,7 @@ class Site: return spoiler_link - def _clean(self, contents, base=False): + def _clean(self, contents, base:str|None=None): """Clean up story content to be more ebook-friendly TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is @@ -339,7 +338,7 @@ class Site: # Call this on a story after it's fully extracted to clean up things for chapter in story: if hasattr(chapter, '__iter__'): - self._finalize(chapter, story) + self._finalize(chapter) else: self._process_images(chapter) @@ -380,9 +379,9 @@ class SiteSpecificOption: name: str flag_pattern: str type: object = None - default: bool = False - help: str = None - choices: tuple = None + default: object = False + help: str|None = None + choices: tuple|None = None exposed: bool = True click_kwargs: frozenset = field(converter=lambda kwargs: frozenset(kwargs.items()), default={}) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 14c2a2a..ec60bb6 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -31,15 +31,15 @@ class SiteDefinition: author: str content_selector: str # If present, find something within `content` to use a chapter title; if not found, the link text to it will be used - content_title_selector: str = False + content_title_selector: str|None = None # If present, find a specific element in the `content` to be the chapter text - content_text_selector: str = False + content_text_selector: str|None = None # If present, it looks for chapters linked from `url`. If not, it assumes `url` points to a chapter. 
-    chapter_selector: str = False
+    chapter_selector: str | None = None
     # If present, use to find a link to the next content page (only used if not using chapter_selector)
-    next_selector: str = False
+    next_selector: str | None = None
     # If present, use to filter out content that matches the selector
-    filter_selector: str = False
+    filter_selector: str | None = None
     cover_url: str = ''
 
@@ -110,7 +110,7 @@ class Arbitrary(Site):
         return story
 
-    def _chapter(self, url, definition, title=False):
+    def _chapter(self, url, definition, title=None):
         logger.info("Extracting chapter @ %s", url)
         soup, base = self._soup(url)
diff --git a/sites/xenforo2.py b/sites/xenforo2.py
index b74b691..56c64e9 100644
--- a/sites/xenforo2.py
+++ b/sites/xenforo2.py
@@ -24,7 +24,7 @@ class XenForo2(XenForo):
             tags=tags
         )
 
-    def _posts_from_page(self, soup, postid=False):
+    def _posts_from_page(self, soup, postid=None):
         if postid:
             return soup.find('article', id='js-post-' + postid)
         return soup.select('article.message--post')
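
# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the patch: several hunks above swap
# `= False` defaults for `T | None = None`. With attrs, `None` is the
# conventional "absent" sentinel, and the `| None` annotation keeps type
# checkers accurate, whereas a bool default misstates the field's type.
# The Chapter shape below mirrors the patched field; the instantiation and
# names around it are hypothetical usage, not leech's API.
import datetime
from attrs import define

@define
class Chapter:
    title: str
    contents: str
    date: datetime.datetime | None = None   # was: datetime.datetime = False

chapter = Chapter("Prologue", "<p>...</p>")
# Callers check `is not None` instead of relying on the truthiness of False:
published = chapter.date.isoformat() if chapter.date is not None else "unknown"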