From 1afac50437eb4e1d84b81e096fda1bb4f9ebee75 Mon Sep 17 00:00:00 2001 From: IdanDor Date: Sat, 23 Jan 2021 12:00:51 +0200 Subject: [PATCH] Made arbitrary sites no longer leak memory and fixed worm epub. Each `Chapter` object had a reference to the entire page tree, meaning that the program's RAM usage grew significantly. Switched Worm to use next_selector so the chapters are correctly ordered, E.2 is not skipped and the download does not crash due to the `?share=twitter` url that was matched before. Fixed Worm titles. --- examples/worm.json | 8 +++++--- sites/arbitrary.py | 3 +++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/worm.json b/examples/worm.json index 37b6b9b..50fcb4b 100644 --- a/examples/worm.json +++ b/examples/worm.json @@ -1,9 +1,11 @@ { - "url": "https://parahumans.wordpress.com/table-of-contents/", + "url": "https://parahumans.wordpress.com/2011/06/11/1-1/", "title": "Worm", "author": "Wildbow", - "chapter_selector": "#main .entry-content a", - "content_selector": "#main .entry-content", + "content_selector": "#main", + "content_title_selector": "h1.entry-title", + "content_text_selector": ".entry-content", "filter_selector": ".sharedaddy, style, a[href*='parahumans.wordpress.com']", + "next_selector": "a[rel=\"next\"]", "cover_url": "https://pre00.deviantart.net/969a/th/pre/i/2015/051/8/7/worm_cover_by_cactusfantastico-d8ivj4b.png" } diff --git a/sites/arbitrary.py b/sites/arbitrary.py index caebf57..f5f996f 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -126,6 +126,9 @@ class Arbitrary(Site): # TODO: consider `'\n'.join(map(str, content.contents))` content.name = 'div' + + # Extract from bs4 tree so the rest of the tree gets deleted. + content = content.extract() chapters.append(Chapter( title=title,