diff --git a/examples/thegodsarebastards.json b/examples/thegodsarebastards.json new file mode 100644 index 0000000..9fa80cd --- /dev/null +++ b/examples/thegodsarebastards.json @@ -0,0 +1,8 @@ +{ + "url": "https://tiraas.wordpress.com/table-of-contents/", + "title": "The Gods Are Bastards", + "author": "D. D. Webb", + "chapter_selector": "article .entry-content a[href*='20']", + "content_selector": "article .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style, a[href*='tiraas.wordpress.com']" +} diff --git a/examples/worm.json b/examples/worm.json new file mode 100644 index 0000000..a021a5e --- /dev/null +++ b/examples/worm.json @@ -0,0 +1,8 @@ +{ + "url": "https://parahumans.wordpress.com/table-of-contents/", + "title": "Worm", + "author": "Wildbow", + "chapter_selector": "#main .entry-content a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, style, a[href*='parahumans.wordpress.com']" +} diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 1989bc6..d2864f2 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -80,6 +80,10 @@ class Arbitrary(Site): # Probably by changing it so that this returns a Chapter / Section. logger.info("Extracting chapter @ %s", url) soup = self._soup(url) + + if not soup.select(definition.content_selector): + return '' + content = soup.select(definition.content_selector)[0] if definition.filter_selector: