From b83b6d37f1cbe65b37bb68a0f313dba5af2a204d Mon Sep 17 00:00:00 2001 From: Alex Raubach <10493887+AlexRaubach@users.noreply.github.com> Date: Tue, 28 Aug 2018 21:13:06 -0400 Subject: [PATCH 1/4] Add The Gods Are Bastards site definition The selector a[href*='20'] matches the chapter links, whose URLs contain the year, while skipping the links that share to social media. --- examples/thegodsarebastards.json | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 examples/thegodsarebastards.json diff --git a/examples/thegodsarebastards.json b/examples/thegodsarebastards.json new file mode 100644 index 0000000..9fa80cd --- /dev/null +++ b/examples/thegodsarebastards.json @@ -0,0 +1,8 @@ +{ + "url": "https://tiraas.wordpress.com/table-of-contents/", + "title": "The Gods Are Bastards", + "author": "D. D. Webb", + "chapter_selector": "article .entry-content a[href*='20']", + "content_selector": "article .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style, a[href*='tiraas.wordpress.com']" +} From 2019616505e571ead6a4da06fc71e30ae9ec2e07 Mon Sep 17 00:00:00 2001 From: Alex Raubach <10493887+AlexRaubach@users.noreply.github.com> Date: Tue, 28 Aug 2018 21:59:16 -0400 Subject: [PATCH 2/4] Check that the chapter has content before parsing Selecting the first element on line 87 raises an IndexError (list index out of range) when no content matches the selector. --- sites/arbitrary.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 1989bc6..6f401a2 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -80,6 +80,10 @@ class Arbitrary(Site): # Probably by changing it so that this returns a Chapter / Section. 
logger.info("Extracting chapter @ %s", url) soup = self._soup(url) + + if not soup.select(definition.content_selector): + return '' + content = soup.select(definition.content_selector)[0] if definition.filter_selector: From 97a2a8899a9d8566b4f0670eb4a619b2ddb44bda Mon Sep 17 00:00:00 2001 From: Alex Raubach <10493887+AlexRaubach@users.noreply.github.com> Date: Tue, 28 Aug 2018 22:01:10 -0400 Subject: [PATCH 3/4] Create Worm site definition --- examples/worm.json | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 examples/worm.json diff --git a/examples/worm.json b/examples/worm.json new file mode 100644 index 0000000..a021a5e --- /dev/null +++ b/examples/worm.json @@ -0,0 +1,8 @@ +{ + "url": "https://parahumans.wordpress.com/table-of-contents/", + "title": "Worm", + "author": "Wildbow", + "chapter_selector": "#main .entry-content a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, style, a[href*='parahumans.wordpress.com']" +} From 1bfc9b75f7e64a991d84e1f3abbcccac7bb25e98 Mon Sep 17 00:00:00 2001 From: Alex Raubach <10493887+AlexRaubach@users.noreply.github.com> Date: Tue, 28 Aug 2018 23:24:59 -0400 Subject: [PATCH 4/4] Remove unneeded whitespace --- sites/arbitrary.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 6f401a2..d2864f2 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -80,10 +80,10 @@ class Arbitrary(Site): # Probably by changing it so that this returns a Chapter / Section. logger.info("Extracting chapter @ %s", url) soup = self._soup(url) - + if not soup.select(definition.content_selector): - return '' - + return '' + content = soup.select(definition.content_selector)[0] if definition.filter_selector: