From d71184ae8ba3d453dcad8a462a3d146d22f169d8 Mon Sep 17 00:00:00 2001 From: Alex Raubach Date: Sun, 16 Sep 2018 16:51:17 -0400 Subject: [PATCH 1/5] Place post-chapter RR author notes at the end of the chapter --- sites/royalroad.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sites/royalroad.py b/sites/royalroad.py index e41c668..52fb72c 100644 --- a/sites/royalroad.py +++ b/sites/royalroad.py @@ -52,14 +52,20 @@ class RoyalRoad(Site): soup = self._soup(url) content = soup.find('div', class_='chapter-content') - # TODO: this could be more robust, and I don't know if there's post-chapter notes anywhere as well. + # TODO: this could be more robust. author_note = soup.find('div', class_='author-note-portlet') + # Find the portlet-body and check if the first child div is the author note. + if 'author-note-portlet' in soup.find('div', class_='portlet-body').find('div', recursive=False)['class']: + output = (author_note and (author_note.prettify() + '
') or '') + content.prettify() + else: # Possible post chapter note + output = content.prettify() + (author_note and ('
' + author_note.prettify()) or '') + updated = datetime.datetime.fromtimestamp( int(soup.find(class_="profile-info").find('time').get('unixtime')) ) - return (author_note and (author_note.prettify() + '
') or '') + content.prettify(), updated + return output, updated @register From 94900cb1263b365185705b9ebe499415a0d5442d Mon Sep 17 00:00:00 2001 From: Alex Raubach Date: Sun, 16 Sep 2018 23:40:05 -0400 Subject: [PATCH 2/5] Simplify Royal Road chapter scraper --- sites/royalroad.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sites/royalroad.py b/sites/royalroad.py index 52fb72c..8d029e2 100644 --- a/sites/royalroad.py +++ b/sites/royalroad.py @@ -50,22 +50,22 @@ class RoyalRoad(Site): def _chapter(self, url): logger.info("Extracting chapter @ %s", url) soup = self._soup(url) - content = soup.find('div', class_='chapter-content') + content = soup.find('div', class_='chapter-content').prettify() - # TODO: this could be more robust. author_note = soup.find('div', class_='author-note-portlet') - # Find the portlet-body and check if the first child div is the author note. - if 'author-note-portlet' in soup.find('div', class_='portlet-body').find('div', recursive=False)['class']: - output = (author_note and (author_note.prettify() + '
') or '') + content.prettify() - else: # Possible post chapter note - output = content.prettify() + (author_note and ('
' + author_note.prettify()) or '') + if author_note: + # Find the portlet-body and check if the first child div is the author note. + if 'author-note-portlet' in soup.find('div', class_='portlet-body').find('div', recursive=False)['class']: + content = author_note.prettify() + '
' + content + else: # Post-chapter note goes on the end + content = content + '
' + author_note.prettify() updated = datetime.datetime.fromtimestamp( int(soup.find(class_="profile-info").find('time').get('unixtime')) ) - return output, updated + return content, updated @register From cf62faf5dd60a658b34cea9bd7259678046414fd Mon Sep 17 00:00:00 2001 From: Alex Raubach Date: Mon, 17 Sep 2018 20:03:01 -0400 Subject: [PATCH 3/5] Support two RR author notes in one chapter --- sites/royalroad.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sites/royalroad.py b/sites/royalroad.py index 8d029e2..be3c208 100644 --- a/sites/royalroad.py +++ b/sites/royalroad.py @@ -52,14 +52,16 @@ class RoyalRoad(Site): soup = self._soup(url) content = soup.find('div', class_='chapter-content').prettify() - author_note = soup.find('div', class_='author-note-portlet') + author_note = soup.find_all('div', class_='author-note-portlet') - if author_note: - # Find the portlet-body and check if the first child div is the author note. + if len(author_note) is 1: + # The first child div is either the chapter content or an author note if 'author-note-portlet' in soup.find('div', class_='portlet-body').find('div', recursive=False)['class']: - content = author_note.prettify() + '
' + content - else: # Post-chapter note goes on the end - content = content + '
' + author_note.prettify() + content = author_note[0].prettify() + '
' + content + else: # The author note must be after the chapter content + content = content + '
' + author_note[0].prettify() + elif len(author_note) is 2: + content = author_note[0].prettify() + '
' + content + '
' + author_note[1].prettify() updated = datetime.datetime.fromtimestamp( int(soup.find(class_="profile-info").find('time').get('unixtime')) From a9dfdb5dd30e01bb804fa798f7bb25b6e9c33b76 Mon Sep 17 00:00:00 2001 From: Alex Raubach Date: Mon, 17 Sep 2018 20:22:13 -0400 Subject: [PATCH 4/5] Add a null check to RR author note placement --- sites/royalroad.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sites/royalroad.py b/sites/royalroad.py index be3c208..00eb684 100644 --- a/sites/royalroad.py +++ b/sites/royalroad.py @@ -55,8 +55,9 @@ class RoyalRoad(Site): author_note = soup.find_all('div', class_='author-note-portlet') if len(author_note) is 1: - # The first child div is either the chapter content or an author note - if 'author-note-portlet' in soup.find('div', class_='portlet-body').find('div', recursive=False)['class']: + # The first child div of portlet-body is either the chapter content or an author note + first_div = soup.find('div', class_='portlet-body').find('div', recursive=False) + if first_div and ('author-note-portlet' in first_div['class']): content = author_note[0].prettify() + '
' + content else: # The author note must be after the chapter content content = content + '
' + author_note[0].prettify() From 1ff009f89394bc8cb7da8b7aeccce79c5944a97b Mon Sep 17 00:00:00 2001 From: Alex Raubach Date: Thu, 27 Sep 2018 13:48:33 -0400 Subject: [PATCH 5/5] Improve Prechapter author note detection --- sites/royalroad.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sites/royalroad.py b/sites/royalroad.py index 00eb684..c29a2fc 100644 --- a/sites/royalroad.py +++ b/sites/royalroad.py @@ -55,9 +55,8 @@ class RoyalRoad(Site): author_note = soup.find_all('div', class_='author-note-portlet') if len(author_note) is 1: - # The first child div of portlet-body is either the chapter content or an author note - first_div = soup.find('div', class_='portlet-body').find('div', recursive=False) - if first_div and ('author-note-portlet' in first_div['class']): + # Find the parent of chapter-content and check if the author's note is the first child div + if 'author-note-portlet' in soup.find('div', class_='chapter-content').parent.find('div')['class']: content = author_note[0].prettify() + '
' + content else: # The author note must be after the chapter content content = content + '
' + author_note[0].prettify()