Add New Site: novelfull.com (#688)

* Add basic support for novelfull.com

* remove extra log line

* set status to in-progress when not completed

* leave description as html to rely on existing conversion

* force removal of paragraphs with chapter headers

The previous version sometimes finds text elements which don't have a
decompose method, so forcing Beautiful Soup to find paragraph tags
ensures this will not crash

* add authors separately

* parse genre too
This commit is contained in:
Alex Riina 2021-04-20 10:13:35 -04:00 committed by GitHub
parent 2cd6f53f76
commit 9bc70b79e6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 89 additions and 0 deletions

View file

@ -168,6 +168,7 @@ from . import adapter_wuxiaworldsite
from . import adapter_thesietchcom
from . import adapter_fastnovelnet
from . import adapter_squidgeworldorg
from . import adapter_novelfull
## This bit of complexity allows adapters to be added by just adding
## an import. It eliminates the long if/else clauses we used to need

View file

@ -0,0 +1,88 @@
from __future__ import absolute_import
import logging
import re
# py2 vs py3 transition
from ..six.moves.urllib import parse as urlparse
from .base_adapter import BaseSiteAdapter
logger = logging.getLogger(__name__)
def getClass():
    """Return the adapter class this module provides.

    Module-level factory used by the adapter registration machinery
    (the ``from . import adapter_novelfull`` list) to discover the
    site adapter without hard-coded if/else dispatch.
    """
    return NovelFullSiteAdapter
class NovelFullSiteAdapter(BaseSiteAdapter):
    """Site adapter for downloading stories from novelfull.com."""

    @staticmethod
    def getSiteDomain():
        """Return the domain this adapter handles."""
        return "novelfull.com"

    def getSiteURLPattern(self):
        """Return the regex a story URL must match for this site.

        The dot before ``html?`` is escaped so the pattern only accepts a
        literal ``.htm``/``.html`` suffix; an unescaped ``.`` would match
        any character (e.g. ``...Xhtml``).
        """
        return r"https?://%s/(?P<name>.+)\.html?" % re.escape(self.getSiteDomain())

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page; populate metadata and the chapter list."""
        data = self.get_request(self.url)
        soup = self.make_soup(data)

        self.story.setMetadata("title", soup.select_one("h3.title").text)

        # Authors appear as <a href="/author/..."> links following the
        # "Author:" heading; record id, profile URL and name for each.
        for author in soup.find("h3", text="Author:").fetchNextSiblings(
            "a", href=re.compile("/author/")
        ):
            self.story.addToList("authorId", author.text)
            self.story.addToList(
                "authorUrl", urlparse.urljoin(self.url, author.attrs["href"])
            )
            self.story.addToList("author", author.text)

        # The site only flags "Completed"; anything else is treated as
        # still in progress.
        status = soup.find("a", href=re.compile("status")).text
        if status == "Completed":
            self.story.setMetadata("status", "Completed")
        else:
            self.story.setMetadata("status", "In-Progress")

        cover_url = soup.find("div", class_="book").find("img").attrs["src"]
        self.setCoverImage(self.url, urlparse.urljoin(self.url, cover_url))

        self._crawl_chapters(self.url)

        # Leave the description as raw HTML so the existing html->text
        # conversion machinery can handle it.
        self.setDescription(self.url, soup.select_one("div.desc-text"))

        for genre in soup.find(class_="info").find_all("a", href=re.compile("/genre/")):
            self.story.addToList("genre", genre.text)

    def _crawl_chapters(self, url):
        """Walk the paginated chapter index, adding every chapter in order.

        Iterates instead of recursing so that stories with many index
        pages cannot exhaust Python's recursion limit; crawl order is
        identical to following each "next" link in turn.
        """
        while url:
            data = self.get_request(url)
            soup = self.make_soup(data)
            for a in soup.select("ul.list-chapter a"):
                self.add_chapter(a.attrs["title"], urlparse.urljoin(url, a.attrs["href"]))
            next_page = soup.select_one("#list-chapter .next a")
            url = urlparse.urljoin(url, next_page.attrs["href"]) if next_page else None

    def getChapterText(self, url):
        """Fetch one chapter page and return its cleaned-up HTML content."""
        data = self.get_request(url)
        soup = self.make_soup(data)
        content = soup.find(id="chapter-content")

        # Drop a duplicated "Chapter N:" header paragraph if present.
        # Restricting the search to <p> tags guarantees the match has a
        # decompose() method (bare text nodes do not).
        chapter_header = content.find("p", text=re.compile(r"Chapter \d+:"))
        if chapter_header:
            chapter_header.decompose()

        # Remove the generic promotional end-text appended to every book.
        for extra in content.find_all(attrs={"align": "left"}):
            extra.decompose()

        return self.utf8FromSoup(url, content)