diff --git a/fanficfare/adapters/__init__.py b/fanficfare/adapters/__init__.py
index 90daed5b..d04df838 100644
--- a/fanficfare/adapters/__init__.py
+++ b/fanficfare/adapters/__init__.py
@@ -168,6 +168,7 @@ from . import adapter_wuxiaworldsite
 from . import adapter_thesietchcom
 from . import adapter_fastnovelnet
 from . import adapter_squidgeworldorg
+from . import adapter_novelfull
 
 ## This bit of complexity allows adapters to be added by just adding
 ## importing. It eliminates the long if/else clauses we used to need
diff --git a/fanficfare/adapters/adapter_novelfull.py b/fanficfare/adapters/adapter_novelfull.py
new file mode 100644
index 00000000..33bf26c5
--- /dev/null
+++ b/fanficfare/adapters/adapter_novelfull.py
@@ -0,0 +1,88 @@
+from __future__ import absolute_import
+import logging
+import re
+
+# py2 vs py3 transition
+from ..six.moves.urllib import parse as urlparse
+
+from .base_adapter import BaseSiteAdapter
+
+
+logger = logging.getLogger(__name__)
+
+
+def getClass():
+    return NovelFullSiteAdapter
+
+
+class NovelFullSiteAdapter(BaseSiteAdapter):
+    @staticmethod
+    def getSiteDomain():
+        return "novelfull.com"
+
+    def getSiteURLPattern(self):
+        # Story pages look like https://novelfull.com/<slug>.html; the dot is
+        # escaped so it cannot match an arbitrary character, and the named
+        # group captures the slug.  NOTE(review): group name "id" is assumed —
+        # confirm against what BaseSiteAdapter expects from this pattern.
+        return r"https?://%s/(?P<id>.+)\.html?" % re.escape(self.getSiteDomain())
+
+    def extractChapterUrlsAndMetadata(self):
+        data = self.get_request(self.url)
+
+        soup = self.make_soup(data)
+
+        self.story.setMetadata("title", soup.select_one("h3.title").text)
+
+        for author in soup.find("h3", text="Author:").fetchNextSiblings(
+            "a", href=re.compile("/author/")
+        ):
+            self.story.addToList("authorId", author.text)
+            self.story.addToList(
+                "authorUrl", urlparse.urljoin(self.url, author.attrs["href"])
+            )
+            self.story.addToList("author", author.text)
+
+        status = soup.find("a", href=re.compile("status")).text
+
+        if status == "Completed":
+            self.story.setMetadata("status", "Completed")
+        else:
+            self.story.setMetadata("status", "In-Progress")
+
+        cover_url = soup.find("div", class_="book").find("img").attrs["src"]
+        self.setCoverImage(self.url, urlparse.urljoin(self.url, cover_url))
+
+        self._crawl_chapters(self.url)
+
+        self.setDescription(self.url, soup.select_one("div.desc-text"))
+
+        for genre in soup.find(class_="info").find_all("a", href=re.compile("/genre/")):
+            self.story.addToList("genre", genre.text)
+
+    def _crawl_chapters(self, url):
+        data = self.get_request(url)
+        soup = self.make_soup(data)
+
+        for a in soup.select("ul.list-chapter a"):
+            self.add_chapter(a.attrs["title"], urlparse.urljoin(url, a.attrs["href"]))
+
+        next_page = soup.select_one("#list-chapter .next a")
+
+        if next_page:
+            self._crawl_chapters(urlparse.urljoin(url, next_page.attrs["href"]))
+
+    def getChapterText(self, url):
+        data = self.get_request(url)
+        soup = self.make_soup(data)
+
+        content = soup.find(id="chapter-content")
+
+        # Remove chapter header if present
+        chapter_header = content.find("p", text=re.compile(r"Chapter \d+:"))
+
+        if chapter_header:
+            chapter_header.decompose()
+
+        # Remove generic end-text added to all books
+
+        for extra in content.find_all(attrs={"align": "left"}):
+            extra.decompose()
+
+        return self.utf8FromSoup(url, content)