From e68d2484a6a9001e98dcf4b4c4f166777f1be5a9 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Wed, 6 Apr 2016 10:29:39 -0500 Subject: [PATCH] Updates to adapter_literotica from davidfor --- calibre-plugin/plugin-defaults.ini | 11 ++ fanficfare/adapters/adapter_literotica.py | 227 +++++++++++++++------- fanficfare/configurable.py | 10 +- fanficfare/defaults.ini | 11 ++ 4 files changed, 191 insertions(+), 68 deletions(-) diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index c9bd984e..c8653821 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -1339,6 +1339,17 @@ eroticatags_label:Erotica Tags averrating_label:Average Rating extra_titlepage_entries:eroticatags,averrating +## Extract more erotica_tags from the meta tag of each chapter +use_meta_keywords: true + +## For multiple chapter stories, attempt to clean up the chapter title. This will +## remove the story title and change "Ch. 01" to "Chapter 1", "Pt. 01" to "Part 1" +## or just use the text. If this can't be done, the full title is used. +clean_chapter_titles: false + +## Add the chapter description at the start of each chapter. +description_in_chapter: false + [lotrfanfiction.com] extra_valid_entries: readings readings_label: Readings diff --git a/fanficfare/adapters/adapter_literotica.py b/fanficfare/adapters/adapter_literotica.py index f4f81a03..c56e1215 100644 --- a/fanficfare/adapters/adapter_literotica.py +++ b/fanficfare/adapters/adapter_literotica.py @@ -32,13 +32,15 @@ class LiteroticaSiteAdapter(BaseSiteAdapter): def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) + logger.debug("LiteroticaComAdapter:__init__ - url='%s'" % url) self.decode = ["utf8", "Windows-1252"] # 1252 is a superset of iso-8859-1. - # Most sites that claim to be - # iso-8859-1 (and some that claim to be - # utf8) are really windows-1252. 
+ # Most sites that claim to be + # iso-8859-1 (and some that claim to be + # utf8) are really windows-1252. + # Each adapter needs to have a unique site abbreviation. self.story.setMetadata('siteabbrev','litero') # normalize to first chapter. Not sure if they ever have more than 2 digits. @@ -61,7 +63,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter): # The date format will vary from site to site. # http://docs.python.org/library/datetime.html#strftime-strptime-behavior - self.dateformat = '%m/%d/%y' + self.dateformat = "%m/%d/%y" @staticmethod def getSiteDomain(): @@ -95,6 +97,18 @@ class LiteroticaSiteAdapter(BaseSiteAdapter): def getSiteURLPattern(self): return r"https?://(www|german|spanish|french|dutch|italian|romanian|portuguese|other)(\.i)?\.literotica\.com/s/([a-zA-Z0-9_-]+)" + def getCategories(self, soup): + if self.getConfig("use_meta_keywords"): + categories = soup.find("meta", {"name":"keywords"})['content'].split(', ') + categories = [c for c in categories if not self.story.getMetadata('title') in c] + if self.story.getMetadata('author') in categories: + categories.remove(self.story.getMetadata('author')) + logger.debug("Meta = %s" % categories) + for category in categories: + # logger.debug("\tCategory=%s" % category) +# self.story.addToList('category', category.title()) + self.story.addToList('eroticatags', category.title()) + def extractChapterUrlsAndMetadata(self): """ NOTE: Some stories can have versions, @@ -118,6 +132,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter): raise exceptions.AdultCheckRequired(self.url) logger.debug("Chapter/Story URL: <%s> " % self.url) + try: data1 = self._fetchUrl(self.url) soup1 = self.make_soup(data1) @@ -144,6 +159,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter): soupAuth = self.make_soup(dataAuth) #strip comments from soup [comment.extract() for comment in soupAuth.findAll(text=lambda text:isinstance(text, Comment))] +# logger.debug(soupAuth) except urllib2.HTTPError, e: if e.code == 404: raise 
exceptions.StoryDoesNotExist(authorurl) @@ -154,6 +170,15 @@ class LiteroticaSiteAdapter(BaseSiteAdapter): ## site has started using //domain.name/asdf urls remove https?: from front ## site has started putting https back on again. storyLink = soupAuth.find('a', href=re.compile(r'(https?:)?'+re.escape(self.url[self.url.index(':')+1:]))) +# storyLink = soupAuth.find('a', href=self.url)#[self.url.index(':')+1:]) + + if storyLink is not None: + # pull the published date from the author page + # default values from single link. Updated below if multiple chapter. + logger.debug("Found story on the author page.") + date = storyLink.parent.parent.findAll('td')[-1].text + self.story.setMetadata('datePublished', makeDate(date, self.dateformat)) + self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat)) if storyLink is not None: urlTr = storyLink.parent.parent @@ -165,9 +190,14 @@ class LiteroticaSiteAdapter(BaseSiteAdapter): raise exceptions.FailedToDownload("Couldn't find story <%s> on author's page <%s>" % (self.url, authorurl)) if isSingleStory: - self.story.setMetadata('title', storyLink.text) - self.setDescription(authorurl,urlTr.findAll("td")[1].text) - self.story.addToList('eroticatags', urlTr.findAll("td")[2].text) +# self.chapterUrls = [(soup1.h1.string, self.url)] +# self.story.setMetadata('title', soup1.h1.string) + + self.story.setMetadata('title', storyLink.text.strip('/')) + logger.debug('Title: "%s"' % storyLink.text.strip('/')) + self.story.setMetadata('description', urlTr.findAll("td")[1].text) + self.story.addToList('category', urlTr.findAll("td")[2].text) +# self.story.addToList('eroticatags', urlTr.findAll("td")[2].text) date = urlTr.findAll('td')[-1].text self.story.setMetadata('datePublished', makeDate(date, self.dateformat)) self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat)) @@ -175,13 +205,19 @@ class LiteroticaSiteAdapter(BaseSiteAdapter): averrating = stripHTML(storyLink.parent) ## title (0.00) averrating = 
averrating[averrating.rfind('(')+1:averrating.rfind(')')] - self.story.setMetadata('averrating',averrating) + try: + self.story.setMetadata('averrating', float(averrating)) + except: + pass +# self.story.setMetadata('averrating',averrating) + # parse out the list of chapters else: seriesTr = urlTr.previousSibling while 'ser-ttl' not in seriesTr['class']: seriesTr = seriesTr.previousSibling m = re.match("^(?P<title>.*?):\s(?P<numChapters>\d+)\sPart\sSeries$", seriesTr.find("strong").text) self.story.setMetadata('title', m.group('title')) + seriesTitle = m.group('title') ## Walk the chapters chapterTr = seriesTr.nextSibling @@ -189,88 +225,149 @@ class LiteroticaSiteAdapter(BaseSiteAdapter): dates = [] descriptions = [] ratings = [] + chapters = [] while chapterTr is not None and 'sl' in chapterTr['class']: - descriptions.append("%d. %s" % (len(descriptions)+1,stripHTML(chapterTr.findAll("td")[1])) ) + description = "%d. %s" % (len(descriptions)+1,stripHTML(chapterTr.findAll("td")[1])) + description = stripHTML(chapterTr.findAll("td")[1]) chapterLink = chapterTr.find("td", "fc").find("a") - if not chapterLink["href"].startswith('http'): - chapterLink["href"] = "http:" + chapterLink["href"] - self.chapterUrls.append((chapterLink.text, chapterLink["href"])) self.story.addToList('eroticatags', chapterTr.findAll("td")[2].text) - dates.append(makeDate(chapterTr.findAll('td')[-1].text, self.dateformat)) + pub_date = makeDate(chapterTr.findAll('td')[-1].text, self.dateformat) + dates.append(pub_date) chapterTr = chapterTr.nextSibling + + chapter_title = chapterLink.text + if self.getConfig("clean_chapter_titles"): + logger.debug('\tChapter Name: "%s"' % chapterLink.string) + logger.debug('\tChapter Name: "%s"' % chapterLink.text) + if chapterLink.text.lower().startswith(seriesTitle.lower()): + chapter = chapterLink.text[len(seriesTitle):].strip() + logger.debug('\tChapter: "%s"' % chapter) + if chapter == '': + chapter_title = 'Chapter %d' % (len(self.chapterUrls) + 1) + else: + 
separater_char = chapter[0] + logger.debug('\tseparater_char: "%s"' % separater_char) + chapter = chapter[1:].strip() if separater_char in [":", "-"] else chapter + logger.debug('\tChapter: "%s"' % chapter) + if chapter.lower().startswith('ch.'): + chapter = chapter[len('ch.'):] + try: + chapter_title = 'Chapter %d' % int(chapter) + except: + chapter_title = 'Chapter %s' % chapter + elif chapter.lower().startswith('pt.'): + chapter = chapter[len('pt.'):] + try: + chapter_title = 'Part %d' % int(chapter) + except: + chapter_title = 'Part %s' % chapter + elif separater_char in [":", "-"]: + chapter_title = chapter + + # if chapter_title == '': + # chapter_title = chapterLink.string + + # pages include full URLs. + chapurl = chapterLink['href'] + if chapurl.startswith('//'): + chapurl = self.parsedUrl.scheme + ':' + chapurl + logger.debug("Chapter URL: " + chapurl) + logger.debug("Chapter Title: " + chapter_title) + logger.debug("Chapter description: " + description) + chapters.append((chapter_title, chapurl, description, pub_date)) +# self.chapterUrls.append((chapter_title, chapurl)) numrating = stripHTML(chapterLink.parent) ## title (0.00) numrating = numrating[numrating.rfind('(')+1:numrating.rfind(')')] - ratings.append(float(numrating)) - - ## Set description to joint chapter descriptions - self.setDescription(authorurl,"<p>"+"</p>\n<p>".join(descriptions)+"</p>") + try: + ratings.append(float(numrating)) + except: + pass + chapters = sorted(chapters, key=lambda chapter: chapter[3]) + for i, chapter in enumerate(chapters): + self.chapterUrls.append((chapter[0], chapter[1])) + descriptions.append("%d. 
%s" % (i + 1, chapter[2])) ## Set the oldest date as publication date, the newest as update date dates.sort() self.story.setMetadata('datePublished', dates[0]) self.story.setMetadata('dateUpdated', dates[-1]) + self.story.setMetadata('datePublished', chapters[0][3]) + self.story.setMetadata('dateUpdated', chapters[-1][3]) + ## Set description to joint chapter descriptions + self.setDescription(authorurl,"<p>"+"</p>\n<p>".join(descriptions)+"</p>") - # normalize on first chapter URL. - self._setURL(self.chapterUrls[0][1]) + if len(ratings) > 0: + self.story.setMetadata('averrating','%4.2f' % (sum(ratings) / float(len(ratings)))) - self.story.setMetadata('averrating','%4.2f' % (sum(ratings) / float(len(ratings)))) + # normalize on first chapter URL. + self._setURL(self.chapterUrls[0][1]) + + # reset storyId to first chapter. + self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2]) self.story.setMetadata('numChapters', len(self.chapterUrls)) - # set storyId to 'title-author' to avoid duplicates - # self.story.setMetadata('storyId', - # re.sub("[^a-z0-9]", "", self.story.getMetadata('title').lower()) - # + "-" - # + re.sub("[^a-z0-9]", "", self.story.getMetadata('author').lower())) + self.story.setMetadata('category', soup1.find('div', 'b-breadcrumbs').findAll('a')[1].string) + self.getCategories(soup1) +# self.story.setMetadata('description', soup1.find('meta', {'name': 'description'})['content']) return + + def getPageText(self, raw_page, url): + logger.debug('Getting page text') +# logger.debug(soup) + raw_page = raw_page.replace('<div class="b-story-body-x x-r15"><div><p>','<div class="b-story-body-x x-r15"><div>') +# logger.debug("\tChapter text: %s" % raw_page) + page_soup = self.make_soup(raw_page) + [comment.extract() for comment in page_soup.findAll(text=lambda text:isinstance(text, Comment))] + story2 = page_soup.find('div', 'b-story-body-x').div +# logger.debug("getPageText- name div div...") +# logger.debug(soup) +# 
story2.append(page_soup.new_tag('br')) + div = self.utf8FromSoup(url, story2) +# logger.debug(div) + + fullhtml = str(div) +# logger.debug(fullhtml) + fullhtml = re.sub(r'<br />\s*<br />', r'</p><p>', fullhtml) + fullhtml = re.sub(r'^<div>', r'', fullhtml) + fullhtml = re.sub(r'</div>$', r'', fullhtml) + fullhtml = re.sub(r'(<p><br/></p>\s+)+$', r'', fullhtml) +# logger.debug(fullhtml) + return fullhtml + def getChapterText(self, url): - logger.debug('Getting chapter text from <%s>' % url) - data1 = self._fetchUrl(url) - # brute force approach to replace the wrapping <p> tag. If - # done by changing tag name, it causes problems with nested - # <p> tags. - data1 = data1.replace('<div class="b-story-body-x x-r15"><div><p>','<div class="b-story-body-x x-r15"><div>') - soup1 = self.make_soup(data1) - #strip comments from soup - [comment.extract() for comment in soup1.findAll(text=lambda text:isinstance(text, Comment))] + logger.debug('Getting chapter text from: %s' % url) - # get story text - story1 = soup1.find('div', 'b-story-body-x').div - #print("story1:%s"%story1) - # story1.name='div' - story1.append(soup1.new_tag('br')) - storytext = self.utf8FromSoup(url,story1) + raw_page = self._fetchUrl(url) + page_soup = self.make_soup(raw_page) + pages = page_soup.find('select', {'name' : 'page'}) + page_nums = [page.text for page in pages.findAll('option')] if pages else 0 - # find num pages - pgs = int(soup1.find("span", "b-pager-caption-t r-d45").string.split(' ')[0]) - logger.debug("pages: "+unicode(pgs)) + fullhtml = "" + self.getCategories(page_soup) + if self.getConfig("description_in_chapter"): + chapter_description = page_soup.find("meta", {"name" : "description"})['content'] + logger.debug("\tChapter description: %s" % chapter_description) + fullhtml += '<p><b>Description:</b> %s</p><hr />' % chapter_description + fullhtml += self.getPageText(raw_page, url) + if pages: + for page_no in xrange(2, len(page_nums) + 1): + page_url = url + "?page=%s" % page_no + 
logger.debug("page_url= %s" % page_url) + raw_page = self._fetchUrl(page_url) + fullhtml += self.getPageText(raw_page, url) + +# fullhtml = self.utf8FromSoup(url, bs.BeautifulSoup(fullhtml)) +# fullhtml = re.sub(r'^<div>', r'', fullhtml) +# fullhtml = re.sub(r'</div>$', r'', fullhtml) +# if None == div: +# raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url) - # get all the pages - for i in xrange(2, pgs+1): - try: - logger.debug("fetching page "+unicode(i)) - time.sleep(0.5) - data2 = self._fetchUrl(url, {'page': i}) - # brute force approach to replace the wrapping <p> tag. If - # done by changing tag name, it causes problems with nested - # <p> tags. - data2 = data2.replace('<div class="b-story-body-x x-r15"><div><p>','<div class="b-story-body-x x-r15"><div>') - soup2 = self.make_soup(data2) - [comment.extract() for comment in soup2.findAll(text=lambda text:isinstance(text, Comment))] - story2 = soup2.find('div', 'b-story-body-x').div - # story2.name='div' - story2.append(soup2.new_tag('br')) - storytext += self.utf8FromSoup(url,story2) - except urllib2.HTTPError, e: - if e.code == 404: - raise exceptions.StoryDoesNotExist(url) - else: - raise e - return storytext + return fullhtml def getClass(): diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index d684b38e..39ae7ad4 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -166,6 +166,10 @@ def get_valid_set_options(): 'pairingcat_to_characters_ships':(['tthfanfic.org'],None,boollist), 'romancecat_to_characters_ships':(['tthfanfic.org'],None,boollist), + 'use_meta_keywords':(['literotica.com'],None,boollist), + 'clean_chapter_titles':(['literotica.com'],None,boollist), + 'description_in_chapter':(['literotica.com'],None,boollist), + # eFiction Base adapters allow bulk_load # kept forgetting to add them, so now it's automatic. 
'bulk_load':(adapters.get_bulk_load_sites(), @@ -184,9 +188,6 @@ def get_valid_set_options(): 'minimum_threadmarks':(base_xenforo_list,None,None), 'first_post_title':(base_xenforo_list,None,None), 'always_include_first_post':(base_xenforo_list,None,boollist), - '':(base_xenforo_list,None,boollist), - '':(base_xenforo_list,None,boollist), - '':(base_xenforo_list,None,boollist), } return dict(valdict) @@ -326,6 +327,9 @@ def get_valid_keywords(): 'centeredcat_to_characters', 'pairingcat_to_characters_ships', 'romancecat_to_characters_ships', + 'use_meta_keywords', + 'clean_chapter_titles', + 'description_in_chapter', 'titlepage_end', 'titlepage_entries', 'titlepage_entry', diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index 8e976e1f..962f1785 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -1327,6 +1327,17 @@ eroticatags_label:Erotica Tags averrating_label:Average Rating extra_titlepage_entries:eroticatags,averrating +## Extract more erotica_tags from the meta tag of each chapter +use_meta_keywords: true + +## For multiple chapter stories, attempt to clean up the chapter title. This will +## remove the story title and change "Ch. 01" to "Chapter 1", "Pt. 01" to "Part 1" +## or just use the text. If this can't be done, the full title is used. +clean_chapter_titles: false + +## Add the chapter description at the start of each chapter. +description_in_chapter: false + [lotrfanfiction.com] extra_valid_entries: readings readings_label: Readings