Merge pull request #171 from davidfor/master

Storiesonline and Literotica updates
This commit is contained in:
Jim Miller 2017-04-06 10:46:30 -05:00 committed by GitHub
commit cd0178030c
5 changed files with 42 additions and 11 deletions

View file

@ -1475,6 +1475,10 @@ extra_titlepage_entries:eroticatags,averrating
## Extract more erotica_tags from the meta tag of each chapter
use_meta_keywords: true
## Chapters can be in different categories. Set to true to use the categories of
## all chapters. Defaults to false to be consistent with the previous version.
chapter_categories_use_all: false
## For multiple chapter stories, attempt to clean up the chapter title. This will
## remove the story title and change "Ch. 01" to "Chapter 1", "Pt. 01" to "Part 1"
## or just use the text. If this can't be done, the full title is used.

View file

@ -227,6 +227,8 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
description = "%d. %s" % (len(descriptions)+1,stripHTML(chapterTr.findAll("td")[1]))
description = stripHTML(chapterTr.findAll("td")[1])
chapterLink = chapterTr.find("td", "fc").find("a")
if self.getConfig('chapter_categories_use_all'):
self.story.addToList('category', chapterTr.findAll("td")[2].text)
self.story.addToList('eroticatags', chapterTr.findAll("td")[2].text)
pub_date = makeDate(chapterTr.findAll('td')[-1].text, self.dateformat)
dates.append(pub_date)
@ -305,7 +307,8 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
self.story.setMetadata('numChapters', len(self.chapterUrls))
self.story.setMetadata('category', soup1.find('div', 'b-breadcrumbs').findAll('a')[1].string)
# Add the category from the breadcrumb. This might duplicate a category already added.
self.story.addToList('category', soup1.find('div', 'b-breadcrumbs').findAll('a')[1].string)
self.getCategories(soup1)
# self.story.setMetadata('description', soup1.find('meta', {'name': 'description'})['content'])

View file

@ -169,19 +169,20 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
a = soup.find('h1')
self.story.setMetadata('title',stripHTML(a))
notice = soup.find('div', {'class' : 'notice'})
if notice:
self.story.setMetadata('notice',unicode(notice))
# Find authorid and URL from... author url.
for a in soup.findAll('a', href=re.compile(r"/a/\w+")):
nav_section = soup.find('nav')
for a in nav_section.findAll('a', {'rel' : 'author'}):
self.story.addToList('authorId',a['href'].split('/')[2])
self.story.addToList('authorUrl','http://'+self.host+a['href'])
self.story.addToList('author',stripHTML(a).replace("'s Page",""))
# The rest of the metadata is within the article tag.
soup = soup.find('article')
# Find the chapters:
chapters = soup.findAll('a', href=re.compile(r'^/s/'+self.story.getMetadata('storyId')+":\d+(/.*)?$"))
if len(chapters) != 0:
logger.debug("Number of chapters: {0}".format(len(chapters)))
for chapter in chapters:
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+chapter['href']))
@ -192,18 +193,17 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
# surprisingly, the detailed page does not give enough details, so go to author's page
page=0
i=0
while i == 0:
story_found = False
while not story_found:
page = page + 1
data = self._fetchUrl(self.story.getList('authorUrl')[0]+"/"+unicode(page))
asoup = self.make_soup(data)
a = asoup.findAll('td', {'class' : 'lc2'})
for lc2 in a:
if lc2.find('a', href=re.compile(r'^/s/'+self.story.getMetadata('storyId'))):
i=1
story_found = True
break
if a[len(a)-1] == lc2:
page=page+1
for cat in lc2.findAll('div', {'class' : 'typediv'}):
self.story.addToList('genre',cat.text)
@ -352,6 +352,24 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
else:
self.story.setMetadata('status', 'Completed')
# Remove all the metadata elements to leave any preamble text. This is usually
# a notice or a foreword.
if len(self.chapterUrls) > 1:
header = soup.find('header')
header.extract()
else:
soup = soup.find('header')
# Remove some tags based on their class or id
elements_to_remove = ['#det-link', '#s-details', '#index-list', '#s-title', '#s-auth', '.copy']
if not self.getConfig('include_images'):
elements_to_remove.append('img')
for element_name in elements_to_remove:
elements = soup.select(element_name)
for element in elements:
element.extract()
if len(soup.contents ) > 0 and (len(soup.text.strip()) > 0 or len(soup.find_all('img')) > 0):
self.story.setMetadata('notice', self.utf8FromSoup(url, soup))
# grab the text for an individual chapter.
def getChapterText(self, url):

View file

@ -209,6 +209,7 @@ def get_valid_set_options():
'romancecat_to_characters_ships':(['tthfanfic.org'],None,boollist),
'use_meta_keywords':(['literotica.com'],None,boollist),
'chapter_categories_use_all':(['literotica.com'],None,boollist),
'clean_chapter_titles':(['literotica.com'],None,boollist),
'description_in_chapter':(['literotica.com'],None,boollist),
@ -389,6 +390,7 @@ def get_valid_keywords():
'pairingcat_to_characters_ships',
'romancecat_to_characters_ships',
'use_meta_keywords',
'chapter_categories_use_all',
'clean_chapter_titles',
'description_in_chapter',
'inject_chapter_title',

View file

@ -1497,6 +1497,10 @@ extra_titlepage_entries:eroticatags,averrating
## Extract more erotica_tags from the meta tag of each chapter
use_meta_keywords: true
## Chapters can be in different categories. Set to true to use the categories of
## all chapters. Defaults to false to be consistent with the previous version.
chapter_categories_use_all: false
## For multiple chapter stories, attempt to clean up the chapter title. This will
## remove the story title and change "Ch. 01" to "Chapter 1", "Pt. 01" to "Part 1"
## or just use the text. If this can't be done, the full title is used.