Fix adapters that used getMetadata('title'), whose value can be changed by various settings; keep the raw title for comparisons instead.

This commit is contained in:
Jim Miller 2018-08-05 18:21:09 -05:00
parent 5c49248700
commit c48c5dd35a
3 changed files with 19 additions and 13 deletions

View file

@ -129,7 +129,8 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter):
if not title:
raise exceptions.StoryDoesNotExist('Cannot find title on the page {}'.format(url))
self.story.setMetadata('title', stripHTML(soup.find('h2')))
rawtitle = stripHTML(soup.find('h2'))
self.story.setMetadata('title', rawtitle)
# This site has the entire story on one page, so we will be using the normalized URL as
# the chapterUrl and the Title as the chapter Title
@ -153,7 +154,7 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter):
if ',' in mdata:
self.story.setMetadata('coauthor', ', '.join(mdata.split(',')[1:]).strip())
mdata = mdata.split(',')[0]
# print mdata
# self.story.getMetadata('coauthor')
# sys.exit()
@ -184,13 +185,16 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter):
if stories:
for story in stories:
# There are a lot of nbsp's (non-breaking spaces) in here, so I'm going to remove them
# I'm also getting rid of the bold tags and the nextline characters to make it
# I'm also getting rid of the bold tags and the nextline characters to make it
# easier to get the information below
story = repr(story).replace(u'\\xa0', '').replace(' ',' ').replace(
'<b>','').replace('</b>','').replace(r'\n','')
story = self.make_soup(story).find('p')
story_a = story.find('a')
title = self.story.getMetadata('title').split('-')[0].strip()
# not sure why this split is here, but it caused
# problems when title_chapter_range_pattern
# introduces a '-', so save rawtitle --JM
title = rawtitle.split('-')[0].strip()
if story_a.get_text() == title:
story_found = True
break

View file

@ -129,7 +129,8 @@ class BFAArchiveShriftwebOrgSiteAdapter(BaseSiteAdapter):
if not title:
raise exceptions.StoryDoesNotExist('Cannot find title on the page {}'.format(url))
self.story.setMetadata('title', stripHTML(title))
rawtitle = stripHTML(title)
self.story.setMetadata('title', rawtitle)
# This site has the entire story on one page, so we will be using the normalized URL as
# the chapterUrl and the Title as the chapter Title
@ -182,7 +183,7 @@ class BFAArchiveShriftwebOrgSiteAdapter(BaseSiteAdapter):
story = self.make_soup(story).find('div')
story_a = story.find('a')
## some stories have special characters... need to fix them.
title = repr(self.story.getMetadata('title'))[2:-1].replace('&amp;', '&')
title = repr(rawtitle)[2:-1].replace('&amp;', '&')
if title in story_a.get_text():
story_found = True
break

View file

@ -48,7 +48,7 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter):
# 1252 is a superset of iso-8859-1. Most sites that claim to be iso-8859-1 (and some that
# claim to be utf8) are really windows-1252.
self.decode = ["Windows-1252", "utf8", "iso-8859-1"]
self.decode = ["Windows-1252", "utf8", "iso-8859-1"]
# Setting the adult status to false initially
self.is_adult=False
@ -122,10 +122,11 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter):
self.story.setMetadata('authorId', author)
self.story.setMetadata('authorUrl', 'http://'+self.getSiteDomain())
self.story.setMetadata('author', author)
## Title
self.story.setMetadata('title',stripHTML(soup.find('h1')).replace(
'by '+self.story.getMetadata('author'), '').strip())
rawtitle = stripHTML(soup.find('h1')).replace(
'by '+self.story.getMetadata('author'), '').strip()
self.story.setMetadata('title',rawtitle)
# Find the chapters:
for chapter in soup.find('select').find_all('option', value=re.compile(
@ -148,12 +149,12 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter):
story_found = False
for story in asoup.find('ul', {'id':'fic_list'}).find_all('li'):
if self.story.getMetadata('title') == stripHTML(story.a):
if rawtitle == stripHTML(story.a):
story_found = True
break
else:
story_found = False
if not story_found:
raise exceptions.StoryDoesNotExist("Cannot find story '{}' on author's page '{}'".format(
url, self.story.getMetadata('authorUrl')))
@ -200,7 +201,7 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter):
'rd,', ',').replace('th,', ',').replace('.', '').strip()
self.story.setMetadata('dateUpdated', makeDate(value, self.dateformat))
# I'm going to add the disclaimer
# I'm going to add the disclaimer
disclaimer = soup.find('strong', {'id':'disclaimer'})
if disclaimer:
self.story.setMetadata('disclaimer', stripHTML(disclaimer).replace(