From c48c5dd35a1f8775a3fbde0b8fa74940dc64fbf5 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Sun, 5 Aug 2018 18:21:09 -0500 Subject: [PATCH] Fix adapters that used getMetadata(title), which can be changed by various settings. --- fanficfare/adapters/adapter_fireflypopulliorg.py | 12 ++++++++---- fanficfare/adapters/adapter_shriftweborgbfa.py | 5 +++-- fanficfare/adapters/adapter_unknowableroomorg.py | 15 ++++++++------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/fanficfare/adapters/adapter_fireflypopulliorg.py b/fanficfare/adapters/adapter_fireflypopulliorg.py index 07539ed1..41b45a9c 100644 --- a/fanficfare/adapters/adapter_fireflypopulliorg.py +++ b/fanficfare/adapters/adapter_fireflypopulliorg.py @@ -129,7 +129,8 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter): if not title: raise exceptions.StoryDoesNotExist('Cannot find title on the page {}'.format(url)) - self.story.setMetadata('title', stripHTML(soup.find('h2'))) + rawtitle = stripHTML(soup.find('h2')) + self.story.setMetadata('title', rawtitle) # This site has the entire story on one page, so we will be using the normalized URL as # the chapterUrl and the Title as the chapter Title @@ -153,7 +154,7 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter): if ',' in mdata: self.story.setMetadata('coauthor', ', '.join(mdata.split(',')[1:]).strip()) mdata = mdata.split(',')[0] - + # print mdata # self.story.getMetadata('coauthor') # sys.exit() @@ -184,13 +185,16 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter): if stories: for story in stories: # There alot of nbsp's (non broken spaces) in here, so I'm going to remove them - # I'm also getting rid of the bold tags and the nextline characters to make it + # I'm also getting rid of the bold tags and the nextline characters to make it # easier to get the information below story = repr(story).replace(u'\\xa0', '').replace(' ',' ').replace( '','').replace('','').replace(r'\n','') story = self.make_soup(story).find('p') story_a = story.find('a') - title = self.story.getMetadata('title').split('-')[0].strip() + # not sure why this split is here, but it caused + # problems when title_chapter_range_pattern + # introduces a '-', so save rawtitle --JM + title = rawtitle.split('-')[0].strip() if story_a.get_text() == title: story_found = True break diff --git a/fanficfare/adapters/adapter_shriftweborgbfa.py b/fanficfare/adapters/adapter_shriftweborgbfa.py index 261fcd7e..f662c4d7 100644 --- a/fanficfare/adapters/adapter_shriftweborgbfa.py +++ b/fanficfare/adapters/adapter_shriftweborgbfa.py @@ -129,7 +129,8 @@ class BFAArchiveShriftwebOrgSiteAdapter(BaseSiteAdapter): if not title: raise exceptions.StoryDoesNotExist('Cannot find title on the page {}'.format(url)) - self.story.setMetadata('title', stripHTML(title)) + rawtitle = stripHTML(title) + self.story.setMetadata('title', rawtitle) # This site has the entire story on one page, so we will be using the normalized URL as # the chapterUrl and the Title as the chapter Title @@ -182,7 +183,7 @@ class BFAArchiveShriftwebOrgSiteAdapter(BaseSiteAdapter): story = self.make_soup(story).find('div') story_a = story.find('a') ## some stories have special characters... need to fix them. - title = repr(self.story.getMetadata('title'))[2:-1].replace('&', '&') + title = repr(rawtitle)[2:-1].replace('&', '&') if title in story_a.get_text(): story_found = True break diff --git a/fanficfare/adapters/adapter_unknowableroomorg.py b/fanficfare/adapters/adapter_unknowableroomorg.py index b8ec3f50..494fc908 100644 --- a/fanficfare/adapters/adapter_unknowableroomorg.py +++ b/fanficfare/adapters/adapter_unknowableroomorg.py @@ -48,7 +48,7 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter): # 1252 is a superset of iso-8859-1. Most sites that claim to be iso-8859-1 (and some that # claim to be utf8) are really windows-1252. - self.decode = ["Windows-1252", "utf8", "iso-8859-1"] + self.decode = ["Windows-1252", "utf8", "iso-8859-1"] # Setting the adult status to false initially self.is_adult=False @@ -122,10 +122,11 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter): self.story.setMetadata('authorId', author) self.story.setMetadata('authorUrl', 'http://'+self.getSiteDomain()) self.story.setMetadata('author', author) - + ## Title - self.story.setMetadata('title',stripHTML(soup.find('h1')).replace( - 'by '+self.story.getMetadata('author'), '').strip()) + rawtitle = stripHTML(soup.find('h1')).replace( + 'by '+self.story.getMetadata('author'), '').strip() + self.story.setMetadata('title',rawtitle) # Find the chapters: for chapter in soup.find('select').find_all('option', value=re.compile( @@ -148,12 +149,12 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter): story_found = False for story in asoup.find('ul', {'id':'fic_list'}).find_all('li'): - if self.story.getMetadata('title') == stripHTML(story.a): + if rawtitle == stripHTML(story.a): story_found = True break else: story_found = False - + if not story_found: raise exceptions.StoryDoesNotExist("Cannot find story '{}' on author's page '{}'".format( url, self.story.getMetadata('authorUrl'))) @@ -200,7 +201,7 @@ class UnknowableRoomOrgSiteAdapter(BaseSiteAdapter): 'rd,', ',').replace('th,', ',').replace('.', '').strip() self.story.setMetadata('dateUpdated', makeDate(value, self.dateformat)) - # I'm going to add the disclaimer + # I'm going to add the disclaimer disclaimer = soup.find('strong', {'id':'disclaimer'}) if disclaimer: self.story.setMetadata('disclaimer', stripHTML(disclaimer).replace(