From 0d5099dbe429146cebb33ba7e8aee6b03167b91f Mon Sep 17 00:00:00 2001 From: Ivan Kulikov Date: Mon, 9 Sep 2019 08:51:46 +0300 Subject: [PATCH] adapter_swiorgru: issues were fixed. (metadata parsing was added, adult check was added) --- fanficfare/adapters/adapter_swiorgru.py | 59 +++++++++++++++++++------ 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/fanficfare/adapters/adapter_swiorgru.py b/fanficfare/adapters/adapter_swiorgru.py index 83c6d3ec..ba81067d 100644 --- a/fanficfare/adapters/adapter_swiorgru.py +++ b/fanficfare/adapters/adapter_swiorgru.py @@ -39,11 +39,11 @@ class SwiOrgRuAdapter(BaseSiteAdapter): self._setURL('http://' + self.getSiteDomain() + '/mlp-fim/story/'+self.story.getMetadata('storyId')) # Each adapter needs to have a unique site abbreviation. - self.story.setMetadata('siteabbrev','swi') + self.story.setMetadata('siteabbrev','swiorgru') # The date format will vary from site to site. # http://docs.python.org/library/datetime.html#strftime-strptime-behavior - self.dateformat = "%b %d, %Y" + self.dateformat = "%Y.%m.%d" @staticmethod # must be @staticmethod, don't remove it. @@ -72,29 +72,64 @@ class SwiOrgRuAdapter(BaseSiteAdapter): soup = self.make_soup(data) title = soup.find('h1') + for tag in title.findAll('sup'): + tag.extract() - self.story.setMetadata('title', stripHTML(title)) + self.story.setMetadata('title', stripHTML(title.text)) logger.debug("Title: (%s)"%self.story.getMetadata('title')) - author_title = soup.find('strong', text = re.compile("Автор: ")) + author_title = soup.find('strong', text = re.compile(u"Автор: ")) if author_title == None: - logger.info('author_title no found... exiting') - sys.exit() + raise exceptions.FailedToDownload("Error downloading page: %s! Missing required author_title element!" 
% url) author = author_title.next_sibling self.story.setMetadata('authorId', author.text) # Author's name is unique self.story.setMetadata('authorUrl','http://'+self.host + author['href']) - self.story.setMetadata('author',author.text) + self.story.setMetadata('author', author.text) logger.debug("Author: (%s)"%self.story.getMetadata('author')) - - chapters_header = soup.find('h2', text = re.compile("Главы:")) + + date_pub = soup.find('em', text = re.compile(r'\d{4}.\d{2}.\d{2}')) + if not date_pub == None: + self.story.setMetadata('datePublished', makeDate(date_pub.text, self.dateformat)) + + rating_label = soup.find('strong', text = re.compile(u"рейтинг:")) + if not rating_label == None: + rating = rating_label.next_sibling.next_sibling + self.story.setMetadata('rating', stripHTML(rating)) + + if not self.is_adult or self.getConfig("is_adult"): + if "NC-18" in rating: + raise exceptions.AdultCheckRequired(self.url) + + characters = soup.findAll('img', src=re.compile(r"/mlp-fim/img/chars/\d+.png")) + logger.debug("numCharacters: (%s)"%str(len(characters))) + + for x in range(0,len(characters)): + character=characters[x] + self.story.addToList('characters', character['title']) + + if soup.find('font', color = r"green", text = u"завершен"): + self.story.setMetadata('status', 'Completed') + else: + self.story.setMetadata('status', 'In-Progress') + + categories_label = soup.find('strong', text = u"категории:") + if not categories_label == None: + categories_element = categories_label.next_sibling.next_sibling + categories = re.findall(r'"(.+?)"', categories_element.text) + for x in range(0, len(categories)): + category=categories[x] + self.story.addToList('category', category) + + chapters_header = soup.find('h2', text = re.compile(u"Главы:")) if chapters_header==None: - logger.info('chapters_header no found... exiting') - sys.exit() + raise exceptions.FailedToDownload("Error downloading page: %s! Missing required chapters_header element!" 
% url) chapters_table = chapters_header.next_sibling.next_sibling + self.story.setMetadata('language','Russian') + chapters=chapters_table.findAll('a', href=re.compile(r'/mlp-fim/story/'+self.story.getMetadata('storyId')+"/chapter\d+")) self.story.setMetadata('numChapters', len(chapters)) logger.debug("numChapters: (%s)"%str(self.story.getMetadata('numChapters'))) @@ -104,8 +139,6 @@ class SwiOrgRuAdapter(BaseSiteAdapter): churl='http://'+self.host+chapter['href'] self.add_chapter(chapter,churl) - self.story.setMetadata('language','Russian') - # grab the text for an individual chapter. def getChapterText(self, url): logger.debug('Getting chapter text from: %s' % url)