diff --git a/fanficfare/adapters/adapter_archiveofourownorg.py b/fanficfare/adapters/adapter_archiveofourownorg.py index 79263cc5..53b6adc6 100644 --- a/fanficfare/adapters/adapter_archiveofourownorg.py +++ b/fanficfare/adapters/adapter_archiveofourownorg.py @@ -18,56 +18,20 @@ from __future__ import absolute_import import logging logger = logging.getLogger(__name__) -import re -import json -from ..six import text_type as unicode -from ..htmlcleanup import stripHTML -from .. import exceptions as exceptions - -# py2 vs py3 transition - -from .base_adapter import BaseSiteAdapter, makeDate +from .base_otw_adapter import BaseOTWAdapter def getClass(): return ArchiveOfOurOwnOrgAdapter -class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter): +class ArchiveOfOurOwnOrgAdapter(BaseOTWAdapter): def __init__(self, config, url): - BaseSiteAdapter.__init__(self, config, url) - - self.username = "NoneGiven" # if left empty, site doesn't return any message at all. - self.password = "" - self.is_adult=False - self.addurl = "" - - self.full_work_soup = None - self.full_work_chapters = None - self.use_full_work_soup = True - - # get storyId from url--url validation guarantees query is only sid=1234 - self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2]) - - # get storyId from url--url validation guarantees query correct - m = re.match(self.getSiteURLPattern(),url) - if m: - self.story.setMetadata('storyId',m.group('id')) - - # normalized story URL. - self._setURL('https://' + self.getSiteDomain() + '/works/'+self.story.getMetadata('storyId')) - else: - raise exceptions.InvalidStoryURL(url, - self.getSiteDomain(), - self.getSiteExampleURLs()) + BaseOTWAdapter.__init__(self, config, url) # Each adapter needs to have a unique site abbreviation. self.story.setMetadata('siteabbrev','ao3') - # The date format will vary from site to site. - # http://docs.python.org/library/datetime.html#strftime-strptime-behavior - self.dateformat = "%Y-%b-%d" - @staticmethod # must be @staticmethod, don't remove it. def getSiteDomain(): # The site domain. Does have www here, if it uses it. @@ -90,567 +54,3 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter): 'download.archiveofourown.net', 'ao3.org', ] - - @classmethod - def getSiteExampleURLs(cls): - return "https://"+cls.getSiteDomain()+"/works/123456 https://"+cls.getSiteDomain()+"/collections/Some_Archive/works/123456 https://"+cls.getSiteDomain()+"/works/123456/chapters/78901" - - def getSiteURLPattern(self): - # https://archiveofourown.org/collections/Smallville_Slash_Archive/works/159770 - # Discard leading zeros from story ID numbers--AO3 doesn't use them in it's own chapter URLs. - # logger.debug(r"https?://" + r"|".join([x.replace('.','\.') for x in self.getAcceptDomains()]) + r"(/collections/[^/]+)?/works/0*(?P\d+)") - return r"https?://(" + r"|".join([x.replace('.',r'\.') for x in self.getAcceptDomains()]) + r")(/collections/[^/]+)?/works/0*(?P\d+)" - - @classmethod - def get_section_url(cls,url): - ## minimal URL used for section names in INI and reject list - ## for comparison - # logger.debug("pre--url:%s"%url) - ## https://archiveofourown.org/works/19334905/chapters/71697933 - # http://archiveofourown.org/works/34686793/chapters/89043733 - url = re.sub(r'^https?://(.*/works/\d+).*$',r'https://\1',url) - # logger.debug("post-url:%s"%url) - return url - - ## Login - def needToLoginCheck(self, data): - if 'This work is only available to registered users of the Archive.' 
in data \ - or "The password or user name you entered doesn't match our records" in data: - return True - else: - return False - - def performLogin(self, url, data): - - params = {} - if self.password: - params['user[login]'] = self.username - params['user[password]'] = self.password - else: - params['user[login]'] = self.getConfig("username") - params['user[password]'] = self.getConfig("password") - params['user[remember_me]'] = '1' - params['commit'] = 'Log in' - params['utf8'] = u'\x2713' # utf8 *is* required now. hex code works better than actual character for some reason. u'✓' - - # authenticity_token now comes from a completely separate json call. - token_json = json.loads(self.get_request('https://' + self.getSiteDomain() + "/token_dispenser.json")) - params['authenticity_token'] = token_json['token'] - - loginUrl = 'https://' + self.getSiteDomain() + '/users/login' - logger.info("Will now login to URL (%s) as (%s)" % (loginUrl, - params['user[login]'])) - - d = self.post_request(loginUrl, params) - - if 'href="/users/logout"' not in d : - logger.info("Failed to login to URL %s as %s" % (loginUrl, - params['user[login]'])) - raise exceptions.FailedToLogin(url,params['user[login]']) - return False - else: - return True - - ## Getting the chapter list and the meta data, plus 'is adult' checking. - def extractChapterUrlsAndMetadata(self): - - if self.is_adult or self.getConfig("is_adult"): - self.addurl = "?view_adult=true" - else: - self.addurl="" - - metaurl = self.url+self.addurl - url = self.url+'/navigate'+self.addurl - logger.info("url: "+url) - logger.info("metaurl: "+metaurl) - - data = self.get_request(url) - if '
Error 503 - Service unavailable
' in data: - # note that it's not *actually* a 503 code... - raise exceptions.FailedToDownload('Site is currently unavailable.') - - meta = self.get_request(metaurl) - - if 'This work is part of an ongoing challenge and will be revealed soon!' in meta: - raise exceptions.FailedToDownload('Site says: "This work is part of an ongoing challenge and will be revealed soon!"') - - if '
<p class="caution">' in meta:
-            logger.debug('<p class="caution">
found. If download fails, check for changed "is adult" string') - # This work could have adult content. If you continue, you have agreed that you are willing to see such content. - # This work could have adult content. If you proceed you have agreed that you are willing to see such content. - if re.search(r"This work could have adult content. If you (continue,|proceed) you have agreed that you are willing to see such content.", meta): - if self.addurl: - ## "?view_adult=true" doesn't work on base story - ## URL anymore, which means we have to - metasoup = self.make_soup(meta) - a = metasoup.find('a',text='Proceed') - metaurl = 'https://'+self.host+a['href'] - meta = self.get_request(metaurl) - else: - raise exceptions.AdultCheckRequired(self.url) - - if "Sorry, we couldn't find the work you were looking for." in data: - raise exceptions.StoryDoesNotExist(self.url) - - # need to log in for this one, or always_login. - if self.needToLoginCheck(data) or \ - ( self.getConfig("always_login") and 'href="/users/logout"' not in data ): - self.performLogin(url,data) - data = self.get_request(url,usecache=False) - meta = self.get_request(metaurl,usecache=False) - - ## duplicate of check above for login-required stories that - ## are also hidden. - if 'This work is part of an ongoing challenge and will be revealed soon!' in meta: - raise exceptions.FailedToDownload('Site says: "This work is part of an ongoing challenge and will be revealed soon!"') - - soup = self.make_soup(data) - for tag in soup.findAll('div',id='admin-banner'): - tag.extract() - metasoup = self.make_soup(meta) - for tag in metasoup.findAll('div',id='admin-banner'): - tag.extract() - - - ## Title - a = soup.find('a', href=re.compile(r"/works/\d+$")) - self.story.setMetadata('title',stripHTML(a)) - - if self.getConfig("always_login"): - # deliberately using always_login instead of checking for - # actual login so we don't have a case where these show up - # for a user only when they get user-restricted stories. - - # is bookmarked if has update /bookmarks/ form -- - # create bookmark form uses different url - self.story.setMetadata('bookmarked', - None != metasoup.find('form',action=re.compile(r'^/bookmarks/'))) - if metasoup.find('input',id='bookmark_tag_string').has_attr('value'): - self.story.extendList('bookmarktags', - metasoup.find('input',id='bookmark_tag_string')['value'].split(', ')) - self.story.setMetadata('bookmarkprivate', - metasoup.find('input',id='bookmark_private').has_attr('checked')) - self.story.setMetadata('bookmarkrec', - metasoup.find('input',id='bookmark_rec').has_attr('checked')) - - # detect subscription by unsub button - # logger.debug(metasoup.find('input',value="Unsubscribe")) - self.story.setMetadata('subscribed', - metasoup.find('input',value="Unsubscribe") is not None) - # detect 'marked for later' by 'Mark as Read' button - # logger.debug(metasoup.find('a', href=re.compile(r'/mark_as_read$'))) - self.story.setMetadata('markedforlater', - metasoup.find('a', href=re.compile(r'/mark_as_read$')) is not None) - - self.story.setMetadata('bookmarksummary', - stripHTML(metasoup.find('textarea',id='bookmark_notes'))) - - if metasoup.find('img',alt='(Restricted)'): - self.story.setMetadata('restricted','Restricted') - - # Find authorid and URL from... author url. - alist = soup.findAll('a', href=re.compile(r"/users/\w+/pseuds/.+")) - if len(alist) < 1: # ao3 allows for author 'Anonymous' with no author link. 
- self.story.setMetadata('author','Anonymous') - self.story.setMetadata('authorUrl','https://' + self.getSiteDomain() + '/') - self.story.setMetadata('authorId','0') - else: - for a in alist: - self.story.addToList('authorId',a['href'].split('/')[-1]) - self.story.addToList('authorUrl','https://'+self.host+a['href']) - self.story.addToList('author',a.text) - - byline = metasoup.find('h3',{'class':'byline'}) - if byline: - self.story.setMetadata('byline',stripHTML(byline)) - - # byline: - #
<h3 class="byline heading">Hope Roy [archived by ssa_archivist]</h3>
- # stripped:"Hope Roy [archived by ssa_archivist]" - m = re.match(r'(?P.*) \[archived by ?(?P.*)\]',stripHTML(byline)) - if( m and - len(alist) == 1 and - self.getConfig('use_archived_author') ): - self.story.setMetadata('author',m.group('author')) - - newestChapter = None - self.newestChapterNum = None # save for comparing during update. - # Scan all chapters to find the oldest and newest, on AO3 it's - # possible for authors to insert new chapters out-of-order or - # change the dates of earlier ones by editing them--That WILL - # break epub update. - # Find the chapters: - chapters=soup.findAll('a', href=re.compile(r'/works/'+self.story.getMetadata('storyId')+r"/chapters/\d+$")) - self.story.setMetadata('numChapters',len(chapters)) - logger.debug("numChapters: (%s)"%self.story.getMetadata('numChapters')) - if len(chapters)==1: - self.add_chapter(self.story.getMetadata('title'),'https://'+self.host+chapters[0]['href']) - else: - for index, chapter in enumerate(chapters): - # strip just in case there's tags, like in chapter titles. - # (2013-09-21) - date = stripHTML(chapter.findNext('span',class_='datetime'))[1:-1] - chapterDate = makeDate(date,self.dateformat) - self.add_chapter(chapter,'https://'+self.host+chapter['href'], - {'date':chapterDate.strftime(self.getConfig("datechapter_format",self.getConfig("datePublished_format","%Y-%m-%d")))}) - if newestChapter == None or chapterDate > newestChapter: - newestChapter = chapterDate - self.newestChapterNum = index - - a = metasoup.find('blockquote',{'class':'userstuff'}) - if a != None: - a.name='div' # Change blockquote to div. - self.setDescription(url,a) - #self.story.setMetadata('description',a.text) - - a = metasoup.find('dd',{'class':"rating tags"}) - if a != None: - self.story.setMetadata('rating',stripHTML(a.text)) - - d = metasoup.find('dd',{'class':"language"}) - if d != None: - self.story.setMetadata('language',stripHTML(d.text)) - - a = metasoup.find('dd',{'class':"fandom tags"}) - if a != None: - fandoms = a.findAll('a',{'class':"tag"}) - for fandom in fandoms: - self.story.addToList('fandoms',fandom.string) - - a = metasoup.find('dd',{'class':"warning tags"}) - if a != None: - warnings = a.findAll('a',{'class':"tag"}) - for warning in warnings: - self.story.addToList('warnings',warning.string) - - a = metasoup.find('dd',{'class':"freeform tags"}) - if a != None: - genres = a.findAll('a',{'class':"tag"}) - for genre in genres: - self.story.addToList('freeformtags',genre.string) - - a = metasoup.find('dd',{'class':"category tags"}) - if a != None: - genres = a.findAll('a',{'class':"tag"}) - for genre in genres: - if genre != "Gen": - self.story.addToList('ao3categories',genre.string) - - a = metasoup.find('dd',{'class':"character tags"}) - if a != None: - chars = a.findAll('a',{'class':"tag"}) - for char in chars: - self.story.addToList('characters',char.string) - - a = metasoup.find('dd',{'class':"relationship tags"}) - if a != None: - ships = a.findAll('a',{'class':"tag"}) - for ship in ships: - self.story.addToList('ships',ship.string) - - a = metasoup.find('dd',{'class':"collections"}) - if a != None: - collections = a.findAll('a') - for collection in collections: - self.story.addToList('collections',collection.string) - - stats = metasoup.find('dl',{'class':'stats'}) - dt = stats.findAll('dt') - dd = stats.findAll('dd') - for x in range(0,len(dt)): - label = dt[x].text - value = dd[x].text - - if 'Words:' in label: - self.story.setMetadata('numWords', value) - - if 'Comments:' in label: - self.story.setMetadata('comments', 
value) - - if 'Kudos:' in label: - self.story.setMetadata('kudos', value) - - if 'Hits:' in label: - self.story.setMetadata('hits', value) - - if 'Bookmarks:' in label: - self.story.setMetadata('bookmarks', value) - - if 'Chapters:' in label: - self.story.setMetadata('chapterslashtotal', value) - if value.split('/')[0] == value.split('/')[1]: - self.story.setMetadata('status', 'Completed') - else: - self.story.setMetadata('status', 'In-Progress') - - - if 'Published' in label: - self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat)) - self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat)) - - if 'Updated' in label: - self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat)) - - if 'Completed' in label: - self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat)) - - - # Find Series name from series URL. - ddseries = metasoup.find('dd',{'class':"series"}) - - if ddseries: - for i, a in enumerate(ddseries.findAll('a', href=re.compile(r"/series/\d+"))): - series_name = stripHTML(a) - series_url = 'https://'+self.host+a['href'] - series_index = int(stripHTML(a.previousSibling).replace(', ','').split(' ')[1]) # "Part # of" or ", Part #" - self.story.setMetadata('series%02d'%i,"%s [%s]"%(series_name,series_index)) - self.story.setMetadata('series%02dUrl'%i,series_url) - if i == 0: - self.setSeries(series_name, series_index) - self.story.setMetadata('seriesUrl',series_url) - - if self.getConfig('use_workskin',False): - divmain = metasoup.find('div',{'id':'main'}) - if divmain: - # we sort of assume ddmain exists because otherwise, there would be no fic - workskin = divmain.style - if workskin: - workskin = unicode(workskin.contents[0]) # 'contents' returns a list with (here) a single element - # some transformation to adjust which classes are affected - workskin = workskin.replace('#workskin', '.userstuff') - self.story.extra_css = "/*start of AO3 workskin*/\n" + workskin + "\n/* end of AO3 workskin*/\n" - - def hookForUpdates(self,chaptercount): - if self.newestChapterNum and self.oldchapters and len(self.oldchapters) > self.newestChapterNum: - logger.info("Existing epub has %s chapters\nNewest chapter is %s. Discarding old chapters from there on."%(len(self.oldchapters), self.newestChapterNum+1)) - self.oldchapters = self.oldchapters[:self.newestChapterNum] - return len(self.oldchapters) - - ## Normalize chapter URLs because a) site has changed from http to - ## https and b) in case of title change. That way updates to - ## existing stories don't re-download all chapters. - def normalize_chapterurl(self,url): - url = re.sub(r"https?://("+self.getSiteDomain()+r"/works/\d+/chapters/\d+)(\?view_adult=true)?$", - r"https://\1",url) - return url - - # grab the text for an individual chapter. - def getChapterTextNum(self, url, index): - ## FYI: Chapter urls used to include ?view_adult=true in each - ## one. With cookiejar being passed now, that's not - ## necessary. However, there is a corner case with plugin--If - ## a user-required story is attempted after gathering metadata - ## for one that needs adult, but not user AND the user doesn't - ## enter a valid user, the is_adult cookie from before can be - ## lost. - logger.debug('Getting chapter text for: %s index: %s' % (url,index)) - - save_chapter_soup = self.make_soup('
<div class="story"></div>
') - ## use the div because the full soup will also have . - ## need save_chapter_soup for .new_tag() - save_chapter=save_chapter_soup.find('div') - - whole_dl_soup = chapter_dl_soup = None - - if self.use_full_work_soup and self.getConfig("use_view_full_work",True) and self.getConfig("always_reload_first_chapter"): - self.use_full_work_soup = False - logger.warning("OVERRIDE: AO3 - use_view_full_work not used when always_reload_first_chapter:true") - - if self.use_full_work_soup and self.getConfig("use_view_full_work",True) and self.num_chapters() > 1: - logger.debug("USE view_full_work") - ## Assumed view_adult=true was cookied during metadata - if not self.full_work_soup: - self.full_work_soup = self.make_soup(self.get_request(self.url+"?view_full_work=true"+self.addurl.replace('?','&'))) - ## AO3 has had several cases now where chapter numbers - ## are missing, breaking the link between - ##
<div id="chapter-##">
and Chapter ##. - ## But they should all still be there and in the right - ## order, so array[index] - self.full_work_chapters = self.full_work_soup.find_all('div',{'id':re.compile(r'chapter-\d+')}) - if len(self.full_work_chapters) != self.num_chapters(): - ## sanity check just in case. - self.use_full_work_soup = False - self.full_work_soup = None - logger.warning("chapter count in view_full_work(%s) disagrees with num of chapters(%s)--ending use_view_full_work"%(len(self.full_work_chapters),self.num_chapters())) - whole_dl_soup = self.full_work_soup - - if whole_dl_soup: - chapter_dl_soup = self.full_work_chapters[index] - else: - whole_dl_soup = chapter_dl_soup = self.make_soup(self.get_request(url+self.addurl)) - if None == chapter_dl_soup: - raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url) - - exclude_notes=self.getConfigList('exclude_notes') - - def append_tag(elem,tag,string=None,classes=None): - '''bs4 requires tags be added separately.''' - new_tag = save_chapter_soup.new_tag(tag) - if string: - new_tag.string=string - if classes: - new_tag['class']=[classes] - elem.append(new_tag) - return new_tag - - ## These are the over-all work's 'Notes at the beginning'. - ## They only appear on the first chapter in individual chapter - ## pages and before chapter-1 div. Appending removes - ## headnotes from whole_dl_soup, so be sure to only do it on - ## the first chapter. - head_notes_div = append_tag(save_chapter,'div',classes="fff_chapter_notes fff_head_notes") - if 'authorheadnotes' not in exclude_notes and index == 0: - headnotes = whole_dl_soup.find('div', {'class' : "preface group"}).find('div', {'class' : "notes module"}) - if headnotes != None: - ## Also include ul class='associations'. - ulassoc = headnotes.find('ul', {'class' : "associations"}) - headnotes = headnotes.find('blockquote', {'class' : "userstuff"}) - if headnotes != None or ulassoc != None: - append_tag(head_notes_div,'b',"Author's Note:") - if ulassoc != None: - # fix relative links--all examples so far have been. 
- for alink in ulassoc.find_all('a'): - if 'http' not in alink['href']: - alink['href']='https://' + self.getSiteDomain() + alink['href'] - head_notes_div.append(ulassoc) - if headnotes != None: - head_notes_div.append(headnotes) - - ## Can appear on every chapter - if 'chaptersummary' not in exclude_notes: - chapsumm = chapter_dl_soup.find('div', {'id' : "summary"}) - if chapsumm != None: - chapsumm = chapsumm.find('blockquote') - append_tag(head_notes_div,'b',"Summary for the Chapter:") - head_notes_div.append(chapsumm) - - ## Can appear on every chapter - if 'chapterheadnotes' not in exclude_notes: - chapnotes = chapter_dl_soup.find('div', {'id' : "notes"}) - if chapnotes != None: - chapnotes = chapnotes.find('blockquote') - if chapnotes != None: - append_tag(head_notes_div,'b',"Notes for the Chapter:") - head_notes_div.append(chapnotes) - - text = chapter_dl_soup.find('div', {'class' : "userstuff module"}) - chtext = text.find('h3', {'class' : "landmark heading"}) - if chtext: - chtext.extract() - save_chapter.append(text) - - foot_notes_div = append_tag(save_chapter,'div',classes="fff_chapter_notes fff_foot_notes") - ## Can appear on every chapter - if 'chapterfootnotes' not in exclude_notes: - chapfoot = chapter_dl_soup.find('div', {'class' : "end notes module"}) - if chapfoot != None: - chapfoot = chapfoot.find('blockquote') - append_tag(foot_notes_div,'b',"Notes for the Chapter:") - foot_notes_div.append(chapfoot) - - skip_on_update_tags = [] - ## These are the over-all work's 'Notes at the end'. - ## They only appear on the last chapter in individual chapter - ## pages and after chapter-# div. Appending removes - ## headnotes from whole_dl_soup, so be sure to only do it on - ## the last chapter. - if 'authorfootnotes' not in exclude_notes and index+1 == self.num_chapters(): - footnotes = whole_dl_soup.find('div', {'id' : "work_endnotes"}) - if footnotes != None: - footnotes = footnotes.find('blockquote') - if footnotes: - b = append_tag(foot_notes_div,'b',"Author's Note:") - skip_on_update_tags.append(b) - skip_on_update_tags.append(footnotes) - foot_notes_div.append(footnotes) - - ## It looks like 'Inspired by' links now all appear in the ul - ## class=associations tag in authorheadnotes. This code is - ## left in case I'm wrong and there are still stories with div - ## id=children inspired links at the end. - if 'inspiredlinks' not in exclude_notes and index+1 == self.num_chapters(): - inspiredlinks = whole_dl_soup.find('div', {'id' : "children"}) - if inspiredlinks != None: - if inspiredlinks: - inspiredlinks.find('h3').name='b' # don't want a big h3 at the end. - # fix relative links--all examples so far have been. - for alink in inspiredlinks.find_all('a'): - if 'http' not in alink['href']: - alink['href']='https://' + self.getSiteDomain() + alink['href'] - skip_on_update_tags.append(inspiredlinks) - foot_notes_div.append(inspiredlinks) - - ## remove empty head/food notes div(s) - if not head_notes_div.find(True): - head_notes_div.extract() - if not foot_notes_div.find(True): - foot_notes_div.extract() - ## AO3 story end notes end up in the 'last' chapter, but if - ## updated, then there's a new 'last' chapter. This option - ## applies the 'skip_on_ffdl_update' class to those tags which - ## means they will be removed during epub reading for update. - ## Results: only the last chapter will have end notes. - ## Side-effect: An 'Update Always' that doesn't add a new - ## lasts chapter will remove the end notes. 
- if self.getConfig("remove_authorfootnotes_on_update"): - for skip_tag in skip_on_update_tags: - if skip_tag.has_attr('class'): - skip_tag['class'].append('skip_on_ffdl_update') - else: - skip_tag['class']=['skip_on_ffdl_update'] - # logger.debug(skip_tag) - - return self.utf8FromSoup(url,save_chapter) - - def before_get_urls_from_page(self,url,normalize): - # special stuff to log into archiveofourown.org, if possible. - # Unlike most that show the links to 'adult' stories, but protect - # them, AO3 doesn't even show them if not logged in. Only works - # with saved user/pass--not going to prompt for list. - if self.getConfig("username"): - if self.getConfig("is_adult"): - if '?' in url: - addurl = "&view_adult=true" - else: - addurl = "?view_adult=true" - else: - addurl="" - # just to get an authenticity_token. - data = self.get_request(url+addurl) - # login the session. - self.performLogin(url,data) - # get the list page with logged in session. - - def get_series_from_page(self,url,data,normalize=False): - ''' - This method is to make it easier for adapters to detect a - series URL, pick out the series metadata and list of storyUrls - to return without needing to override get_urls_from_page - entirely. - ''' - - if 'This work is only available to registered users of the Archive' in data: - raise exceptions.FailedToDownload("This work is only available to registered users of the Archive -- set username/password in personal.ini under [archiveofourown.org]") - ## easiest way to get all the weird URL possibilities and stay - ## up to date with future changes. - m = re.match(self.getSiteURLPattern().replace('/works/','/series/'),url) - if m: - soup = self.make_soup(data) - retval = {} - retval['urllist']=[ 'https://'+self.host+a['href'] for a in soup.select('h4.heading a:first-child') ] - retval['name']=stripHTML(soup.select_one("h2.heading")) - desc=soup.select_one("div.wrapper dd blockquote.userstuff") - if desc: - desc.name='div' # change blockquote to div to match stories. - retval['desc']=desc - stats=stripHTML(soup.select_one("dl.series dl.stats")) - if 'Complete:Yes' in stats: - retval['status'] = "Completed" - elif 'Complete:No' in stats: - retval['status'] = "In-Progress" - return retval - ## return dict with at least {'urllist':['storyUrl','storyUrl',...]} - ## optionally 'name' and 'desc'? - return {} diff --git a/fanficfare/adapters/adapter_squidgeworldorg.py b/fanficfare/adapters/adapter_squidgeworldorg.py index 4540c3c6..6d393f5d 100644 --- a/fanficfare/adapters/adapter_squidgeworldorg.py +++ b/fanficfare/adapters/adapter_squidgeworldorg.py @@ -19,15 +19,15 @@ from __future__ import absolute_import import logging logger = logging.getLogger(__name__) -from .adapter_archiveofourownorg import ArchiveOfOurOwnOrgAdapter +from .base_otw_adapter import BaseOTWAdapter def getClass(): return SquidgeWorldOrgAdapter -class SquidgeWorldOrgAdapter(ArchiveOfOurOwnOrgAdapter): +class SquidgeWorldOrgAdapter(BaseOTWAdapter): def __init__(self, config, url): - ArchiveOfOurOwnOrgAdapter.__init__(self, config, url) + BaseOTWAdapter.__init__(self, config, url) # Each adapter needs to have a unique site abbreviation. self.story.setMetadata('siteabbrev','sqwo') @@ -36,12 +36,3 @@ class SquidgeWorldOrgAdapter(ArchiveOfOurOwnOrgAdapter): def getSiteDomain(): # The site domain. Does have www here, if it uses it. return 'squidgeworld.org' - - @classmethod - def getAcceptDomains(cls): - # adapter_archiveofourownorg overrides getAcceptDomains, go - # back to the base_adapter version. 
- # XXX - if/when a third OTW site comes along, refactor code to - # a base_otw_adapter - # https://github.com/otwcode/otwarchive/ - return super(ArchiveOfOurOwnOrgAdapter,cls).getAcceptDomains() diff --git a/fanficfare/adapters/base_otw_adapter.py b/fanficfare/adapters/base_otw_adapter.py new file mode 100644 index 00000000..4d2f2f83 --- /dev/null +++ b/fanficfare/adapters/base_otw_adapter.py @@ -0,0 +1,633 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import absolute_import +import logging +logger = logging.getLogger(__name__) +import re +import json + +from ..six import text_type as unicode +from ..htmlcleanup import stripHTML +from .. import exceptions as exceptions + +from .base_adapter import BaseSiteAdapter, makeDate + +class BaseOTWAdapter(BaseSiteAdapter): + + def __init__(self, config, url): + BaseSiteAdapter.__init__(self, config, url) + + self.username = "NoneGiven" # if left empty, site doesn't return any message at all. + self.password = "" + self.is_adult=False + self.addurl = "" + + self.full_work_soup = None + self.full_work_chapters = None + self.use_full_work_soup = True + + # get storyId from url--url validation guarantees query is only sid=1234 + self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2]) + + # get storyId from url--url validation guarantees query correct + m = re.match(self.getSiteURLPattern(),url) + if m: + self.story.setMetadata('storyId',m.group('id')) + + # normalized story URL. + self._setURL('https://' + self.getSiteDomain() + '/works/'+self.story.getMetadata('storyId')) + else: + raise exceptions.InvalidStoryURL(url, + self.getSiteDomain(), + self.getSiteExampleURLs()) + + # Each adapter needs to have a unique site abbreviation. + self.story.setMetadata('siteabbrev','ao3') + + # The date format will vary from site to site. + # http://docs.python.org/library/datetime.html#strftime-strptime-behavior + self.dateformat = "%Y-%b-%d" + + @classmethod + def getConfigSections(cls): + "Only needs to be overriden if has additional ini sections." + return ['base_otw',cls.getConfigSection()] + + @classmethod + def getSiteExampleURLs(cls): + return "https://"+cls.getSiteDomain()+"/works/123456 https://"+cls.getSiteDomain()+"/collections/Some_Archive/works/123456 https://"+cls.getSiteDomain()+"/works/123456/chapters/78901" + + def getSiteURLPattern(self): + # https://archiveofourown.org/collections/Smallville_Slash_Archive/works/159770 + # Discard leading zeros from story ID numbers--AO3 doesn't use them in it's own chapter URLs. 
+ # logger.debug(r"https?://" + r"|".join([x.replace('.','\.') for x in self.getAcceptDomains()]) + r"(/collections/[^/]+)?/works/0*(?P\d+)") + return r"https?://(" + r"|".join([x.replace('.',r'\.') for x in self.getAcceptDomains()]) + r")(/collections/[^/]+)?/works/0*(?P\d+)" + + @classmethod + def get_section_url(cls,url): + ## minimal URL used for section names in INI and reject list + ## for comparison + # logger.debug("pre--url:%s"%url) + ## https://archiveofourown.org/works/19334905/chapters/71697933 + # http://archiveofourown.org/works/34686793/chapters/89043733 + url = re.sub(r'^https?://(.*/works/\d+).*$',r'https://\1',url) + # logger.debug("post-url:%s"%url) + return url + + ## Login + def needToLoginCheck(self, data): + if 'This work is only available to registered users of the Archive.' in data \ + or "The password or user name you entered doesn't match our records" in data: + return True + else: + return False + + def performLogin(self, url, data): + + params = {} + if self.password: + params['user[login]'] = self.username + params['user[password]'] = self.password + else: + params['user[login]'] = self.getConfig("username") + params['user[password]'] = self.getConfig("password") + params['user[remember_me]'] = '1' + params['commit'] = 'Log in' + params['utf8'] = u'\x2713' # utf8 *is* required now. hex code works better than actual character for some reason. u'✓' + + # authenticity_token now comes from a completely separate json call. + token_json = json.loads(self.get_request('https://' + self.getSiteDomain() + "/token_dispenser.json")) + params['authenticity_token'] = token_json['token'] + + loginUrl = 'https://' + self.getSiteDomain() + '/users/login' + logger.info("Will now login to URL (%s) as (%s)" % (loginUrl, + params['user[login]'])) + + d = self.post_request(loginUrl, params) + + if 'href="/users/logout"' not in d : + logger.info("Failed to login to URL %s as %s" % (loginUrl, + params['user[login]'])) + raise exceptions.FailedToLogin(url,params['user[login]']) + return False + else: + return True + + ## Getting the chapter list and the meta data, plus 'is adult' checking. + def extractChapterUrlsAndMetadata(self): + + if self.is_adult or self.getConfig("is_adult"): + self.addurl = "?view_adult=true" + else: + self.addurl="" + + metaurl = self.url+self.addurl + url = self.url+'/navigate'+self.addurl + logger.info("url: "+url) + logger.info("metaurl: "+metaurl) + + data = self.get_request(url) + if '
Error 503 - Service unavailable
' in data: + # note that it's not *actually* a 503 code... + raise exceptions.FailedToDownload('Site is currently unavailable.') + + meta = self.get_request(metaurl) + + if 'This work is part of an ongoing challenge and will be revealed soon!' in meta: + raise exceptions.FailedToDownload('Site says: "This work is part of an ongoing challenge and will be revealed soon!"') + + if '
<p class="caution">' in meta:
+            logger.debug('<p class="caution">
found. If download fails, check for changed "is adult" string') + # This work could have adult content. If you continue, you have agreed that you are willing to see such content. + # This work could have adult content. If you proceed you have agreed that you are willing to see such content. + if re.search(r"This work could have adult content. If you (continue,|proceed) you have agreed that you are willing to see such content.", meta): + if self.addurl: + ## "?view_adult=true" doesn't work on base story + ## URL anymore, which means we have to + metasoup = self.make_soup(meta) + a = metasoup.find('a',text='Proceed') + metaurl = 'https://'+self.host+a['href'] + meta = self.get_request(metaurl) + else: + raise exceptions.AdultCheckRequired(self.url) + + if "Sorry, we couldn't find the work you were looking for." in data: + raise exceptions.StoryDoesNotExist(self.url) + + # need to log in for this one, or always_login. + if self.needToLoginCheck(data) or \ + ( self.getConfig("always_login") and 'href="/users/logout"' not in data ): + self.performLogin(url,data) + data = self.get_request(url,usecache=False) + meta = self.get_request(metaurl,usecache=False) + + ## duplicate of check above for login-required stories that + ## are also hidden. + if 'This work is part of an ongoing challenge and will be revealed soon!' in meta: + raise exceptions.FailedToDownload('Site says: "This work is part of an ongoing challenge and will be revealed soon!"') + + soup = self.make_soup(data) + for tag in soup.findAll('div',id='admin-banner'): + tag.extract() + metasoup = self.make_soup(meta) + for tag in metasoup.findAll('div',id='admin-banner'): + tag.extract() + + + ## Title + a = soup.find('a', href=re.compile(r"/works/\d+$")) + self.story.setMetadata('title',stripHTML(a)) + + if self.getConfig("always_login"): + # deliberately using always_login instead of checking for + # actual login so we don't have a case where these show up + # for a user only when they get user-restricted stories. + + # is bookmarked if has update /bookmarks/ form -- + # create bookmark form uses different url + self.story.setMetadata('bookmarked', + None != metasoup.find('form',action=re.compile(r'^/bookmarks/'))) + if metasoup.find('input',id='bookmark_tag_string').has_attr('value'): + self.story.extendList('bookmarktags', + metasoup.find('input',id='bookmark_tag_string')['value'].split(', ')) + self.story.setMetadata('bookmarkprivate', + metasoup.find('input',id='bookmark_private').has_attr('checked')) + self.story.setMetadata('bookmarkrec', + metasoup.find('input',id='bookmark_rec').has_attr('checked')) + + # detect subscription by unsub button + # logger.debug(metasoup.find('input',value="Unsubscribe")) + self.story.setMetadata('subscribed', + metasoup.find('input',value="Unsubscribe") is not None) + # detect 'marked for later' by 'Mark as Read' button + # logger.debug(metasoup.find('a', href=re.compile(r'/mark_as_read$'))) + self.story.setMetadata('markedforlater', + metasoup.find('a', href=re.compile(r'/mark_as_read$')) is not None) + + self.story.setMetadata('bookmarksummary', + stripHTML(metasoup.find('textarea',id='bookmark_notes'))) + + if metasoup.find('img',alt='(Restricted)'): + self.story.setMetadata('restricted','Restricted') + + # Find authorid and URL from... author url. + alist = soup.findAll('a', href=re.compile(r"/users/\w+/pseuds/.+")) + if len(alist) < 1: # ao3 allows for author 'Anonymous' with no author link. 
+ self.story.setMetadata('author','Anonymous') + self.story.setMetadata('authorUrl','https://' + self.getSiteDomain() + '/') + self.story.setMetadata('authorId','0') + else: + for a in alist: + self.story.addToList('authorId',a['href'].split('/')[-1]) + self.story.addToList('authorUrl','https://'+self.host+a['href']) + self.story.addToList('author',a.text) + + byline = metasoup.find('h3',{'class':'byline'}) + if byline: + self.story.setMetadata('byline',stripHTML(byline)) + + # byline: + #
<h3 class="byline heading">Hope Roy [archived by ssa_archivist]</h3>
+ # stripped:"Hope Roy [archived by ssa_archivist]" + m = re.match(r'(?P.*) \[archived by ?(?P.*)\]',stripHTML(byline)) + if( m and + len(alist) == 1 and + self.getConfig('use_archived_author') ): + self.story.setMetadata('author',m.group('author')) + + newestChapter = None + self.newestChapterNum = None # save for comparing during update. + # Scan all chapters to find the oldest and newest, on AO3 it's + # possible for authors to insert new chapters out-of-order or + # change the dates of earlier ones by editing them--That WILL + # break epub update. + # Find the chapters: + chapters=soup.findAll('a', href=re.compile(r'/works/'+self.story.getMetadata('storyId')+r"/chapters/\d+$")) + self.story.setMetadata('numChapters',len(chapters)) + logger.debug("numChapters: (%s)"%self.story.getMetadata('numChapters')) + if len(chapters)==1: + self.add_chapter(self.story.getMetadata('title'),'https://'+self.host+chapters[0]['href']) + else: + for index, chapter in enumerate(chapters): + # strip just in case there's tags, like in chapter titles. + # (2013-09-21) + date = stripHTML(chapter.findNext('span',class_='datetime'))[1:-1] + chapterDate = makeDate(date,self.dateformat) + self.add_chapter(chapter,'https://'+self.host+chapter['href'], + {'date':chapterDate.strftime(self.getConfig("datechapter_format",self.getConfig("datePublished_format","%Y-%m-%d")))}) + if newestChapter == None or chapterDate > newestChapter: + newestChapter = chapterDate + self.newestChapterNum = index + + a = metasoup.find('blockquote',{'class':'userstuff'}) + if a != None: + a.name='div' # Change blockquote to div. + self.setDescription(url,a) + #self.story.setMetadata('description',a.text) + + a = metasoup.find('dd',{'class':"rating tags"}) + if a != None: + self.story.setMetadata('rating',stripHTML(a.text)) + + d = metasoup.find('dd',{'class':"language"}) + if d != None: + self.story.setMetadata('language',stripHTML(d.text)) + + a = metasoup.find('dd',{'class':"fandom tags"}) + if a != None: + fandoms = a.findAll('a',{'class':"tag"}) + for fandom in fandoms: + self.story.addToList('fandoms',fandom.string) + + a = metasoup.find('dd',{'class':"warning tags"}) + if a != None: + warnings = a.findAll('a',{'class':"tag"}) + for warning in warnings: + self.story.addToList('warnings',warning.string) + + a = metasoup.find('dd',{'class':"freeform tags"}) + if a != None: + genres = a.findAll('a',{'class':"tag"}) + for genre in genres: + self.story.addToList('freeformtags',genre.string) + + a = metasoup.find('dd',{'class':"category tags"}) + if a != None: + genres = a.findAll('a',{'class':"tag"}) + for genre in genres: + if genre != "Gen": + self.story.addToList('ao3categories',genre.string) + + a = metasoup.find('dd',{'class':"character tags"}) + if a != None: + chars = a.findAll('a',{'class':"tag"}) + for char in chars: + self.story.addToList('characters',char.string) + + a = metasoup.find('dd',{'class':"relationship tags"}) + if a != None: + ships = a.findAll('a',{'class':"tag"}) + for ship in ships: + self.story.addToList('ships',ship.string) + + a = metasoup.find('dd',{'class':"collections"}) + if a != None: + collections = a.findAll('a') + for collection in collections: + self.story.addToList('collections',collection.string) + + stats = metasoup.find('dl',{'class':'stats'}) + dt = stats.findAll('dt') + dd = stats.findAll('dd') + for x in range(0,len(dt)): + label = dt[x].text + value = dd[x].text + + if 'Words:' in label: + self.story.setMetadata('numWords', value) + + if 'Comments:' in label: + self.story.setMetadata('comments', 
value) + + if 'Kudos:' in label: + self.story.setMetadata('kudos', value) + + if 'Hits:' in label: + self.story.setMetadata('hits', value) + + if 'Bookmarks:' in label: + self.story.setMetadata('bookmarks', value) + + if 'Chapters:' in label: + self.story.setMetadata('chapterslashtotal', value) + if value.split('/')[0] == value.split('/')[1]: + self.story.setMetadata('status', 'Completed') + else: + self.story.setMetadata('status', 'In-Progress') + + + if 'Published' in label: + self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat)) + self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat)) + + if 'Updated' in label: + self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat)) + + if 'Completed' in label: + self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat)) + + + # Find Series name from series URL. + ddseries = metasoup.find('dd',{'class':"series"}) + + if ddseries: + for i, a in enumerate(ddseries.findAll('a', href=re.compile(r"/series/\d+"))): + series_name = stripHTML(a) + series_url = 'https://'+self.host+a['href'] + series_index = int(stripHTML(a.previousSibling).replace(', ','').split(' ')[1]) # "Part # of" or ", Part #" + self.story.setMetadata('series%02d'%i,"%s [%s]"%(series_name,series_index)) + self.story.setMetadata('series%02dUrl'%i,series_url) + if i == 0: + self.setSeries(series_name, series_index) + self.story.setMetadata('seriesUrl',series_url) + + if self.getConfig('use_workskin',False): + divmain = metasoup.find('div',{'id':'main'}) + if divmain: + # we sort of assume ddmain exists because otherwise, there would be no fic + workskin = divmain.style + if workskin: + workskin = unicode(workskin.contents[0]) # 'contents' returns a list with (here) a single element + # some transformation to adjust which classes are affected + workskin = workskin.replace('#workskin', '.userstuff') + self.story.extra_css = "/*start of AO3 workskin*/\n" + workskin + "\n/* end of AO3 workskin*/\n" + + def hookForUpdates(self,chaptercount): + if self.newestChapterNum and self.oldchapters and len(self.oldchapters) > self.newestChapterNum: + logger.info("Existing epub has %s chapters\nNewest chapter is %s. Discarding old chapters from there on."%(len(self.oldchapters), self.newestChapterNum+1)) + self.oldchapters = self.oldchapters[:self.newestChapterNum] + return len(self.oldchapters) + + ## Normalize chapter URLs because a) site has changed from http to + ## https and b) in case of title change. That way updates to + ## existing stories don't re-download all chapters. + def normalize_chapterurl(self,url): + url = re.sub(r"https?://("+self.getSiteDomain()+r"/works/\d+/chapters/\d+)(\?view_adult=true)?$", + r"https://\1",url) + return url + + # grab the text for an individual chapter. + def getChapterTextNum(self, url, index): + ## FYI: Chapter urls used to include ?view_adult=true in each + ## one. With cookiejar being passed now, that's not + ## necessary. However, there is a corner case with plugin--If + ## a user-required story is attempted after gathering metadata + ## for one that needs adult, but not user AND the user doesn't + ## enter a valid user, the is_adult cookie from before can be + ## lost. + logger.debug('Getting chapter text for: %s index: %s' % (url,index)) + + save_chapter_soup = self.make_soup('
<div class="story"></div>
') + ## use the div because the full soup will also have . + ## need save_chapter_soup for .new_tag() + save_chapter=save_chapter_soup.find('div') + + whole_dl_soup = chapter_dl_soup = None + + if self.use_full_work_soup and self.getConfig("use_view_full_work",True) and self.getConfig("always_reload_first_chapter"): + self.use_full_work_soup = False + logger.warning("OVERRIDE: AO3 - use_view_full_work not used when always_reload_first_chapter:true") + + if self.use_full_work_soup and self.getConfig("use_view_full_work",True) and self.num_chapters() > 1: + logger.debug("USE view_full_work") + ## Assumed view_adult=true was cookied during metadata + if not self.full_work_soup: + self.full_work_soup = self.make_soup(self.get_request(self.url+"?view_full_work=true"+self.addurl.replace('?','&'))) + ## AO3 has had several cases now where chapter numbers + ## are missing, breaking the link between + ##
<div id="chapter-##">
and Chapter ##. + ## But they should all still be there and in the right + ## order, so array[index] + self.full_work_chapters = self.full_work_soup.find_all('div',{'id':re.compile(r'chapter-\d+')}) + if len(self.full_work_chapters) != self.num_chapters(): + ## sanity check just in case. + self.use_full_work_soup = False + self.full_work_soup = None + logger.warning("chapter count in view_full_work(%s) disagrees with num of chapters(%s)--ending use_view_full_work"%(len(self.full_work_chapters),self.num_chapters())) + whole_dl_soup = self.full_work_soup + + if whole_dl_soup: + chapter_dl_soup = self.full_work_chapters[index] + else: + whole_dl_soup = chapter_dl_soup = self.make_soup(self.get_request(url+self.addurl)) + if None == chapter_dl_soup: + raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url) + + exclude_notes=self.getConfigList('exclude_notes') + + def append_tag(elem,tag,string=None,classes=None): + '''bs4 requires tags be added separately.''' + new_tag = save_chapter_soup.new_tag(tag) + if string: + new_tag.string=string + if classes: + new_tag['class']=[classes] + elem.append(new_tag) + return new_tag + + ## These are the over-all work's 'Notes at the beginning'. + ## They only appear on the first chapter in individual chapter + ## pages and before chapter-1 div. Appending removes + ## headnotes from whole_dl_soup, so be sure to only do it on + ## the first chapter. + head_notes_div = append_tag(save_chapter,'div',classes="fff_chapter_notes fff_head_notes") + if 'authorheadnotes' not in exclude_notes and index == 0: + headnotes = whole_dl_soup.find('div', {'class' : "preface group"}).find('div', {'class' : "notes module"}) + if headnotes != None: + ## Also include ul class='associations'. + ulassoc = headnotes.find('ul', {'class' : "associations"}) + headnotes = headnotes.find('blockquote', {'class' : "userstuff"}) + if headnotes != None or ulassoc != None: + append_tag(head_notes_div,'b',"Author's Note:") + if ulassoc != None: + # fix relative links--all examples so far have been. 
+ for alink in ulassoc.find_all('a'): + if 'http' not in alink['href']: + alink['href']='https://' + self.getSiteDomain() + alink['href'] + head_notes_div.append(ulassoc) + if headnotes != None: + head_notes_div.append(headnotes) + + ## Can appear on every chapter + if 'chaptersummary' not in exclude_notes: + chapsumm = chapter_dl_soup.find('div', {'id' : "summary"}) + if chapsumm != None: + chapsumm = chapsumm.find('blockquote') + append_tag(head_notes_div,'b',"Summary for the Chapter:") + head_notes_div.append(chapsumm) + + ## Can appear on every chapter + if 'chapterheadnotes' not in exclude_notes: + chapnotes = chapter_dl_soup.find('div', {'id' : "notes"}) + if chapnotes != None: + chapnotes = chapnotes.find('blockquote') + if chapnotes != None: + append_tag(head_notes_div,'b',"Notes for the Chapter:") + head_notes_div.append(chapnotes) + + text = chapter_dl_soup.find('div', {'class' : "userstuff module"}) + chtext = text.find('h3', {'class' : "landmark heading"}) + if chtext: + chtext.extract() + save_chapter.append(text) + + foot_notes_div = append_tag(save_chapter,'div',classes="fff_chapter_notes fff_foot_notes") + ## Can appear on every chapter + if 'chapterfootnotes' not in exclude_notes: + chapfoot = chapter_dl_soup.find('div', {'class' : "end notes module"}) + if chapfoot != None: + chapfoot = chapfoot.find('blockquote') + append_tag(foot_notes_div,'b',"Notes for the Chapter:") + foot_notes_div.append(chapfoot) + + skip_on_update_tags = [] + ## These are the over-all work's 'Notes at the end'. + ## They only appear on the last chapter in individual chapter + ## pages and after chapter-# div. Appending removes + ## headnotes from whole_dl_soup, so be sure to only do it on + ## the last chapter. + if 'authorfootnotes' not in exclude_notes and index+1 == self.num_chapters(): + footnotes = whole_dl_soup.find('div', {'id' : "work_endnotes"}) + if footnotes != None: + footnotes = footnotes.find('blockquote') + if footnotes: + b = append_tag(foot_notes_div,'b',"Author's Note:") + skip_on_update_tags.append(b) + skip_on_update_tags.append(footnotes) + foot_notes_div.append(footnotes) + + ## It looks like 'Inspired by' links now all appear in the ul + ## class=associations tag in authorheadnotes. This code is + ## left in case I'm wrong and there are still stories with div + ## id=children inspired links at the end. + if 'inspiredlinks' not in exclude_notes and index+1 == self.num_chapters(): + inspiredlinks = whole_dl_soup.find('div', {'id' : "children"}) + if inspiredlinks != None: + if inspiredlinks: + inspiredlinks.find('h3').name='b' # don't want a big h3 at the end. + # fix relative links--all examples so far have been. + for alink in inspiredlinks.find_all('a'): + if 'http' not in alink['href']: + alink['href']='https://' + self.getSiteDomain() + alink['href'] + skip_on_update_tags.append(inspiredlinks) + foot_notes_div.append(inspiredlinks) + + ## remove empty head/food notes div(s) + if not head_notes_div.find(True): + head_notes_div.extract() + if not foot_notes_div.find(True): + foot_notes_div.extract() + ## AO3 story end notes end up in the 'last' chapter, but if + ## updated, then there's a new 'last' chapter. This option + ## applies the 'skip_on_ffdl_update' class to those tags which + ## means they will be removed during epub reading for update. + ## Results: only the last chapter will have end notes. + ## Side-effect: An 'Update Always' that doesn't add a new + ## lasts chapter will remove the end notes. 
+ if self.getConfig("remove_authorfootnotes_on_update"): + for skip_tag in skip_on_update_tags: + if skip_tag.has_attr('class'): + skip_tag['class'].append('skip_on_ffdl_update') + else: + skip_tag['class']=['skip_on_ffdl_update'] + # logger.debug(skip_tag) + + return self.utf8FromSoup(url,save_chapter) + + def before_get_urls_from_page(self,url,normalize): + # special stuff to log into archiveofourown.org, if possible. + # Unlike most that show the links to 'adult' stories, but protect + # them, AO3 doesn't even show them if not logged in. Only works + # with saved user/pass--not going to prompt for list. + if self.getConfig("username"): + if self.getConfig("is_adult"): + if '?' in url: + addurl = "&view_adult=true" + else: + addurl = "?view_adult=true" + else: + addurl="" + # just to get an authenticity_token. + data = self.get_request(url+addurl) + # login the session. + self.performLogin(url,data) + # get the list page with logged in session. + + def get_series_from_page(self,url,data,normalize=False): + ''' + This method is to make it easier for adapters to detect a + series URL, pick out the series metadata and list of storyUrls + to return without needing to override get_urls_from_page + entirely. + ''' + + if 'This work is only available to registered users of the Archive' in data: + raise exceptions.FailedToDownload("This work is only available to registered users of the Archive -- set username/password in personal.ini under [%s]"%self.getSiteDomain()) + ## easiest way to get all the weird URL possibilities and stay + ## up to date with future changes. + m = re.match(self.getSiteURLPattern().replace('/works/','/series/'),url) + if m: + soup = self.make_soup(data) + retval = {} + retval['urllist']=[ 'https://'+self.host+a['href'] for a in soup.select('h4.heading a:first-child') ] + retval['name']=stripHTML(soup.select_one("h2.heading")) + desc=soup.select_one("div.wrapper dd blockquote.userstuff") + if desc: + desc.name='div' # change blockquote to div to match stories. + retval['desc']=desc + stats=stripHTML(soup.select_one("dl.series dl.stats")) + if 'Complete:Yes' in stats: + retval['status'] = "Completed" + elif 'Complete:No' in stats: + retval['status'] = "In-Progress" + return retval + ## return dict with at least {'urllist':['storyUrl','storyUrl',...]} + ## optionally 'name' and 'desc'? + return {}
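
For reference, a minimal sketch of what an adapter for a third OTW-based archive would now look like under this refactor: only a unique site abbreviation and the site domain are needed, with login, the adult-content check, metadata, chapter text, and series handling all inherited from BaseOTWAdapter. The domain and class names below are placeholders for illustration, not a real supported site:

# -*- coding: utf-8 -*-
# Hypothetical example -- 'otherotwarchive.org' is a placeholder domain,
# not a real site; it only illustrates subclassing BaseOTWAdapter.
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)

from .base_otw_adapter import BaseOTWAdapter

def getClass():
    return OtherOTWArchiveOrgAdapter

class OtherOTWArchiveOrgAdapter(BaseOTWAdapter):

    def __init__(self, config, url):
        BaseOTWAdapter.__init__(self, config, url)

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','ootw')

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain. Does have www here, if it uses it.
        return 'otherotwarchive.org'

Because BaseOTWAdapter.getConfigSections() returns ['base_otw', cls.getConfigSection()], ini settings shared by all OTW archives can live in a single [base_otw] section of personal.ini, with per-site sections such as [archiveofourown.org] or [squidgeworld.org] overriding as needed.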