# -*- coding: utf-8 -*-

# Copyright 2023 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import re
import json

from ..six import text_type as unicode
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions

from .base_adapter import BaseSiteAdapter, makeDate

class BaseOTWAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        self.addurl = ""

        self.full_work_soup = None
        self.full_work_chapters = None
        self.use_full_work_soup = True

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])

        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(),url)
        if m:
            self.story.setMetadata('storyId',m.group('id'))

            # normalized story URL.
            self._setURL('https://' + self.getSiteDomain() + '/works/'+self.story.getMetadata('storyId'))
        else:
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','ao3')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y-%b-%d"

    @classmethod
    def getConfigSections(cls):
        "Only needs to be overridden if has additional ini sections."
        return ['base_otw',cls.getConfigSection()]

    @classmethod
    def getSiteExampleURLs(cls):
        return "https://"+cls.getSiteDomain()+"/works/123456 https://"+cls.getSiteDomain()+"/collections/Some_Archive/works/123456 https://"+cls.getSiteDomain()+"/works/123456/chapters/78901"

    def getSiteURLPattern(self):
        # https://archiveofourown.org/collections/Smallville_Slash_Archive/works/159770
        # Discard leading zeros from story ID numbers--AO3 doesn't use
        # them in its own chapter URLs.
        # logger.debug(r"https?://" + r"|".join([x.replace('.','\.') for x in self.getAcceptDomains()]) + r"(/collections/[^/]+)?/works/0*(?P<id>\d+)")
        return r"https?://(" + r"|".join([x.replace('.',r'\.') for x in self.getAcceptDomains()]) + r")(/collections/[^/]+)?/works/0*(?P<id>\d+)"

    @classmethod
    def get_section_url(cls,url):
        ## minimal URL used for section names in INI and reject list
        ## for comparison
        # logger.debug("pre--url:%s"%url)
        ## https://archiveofourown.org/works/19334905/chapters/71697933
        # http://archiveofourown.org/works/34686793/chapters/89043733
        url = re.sub(r'^https?://(.*/works/\d+).*$',r'https://\1',url)
        # logger.debug("post-url:%s"%url)
        return url

    ## Login
    def needToLoginCheck(self, data):
        if 'This work is only available to registered users of the Archive.' in data \
                or "The password or user name you entered doesn't match our records" in data \
                or "Sorry, you don't have permission to access the page you were trying to reach. Please log in." in data:
            return True
        else:
            return False

    def performLogin(self, url, data):
        params = {}

        if self.password:
            params['user[login]'] = self.username
            params['user[password]'] = self.password
        else:
            params['user[login]'] = self.getConfig("username")
            params['user[password]'] = self.getConfig("password")
        params['user[remember_me]'] = '1'
        params['commit'] = 'Log in'
        params['utf8'] = u'\u2713' # utf8 *is* required now.  hex code works better than actual character for some reason. u'✓'

        # authenticity_token now comes from a completely separate json call.
        token_json = json.loads(self.get_request('https://' + self.getSiteDomain() + "/token_dispenser.json"))
        params['authenticity_token'] = token_json['token']

        loginUrl = 'https://' + self.getSiteDomain() + '/users/login'
        logger.info("Will now login to URL (%s) as (%s)" % (loginUrl, params['user[login]']))

        d = self.post_request(loginUrl, params)

        if 'href="/users/logout"' not in d:
            logger.info("Failed to login to URL %s as %s" % (loginUrl, params['user[login]']))
            raise exceptions.FailedToLogin(url,params['user[login]'])
            return False
        else:
            return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):

        if self.is_adult or self.getConfig("is_adult"):
            self.addurl = "?view_adult=true"
        else:
            self.addurl=""

        metaurl = self.url+self.addurl
        url = self.url+'/navigate'+self.addurl
        logger.info("url: "+url)
        logger.info("metaurl: "+metaurl)

        data = self.get_request(url)

        if 'Error 503 - Service unavailable' in data:
            # note that it's not *actually* a 503 code...
            raise exceptions.FailedToDownload('Site is currently unavailable.')

        if 'This site is in beta. Things may break or crash without notice.' in data:
            raise exceptions.FailedToDownload('Page failed to load, reported "This site is in beta".')

        meta = self.get_request(metaurl)

        if 'This work is part of an ongoing challenge and will be revealed soon!' in meta:
            raise exceptions.FailedToDownload('Site says: "This work is part of an ongoing challenge and will be revealed soon!"')

        if '<p class="caution">' in meta:
            logger.debug('<p class="caution"> found.  If download fails, check for changed "is adult" string')

        # This work could have adult content. If you continue, you have agreed that you are willing to see such content.
        # This work could have adult content. If you proceed you have agreed that you are willing to see such content.
        if re.search(r"This work could have adult content. If you (continue,|proceed) you have agreed that you are willing to see such content.", meta):
            if self.addurl:
                ## "?view_adult=true" doesn't work on the base story
                ## URL anymore, which means we have to follow the
                ## 'Proceed' link instead.
                metasoup = self.make_soup(meta)
                a = metasoup.find('a',string='Proceed')
                metaurl = 'https://'+self.host+a['href']
                meta = self.get_request(metaurl)
            else:
                raise exceptions.AdultCheckRequired(self.url)

        if "Sorry, we couldn't find the work you were looking for." in data:
            raise exceptions.StoryDoesNotExist(self.url)

        # need to log in for this one, or always_login.
        if self.needToLoginCheck(data) or \
                ( self.getConfig("always_login") and 'href="/users/logout"' not in data ):
            self.performLogin(url,data)
            data = self.get_request(url,usecache=False)
            meta = self.get_request(metaurl,usecache=False)

        ## duplicate of check above for login-required stories that
        ## are also hidden.
        if 'This work is part of an ongoing challenge and will be revealed soon!' in meta:
            raise exceptions.FailedToDownload('Site says: "This work is part of an ongoing challenge and will be revealed soon!"')

        if "Sorry, you don't have permission to access the page you were trying to reach." in data:
            raise exceptions.FailedToDownload('Site says: "Sorry, you don\'t have permission to access the page you were trying to reach."')

        soup = self.make_soup(data)
        for tag in soup.findAll('div',id='admin-banner'):
            tag.extract()
        metasoup = self.make_soup(meta)
        for tag in metasoup.findAll('div',id='admin-banner'):
            tag.extract()

        ## Title
        a = soup.find('a', href=re.compile(r"/works/\d+$"))
        self.story.setMetadata('title',stripHTML(a))

        if self.getConfig("always_login"):
            # deliberately using always_login instead of checking for
            # actual login so we don't have a case where these show up
            # for a user only when they get user-restricted stories.

            # is bookmarked if has update /bookmarks/ form --
            # create bookmark form uses different url
            self.story.setMetadata('bookmarked',
                                   None != metasoup.find('form',action=re.compile(r'^/bookmarks/')))
            if metasoup.find('input',id='bookmark_tag_string').has_attr('value'):
                self.story.extendList('bookmarktags',
                                      metasoup.find('input',id='bookmark_tag_string')['value'].split(', '))
            self.story.setMetadata('bookmarkprivate',
                                   metasoup.find('input',id='bookmark_private').has_attr('checked'))
            self.story.setMetadata('bookmarkrec',
                                   metasoup.find('input',id='bookmark_rec').has_attr('checked'))
            # detect subscription by unsub button
            # logger.debug(metasoup.find('input',value="Unsubscribe"))
            self.story.setMetadata('subscribed',
                                   metasoup.find('input',value="Unsubscribe") is not None)
            # detect 'marked for later' by 'Mark as Read' button
            # logger.debug(metasoup.find('a', href=re.compile(r'/mark_as_read$')))
            self.story.setMetadata('markedforlater',
                                   metasoup.find('a', href=re.compile(r'/mark_as_read$')) is not None)
            self.story.setMetadata('bookmarksummary',
                                   stripHTML(metasoup.find('textarea',id='bookmark_notes')))

        if metasoup.find('img',alt='(Restricted)'):
            self.story.setMetadata('restricted','Restricted')

        # Find authorid and URL from... author url.
        alist = soup.findAll('a', href=re.compile(r"/users/\w+/pseuds/.+"))
        if len(alist) < 1: # ao3 allows for author 'Anonymous' with no author link.
            self.story.setMetadata('author','Anonymous')
            self.story.setMetadata('authorUrl','https://' + self.getSiteDomain() + '/')
            self.story.setMetadata('authorId','0')
        else:
            for a in alist:
                self.story.addToList('authorId',a['href'].split('/')[-1])
                self.story.addToList('authorUrl','https://'+self.host+a['href'])
                self.story.addToList('author',a.text)

        byline = metasoup.find('h3',{'class':'byline'})
        if byline:
            self.story.setMetadata('byline',stripHTML(byline))
            # byline:
            # stripped:"Hope Roy [archived by ssa_archivist]"
            m = re.match(r'(?P<author>.*) \[archived by ?(?P<archivist>.*)\]',stripHTML(byline))
            if( m and len(alist) == 1 and self.getConfig('use_archived_author') ):
                self.story.setMetadata('author',m.group('author'))

        newestChapter = None
        self.newestChapterNum = None # save for comparing during update.
        # Scan all chapters to find the oldest and newest; on AO3 it's
        # possible for authors to insert new chapters out-of-order or
        # change the dates of earlier ones by editing them--That WILL
        # break epub update.
        # Find the chapters:
        chapters=soup.findAll('a', href=re.compile(r'/works/'+self.story.getMetadata('storyId')+r"/chapters/\d+$"))
        self.story.setMetadata('numChapters',len(chapters))
        logger.debug("numChapters: (%s)"%self.story.getMetadata('numChapters'))
        if len(chapters)==1:
            self.add_chapter(self.story.getMetadata('title'),'https://'+self.host+chapters[0]['href'])
        else:
            for index, chapter in enumerate(chapters):
                # strip just in case there's tags, like <i> in chapter titles.
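                # The /navigate page renders each chapter date wrapped
                # in parens inside a datetime <span>; the [1:-1] below
                # strips those parens, leaving e.g.: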
                # (2013-09-21)
                date = stripHTML(chapter.findNext('span',class_='datetime'))[1:-1]
                chapterDate = makeDate(date,self.dateformat)
                self.add_chapter(chapter,'https://'+self.host+chapter['href'],
                                 {'date':chapterDate.strftime(self.getConfig("datechapter_format",self.getConfig("datePublished_format","%Y-%m-%d")))})
                if newestChapter == None or chapterDate > newestChapter:
                    newestChapter = chapterDate
                    self.newestChapterNum = index

        a = metasoup.find('blockquote',{'class':'userstuff'})
        if a != None:
            a.name='div' # Change blockquote to div.
            self.setDescription(url,a)
            #self.story.setMetadata('description',a.text)

        a = metasoup.find('dd',{'class':"rating tags"})
        if a != None:
            self.story.setMetadata('rating',stripHTML(a.text))

        d = metasoup.find('dd',{'class':"language"})
        if d != None:
            self.story.setMetadata('language',stripHTML(d.text))

        a = metasoup.find('dd',{'class':"fandom tags"})
        if a != None:
            fandoms = a.findAll('a',{'class':"tag"})
            for fandom in fandoms:
                self.story.addToList('fandoms',fandom.string)

        a = metasoup.find('dd',{'class':"warning tags"})
        if a != None:
            warnings = a.findAll('a',{'class':"tag"})
            for warning in warnings:
                self.story.addToList('warnings',warning.string)

        a = metasoup.find('dd',{'class':"freeform tags"})
        if a != None:
            genres = a.findAll('a',{'class':"tag"})
            for genre in genres:
                self.story.addToList('freeformtags',genre.string)

        a = metasoup.find('dd',{'class':"category tags"})
        if a != None:
            genres = a.findAll('a',{'class':"tag"})
            for genre in genres:
                if genre.string != "Gen":
                    self.story.addToList('ao3categories',genre.string)

        a = metasoup.find('dd',{'class':"character tags"})
        if a != None:
            chars = a.findAll('a',{'class':"tag"})
            for char in chars:
                self.story.addToList('characters',char.string)

        a = metasoup.find('dd',{'class':"relationship tags"})
        if a != None:
            ships = a.findAll('a',{'class':"tag"})
            for ship in ships:
                self.story.addToList('ships',ship.string)

        a = metasoup.find('dd',{'class':"collections"})
        if a != None:
            collections = a.findAll('a')
            for collection in collections:
                self.story.addToList('collections',collection.string)

        stats = metasoup.find('dl',{'class':'stats'})
        dt = stats.findAll('dt')
        dd = stats.findAll('dd')
        for x in range(0,len(dt)):
            label = dt[x].text
            value = dd[x].text
            if 'Words:' in label:
                self.story.setMetadata('numWords', value)
            if 'Comments:' in label:
                self.story.setMetadata('comments', value)
            if 'Kudos:' in label:
                self.story.setMetadata('kudos', value)
            if 'Hits:' in label:
                self.story.setMetadata('hits', value)
            if 'Bookmarks:' in label:
                self.story.setMetadata('bookmarks', value)
            if 'Chapters:' in label:
                self.story.setMetadata('chapterslashtotal', value)
                if value.split('/')[0] == value.split('/')[1]:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')
            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
            if 'Completed' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Find Series name from series URL.
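        # The series block on the work page looks roughly like this
        # (illustrative sketch only--attributes simplified):
        #   <dd class="series">
        #     <span class="series">Part 2 of <a href="/series/99999">Some Series</a></span>
        #   </dd>
        # so a.previousSibling below is the "Part # of" text.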
        ddseries = metasoup.find('dd',{'class':"series"})
        if ddseries:
            for i, a in enumerate(ddseries.findAll('a', href=re.compile(r"/series/\d+"))):
                series_name = stripHTML(a)
                series_url = 'https://'+self.host+a['href']
                series_index = int(stripHTML(a.previousSibling).replace(', ','').split(' ')[1]) # "Part # of" or ", Part #"
                self.story.setMetadata('series%02d'%i,"%s [%s]"%(series_name,series_index))
                self.story.setMetadata('series%02dUrl'%i,series_url)
                if i == 0:
                    self.setSeries(series_name, series_index)
                    self.story.setMetadata('seriesUrl',series_url)

        if self.getConfig('use_workskin',False):
            divmain = metasoup.find('div',{'id':'main'})
            if divmain:
                # we sort of assume divmain exists because otherwise, there would be no fic
                workskin = divmain.style
                if workskin:
                    workskin = unicode(workskin.contents[0]) # 'contents' returns a list with (here) a single element
                    # some transformation to adjust which classes are affected
                    workskin = workskin.replace('#workskin', '.userstuff')
                    self.story.extra_css = "/*start of AO3 workskin*/\n" + workskin + "\n/* end of AO3 workskin*/\n"

    def hookForUpdates(self,chaptercount):
        if self.newestChapterNum and self.oldchapters and len(self.oldchapters) > self.newestChapterNum:
            logger.info("Existing epub has %s chapters\nNewest chapter is %s. Discarding old chapters from there on."%(len(self.oldchapters), self.newestChapterNum+1))
            self.oldchapters = self.oldchapters[:self.newestChapterNum]
        return len(self.oldchapters)

    ## Normalize chapter URLs because a) site has changed from http to
    ## https and b) in case of title change.  That way updates to
    ## existing stories don't re-download all chapters.
    def normalize_chapterurl(self,url):
        url = re.sub(r"https?://("+self.getSiteDomain()+r"/works/\d+/chapters/\d+)(\?view_adult=true)?$",
                     r"https://\1",url)
        return url

    # grab the text for an individual chapter.
    def getChapterTextNum(self, url, index):
        ## FYI: Chapter urls used to include ?view_adult=true in each
        ## one.  With cookiejar being passed now, that's not
        ## necessary.  However, there is a corner case with plugin--If
        ## a user-required story is attempted after gathering metadata
        ## for one that needs adult, but not user AND the user doesn't
        ## enter a valid user, the is_adult cookie from before can be
        ## lost.
        logger.debug('Getting chapter text for: %s index: %s' % (url,index))

        save_chapter_soup = self.make_soup('<div></div>')
        ## use the div because the full soup will also have <html><body>.
        ## need save_chapter_soup for .new_tag()
        save_chapter = save_chapter_soup.find('div')

        whole_dl_soup = chapter_dl_soup = None

        if self.use_full_work_soup and self.getConfig("use_view_full_work",True) and self.getConfig("always_reload_first_chapter"):
            self.use_full_work_soup = False
            logger.warning("OVERRIDE: AO3 - use_view_full_work not used when always_reload_first_chapter:true")

        if self.use_full_work_soup and self.getConfig("use_view_full_work",True) and self.num_chapters() > 1:
            logger.debug("USE view_full_work")
            ## Assumed view_adult=true was cookied during metadata
            if not self.full_work_soup:
                self.full_work_soup = self.make_soup(self.get_request(self.url+"?view_full_work=true"+self.addurl.replace('?','&')))
                ## AO3 has had several cases now where chapter numbers
                ## are missing, breaking the link between the <select>
                ## chapter list and the Chapter <div>s.  But they
                ## should all still be there and in the right order,
                ## so array[index]
                self.full_work_chapters = self.full_work_soup.find_all('div',{'id':re.compile(r'chapter-\d+')})
                if len(self.full_work_chapters) != self.num_chapters():
                    ## sanity check just in case.
                    self.use_full_work_soup = False
                    self.full_work_soup = None
                    logger.warning("chapter count in view_full_work(%s) disagrees with num of chapters(%s)--ending use_view_full_work"%(len(self.full_work_chapters),self.num_chapters()))
            whole_dl_soup = self.full_work_soup

        if whole_dl_soup:
            chapter_dl_soup = self.full_work_chapters[index]
        else:
            whole_dl_soup = chapter_dl_soup = self.make_soup(self.get_request(url+self.addurl))

        if None == chapter_dl_soup:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        exclude_notes = self.getConfigList('exclude_notes')

        def append_tag(elem,tag,string=None,classes=None):
            '''bs4 requires tags be added separately.'''
            new_tag = save_chapter_soup.new_tag(tag)
            if string:
                new_tag.string = string
            if classes:
                new_tag['class'] = [classes]
            elem.append(new_tag)
            return new_tag

        ## These are the over-all work's 'Notes at the beginning'.
        ## They only appear on the first chapter in individual chapter
        ## pages and before the chapter-1 div.  Appending removes
        ## headnotes from whole_dl_soup, so be sure to only do it on
        ## the first chapter.
        head_notes_div = append_tag(save_chapter,'div',classes="fff_chapter_notes fff_head_notes")
        if 'authorheadnotes' not in exclude_notes and index == 0:
            headnotes = whole_dl_soup.find('div', {'class' : "preface group"}).find('div', {'class' : "notes module"})
            if headnotes != None:
                ## Also include ul class='associations'.
                ulassoc = headnotes.find('ul', {'class' : "associations"})
                headnotes = headnotes.find('blockquote', {'class' : "userstuff"})
                if headnotes != None or ulassoc != None:
                    append_tag(head_notes_div,'b',"Author's Note:")
                    if ulassoc != None:
                        # fix relative links--all examples so far have been.
                        for alink in ulassoc.find_all('a'):
                            if 'http' not in alink['href']:
                                alink['href'] = 'https://' + self.getSiteDomain() + alink['href']
                        head_notes_div.append(ulassoc)
                    if headnotes != None:
                        head_notes_div.append(headnotes)

        ## Can appear on every chapter
        if 'chaptersummary' not in exclude_notes:
            chapsumm = chapter_dl_soup.find('div', {'id' : "summary"})
            if chapsumm != None:
                chapsumm = chapsumm.find('blockquote')
                append_tag(head_notes_div,'b',"Summary for the Chapter:")
                head_notes_div.append(chapsumm)

        ## Can appear on every chapter
        if 'chapterheadnotes' not in exclude_notes:
            chapnotes = chapter_dl_soup.find('div', {'id' : "notes"})
            if chapnotes != None:
                chapnotes = chapnotes.find('blockquote')
                if chapnotes != None:
                    append_tag(head_notes_div,'b',"Notes for the Chapter:")
                    head_notes_div.append(chapnotes)

        text = chapter_dl_soup.find('div', {'class' : "userstuff module"})
        chtext = text.find('h3', {'class' : "landmark heading"})
        if chtext:
            chtext.extract()
        save_chapter.append(text)

        foot_notes_div = append_tag(save_chapter,'div',classes="fff_chapter_notes fff_foot_notes")
        ## Can appear on every chapter
        if 'chapterfootnotes' not in exclude_notes:
            chapfoot = chapter_dl_soup.find('div', {'class' : "end notes module"})
            if chapfoot != None:
                chapfoot = chapfoot.find('blockquote')
                append_tag(foot_notes_div,'b',"Notes for the Chapter:")
                foot_notes_div.append(chapfoot)

        skip_on_update_tags = []
        ## These are the over-all work's 'Notes at the end'.
        ## They only appear on the last chapter in individual chapter
        ## pages and after the chapter-# div.  Appending removes
        ## footnotes from whole_dl_soup, so be sure to only do it on
        ## the last chapter.
        if 'authorfootnotes' not in exclude_notes and index+1 == self.num_chapters():
            footnotes = whole_dl_soup.find('div', {'id' : "work_endnotes"})
            if footnotes != None:
                footnotes = footnotes.find('blockquote')
                if footnotes:
                    b = append_tag(foot_notes_div,'b',"Author's Note:")
                    skip_on_update_tags.append(b)
                    skip_on_update_tags.append(footnotes)
                    foot_notes_div.append(footnotes)

        ## It looks like 'Inspired by' links now all appear in the ul
        ## class=associations tag in authorheadnotes.  This code is
        ## left in case I'm wrong and there are still stories with div
        ## id=children inspired links at the end.
        if 'inspiredlinks' not in exclude_notes and index+1 == self.num_chapters():
            inspiredlinks = whole_dl_soup.find('div', {'id' : "children"})
            if inspiredlinks != None:
                inspiredlinks.find('h3').name = 'b' # don't want a big h3 at the end.
                # fix relative links--all examples so far have been.
                for alink in inspiredlinks.find_all('a'):
                    if 'http' not in alink['href']:
                        alink['href'] = 'https://' + self.getSiteDomain() + alink['href']
                skip_on_update_tags.append(inspiredlinks)
                foot_notes_div.append(inspiredlinks)

        ## remove empty head/foot notes div(s)
        if not head_notes_div.find(True):
            head_notes_div.extract()
        if not foot_notes_div.find(True):
            foot_notes_div.extract()

        ## AO3 story end notes end up in the 'last' chapter, but if
        ## updated, then there's a new 'last' chapter.  This option
        ## applies the 'skip_on_ffdl_update' class to those tags which
        ## means they will be removed during epub reading for update.
        ## Results: only the last chapter will have end notes.
        ## Side-effect: An 'Update Always' that doesn't add a new
        ## last chapter will remove the end notes.
        if self.getConfig("remove_authorfootnotes_on_update"):
            for skip_tag in skip_on_update_tags:
                if skip_tag.has_attr('class'):
                    skip_tag['class'].append('skip_on_ffdl_update')
                else:
                    skip_tag['class'] = ['skip_on_ffdl_update']
                # logger.debug(skip_tag)

        return self.utf8FromSoup(url,save_chapter)

    def before_get_urls_from_page(self,url,normalize):
        # special stuff to log into archiveofourown.org, if possible.
        # Unlike most sites that show the links to 'adult' stories, but
        # protect them, AO3 doesn't even show them if not logged in.
        # Only works with saved user/pass--not going to prompt for list.
        if self.getConfig("username"):
            if self.getConfig("is_adult"):
                if '?' in url:
                    addurl = "&view_adult=true"
                else:
                    addurl = "?view_adult=true"
            else:
                addurl = ""
            # just to get an authenticity_token.
            data = self.get_request(url+addurl)
            # login the session.
            self.performLogin(url,data)
            # get the list page with logged in session.

    def get_series_from_page(self,url,data,normalize=False):
        '''
        This method is to make it easier for adapters to detect a
        series URL, pick out the series metadata and list of storyUrls
        to return without needing to override get_urls_from_page
        entirely.
        '''
        if 'This work is only available to registered users of the Archive' in data:
            raise exceptions.FailedToDownload("This work is only available to registered users of the Archive -- set username/password in personal.ini under [%s]"%self.getSiteDomain())

        ## easiest way to get all the weird URL possibilities and stay
        ## up to date with future changes.
        m = re.match(self.getSiteURLPattern().replace('/works/','/series/'),url)
        if m:
            seriesid = m.group('id')
            soup = self.make_soup(data)
            retval = {}
            urllist = []

            ## series pages can do '...' and not have a link for all
            ## pages.  Also, the page for the given URL, eg
            ## /series/99999?page=3, will *not* be in the list.
            pageparam = '?page='
            pageas = soup.select("ol.pagination li a")
            if pageas:
                pageurls = [ a['href'] for a in pageas ]
                if pageparam in url:
                    pageurls.append(url)
                # logger.debug(pageurls)
                ## need to find largest page number, including url,
                ## but excluding any further params
                maxpagenum = max([ int(re.sub(r'^.*'+re.escape(pageparam)+r'(\d+).*$','\\1',x)) for x in pageurls ])
                # logger.debug(maxpagenum)
                for j in range(1,maxpagenum+1):
                    pageurl = 'https://' + self.getSiteDomain() + '/series/' + seriesid + pageparam + unicode(j)
                    # logger.debug(pageurl)
                    pagesoup = self.make_soup(self.get_request(pageurl))
                    urllist.extend([ 'https://'+self.host+a['href'] for a in pagesoup.select('h4.heading a:first-child') ])
                # logger.debug(urllist)
            if urllist:
                retval['urllist']=urllist
            else:
                retval['urllist']=[ 'https://'+self.host+a['href'] for a in soup.select('h4.heading a:first-child') ]

            retval['name']=stripHTML(soup.select_one("h2.heading"))
            desc=soup.select_one("div.wrapper dd blockquote.userstuff")
            if desc:
                desc.name='div' # change blockquote to div to match stories.
                retval['desc']=desc
            stats=stripHTML(soup.select_one("dl.series dl.stats"))
            if 'Complete:Yes' in stats:
                retval['status'] = "Completed"
            elif 'Complete:No' in stats:
                retval['status'] = "In-Progress"
            return retval

        ## return dict with at least {'urllist':['storyUrl','storyUrl',...]}
        ## optionally 'name' and 'desc'?
        return {}
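## ------------------------------------------------------------------
## Illustrative sketch only (not a registered adapter): the minimum a
## concrete site adapter needs to supply on top of BaseOTWAdapter.
## The class name and domains below are placeholders; the real AO3
## adapter lives in its own module.
##
## class ExampleOTWAdapter(BaseOTWAdapter):
##     @staticmethod
##     def getSiteDomain():
##         # placeholder domain for illustration
##         return 'otwarchive.example.org'
##
##     @classmethod
##     def getAcceptDomains(cls):
##         # every domain getSiteURLPattern() should accept
##         return ['otwarchive.example.org','www.otwarchive.example.org']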