diff --git a/fanficdownloader/adapters/adapter_fanfictionnet.py b/fanficdownloader/adapters/adapter_fanfictionnet.py index a536c7ab..47996c7d 100644 --- a/fanficdownloader/adapters/adapter_fanfictionnet.py +++ b/fanficdownloader/adapters/adapter_fanfictionnet.py @@ -23,9 +23,14 @@ import time from .. import BeautifulSoup as bs from .. import exceptions as exceptions +from ..htmlcleanup import stripHTML from base_adapter import BaseSiteAdapter, makeDate +ffnetgenres=["Adventure", "Angst", "Crime", "Drama", "Family", "Fantasy", "Friendship", "General", + "Horror", "Humor", "Hurt-Comfort", "Mystery", "Parody", "Poetry", "Romance", "Sci-Fi", + "Spiritual", "Supernatural", "Suspense", "Tragedy", "Western"] + class FanFictionNetSiteAdapter(BaseSiteAdapter): def __init__(self, config, url): @@ -198,61 +203,45 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): ## Pull some additional data from html. Find Rating and look around it. a = soup.find('a', href='http://www.fictionratings.com/') - self.story.setMetadata('rating',a.string) + rating = a.string + if 'Fiction' in rating: # if rating has 'Fiction ', strip that out for consistency with past. + rating = rating[8:] + + self.story.setMetadata('rating',rating) - # used below to get correct characters. - metatext = a.findNext(text=re.compile(r' - Reviews:')) - if metatext == None: # indicates there's no Reviews, look for id: instead. - metatext = a.findNext(text=re.compile(r' - id:')) - - m = re.match(r" - (?P[^ ]+)",metatext) - if m.group('lang') != None: - self.story.setMetadata('language',m.group('lang')) - # after Rating, the same bit of text containing id:123456 contains # Complete--if completed. 
- if 'Complete' in soup.find(text=re.compile(r'id:'+self.story.getMetadata('storyId'))): + gui_table1i = soup.find(id="gui_table1i") + metatext = stripHTML(gui_table1i.find('div', {'style':'color:gray;'})).replace('Hurt/Comfort','Hurt-Comfort') + metalist = metatext.split(" - ") + #print("metatext:(%s)"%metalist) + + # rating is obtained above more robustly. + if metalist[0].startswith('Rated:'): + metalist=metalist[1:] + + # next is assumed to be language. + self.story.setMetadata('language',metalist[0]) + metalist=metalist[1:] + + # next might be genre. + genrelist = metalist[0].split('/') # Hurt/Comfort already changed above. + goodgenres=True + for g in genrelist: + if g not in ffnetgenres: + goodgenres=False + if goodgenres: + self.story.extendList('genre',genrelist) + metalist=metalist[1:] + + # next might be characters, otherwise Reviews, Updated or Published + if not ( metalist[0].startswith('Reviews') or metalist[0].startswith('Updated') or metalist[0].startswith('Published') ): + self.story.extendList('characters',metalist[0].split(' & ')) + + if 'Status: Complete' in metatext: self.story.setMetadata('status', 'Completed') else: self.story.setMetadata('status', 'In-Progress') - - - # Parse genre(s) from - # - # (fp) - # - # - # - # Chapter 1 of a SpongeBob SquarePants - Romance/Humor fanfiction with characters SpongeBob. Bob Esponja tiene un admirador secreto ¿quien será?. update existing id:1684 - m = re.match(r"^(?:Chapter \d+ of a|A) (?:.*?) ?(?:- (?P.*?) )?(?:crossover )?(?:fan)?fiction(?P[ ]+with characters)?", - soup.find('meta',{'name':'description'})['content']) - #print("meta desc:%s"%soup.find('meta',{'name':'description'})['content']) - if m != None: - genres=m.group('genres') - if genres != None: - # Hurt/Comfort is one genre. 
- genres=re.sub('Hurt/Comfort','Hurt-Comfort',genres) - for g in genres.split('/'): - self.story.addToList('genre',g) - - if m.group('chars') != None: - - # At this point we've proven that there's character(s) - # We can't reliably parse characters out of meta name="description". - # There's no way to tell that "with characters Ichigo K. & Neliel T. O./Nel. " ends at "Nel.", not "T." - # But we can pull them from the reviewstext line, now that we know about existance of chars. - # reviewstext can take form of: - # - English - Shinji H. - Updated: 01-13-12 - Published: 12-20-11 - id:7654123 - # - English - Adventure/Angst - Ichigo K. & Neliel T. O./Nel - Reviews: - # - English - Humor/Adventure - Harry P. & Ironhide - Reviews: - # - Spanish - Romance/Humor - SpongeBob - Reviews: - #print("metatext:%s"%metatext) - mc = re.match(r" - (?P[^ ]+ - )(?P[^ ]+ - )? ?(?P.+?) - (Reviews|Updated|Published)", - metatext) - chars = mc.group("chars") - for c in chars.split('&'): - self.story.addToList('characters',c.strip()) return def getChapterText(self, url): @@ -269,10 +258,10 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): if sharediv: sharediv.extract() - div = soup.find('div', {'id' : 'storytext'}) + div = soup.find('div', {'id' : 'storytextp'}) if None == div: - logging.debug('div id=storytext not found. data:%s'%data) + logging.debug('div id=storytextp not found. data:%s'%data) raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url) return self.utf8FromSoup(url,div) diff --git a/fanficdownloader/story.py b/fanficdownloader/story.py index 7dc7bdb0..d6b5f881 100644 --- a/fanficdownloader/story.py +++ b/fanficdownloader/story.py @@ -252,7 +252,12 @@ class Story: allmetadata[l] = self.getMetadata(l, removeallentities, doreplacements) return allmetadata - + + # just for less clutter in adapters. 
+ def extendList(self,listname,l): + for v in l: + self.addToList(listname,v) + def addToList(self,listname,value): if value==None: return diff --git a/index.html b/index.html index 2270782a..cc9a3cdf 100644 --- a/index.html +++ b/index.html @@ -54,6 +54,10 @@ much easier.

+

fanfiction.net fixed

+

+ I hope. I'm now parsing it a different way, which I hope will work in all cases. Story text should be fine, but keep an eye on the metadata.

New Sites

New sites www.dokuga.com, www.ik-eternal.net added. Thanks, Ida.