diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index a87d1d8f..1c78d469 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -1213,65 +1213,6 @@ extra_titlepage_entries:eroticatags,disclaimer #username:YourName #password:yourpassword -[archive.hpfanfictalk.com] -## Some sites also require the user to confirm they are adult for -## adult content. In commandline version, this should go in your -## personal.ini, not defaults.ini. -#is_adult:true - -add_to_extra_valid_entries:,themes,inclusivity,house, - series00,series00Url,series00HTML, - series01,series01Url,series01HTML, - series02,series02Url,series02HTML, - series03,series03Url,series03HTML, - series04,series04Url,series04HTML, - series05,series05Url,series05HTML, - series06,series06Url,series06HTML, - series07,series07Url,series07HTML, - series08,series08Url,series08HTML, - series09,series09Url,series09HTML, - -## Assume entryUrl, apply to "%s" to -## make entryHTML. -make_linkhtml_entries:series00,series01,series02,series03,series04, - series05,series06,series07,series08,series09 - -themes_label:Themes -inclusivity_label:Inclusivity -house_label:HPFT Forum House - -## series00 will be the same as common metadata series. -series00HTML_label:Series -series01HTML_label:Additional Series -series02HTML_label:Additional Series -series03HTML_label:Additional Series -series04HTML_label:Additional Series -series05HTML_label:Additional Series -series06HTML_label:Additional Series -series07HTML_label:Additional Series -series08HTML_label:Additional Series -series09HTML_label:Additional Series - -## Try to collect series names and numbers of this story in those -## series. This lets us turn it on and off by site without keeping a -## lengthy titlepage_entries per site and prevents it updating in the -## plugin. 
-collect_series: true - -add_to_extra_titlepage_entries:,series01HTML,series02HTML,series03HTML, - series04HTML,series05HTML,series06HTML,series07HTML,series08HTML,series09HTML - -## archive.hpfanfictalk.com takes margins away, even from p tags, by -## default. So authors have to either include extra br/p tags or -## their own styles. These allow for both, but leave you at the mercy -## of author CSS. -add_to_output_css: - * { - margin: 0; - padding: 0; - } -add_to_keep_html_attrs:,style - [archive.shriftweb.org] website_encodings:Windows-1252,utf8,iso-8859-1 @@ -1678,6 +1619,72 @@ make_linkhtml_entries:translators,betas ## can change it. include_in_category:fandoms +[fanfictalk.com] +## Some sites also require the user to confirm they are adult for +## adult content. In commandline version, this should go in your +## personal.ini, not defaults.ini. +#is_adult:true + +add_to_extra_valid_entries:,tropes,themes,representation,inclusivity, + house,storytype,contentwarnings, + series00,series00Url,series00HTML, + series01,series01Url,series01HTML, + series02,series02Url,series02HTML, + series03,series03Url,series03HTML, + series04,series04Url,series04HTML, + series05,series05Url,series05HTML, + series06,series06Url,series06HTML, + series07,series07Url,series07HTML, + series08,series08Url,series08HTML, + series09,series09Url,series09HTML, + +# Fields were renamed when the site changed its domain name. +include_in_inclusivity:representation +include_in_themes:tropes + +## Assume entryUrl, apply to "<a class='%slink' href='%s'>%s</a>" to +## make entryHTML. +make_linkhtml_entries:series00,series01,series02,series03,series04, + series05,series06,series07,series08,series09 + +tropes_label:Tropes +representation_label:Representation +house_label:HPFT Forum House +storytype_label:Story Type +contentwarnings_label:Content Warnings + +## series00 will be the same as common metadata series.
+series00HTML_label:Series +series01HTML_label:Additional Series +series02HTML_label:Additional Series +series03HTML_label:Additional Series +series04HTML_label:Additional Series +series05HTML_label:Additional Series +series06HTML_label:Additional Series +series07HTML_label:Additional Series +series08HTML_label:Additional Series +series09HTML_label:Additional Series + +## Try to collect series names and numbers of this story in those +## series. This lets us turn it on and off by site without keeping a +## lengthy titlepage_entries per site and prevents it updating in the +## plugin. +collect_series: true + +#add_to_extra_titlepage_entries:,tropes,themes,representation,inclusivity,house,storytype,contentwarnings,series01HTML,series02HTML,series03HTML, +# series04HTML,series05HTML,series06HTML,series07HTML,series08HTML,series09HTML + +## fanfictalk.com takes margins away, even from p tags, by default. +## So authors have to either include extra br/p tags or their own +## styles. These allow for both, but leave you at the mercy of author +## CSS. +add_to_output_css: + * { + margin: 0; + padding: 0; + } +add_to_keep_html_attrs:,style + [fanfiction-junkies.de] website_encodings:Windows-1252,utf8 diff --git a/fanficfare/adapters/__init__.py b/fanficfare/adapters/__init__.py index 5a501e12..5c2842d8 100644 --- a/fanficfare/adapters/__init__.py +++ b/fanficfare/adapters/__init__.py @@ -166,7 +166,7 @@ from . import adapter_hentaifoundrycom from . import adapter_mugglenetfanfictioncom from . import adapter_swiorgru from . import adapter_fanficsme -from . import adapter_archivehpfanfictalkcom +from . import adapter_fanfictalkcom from . import adapter_scifistoriescom from . import adapter_silmarillionwritersguildorg from . 
import adapter_chireadscom diff --git a/fanficfare/adapters/adapter_archivehpfanfictalkcom.py b/fanficfare/adapters/adapter_fanfictalkcom.py similarity index 80% rename from fanficfare/adapters/adapter_archivehpfanfictalkcom.py rename to fanficfare/adapters/adapter_fanfictalkcom.py index 9eae3095..e04d222e 100644 --- a/fanficfare/adapters/adapter_archivehpfanfictalkcom.py +++ b/fanficfare/adapters/adapter_fanfictalkcom.py @@ -30,11 +30,11 @@ from ..six.moves.urllib.error import HTTPError from .base_adapter import BaseSiteAdapter, makeDate def getClass(): - return ArchiveHPfanfictalkComAdapter + return FanfictalkComAdapter # Class name has to be unique. Our convention is camel case the # sitename with Adapter at the end. www is skipped. -class ArchiveHPfanfictalkComAdapter(BaseSiteAdapter): +class FanfictalkComAdapter(BaseSiteAdapter): def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) @@ -48,7 +48,7 @@ class ArchiveHPfanfictalkComAdapter(BaseSiteAdapter): # normalized story URL. - self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId')) + self._setURL('https://' + self.getSiteDomain() + '/archive/viewstory.php?sid='+self.story.getMetadata('storyId')) # Each adapter needs to have a unique site abbreviation. self.story.setMetadata('siteabbrev','ahpfftc') @@ -57,17 +57,26 @@ class ArchiveHPfanfictalkComAdapter(BaseSiteAdapter): # http://docs.python.org/library/datetime.html#strftime-strptime-behavior self.dateformat = "%d %b %Y" + @classmethod + def getAcceptDomains(cls): + return [cls.getSiteDomain(),'archive.hpfanfictalk.com'] + + @classmethod + def getConfigSections(cls): + "Only needs to be overridden if it has additional ini sections." + return [cls.getConfigSection(),'archive.hpfanfictalk.com'] + @staticmethod # must be @staticmethod, don't remove it. def getSiteDomain(): # The site domain. Does have www here, if it uses it.
- return 'archive.hpfanfictalk.com' + return 'fanfictalk.com' @classmethod def getSiteExampleURLs(cls): - return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234" + return "https://"+cls.getSiteDomain()+"/archive/viewstory.php?sid=1234" def getSiteURLPattern(self): - return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$" + return r"https?://(archive\.hp)?"+re.escape(self.getSiteDomain())+r"(/archive)?/viewstory\.php\?sid=\d+$" def use_pagecache(self): ''' @@ -111,24 +120,27 @@ class ArchiveHPfanfictalkComAdapter(BaseSiteAdapter): # Now go hunting for all the meta data and the chapter list. - pagetitle = soup.find('h3') + pagetitle = soup.select_one('div#pagetitle') # logger.debug(pagetitle) ## Title a = pagetitle.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$")) self.story.setMetadata('title',stripHTML(a)) # Find authorid and URL from... author url. - a = pagetitle.find('a', href=re.compile(r"viewuser.php\?uid=\d+")) - self.story.setMetadata('authorId',a['href'].split('=')[1]) - self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href']) - self.story.setMetadata('author',stripHTML(a)) + for a in pagetitle.find_all('a', href=re.compile(r"viewuser.php\?uid=\d+")): + self.story.addToList('authorId',a['href'].split('=')[1]) + self.story.addToList('authorUrl','https://'+self.host+'/'+a['href']) + self.story.addToList('author',stripHTML(a)) # Find the chapters: for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")): # just in case there's tags, like in chapter titles. 
- self.add_chapter(chapter,'http://'+self.host+'/'+chapter['href']) + self.add_chapter(chapter,'https://'+self.host+'/archive/'+chapter['href']) + + # categories + for a in soup.select("div#sort a"): + self.story.addToList('category',stripHTML(a)) - listbox = soup.find('div', {'class':'listbox'}) # this site has two divs with class=gb-50 and no immediate container. gb50s = soup.find_all('div', {'class':'gb-50'}) @@ -137,14 +149,15 @@ class ArchiveHPfanfictalkComAdapter(BaseSiteAdapter): for url in urls: self.story.addToList(metadata,stripHTML(url)) - list_from_urls(listbox,r'browse.php\?type=categories','category') list_from_urls(gb50s[0],r'browse.php\?type=characters','characters') list_from_urls(gb50s[0],r'browse.php\?type=class&type_id=11','ships') + list_from_urls(gb50s[0],r'browse.php\?type=class&type_id=10','representation') + list_from_urls(gb50s[0],r'browse.php\?type=class&type_id=7','storytype') list_from_urls(gb50s[0],r'browse.php\?type=class&type_id=14','house') - list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=4','genre') - list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=13','themes') list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=8','warnings') - list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=10','inclusivity') + list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=15','contentwarnings') + list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=4','genre') + list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=13','tropes') bq = soup.find('blockquote2') if bq: @@ -162,40 +175,27 @@ class ArchiveHPfanfictalkComAdapter(BaseSiteAdapter): # logger.debug(value) # logger.debug(label) - if 'Rating' in label: - # Mature Audiences · Incomplete - (rating,status) = value.split('·') - self.story.setMetadata('rating', rating) - if 'Complete' in status: - self.story.setMetadata('status', 'Completed') - else: - self.story.setMetadata('status', 'In-Progress') - - if 'Story Length' in label: + if 'Words:' in label: 
stripHTML(value) - # 10 chapters (45462 words) - v = stripHTML(value) - v = v.split('(')[1] - v = v.split(' words')[0] - self.story.setMetadata('numWords', v) + self.story.setMetadata('numWords', stripHTML(value).replace('·','')) - if 'Published' in label: + if 'Published:' in label: self.story.setMetadata('datePublished', makeDate(stripHTML(value).replace('·',''), self.dateformat)) - if 'Updated' in label: - self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat)) + if 'Updated:' in label: + self.story.setMetadata('dateUpdated', makeDate(stripHTML(value).replace('·',''), self.dateformat)) # Site allows stories to be in several series at once. FFF # isn't thrilled with that, we have series00, series01, etc. # Example: - # http://archive.hpfanfictalk.com/viewstory.php?sid=483 + # https://fanfictalk.com/archive/viewstory.php?sid=483 if self.getConfig("collect_series"): seriesspan = soup.find('span',label='Series') for i, seriesa in enumerate(seriesspan.find_all('a', href=re.compile(r"viewseries\.php\?seriesid=\d+"))): # logger.debug(seriesa) series_name = stripHTML(seriesa) - series_url = 'https://'+self.host+'/'+seriesa['href'] + series_url = 'https://'+self.host+'/archive/'+seriesa['href'] seriessoup = self.make_soup(self._fetchUrl(series_url)) storyas = seriessoup.find_all('a', href=re.compile(r'viewstory.php\?sid=\d+')) diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index 91c66417..6570804a 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -1244,65 +1244,6 @@ extra_titlepage_entries:eroticatags,disclaimer #username:YourName #password:yourpassword -[archive.hpfanfictalk.com] -## Some sites also require the user to confirm they are adult for -## adult content. In commandline version, this should go in your -## personal.ini, not defaults.ini. 
-#is_adult:true - -add_to_extra_valid_entries:,themes,inclusivity,house, - series00,series00Url,series00HTML, - series01,series01Url,series01HTML, - series02,series02Url,series02HTML, - series03,series03Url,series03HTML, - series04,series04Url,series04HTML, - series05,series05Url,series05HTML, - series06,series06Url,series06HTML, - series07,series07Url,series07HTML, - series08,series08Url,series08HTML, - series09,series09Url,series09HTML, - -## Assume entryUrl, apply to "%s" to -## make entryHTML. -make_linkhtml_entries:series00,series01,series02,series03,series04, - series05,series06,series07,series08,series09 - -themes_label:Themes -inclusivity_label:Inclusivity -house_label:HPFT Forum House - -## series00 will be the same as common metadata series. -series00HTML_label:Series -series01HTML_label:Additional Series -series02HTML_label:Additional Series -series03HTML_label:Additional Series -series04HTML_label:Additional Series -series05HTML_label:Additional Series -series06HTML_label:Additional Series -series07HTML_label:Additional Series -series08HTML_label:Additional Series -series09HTML_label:Additional Series - -## Try to collect series names and numbers of this story in those -## series. This lets us turn it on and off by site without keeping a -## lengthy titlepage_entries per site and prevents it updating in the -## plugin. -collect_series: true - -add_to_extra_titlepage_entries:,series01HTML,series02HTML,series03HTML, - series04HTML,series05HTML,series06HTML,series07HTML,series08HTML,series09HTML - -## archive.hpfanfictalk.com takes margins away, even from p tags, by -## default. So authors have to either include extra br/p tags or -## their own styles. These allow for both, but leave you at the mercy -## of author CSS. -add_to_output_css: - * { - margin: 0; - padding: 0; - } -add_to_keep_html_attrs:,style - [archive.shriftweb.org] website_encodings:Windows-1252,utf8,iso-8859-1 @@ -1709,6 +1650,72 @@ make_linkhtml_entries:translators,betas ## can change it. 
include_in_category:fandoms +[fanfictalk.com] +## Some sites also require the user to confirm they are adult for +## adult content. In commandline version, this should go in your +## personal.ini, not defaults.ini. +#is_adult:true + +add_to_extra_valid_entries:,tropes,themes,representation,inclusivity, + house,storytype,contentwarnings, + series00,series00Url,series00HTML, + series01,series01Url,series01HTML, + series02,series02Url,series02HTML, + series03,series03Url,series03HTML, + series04,series04Url,series04HTML, + series05,series05Url,series05HTML, + series06,series06Url,series06HTML, + series07,series07Url,series07HTML, + series08,series08Url,series08HTML, + series09,series09Url,series09HTML, + +# Fields were renamed when the site changed its domain name. +include_in_inclusivity:representation +include_in_themes:tropes + +## Assume entryUrl, apply to "<a class='%slink' href='%s'>%s</a>" to +## make entryHTML. +make_linkhtml_entries:series00,series01,series02,series03,series04, + series05,series06,series07,series08,series09 + +tropes_label:Tropes +representation_label:Representation +house_label:HPFT Forum House +storytype_label:Story Type +contentwarnings_label:Content Warnings + +## series00 will be the same as common metadata series. +series00HTML_label:Series +series01HTML_label:Additional Series +series02HTML_label:Additional Series +series03HTML_label:Additional Series +series04HTML_label:Additional Series +series05HTML_label:Additional Series +series06HTML_label:Additional Series +series07HTML_label:Additional Series +series08HTML_label:Additional Series +series09HTML_label:Additional Series + +## Try to collect series names and numbers of this story in those +## series. This lets us turn it on and off by site without keeping a +## lengthy titlepage_entries per site and prevents it updating in the +## plugin.
+collect_series: true + +#add_to_extra_titlepage_entries:,tropes,themes,representation,inclusivity,house,storytype,contentwarnings,series01HTML,series02HTML,series03HTML, +# series04HTML,series05HTML,series06HTML,series07HTML,series08HTML,series09HTML + +## fanfictalk.com takes margins away, even from p tags, by default. +## So authors have to either include extra br/p tags or their own +## styles. These allow for both, but leave you at the mercy of author +## CSS. +add_to_output_css: + * { + margin: 0; + padding: 0; + } +add_to_keep_html_attrs:,style + [fanfiction-junkies.de] website_encodings:Windows-1252,utf8