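Fix series numbering (series#) when story links use javascript hrefs or carry different warning numbers (warning#s). Also handle javascript links when collecting story URLs from a page (geturls).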

This commit is contained in:
Jim Miller 2012-06-18 12:24:50 -05:00
parent ce0a9b93e0
commit f472c75b8c
9 changed files with 58 additions and 36 deletions

View file

@ -27,7 +27,7 @@ class FanFictionDownLoaderBase(InterfaceActionBase):
description = 'UI plugin to download FanFiction stories from various sites.'
supported_platforms = ['windows', 'osx', 'linux']
author = 'Jim Miller'
version = (1, 5, 34)
version = (1, 5, 36)
minimum_calibre_version = (0, 8, 30)
#: This field defines the GUI plugin class that contains all the code

View file

@ -227,13 +227,15 @@ class ErosnSapphoSycophantHexComAdapter(BaseSiteAdapter):
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
# skip 'report this' and 'TOC' links
if 'contact.php' not in a['href'] and 'index' not in a['href']:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
except:
# I find it hard to care if the series parsing fails

View file

@ -274,13 +274,15 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
# skip 'report this' and 'TOC' links
if 'contact.php' not in a['href'] and 'index' not in a['href']:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
except:
# I find it hard to care if the series parsing fails

View file

@ -222,13 +222,15 @@ class LibraryOfMoriaComAdapter(BaseSiteAdapter):
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
# skip 'report this' and 'TOC' links
if 'contact.php' not in a['href'] and 'index' not in a['href']:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
except:
# I find it hard to care if the series parsing fails

View file

@ -260,10 +260,12 @@ class MidnightwhispersCaAdapter(BaseSiteAdapter): # XXX
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
# skip 'report this' and 'TOC' links
if 'contact.php' not in a['href'] and 'index' not in a['href']:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
except:
# I find it hard to care if the series parsing fails

View file

@ -303,13 +303,15 @@ class MuggleNetComAdapter(BaseSiteAdapter): # XXX
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
# skip 'report this' and 'TOC' links
if 'contact.php' not in a['href'] and 'index' not in a['href']:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
except:
# I find it hard to care if the series parsing fails

View file

@ -261,13 +261,15 @@ class NfaCommunityComAdapter(BaseSiteAdapter): # XXX
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
# skip 'report this' and 'TOC' links
if 'contact.php' not in a['href'] and 'index' not in a['href']:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
except:
# I find it hard to care if the series parsing fails

View file

@ -234,13 +234,17 @@ class YourFanfictionComAdapter(BaseSiteAdapter):
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
print("series a['href']:%s"%a['href'])
# skip 'report this' and 'TOC' links
if 'contact.php' not in a['href'] and 'index' not in a['href']:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
break
i+=1
except:
# I find it hard to care if the series parsing fails

View file

@ -15,7 +15,7 @@
# limitations under the License.
#
import re
import urlparse
import urllib2 as u2
import ConfigParser
@ -37,7 +37,13 @@ def get_urls_from_page(url):
for a in soup.findAll('a'):
if a.has_key('href'):
href = form_url(url,a['href'])
# lots of eFiction sites use similar 'are you old enough' javascript links.
if 'javascript' in a['href'] and 'viewstory.php' in a['href']:
m = re.search(r"'(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)",a['href'])
if m != None:
href = form_url(url,m.group('sid'))
try:
href = href.replace('&index=1','')
adapter = adapters.getAdapter(config,href,"EPUB")
if adapter.story.getMetadata('storyUrl') not in normalized:
normalized.add(adapter.story.getMetadata('storyUrl'))