Fix series numbering (series#) for stories with javascript links and differing warning numbers (warning#s). geturls now handles javascript links.

2026-04-23 07:23:23 +02:00 · 2012-06-18 12:24:50 -05:00 · 2012-06-18 12:24:50 -05:00 · f472c75b8c
commit f472c75b8c
parent ce0a9b93e0
9 changed files with 58 additions and 36 deletions
--- a/calibre-plugin/__init__.py
+++ b/calibre-plugin/__init__.py
@@ -27,7 +27,7 @@ class FanFictionDownLoaderBase(InterfaceActionBase):
    description         = 'UI plugin to download FanFiction stories from various sites.'
    supported_platforms = ['windows', 'osx', 'linux']
    author              = 'Jim Miller'
-    version             = (1, 5, 34)
+    version             = (1, 5, 36)
    minimum_calibre_version = (0, 8, 30)

    #: This field defines the GUI plugin class that contains all the code
--- a/fanficdownloader/adapters/adapter_erosnsapphosycophanthexcom.py
+++ b/fanficdownloader/adapters/adapter_erosnsapphosycophanthexcom.py
@@ -227,13 +227,15 @@ class ErosnSapphoSycophantHexComAdapter(BaseSiteAdapter):

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
-            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
+            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
            for a in storyas:
-                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
-                    self.setSeries(series_name, i)
-                    break
-                i+=1
+                # skip 'report this' and 'TOC' links
+                if 'contact.php' not in a['href'] and 'index' not in a['href']:
+                    if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
+                        self.setSeries(series_name, i)
+                        break
+                    i+=1
            
        except:
            # I find it hard to care if the series parsing fails
--- a/fanficdownloader/adapters/adapter_ksarchivecom.py
+++ b/fanficdownloader/adapters/adapter_ksarchivecom.py
@@ -274,13 +274,15 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
-            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
+            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
            for a in storyas:
-                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
-                    self.setSeries(series_name, i)
-                    break
-                i+=1
+                # skip 'report this' and 'TOC' links
+                if 'contact.php' not in a['href'] and 'index' not in a['href']:
+                    if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
+                        self.setSeries(series_name, i)
+                        break
+                    i+=1
            
        except:
            # I find it hard to care if the series parsing fails
--- a/fanficdownloader/adapters/adapter_libraryofmoriacom.py
+++ b/fanficdownloader/adapters/adapter_libraryofmoriacom.py
@@ -222,13 +222,15 @@ class LibraryOfMoriaComAdapter(BaseSiteAdapter):

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
-            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
+            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
            for a in storyas:
-                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
-                    self.setSeries(series_name, i)
-                    break
-                i+=1
+                # skip 'report this' and 'TOC' links
+                if 'contact.php' not in a['href'] and 'index' not in a['href']:
+                    if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
+                        self.setSeries(series_name, i)
+                        break
+                    i+=1
            
        except:
            # I find it hard to care if the series parsing fails
--- a/fanficdownloader/adapters/adapter_midnightwhispersca.py
+++ b/fanficdownloader/adapters/adapter_midnightwhispersca.py
@@ -260,10 +260,12 @@ class MidnightwhispersCaAdapter(BaseSiteAdapter): # XXX
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            i=1
            for a in storyas:
-                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
-                    self.setSeries(series_name, i)
-                    break
-                i+=1
+                # skip 'report this' and 'TOC' links
+                if 'contact.php' not in a['href'] and 'index' not in a['href']:
+                    if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
+                        self.setSeries(series_name, i)
+                        break
+                    i+=1
            
        except:
            # I find it hard to care if the series parsing fails
--- a/fanficdownloader/adapters/adapter_mugglenetcom.py
+++ b/fanficdownloader/adapters/adapter_mugglenetcom.py
@@ -303,13 +303,15 @@ class MuggleNetComAdapter(BaseSiteAdapter): # XXX

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
-            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
+            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
            for a in storyas:
-                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
-                    self.setSeries(series_name, i)
-                    break
-                i+=1
+                # skip 'report this' and 'TOC' links
+                if 'contact.php' not in a['href'] and 'index' not in a['href']:
+                    if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
+                        self.setSeries(series_name, i)
+                        break
+                    i+=1
            
        except:
            # I find it hard to care if the series parsing fails
--- a/fanficdownloader/adapters/adapter_nfacommunitycom.py
+++ b/fanficdownloader/adapters/adapter_nfacommunitycom.py
@@ -261,13 +261,15 @@ class NfaCommunityComAdapter(BaseSiteAdapter): # XXX

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
-            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
+            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
            for a in storyas:
-                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
-                    self.setSeries(series_name, i)
-                    break
-                i+=1
+                # skip 'report this' and 'TOC' links
+                if 'contact.php' not in a['href'] and 'index' not in a['href']:
+                    if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
+                        self.setSeries(series_name, i)
+                        break
+                    i+=1
            
        except:
            # I find it hard to care if the series parsing fails
--- a/fanficdownloader/adapters/adapter_yourfanfictioncom.py
+++ b/fanficdownloader/adapters/adapter_yourfanfictioncom.py
@@ -234,13 +234,17 @@ class YourFanfictionComAdapter(BaseSiteAdapter):

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
-            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
+            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
+            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
            for a in storyas:
-                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
-                    self.setSeries(series_name, i)
-                    break
-                i+=1
+                print("series a['href']:%s"%a['href'])
+                # skip 'report this' and 'TOC' links
+                if 'contact.php' not in a['href'] and 'index' not in a['href']:
+                    if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
+                        self.setSeries(series_name, i)
+                        break
+                    i+=1
            
        except:
            # I find it hard to care if the series parsing fails
--- a/fanficdownloader/geturls.py
+++ b/fanficdownloader/geturls.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 #

-
+import re
 import urlparse
 import urllib2 as u2
 import ConfigParser
@@ -37,7 +37,13 @@ def get_urls_from_page(url):
    for a in soup.findAll('a'):
        if a.has_key('href'):
            href = form_url(url,a['href'])
+            # lots of eFiction sites use similar 'are you old enough' javascript links.
+            if 'javascript' in a['href'] and 'viewstory.php' in a['href']:
+                m = re.search(r"'(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)",a['href'])
+                if m != None:
+                    href = form_url(url,m.group('sid'))
            try:
+                href = href.replace('&index=1','')
                adapter = adapters.getAdapter(config,href,"EPUB")
                if adapter.story.getMetadata('storyUrl') not in normalized:
                    normalized.add(adapter.story.getMetadata('storyUrl'))