Add a new adapter for the potionsandsnitches web site. Once we started looking for the additional information for the title page, things were sufficiently different from the twilighted adapter that it couldn't be used anymore.

2026-05-05 02:51:48 +02:00 · 2010-11-10 23:11:08 -06:00 · 2010-11-10 23:11:08 -06:00 · 30b914788c
commit 30b914788c
parent ed4ff4b6ab
3 changed files with 534 additions and 6 deletions
--- a/downloader.py
+++ b/downloader.py
@ -22,6 +22,7 @@ import adapter
 import fictionalley
 import hpfiction
 import twilighted
+import potionsNsnitches

 import time

@ -99,8 +100,10 @@ if __name__ == '__main__':
 		adapter = ffnet.FFNet(url)
 	elif url.find('harrypotterfanfiction.com') != -1:
 		adapter = hpfiction.HPFiction(url)
-	elif url.find('twilighted.net') != -1 or url.find('potionsandsnitches.net') != -1:
+	elif url.find('twilighted.net') != -1:
 		adapter = twilighted.Twilighted(url)
+	elif url.find('potionsandsnitches.net') != -1:
+		adapter = potionsNsnitches.PotionsNSnitches(url)
 	else:
 		print >> sys.stderr, "Oi! I can haz not appropriate adapter for URL %s!" % url
 		sys.exit(1)
--- a/potionsNsnitches.py
+++ b/potionsNsnitches.py
@ -0,0 +1,498 @@
+# -*- coding: utf-8 -*-
+
+# Copied from the twilighted.py because site is almost the same..
+# of course, now that we're trying to scrape more detail about the
+# story, there were differences in how headers are displayed
+
+import os
+import re
+import sys
+import shutil
+import os.path
+import urllib as u
+import logging
+import pprint as pp
+import unittest
+import urllib2 as u2
+import urlparse as up
+import BeautifulSoup as bs
+import htmlentitydefs as hdefs
+import time
+import datetime
+
+from adapter import *
+
+class PotionsNSnitches(FanfictionSiteAdapter):
+    def __init__(self, url):
+        self.url = url
+        parsedUrl = up.urlparse(url)
+        self.host = parsedUrl.netloc
+        self.path = parsedUrl.path
+        self.opener = u2.build_opener(u2.HTTPCookieProcessor())
+        self.password = ''
+        self.login='sigizmund'
+        self.storyDescription = 'Fanfiction Story'
+        self.authorId = '0'
+        self.authorURL = ''
+        self.storyId = '0'
+        self.storyPublished = datetime.date(1970, 01, 31)
+        self.storyCreated = datetime.datetime.now()
+        self.storyUpdated = datetime.date(1970, 01, 31)
+        self.languageId = 'en-UK'
+        self.language = 'English'
+        self.subjects = []
+        self.subjects.append ('fanfiction')
+        self.subjects.append ('Harry Potter')
+        self.publisher = self.host
+        self.numChapters = 0
+        self.numWords = 0
+        self.genre = 'FanFiction'
+        self.category = 'Category'
+        self.storyStatus = 'In-Progress'
+        self.storyRating = 'PG'
+        self.storyUserRating = '0'
+        self.storyCharacters = []
+        self.storySeries = ''
+        
+        self.chapurl = False
+        ss=self.url.split('?')
+        logging.debug('ss=%s' % ss)
+        if ss is not None and len(ss) > 1:
+            sss = ss[1].replace('&amp;','&').split('&')
+            logging.debug('sss=%s' % sss)
+            if sss is not None and len(sss) > 0:
+                ssss = sss[0].split('=')
+                logging.debug('ssss=%s' % ssss)
+                if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
+                    self.storyId = ssss[1]
+                if len(sss) > 1:
+                    ssss = sss[1].split('=')
+                    logging.debug('ssss=%s' % ssss)
+                    if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
+                        self.chapurl = True
+
+        self.url = 'http://' + self.host + '/' + self.path + '?sid=' + self.storyId
+        logging.debug('self.url=%s' % self.url)
+        
+        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
+        logging.debug('self.uuid=%s' % self.uuid)
+    
+        logging.debug("Created PotionsNSnitches: url=%s" % (self.url))
+
+
+    def requiresLogin(self, url = None):
+        # potionsandsnitches.net doesn't require login.
+        if self.host == 'potionsandsnitches.net':
+          return False
+        else:
+          return True
+
+    def performLogin(self, url = None):
+        data = {}
+    
+        data['penname'] = self.login
+        data['password'] = self.password
+        data['cookiecheck'] = '1'
+        data['submit'] = 'Submit'
+    
+        urlvals = u.urlencode(data)
+        loginUrl = 'http://' + self.host + self._getLoginScript()
+        logging.debug("Will now login to URL %s" % loginUrl)
+    
+        req = self.opener.open(loginUrl, urlvals)
+    
+        d = req.read().decode('utf-8')
+    
+        if self.reqLoginData(d) :
+          return False
+        else:
+          return True
+
+
+    def setLogin(self, login):
+        self.login = login
+
+    def setPassword(self, password):
+        self.password = password
+
+    def _addSubject(self, subject):
+        subj = subject.upper()
+        for s in self.subjects:
+            if s.upper() == subj:
+                return False
+        self.subjects.append(subject)
+        return True
+
+    def _addCharacter(self, character):
+        chara = character.upper()
+        for c in self.storyCharacters:
+            if c.upper() == chara:
+                return False
+        self.storyCharacters.append(character)
+        return True 
+    
+    def _fillCharacters(self, strlist, idx, maxlen):
+        ii = idx
+        while ii < maxlen:
+            chara = strlist[ii].strip()
+            if len(chara) > 0:
+                if chara.find(':') != -1:
+                    return (ii-1)
+                elif chara.find(',') == -1:
+                    self._addCharacter (chara)
+            ii = ii + 1
+        return (ii) 
+
+    def _buildGenre(self, strlist, idx, maxlen):
+        self.genre = ''
+        ii = idx
+        while ii < maxlen:
+            genre = strlist[ii].strip()
+            if len(genre) > 0:
+                if genre.find(':') != -1:
+                    return (ii-1)
+                elif genre.find(',') != -1:
+                    genre = ', '
+                else:
+                    self._addSubject (genre)
+                self.genre = self.genre + genre
+            ii = ii + 1
+        return (ii) 
+
+    def _buildCategory(self, strlist, idx, maxlen):
+        self.category = ''
+        ii = idx
+        while ii < maxlen:
+            cat = strlist[ii].strip()
+            if len(cat) > 0:
+                if cat.find(':') != -1:
+                    return (ii-1)
+                elif cat.find(',') != -1:
+                    cat = ', '
+                else:
+                    self._addSubject (cat)
+                self.category = self.category + cat
+            ii = ii + 1
+        return (ii) 
+
+    def extractIndividualUrls(self):
+        url = self.url + '&chapter=1'
+        data = self.opener.open(url).read()
+        
+        if self.reqLoginData(data):
+          self.performLogin()
+          data = self.opener.open(url).read()
+          if self.reqLoginData(data):
+            return None
+        
+        soup = bs.BeautifulStoneSoup(data)
+
+        self.storyName = ''
+        self.authorName = ''
+        self.storyId = '0'
+        title = soup.find('title').string
+        if title is not None and len(title) > 0:
+            logging.debug('Title: %s' % title)
+            ss = title.split(' by ')
+            if ss is not None and len(ss) > 1:
+                self.storyName = ss[0].strip()
+                self.authorName = ss[1].strip()
+                self.outputName = self.storyName.replace(" ", "_") + '-pNs_' + self.storyId
+
+        logging.debug('self.storyId=%s, self.storyName=%s, self.outputName=%s' % (self.storyId, self.storyName, self.outputName))
+        logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
+                
+        select = soup.find('select', { 'name' : 'chapter' } )
+    	 
+        result = []
+        if select is None:
+    	   # no chapters found, try url by itself.
+           chaptitle = soup.find('div', { 'id' : 'chaptertitle' } )
+           if chaptitle is not None and chaptitle.string is not None and len(chaptitle.string) > 0:
+               result.append((url,chaptitle.string))
+           else:
+    	       result.append((url,self.storyName))
+        else:
+    	   allOptions = select.findAll('option')
+    	   for o in allOptions:
+    	     url = self.url + "&chapter=%s" % o['value']
+    	     title = o.string
+    	     result.append((url,title))
+    
+        url = self.url + "&index=1"
+        data = self.opener.open(url).read()
+        lines = data.split('\n')
+        soup = bs.BeautifulStoneSoup(data)
+        pgt = soup.find('div', {'id' : 'pagetitle'})
+        #logging.debug('pagetitle: %s' % pgt)
+        pgtAs = pgt.findAll('a')
+        #logging.debug('pgtAs: %s' % pgtAs)
+        for a in pgtAs:
+            if a['href'].find('viewstory.php') != -1:
+                (u1, self.storyId) = a['href'].split('=')
+                self.storyName = a.string
+                logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
+            elif a['href'].find('viewuser.php') != -1:
+                self.authorName = a.string
+                self.authorURL = 'http://' + self.host + '/' + a['href']
+                (u1, self.authorId) = a['href'].split('=')
+                logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
+
+        output = soup.find('div', {'id' : 'output'})
+        #logging.debug('output: %s' % str(output))
+        if output is not None and len(str(output)) > 1:
+            s2 = re.split ('<[^>]+>', str(output))
+            #logging.debug('s2=%s' % s2)
+            ii = 0
+            ll = len(s2)
+            while ii < ll:
+                if s2[ii] == 'Summary:' and ii+1 < ll:
+                    self.storyDescription = s2[ii+1].strip()
+                    logging.debug('self.storyDescription: %s' % self.storyDescription)
+                    break;
+                ii = ii+1
+            
+        cnt = soup.find('div', {'class' : 'content'})
+        #logging.debug('content: %s' % cnt)
+        cnttd = cnt.findAll('td')
+        #logging.debug('cnttd: %s' % cnttd)
+        for td in cnttd:
+            #logging.debug('td: %s' % str(td))
+            ss = str(td).replace('\n','').replace('\r','').replace('&nbsp;', ' ')
+            if len(ss) > 1:
+                s2 = re.split ('<[^>]+>', ss)
+                #logging.debug('s2=%s' % s2)
+                ii = 0
+                ll = len(s2)
+                while ii < ll-1:
+                    if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
+                        skey = s2[ii].strip()
+                        ii = ii+1
+                        if skey == 'Rated:':
+                            self.storyRating = s2[ii].strip()
+                            logging.debug('self.storyRating=%s' % self.storyRating)
+                            ii = ii + 1
+                        elif skey == 'Chapters:':
+                            self.numChapters = s2[ii].strip()
+                            logging.debug('self.numChapters=%s' % self.numChapters)
+                            ii = ii + 1
+                        elif skey == 'Characters:':
+                            ii = self._fillCharacters(s2, ii, ll)
+                            logging.debug('self.storyCharacters=%s' % self.storyCharacters)
+                            ii = ii + 1
+                        elif skey == 'Genres:':
+                            ii = self._buildGenre(s2, ii, ll)
+                            logging.debug('self.genre=%s' % self.genre)
+                            logging.debug('self.subjects=%s' % self.subjects)
+                        elif skey == 'Categories:':
+                            ii = self._buildCategory(s2, ii, ll)
+                            logging.debug('self.category=%s' % self.category)
+                            logging.debug('self.subjects=%s' % self.subjects)
+                        elif skey == 'Completed:':
+                            if s2[ii].strip(' ') == "No":
+                                self.storyStatus = 'In-Progress'
+                            else:
+                                self.storyStatus = 'Completed'
+                            ii = ii + 1
+                        elif skey == 'Word count:':
+                            self.numWords = s2[ii].strip()
+                            if self.numWords is None or len(self.numWords) == 0:
+                                self.numWords = '0'
+                            logging.debug('self.numWords=%s' % self.numWords)
+                            ii = ii + 1
+                        elif skey == 'Takes Place:':
+                            ii = ii + 1
+                        elif skey == 'Awards:':
+                            ii = ii + 1
+                        elif skey == 'Series:':
+                            ii = ii + 1
+                        elif skey == 'Read:':
+                            ii = ii + 1
+                        elif skey == 'Warnings:':
+                            ii = ii + 1
+                    else:
+                        ii = ii + 1
+                                        
+        tls = soup.findAll('div', {'style' : 'text-align: center;'})
+        for tl in tls:
+            #logging.debug('tl: %s' % tl)
+            ss = str(tl).replace('\n','').replace('\r','').replace('&nbsp;', ' ')
+            if ss.find('Published:') != -1:
+                s2 = re.split ('<[^>]+>', ss)
+                #logging.debug('s2: %s' % s2)
+                ii = 0
+                ll = len(s2)
+                while ii < ll-1:
+                    if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
+                        skey = s2[ii].strip()
+                        #logging.debug('skey: %s' % skey)
+                        ii = ii+1
+                        if skey == 'Published:':
+                            self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
+                            logging.debug('self.storyPublished=%s' % self.storyPublished)
+                            ii = ii + 1
+                        elif skey == 'Updated:':
+                            self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
+                            logging.debug('self.storyUpdated=%s' % self.storyUpdated)
+                            ii = ii + 1
+                    else:
+                        ii = ii + 1
+            
+        if (self.storyName is None or len(self.storyName) == 0) and self.storyId == '0': 
+            logging.error('self.storyName is empty!!  Exitting!')
+            exit(1)
+            
+        self.outputName = self.storyName.replace(" ", "_") + '-pNs_' + self.storyId
+        logging.debug('self.outputName=%s' % self.outputName)
+        
+        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
+        logging.debug('self.uuid=%s' % self.uuid)
+
+        return result
+
+    def getStoryName(self):
+        return self.storyName
+
+    def getOutputName(self):
+        return self.outputName
+		
+    def getAuthorName(self):
+        return self.authorName
+    
+    def getText(self, url):
+        if url.find('http://') == -1:
+          url = 'http://' + self.host + '/' + url
+    
+        logging.debug('Getting data from: %s' % url)
+    
+        data = self.opener.open(url).read()
+        
+        # need to do this, because for some reason the <br /> tag in the story causes problems
+        data = data.replace('<br />', ' SOMETHING_BR ')
+        soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
+    
+        div = soup.find('div', {'id' : 'story'})
+    
+        if None == div:
+          return '<html/>'
+
+        # put the <br /> tags back in..
+        text = div.__str__('utf8').replace(' SOMETHING_BR ','<br />')    
+        return text
+
+    def _getLoginScript(self):
+        return '/user.php?action=login'
+
+    def reqLoginData(self, data):
+        if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1:
+          return True
+        else:
+          return False
+
+    def getHost(self):
+        logging.debug('self.host=%s' % self.host)
+        return self.host
+
+    def getStoryURL(self):
+        logging.debug('self.url=%s' % self.url)
+        return self.url
+
+    def getAuthorURL(self):
+        logging.debug('self.authorURL=%s' % self.authorURL)
+        return self.authorURL
+
+    def getUUID(self):
+        logging.debug('self.uuid=%s' % self.uuid)
+        return self.uuid
+    
+    def getStoryDescription(self):
+        logging.debug('self.storyDescription=%s' % self.storyDescription)
+        return self.storyDescription
+    
+    def getStoryPublished(self):
+        logging.debug('self.storyPublished=%s' % self.storyPublished)
+        return self.storyPublished
+    
+    def getStoryCreated(self):
+        self.storyCreated = datetime.datetime.now()
+        logging.debug('self.storyCreated=%s' % self.storyCreated)
+        return self.storyCreated
+    
+    def getStoryUpdated(self):
+        logging.debug('self.storyUpdated=%s' % self.storyUpdated)
+        return self.storyUpdated
+    
+    def getLanguage(self):
+        logging.debug('self.language=%s' % self.language)
+        return self.language
+    
+    def getLanguageId(self):
+        logging.debug('self.languageId=%s' % self.languageId)
+        return self.languageId
+    
+    def getSubjects(self):
+        logging.debug('self.subjects=%s' % self.authorName)
+        return self.subjects
+    
+    def getPublisher(self):
+        logging.debug('self.publisher=%s' % self.publisher)
+        return self.publisher
+    
+    def getNumChapters(self):
+        logging.debug('self.numChapters=%s' % self.numChapters)
+        return self.numChapters
+    
+    def getNumWords(self):
+        logging.debug('self.numWords=%s' % self.numWords)
+        return self.numWords
+    
+    def getAuthorId(self):
+        logging.debug('self.authorId=%s' % self.authorId)
+        return self.authorId
+    
+    def getStoryId(self):
+        logging.debug('self.storyId=%s' % self.storyId)
+        return self.storyId
+    
+    def getCategory(self):
+        logging.debug('self.category=%s' % self.category)
+        return self.category
+    
+    def getGenre(self):
+        logging.debug('self.genre=%s' % self.genre)
+        return self.genre
+    
+    def getStoryStatus(self):
+        logging.debug('self.storyStatus=%s' % self.storyStatus)
+        return self.storyStatus
+    
+    def getStoryRating(self):
+        logging.debug('self.storyRating=%s' % self.storyRating)
+        return self.storyRating
+    
+    def getStoryUserRating(self):
+        logging.debug('self.storyUserRating=%s' % self.storyUserRating)
+        return self.storyUserRating
+    
+    def getStoryCharacters(self):
+        logging.debug('self.storyCharacters=%s' % self.storyCharacters)
+        return self.storyCharacters
+
+    def getStorySeries(self):
+        logging.debug('self.storySeries=%s' % self.storySeries)
+        return self.storySeries
+
+class PotionsNSnitches_UnitTests(unittest.TestCase):
+    def setUp(self):
+        logging.basicConfig(level=logging.DEBUG)
+        pass
+  
+    def testLoginWorks(self):
+        pass
+  
+    def testGetUrlsWorks(self):
+        url = 'http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2230'
+        self.assertEquals(32, len(Twilighted(url).extractIndividualUrls()))
+
+if __name__ == '__main__':
+    unittest.main()
--- a/twilighted.py
+++ b/twilighted.py
@ -50,6 +50,27 @@ class Twilighted(FanfictionSiteAdapter):
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''
+        
+        self.chapurl = False
+        ss=self.url.split('?')
+        logging.debug('ss=%s' % ss)
+        if ss is not None and len(ss) > 1:
+            sss = ss[1].replace('&amp;','&').split('&')
+            logging.debug('sss=%s' % sss)
+            if sss is not None and len(sss) > 0:
+                ssss = sss[0].split('=')
+                logging.debug('ssss=%s' % ssss)
+                if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
+                    self.storyId = ssss[1]
+                if len(sss) > 1:
+                    ssss = sss[1].split('=')
+                    logging.debug('ssss=%s' % ssss)
+                    if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
+                        self.chapurl = True
+
+        self.url = 'http://' + self.host + '/' + self.path + '?sid=' + self.storyId
+        logging.debug('self.url=%s' % self.url)
+        
        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s' % self.uuid)
    
@ -108,21 +129,26 @@ class Twilighted(FanfictionSiteAdapter):
        return True

    def extractIndividualUrls(self):
-        data = self.opener.open(self.url).read()
+        url = self.url + '&chapter=1'
+        data = self.opener.open(url).read()
        
        if self.reqLoginData(data):
          self.performLogin()
-          data = self.opener.open(self.url).read()
+          data = self.opener.open(url).read()
          if self.reqLoginData(data):
            return None
        
        soup = bs.BeautifulStoneSoup(data)
-    
+
        title = soup.find('title').string
+        logging.debug('Title: %s' % title)
        self.storyName = title.split(' by ')[0].strip()
        self.authorName = title.split(' by ')[1].strip()
-        self.outputName = self.storyName.replace(" ", "_")
-    
+        self.outputName = self.storyName.replace(" ", "_") + '-tw_' + self.storyId
+
+        logging.debug('self.storyId=%s, self.storyName=%s, self.outputName=%s' % (self.storyId, self.storyName, self.outputName))
+        logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
+                
        select = soup.find('select', { 'name' : 'chapter' } )
    	 
        result = []
@ -141,6 +167,7 @@ class Twilighted(FanfictionSiteAdapter):
        lines = data.split('\n')
        soup = bs.BeautifulStoneSoup(data)
        metas = soup.findAll('meta')
+        
        for meta in metas:
            if 'name' in meta._getAttrMap() and meta['name'].find('description') != -1:
                #logging.debug('Meta: %s' % meta)