Removed dwiggie.com adapter

Besnef 2012-10-25 19:09:09 -04:00
parent 88bd4f395f
commit d53b7411d9
2 changed files with 0 additions and 356 deletions


@@ -103,7 +103,6 @@ import adapter_merlinficdtwinscouk
import adapter_thehookupzonenet
import adapter_bloodtiesfancom
import adapter_indeathnet
import adapter_dwiggiecom
import adapter_jlaunlimitedcom
import adapter_qafficcom


@@ -1,355 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
return DwiggieComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
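# For example, this convention maps www.dwiggie.com to DwiggieComAdapter:
# drop the leading 'www', camel-case the remaining pieces of the domain,
# and append 'Adapter'.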
class DwiggieComAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
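        # Illustrative sketch only (not code from this adapter): a decode
        # list like this one is typically consumed by trying each encoding
        # in turn until one succeeds, e.g.:
        #   for enc in self.decode:
        #       try:
        #           text = data.decode(enc)
        #           break
        #       except UnicodeDecodeError:
        #           pass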
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
self.is_adult=False
self.sectionUrl = ""
self.section = []
self.chapters = dict()
# # get storyId from url--url validation guarantees query is only sid=1234
# self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
# logging.debug("storyId: (%s)"%self.story.getMetadata('storyId'))
# get storyId from url--url validation guarantees query correct
m = re.match(self.getSiteURLPattern(),url)
if m:
self.story.setMetadata('storyId',m.group('id'))
logging.debug("storyId: (%s)"%self.story.getMetadata('storyId'))
# normalized story URL.
self._setURL('http://www.' + self.getSiteDomain() + '/derby/'+self.story.getMetadata('storyId')+'.htm')
else:
raise exceptions.InvalidStoryURL(url,
self.getSiteDomain(),
self.getSiteExampleURLs())
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','dwg')
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%m/%d/%y"
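        # e.g. with this format, makeDate('10/25/12', self.dateformat)
        # parses a site date such as '10/25/12' (illustrative value) into
        # a datetime for October 25, 2012.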
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
        # The site domain. No www here; both the www and bare
        # forms are accepted via getAcceptDomains below.
return 'dwiggie.com'
@classmethod
def getAcceptDomains(cls):
return ['www.dwiggie.com','dwiggie.com']
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/derby/name1b.htm"
def getSiteURLPattern(self):
# http://www.dwiggie.com/derby/mari17b.htm
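        # Illustrative breakdown (derived from the pattern below, not from
        # site data): the example url above yields id='mari17' and
        # part='b', while a hypothetical archived url such as
        # http://www.dwiggie.com/derby/old_2003/mari17.htm would yield
        # id='old_2003/mari17' and an empty part.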
return re.escape("http://")+"(www.)?"+re.escape(self.getSiteDomain())+r"/derby/(?P<id>(old_\d{4}\/)?[a-z]+\d+)(?P<part>[a-z]*)\.htm$"
def tryArchivePage(self, url):
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url) # need to change the exception returned
else:
raise e
archivesoup = bs.BeautifulSoup(data)
m = re.compile(r"/derby/"+self.story.getMetadata('storyId')+"[a-z]?.htm$")
#print m.pattern
#print archivesoup
        a = archivesoup.find('a', href=m) # the archive page's link back to this story's /derby/ page
return a
def getGenre(self, url):
if re.search('id=E',url):
genre='Epilogue Abbey'
else:
genre='Fantasia Gallery'
self.story.addToList('genre',genre)
def getItemFromArchivePage(self):
urls = ["http://www.dwiggie.com/toc/index.php?id=E&page=all&comp=n","http://www.dwiggie.com/toc/index.php?id=F&page=all&comp=n"]
        for url in urls:
            a = self.tryArchivePage(url)
            if a != None:
                self.getGenre(url)
                return a.parent
        # story was not found on either archive page
        return None
def getMetaFromSearch(self):
params = {}
params['title_name'] = self.story.getMetadata('title')
searchUrl = "http://" + self.getSiteDomain() + "/toc/search.php"
d = self._postUrl(searchUrl, params)
#print d
searchsoup = bs.BeautifulSoup(d)
m = re.compile(r"/derby/"+self.story.getMetadata('storyId')+"[a-z]?.htm$")
#print m.pattern
#print self.story.getMetadata('storyId')
        a = searchsoup.find('a', href=m) # the search result's link back to this story's /derby/ page
return a
def getChaptersFromPage(self, url):
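        # An archive page holds several story sections separated by <hr>
        # rules: split on those, drop the page header, and filter out the
        # next-section navigation links and copyright footer so only story
        # text remains.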
data = self._fetchUrl(url)
s = self.story.getMetadata('storyId').split('/')
s.reverse()
storyId_trimmed = s[0]
m = re.match('.*?<body[^>]*>(\s*<ul>)?(?P<content>.*?)</body>', data, re.DOTALL)
newdata = m.group('content')
regex=re.compile(r'<a\ href\=\"'+storyId_trimmed+'[a-z]?.htm\">(Continued\ [Ii]n\ )?(the\ )?[Nn]ext\ [Ss]ection</a>')
newdata = re.sub(regex, '', newdata)
#pagesections = filter(lambda x:x!=None, re.split('(?m)<hr( \/)?>|<p>\s*<hr( \/)?>\s*<\/p>', newdata, re.MULTILINE))
#pagesections = filter(lambda x:x!=None, re.split('(?m)(<p>\s*)*<hr( \/)?>(\s*<\/p>)?', newdata, re.MULTILINE))
pagesections = filter(lambda x:x!=None, re.split('<hr( \/)?>', newdata))
pagesections = filter(lambda x:x.strip()!='/', pagesections)
#regex = re.compile(r'(href\="'+storyId_trimmed+'[a-z]?.htm$"')
#pagesections = filter(lambda x:re.search(re.compile(storyId_trimmed+"[a-z]?.htm$"),x)==None, pagesections)
pagesections.pop(0) # always remove header
regex = re.compile(r'(?m)(href\="'+storyId_trimmed+'[a-z]?.htm\"|Copyright\ held\ by\ the\ author|<p>\s*(Section\ I|Beginning),\s*</?p>)', re.MULTILINE)
s = filter(lambda x:regex.search(x), pagesections)
#print s
pagesections = filter(lambda x: not regex.search(x), pagesections)
#print pagesections[0]
return pagesections
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
url = self.url
meta = self.getItemFromArchivePage()
#print meta
# Title
t = meta.a
self.story.setMetadata('title',t.string.strip())
# Author
author = meta.find('a','author_link')
if author != None:
self.story.setMetadata('author',author.string.strip())
self.story.setMetadata('authorId',author['href'].split('=')[1])
self.story.setMetadata('authorUrl',author['href'])
author=author.parent
else:
author=meta.i
self.story.setMetadata('author',author.string.replace('Written by','').strip())
self.story.setMetadata('authorId','unknown')
self.story.setMetadata('authorUrl','unknown')
# DateUpdated
dUpdate = meta.find('i',text = re.compile('Last update'))
du = dUpdate.replace('Last update','').replace('.','').strip()
self.story.setMetadata('dateUpdated', makeDate(du, self.dateformat))
compImg=meta.find('img',alt="Dot")
if compImg != None:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
# Summary & Category
# Get the summary components from the meta listing
metalist=meta.contents
s=[]
for x in xrange(0,len(metalist)-1):
item=metalist[x]
if item==author or item==compImg:
s=[]
continue
if item==dUpdate or item==dUpdate.parent:
break
s.append(item)
# create a soup object from the summary components
soup=bs.BeautifulSoup("<p></p>")
d=soup.p
for x in s:
d.append(x)
#print d
# extract category from summary text
desc=stripHTML(d)
books = re.compile(r'(?P<book>\~P&P;?\~|\~Em;?\~|\~MP;?\~|\~S\&S;?\~|\~Per;?\~|\~NA;?\~|\~Juv;?\~|\~Misc;?\~)')
booklist=dict({'~P&P~':'Pride and Prejudice','~Em~':'Emma','~MP~':'Mansfield Park','~S&S~':'Sense and Sensibility','~Per~':'Persuasion','~NA~':'Northanger Abbey','~Juv~':'Juvenilia','~Misc~':'Miscellaneous'})
m=re.search(books,desc)
        logging.debug(m.group('book'))
        book=booklist.get(m.group('book').replace(';',''))
        logging.debug(book)
self.story.addToList('category',book)
# assign summary info
desc=stripHTML(desc).replace(book,'').strip()
desc=re.sub('^.\s*','',desc)
if desc != None:
self.setDescription(url,desc)
## Chapters (Sections in this case - don't know if we can subdivide them)
# get the last Section from the archive page link
#chapters = ["http://www.dwiggie.com"+t['href']]
# get the section letter from the last page
m = re.match("/derby/"+self.story.getMetadata('storyId')+"(?P<section>[a-z]?).htm$",t['href'])
inc = m.group('section')
if inc == '':
inc = 'a'
# get the presumed list of section urls with 'lower' section letters
sections = []
baseurl = "http://www.dwiggie.com/derby/"+self.story.getMetadata('storyId')
extension = ".htm"
ordend = ord(inc)
ordbegin = ord('a')
for numinc in xrange(ordbegin,ordend+1):
inc = chr(numinc)
if inc == 'a':
sections.append(baseurl+extension)
else:
sections.append(baseurl+inc+extension)
# Process List of Chapters
# create 'dummy' urls for individual chapters in the form 'pageurl#pageindex' where page index is an index starting with 0 per page
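        # e.g. the first chapter found on a page like
        # http://www.dwiggie.com/derby/mari17b.htm (the hypothetical
        # example url from getSiteURLPattern) gets the dummy url
        # http://www.dwiggie.com/derby/mari17b.htm#0, the second #1,
        # and so on.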
c = 0
postdate=None
chapters = []
for x in range(0,len(sections)):
section=sections[x]
i=0
for chapter in self.getChaptersFromPage(section):
c+=1
chaptersoup = bs.BeautifulSoup(chapter)
#self.chapterUrls.append(('Chapter '+str(c),section+'#'+str(i)))
cUrl = section+'#'+str(i)
t = chaptersoup.find('font',size="+1",color="#336666")
ctitle = ''
if t!=None:
ctitle=stripHTML(t)
#self.chapterUrls.append(('Chapter '+str(c),cUrl))
self.chapterUrls.append((ctitle,cUrl))
chapters.append((cUrl,chaptersoup))
if postdate==None:
regex=re.compile(r'Posted\ on\:?\ (?P<date>\d{4}\-\d{2}\-\d{2}|\w+,\ \d+\ \w+\ \d{4})')
#Sunday, 21 March 2004, at 6:00 a.m.
m=re.search(regex,chapter)
if m!=None:
postdate=m.group('date')
i+=1
self.chapters=dict(chapters)
#print postdate
pubdate=None
if postdate!=None:
format1=re.match(re.compile(r'\d{4}\-\d{2}\-\d{2}'),postdate)
format2=re.match(re.compile(r'\w+,\ \d+\ \w+\ \d{4}'),postdate)
if format1!=None:
pubdate = makeDate(postdate, "%Y-%m-%d")
if format2!=None:
pubdate = makeDate(postdate, "%A, %d %B %Y")
if pubdate==None:
pubdate=makeDate(self.story.getMetadata('dateUpdated'), "%Y-%m-%d")
#print pubdate
self.story.setMetadata('datePublished', pubdate)
#print self.story.getMetadata('dateUpdated')
#print self.story.getMetadata('datePublished')
self.story.setMetadata('numChapters',c)
logging.debug("numChapters: (%s)"%self.story.getMetadata('numChapters'))
# grab the text for an individual chapter.
def getChapterText(self, url):
logging.debug('Getting chapter text from: %s' % url)
chapter = self.chapters.get(url)
# for c in self.chapters:
# if c[0] == url:
# chapter = c[1]
#chapter = bs.BeautifulSoup(c[1])
#chapter = find(lambda c: c[0] == url, self.chapters)[1]
# page_url = url.split('#')[0]
# x = url.split('#')[1]
# if self.sectionUrl != page_url:
# self.sectionUrl = page_url
# self.section = self.getChaptersFromPage(page_url)
#
# chapter = bs.BeautifulSoup(self.section[int(x)])
#chapter = bs.BeautifulSoup(self.getChaptersFromPage(page_url)[int(x)])
return self.utf8FromSoup(url,chapter)