# -*- coding: utf-8 -*- import os import re import sys import cgi import uuid import shutil import os.path import logging import unittest import urllib as u import pprint as pp import urllib2 as u2 import urlparse as up import BeautifulSoup as bs import htmlentitydefs as hdefs import time import datetime from constants import * from adapter import * try: import login_password except: # tough luck pass class MediaMiner(FanfictionSiteAdapter): def __init__(self, url): self.url = url parsedUrl = up.urlparse(url) self.host = parsedUrl.netloc self.path = parsedUrl.path self.storyName = '' self.authorName = '' self.storyDescription = '' self.storyCharacters = [] self.storySeries = '' self.authorId = '0' self.authorURL = self.path self.storyId = '0' self.storyPublished = datetime.date(1970, 01, 31) self.storyCreated = datetime.datetime.now() self.storyUpdated = datetime.date(1970, 01, 31) self.languageId = 'en-UK' self.language = 'English' self.subjects = [] self.publisher = self.host self.numChapters = 0 self.numWords = 0 self.genre = '' self.category = '' self.storyStatus = 'In-Progress' self.storyRating = 'K' self.storyUserRating = '0' self.outputName = '' self.outputStorySep = '-mm_' logging.debug('self.url=%s' % self.url) if self.url.find('view_st.php') != -1: ss = self.url.split('view_st.php') logging.debug('ss=%s' % ss) if ss is not None and len(ss) > 1: self.storyId = ss[1].replace('/','').strip() elif self.url.find('view_ch.php?') != -1: ss = self.url.split('=') logging.debug('ss=%s' % ss) if ss is not None and len(ss) > 1: self.storyId = ss[-1].replace('/','').strip() self.path = '/fanfic/view_st.php/' + self.storyId self.url = 'http://' + self.host + self.path logging.debug('self.url=%s' % self.url) elif self.url.find('view_ch.php/') != -1: ss = self.url.split('/') logging.debug('ss=%s' % ss) if ss is not None and len(ss) > 2: self.storyId = ss[-2].strip() self.path = '/fanfic/view_st.php/' + self.storyId self.url = 'http://' + self.host + self.path logging.debug('self.url=%s' % self.url) logging.debug('self.storyId=%s' % self.storyId) logging.debug('self.path=%s' % self.path) if not self.appEngine: self.opener = u2.build_opener(u2.HTTPCookieProcessor()) else: self.opener = None logging.debug("Created MediaMiner: url=%s" % (self.url)) def _getLoginScript(self): return self.path def _getVarValue(self, varstr): #logging.debug('_getVarValue varstr=%s' % varstr) vals = varstr.split('=') #logging.debug('vals=%s' % vals) retstr="".join(vals[+1:]) #logging.debug('retstr=%s' % retstr) if retstr.startswith(' '): retstr = retstr[1:] if retstr.endswith(';'): retstr = retstr[:-1] return retstr def _splitCrossover(self, subject): if "Crossover" in subject: self.addSubject ("Crossover") logging.debug('Crossover=%s' % subject) if subject.find(' and ') != -1: words = subject.split(' ') logging.debug('words=%s' % words) subj = '' for s in words: if s in "and Crossover": if len(subj) > 0: self.addSubject(subj) subj = '' else: if len(subj) > 0: subj = subj + ' ' subj = subj + s if len(subj) > 0: self.addSubject(subj) else: self.addSubject(subject) else: self.addSubject(subject) return True def _splitGenre(self, subject): if len(subject) > 0: words = subject.split('/') logging.debug('words=%s' % words) for subj in words: if len(subj) > 0: self.addSubject(subj) return True def extractIndividualUrls(self): data = self.fetchUrl(self.url) #data.replace('
',' ').replace('
',' ').replace('
',' ') soup = bs.BeautifulSoup(data) #logging.debug('soap=%s' % soup) urls = [] td_ffh = soup.find('td', {'class' : 'ffh'}) #logging.debug('td_ffh=%s' % td_ffh) if td_ffh is not None: #logging.debug('td_ffh.text=%s' % td_ffh.find(text=True)) self.storyName = str(td_ffh.find(text=True)).strip() logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName)) fft = td_ffh.find('font', {'class' : 'smtxt'}) #logging.debug('fft=%s' % fft) if fft is not None: ffts = fft.string.split(' ') if ffts is not None: if len(ffts) > 1: self.storyRating = ffts[1] logging.debug('self.storyRating=%s' % self.storyRating) self.genre = '' td_smtxt = soup.findAll('td') if td_smtxt is None: logging.debug('td_smtxt is NONE!') pass else: ll = len(td_smtxt) #logging.debug('td_smtxt=%s, len=%s' % (td_smtxt, ll)) for ii in range(ll): td = td_smtxt[ii] if 'class' in td._getAttrMap() and td['class'] != 'smtxt': logging.debug('td has class attribute but is not smtxt') continue ss = str(td).replace('\n','').replace('\r','').replace(' ', ' ') #logging.debug('ss=%s' % ss) if len(ss) > 1 and (ss.find('Genre(s):') != -1 or ss.find('Type:') != -1): #logging.debug('ss=%s' % ss) ssbs = td.findAll('b') #logging.debug('ssbs=%s' % ssbs) bb = 0 while bb < len(ssbs): nvs = bs.NavigableString('') sst='' ssb = ssbs[bb] ssbt = str(ssb.text).strip() #logging.debug('ssb=%s' % ssb) #logging.debug('ssbt=%s' % ssbt) ssbn = ssb.nextSibling while ssbn is not None: #logging.debug('ssbn=%s' % ssbn) #logging.debug('ssbn.class=%s' % ssbn.__class__) if nvs.__class__ == ssbn.__class__: st = str(ssbn) if st.strip() != '|': sst = sst + st else: #logging.debug('ssbn.name=%s' % ssbn.name) if ssbn.name == 'b': break ssbnts = ssbn.findAll(text=True) for ssbnt in ssbnts: sst = sst + ssbnt ssbn = ssbn.nextSibling sst = sst.replace(' ',' ').strip() #logging.debug('sst=%s' % sst) if bb == 0: ssbt = ssbt.replace(':','') self.addSubject(ssbt) self.addSubject(sst) logging.debug('self.subjects=%s' % self.subjects) else: if ssbt == 'Genre(s):': self.genre = sst logging.debug('self.genre=%s' % self.genre) sts = sst.split(' / ') for st in sts: self.addSubject(st.strip()) logging.debug('self.subjects=%s' % self.subjects) elif ssbt == 'Type:': self.category = sst logging.debug('self.category=%s' % self.category) self.addSubject(sst) logging.debug('self.subjects=%s' % self.subjects) elif ssbt == 'Author:': pass elif ssbt == 'Visits:': pass elif ssbt == 'Size:': pass elif ssbt == 'Pages:': pass elif ssbt == 'Status:': if sst == "Completed": self.storyStatus = 'Completed' else: self.storyStatus = 'In-Progress' elif ssbt == 'Words:': self.numWords = sst.replace('|','').strip() logging.debug('self.numWords=%s' % self.numWords) pass elif ssbt == 'Summary:': self.storyDescription = sst.strip() logging.debug('self.storyDescription=%s' % self.storyDescription) elif ssbt == 'Latest Revision:' or ssbt == 'Uploaded On:': logging.debug('sst=%s' % sst) self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sst.strip(' '), "%B %d, %Y %H:%M %Z"))) logging.debug('self.storyUpdated=%s' % self.storyUpdated) else: pass bb = bb+1 smtxt_as = td_smtxt[ii].findAll('a') #logging.debug('smtxt_as=%s' % smtxt_as) for smtxt_a in smtxt_as: if 'href' in smtxt_a._getAttrMap() and smtxt_a['href'].find('/u/'): sta = smtxt_a['href'] #logging.debug('sta=%s' % sta) stas = sta.split('/u/') #logging.debug('stas=%s' % stas) if stas is not None and len(stas) > 1: self.authorId = stas[1] self.authorURL = 'http://' + self.host + sta self.authorName = smtxt_a.string logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId)) urlstory='' numchapters = 0 td_tbbrdr = soup.find('td', {'class' : 'tbbrdr'}) if td_tbbrdr is not None: #logging.debug('td_tbbrdr=%s' % td_tbbrdr ) sl = td_tbbrdr.find('select', {'name':'cid'}) if sl is not None: #logging.debug('sl=%s' % sl ) opts = sl.findAll('option') for o in opts: #logging.debug('o=%s' % o) if 'value' in o._getAttrMap(): url = 'http://' + self.host + '/fanfic/view_ch.php/' + self.storyId + '/' + o['value'] logging.debug('URL=%s, Title=%s' % (url, o.string)) if numchapters == 0: ss = o.string.split('[') if ss is not None and len(ss) > 1: ssd = ss[-1].replace(']','') #logging.debug('ssd=%s' % ssd) self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(ssd.strip(' '), "%b %d, %Y"))) logging.debug('self.storyPublished=%s' % self.storyPublished) urls.append((url, o.string)) numchapters = numchapters + 1 if numchapters == 0: numchapters = 1 url = 'http://' + self.host + '/fanfic/view_st.php/' + self.storyId self.storyPublished = self.storyUpdated logging.debug('self.storyPublished=%s' % self.storyPublished) ssd = self.storyName + ' [' + self.storyPublished.strftime("%b %d, %Y") + ']' logging.debug('URL=%s, Title=%s' % (url, ssd)) urls.append((url, ssd)) self.numChapters = str(numchapters) logging.debug('self.numChapters=%s' % self.numChapters) logging.debug('urls=%s' % urls) return urls def getText(self, url): time.sleep( 2.0 ) logging.debug('url=%s' % url) data = self.fetchUrl(url) try: soup = bs.BeautifulSoup(data) except: logging.info("Failed to decode: <%s>" % data) soup = None exit(20) return '' #div = soup.find('div', {'id' : 'storytext'}) #if div is None: #logging.error("Error downloading Chapter: %s" % url) #exit (20) #return '' #logging.info("Soup: %s" % soup.prettify()) nvs = bs.NavigableString('') sst='' allAs = soup.findAll ('a', { 'name' : 'fic_c' }) #logging.debug('allAs=%s' % allAs) for a in allAs: #logging.debug('a=%s' % a) foundfirst = False done = False nxta = a.nextSibling while nxta is not None and not done: #logging.debug('nxta=%s' % nxta) #logging.debug('nxta.class=%s' % nxta.__class__) st = str(nxta) if nvs.__class__ != nxta.__class__: #logging.debug('nxta.name=%s' % nxta.name) if nxta.name == 'table': st = '' if foundfirst: done = True if nxta.name == 'div' and 'class' in nxta._getAttrMap() and nxta['class'] == 'acl' and foundfirst: st = '' done = True if nxta.name == 'br': if not foundfirst: st = '' else: foundfirst = True else: foundfirst = True sst = sst + st nxta = nxta.nextSibling #sst = sst.replace(' ',' ').strip() #logging.debug('sst=%s' % sst) #logging.debug('sst.0=%s' % sst) #sst0 = sst.replace(u'≴', u'“').replace(u'≵','”').replace(u'≰',u'‘').replace(u'≱',u'’') #sst0 = sst.replace(u"≵","”") #logging.debug('sst.1=%s' % sst0) #sst1 = sst.replace(u'≴', u'\"').replace('≵','\"').replace('≰','\'').replace('≱','\'') #logging.debug('sst.2=%s' % sst1) return sst class FPC_UnitTests(unittest.TestCase): def setUp(self): logging.basicConfig(level=logging.DEBUG) pass def testFictionPress(self): url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade' f = FPCom(url) urls = f.extractIndividualUrls() self.assertEquals('Behind This Facade', f.getStoryName()) self.assertEquals('IntoxicatingMelody', f.getAuthorName()) text = f.getText(url) self.assertTrue(text.find('Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to') != -1) if __name__ == '__main__': unittest.main()