diff --git a/downloader.py b/downloader.py
index 431bbad4..310eac41 100644
--- a/downloader.py
+++ b/downloader.py
@@ -24,6 +24,7 @@ import fictionalley
 import hpfiction
 import twilighted
 import potionsNsnitches
+import mediaminer
 
 import time
 
@@ -107,6 +108,8 @@ if __name__ == '__main__':
         adapter = twilighted.Twilighted(url)
     elif url.find('potionsandsnitches.net') != -1:
         adapter = potionsNsnitches.PotionsNSnitches(url)
+    elif url.find('mediaminer.org') != -1:
+        adapter = mediaminer.MediaMiner(url)
     else:
         print >> sys.stderr, "Oi! I can haz not appropriate adapter for URL %s!" % url
         sys.exit(1)
diff --git a/mediaminer.py b/mediaminer.py
new file mode 100644
index 00000000..dd2ef3ea
--- /dev/null
+++ b/mediaminer.py
@@ -0,0 +1,402 @@
+# -*- coding: utf-8 -*-
+
+import os
+import re
+import sys
+import cgi
+import uuid
+import shutil
+import os.path
+import logging
+import unittest
+import urllib as u
+import pprint as pp
+import urllib2 as u2
+import urlparse as up
+import BeautifulSoup as bs
+import htmlentitydefs as hdefs
+import time
+import datetime
+
+from constants import *
+from adapter import *
+
+try:
+    import login_password
+except:
+    # tough luck
+    pass
+
+class MediaMiner(FanfictionSiteAdapter):
+    def __init__(self, url):
+        self.url = url
+        parsedUrl = up.urlparse(url)
+        self.host = parsedUrl.netloc
+        self.path = parsedUrl.path
+
+        self.storyName = ''
+        self.authorName = ''
+        self.storyDescription = ''
+        self.storyCharacters = []
+        self.storySeries = ''
+        self.authorId = '0'
+        self.authorURL = self.path
+        self.storyId = '0'
+        self.storyPublished = datetime.date(1970, 01, 31)
+        self.storyCreated = datetime.datetime.now()
+        self.storyUpdated = datetime.date(1970, 01, 31)
+        self.languageId = 'en-UK'
+        self.language = 'English'
+        self.subjects = []
+        self.publisher = self.host
+        self.numChapters = 0
+        self.numWords = 0
+        self.genre = ''
+        self.category = ''
+        self.storyStatus = 'In-Progress'
+        self.storyRating = 'K'
+        self.storyUserRating = '0'
+        self.outputName = ''
+        self.outputStorySep = '-mm_'
+
+        logging.debug('self.url=%s' % self.url)
+
+        if self.url.find('view_st.php') != -1:
+            ss = self.url.split('view_st.php')
+            logging.debug('ss=%s' % ss)
+            if ss is not None and len(ss) > 1:
+                self.storyId = ss[1].replace('/','').strip()
+        elif self.url.find('view_ch.php?') != -1:
+            ss = self.url.split('=')
+            logging.debug('ss=%s' % ss)
+            if ss is not None and len(ss) > 1:
+                self.storyId = ss[-1].replace('/','').strip()
+                self.path = '/fanfic/view_st.php/' + self.storyId
+                self.url = 'http://' + self.host + self.path
+                logging.debug('self.url=%s' % self.url)
+        elif self.url.find('view_ch.php/') != -1:
+            ss = self.url.split('/')
+            logging.debug('ss=%s' % ss)
+            if ss is not None and len(ss) > 2:
+                self.storyId = ss[-2].strip()
+                self.path = '/fanfic/view_st.php/' + self.storyId
+                self.url = 'http://' + self.host + self.path
+                logging.debug('self.url=%s' % self.url)
+
+        logging.debug('self.storyId=%s' % self.storyId)
+
+        logging.debug('self.path=%s' % self.path)
+
+        if not self.appEngine:
+            self.opener = u2.build_opener(u2.HTTPCookieProcessor())
+        else:
+            self.opener = None
+
+        logging.debug("Created MediaMiner: url=%s" % (self.url))
+
+    def _getLoginScript(self):
+        return self.path
+
+    def _getVarValue(self, varstr):
+        #logging.debug('_getVarValue varstr=%s' % varstr)
+        vals = varstr.split('=')
+        #logging.debug('vals=%s' % vals)
+        retstr="".join(vals[+1:])
+        #logging.debug('retstr=%s' % retstr)
+        if retstr.startswith(' '):
+            retstr = retstr[1:]
+        if retstr.endswith(';'):
+            retstr = retstr[:-1]
+        return retstr
+
+    def _splitCrossover(self, subject):
+        if "Crossover" in subject:
+            self.addSubject ("Crossover")
+            logging.debug('Crossover=%s' % subject)
+            if subject.find(' and ') != -1:
+                words = subject.split(' ')
+                logging.debug('words=%s' % words)
+                subj = ''
+                for s in words:
+                    if s in "and Crossover":
+                        if len(subj) > 0:
+                            self.addSubject(subj)
+                            subj = ''
+                    else:
+                        if len(subj) > 0:
+                            subj = subj + ' '
+                        subj = subj + s
+                if len(subj) > 0:
+                    self.addSubject(subj)
+            else:
+                self.addSubject(subject)
+        else:
+            self.addSubject(subject)
+        return True
+
+    def _splitGenre(self, subject):
+        if len(subject) > 0:
+            words = subject.split('/')
+            logging.debug('words=%s' % words)
+            for subj in words:
+                if len(subj) > 0:
+                    self.addSubject(subj)
+        return True
+
+    def _fetchUrl(self, url):
+        if not self.appEngine:
+            return self.opener.open(url).read().decode('utf-8')
+        else:
+            return googlefetch(url).content
+
+    # Parse the story index page: title/rating from the 'ffh' header cell,
+    # metadata from the 'smtxt' cells, author from the '/u/' profile link.
+    def extractIndividualUrls(self):
+        data = self._fetchUrl(self.url)
+        #data.replace('\r',' ').replace('\n',' ').replace('\r\n',' ')
+        soup = bs.BeautifulSoup(data)
+        #logging.debug('soap=%s' % soup)
+        urls = []
+
+        td_ffh = soup.find('td', {'class' : 'ffh'})
+        #logging.debug('td_ffh=%s' % td_ffh)
+        if td_ffh is not None:
+            #logging.debug('td_ffh.text=%s' % td_ffh.find(text=True))
+            self.storyName = str(td_ffh.find(text=True)).strip()
+            logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
+            fft = td_ffh.find('font', {'class' : 'smtxt'})
+            #logging.debug('fft=%s' % fft)
+            if fft is not None:
+                ffts = fft.string.split(' ')
+                if ffts is not None:
+                    if len(ffts) > 1:
+                        self.storyRating = ffts[1]
+                        logging.debug('self.storyRating=%s' % self.storyRating)
+
+        self.genre = ''
+        td_smtxt = soup.findAll('td')
+        if td_smtxt is None:
+            logging.debug('td_smtxt is NONE!')
+            pass
+        else:
+            ll = len(td_smtxt)
+            #logging.debug('td_smtxt=%s, len=%s' % (td_smtxt, ll))
+            for ii in range(ll):
+                td = td_smtxt[ii]
+                if 'class' in td._getAttrMap() and td['class'] != 'smtxt':
+                    logging.debug('td has class attribute but is not smtxt')
+                    continue
+                ss = str(td).replace('\n','').replace('\r','').replace('&nbsp;', ' ')
+                #logging.debug('ss=%s' % ss)
+                if len(ss) > 1 and (ss.find('Genre(s):') != -1 or ss.find('Type:') != -1):
+                    #logging.debug('ss=%s' % ss)
+                    ssbs = td.findAll('b')
+                    #logging.debug('ssbs=%s' % ssbs)
+                    bb = 0
+                    while bb < len(ssbs):
+                        nvs = bs.NavigableString('')
+                        sst=''
+                        ssb = ssbs[bb]
+                        ssbt = str(ssb.text).strip()
+                        #logging.debug('ssb=%s' % ssb)
+                        #logging.debug('ssbt=%s' % ssbt)
+                        ssbn = ssb.nextSibling
+                        while ssbn is not None:
+                            #logging.debug('ssbn=%s' % ssbn)
+                            #logging.debug('ssbn.class=%s' % ssbn.__class__)
+                            if nvs.__class__ == ssbn.__class__:
+                                st = str(ssbn)
+                                if st.strip() != '|':
+                                    sst = sst + st
+                            else:
+                                #logging.debug('ssbn.name=%s' % ssbn.name)
+                                if ssbn.name == 'b':
+                                    break
+                                ssbnts = ssbn.findAll(text=True)
+                                for ssbnt in ssbnts:
+                                    sst = sst + ssbnt
+                            ssbn = ssbn.nextSibling
+                        sst = sst.replace('&nbsp;',' ').strip()
+                        #logging.debug('sst=%s' % sst)
+                        if bb == 0:
+                            ssbt = ssbt.replace(':','')
+                            self.addSubject(ssbt)
+                            self.addSubject(sst)
+                            logging.debug('self.subjects=%s' % self.subjects)
+                        else:
+                            if ssbt == 'Genre(s):':
+                                self.genre = sst
+                                logging.debug('self.genre=%s' % self.genre)
+                                sts = sst.split(' / ')
+                                for st in sts:
+                                    self.addSubject(st.strip())
+                                logging.debug('self.subjects=%s' % self.subjects)
+                            elif ssbt == 'Type:':
+                                self.category = sst
+                                logging.debug('self.category=%s' % self.category)
+                                self.addSubject(sst)
+                                logging.debug('self.subjects=%s' % self.subjects)
+                            elif ssbt == 'Author:':
+                                pass
+                            elif ssbt == 'Visits:':
+                                pass
+                            elif ssbt == 'Size:':
+                                pass
+                            elif ssbt == 'Pages:':
+                                pass
+                            elif ssbt == 'Status:':
+                                if sst == "Completed":
+                                    self.storyStatus = 'Completed'
+                                else:
+                                    self.storyStatus = 'In-Progress'
+                            elif ssbt == 'Words:':
+                                self.numWords = sst.replace('|','').strip()
+                                logging.debug('self.numWords=%s' % self.numWords)
+                                pass
+                            elif ssbt == 'Summary:':
+                                self.storyDescription = sst.strip()
+                                logging.debug('self.storyDescription=%s' % self.storyDescription)
+                            elif ssbt == 'Latest Revision:' or ssbt == 'Uploaded On:':
+                                logging.debug('sst=%s' % sst)
+                                self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sst.strip(' '), "%B %d, %Y %H:%M %Z")))
+                                logging.debug('self.storyUpdated=%s' % self.storyUpdated)
+                            else:
+                                pass
+                        bb = bb+1
+
+                smtxt_as = td_smtxt[ii].findAll('a')
+                #logging.debug('smtxt_as=%s' % smtxt_as)
+                for smtxt_a in smtxt_as:
+                    if 'href' in smtxt_a._getAttrMap() and smtxt_a['href'].find('/u/') != -1:
+                        sta = smtxt_a['href']
+                        #logging.debug('sta=%s' % sta)
+                        stas = sta.split('/u/')
+                        #logging.debug('stas=%s' % stas)
+                        if stas is not None and len(stas) > 1:
+                            self.authorId = stas[1]
+                            self.authorURL = 'http://' + self.host + sta
+                            self.authorName = smtxt_a.string
+                            logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
+
+        urlstory=''
+        numchapters = 0
+        td_tbbrdr = soup.find('td', {'class' : 'tbbrdr'})
+        if td_tbbrdr is not None:
+            #logging.debug('td_tbbrdr=%s' % td_tbbrdr )
+
+            sl = td_tbbrdr.find('select', {'name':'cid'})
+            if sl is not None:
+                #logging.debug('sl=%s' % sl )
+                opts = sl.findAll('option')
+                for o in opts:
+                    #logging.debug('o=%s' % o)
+                    if 'value' in o._getAttrMap():
+                        url = 'http://' + self.host + '/fanfic/view_ch.php/' + self.storyId + '/' + o['value']
+                        logging.debug('URL=%s, Title=%s' % (url, o.string))
+                        if numchapters == 0:
+                            ss = o.string.split('[')
+                            if ss is not None and len(ss) > 1:
+                                ssd = ss[-1].replace(']','')
+                                #logging.debug('ssd=%s' % ssd)
+                                self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(ssd.strip(' '), "%b %d, %Y")))
+                                logging.debug('self.storyPublished=%s' % self.storyPublished)
+                        urls.append((url, o.string))
+                        numchapters = numchapters + 1
+
+        # Single-chapter stories have no chapter <select>; treat the story page
+        # itself as the one chapter.
+        if numchapters == 0:
+            numchapters = 1
+            url = 'http://' + self.host + '/fanfic/view_st.php/' + self.storyId
+            self.storyPublished = self.storyUpdated
+            logging.debug('self.storyPublished=%s' % self.storyPublished)
+            ssd = self.storyName + ' [' + self.storyPublished.strftime("%b %d, %Y") + ']'
+            logging.debug('URL=%s, Title=%s' % (url, ssd))
+            urls.append((url, ssd))
+
+        self.numChapters = str(numchapters)
+        logging.debug('self.numChapters=%s' % self.numChapters)
+        logging.debug('urls=%s' % urls)
+
+        return urls
+
+    # Chapter text sits between the <a name="fic_c"> anchor and the trailing
+    # navigation table/div; walk the siblings and collect everything in between.
+    def getText(self, url):
+        time.sleep( 2.0 )
+        logging.debug('url=%s' % url)
+        data = self._fetchUrl(url)
+
+        try:
+            soup = bs.BeautifulSoup(data)
+        except:
+            logging.info("Failed to decode: <%s>" % data)
+            soup = None
+            exit(20)
+            return ''
+
+        #div = soup.find('div', {'id' : 'storytext'})
+        #if div is None:
+            #logging.error("Error downloading Chapter: %s" % url)
+            #exit (20)
+            #return ''
+
+        #logging.info("Soup: %s" % soup.prettify())
+
+        nvs = bs.NavigableString('')
+        sst=''
+        allAs = soup.findAll ('a', { 'name' : 'fic_c' })
+        #logging.debug('allAs=%s' % allAs)
+        for a in allAs:
+            #logging.debug('a=%s' % a)
+            foundfirst = False
+            done = False
+            nxta = a.nextSibling
+            while nxta is not None and not done:
+                #logging.debug('nxta=%s' % nxta)
+                #logging.debug('nxta.class=%s' % nxta.__class__)
+                st = str(nxta)
+                if nvs.__class__ != nxta.__class__:
+                    #logging.debug('nxta.name=%s' % nxta.name)
+                    if nxta.name == 'table':
+                        st = ''
+                        if foundfirst:
+                            done = True
+                    if nxta.name == 'div' and 'class' in nxta._getAttrMap() and nxta['class'] == 'acl' and foundfirst:
+                        st = ''
+                        done = True
+
+                    if nxta.name == 'br':
+                        if not foundfirst:
+                            st = ''
+                    else:
+                        foundfirst = True
+                else:
+                    foundfirst = True
+
+                sst = sst + st
+                nxta = nxta.nextSibling
+
+        #sst = sst.replace('&nbsp;',' ').strip()
+        #logging.debug('sst=%s' % sst)
+
+        #logging.debug('sst.0=%s' % sst)
+        #sst0 = sst.replace(u'≴', u'“').replace(u'≵','”').replace(u'≰',u'‘').replace(u'≱',u'’')
+        #sst0 = sst.replace(u"≵","”")
+        #logging.debug('sst.1=%s' % sst0)
+        #sst1 = sst.replace(u'≴', u'\"').replace('≵','\"').replace('≰','\'').replace('≱','\'')
+        #logging.debug('sst.2=%s' % sst1)
+
+        return sst
+
+# NOTE: this test class is carried over from the FictionPress adapter; it still
+# exercises FPCom (not defined in this module) rather than MediaMiner.
+class FPC_UnitTests(unittest.TestCase):
+    def setUp(self):
+        logging.basicConfig(level=logging.DEBUG)
+        pass
+
+    def testFictionPress(self):
+        url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
+        f = FPCom(url)
+        urls = f.extractIndividualUrls()
+
+        self.assertEquals('Behind This Facade', f.getStoryName())
+        self.assertEquals('IntoxicatingMelody', f.getAuthorName())
+
+        text = f.getText(url)
+        self.assertTrue(text.find('Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to') != -1)
+
+if __name__ == '__main__':
+    unittest.main()
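Usage note (not part of the patch): a minimal sketch of driving the new adapter directly, outside downloader.py, assuming the repository's adapter.py, constants.py and BeautifulSoup modules are importable; the story id below is a placeholder, not a real MediaMiner story.

    import logging
    import mediaminer

    logging.basicConfig(level=logging.DEBUG)

    # Placeholder id -- substitute a real view_st.php URL from mediaminer.org.
    mm = mediaminer.MediaMiner('http://www.mediaminer.org/fanfic/view_st.php/000000')
    # extractIndividualUrls() returns a list of (url, title) tuples per the diff.
    for chapter_url, chapter_title in mm.extractIndividualUrls():
        print chapter_title
        print mm.getText(chapter_url)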