mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 08:52:55 +01:00
Added the mediaminer.org adapter. It still has problems because mm seems to use the ≴ type of character encoding, and I couldn't manage to replace those characters.
This commit is contained in:
parent
981a922d00
commit
bdf429b41c
2 changed files with 405 additions and 0 deletions
|
|
@ -24,6 +24,7 @@ import fictionalley
|
|||
import hpfiction
|
||||
import twilighted
|
||||
import potionsNsnitches
|
||||
import mediaminer
|
||||
|
||||
import time
|
||||
|
||||
|
|
@ -107,6 +108,8 @@ if __name__ == '__main__':
|
|||
adapter = twilighted.Twilighted(url)
|
||||
elif url.find('potionsandsnitches.net') != -1:
|
||||
adapter = potionsNsnitches.PotionsNSnitches(url)
|
||||
elif url.find('mediaminer.org') != -1:
|
||||
adapter = mediaminer.MediaMiner(url)
|
||||
else:
|
||||
print >> sys.stderr, "Oi! I can haz not appropriate adapter for URL %s!" % url
|
||||
sys.exit(1)
|
||||
|
|
|
|||
402
mediaminer.py
Normal file
402
mediaminer.py
Normal file
|
|
@ -0,0 +1,402 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
||||
try:
|
||||
import login_password
|
||||
except:
|
||||
# tough luck
|
||||
pass
|
||||
|
||||
class MediaMiner(FanfictionSiteAdapter):
    """Adapter that downloads stories from mediaminer.org.

    The constructor normalises story and chapter URLs to the story index
    URL (``/fanfic/view_st.php/<storyId>``).  ``extractIndividualUrls``
    scrapes that page for metadata and the chapter list, and ``getText``
    returns the HTML of one chapter.
    """

    def __init__(self, url):
        """Set metadata defaults and derive the story id from *url*.

        Recognised URL shapes:
          * ``.../view_st.php/<storyId>``
          * ``.../view_ch.php?...=<storyId>`` (the value after the last
            ``=`` is taken as the story id)
          * ``.../view_ch.php/<storyId>/<chapterId>``

        Chapter URLs are rewritten to the matching story index URL.
        """
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path

        self.storyName = ''
        self.authorName = ''
        self.storyDescription = ''
        self.storyCharacters = []
        self.storySeries = ''
        self.authorId = '0'
        self.authorURL = self.path
        self.storyId = '0'
        # Plain decimal literals: a leading zero (``01``) is octal notation
        # on Python 2 and a syntax error on Python 3.
        self.storyPublished = datetime.date(1970, 1, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 1, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = ''
        self.category = ''
        self.storyStatus = 'In-Progress'
        self.storyRating = 'K'
        self.storyUserRating = '0'
        self.outputName = ''
        self.outputStorySep = '-mm_'

        logging.debug('self.url=%s' % self.url)

        if self.url.find('view_st.php') != -1:
            ss = self.url.split('view_st.php')
            logging.debug('ss=%s' % ss)
            if len(ss) > 1:
                self.storyId = ss[1].replace('/', '').strip()
        elif self.url.find('view_ch.php?') != -1:
            ss = self.url.split('=')
            logging.debug('ss=%s' % ss)
            if len(ss) > 1:
                self.storyId = ss[-1].replace('/', '').strip()
                self.path = '/fanfic/view_st.php/' + self.storyId
                self.url = 'http://' + self.host + self.path
                logging.debug('self.url=%s' % self.url)
        elif self.url.find('view_ch.php/') != -1:
            ss = self.url.split('/')
            logging.debug('ss=%s' % ss)
            if len(ss) > 2:
                # .../view_ch.php/<storyId>/<chapterId>
                self.storyId = ss[-2].strip()
                self.path = '/fanfic/view_st.php/' + self.storyId
                self.url = 'http://' + self.host + self.path
                logging.debug('self.url=%s' % self.url)

        logging.debug('self.storyId=%s' % self.storyId)
        logging.debug('self.path=%s' % self.path)

        # NOTE(review): appEngine is not assigned in this class; presumably
        # it is provided by FanfictionSiteAdapter -- confirm.
        if not self.appEngine:
            self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        else:
            self.opener = None

        logging.debug("Created MediaMiner: url=%s" % (self.url))

    def _getLoginScript(self):
        """Return the path used as the login script (the story path)."""
        return self.path

    def _getVarValue(self, varstr):
        """Return the right-hand side of a ``name = value;`` string.

        Everything after the first ``=`` is returned with at most one
        leading space and one trailing ``;`` removed.  A string without
        ``=`` yields ``''``.
        """
        # partition() keeps any further '=' characters that are part of
        # the value; splitting on every '=' and re-joining dropped them.
        retstr = varstr.partition('=')[2]
        if retstr.startswith(' '):
            retstr = retstr[1:]
        if retstr.endswith(';'):
            retstr = retstr[:-1]
        return retstr

    def _splitCrossover(self, subject):
        """Register *subject*, splitting ``X and Y Crossover`` into parts.

        A subject containing ``Crossover`` adds the subject ``Crossover``;
        if it also contains `` and ``, each run of words between the
        separator words ``and`` / ``Crossover`` is added as its own
        subject.  Any other subject is added unchanged.

        Returns:
            Always ``True``.
        """
        if "Crossover" not in subject:
            self.addSubject(subject)
            return True

        self.addSubject("Crossover")
        logging.debug('Crossover=%s' % subject)
        if subject.find(' and ') == -1:
            self.addSubject(subject)
            return True

        words = subject.split()
        logging.debug('words=%s' % words)
        current = []
        for word in words:
            # Compare whole words: a substring test against
            # "and Crossover" also matched words such as "Cross" or "a".
            if word in ("and", "Crossover"):
                if current:
                    self.addSubject(' '.join(current))
                current = []
            else:
                current.append(word)
        if current:
            self.addSubject(' '.join(current))
        return True

    def _splitGenre(self, subject):
        """Add every non-empty ``/``-separated part of *subject* as a subject.

        Returns:
            Always ``True``.
        """
        if len(subject) > 0:
            words = subject.split('/')
            logging.debug('words=%s' % words)
            for subj in words:
                if len(subj) > 0:
                    self.addSubject(subj)
        return True

    def _fetchUrl(self, url):
        """Fetch *url* and return the response body.

        Outside App Engine the body is read through ``self.opener`` and
        decoded as UTF-8; on App Engine the ``content`` of
        ``googlefetch(url)`` is returned as-is.
        """
        if self.appEngine:
            return googlefetch(url).content
        return self.opener.open(url).read().decode('utf-8')

    def extractIndividualUrls(self):
        """Scrape the story index page for metadata and chapter URLs.

        Side effects: may set storyName, storyRating, genre, category,
        storyStatus, numWords, storyDescription, storyUpdated,
        storyPublished, authorId, authorURL, authorName and numChapters,
        and registers subjects through ``addSubject``.

        Returns:
            A list of ``(chapter_url, chapter_title)`` tuples.  When the
            page has no chapter selector, a single entry pointing at the
            story page itself is returned.
        """
        data = self._fetchUrl(self.url)
        soup = bs.BeautifulSoup(data)
        urls = []

        # Title cell: its first text node is the story name, and the second
        # space-separated word of its <font class="smtxt"> child is stored
        # as the rating.
        td_ffh = soup.find('td', {'class': 'ffh'})
        if td_ffh is not None:
            self.storyName = str(td_ffh.find(text=True)).strip()
            logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
            fft = td_ffh.find('font', {'class': 'smtxt'})
            if fft is not None:
                ffts = fft.string.split(' ')
                if len(ffts) > 1:
                    self.storyRating = ffts[1]
                logging.debug('self.storyRating=%s' % self.storyRating)

        self.genre = ''
        for td in soup.findAll('td'):
            # Only cells without a class, or with class="smtxt", are inspected.
            if 'class' in td._getAttrMap() and td['class'] != 'smtxt':
                logging.debug('td has class attribute but is not smtxt')
                continue
            ss = str(td).replace('\n', '').replace('\r', '').replace('&nbsp;', ' ')
            if len(ss) > 1 and (ss.find('Genre(s):') != -1 or ss.find('Type:') != -1):
                # Each <b> is a label; its value is the text of the
                # following siblings up to the next <b>.
                for bb, ssb in enumerate(td.findAll('b')):
                    ssbt = str(ssb.text).strip()
                    sst = ''
                    ssbn = ssb.nextSibling
                    while ssbn is not None:
                        if ssbn.__class__ == bs.NavigableString:
                            st = str(ssbn)
                            # A lone '|' is a separator between fields.
                            if st.strip() != '|':
                                sst = sst + st
                        else:
                            if ssbn.name == 'b':
                                break
                            for ssbnt in ssbn.findAll(text=True):
                                sst = sst + ssbnt
                        ssbn = ssbn.nextSibling
                    sst = sst.replace('&nbsp;', ' ').strip()

                    if bb == 0:
                        # The first label and its value both become subjects.
                        ssbt = ssbt.replace(':', '')
                        self.addSubject(ssbt)
                        self.addSubject(sst)
                        logging.debug('self.subjects=%s' % self.subjects)
                    elif ssbt == 'Genre(s):':
                        self.genre = sst
                        logging.debug('self.genre=%s' % self.genre)
                        for st in sst.split(' / '):
                            self.addSubject(st.strip())
                        logging.debug('self.subjects=%s' % self.subjects)
                    elif ssbt == 'Type:':
                        self.category = sst
                        logging.debug('self.category=%s' % self.category)
                        self.addSubject(sst)
                        logging.debug('self.subjects=%s' % self.subjects)
                    elif ssbt == 'Status:':
                        if sst == "Completed":
                            self.storyStatus = 'Completed'
                        else:
                            self.storyStatus = 'In-Progress'
                    elif ssbt == 'Words:':
                        self.numWords = sst.replace('|', '').strip()
                        logging.debug('self.numWords=%s' % self.numWords)
                    elif ssbt == 'Summary:':
                        self.storyDescription = sst.strip()
                        logging.debug('self.storyDescription=%s' % self.storyDescription)
                    elif ssbt == 'Latest Revision:' or ssbt == 'Uploaded On:':
                        logging.debug('sst=%s' % sst)
                        self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sst.strip(' '), "%B %d, %Y %H:%M %Z")))
                        logging.debug('self.storyUpdated=%s' % self.storyUpdated)
                    # 'Author:', 'Visits:', 'Size:', 'Pages:' and unknown
                    # labels are ignored; the author comes from the links below.

                for smtxt_a in td.findAll('a'):
                    # str.find() returns -1 (truthy) when '/u/' is absent and
                    # 0 (falsy) when it is at the start, so compare explicitly.
                    if 'href' in smtxt_a._getAttrMap() and smtxt_a['href'].find('/u/') != -1:
                        sta = smtxt_a['href']
                        stas = sta.split('/u/')
                        if len(stas) > 1:
                            self.authorId = stas[1]
                            self.authorURL = 'http://' + self.host + sta
                            self.authorName = smtxt_a.string
                            logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))

        # Chapter list: every <option value=...> of <select name="cid">.
        numchapters = 0
        td_tbbrdr = soup.find('td', {'class': 'tbbrdr'})
        if td_tbbrdr is not None:
            sl = td_tbbrdr.find('select', {'name': 'cid'})
            if sl is not None:
                for o in sl.findAll('option'):
                    if 'value' in o._getAttrMap():
                        url = 'http://' + self.host + '/fanfic/view_ch.php/' + self.storyId + '/' + o['value']
                        logging.debug('URL=%s, Title=%s' % (url, o.string))
                        if numchapters == 0:
                            # The first chapter title ends in "[Mon DD, YYYY]",
                            # which is used as the publication date.
                            ss = o.string.split('[')
                            if len(ss) > 1:
                                ssd = ss[-1].replace(']', '')
                                self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(ssd.strip(' '), "%b %d, %Y")))
                                logging.debug('self.storyPublished=%s' % self.storyPublished)
                        urls.append((url, o.string))
                        numchapters = numchapters + 1

        if numchapters == 0:
            # No chapter selector: treat the story page as the only chapter.
            numchapters = 1
            url = 'http://' + self.host + '/fanfic/view_st.php/' + self.storyId
            self.storyPublished = self.storyUpdated
            logging.debug('self.storyPublished=%s' % self.storyPublished)
            ssd = self.storyName + ' [' + self.storyPublished.strftime("%b %d, %Y") + ']'
            logging.debug('URL=%s, Title=%s' % (url, ssd))
            urls.append((url, ssd))

        self.numChapters = str(numchapters)
        logging.debug('self.numChapters=%s' % self.numChapters)
        logging.debug('urls=%s' % urls)

        return urls

    def getText(self, url):
        """Return the HTML of the chapter at *url*.

        The text is collected from the siblings that follow each
        ``<a name="fic_c">`` anchor.  Leading ``<br>`` tags and tables are
        dropped, and collection stops at the first table or
        ``<div class="acl">`` seen after content has started.

        Raises:
            SystemExit: with code 20 if the page cannot be parsed.
        """
        # Fixed 2-second pause before every fetch (presumably to throttle
        # requests to the site).
        time.sleep(2.0)
        logging.debug('url=%s' % url)
        data = self._fetchUrl(url)

        try:
            soup = bs.BeautifulSoup(data)
        except Exception:
            logging.info("Failed to decode: <%s>" % data)
            # sys.exit() rather than the interactive-only exit() builtin.
            sys.exit(20)

        pieces = []
        for a in soup.findAll('a', {'name': 'fic_c'}):
            foundfirst = False
            done = False
            nxta = a.nextSibling
            while nxta is not None and not done:
                st = str(nxta)
                if nxta.__class__ != bs.NavigableString:
                    if nxta.name == 'table':
                        st = ''
                        if foundfirst:
                            done = True
                    if (nxta.name == 'div' and 'class' in nxta._getAttrMap()
                            and nxta['class'] == 'acl' and foundfirst):
                        st = ''
                        done = True

                    if nxta.name == 'br':
                        # Skip line breaks that precede the first content.
                        if not foundfirst:
                            st = ''
                    else:
                        foundfirst = True
                else:
                    foundfirst = True

                pieces.append(st)
                nxta = nxta.nextSibling

        # NOTE: the site's text contains the characters U+2274/U+2275 and
        # U+2270/U+2271 where curly double/single quotes are meant; they
        # are not normalised here yet.
        return ''.join(pieces)
|
||||
|
||||
class FPC_UnitTests(unittest.TestCase):
    """Offline checks of how MediaMiner normalises story and chapter URLs.

    The previous test instantiated ``FPCom`` (the FictionPress adapter),
    which this module neither defines nor imports.
    """

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    # NOTE(review): constructing MediaMiner reads ``self.appEngine``, which
    # is presumably supplied by FanfictionSiteAdapter -- confirm.

    def testStoryUrl(self):
        f = MediaMiner('http://www.mediaminer.org/fanfic/view_st.php/12345')

        self.assertEqual('12345', f.storyId)
        self.assertEqual('/fanfic/view_st.php/12345', f.path)

    def testChapterPathUrl(self):
        f = MediaMiner('http://www.mediaminer.org/fanfic/view_ch.php/12345/67890')

        self.assertEqual('12345', f.storyId)
        self.assertEqual('http://www.mediaminer.org/fanfic/view_st.php/12345', f.url)

    def testChapterQueryUrl(self):
        f = MediaMiner('http://www.mediaminer.org/fanfic/view_ch.php?cid=67890&id=12345')

        self.assertEqual('12345', f.storyId)
        self.assertEqual('http://www.mediaminer.org/fanfic/view_st.php/12345', f.url)
|
||||
|
||||
# Run this module's unit tests when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
|
||||
Loading…
Reference in a new issue