Added a mediaminer.org adapter. It still has problems because MediaMiner seems to use numeric character entities such as &#8820; (displayed as ≴) for typographic quotes, and I couldn't manage to replace them.

2025-12-06 08:52:55 +01:00 · 2010-11-12 20:25:55 -06:00 · 2010-11-12 20:25:55 -06:00 · bdf429b41c
commit bdf429b41c
parent 981a922d00
2 changed files with 405 additions and 0 deletions
--- a/downloader.py
+++ b/downloader.py
@ -24,6 +24,7 @@ import fictionalley
 import hpfiction
 import twilighted
 import potionsNsnitches
+import mediaminer

 import time

@ -107,6 +108,8 @@ if __name__ == '__main__':
 		adapter = twilighted.Twilighted(url)
 	elif url.find('potionsandsnitches.net') != -1:
 		adapter = potionsNsnitches.PotionsNSnitches(url)
+	elif url.find('mediaminer.org') != -1:
+		adapter = mediaminer.MediaMiner(url)
 	else:
 		print >> sys.stderr, "Oi! I can haz not appropriate adapter for URL %s!" % url
 		sys.exit(1)
--- a/mediaminer.py
+++ b/mediaminer.py
@ -0,0 +1,402 @@
+# -*- coding: utf-8 -*-
+
+import os
+import re
+import sys
+import cgi
+import uuid
+import shutil
+import os.path
+import logging
+import unittest
+import urllib as u
+import pprint as pp
+import urllib2 as u2
+import urlparse as up
+import BeautifulSoup as bs
+import htmlentitydefs as hdefs
+import time
+import datetime
+
+from constants import *
+from adapter import *
+
+try:
+	import login_password
+except:
+	# tough luck
+	pass
+
+class MediaMiner(FanfictionSiteAdapter):
+	def __init__(self, url):		
+		self.url = url
+		parsedUrl = up.urlparse(url)
+		self.host = parsedUrl.netloc
+		self.path = parsedUrl.path
+		
+		self.storyName = ''
+		self.authorName = ''
+		self.storyDescription = ''
+		self.storyCharacters = []
+		self.storySeries = ''
+		self.authorId = '0'
+		self.authorURL = self.path
+		self.storyId = '0'
+		self.storyPublished = datetime.date(1970, 01, 31)
+		self.storyCreated = datetime.datetime.now()
+		self.storyUpdated = datetime.date(1970, 01, 31)
+		self.languageId = 'en-UK'
+		self.language = 'English'
+		self.subjects = []
+		self.publisher = self.host
+		self.numChapters = 0
+		self.numWords = 0
+		self.genre = ''
+		self.category = ''
+		self.storyStatus = 'In-Progress'
+		self.storyRating = 'K'
+		self.storyUserRating = '0'
+		self.outputName = ''
+		self.outputStorySep = '-mm_'
+				
+		logging.debug('self.url=%s' % self.url)
+		
+		if self.url.find('view_st.php') != -1:
+			ss = self.url.split('view_st.php')
+			logging.debug('ss=%s' % ss)
+			if ss is not None and len(ss) > 1:
+				self.storyId = ss[1].replace('/','').strip()
+		elif self.url.find('view_ch.php?') != -1:
+			ss = self.url.split('=')
+			logging.debug('ss=%s' % ss)
+			if ss is not None and len(ss) > 1:
+				self.storyId = ss[-1].replace('/','').strip()
+				self.path = '/fanfic/view_st.php/' + self.storyId
+				self.url = 'http://' + self.host + self.path
+				logging.debug('self.url=%s' % self.url)
+		elif self.url.find('view_ch.php/') != -1:
+			ss = self.url.split('/')
+			logging.debug('ss=%s' % ss)
+			if ss is not None and len(ss) > 2:
+				self.storyId = ss[-2].strip()
+				self.path = '/fanfic/view_st.php/' + self.storyId
+				self.url = 'http://' + self.host + self.path
+				logging.debug('self.url=%s' % self.url)			
+			
+		logging.debug('self.storyId=%s' % self.storyId)
+		
+		logging.debug('self.path=%s' % self.path)
+		
+		if not self.appEngine:
+			self.opener = u2.build_opener(u2.HTTPCookieProcessor())
+		else:
+			self.opener = None
+	
+		logging.debug("Created MediaMiner: url=%s" % (self.url))
+	
+	def _getLoginScript(self):
+		return self.path
+
+	def _getVarValue(self, varstr):
+		#logging.debug('_getVarValue varstr=%s' % varstr)
+		vals = varstr.split('=')
+		#logging.debug('vals=%s' % vals)
+		retstr="".join(vals[+1:])
+		#logging.debug('retstr=%s' % retstr)
+		if retstr.startswith(' '):
+			retstr = retstr[1:]
+		if retstr.endswith(';'):
+			retstr = retstr[:-1]
+		return retstr
+	
+	def _splitCrossover(self, subject):
+		if "Crossover" in subject:
+			self.addSubject ("Crossover")
+			logging.debug('Crossover=%s' % subject)
+			if subject.find(' and ') != -1:
+				words = subject.split(' ')
+				logging.debug('words=%s' % words)
+				subj = ''
+				for s in words:
+					if s in "and Crossover":
+						if len(subj) > 0:
+							self.addSubject(subj)
+						subj = ''
+					else:
+						if len(subj) > 0:
+							subj = subj + ' '
+						subj = subj + s
+				if len(subj) > 0:
+					self.addSubject(subj)
+			else:
+				self.addSubject(subject)
+		else:
+			self.addSubject(subject)
+		return True
+
+	def _splitGenre(self, subject):
+		if len(subject) > 0:
+			words = subject.split('/')
+			logging.debug('words=%s' % words)
+			for subj in words:
+			    if len(subj) > 0:
+				self.addSubject(subj)
+		return True
+
+	def _fetchUrl(self, url):
+		if not self.appEngine:
+			return self.opener.open(url).read().decode('utf-8')
+		else:
+			return googlefetch(url).content
+	
+	def extractIndividualUrls(self):
+		data = self._fetchUrl(self.url)
+		#data.replace('<br />',' ').replace('<br>',' ').replace('</br>',' ')
+		soup = bs.BeautifulSoup(data)
+		#logging.debug('soap=%s' % soup)
+		urls = []
+		
+		td_ffh = soup.find('td', {'class' : 'ffh'})
+		#logging.debug('td_ffh=%s' % td_ffh)
+		if td_ffh is not None:
+			#logging.debug('td_ffh.text=%s' % td_ffh.find(text=True))
+			self.storyName = str(td_ffh.find(text=True)).strip()
+			logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
+			fft = td_ffh.find('font', {'class' : 'smtxt'})
+			#logging.debug('fft=%s' % fft)
+			if fft is not None:
+				ffts = fft.string.split(' ')
+				if ffts is not None:
+					if len(ffts) > 1:
+						self.storyRating = ffts[1]
+						logging.debug('self.storyRating=%s' % self.storyRating)
+		self.genre = ''
+		td_smtxt = soup.findAll('td')
+		if td_smtxt is None:
+			logging.debug('td_smtxt is NONE!')
+			pass
+		else:
+			ll = len(td_smtxt)
+			#logging.debug('td_smtxt=%s, len=%s' % (td_smtxt, ll))
+			for ii in range(ll):
+				td = td_smtxt[ii]
+				if 'class' in td._getAttrMap() and td['class'] != 'smtxt':
+					logging.debug('td has class attribute but is not smtxt')
+					continue
+				ss = str(td).replace('\n','').replace('\r','').replace('&nbsp;', ' ')
+				#logging.debug('ss=%s' % ss)
+				if len(ss) > 1 and (ss.find('Genre(s):') != -1 or ss.find('Type:') != -1):
+					#logging.debug('ss=%s' % ss)
+					ssbs = td.findAll('b')
+					#logging.debug('ssbs=%s' % ssbs)
+					bb = 0
+					while bb < len(ssbs):
+						nvs = bs.NavigableString('')
+						sst=''
+						ssb = ssbs[bb]
+						ssbt = str(ssb.text).strip()
+						#logging.debug('ssb=%s' % ssb)
+						#logging.debug('ssbt=%s' % ssbt)
+						ssbn = ssb.nextSibling
+						while ssbn is not None:
+							#logging.debug('ssbn=%s' % ssbn)
+							#logging.debug('ssbn.class=%s' % ssbn.__class__)
+							if nvs.__class__ == ssbn.__class__:
+								st = str(ssbn)
+								if st.strip() != '|':
+									sst = sst + st
+							else:
+								#logging.debug('ssbn.name=%s' % ssbn.name)
+								if ssbn.name == 'b':
+									break								
+								ssbnts = ssbn.findAll(text=True)
+								for ssbnt in ssbnts:
+									sst = sst + ssbnt
+							ssbn = ssbn.nextSibling
+						sst = sst.replace('&nbsp;',' ').strip()
+						#logging.debug('sst=%s' % sst)
+						if bb == 0:
+							ssbt = ssbt.replace(':','')
+							self.addSubject(ssbt)
+							self.addSubject(sst)
+							logging.debug('self.subjects=%s' % self.subjects)
+						else:
+							if ssbt == 'Genre(s):':
+								self.genre = sst
+								logging.debug('self.genre=%s' % self.genre)
+								sts = sst.split(' / ')
+								for st in sts:
+									self.addSubject(st.strip())
+								logging.debug('self.subjects=%s' % self.subjects)
+							elif ssbt == 'Type:':
+								self.category = sst
+								logging.debug('self.category=%s' % self.category)
+								self.addSubject(sst)
+								logging.debug('self.subjects=%s' % self.subjects)
+							elif ssbt == 'Author:':
+								pass
+							elif ssbt == 'Visits:':
+								pass
+							elif ssbt == 'Size:':
+								pass
+							elif ssbt == 'Pages:':
+								pass
+							elif ssbt == 'Status:':
+								if sst == "Completed":
+									self.storyStatus = 'Completed'
+								else:
+									self.storyStatus = 'In-Progress'
+							elif ssbt == 'Words:':
+								self.numWords = sst.replace('|','').strip()
+								logging.debug('self.numWords=%s' % self.numWords)
+								pass
+							elif ssbt == 'Summary:':
+								self.storyDescription = sst.strip()
+								logging.debug('self.storyDescription=%s' % self.storyDescription)
+							elif ssbt == 'Latest Revision:' or ssbt == 'Uploaded On:':
+								logging.debug('sst=%s' % sst)
+								self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sst.strip(' '), "%B %d, %Y %H:%M %Z")))
+								logging.debug('self.storyUpdated=%s' % self.storyUpdated)
+							else:
+								pass
+						bb = bb+1
+						
+					smtxt_as = td_smtxt[ii].findAll('a')
+					#logging.debug('smtxt_as=%s' % smtxt_as)
+					for smtxt_a in smtxt_as:
+						if 'href' in smtxt_a._getAttrMap() and smtxt_a['href'].find('/u/'):
+							sta = smtxt_a['href']
+							#logging.debug('sta=%s' % sta)
+							stas = sta.split('/u/')
+							#logging.debug('stas=%s' % stas)
+							if stas is not None and len(stas) > 1:
+								self.authorId = stas[1]
+								self.authorURL = 'http://' + self.host + sta
+								self.authorName = smtxt_a.string
+								logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
+				
+		urlstory=''
+		numchapters = 0
+		td_tbbrdr = soup.find('td', {'class' : 'tbbrdr'})
+		if td_tbbrdr is not None:
+			#logging.debug('td_tbbrdr=%s' % td_tbbrdr )
+
+			sl = td_tbbrdr.find('select', {'name':'cid'})
+			if sl is not None:
+				#logging.debug('sl=%s' % sl )
+				opts = sl.findAll('option')
+				for o in opts:
+					#logging.debug('o=%s' % o)				
+					if 'value' in o._getAttrMap():
+						url = 'http://' + self.host + '/fanfic/view_ch.php/' + self.storyId  + '/' + o['value']
+						logging.debug('URL=%s, Title=%s' % (url, o.string))
+						if numchapters == 0:
+							ss = o.string.split('[')
+							if ss is not None and len(ss) > 1:
+								ssd = ss[-1].replace(']','') 
+								#logging.debug('ssd=%s' % ssd)
+								self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(ssd.strip(' '), "%b %d, %Y")))
+								logging.debug('self.storyPublished=%s' % self.storyPublished)
+						urls.append((url, o.string))
+						numchapters = numchapters + 1
+		
+		if numchapters == 0:
+			numchapters = 1
+			url = 'http://' + self.host + '/fanfic/view_st.php/' + self.storyId
+			self.storyPublished = self.storyUpdated
+			logging.debug('self.storyPublished=%s' % self.storyPublished)
+			ssd = self.storyName + ' [' + self.storyPublished.strftime("%b %d, %Y") + ']'
+			logging.debug('URL=%s, Title=%s' % (url, ssd))
+			urls.append((url, ssd))
+			
+		self.numChapters = str(numchapters)
+		logging.debug('self.numChapters=%s' % self.numChapters)
+		logging.debug('urls=%s' % urls)
+		
+		return urls
+	
+	def getText(self, url):
+		time.sleep( 2.0 )
+		logging.debug('url=%s' % url)
+		data = self._fetchUrl(url)
+
+		try:
+			soup = bs.BeautifulSoup(data)
+		except:
+			logging.info("Failed to decode: <%s>" % data)
+			soup = None
+			exit(20)
+			return '<html/>'
+		
+		#div = soup.find('div', {'id' : 'storytext'})
+		#if div is None:
+			#logging.error("Error downloading Chapter: %s" % url)
+			#exit (20)
+			#return '<html/>'
+			
+		#logging.info("Soup: %s" % soup.prettify())
+
+		nvs = bs.NavigableString('')
+		sst=''
+		allAs = soup.findAll ('a', { 'name' : 'fic_c' })
+		#logging.debug('allAs=%s' % allAs)
+		for a in allAs:
+			#logging.debug('a=%s' % a)
+			foundfirst = False
+			done = False
+			nxta = a.nextSibling
+			while nxta is not None and not done:
+				#logging.debug('nxta=%s' % nxta)
+				#logging.debug('nxta.class=%s' % nxta.__class__)
+				st = str(nxta)
+				if nvs.__class__ != nxta.__class__:
+					#logging.debug('nxta.name=%s' % nxta.name)
+					if nxta.name == 'table':
+						st = ''
+						if foundfirst:
+							done = True
+					if nxta.name == 'div' and 'class' in nxta._getAttrMap() and nxta['class'] == 'acl' and foundfirst:
+						st = ''
+						done = True
+				
+					if nxta.name == 'br':
+						if not foundfirst:
+							st = ''
+					else:
+						foundfirst = True
+				else:
+					foundfirst = True
+					
+				sst = sst + st
+				nxta = nxta.nextSibling
+
+		#sst = sst.replace('&nbsp;',' ').strip()
+		#logging.debug('sst=%s' % sst)
+		
+		#logging.debug('sst.0=%s' % sst)
+		#sst0 = sst.replace(u'&#8820;', u'&ldquo;').replace(u'&#8821;','&rdquo;').replace(u'&#8816;',u'&lsquo;').replace(u'&#8817;',u'&rsquo;')
+		#sst0 = sst.replace(u"&#8821;","&rdquo;")
+		#logging.debug('sst.1=%s' % sst0)
+		#sst1 = sst.replace(u'&#8820;', u'\"').replace('&#8821;','\"').replace('&#8816;','\'').replace('&#8817;','\'')
+		#logging.debug('sst.2=%s' % sst1)
+					
+		return sst
+			
+class FPC_UnitTests(unittest.TestCase):
+	def setUp(self):
+		logging.basicConfig(level=logging.DEBUG)
+		pass
+	
+	def testFictionPress(self):
+		url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
+		f = FPCom(url)
+		urls = f.extractIndividualUrls()
+		
+		self.assertEquals('Behind This Facade', f.getStoryName())
+		self.assertEquals('IntoxicatingMelody', f.getAuthorName())
+	
+		text = f.getText(url)
+		self.assertTrue(text.find('Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to') != -1)
+
+if __name__ == '__main__':
+	unittest.main()