From 36d4b9afcf7ea28fb9e6b773c9e3f9b0b6cf2c44 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Mon, 9 May 2011 14:46:05 -0500 Subject: [PATCH] Add adastrafanfic.com support and support to confirm adult w/o login/pass. --- defaults.ini | 6 + example.ini | 5 + .../adapters/adapter_adastrafanficcom.py | 173 ++++++++++++++++++ fanficdownloader/adapters/adapter_test1.py | 7 + .../adapters/adapter_twilightednet.py | 2 +- fanficdownloader/exceptions.py | 7 + login.html | 16 +- main.py | 17 +- newdownload.py | 11 +- 9 files changed, 235 insertions(+), 9 deletions(-) create mode 100644 fanficdownloader/adapters/adapter_adastrafanficcom.py diff --git a/defaults.ini b/defaults.ini index d825058a..307ce2c9 100644 --- a/defaults.ini +++ b/defaults.ini @@ -148,6 +148,12 @@ extratags: [www.whofic.com] +[www.adastrafanfic.com] +## Some sites do not require a login, but do require the user to +## confirm they are adult for adult content. In commandline version, +## this should go in your personal.ini, not defaults.ini. +#is_adult:true + [overrides] ## It may sometimes be useful to override all of the specific format, ## site and site:format sections in your private configuration. For diff --git a/example.ini b/example.ini index 81f2042b..cf63ac6b 100644 --- a/example.ini +++ b/example.ini @@ -11,6 +11,11 @@ #username:YourUsername #password:YourPassword +[www.adastrafanfic.com] +## Some sites do not require a login, but do require the user to +## confirm they are adult for adult content. +#is_adult:true + ## The [defaults] section here will override the system [defaults], ## but not format, site for site:format sections. 
[defaults]
diff --git a/fanficdownloader/adapters/adapter_adastrafanficcom.py b/fanficdownloader/adapters/adapter_adastrafanficcom.py
new file mode 100644
index 00000000..119ef626
--- /dev/null
+++ b/fanficdownloader/adapters/adapter_adastrafanficcom.py
@@ -0,0 +1,201 @@
+# -*- coding: utf-8 -*-
+
+import time
+import datetime
+import logging
+import re
+import urllib
+import urllib2
+
+import fanficdownloader.BeautifulSoup as bs
+from fanficdownloader.htmlcleanup import stripHTML
+import fanficdownloader.exceptions as exceptions
+
+from base_adapter import BaseSiteAdapter, utf8FromSoup
+
+class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
+    """Site adapter for stories hosted on www.adastrafanfic.com.
+
+    No login is used here; instead ``&warning=5`` is added to requests
+    when ``self.is_adult`` or the ``is_adult`` config option is set, and
+    AdultCheckRequired is raised if the adult warning text comes back.
+    """
+
+    def __init__(self, config, url):
+        """Store the story id taken from ``url`` and normalize the story URL."""
+        BaseSiteAdapter.__init__(self, config, url)
+        self.story.setMetadata('siteabbrev','aaff')
+        self.decode = "utf8"
+        self.story.addToList("category","Star Trek")
+        self.is_adult=False
+
+        # get storyId from url--url validation guarantees query is only sid=1234
+        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
+        logging.debug("storyId: (%s)"%self.story.getMetadata('storyId'))
+
+        # normalized story URL.
+        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
+
+
+    @staticmethod
+    def getSiteDomain():
+        """Return the domain name this adapter handles."""
+        return 'www.adastrafanfic.com'
+
+    @classmethod
+    def getAcceptDomains(cls):
+        """Return the list of domains accepted in story URLs."""
+        return [cls.getSiteDomain()]
+
+    def getSiteExampleURLs(self):
+        """Return an example story URL for this site."""
+        return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234"
+
+    def getSiteURLPattern(self):
+        """Return the regular expression source a story URL must match."""
+        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
+
+    def extractChapterUrlsAndMetadata(self):
+        """Fetch the story index page; fill in chapter URLs and metadata.
+
+        Raises exceptions.StoryDoesNotExist on an HTTP 404 and
+        exceptions.AdultCheckRequired when the page contains the site's
+        adult-content warning text.
+        """
+
+        # &warning=5 is appended only once adult status has been confirmed.
+        if self.is_adult or self.getConfig("is_adult"):
+            addurl = "&warning=5"
+        else:
+            addurl=""
+
+        url = self.url+'&index=1'+addurl
+        logging.debug("URL: "+url)
+
+        try:
+            data = self._fetchUrl(url)
+        except urllib2.HTTPError, e:
+            if e.code == 404:
+                raise exceptions.StoryDoesNotExist(self.url)
+            else:
+                # bare raise re-raises with the original traceback intact.
+                raise
+
+        if "Content is only suitable for mature adults. May contain explicit language and adult themes. Equivalent of NC-17." in data:
+            raise exceptions.AdultCheckRequired(self.url)
+
+        # use BeautifulSoup HTML parser to make everything easier to find.
+        soup = bs.BeautifulSoup(data)
+
+        ## Title
+        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
+        self.story.setMetadata('title',a.string)
+
+        # Find authorid and URL from... author url.
+        a = soup.find('a', href=re.compile(r"viewuser.php"))
+        self.story.setMetadata('authorId',a['href'].split('=')[1])
+        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
+        self.story.setMetadata('author',a.string)
+
+        # Find the chapters:
+        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
+            # just in case there's tags, like <i> in chapter titles.
+            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
+
+        self.story.setMetadata('numChapters',len(self.chapterUrls))
+
+        ##
+        ## Summary, strangely, is in the content attr of a <meta name='description'> tag
+        ## which is escaped HTML. Unfortunately, we can't use it because they don't
+        ## escape (') chars in the desc, breaking the tag.
+        #meta_desc = soup.find('meta',{'name':'description'})
+        #metasoup = bs.BeautifulStoneSoup(meta_desc['content'])
+        #self.story.setMetadata('description',stripHTML(metasoup))
+
+        def defaultGetattr(d,k):
+            # d[k], or "" when d cannot supply k (e.g. a tag without that
+            # attribute, or a plain text node that is not indexable by name).
+            try:
+                return d[k]
+            except (KeyError, TypeError):
+                return ""
+
+        # <span class="label">Rated:</span> NC-17<br /> etc
+        labels = soup.findAll('span',{'class':'label'})
+        for labelspan in labels:
+            value = labelspan.nextSibling
+            label = labelspan.string
+
+            if 'Summary' in label:
+                ## Everything until the next span class='label'
+                # Start empty: the loop adds the first node itself, so seeding
+                # with it would duplicate it.  Stop when the siblings run out.
+                svalue = ""
+                while value is not None and not defaultGetattr(value,'class') == 'label':
+                    svalue += str(value)
+                    value = value.nextSibling
+                self.story.setMetadata('description',stripHTML(svalue))
+
+            if 'Rated' in label:
+                self.story.setMetadata('rating', value)
+
+            if 'Word count' in label:
+                self.story.setMetadata('numWords', value)
+
+            if 'Categories' in label:
+                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
+                catstext = [cat.string for cat in cats]
+                for cat in catstext:
+                    self.story.addToList('category',cat.string)
+
+            if 'Genre' in label:
+                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
+                genrestext = [genre.string for genre in genres]
+                self.genre = ', '.join(genrestext)
+                for genre in genrestext:
+                    self.story.addToList('genre',genre.string)
+
+            if 'Warnings' in label:
+                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
+                warningstext = [warning.string for warning in warnings]
+                self.warning = ', '.join(warningstext)
+                for warning in warningstext:
+                    self.story.addToList('warnings',warning.string)
+
+            if 'Completed' in label:
+                if 'Yes' in value:
+                    self.story.setMetadata('status', 'Completed')
+                else:
+                    self.story.setMetadata('status', 'In-Progress')
+
+            if 'Published' in label:
+                self.story.setMetadata('datePublished', datetime.datetime.fromtimestamp(time.mktime(time.strptime(value.strip(), "%m/%d/%Y"))))
+
+            if 'Updated' in label:
+                # there's a stray [ at the end.
+                #value = value[0:-1]
+                self.story.setMetadata('dateUpdated', datetime.datetime.fromtimestamp(time.mktime(time.strptime(value.strip(), "%m/%d/%Y"))))
+
+
+    def getChapterText(self, url):
+        """Fetch one chapter page and return the contents of its story div.
+
+        Raises exceptions.FailedToDownload when the page has no
+        <div id="story"> element.
+        """
+
+        logging.debug('Getting chapter text from: %s' % url)
+
+        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
+                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
+
+        span = soup.find('div', {'id' : 'story'})
+
+        if None == span:
+            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
+
+        return utf8FromSoup(span)
+
+def getClass():
+    return AdAstraFanficComSiteAdapter
+
diff --git a/fanficdownloader/adapters/adapter_test1.py b/fanficdownloader/adapters/adapter_test1.py
index 47f00dbc..d2cb6457 100644
--- a/fanficdownloader/adapters/adapter_test1.py
+++ b/fanficdownloader/adapters/adapter_test1.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import datetime
+import logging
 
 import fanficdownloader.BeautifulSoup as bs
 import fanficdownloader.exceptions as exceptions
@@ -16,6 +17,7 @@ class TestSiteAdapter(BaseSiteAdapter):
         # get storyId from url--url validation guarantees query is only sid=1234
         self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
         self.username=''
+        self.is_adult=False
 
     @staticmethod
     def getSiteDomain():
@@ -29,6 +31,10 @@ class TestSiteAdapter(BaseSiteAdapter):
 
     def extractChapterUrlsAndMetadata(self):
 
+        if self.story.getMetadata('storyId') == '665' and not (self.is_adult or self.getConfig("is_adult")):
+            logging.warn("self.is_adult:%s"%self.is_adult)
+            raise exceptions.AdultCheckRequired(self.url)
+
         if self.story.getMetadata('storyId') == '666':
             raise exceptions.StoryDoesNotExist(self.url)
 
@@ -86,6 +92,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"

Prologue

This is a fake adapter for testing purposes. Different storyId's will give different errors:

+

http://test1.com?sid=665 - raises AdultCheckRequired

http://test1.com?sid=666 - raises StoryDoesNotExist

http://test1.com?sid=667 - raises FailedToDownload on chapter 1

http://test1.com?sid=668 - raises FailedToLogin unless username='Me'

diff --git a/fanficdownloader/adapters/adapter_twilightednet.py b/fanficdownloader/adapters/adapter_twilightednet.py
index 3a8e14c8..6a654e46 100644
--- a/fanficdownloader/adapters/adapter_twilightednet.py
+++ b/fanficdownloader/adapters/adapter_twilightednet.py
@@ -166,7 +166,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
             #     genrestext = [genre.string for genre in genres]
             #     self.genre = ', '.join(genrestext)
             #     for genre in genrestext:
-            #         self.addSubject(genre.string)
+            #         self.story.addToList('genre',genre.string)
 
             if 'Completed' in label:
                 if 'Yes' in value:
diff --git a/fanficdownloader/exceptions.py b/fanficdownloader/exceptions.py
index 44cae238..4b316442 100644
--- a/fanficdownloader/exceptions.py
+++ b/fanficdownloader/exceptions.py
@@ -24,6 +24,14 @@ class FailedToLogin(Exception):
     def __str__(self):
         return "Failed to Login for URL: (%s) with username: (%s)" % (self.url, self.username)
 
+class AdultCheckRequired(Exception):
+    """Raised for a story URL that needs confirmation of adult status first."""
+    def __init__(self, url):
+        self.url = url
+
+    def __str__(self):
+        return "Story requires confirmation of adult status: (%s)" % self.url
+
 class StoryDoesNotExist(Exception):
     def __init__(self,url):
         self.url=url
diff --git a/login.html b/login.html
index e54141cd..2d1240f8 100644
--- a/login.html
+++ b/login.html
@@ -45,15 +45,18 @@
{% endif %}
+ +
+ + {% if login %} +

Login and Password

{{ site }} requires a Login/Password for this story. You need to provide your Login/Password for {{ site }} to download it.
- -
Login
@@ -63,6 +66,15 @@
Password
+ + {% else %} + +
+
Are you an Adult?
+
+ + {% endif %} +
diff --git a/main.py b/main.py
index 5d5f1d94..39d84ea2 100644
--- a/main.py
+++ b/main.py
@@ -240,7 +240,7 @@ class UserConfigServer(webapp.RequestHandler):
         ## TEST THIS
         if l and l[0].config:
             uconfig=l[0]
-            logging.debug('reading config from UserConfig(%s)'%uconfig.config)
+            #logging.debug('reading config from UserConfig(%s)'%uconfig.config)
             config.readfp(StringIO.StringIO(uconfig.config))
 
         return config
@@ -260,6 +260,7 @@ class FanfictionDownloader(UserConfigServer):
         url = self.request.get('url')
         login = self.request.get('login')
         password = self.request.get('password')
+        is_adult = self.request.get('is_adult') == "on"
 
         logging.info("Queuing Download: " + url)
 
@@ -287,6 +288,7 @@ class FanfictionDownloader(UserConfigServer):
             if len(login) > 1:
                 adapter.username=login
                 adapter.password=password
+            adapter.is_adult=is_adult
             ## This scrapes the metadata, which will be
             ## duplicated in the queue task, but it
             ## detects bad URLs, bad login, bad story, etc
@@ -304,20 +306,25 @@ class FanfictionDownloader(UserConfigServer):
                                   'url':url,
                                   'login':login,
                                   'password':password,
-                                  'user':user.email()})
+                                  'user':user.email(),
+                                  'is_adult':is_adult})
 
             logging.info("enqueued download key: " + str(download.key()))
 
-        except exceptions.FailedToLogin, e:
+        except (exceptions.FailedToLogin,exceptions.AdultCheckRequired), e:
             logging.exception(e)
             download.failure = str(e)
             download.put()
             logging.debug('Need to Login, display log in page')
+            # e is an exception *instance*: an identity test against the class
+            # is always False, so use isinstance() to pick the login form.
+            login = isinstance(e, exceptions.FailedToLogin)
             template_values = dict(nickname = user.nickname(),
                                    url = url,
                                    format = format,
                                    site = adapter.getSiteDomain(),
-                                   fic = download
+                                   fic = download,
+                                   login=login,
                                    )
             path = os.path.join(os.path.dirname(__file__), 'login.html')
             self.response.out.write(template.render(path, template_values))
@@ -348,6 +355,10 @@ class FanfictionDownloaderTask(UserConfigServer):
         url = self.request.get('url')
         login = self.request.get('login')
         password = self.request.get('password')
+        # Request params arrive as strings, and a non-empty "False" is truthy.
+        # NOTE(review): assumes the boolean queued by FanfictionDownloader is
+        # delivered as the string "True" -- confirm against the task queue.
+        is_adult = self.request.get('is_adult') == "True"
         # User object can't pass, just email address
         user = users.User(self.request.get('user'))
 
@@ -381,11 +392,12 @@ class FanfictionDownloaderTask(UserConfigServer):
             download.put()
             return
 
-        logging.info('Created an adaper: %s' % adapter)
+        logging.info('Created an adapter: %s' % adapter)
 
         if len(login) > 1:
             adapter.username=login
             adapter.password=password
+        adapter.is_adult=is_adult
 
         try:
             # adapter.getStory() is what does all the heavy lifting.
diff --git a/newdownload.py b/newdownload.py
index d174cd50..46dc73dd 100644
--- a/newdownload.py
+++ b/newdownload.py
@@ -7,6 +7,10 @@ import sys, os
 from optparse import OptionParser
 import getpass
 
+if sys.version_info < (2, 5):
+    print "This program requires Python 2.5 or newer."
+    sys.exit(1)
+
 from fanficdownloader import adapters,writers,exceptions
 
 import ConfigParser
@@ -56,13 +60,18 @@ def main():
 
     try:
         adapter.getStoryMetadataOnly()
-    except exceptions.FailedToLogin, ftl:
+    except exceptions.FailedToLogin:
         print "Login Failed, Need Username/Password."
         sys.stdout.write("Username: ")
         adapter.username = sys.stdin.readline().strip()
         adapter.password = getpass.getpass(prompt='Password: ')
         #print("Login: `%s`, Password: `%s`" % (adapter.username, adapter.password))
         adapter.getStoryMetadataOnly()
+    except exceptions.AdultCheckRequired:
+        print "Please confirm you are an adult in your locale: (y/n)?"
+        if sys.stdin.readline().strip().lower().startswith('y'):
+            adapter.is_adult=True
+        adapter.getStoryMetadataOnly()
 
     if options.metaonly:
         print adapter.getStoryMetadataOnly()