# FanFicFare/fanficdownloader/geturls.py
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
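"""Grab story URLs out of a list page, raw HTML, or plain text.

Candidate links are validated by looking up a site adapter for them;
'normalized' results are the adapters' canonical storyUrl values.
"""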
import re
import urlparse
import urllib2 as u2
from BeautifulSoup import BeautifulSoup
from gziphttp import GZipProcessor
import adapters
from configurable import Configuration
from exceptions import UnknownSite


def get_urls_from_page(url,configuration=None,normalize=False):
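    """
    Fetch the page at url and return the story URLs found in it.
    Uses the site adapter (anyurl=True) so logins and User-Agent
    settings apply; falls back to a plain urllib2 fetch for unknown
    sites.
    """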
    if not configuration:
        configuration = Configuration("test1.com","EPUB")

    data = None
    adapter = None
    try:
        adapter = adapters.getAdapter(configuration,url,anyurl=True)

        # special stuff to log into archiveofourown.org, if possible.
        # Unlike most sites, which show links to 'adult' stories but
        # protect them, AO3 doesn't show them at all unless logged in.
        # Only works with saved user/pass--not going to prompt for a list.
        if 'archiveofourown.org' in url:
            if adapter.getConfig("username"):
                if adapter.getConfig("is_adult"):
                    if '?' in url:
                        addurl = "&view_adult=true"
                    else:
                        addurl = "?view_adult=true"
                else:
                    addurl = ""
                # just to get an authenticity_token.
                data = adapter._fetchUrl(url+addurl)
                # login the session.
                adapter.performLogin(url,data)

        # get the list page with the logged-in session.  This way it
        # uses the adapter's User-Agent and other special settings;
        # only AO3 actually logs in.
        data = adapter._fetchUrl(url)
    except UnknownSite:
        # no adapter with anyurl=True, must be a random site.
        opener = u2.build_opener(u2.HTTPCookieProcessor(),GZipProcessor())
        data = opener.open(url).read()

    # kludge because I don't see it on enough sites to be worth generalizing yet.
    restrictsearch = None
    if 'scarvesandcoffee.net' in url:
        restrictsearch = ('div',{'id':'mainpage'})

    return get_urls_from_html(data,url,configuration,normalize,restrictsearch)


def get_urls_from_html(data,url=None,configuration=None,normalize=False,restrictsearch=None):
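    """
    Find story URLs in the given HTML.  Each candidate <a href> is
    resolved against url, then validated by asking for an adapter.
    Returns the adapters' normalized storyUrl values if normalize is
    True, otherwise the original hrefs.
    """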
    normalized = [] # normalized urls
    retlist = []    # orig urls.

    if not configuration:
        configuration = Configuration("test1.com","EPUB")

    soup = BeautifulSoup(data)
    if restrictsearch:
        soup = soup.find(*restrictsearch)
        #print("restrict search:%s"%soup)

    for a in soup.findAll('a'):
        if a.has_key('href'):
            #print("a['href']:%s"%a['href'])
            href = form_url(url,a['href'])
            #print("1 urlhref:%s"%href)
            # this (should) catch normal story links, some javascript
            # 'are you old enough' links, and 'Report This' links.
            # The 'normalized' list prevents duplicates.
            if 'story.php' in a['href']:
                #print("trying:%s"%a['href'])
                m = re.search(r"(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)",a['href'])
                if m is not None:
                    href = form_url(a['href'] if '//' in a['href'] else url,
                                    m.group('sid'))
            try:
                href = href.replace('&index=1','')
                #print("2 urlhref:%s"%href)
                adapter = adapters.getAdapter(configuration,href)
                #print("found adapter")
                if adapter.story.getMetadata('storyUrl') not in normalized:
                    normalized.append(adapter.story.getMetadata('storyUrl'))
                    retlist.append(href)
            except Exception as e:
                #print e
                pass

    if normalize:
        return normalized
    else:
        return retlist


def get_urls_from_text(data,configuration=None,normalize=False):
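    """
    Find story URLs in arbitrary plain text by regex-matching anything
    that looks like an http(s) URL, then validating with an adapter.
    """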
    normalized = [] # normalized urls
    retlist = []    # orig urls.

    data = unicode(data)

    if not configuration:
        configuration = Configuration("test1.com","EPUB")

    for href in re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', data):
        # this (should) catch normal story links, some javascript
        # 'are you old enough' links, and 'Report This' links.
        # The 'normalized' list prevents duplicates.
        if 'story.php' in href:
            m = re.search(r"(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)",href)
            if m is not None:
                href = form_url(href,m.group('sid'))
        try:
            href = href.replace('&index=1','')
            adapter = adapters.getAdapter(configuration,href)
            if adapter.story.getMetadata('storyUrl') not in normalized:
                normalized.append(adapter.story.getMetadata('storyUrl'))
                retlist.append(href)
        except Exception:
            pass

    if normalize:
        return normalized
    else:
        return retlist


def form_url(parenturl,url):
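    """
    Resolve url against parenturl: absolute URLs ('//' present) pass
    through unchanged; root-relative and relative paths are joined to
    parenturl's scheme, host and path.
    """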
    url = url.strip() # ran across an image with a space in the
                      # src.  Browser handled it, so we'd better, too.

    if "//" in url or parenturl is None:
        returl = url
    else:
        parsedUrl = urlparse.urlparse(parenturl)
        if url.startswith("/"):
            returl = urlparse.urlunparse(
                (parsedUrl.scheme,
                 parsedUrl.netloc,
                 url,
                 '','',''))
        else:
            if parsedUrl.path.endswith("/"):
                toppath = parsedUrl.path
            else:
                # drop the last path segment to get the parent 'directory'.
                toppath = parsedUrl.path[:parsedUrl.path.rindex('/')]
            returl = urlparse.urlunparse(
                (parsedUrl.scheme,
                 parsedUrl.netloc,
                 toppath + '/' + url,
                 '','',''))
    return returl
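

# A minimal usage sketch (the URL below is hypothetical; requires the
# rest of the fanficdownloader package on the path):
#
#   from fanficdownloader.geturls import get_urls_from_page
#   urls = get_urls_from_page("http://example.com/fic/listpage.html")
#   for storyurl in urls:
#       print storyurl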