mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-09 05:21:13 +02:00
Adding six, html5lib and bs4(BeautifulSoup) libraries.
This commit is contained in:
commit
ebcf2b056b
293 changed files with 91575 additions and 0 deletions
78
allrecent.html
Normal file
78
allrecent.html
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
|
||||
<html>
|
||||
<head>
|
||||
<link href="/css/index.css" rel="stylesheet" type="text/css">
|
||||
<title>FanFictionDownLoader (fanfiction.net, fanficauthors, fictionalley, ficwad to epub and HTML)</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<script type="text/javascript">
|
||||
|
||||
var _gaq = _gaq || [];
|
||||
_gaq.push(['_setAccount', 'UA-12136939-1']);
|
||||
_gaq.push(['_trackPageview']);
|
||||
|
||||
(function() {
|
||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
||||
})();
|
||||
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
<div id='main'>
|
||||
<h1>
|
||||
<a href="/" style="text-decoration: none; color: black;">FanFictionDownLoader</a>
|
||||
</h1>
|
||||
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "ca-pub-0320924304307555";
|
||||
/* Standard */
|
||||
google_ad_slot = "8974025478";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
<!-- <div id='yourfile'> -->
|
||||
{{yourfile}}
|
||||
<!-- </div> -->
|
||||
|
||||
<div id='helpbox'>
|
||||
{% for fic in fics %}
|
||||
<p>
|
||||
<a href="{{ fic.url }}" title="Link to original story"><span class="recent"><i>{{ fic.title }}</i></span></a>
|
||||
by <a href="{{ fic.authorUrl }}">{{ fic.author }}</a> <b>Download Count:</b> {{ fic.count }} <br />
|
||||
<b>Word Count:</b> {{ fic.numWords }} <b>Chapter Count:</b> {{ fic.numChapters }}<br />
|
||||
{% if fic.category %} <b>Categories:</b> {{ fic.category }} <br /> {% endif %}
|
||||
{% if fic.genre %} <b>Genres:</b> {{ fic.genre }} <br /> {% endif %}
|
||||
{% if fic.language %} <b>Language:</b> {{ fic.language }} <br /> {% endif %}
|
||||
{% if fic.series %} <b>Series:</b> {{ fic.series }} <br /> {% endif %}
|
||||
{% if fic.characters %} <b>Characters:</b> {{ fic.characters }} <br /> {% endif %}
|
||||
{% if fic.status %} <b>Status:</b> {{ fic.status }} <br /> {% endif %}
|
||||
{% if fic.datePublished %} <b>Published:</b> {{ fic.datePublished }} <br /> {% endif %}
|
||||
{% if fic.dateUpdated %} <b>Last Updated:</b> {{ fic.dateUpdated }} <br /> {% endif %}
|
||||
{% if fic.dateCreated %} <b>Last Downloaded:</b> {{ fic.dateCreated }} <br /> {% endif %}
|
||||
{% if fic.rating %} <b>Rating:</b> {{ fic.rating }} <br /> {% endif %}
|
||||
{% if fic.warnings %} <b>Warnings:</b> {{ fic.warnings }} <br /> {% endif %}
|
||||
{% if fic.description %} <b>Summary:</b> {{ fic.description }} <br /> {% endif %}
|
||||
</p>
|
||||
{% endfor %}
|
||||
</div>
|
||||
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "ca-pub-0320924304307555";
|
||||
/* Standard */
|
||||
google_ad_slot = "8974025478";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
46
app.yaml
Normal file
46
app.yaml
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
# ffd-retief-hrd fanfictiondownloader
|
||||
application: ffd-retief-hrd
|
||||
version: 2-0-10
|
||||
runtime: python27
|
||||
api_version: 1
|
||||
threadsafe: true
|
||||
|
||||
handlers:
|
||||
|
||||
- url: /r3m0v3r.*
|
||||
script: utils.remover.app
|
||||
login: admin
|
||||
|
||||
- url: /tally.*
|
||||
script: utils.tally.app
|
||||
login: admin
|
||||
|
||||
- url: /fdownloadtask
|
||||
script: main.app
|
||||
login: admin
|
||||
|
||||
- url: /css
|
||||
static_dir: css
|
||||
|
||||
- url: /js
|
||||
static_dir: js
|
||||
|
||||
- url: /static
|
||||
static_dir: static
|
||||
|
||||
- url: /favicon\.ico
|
||||
static_files: static/favicon.ico
|
||||
upload: static/favicon\.ico
|
||||
|
||||
- url: /.*
|
||||
script: main.app
|
||||
|
||||
#builtins:
|
||||
#- datastore_admin: on
|
||||
|
||||
libraries:
|
||||
- name: django
|
||||
version: "1.2"
|
||||
|
||||
- name: PIL
|
||||
version: "1.1.7"
|
||||
26
bs4/COPYING.txt
Normal file
26
bs4/COPYING.txt
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
Beautiful Soup is made available under the MIT license:
|
||||
|
||||
Copyright (c) 2004-2012 Leonard Richardson
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE, DAMMIT.
|
||||
|
||||
Beautiful Soup incorporates code from the html5lib library, which is
|
||||
also made available under the MIT license.
|
||||
406
bs4/__init__.py
Normal file
406
bs4/__init__.py
Normal file
|
|
@ -0,0 +1,406 @@
|
|||
"""Beautiful Soup
|
||||
Elixir and Tonic
|
||||
"The Screen-Scraper's Friend"
|
||||
http://www.crummy.com/software/BeautifulSoup/
|
||||
|
||||
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||
(possibly invalid) document into a tree representation. Beautiful Soup
|
||||
provides provides methods and Pythonic idioms that make it easy to
|
||||
navigate, search, and modify the parse tree.
|
||||
|
||||
Beautiful Soup works with Python 2.6 and up. It works better if lxml
|
||||
and/or html5lib is installed.
|
||||
|
||||
For more than you ever wanted to know about Beautiful Soup, see the
|
||||
documentation:
|
||||
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
"""
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "4.3.2"
|
||||
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = ['BeautifulSoup']
|
||||
|
||||
import os
|
||||
import re
|
||||
import warnings
|
||||
|
||||
from .builder import builder_registry, ParserRejectedMarkup
|
||||
from .dammit import UnicodeDammit
|
||||
from .element import (
|
||||
CData,
|
||||
Comment,
|
||||
DEFAULT_OUTPUT_ENCODING,
|
||||
Declaration,
|
||||
Doctype,
|
||||
NavigableString,
|
||||
PageElement,
|
||||
ProcessingInstruction,
|
||||
ResultSet,
|
||||
SoupStrainer,
|
||||
Tag,
|
||||
)
|
||||
|
||||
# The very first thing we do is give a useful error if someone is
|
||||
# running this code under Python 3 without converting it.
|
||||
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||
|
||||
class BeautifulSoup(Tag):
|
||||
"""
|
||||
This class defines the basic interface called by the tree builders.
|
||||
|
||||
These methods will be called by the parser:
|
||||
reset()
|
||||
feed(markup)
|
||||
|
||||
The tree builder may call these methods from its feed() implementation:
|
||||
handle_starttag(name, attrs) # See note about return value
|
||||
handle_endtag(name)
|
||||
handle_data(data) # Appends to the current data node
|
||||
endData(containerClass=NavigableString) # Ends the current data node
|
||||
|
||||
No matter how complicated the underlying parser is, you should be
|
||||
able to build a tree using 'start tag' events, 'end tag' events,
|
||||
'data' events, and "done with data" events.
|
||||
|
||||
If you encounter an empty-element tag (aka a self-closing tag,
|
||||
like HTML's <br> tag), call handle_starttag and then
|
||||
handle_endtag.
|
||||
"""
|
||||
ROOT_TAG_NAME = u'[document]'
|
||||
|
||||
# If the end-user gives no indication which tree builder they
|
||||
# want, look for one with these features.
|
||||
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
|
||||
|
||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||
|
||||
def __init__(self, markup="", features=None, builder=None,
|
||||
parse_only=None, from_encoding=None, **kwargs):
|
||||
"""The Soup object is initialized as the 'root tag', and the
|
||||
provided markup (which can be a string or a file-like object)
|
||||
is fed into the underlying parser."""
|
||||
|
||||
if 'convertEntities' in kwargs:
|
||||
warnings.warn(
|
||||
"BS4 does not respect the convertEntities argument to the "
|
||||
"BeautifulSoup constructor. Entities are always converted "
|
||||
"to Unicode characters.")
|
||||
|
||||
if 'markupMassage' in kwargs:
|
||||
del kwargs['markupMassage']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the markupMassage argument to the "
|
||||
"BeautifulSoup constructor. The tree builder is responsible "
|
||||
"for any necessary markup massage.")
|
||||
|
||||
if 'smartQuotesTo' in kwargs:
|
||||
del kwargs['smartQuotesTo']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the smartQuotesTo argument to the "
|
||||
"BeautifulSoup constructor. Smart quotes are always converted "
|
||||
"to Unicode characters.")
|
||||
|
||||
if 'selfClosingTags' in kwargs:
|
||||
del kwargs['selfClosingTags']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the selfClosingTags argument to the "
|
||||
"BeautifulSoup constructor. The tree builder is responsible "
|
||||
"for understanding self-closing tags.")
|
||||
|
||||
if 'isHTML' in kwargs:
|
||||
del kwargs['isHTML']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the isHTML argument to the "
|
||||
"BeautifulSoup constructor. You can pass in features='html' "
|
||||
"or features='xml' to get a builder capable of handling "
|
||||
"one or the other.")
|
||||
|
||||
def deprecated_argument(old_name, new_name):
|
||||
if old_name in kwargs:
|
||||
warnings.warn(
|
||||
'The "%s" argument to the BeautifulSoup constructor '
|
||||
'has been renamed to "%s."' % (old_name, new_name))
|
||||
value = kwargs[old_name]
|
||||
del kwargs[old_name]
|
||||
return value
|
||||
return None
|
||||
|
||||
parse_only = parse_only or deprecated_argument(
|
||||
"parseOnlyThese", "parse_only")
|
||||
|
||||
from_encoding = from_encoding or deprecated_argument(
|
||||
"fromEncoding", "from_encoding")
|
||||
|
||||
if len(kwargs) > 0:
|
||||
arg = kwargs.keys().pop()
|
||||
raise TypeError(
|
||||
"__init__() got an unexpected keyword argument '%s'" % arg)
|
||||
|
||||
if builder is None:
|
||||
if isinstance(features, basestring):
|
||||
features = [features]
|
||||
if features is None or len(features) == 0:
|
||||
features = self.DEFAULT_BUILDER_FEATURES
|
||||
builder_class = builder_registry.lookup(*features)
|
||||
if builder_class is None:
|
||||
raise FeatureNotFound(
|
||||
"Couldn't find a tree builder with the features you "
|
||||
"requested: %s. Do you need to install a parser library?"
|
||||
% ",".join(features))
|
||||
builder = builder_class()
|
||||
self.builder = builder
|
||||
self.is_xml = builder.is_xml
|
||||
self.builder.soup = self
|
||||
|
||||
self.parse_only = parse_only
|
||||
|
||||
if hasattr(markup, 'read'): # It's a file-type object.
|
||||
markup = markup.read()
|
||||
elif len(markup) <= 256:
|
||||
# Print out warnings for a couple beginner problems
|
||||
# involving passing non-markup to Beautiful Soup.
|
||||
# Beautiful Soup will still parse the input as markup,
|
||||
# just in case that's what the user really wants.
|
||||
if (isinstance(markup, unicode)
|
||||
and not os.path.supports_unicode_filenames):
|
||||
possible_filename = markup.encode("utf8")
|
||||
else:
|
||||
possible_filename = markup
|
||||
is_file = False
|
||||
try:
|
||||
is_file = os.path.exists(possible_filename)
|
||||
except Exception, e:
|
||||
# This is almost certainly a problem involving
|
||||
# characters not valid in filenames on this
|
||||
# system. Just let it go.
|
||||
pass
|
||||
if is_file:
|
||||
warnings.warn(
|
||||
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
|
||||
if markup[:5] == "http:" or markup[:6] == "https:":
|
||||
# TODO: This is ugly but I couldn't get it to work in
|
||||
# Python 3 otherwise.
|
||||
if ((isinstance(markup, bytes) and not b' ' in markup)
|
||||
or (isinstance(markup, unicode) and not u' ' in markup)):
|
||||
warnings.warn(
|
||||
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
|
||||
|
||||
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
||||
self.contains_replacement_characters) in (
|
||||
self.builder.prepare_markup(markup, from_encoding)):
|
||||
self.reset()
|
||||
try:
|
||||
self._feed()
|
||||
break
|
||||
except ParserRejectedMarkup:
|
||||
pass
|
||||
|
||||
# Clear out the markup and remove the builder's circular
|
||||
# reference to this object.
|
||||
self.markup = None
|
||||
self.builder.soup = None
|
||||
|
||||
def _feed(self):
|
||||
# Convert the document to Unicode.
|
||||
self.builder.reset()
|
||||
|
||||
self.builder.feed(self.markup)
|
||||
# Close out any unfinished strings and close all the open tags.
|
||||
self.endData()
|
||||
while self.currentTag.name != self.ROOT_TAG_NAME:
|
||||
self.popTag()
|
||||
|
||||
def reset(self):
|
||||
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
|
||||
self.hidden = 1
|
||||
self.builder.reset()
|
||||
self.current_data = []
|
||||
self.currentTag = None
|
||||
self.tagStack = []
|
||||
self.preserve_whitespace_tag_stack = []
|
||||
self.pushTag(self)
|
||||
|
||||
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
|
||||
"""Create a new tag associated with this soup."""
|
||||
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
|
||||
|
||||
def new_string(self, s, subclass=NavigableString):
|
||||
"""Create a new NavigableString associated with this soup."""
|
||||
navigable = subclass(s)
|
||||
navigable.setup()
|
||||
return navigable
|
||||
|
||||
def insert_before(self, successor):
|
||||
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
||||
|
||||
def insert_after(self, successor):
|
||||
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
|
||||
|
||||
def popTag(self):
|
||||
tag = self.tagStack.pop()
|
||||
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
|
||||
self.preserve_whitespace_tag_stack.pop()
|
||||
#print "Pop", tag.name
|
||||
if self.tagStack:
|
||||
self.currentTag = self.tagStack[-1]
|
||||
return self.currentTag
|
||||
|
||||
def pushTag(self, tag):
|
||||
#print "Push", tag.name
|
||||
if self.currentTag:
|
||||
self.currentTag.contents.append(tag)
|
||||
self.tagStack.append(tag)
|
||||
self.currentTag = self.tagStack[-1]
|
||||
if tag.name in self.builder.preserve_whitespace_tags:
|
||||
self.preserve_whitespace_tag_stack.append(tag)
|
||||
|
||||
def endData(self, containerClass=NavigableString):
|
||||
if self.current_data:
|
||||
current_data = u''.join(self.current_data)
|
||||
# If whitespace is not preserved, and this string contains
|
||||
# nothing but ASCII spaces, replace it with a single space
|
||||
# or newline.
|
||||
if not self.preserve_whitespace_tag_stack:
|
||||
strippable = True
|
||||
for i in current_data:
|
||||
if i not in self.ASCII_SPACES:
|
||||
strippable = False
|
||||
break
|
||||
if strippable:
|
||||
if '\n' in current_data:
|
||||
current_data = '\n'
|
||||
else:
|
||||
current_data = ' '
|
||||
|
||||
# Reset the data collector.
|
||||
self.current_data = []
|
||||
|
||||
# Should we add this string to the tree at all?
|
||||
if self.parse_only and len(self.tagStack) <= 1 and \
|
||||
(not self.parse_only.text or \
|
||||
not self.parse_only.search(current_data)):
|
||||
return
|
||||
|
||||
o = containerClass(current_data)
|
||||
self.object_was_parsed(o)
|
||||
|
||||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
||||
"""Add an object to the parse tree."""
|
||||
parent = parent or self.currentTag
|
||||
most_recent_element = most_recent_element or self._most_recent_element
|
||||
o.setup(parent, most_recent_element)
|
||||
|
||||
if most_recent_element is not None:
|
||||
most_recent_element.next_element = o
|
||||
self._most_recent_element = o
|
||||
parent.contents.append(o)
|
||||
|
||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||
"""Pops the tag stack up to and including the most recent
|
||||
instance of the given tag. If inclusivePop is false, pops the tag
|
||||
stack up to but *not* including the most recent instqance of
|
||||
the given tag."""
|
||||
#print "Popping to %s" % name
|
||||
if name == self.ROOT_TAG_NAME:
|
||||
# The BeautifulSoup object itself can never be popped.
|
||||
return
|
||||
|
||||
most_recently_popped = None
|
||||
|
||||
stack_size = len(self.tagStack)
|
||||
for i in range(stack_size - 1, 0, -1):
|
||||
t = self.tagStack[i]
|
||||
if (name == t.name and nsprefix == t.prefix):
|
||||
if inclusivePop:
|
||||
most_recently_popped = self.popTag()
|
||||
break
|
||||
most_recently_popped = self.popTag()
|
||||
|
||||
return most_recently_popped
|
||||
|
||||
def handle_starttag(self, name, namespace, nsprefix, attrs):
|
||||
"""Push a start tag on to the stack.
|
||||
|
||||
If this method returns None, the tag was rejected by the
|
||||
SoupStrainer. You should proceed as if the tag had not occured
|
||||
in the document. For instance, if this was a self-closing tag,
|
||||
don't call handle_endtag.
|
||||
"""
|
||||
|
||||
# print "Start tag %s: %s" % (name, attrs)
|
||||
self.endData()
|
||||
|
||||
if (self.parse_only and len(self.tagStack) <= 1
|
||||
and (self.parse_only.text
|
||||
or not self.parse_only.search_tag(name, attrs))):
|
||||
return None
|
||||
|
||||
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
|
||||
self.currentTag, self._most_recent_element)
|
||||
if tag is None:
|
||||
return tag
|
||||
if self._most_recent_element:
|
||||
self._most_recent_element.next_element = tag
|
||||
self._most_recent_element = tag
|
||||
self.pushTag(tag)
|
||||
return tag
|
||||
|
||||
def handle_endtag(self, name, nsprefix=None):
|
||||
#print "End tag: " + name
|
||||
self.endData()
|
||||
self._popToTag(name, nsprefix)
|
||||
|
||||
def handle_data(self, data):
|
||||
self.current_data.append(data)
|
||||
|
||||
def decode(self, pretty_print=False,
|
||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||
formatter="minimal"):
|
||||
"""Returns a string or Unicode representation of this document.
|
||||
To get Unicode, pass None for encoding."""
|
||||
|
||||
if self.is_xml:
|
||||
# Print the XML declaration
|
||||
encoding_part = ''
|
||||
if eventual_encoding != None:
|
||||
encoding_part = ' encoding="%s"' % eventual_encoding
|
||||
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
|
||||
else:
|
||||
prefix = u''
|
||||
if not pretty_print:
|
||||
indent_level = None
|
||||
else:
|
||||
indent_level = 0
|
||||
return prefix + super(BeautifulSoup, self).decode(
|
||||
indent_level, eventual_encoding, formatter)
|
||||
|
||||
# Alias to make it easier to type import: 'from bs4 import _soup'
|
||||
_s = BeautifulSoup
|
||||
_soup = BeautifulSoup
|
||||
|
||||
class BeautifulStoneSoup(BeautifulSoup):
|
||||
"""Deprecated interface to an XML parser."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
kwargs['features'] = 'xml'
|
||||
warnings.warn(
|
||||
'The BeautifulStoneSoup class is deprecated. Instead of using '
|
||||
'it, pass features="xml" into the BeautifulSoup constructor.')
|
||||
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class StopParsing(Exception):
|
||||
pass
|
||||
|
||||
class FeatureNotFound(ValueError):
|
||||
pass
|
||||
|
||||
|
||||
#By default, act as an HTML pretty-printer.
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
soup = BeautifulSoup(sys.stdin)
|
||||
print soup.prettify()
|
||||
321
bs4/builder/__init__.py
Normal file
321
bs4/builder/__init__.py
Normal file
|
|
@ -0,0 +1,321 @@
|
|||
from collections import defaultdict
|
||||
import itertools
|
||||
import sys
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
ContentMetaAttributeValue,
|
||||
whitespace_re
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'HTMLTreeBuilder',
|
||||
'SAXTreeBuilder',
|
||||
'TreeBuilder',
|
||||
'TreeBuilderRegistry',
|
||||
]
|
||||
|
||||
# Some useful features for a TreeBuilder to have.
|
||||
FAST = 'fast'
|
||||
PERMISSIVE = 'permissive'
|
||||
STRICT = 'strict'
|
||||
XML = 'xml'
|
||||
HTML = 'html'
|
||||
HTML_5 = 'html5'
|
||||
|
||||
|
||||
class TreeBuilderRegistry(object):
|
||||
|
||||
def __init__(self):
|
||||
self.builders_for_feature = defaultdict(list)
|
||||
self.builders = []
|
||||
|
||||
def register(self, treebuilder_class):
|
||||
"""Register a treebuilder based on its advertised features."""
|
||||
for feature in treebuilder_class.features:
|
||||
self.builders_for_feature[feature].insert(0, treebuilder_class)
|
||||
self.builders.insert(0, treebuilder_class)
|
||||
|
||||
def lookup(self, *features):
|
||||
if len(self.builders) == 0:
|
||||
# There are no builders at all.
|
||||
return None
|
||||
|
||||
if len(features) == 0:
|
||||
# They didn't ask for any features. Give them the most
|
||||
# recently registered builder.
|
||||
return self.builders[0]
|
||||
|
||||
# Go down the list of features in order, and eliminate any builders
|
||||
# that don't match every feature.
|
||||
features = list(features)
|
||||
features.reverse()
|
||||
candidates = None
|
||||
candidate_set = None
|
||||
while len(features) > 0:
|
||||
feature = features.pop()
|
||||
we_have_the_feature = self.builders_for_feature.get(feature, [])
|
||||
if len(we_have_the_feature) > 0:
|
||||
if candidates is None:
|
||||
candidates = we_have_the_feature
|
||||
candidate_set = set(candidates)
|
||||
else:
|
||||
# Eliminate any candidates that don't have this feature.
|
||||
candidate_set = candidate_set.intersection(
|
||||
set(we_have_the_feature))
|
||||
|
||||
# The only valid candidates are the ones in candidate_set.
|
||||
# Go through the original list of candidates and pick the first one
|
||||
# that's in candidate_set.
|
||||
if candidate_set is None:
|
||||
return None
|
||||
for candidate in candidates:
|
||||
if candidate in candidate_set:
|
||||
return candidate
|
||||
return None
|
||||
|
||||
# The BeautifulSoup class will take feature lists from developers and use them
|
||||
# to look up builders in this registry.
|
||||
builder_registry = TreeBuilderRegistry()
|
||||
|
||||
class TreeBuilder(object):
|
||||
"""Turn a document into a Beautiful Soup object tree."""
|
||||
|
||||
features = []
|
||||
|
||||
is_xml = False
|
||||
preserve_whitespace_tags = set()
|
||||
empty_element_tags = None # A tag will be considered an empty-element
|
||||
# tag when and only when it has no contents.
|
||||
|
||||
# A value for these tag/attribute combinations is a space- or
|
||||
# comma-separated list of CDATA, rather than a single CDATA.
|
||||
cdata_list_attributes = {}
|
||||
|
||||
|
||||
def __init__(self):
|
||||
self.soup = None
|
||||
|
||||
def reset(self):
|
||||
pass
|
||||
|
||||
def can_be_empty_element(self, tag_name):
|
||||
"""Might a tag with this name be an empty-element tag?
|
||||
|
||||
The final markup may or may not actually present this tag as
|
||||
self-closing.
|
||||
|
||||
For instance: an HTMLBuilder does not consider a <p> tag to be
|
||||
an empty-element tag (it's not in
|
||||
HTMLBuilder.empty_element_tags). This means an empty <p> tag
|
||||
will be presented as "<p></p>", not "<p />".
|
||||
|
||||
The default implementation has no opinion about which tags are
|
||||
empty-element tags, so a tag will be presented as an
|
||||
empty-element tag if and only if it has no contents.
|
||||
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
|
||||
be left alone.
|
||||
"""
|
||||
if self.empty_element_tags is None:
|
||||
return True
|
||||
return tag_name in self.empty_element_tags
|
||||
|
||||
def feed(self, markup):
|
||||
raise NotImplementedError()
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None):
|
||||
return markup, None, None, False
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""Wrap an HTML fragment to make it look like a document.
|
||||
|
||||
Different parsers do this differently. For instance, lxml
|
||||
introduces an empty <head> tag, and html5lib
|
||||
doesn't. Abstracting this away lets us write simple tests
|
||||
which run HTML fragments through the parser and compare the
|
||||
results against other HTML fragments.
|
||||
|
||||
This method should not be used outside of tests.
|
||||
"""
|
||||
return fragment
|
||||
|
||||
def set_up_substitutions(self, tag):
|
||||
return False
|
||||
|
||||
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
|
||||
"""Replaces class="foo bar" with class=["foo", "bar"]
|
||||
|
||||
Modifies its input in place.
|
||||
"""
|
||||
if not attrs:
|
||||
return attrs
|
||||
if self.cdata_list_attributes:
|
||||
universal = self.cdata_list_attributes.get('*', [])
|
||||
tag_specific = self.cdata_list_attributes.get(
|
||||
tag_name.lower(), None)
|
||||
for attr in attrs.keys():
|
||||
if attr in universal or (tag_specific and attr in tag_specific):
|
||||
# We have a "class"-type attribute whose string
|
||||
# value is a whitespace-separated list of
|
||||
# values. Split it into a list.
|
||||
value = attrs[attr]
|
||||
if isinstance(value, basestring):
|
||||
values = whitespace_re.split(value)
|
||||
else:
|
||||
# html5lib sometimes calls setAttributes twice
|
||||
# for the same tag when rearranging the parse
|
||||
# tree. On the second call the attribute value
|
||||
# here is already a list. If this happens,
|
||||
# leave the value alone rather than trying to
|
||||
# split it again.
|
||||
values = value
|
||||
attrs[attr] = values
|
||||
return attrs
|
||||
|
||||
class SAXTreeBuilder(TreeBuilder):
|
||||
"""A Beautiful Soup treebuilder that listens for SAX events."""
|
||||
|
||||
def feed(self, markup):
|
||||
raise NotImplementedError()
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
def startElement(self, name, attrs):
|
||||
attrs = dict((key[1], value) for key, value in list(attrs.items()))
|
||||
#print "Start %s, %r" % (name, attrs)
|
||||
self.soup.handle_starttag(name, attrs)
|
||||
|
||||
def endElement(self, name):
|
||||
#print "End %s" % name
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def startElementNS(self, nsTuple, nodeName, attrs):
|
||||
# Throw away (ns, nodeName) for now.
|
||||
self.startElement(nodeName, attrs)
|
||||
|
||||
def endElementNS(self, nsTuple, nodeName):
|
||||
# Throw away (ns, nodeName) for now.
|
||||
self.endElement(nodeName)
|
||||
#handler.endElementNS((ns, node.nodeName), node.nodeName)
|
||||
|
||||
def startPrefixMapping(self, prefix, nodeValue):
|
||||
# Ignore the prefix for now.
|
||||
pass
|
||||
|
||||
def endPrefixMapping(self, prefix):
|
||||
# Ignore the prefix for now.
|
||||
# handler.endPrefixMapping(prefix)
|
||||
pass
|
||||
|
||||
def characters(self, content):
|
||||
self.soup.handle_data(content)
|
||||
|
||||
def startDocument(self):
|
||||
pass
|
||||
|
||||
def endDocument(self):
|
||||
pass
|
||||
|
||||
|
||||
class HTMLTreeBuilder(TreeBuilder):
|
||||
"""This TreeBuilder knows facts about HTML.
|
||||
|
||||
Such as which tags are empty-element tags.
|
||||
"""
|
||||
|
||||
preserve_whitespace_tags = set(['pre', 'textarea'])
|
||||
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
|
||||
'spacer', 'link', 'frame', 'base'])
|
||||
|
||||
# The HTML standard defines these attributes as containing a
|
||||
# space-separated list of values, not a single value. That is,
|
||||
# class="foo bar" means that the 'class' attribute has two values,
|
||||
# 'foo' and 'bar', not the single value 'foo bar'. When we
|
||||
# encounter one of these attributes, we will parse its value into
|
||||
# a list of values if possible. Upon output, the list will be
|
||||
# converted back into a string.
|
||||
cdata_list_attributes = {
|
||||
"*" : ['class', 'accesskey', 'dropzone'],
|
||||
"a" : ['rel', 'rev'],
|
||||
"link" : ['rel', 'rev'],
|
||||
"td" : ["headers"],
|
||||
"th" : ["headers"],
|
||||
"td" : ["headers"],
|
||||
"form" : ["accept-charset"],
|
||||
"object" : ["archive"],
|
||||
|
||||
# These are HTML5 specific, as are *.accesskey and *.dropzone above.
|
||||
"area" : ["rel"],
|
||||
"icon" : ["sizes"],
|
||||
"iframe" : ["sandbox"],
|
||||
"output" : ["for"],
|
||||
}
|
||||
|
||||
def set_up_substitutions(self, tag):
|
||||
# We are only interested in <meta> tags
|
||||
if tag.name != 'meta':
|
||||
return False
|
||||
|
||||
http_equiv = tag.get('http-equiv')
|
||||
content = tag.get('content')
|
||||
charset = tag.get('charset')
|
||||
|
||||
# We are interested in <meta> tags that say what encoding the
|
||||
# document was originally in. This means HTML 5-style <meta>
|
||||
# tags that provide the "charset" attribute. It also means
|
||||
# HTML 4-style <meta> tags that provide the "content"
|
||||
# attribute and have "http-equiv" set to "content-type".
|
||||
#
|
||||
# In both cases we will replace the value of the appropriate
|
||||
# attribute with a standin object that can take on any
|
||||
# encoding.
|
||||
meta_encoding = None
|
||||
if charset is not None:
|
||||
# HTML 5 style:
|
||||
# <meta charset="utf8">
|
||||
meta_encoding = charset
|
||||
tag['charset'] = CharsetMetaAttributeValue(charset)
|
||||
|
||||
elif (content is not None and http_equiv is not None
|
||||
and http_equiv.lower() == 'content-type'):
|
||||
# HTML 4 style:
|
||||
# <meta http-equiv="content-type" content="text/html; charset=utf8">
|
||||
tag['content'] = ContentMetaAttributeValue(content)
|
||||
|
||||
return (meta_encoding is not None)
|
||||
|
||||
def register_treebuilders_from(module):
|
||||
"""Copy TreeBuilders from the given module into this module."""
|
||||
# I'm fairly sure this is not the best way to do this.
|
||||
this_module = sys.modules['bs4.builder']
|
||||
for name in module.__all__:
|
||||
obj = getattr(module, name)
|
||||
|
||||
if issubclass(obj, TreeBuilder):
|
||||
setattr(this_module, name, obj)
|
||||
this_module.__all__.append(name)
|
||||
# Register the builder while we're at it.
|
||||
this_module.builder_registry.register(obj)
|
||||
|
||||
class ParserRejectedMarkup(Exception):
    """Raised when a tree builder's underlying parser refuses to parse
    the markup it was given (so another builder/strategy can be tried)."""
|
||||
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last result.
# The HTMLParser builder wraps the standard library, so it is always
# importable and is registered first (lowest priority).
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    # Registered last, so lxml wins when several builders offer the
    # same features.
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
285
bs4/builder/_html5lib.py
Normal file
285
bs4/builder/_html5lib.py
Normal file
|
|
@ -0,0 +1,285 @@
|
|||
__all__ = [
|
||||
'HTML5TreeBuilder',
|
||||
]
|
||||
|
||||
import warnings
|
||||
from bs4.builder import (
|
||||
PERMISSIVE,
|
||||
HTML,
|
||||
HTML_5,
|
||||
HTMLTreeBuilder,
|
||||
)
|
||||
from bs4.element import NamespacedAttribute
|
||||
import html5lib
|
||||
from html5lib.constants import namespaces
|
||||
from bs4.element import (
|
||||
Comment,
|
||||
Doctype,
|
||||
NavigableString,
|
||||
Tag,
|
||||
)
|
||||
|
||||
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        """Yield a single (markup, encoding, declared encoding, replaced)
        strategy; html5lib performs its own encoding detection, so the
        markup is passed through untouched."""
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse *markup* with html5lib, building directly into self.soup."""
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        # html5lib calls create_treebuilder back to obtain the tree
        # builder that writes into self.soup.
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        # Keep a reference so later code can reach the builder html5lib
        # is actually using.
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><head></head><body>%s</body></html>' % fragment
|
||||
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """Glue class presenting Beautiful Soup's tree API through the
    interface html5lib expects of a tree builder."""

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        # A fresh document: reset the soup and wrap it as the root node.
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Turn an html5lib doctype token into a Doctype object."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        # BUG FIX: BeautifulSoup is not imported at this module's top
        # level, so referencing it here raised NameError. Import it
        # locally (a top-level import would be circular: bs4 imports
        # this module via bs4.builder).
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        # Delegate to html5lib's implementation, then unwrap to the
        # underlying Beautiful Soup object.
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
||||
class AttrList(object):
    """Adapter exposing an element's attributes through the dict-like
    interface html5lib expects.

    Reads come from a snapshot of the element's attributes taken at
    construction time; writes go straight through to the element.
    """

    def __init__(self, element):
        self.element = element
        # Snapshot: later writes via __setitem__ update the element but
        # are not reflected in this view (original behavior preserved).
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        return list(self.attrs.items()).__iter__()

    def __setitem__(self, name, value):
        # Write through to the underlying element.
        # BUG FIX: removed the dead expression statement
        # '"set attr", name, value' (residue of a debug print that had
        # no effect).
        self.element[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in list(self.attrs.keys())
||||
class Element(html5lib.treebuilders._base.Node):
    """An html5lib Node wrapping a Beautiful Soup Tag (or the soup
    itself), translating html5lib tree operations into Beautiful Soup
    tree operations."""

    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append a child node (Element, TextNode, Tag, or bare string),
        merging adjacent strings into one NavigableString."""
        string_child = child = None
        if isinstance(node, basestring):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
        else:
            child = node.element

        # Detach the child from any previous parent before re-homing it.
        if not isinstance(child, basestring) and child.parent is not None:
            node.element.extract()

        if (string_child and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, basestring):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes is not None and len(attributes) > 0:
            # Convert html5lib's (namespace, name) tuple keys into
            # NamespacedAttribute objects.
            # BUG FIX: removed the dead local 'converted_attributes = []'
            # (assigned but never used).
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert string data, either at the end or before a reference node."""
        if insertBefore:
            text = TextNode(self.soup.new_string(data), self.soup)
            # BUG FIX: the original passed the raw string 'data' to
            # insertBefore, which dereferences node.element and so
            # cannot accept a bare string; pass the TextNode built
            # above instead.
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(data)

    def insertBefore(self, node, refNode):
        index = self.element.index(refNode.element)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        # BUG FIX: removed the dead local 'append_after' (assigned the
        # new parent's contents but never read).
        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            first_child.previous_element = new_parents_last_descendant
            first_child.previous_sibling = new_parents_last_child

            # Fix the last child's next_element and next_sibling
            last_child = to_append[-1]
            last_child.next_element = new_parents_last_descendant_next_element
            last_child.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    def cloneNode(self):
        """Shallow-copy this element: same name, namespace and attributes."""
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        return self.element.contents

    def getNameTuple(self):
        # Unprefixed names belong to the HTML namespace.
        if self.namespace == None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
||||
class TextNode(Element):
    """An html5lib Node wrapping a single Beautiful Soup string object."""

    def __init__(self, element, soup):
        # Bypass Element.__init__: a text node has no tag name to pass
        # to the html5lib Node constructor.
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        # Text nodes are never cloned through this interface.
        raise NotImplementedError
||||
258
bs4/builder/_htmlparser.py
Normal file
258
bs4/builder/_htmlparser.py
Normal file
|
|
@ -0,0 +1,258 @@
|
|||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||
|
||||
__all__ = [
|
||||
'HTMLParserTreeBuilder',
|
||||
]
|
||||
|
||||
from HTMLParser import (
|
||||
HTMLParser,
|
||||
HTMLParseError,
|
||||
)
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
# True on Python 3.2.3 and later (including any later 3.x), where it is
# safe to pass strict=False to the HTMLParser constructor.
CONSTRUCTOR_TAKES_STRICT = (
    major > 3
    or (major == 3 and minor > 2)
    or (major == 3 and minor == 2 and release >= 3))
||||
|
||||
from bs4.element import (
|
||||
CData,
|
||||
Comment,
|
||||
Declaration,
|
||||
Doctype,
|
||||
ProcessingInstruction,
|
||||
)
|
||||
from bs4.dammit import EntitySubstitution, UnicodeDammit
|
||||
|
||||
from bs4.builder import (
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
STRICT,
|
||||
)
|
||||
|
||||
|
||||
HTMLPARSER = 'html.parser'
|
||||
|
||||
class BeautifulSoupHTMLParser(HTMLParser):
|
||||
def handle_starttag(self, name, attrs):
|
||||
# XXX namespace
|
||||
attr_dict = {}
|
||||
for key, value in attrs:
|
||||
# Change None attribute values to the empty string
|
||||
# for consistency with the other tree builders.
|
||||
if value is None:
|
||||
value = ''
|
||||
attr_dict[key] = value
|
||||
attrvalue = '""'
|
||||
self.soup.handle_starttag(name, None, None, attr_dict)
|
||||
|
||||
def handle_endtag(self, name):
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def handle_data(self, data):
|
||||
self.soup.handle_data(data)
|
||||
|
||||
def handle_charref(self, name):
|
||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||
# it's fixed.
|
||||
if name.startswith('x'):
|
||||
real_name = int(name.lstrip('x'), 16)
|
||||
elif name.startswith('X'):
|
||||
real_name = int(name.lstrip('X'), 16)
|
||||
else:
|
||||
real_name = int(name)
|
||||
|
||||
try:
|
||||
data = unichr(real_name)
|
||||
except (ValueError, OverflowError), e:
|
||||
data = u"\N{REPLACEMENT CHARACTER}"
|
||||
|
||||
self.handle_data(data)
|
||||
|
||||
def handle_entityref(self, name):
|
||||
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
|
||||
if character is not None:
|
||||
data = character
|
||||
else:
|
||||
data = "&%s;" % name
|
||||
self.handle_data(data)
|
||||
|
||||
def handle_comment(self, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def handle_decl(self, data):
|
||||
self.soup.endData()
|
||||
if data.startswith("DOCTYPE "):
|
||||
data = data[len("DOCTYPE "):]
|
||||
elif data == 'DOCTYPE':
|
||||
# i.e. "<!DOCTYPE>"
|
||||
data = ''
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Doctype)
|
||||
|
||||
def unknown_decl(self, data):
|
||||
if data.upper().startswith('CDATA['):
|
||||
cls = CData
|
||||
data = data[len('CDATA['):]
|
||||
else:
|
||||
cls = Declaration
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(cls)
|
||||
|
||||
def handle_pi(self, data):
|
||||
self.soup.endData()
|
||||
if data.endswith("?") and data.lower().startswith("xml"):
|
||||
# "An XHTML processing instruction using the trailing '?'
|
||||
# will cause the '?' to be included in data." - HTMLParser
|
||||
# docs.
|
||||
#
|
||||
# Strip the question mark so we don't end up with two
|
||||
# question marks.
|
||||
data = data[:-1]
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(ProcessingInstruction)
|
||||
|
||||
|
||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Tree builder backed by the standard library's HTMLParser."""

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        # On interpreters whose HTMLParser constructor accepts it,
        # request non-strict parsing (see CONSTRUCTOR_TAKES_STRICT).
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        # Saved and replayed for every parser created in feed().
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            # Already Unicode: nothing to detect or convert.
            yield (markup, None, None, False)
            return

        # Let UnicodeDammit pick an encoding, preferring the caller's
        # hint, then the document's own declaration.
        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Parse *markup* into self.soup with a fresh parser instance."""
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e
|
||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    # Tolerant attribute matcher backported from Python 3.2.3's
    # html.parser.
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    # Backported start-tag locator (verbose regex from 3.2.3).
    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Backport of Python 3.2.3's HTMLParser.parse_starttag.

        NOTE(review): because this function is defined at module level,
        'self.__starttag_text' is NOT name-mangled to
        '_HTMLParser__starttag_text' the way it is inside the stdlib
        class body; HTMLParser.get_starttag_text() may therefore read a
        stale attribute -- confirm before relying on it.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            # Strict mode uses the stdlib matcher; otherwise the
            # tolerant backport above.
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip matching quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Malformed start tag: compute a position for the error
            # message, then fall back to treating it as data.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Backport: scan only for the matching close tag inside CDATA
        content elements (script/style)."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the backport installed, strict=False behaves correctly.
    CONSTRUCTOR_TAKES_STRICT = True
||||
233
bs4/builder/_lxml.py
Normal file
233
bs4/builder/_lxml.py
Normal file
|
|
@ -0,0 +1,233 @@
|
|||
__all__ = [
|
||||
'LXMLTreeBuilderForXML',
|
||||
'LXMLTreeBuilder',
|
||||
]
|
||||
|
||||
from io import BytesIO
|
||||
from StringIO import StringIO
|
||||
import collections
|
||||
from lxml import etree
|
||||
from bs4.element import Comment, Doctype, NamespacedAttribute
|
||||
from bs4.builder import (
|
||||
FAST,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
PERMISSIVE,
|
||||
ParserRejectedMarkup,
|
||||
TreeBuilder,
|
||||
XML)
|
||||
from bs4.dammit import EncodingDetector
|
||||
|
||||
LXML = 'lxml'
|
||||
|
||||
class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
DEFAULT_PARSER_CLASS = etree.XMLParser
|
||||
|
||||
is_xml = True
|
||||
|
||||
# Well, it's permissive by XML parser standards.
|
||||
features = [LXML, XML, FAST, PERMISSIVE]
|
||||
|
||||
CHUNK_SIZE = 512
|
||||
|
||||
# This namespace mapping is specified in the XML Namespace
|
||||
# standard.
|
||||
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
|
||||
|
||||
def default_parser(self, encoding):
|
||||
# This can either return a parser object or a class, which
|
||||
# will be instantiated with default arguments.
|
||||
if self._default_parser is not None:
|
||||
return self._default_parser
|
||||
return etree.XMLParser(
|
||||
target=self, strip_cdata=False, recover=True, encoding=encoding)
|
||||
|
||||
def parser_for(self, encoding):
|
||||
# Use the default parser.
|
||||
parser = self.default_parser(encoding)
|
||||
|
||||
if isinstance(parser, collections.Callable):
|
||||
# Instantiate the parser with default arguments
|
||||
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
||||
return parser
|
||||
|
||||
def __init__(self, parser=None, empty_element_tags=None):
|
||||
# TODO: Issue a warning if parser is present but not a
|
||||
# callable, since that means there's no way to create new
|
||||
# parsers for different encodings.
|
||||
self._default_parser = parser
|
||||
if empty_element_tags is not None:
|
||||
self.empty_element_tags = set(empty_element_tags)
|
||||
self.soup = None
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||
|
||||
def _getNsTag(self, tag):
|
||||
# Split the namespace URL out of a fully-qualified lxml tag
|
||||
# name. Copied from lxml's src/lxml/sax.py.
|
||||
if tag[0] == '{':
|
||||
return tuple(tag[1:].split('}', 1))
|
||||
else:
|
||||
return (None, tag)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None):
|
||||
"""
|
||||
:yield: A series of 4-tuples.
|
||||
(markup, encoding, declared encoding,
|
||||
has undergone character replacement)
|
||||
|
||||
Each 4-tuple represents a strategy for parsing the document.
|
||||
"""
|
||||
if isinstance(markup, unicode):
|
||||
# We were given Unicode. Maybe lxml can parse Unicode on
|
||||
# this system?
|
||||
yield markup, None, document_declared_encoding, False
|
||||
|
||||
if isinstance(markup, unicode):
|
||||
# No, apparently not. Convert the Unicode to UTF-8 and
|
||||
# tell lxml to parse it as UTF-8.
|
||||
yield (markup.encode("utf8"), "utf8",
|
||||
document_declared_encoding, False)
|
||||
|
||||
# Instead of using UnicodeDammit to convert the bytestring to
|
||||
# Unicode using different encodings, use EncodingDetector to
|
||||
# iterate over the encodings, and tell lxml to try to parse
|
||||
# the document as each one in turn.
|
||||
is_html = not self.is_xml
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
detector = EncodingDetector(markup, try_encodings, is_html)
|
||||
for encoding in detector.encodings:
|
||||
yield (detector.markup, encoding, document_declared_encoding, False)
|
||||
|
||||
def feed(self, markup):
|
||||
if isinstance(markup, bytes):
|
||||
markup = BytesIO(markup)
|
||||
elif isinstance(markup, unicode):
|
||||
markup = StringIO(markup)
|
||||
|
||||
# Call feed() at least once, even if the markup is empty,
|
||||
# or the parser won't be initialized.
|
||||
data = markup.read(self.CHUNK_SIZE)
|
||||
try:
|
||||
self.parser = self.parser_for(self.soup.original_encoding)
|
||||
self.parser.feed(data)
|
||||
while len(data) != 0:
|
||||
# Now call feed() on the rest of the data, chunk by chunk.
|
||||
data = markup.read(self.CHUNK_SIZE)
|
||||
if len(data) != 0:
|
||||
self.parser.feed(data)
|
||||
self.parser.close()
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
||||
raise ParserRejectedMarkup(str(e))
|
||||
|
||||
def close(self):
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||
|
||||
def start(self, name, attrs, nsmap={}):
|
||||
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||
attrs = dict(attrs)
|
||||
nsprefix = None
|
||||
# Invert each namespace map as it comes in.
|
||||
if len(self.nsmaps) > 1:
|
||||
# There are no new namespaces for this tag, but
|
||||
# non-default namespaces are in play, so we need a
|
||||
# separate tag stack to know when they end.
|
||||
self.nsmaps.append(None)
|
||||
elif len(nsmap) > 0:
|
||||
# A new namespace mapping has come into play.
|
||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
||||
self.nsmaps.append(inverted_nsmap)
|
||||
# Also treat the namespace mapping as a set of attributes on the
|
||||
# tag, so we can recreate it later.
|
||||
attrs = attrs.copy()
|
||||
for prefix, namespace in nsmap.items():
|
||||
attribute = NamespacedAttribute(
|
||||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||
attrs[attribute] = namespace
|
||||
|
||||
# Namespaces are in play. Find any attributes that came in
|
||||
# from lxml with namespaces attached to their names, and
|
||||
# turn then into NamespacedAttribute objects.
|
||||
new_attrs = {}
|
||||
for attr, value in attrs.items():
|
||||
namespace, attr = self._getNsTag(attr)
|
||||
if namespace is None:
|
||||
new_attrs[attr] = value
|
||||
else:
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
attr = NamespacedAttribute(nsprefix, attr, namespace)
|
||||
new_attrs[attr] = value
|
||||
attrs = new_attrs
|
||||
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
|
||||
|
||||
def _prefix_for_namespace(self, namespace):
|
||||
"""Find the currently active prefix for the given namespace."""
|
||||
if namespace is None:
|
||||
return None
|
||||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
return inverted_nsmap[namespace]
|
||||
return None
|
||||
|
||||
def end(self, name):
|
||||
self.soup.endData()
|
||||
completed_tag = self.soup.tagStack[-1]
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = None
|
||||
if namespace is not None:
|
||||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
nsprefix = inverted_nsmap[namespace]
|
||||
break
|
||||
self.soup.handle_endtag(name, nsprefix)
|
||||
if len(self.nsmaps) > 1:
|
||||
# This tag, or one of its parents, introduced a namespace
|
||||
# mapping, so pop it off the stack.
|
||||
self.nsmaps.pop()
|
||||
|
||||
def pi(self, target, data):
|
||||
pass
|
||||
|
||||
def data(self, content):
|
||||
self.soup.handle_data(content)
|
||||
|
||||
def doctype(self, name, pubid, system):
|
||||
self.soup.endData()
|
||||
doctype = Doctype.for_name_and_ids(name, pubid, system)
|
||||
self.soup.object_was_parsed(doctype)
|
||||
|
||||
def comment(self, content):
|
||||
"Handle comments as Comment objects."
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(content)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
||||
|
||||
|
||||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """lxml-based tree builder for (possibly malformed) HTML."""

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    def default_parser(self, encoding):
        # Return the class itself; parser_for() instantiates it with
        # the right target and encoding.
        return etree.HTMLParser

    def feed(self, markup):
        """Parse *markup* in one shot (no chunking, unlike the XML builder)."""
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
            raise ParserRejectedMarkup(str(e))


    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><body>%s</body></html>' % fragment
||||
829
bs4/dammit.py
Normal file
829
bs4/dammit.py
Normal file
|
|
@ -0,0 +1,829 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""Beautiful Soup bonus library: Unicode, Dammit
|
||||
|
||||
This library converts a bytestream to Unicode through any means
|
||||
necessary. It is heavily based on code from Mark Pilgrim's Universal
|
||||
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
||||
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||||
"""
|
||||
|
||||
import codecs
|
||||
from htmlentitydefs import codepoint2name
|
||||
import re
|
||||
import logging
|
||||
import string
|
||||
|
||||
# Import a library to autodetect character encodings. Whichever branch
# succeeds defines chardet_dammit(s) -> encoding name or None.
chardet_type = None
try:
    # First try the fast C implementation.
    # PyPI package: cchardet
    import cchardet
    def chardet_dammit(s):
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet
        def chardet_dammit(s):
            return chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            return None

# Available from http://cjkpython.i18n.org/.
try:
    import iconv_codec
except ImportError:
    pass

# Byte-pattern regexes (note the .encode()) for pulling an encoding
# declaration out of raw markup: an XML declaration's encoding
# pseudo-attribute, and an HTML <meta ... charset=...> declaration.
xml_encoding_re = re.compile(
    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
|
||||
class EntitySubstitution(object):
    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the character<->HTML-entity lookup tables and the regex
        matching every character that has a named entity.

        Runs once at class-definition time; the function name is not
        retained as a method.
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in list(codepoint2name.items()):
            character = unichr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five XML special characters and their entity names.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # Matches angle brackets, and ampersands that do NOT appear to begin
    # a numeric or named entity reference. Raw strings: \d and \w are
    # regex escapes, not string escapes.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    # Matches angle brackets and every ampersand, entity or not.
    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for an HTML special character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

         Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

         Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

         Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
        """
        # NOTE: this is a @classmethod; the first parameter was formerly
        # (confusingly) named `self` and is now `cls` for consistency
        # with the other classmethods. No class state is actually used.
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity defition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
|
||||
|
||||
|
||||
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    3. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    4. UTF-8.

    5. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False):
        """:param markup: The bytestring to be examined.
        :param override_encodings: Encodings to try before anything else.
        :param is_html: If true, <meta> tags are consulted when looking
            for a declared encoding.
        """
        self.override_encodings = override_encodings or []
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Return True the first time a non-None encoding is seen,
        recording it in `tried` so it is never yielded twice."""
        if encoding is not None:
            encoding = encoding.lower()
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        Returns a (data, encoding) tuple; `encoding` is None when no BOM
        was found.
        """
        encoding = None
        # BUG FIX: the two UTF-16 branches must rule out a following
        # NUL-NUL pair, which would instead indicate a UTF-32 BOM. The
        # comparison previously used the *text* literal '\x00\x00',
        # which never compares equal to a bytes slice on Python 3, so
        # UTF-32 data was misidentified as UTF-16.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            # Only scan a prefix: declarations are expected near the top
            # of the document.
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii')
        if declared_encoding:
            return declared_encoding.lower()
        return None
|
||||
|
||||
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    # Encodings in which Microsoft "smart quote" bytes (0x80-0x9f) may
    # legitimately appear; _convert_from only rewrites smart quotes when
    # decoding from one of these.
    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]
|
||||
|
||||
def __init__(self, markup, override_encodings=None,
             smart_quotes_to=None, is_html=False):
    """Detect the encoding of `markup` and decode it to Unicode.

    :param markup: A bytestring, or an already-decoded Unicode string
        (which is passed through untouched).
    :param override_encodings: Encodings to try before any sniffed or
        detected one. BUG FIX: this was previously the mutable default
        argument `[]`; `None` is equivalent (EncodingDetector treats any
        false value as "no overrides") and cannot be accidentally shared
        between calls.
    :param smart_quotes_to: 'xml', 'html' or 'ascii' to convert Windows
        smart-quote bytes to XML entities, HTML entities, or ASCII
        approximations; None to leave them alone.
    :param is_html: If true, <meta> tags are consulted when looking for
        a declared encoding.
    """
    self.smart_quotes_to = smart_quotes_to
    self.tried_encodings = []
    self.contains_replacement_characters = False
    self.is_html = is_html

    self.detector = EncodingDetector(markup, override_encodings, is_html)

    # Short-circuit if the data is in Unicode to begin with.
    if isinstance(markup, unicode) or markup == '':
        self.markup = markup
        self.unicode_markup = unicode(markup)
        self.original_encoding = None
        return

    # The encoding detector may have stripped a byte-order mark.
    # Use the stripped markup from this point on.
    self.markup = self.detector.markup

    u = None
    for encoding in self.detector.encodings:
        markup = self.detector.markup
        u = self._convert_from(encoding)
        if u is not None:
            break

    if not u:
        # None of the encodings worked. As an absolute last resort,
        # try them again with character replacement.
        for encoding in self.detector.encodings:
            if encoding != "ascii":
                u = self._convert_from(encoding, "replace")
            if u is not None:
                logging.warning(
                    "Some characters could not be decoded, and were "
                    "replaced with REPLACEMENT CHARACTER.")
                self.contains_replacement_characters = True
                break

    # If none of that worked, we could at this point force it to
    # ASCII, but that would destroy so much data that I think
    # giving up is better.
    self.unicode_markup = u
    if not u:
        self.original_encoding = None
|
||||
|
||||
def _sub_ms_char(self, match):
    """Changes a MS smart quote character to an XML or HTML
    entity, or an ASCII character.

    `match` is a regex match over a single byte in the 0x80-0x9f
    range; the returned replacement is always a bytestring.
    """
    orig = match.group(1)
    if self.smart_quotes_to == 'ascii':
        # Plain-ASCII approximation, e.g. a curly quote becomes '.
        sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
    else:
        sub = self.MS_CHARS.get(orig)
        if type(sub) == tuple:
            # An (entity_name, hex_codepoint) pair: pick the requested
            # entity form.
            if self.smart_quotes_to == 'xml':
                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
            else:
                sub = '&'.encode() + sub[0].encode() + ';'.encode()
        else:
            # Already a plain replacement string.
            sub = sub.encode()
    return sub
|
||||
|
||||
def _convert_from(self, proposed, errors="strict"):
    """Attempt to decode self.markup using the `proposed` encoding.

    Returns the decoded markup on success, or None on failure
    (unknown codec, an already-tried (codec, errors) combination, or
    a decoding error). On success, also records
    self.original_encoding and replaces self.markup with the Unicode
    result.
    """
    proposed = self.find_codec(proposed)
    if not proposed or (proposed, errors) in self.tried_encodings:
        # Don't waste time retrying a combination that already failed.
        return None
    self.tried_encodings.append((proposed, errors))
    markup = self.markup
    # Convert smart quotes to HTML if coming from an encoding
    # that might have them.
    if (self.smart_quotes_to is not None
        and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
        smart_quotes_re = b"([\x80-\x9f])"
        smart_quotes_compiled = re.compile(smart_quotes_re)
        markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

    try:
        #print "Trying to convert document to %s (errors=%s)" % (
        #    proposed, errors)
        u = self._to_unicode(markup, proposed, errors)
        self.markup = u
        self.original_encoding = proposed
    except Exception as e:
        # Any decoding failure just means this wasn't the right
        # encoding; the caller will try the next candidate.
        #print "That didn't work!"
        #print e
        return None
    #print "Correct encoding: %s" % proposed
    return self.markup
|
||||
|
||||
def _to_unicode(self, data, encoding, errors="strict"):
    '''Given a string and its encoding, decodes the string into Unicode.
    %encoding is a string recognized by encodings.aliases'''
    # Raises on undecodable input unless errors="replace" etc.
    return unicode(data, encoding, errors)
|
||||
|
||||
@property
def declared_html_encoding(self):
    """The encoding declared within the document itself, if any.

    Only meaningful for HTML documents; for non-HTML input this is
    always None.
    """
    return self.detector.declared_encoding if self.is_html else None
|
||||
|
||||
def find_codec(self, charset):
    """Map a charset name found in a document to a usable codec name.

    Candidates are tried lazily, in order: the known alias table, the
    name with hyphens removed, the name with hyphens replaced by
    underscores, and finally the (lowercased) name itself. The first
    truthy candidate is returned lowercased; a false-y charset yields
    None.
    """
    def candidates():
        yield self._codec(self.CHARSET_ALIASES.get(charset, charset))
        if charset:
            yield self._codec(charset.replace("-", ""))
            yield self._codec(charset.replace("-", "_"))
            yield charset.lower()
        yield charset

    for candidate in candidates():
        if candidate:
            return candidate.lower()
    return None
|
||||
|
||||
def _codec(self, charset):
|
||||
if not charset:
|
||||
return charset
|
||||
codec = None
|
||||
try:
|
||||
codecs.lookup(charset)
|
||||
codec = charset
|
||||
except (LookupError, ValueError):
|
||||
pass
|
||||
return codec
|
||||
|
||||
|
||||
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
# Each value is either an (entity_name, hex_codepoint_string) tuple, or a
# plain replacement string for bytes with no good entity equivalent
# (these bytes are undefined in Windows-1252).
MS_CHARS = {b'\x80': ('euro', '20AC'),
            b'\x81': ' ',
            b'\x82': ('sbquo', '201A'),
            b'\x83': ('fnof', '192'),
            b'\x84': ('bdquo', '201E'),
            b'\x85': ('hellip', '2026'),
            b'\x86': ('dagger', '2020'),
            b'\x87': ('Dagger', '2021'),
            b'\x88': ('circ', '2C6'),
            b'\x89': ('permil', '2030'),
            b'\x8A': ('Scaron', '160'),
            b'\x8B': ('lsaquo', '2039'),
            b'\x8C': ('OElig', '152'),
            b'\x8D': '?',
            b'\x8E': ('#x17D', '17D'),
            b'\x8F': '?',
            b'\x90': '?',
            b'\x91': ('lsquo', '2018'),
            b'\x92': ('rsquo', '2019'),
            b'\x93': ('ldquo', '201C'),
            b'\x94': ('rdquo', '201D'),
            b'\x95': ('bull', '2022'),
            b'\x96': ('ndash', '2013'),
            b'\x97': ('mdash', '2014'),
            b'\x98': ('tilde', '2DC'),
            b'\x99': ('trade', '2122'),
            b'\x9a': ('scaron', '161'),
            b'\x9b': ('rsaquo', '203A'),
            b'\x9c': ('oelig', '153'),
            b'\x9d': '?',
            b'\x9e': ('#x17E', '17E'),
            b'\x9f': ('Yuml', ''),}
|
||||
|
||||
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn á into a, but also
# contains non-horrors like turning “ into ".
#
# Every value must be a plain string: _sub_ms_char calls .encode() on
# the looked-up value directly when smart_quotes_to == 'ascii'.
MS_CHARS_TO_ASCII = {
    b'\x80': 'EUR', b'\x81': ' ', b'\x82': ',', b'\x83': 'f',
    b'\x84': ',,', b'\x85': '...', b'\x86': '+', b'\x87': '++',
    b'\x88': '^', b'\x89': '%', b'\x8a': 'S', b'\x8b': '<',
    b'\x8c': 'OE', b'\x8d': '?', b'\x8e': 'Z', b'\x8f': '?',
    b'\x90': '?', b'\x91': "'", b'\x92': "'", b'\x93': '"',
    b'\x94': '"', b'\x95': '*', b'\x96': '-', b'\x97': '--',
    b'\x98': '~', b'\x99': '(TM)', b'\x9a': 's', b'\x9b': '>',
    b'\x9c': 'oe', b'\x9d': '?', b'\x9e': 'z', b'\x9f': 'Y',
    b'\xa0': ' ', b'\xa1': '!', b'\xa2': 'c', b'\xa3': 'GBP',
    # 0xa4's approximation is especially parochial--this is the
    # generic currency symbol.
    b'\xa4': '$', b'\xa5': 'YEN', b'\xa6': '|', b'\xa7': 'S',
    b'\xa8': '..',
    # BUG FIX: 0xa9 (the copyright sign) previously mapped to the empty
    # string, silently deleting it; '(c)' matches the treatment of (R)
    # and (TM) below.
    b'\xa9': '(c)',
    b'\xaa': '(th)', b'\xab': '<<', b'\xac': '!', b'\xad': ' ',
    b'\xae': '(R)', b'\xaf': '-', b'\xb0': 'o', b'\xb1': '+-',
    b'\xb2': '2', b'\xb3': '3',
    # BUG FIX: 0xb4 (acute accent) was the tuple ("'", 'acute'), which
    # would crash _sub_ms_char's .encode() call in 'ascii' mode; the
    # value must be a plain string like every other entry.
    b'\xb4': "'",
    b'\xb5': 'u', b'\xb6': 'P', b'\xb7': '*', b'\xb8': ',',
    b'\xb9': '1', b'\xba': '(th)', b'\xbb': '>>', b'\xbc': '1/4',
    b'\xbd': '1/2', b'\xbe': '3/4', b'\xbf': '?',
    b'\xc0': 'A', b'\xc1': 'A', b'\xc2': 'A', b'\xc3': 'A',
    b'\xc4': 'A', b'\xc5': 'A', b'\xc6': 'AE', b'\xc7': 'C',
    b'\xc8': 'E', b'\xc9': 'E', b'\xca': 'E', b'\xcb': 'E',
    b'\xcc': 'I', b'\xcd': 'I', b'\xce': 'I', b'\xcf': 'I',
    b'\xd0': 'D', b'\xd1': 'N', b'\xd2': 'O', b'\xd3': 'O',
    b'\xd4': 'O', b'\xd5': 'O', b'\xd6': 'O', b'\xd7': '*',
    b'\xd8': 'O', b'\xd9': 'U', b'\xda': 'U', b'\xdb': 'U',
    b'\xdc': 'U', b'\xdd': 'Y', b'\xde': 'b', b'\xdf': 'B',
    b'\xe0': 'a', b'\xe1': 'a', b'\xe2': 'a', b'\xe3': 'a',
    b'\xe4': 'a', b'\xe5': 'a', b'\xe6': 'ae', b'\xe7': 'c',
    b'\xe8': 'e', b'\xe9': 'e', b'\xea': 'e', b'\xeb': 'e',
    b'\xec': 'i', b'\xed': 'i', b'\xee': 'i', b'\xef': 'i',
    b'\xf0': 'o', b'\xf1': 'n', b'\xf2': 'o', b'\xf3': 'o',
    b'\xf4': 'o', b'\xf5': 'o', b'\xf6': 'o', b'\xf7': '/',
    b'\xf8': 'o', b'\xf9': 'u', b'\xfa': 'u', b'\xfb': 'u',
    b'\xfc': 'u', b'\xfd': 'y', b'\xfe': 'b', b'\xff': 'y',
    }
|
||||
|
||||
# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents: maps each Windows-1252 byte
# value to the UTF-8 encoding of the same character.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252.
WINDOWS_1252_TO_UTF8 = {
    # 0x80-0x9f: the Windows-1252 "smart punctuation" block.
    0x80: b'\xe2\x82\xac', 0x82: b'\xe2\x80\x9a', 0x83: b'\xc6\x92',
    0x84: b'\xe2\x80\x9e', 0x85: b'\xe2\x80\xa6', 0x86: b'\xe2\x80\xa0',
    0x87: b'\xe2\x80\xa1', 0x88: b'\xcb\x86', 0x89: b'\xe2\x80\xb0',
    0x8a: b'\xc5\xa0', 0x8b: b'\xe2\x80\xb9', 0x8c: b'\xc5\x92',
    0x8e: b'\xc5\xbd', 0x91: b'\xe2\x80\x98', 0x92: b'\xe2\x80\x99',
    0x93: b'\xe2\x80\x9c', 0x94: b'\xe2\x80\x9d', 0x95: b'\xe2\x80\xa2',
    0x96: b'\xe2\x80\x93', 0x97: b'\xe2\x80\x94', 0x98: b'\xcb\x9c',
    0x99: b'\xe2\x84\xa2', 0x9a: b'\xc5\xa1', 0x9b: b'\xe2\x80\xba',
    0x9c: b'\xc5\x93', 0x9e: b'\xc5\xbe', 0x9f: b'\xc5\xb8',
    # 0xa0-0xbf map to b'\xc2' + the same byte in UTF-8.
    0xa0: b'\xc2\xa0', 0xa1: b'\xc2\xa1', 0xa2: b'\xc2\xa2',
    0xa3: b'\xc2\xa3', 0xa4: b'\xc2\xa4', 0xa5: b'\xc2\xa5',
    0xa6: b'\xc2\xa6', 0xa7: b'\xc2\xa7', 0xa8: b'\xc2\xa8',
    0xa9: b'\xc2\xa9', 0xaa: b'\xc2\xaa', 0xab: b'\xc2\xab',
    0xac: b'\xc2\xac', 0xad: b'\xc2\xad', 0xae: b'\xc2\xae',
    0xaf: b'\xc2\xaf', 0xb0: b'\xc2\xb0', 0xb1: b'\xc2\xb1',
    0xb2: b'\xc2\xb2', 0xb3: b'\xc2\xb3', 0xb4: b'\xc2\xb4',
    0xb5: b'\xc2\xb5', 0xb6: b'\xc2\xb6', 0xb7: b'\xc2\xb7',
    0xb8: b'\xc2\xb8', 0xb9: b'\xc2\xb9', 0xba: b'\xc2\xba',
    0xbb: b'\xc2\xbb', 0xbc: b'\xc2\xbc', 0xbd: b'\xc2\xbd',
    0xbe: b'\xc2\xbe', 0xbf: b'\xc2\xbf',
    # 0xc0-0xfe map to b'\xc3' + (byte - 0x40) in UTF-8.
    0xc0: b'\xc3\x80', 0xc1: b'\xc3\x81', 0xc2: b'\xc3\x82',
    0xc3: b'\xc3\x83', 0xc4: b'\xc3\x84', 0xc5: b'\xc3\x85',
    0xc6: b'\xc3\x86', 0xc7: b'\xc3\x87', 0xc8: b'\xc3\x88',
    0xc9: b'\xc3\x89', 0xca: b'\xc3\x8a', 0xcb: b'\xc3\x8b',
    0xcc: b'\xc3\x8c', 0xcd: b'\xc3\x8d', 0xce: b'\xc3\x8e',
    0xcf: b'\xc3\x8f', 0xd0: b'\xc3\x90', 0xd1: b'\xc3\x91',
    0xd2: b'\xc3\x92', 0xd3: b'\xc3\x93', 0xd4: b'\xc3\x94',
    0xd5: b'\xc3\x95', 0xd6: b'\xc3\x96', 0xd7: b'\xc3\x97',
    0xd8: b'\xc3\x98', 0xd9: b'\xc3\x99', 0xda: b'\xc3\x9a',
    0xdb: b'\xc3\x9b', 0xdc: b'\xc3\x9c', 0xdd: b'\xc3\x9d',
    0xde: b'\xc3\x9e', 0xdf: b'\xc3\x9f', 0xe0: b'\xc3\xa0',
    # BUG FIX: 0xe1 was mapped to b'\xa1', which is not valid UTF-8;
    # the UTF-8 encoding of U+00E1 (á) is b'\xc3\xa1'.
    0xe1: b'\xc3\xa1',
    0xe2: b'\xc3\xa2', 0xe3: b'\xc3\xa3', 0xe4: b'\xc3\xa4',
    0xe5: b'\xc3\xa5', 0xe6: b'\xc3\xa6', 0xe7: b'\xc3\xa7',
    0xe8: b'\xc3\xa8', 0xe9: b'\xc3\xa9', 0xea: b'\xc3\xaa',
    0xeb: b'\xc3\xab', 0xec: b'\xc3\xac', 0xed: b'\xc3\xad',
    0xee: b'\xc3\xae', 0xef: b'\xc3\xaf', 0xf0: b'\xc3\xb0',
    0xf1: b'\xc3\xb1', 0xf2: b'\xc3\xb2', 0xf3: b'\xc3\xb3',
    0xf4: b'\xc3\xb4', 0xf5: b'\xc3\xb5', 0xf6: b'\xc3\xb6',
    0xf7: b'\xc3\xb7', 0xf8: b'\xc3\xb8', 0xf9: b'\xc3\xb9',
    0xfa: b'\xc3\xba', 0xfb: b'\xc3\xbb', 0xfc: b'\xc3\xbc',
    0xfd: b'\xc3\xbd', 0xfe: b'\xc3\xbe',
    }
|
||||
|
||||
# Byte-value ranges that begin a multi-byte UTF-8 sequence, with the
# total length (in bytes) of such a sequence. Used by detwingle to skip
# over legitimate UTF-8 characters.
MULTIBYTE_MARKERS_AND_SIZES = [
    (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
    (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
    (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
    ]

# Cached endpoints of the table above, for detwingle's fast range check.
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
|
||||
|
||||
@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
              embedded_encoding="windows-1252"):
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    The input must be a bytestring. If you've already converted
    the document to Unicode, you're too late.

    The output is a bytestring in which `embedded_encoding`
    characters have been converted to their `main_encoding`
    equivalents.

    :raises NotImplementedError: for any other encoding combination.
    """
    if embedded_encoding.replace('_', '-').lower() not in (
        'windows-1252', 'windows_1252'):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings.")

    if main_encoding.lower() not in ('utf8', 'utf-8'):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding.")

    byte_chunks = []

    chunk_start = 0
    pos = 0
    while pos < len(in_bytes):
        byte = in_bytes[pos]
        if not isinstance(byte, int):
            # Python 2.x: indexing a str yields a 1-char str, not an int.
            byte = ord(byte)
        if (byte >= cls.FIRST_MULTIBYTE_MARKER
            and byte <= cls.LAST_MULTIBYTE_MARKER):
            # This is the start of a UTF-8 multibyte character. Skip
            # to the end.
            for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if byte >= start and byte <= end:
                    pos += size
                    break
        elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
            # We found a Windows-1252 character!
            # Save the string up to this point as a chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])

            # Now translate the Windows-1252 character into UTF-8
            # and add it as another, one-byte chunk.
            byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
            pos += 1
            chunk_start = pos
        else:
            # Go on to the next character.
            pos += 1
    # chunk_start only moves off zero when a translation happened, so
    # it doubles as a "was anything changed?" flag.
    if chunk_start == 0:
        # The string is unchanged.
        return in_bytes
    else:
        # Store the final chunk.
        byte_chunks.append(in_bytes[chunk_start:])
    return b''.join(byte_chunks)
|
||||
|
||||
204
bs4/diagnose.py
Normal file
204
bs4/diagnose.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||
import cProfile
|
||||
from StringIO import StringIO
|
||||
from HTMLParser import HTMLParser
|
||||
import bs4
|
||||
from bs4 import BeautifulSoup, __version__
|
||||
from bs4.builder import builder_registry
|
||||
|
||||
import os
|
||||
import pstats
|
||||
import random
|
||||
import tempfile
|
||||
import time
|
||||
import traceback
|
||||
import sys
|
||||
import cProfile
|
||||
|
||||
def diagnose(data):
|
||||
"""Diagnostic suite for isolating common problems."""
|
||||
print "Diagnostic running on Beautiful Soup %s" % __version__
|
||||
print "Python version %s" % sys.version
|
||||
|
||||
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
||||
for name in basic_parsers:
|
||||
for builder in builder_registry.builders:
|
||||
if name in builder.features:
|
||||
break
|
||||
else:
|
||||
basic_parsers.remove(name)
|
||||
print (
|
||||
"I noticed that %s is not installed. Installing it may help." %
|
||||
name)
|
||||
|
||||
if 'lxml' in basic_parsers:
|
||||
basic_parsers.append(["lxml", "xml"])
|
||||
from lxml import etree
|
||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
||||
|
||||
if 'html5lib' in basic_parsers:
|
||||
import html5lib
|
||||
print "Found html5lib version %s" % html5lib.__version__
|
||||
|
||||
if hasattr(data, 'read'):
|
||||
data = data.read()
|
||||
elif os.path.exists(data):
|
||||
print '"%s" looks like a filename. Reading data from the file.' % data
|
||||
data = open(data).read()
|
||||
elif data.startswith("http:") or data.startswith("https:"):
|
||||
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
|
||||
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
|
||||
return
|
||||
print
|
||||
|
||||
for parser in basic_parsers:
|
||||
print "Trying to parse your markup with %s" % parser
|
||||
success = False
|
||||
try:
|
||||
soup = BeautifulSoup(data, parser)
|
||||
success = True
|
||||
except Exception, e:
|
||||
print "%s could not parse the markup." % parser
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print "Here's what %s did with the markup:" % parser
|
||||
print soup.prettify()
|
||||
|
||||
print "-" * 80
|
||||
|
||||
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.

    :param data: Markup as a string (wrapped in StringIO for lxml).
    :param html: If true, use lxml's HTML parser rather than its
        XML parser.
    :param kwargs: Passed straight through to etree.iterparse.
    """
    from lxml import etree
    for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
        print("%s, %4s, %s" % (event, element.tag, element.text))
|
||||
|
||||
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        # Single choke point for output, so the report format can be
        # changed in one place.
        print(s)

    def handle_starttag(self, name, attrs):
        self._p("%s START" % name)

    def handle_endtag(self, name):
        self._p("%s END" % name)

    def handle_data(self, data):
        self._p("%s DATA" % data)

    def handle_charref(self, name):
        self._p("%s CHARREF" % name)

    def handle_entityref(self, name):
        self._p("%s ENTITYREF" % name)

    def handle_comment(self, data):
        self._p("%s COMMENT" % data)

    def handle_decl(self, data):
        self._p("%s DECL" % data)

    def unknown_decl(self, data):
        self._p("%s UNKNOWN-DECL" % data)

    def handle_pi(self, data):
        self._p("%s PI" % data)
|
||||
|
||||
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    AnnouncingParser().feed(data)
|
||||
|
||||
_vowels = "aeiou"
|
||||
_consonants = "bcdfghjklmnpqrstvwxyz"
|
||||
|
||||
def rword(length=5):
|
||||
"Generate a random word-like string."
|
||||
s = ''
|
||||
for i in range(length):
|
||||
if i % 2 == 0:
|
||||
t = _consonants
|
||||
else:
|
||||
t = _vowels
|
||||
s += random.choice(t)
|
||||
return s
|
||||
|
||||
def rsentence(length=4):
    "Generate a random sentence-like string."
    words = []
    for _ in range(length):
        words.append(rword(random.randint(4, 9)))
    return " ".join(words)
|
||||
|
||||
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document."""
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for _ in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # Open a random tag.
            elements.append("<%s>" % random.choice(tag_names))
        elif choice == 1:
            # Some text content.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a random (possibly never-opened) tag.
            elements.append("</%s>" % random.choice(tag_names))
        # choice == 3: contribute nothing for this slot.
    return "<html>" + "\n".join(elements) + "</html>"
|
||||
|
||||
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates one large invalid document, then times each parser on it
    through BS4, followed by raw lxml and raw html5lib as baselines.
    """
    print "Comparative parser benchmark on Beautiful Soup %s" % __version__
    data = rdoc(num_elements)
    print "Generated a large invalid HTML document (%d bytes)." % len(data)

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception, e:
            print "%s could not parse the markup." % parser
            traceback.print_exc()
        if success:
            print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)

    # Baseline: lxml on its own, without BS4 tree-building overhead.
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print "Raw lxml parsed the markup in %.2fs." % (b-a)

    # Baseline: html5lib on its own.
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print "Raw html5lib parsed the markup in %.2fs." % (b-a)
|
||||
|
||||
def profile(num_elements=100000, parser="lxml"):
    """Profile BeautifulSoup parsing of a large random document.

    Generates a document of `num_elements` random elements, parses it
    with the given parser under cProfile, and prints the 50 most
    expensive bs4/html5lib entries by cumulative time.
    """
    # Keep a reference to the NamedTemporaryFile object itself: the
    # stats file would be deleted as soon as the handle was collected.
    filehandle = tempfile.NamedTemporaryFile()
    filename = filehandle.name

    data = rdoc(num_elements)
    # BUG FIX: this dict was previously named `vars`, shadowing the
    # builtin vars().
    namespace = dict(bs4=bs4, data=data, parser=parser)
    cProfile.runctx('bs4.BeautifulSoup(data, parser)', namespace, namespace,
                    filename)

    stats = pstats.Stats(filename)
    # stats.strip_dirs()
    stats.sort_stats("cumulative")
    stats.print_stats('_html5lib|bs4', 50)
|
||||
|
||||
# Allow running diagnostics from the command line:
#     python diagnose.py < document.html
if __name__ == '__main__':
    diagnose(sys.stdin.read())
|
||||
1611
bs4/element.py
Normal file
1611
bs4/element.py
Normal file
File diff suppressed because it is too large
Load diff
592
bs4/testing.py
Normal file
592
bs4/testing.py
Normal file
|
|
@ -0,0 +1,592 @@
|
|||
"""Helper classes for tests."""
|
||||
|
||||
import copy
|
||||
import functools
|
||||
import unittest
|
||||
from unittest import TestCase
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
Comment,
|
||||
ContentMetaAttributeValue,
|
||||
Doctype,
|
||||
SoupStrainer,
|
||||
)
|
||||
|
||||
from bs4.builder import HTMLParserTreeBuilder
|
||||
default_builder = HTMLParserTreeBuilder
|
||||
|
||||
|
||||
class SoupTest(unittest.TestCase):
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return default_builder()
|
||||
|
||||
def soup(self, markup, **kwargs):
|
||||
"""Build a Beautiful Soup object from markup."""
|
||||
builder = kwargs.pop('builder', self.default_builder)
|
||||
return BeautifulSoup(markup, builder=builder, **kwargs)
|
||||
|
||||
def document_for(self, markup):
|
||||
"""Turn an HTML fragment into a document.
|
||||
|
||||
The details depend on the builder.
|
||||
"""
|
||||
return self.default_builder.test_fragment_to_document(markup)
|
||||
|
||||
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
||||
builder = self.default_builder
|
||||
obj = BeautifulSoup(to_parse, builder=builder)
|
||||
if compare_parsed_to is None:
|
||||
compare_parsed_to = to_parse
|
||||
|
||||
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
||||
|
||||
|
||||
class HTMLTreeBuilderSmokeTest(object):
|
||||
|
||||
"""A basic test of a treebuilder's competence.
|
||||
|
||||
Any HTML treebuilder, present or future, should be able to pass
|
||||
these tests. With invalid markup, there's room for interpretation,
|
||||
and different parsers can handle it differently. But with the
|
||||
markup in these tests, there's not much room for interpretation.
|
||||
"""
|
||||
|
||||
def assertDoctypeHandled(self, doctype_fragment):
|
||||
"""Assert that a given doctype string is handled correctly."""
|
||||
doctype_str, soup = self._document_with_doctype(doctype_fragment)
|
||||
|
||||
# Make sure a Doctype object was created.
|
||||
doctype = soup.contents[0]
|
||||
self.assertEqual(doctype.__class__, Doctype)
|
||||
self.assertEqual(doctype, doctype_fragment)
|
||||
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
|
||||
|
||||
# Make sure that the doctype was correctly associated with the
|
||||
# parse tree and that the rest of the document parsed.
|
||||
self.assertEqual(soup.p.contents[0], 'foo')
|
||||
|
||||
def _document_with_doctype(self, doctype_fragment):
|
||||
"""Generate and parse a document with the given doctype."""
|
||||
doctype = '<!DOCTYPE %s>' % doctype_fragment
|
||||
markup = doctype + '\n<p>foo</p>'
|
||||
soup = self.soup(markup)
|
||||
return doctype, soup
|
||||
|
||||
def test_normal_doctypes(self):
|
||||
"""Make sure normal, everyday HTML doctypes are handled correctly."""
|
||||
self.assertDoctypeHandled("html")
|
||||
self.assertDoctypeHandled(
|
||||
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
|
||||
|
||||
def test_empty_doctype(self):
|
||||
soup = self.soup("<!DOCTYPE>")
|
||||
doctype = soup.contents[0]
|
||||
self.assertEqual("", doctype.strip())
|
||||
|
||||
def test_public_doctype_with_url(self):
|
||||
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
|
||||
self.assertDoctypeHandled(doctype)
|
||||
|
||||
def test_system_doctype(self):
|
||||
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
|
||||
|
||||
def test_namespaced_system_doctype(self):
|
||||
# We can handle a namespaced doctype with a system ID.
|
||||
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
|
||||
|
||||
def test_namespaced_public_doctype(self):
|
||||
# Test a namespaced doctype with a public id.
|
||||
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
"""A real XHTML document should come out more or less the same as it went in."""
|
||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>Hello.</title></head>
|
||||
<body>Goodbye.</body>
|
||||
</html>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
soup.encode("utf-8").replace(b"\n", b""),
|
||||
markup.replace(b"\n", b""))
|
||||
|
||||
def test_deepcopy(self):
|
||||
"""Make sure you can copy the tree builder.
|
||||
|
||||
This is important because the builder is part of a
|
||||
BeautifulSoup object, and we want to be able to copy that.
|
||||
"""
|
||||
copy.deepcopy(self.default_builder)
|
||||
|
||||
def test_p_tag_is_never_empty_element(self):
|
||||
"""A <p> tag is never designated as an empty-element tag.
|
||||
|
||||
Even if the markup shows it as an empty-element tag, it
|
||||
shouldn't be presented that way.
|
||||
"""
|
||||
soup = self.soup("<p/>")
|
||||
self.assertFalse(soup.p.is_empty_element)
|
||||
self.assertEqual(str(soup.p), "<p></p>")
|
||||
|
||||
def test_unclosed_tags_get_closed(self):
|
||||
"""A tag that's not closed by the end of the document should be closed.
|
||||
|
||||
This applies to all tags except empty-element tags.
|
||||
"""
|
||||
self.assertSoupEquals("<p>", "<p></p>")
|
||||
self.assertSoupEquals("<b>", "<b></b>")
|
||||
|
||||
self.assertSoupEquals("<br>", "<br/>")
|
||||
|
||||
def test_br_is_always_empty_element_tag(self):
|
||||
"""A <br> tag is designated as an empty-element tag.
|
||||
|
||||
Some parsers treat <br></br> as one <br/> tag, some parsers as
|
||||
two tags, but it should always be an empty-element tag.
|
||||
"""
|
||||
soup = self.soup("<br></br>")
|
||||
self.assertTrue(soup.br.is_empty_element)
|
||||
self.assertEqual(str(soup.br), "<br/>")
|
||||
|
||||
def test_nested_formatting_elements(self):
|
||||
self.assertSoupEquals("<em><em></em></em>")
|
||||
|
||||
def test_comment(self):
|
||||
# Comments are represented as Comment objects.
|
||||
markup = "<p>foo<!--foobar-->baz</p>"
|
||||
self.assertSoupEquals(markup)
|
||||
|
||||
soup = self.soup(markup)
|
||||
comment = soup.find(text="foobar")
|
||||
self.assertEqual(comment.__class__, Comment)
|
||||
|
||||
# The comment is properly integrated into the tree.
|
||||
foo = soup.find(text="foo")
|
||||
self.assertEqual(comment, foo.next_element)
|
||||
baz = soup.find(text="baz")
|
||||
self.assertEqual(comment, baz.previous_element)
|
||||
|
||||
def test_preserved_whitespace_in_pre_and_textarea(self):
|
||||
"""Whitespace must be preserved in <pre> and <textarea> tags."""
|
||||
self.assertSoupEquals("<pre> </pre>")
|
||||
self.assertSoupEquals("<textarea> woo </textarea>")
|
||||
|
||||
def test_nested_inline_elements(self):
|
||||
"""Inline elements can be nested indefinitely."""
|
||||
b_tag = "<b>Inside a B tag</b>"
|
||||
self.assertSoupEquals(b_tag)
|
||||
|
||||
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
|
||||
self.assertSoupEquals(nested_b_tag)
|
||||
|
||||
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
|
||||
self.assertSoupEquals(nested_b_tag)
|
||||
|
||||
def test_nested_block_level_elements(self):
|
||||
"""Block elements can be nested."""
|
||||
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
|
||||
blockquote = soup.blockquote
|
||||
self.assertEqual(blockquote.p.b.string, 'Foo')
|
||||
self.assertEqual(blockquote.b.string, 'Foo')
|
||||
|
||||
def test_correctly_nested_tables(self):
|
||||
"""One table can go inside another one."""
|
||||
markup = ('<table id="1">'
|
||||
'<tr>'
|
||||
"<td>Here's another table:"
|
||||
'<table id="2">'
|
||||
'<tr><td>foo</td></tr>'
|
||||
'</table></td>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
markup,
|
||||
'<table id="1"><tr><td>Here\'s another table:'
|
||||
'<table id="2"><tr><td>foo</td></tr></table>'
|
||||
'</td></tr></table>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||
|
||||
def test_deeply_nested_multivalued_attribute(self):
|
||||
# html5lib can set the attributes of the same tag many times
|
||||
# as it rearranges the tree. This has caused problems with
|
||||
# multivalued attributes.
|
||||
markup = '<table><div><div class="css"></div></div></table>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(["css"], soup.div.div['class'])
|
||||
|
||||
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
||||
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
||||
|
||||
def test_entities_in_attributes_converted_to_unicode(self):
|
||||
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
|
||||
def test_entities_in_text_converted_to_unicode(self):
|
||||
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
|
||||
def test_quot_entity_converted_to_quotation_mark(self):
|
||||
self.assertSoupEquals("<p>I said "good day!"</p>",
|
||||
'<p>I said "good day!"</p>')
|
||||
|
||||
def test_out_of_range_entity(self):
|
||||
expect = u"\N{REPLACEMENT CHARACTER}"
|
||||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
|
||||
def test_multipart_strings(self):
|
||||
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
|
||||
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
|
||||
self.assertEqual("p", soup.h2.string.next_element.name)
|
||||
self.assertEqual("p", soup.p.name)
|
||||
|
||||
def test_basic_namespaces(self):
|
||||
"""Parsers don't need to *understand* namespaces, but at the
|
||||
very least they should not choke on namespaces or lose
|
||||
data."""
|
||||
|
||||
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.encode())
|
||||
html = soup.html
|
||||
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
|
||||
self.assertEqual(
|
||||
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
|
||||
self.assertEqual(
|
||||
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
|
||||
|
||||
def test_multivalued_attribute_value_becomes_list(self):
|
||||
markup = b'<a class="foo bar">'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(['foo', 'bar'], soup.a['class'])
|
||||
|
||||
#
|
||||
# Generally speaking, tests below this point are more tests of
|
||||
# Beautiful Soup than tests of the tree builders. But parsers are
|
||||
# weird, so we run these tests separately for every tree builder
|
||||
# to detect any differences between them.
|
||||
#
|
||||
|
||||
def test_can_parse_unicode_document(self):
|
||||
# A seemingly innocuous document... but it's in Unicode! And
|
||||
# it contains characters that can't be represented in the
|
||||
# encoding found in the declaration! The horror!
|
||||
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
|
||||
|
||||
def test_soupstrainer(self):
|
||||
"""Parsers should be able to work with SoupStrainers."""
|
||||
strainer = SoupStrainer("b")
|
||||
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
|
||||
parse_only=strainer)
|
||||
self.assertEqual(soup.decode(), "<b>bold</b>")
|
||||
|
||||
def test_single_quote_attribute_values_become_double_quotes(self):
|
||||
self.assertSoupEquals("<foo attr='bar'></foo>",
|
||||
'<foo attr="bar"></foo>')
|
||||
|
||||
def test_attribute_values_with_nested_quotes_are_left_alone(self):
|
||||
text = """<foo attr='bar "brawls" happen'>a</foo>"""
|
||||
self.assertSoupEquals(text)
|
||||
|
||||
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
|
||||
text = """<foo attr='bar "brawls" happen'>a</foo>"""
|
||||
soup = self.soup(text)
|
||||
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
|
||||
self.assertSoupEquals(
|
||||
soup.foo.decode(),
|
||||
"""<foo attr="Brawls happen at "Bob\'s Bar"">a</foo>""")
|
||||
|
||||
def test_ampersand_in_attribute_value_gets_escaped(self):
|
||||
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
|
||||
'<this is="really messed up & stuff"></this>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
'<a href="http://example.org?a=1&b=2;3">foo</a>',
|
||||
'<a href="http://example.org?a=1&b=2;3">foo</a>')
|
||||
|
||||
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
|
||||
self.assertSoupEquals('<a href="http://example.org?a=1&b=2;3"></a>')
|
||||
|
||||
def test_entities_in_strings_converted_during_parsing(self):
|
||||
# Both XML and HTML entities are converted to Unicode characters
|
||||
# during parsing.
|
||||
text = "<p><<sacré bleu!>></p>"
|
||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
||||
self.assertSoupEquals(text, expected)
|
||||
|
||||
def test_smart_quotes_converted_on_the_way_in(self):
|
||||
# Microsoft smart quotes are converted to Unicode characters during
|
||||
# parsing.
|
||||
quote = b"<p>\x91Foo\x92</p>"
|
||||
soup = self.soup(quote)
|
||||
self.assertEqual(
|
||||
soup.p.string,
|
||||
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
||||
|
||||
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
||||
soup = self.soup("<a> </a>")
|
||||
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
|
||||
|
||||
def test_entities_converted_on_the_way_out(self):
|
||||
text = "<p><<sacré bleu!>></p>"
|
||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
||||
soup = self.soup(text)
|
||||
self.assertEqual(soup.p.encode("utf-8"), expected)
|
||||
|
||||
def test_real_iso_latin_document(self):
|
||||
# Smoke test of interrelated functionality, using an
|
||||
# easy-to-understand document.
|
||||
|
||||
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
||||
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
||||
|
||||
# That's because we're going to encode it into ISO-Latin-1, and use
|
||||
# that to test.
|
||||
iso_latin_html = unicode_html.encode("iso-8859-1")
|
||||
|
||||
# Parse the ISO-Latin-1 HTML.
|
||||
soup = self.soup(iso_latin_html)
|
||||
# Encode it to UTF-8.
|
||||
result = soup.encode("utf-8")
|
||||
|
||||
# What do we expect the result to look like? Well, it would
|
||||
# look like unicode_html, except that the META tag would say
|
||||
# UTF-8 instead of ISO-Latin-1.
|
||||
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
|
||||
|
||||
# And, of course, it would be in UTF-8, not Unicode.
|
||||
expected = expected.encode("utf-8")
|
||||
|
||||
# Ta-da!
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
def test_real_shift_jis_document(self):
|
||||
# Smoke test to make sure the parser can handle a document in
|
||||
# Shift-JIS encoding, without choking.
|
||||
shift_jis_html = (
|
||||
b'<html><head></head><body><pre>'
|
||||
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
|
||||
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
|
||||
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
|
||||
b'</pre></body></html>')
|
||||
unicode_html = shift_jis_html.decode("shift-jis")
|
||||
soup = self.soup(unicode_html)
|
||||
|
||||
# Make sure the parse tree is correctly encoded to various
|
||||
# encodings.
|
||||
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
|
||||
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
|
||||
|
||||
def test_real_hebrew_document(self):
|
||||
# A real-world test to make sure we can convert ISO-8859-9 (a
|
||||
# Hebrew encoding) to UTF-8.
|
||||
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
|
||||
soup = self.soup(
|
||||
hebrew_document, from_encoding="iso8859-8")
|
||||
self.assertEqual(soup.original_encoding, 'iso8859-8')
|
||||
self.assertEqual(
|
||||
soup.encode('utf-8'),
|
||||
hebrew_document.decode("iso8859-8").encode("utf-8"))
|
||||
|
||||
def test_meta_tag_reflects_current_encoding(self):
|
||||
# Here's the <meta> tag saying that a document is
|
||||
# encoded in Shift-JIS.
|
||||
meta_tag = ('<meta content="text/html; charset=x-sjis" '
|
||||
'http-equiv="Content-type"/>')
|
||||
|
||||
# Here's a document incorporating that meta tag.
|
||||
shift_jis_html = (
|
||||
'<html><head>\n%s\n'
|
||||
'<meta http-equiv="Content-language" content="ja"/>'
|
||||
'</head><body>Shift-JIS markup goes here.') % meta_tag
|
||||
soup = self.soup(shift_jis_html)
|
||||
|
||||
# Parse the document, and the charset is seemingly unaffected.
|
||||
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
|
||||
content = parsed_meta['content']
|
||||
self.assertEqual('text/html; charset=x-sjis', content)
|
||||
|
||||
# But that value is actually a ContentMetaAttributeValue object.
|
||||
self.assertTrue(isinstance(content, ContentMetaAttributeValue))
|
||||
|
||||
# And it will take on a value that reflects its current
|
||||
# encoding.
|
||||
self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
|
||||
|
||||
# For the rest of the story, see TestSubstitutions in
|
||||
# test_tree.py.
|
||||
|
||||
def test_html5_style_meta_tag_reflects_current_encoding(self):
|
||||
# Here's the <meta> tag saying that a document is
|
||||
# encoded in Shift-JIS.
|
||||
meta_tag = ('<meta id="encoding" charset="x-sjis" />')
|
||||
|
||||
# Here's a document incorporating that meta tag.
|
||||
shift_jis_html = (
|
||||
'<html><head>\n%s\n'
|
||||
'<meta http-equiv="Content-language" content="ja"/>'
|
||||
'</head><body>Shift-JIS markup goes here.') % meta_tag
|
||||
soup = self.soup(shift_jis_html)
|
||||
|
||||
# Parse the document, and the charset is seemingly unaffected.
|
||||
parsed_meta = soup.find('meta', id="encoding")
|
||||
charset = parsed_meta['charset']
|
||||
self.assertEqual('x-sjis', charset)
|
||||
|
||||
# But that value is actually a CharsetMetaAttributeValue object.
|
||||
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
|
||||
|
||||
# And it will take on a value that reflects its current
|
||||
# encoding.
|
||||
self.assertEqual('utf8', charset.encode("utf8"))
|
||||
|
||||
def test_tag_with_no_attributes_can_have_attributes_added(self):
|
||||
data = self.soup("<a>text</a>")
|
||||
data.a['foo'] = 'bar'
|
||||
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
||||
|
||||
class XMLTreeBuilderSmokeTest(object):
|
||||
|
||||
def test_docstring_generated(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
"""A real XHTML document should come out *exactly* the same as it went in."""
|
||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>Hello.</title></head>
|
||||
<body>Goodbye.</body>
|
||||
</html>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
soup.encode("utf-8"), markup)
|
||||
|
||||
def test_formatter_processes_script_tag_for_xml_documents(self):
|
||||
doc = """
|
||||
<script type="text/javascript">
|
||||
</script>
|
||||
"""
|
||||
soup = BeautifulSoup(doc, "xml")
|
||||
# lxml would have stripped this while parsing, but we can add
|
||||
# it later.
|
||||
soup.script.string = 'console.log("< < hey > > ");'
|
||||
encoded = soup.encode()
|
||||
self.assertTrue(b"< < hey > >" in encoded)
|
||||
|
||||
def test_can_parse_unicode_document(self):
|
||||
markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
|
||||
|
||||
def test_popping_namespaced_tag(self):
|
||||
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
unicode(soup.rss), markup)
|
||||
|
||||
def test_docstring_includes_correct_encoding(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
soup.encode("latin1"),
|
||||
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
|
||||
|
||||
def test_large_xml_document(self):
|
||||
"""A large XML document should come out the same as it went in."""
|
||||
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
|
||||
+ b'0' * (2**12)
|
||||
+ b'</root>')
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(soup.encode("utf-8"), markup)
|
||||
|
||||
|
||||
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
|
||||
self.assertSoupEquals("<p>", "<p/>")
|
||||
self.assertSoupEquals("<p>foo</p>")
|
||||
|
||||
def test_namespaces_are_preserved(self):
|
||||
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
|
||||
soup = self.soup(markup)
|
||||
root = soup.root
|
||||
self.assertEqual("http://example.com/", root['xmlns:a'])
|
||||
self.assertEqual("http://example.net/", root['xmlns:b'])
|
||||
|
||||
def test_closing_namespaced_tag(self):
|
||||
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.p), markup)
|
||||
|
||||
def test_namespaced_attributes(self):
|
||||
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.foo), markup)
|
||||
|
||||
def test_namespaced_attributes_xml_namespace(self):
|
||||
markup = '<foo xml:lang="fr">bar</foo>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.foo), markup)
|
||||
|
||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||
"""Smoke test for a tree builder that supports HTML5."""
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
|
||||
# XHTML documents in any particular way.
|
||||
pass
|
||||
|
||||
def test_html_tags_have_namespace(self):
|
||||
markup = "<a>"
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
|
||||
|
||||
def test_svg_tags_have_namespace(self):
|
||||
markup = '<svg><circle/></svg>'
|
||||
soup = self.soup(markup)
|
||||
namespace = "http://www.w3.org/2000/svg"
|
||||
self.assertEqual(namespace, soup.svg.namespace)
|
||||
self.assertEqual(namespace, soup.circle.namespace)
|
||||
|
||||
|
||||
def test_mathml_tags_have_namespace(self):
|
||||
markup = '<math><msqrt>5</msqrt></math>'
|
||||
soup = self.soup(markup)
|
||||
namespace = 'http://www.w3.org/1998/Math/MathML'
|
||||
self.assertEqual(namespace, soup.math.namespace)
|
||||
self.assertEqual(namespace, soup.msqrt.namespace)
|
||||
|
||||
def test_xml_declaration_becomes_comment(self):
|
||||
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertTrue(isinstance(soup.contents[0], Comment))
|
||||
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
|
||||
self.assertEqual("html", soup.contents[0].next_element.name)
|
||||
|
||||
def skipIf(condition, reason):
|
||||
def nothing(test, *args, **kwargs):
|
||||
return None
|
||||
|
||||
def decorator(test_item):
|
||||
if condition:
|
||||
return nothing
|
||||
else:
|
||||
return test_item
|
||||
|
||||
return decorator
|
||||
131
calibre-plugin/__init__.py
Normal file
131
calibre-plugin/__init__.py
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Jim Miller'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys
|
||||
if sys.version_info >= (2, 7):
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
loghandler=logging.StreamHandler()
|
||||
loghandler.setFormatter(logging.Formatter("FFDL:%(levelname)s:%(filename)s(%(lineno)d):%(message)s"))
|
||||
logger.addHandler(loghandler)
|
||||
loghandler.setLevel(logging.DEBUG)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
# pulls in translation files for _() strings
|
||||
try:
|
||||
load_translations()
|
||||
except NameError:
|
||||
pass # load_translations() added in calibre 1.9
|
||||
|
||||
# The class that all Interface Action plugin wrappers must inherit from
|
||||
from calibre.customize import InterfaceActionBase
|
||||
|
||||
## Apparently the name for this class doesn't matter--it was still
|
||||
## 'demo' for the first few versions.
|
||||
class FanFictionDownLoaderBase(InterfaceActionBase):
|
||||
'''
|
||||
This class is a simple wrapper that provides information about the
|
||||
actual plugin class. The actual interface plugin class is called
|
||||
InterfacePlugin and is defined in the ffdl_plugin.py file, as
|
||||
specified in the actual_plugin field below.
|
||||
|
||||
The reason for having two classes is that it allows the command line
|
||||
calibre utilities to run without needing to load the GUI libraries.
|
||||
'''
|
||||
name = 'FanFictionDownLoader'
|
||||
description = _('UI plugin to download FanFiction stories from various sites.')
|
||||
supported_platforms = ['windows', 'osx', 'linux']
|
||||
author = 'Jim Miller'
|
||||
version = (2, 0, 10)
|
||||
minimum_calibre_version = (1, 48, 0)
|
||||
|
||||
#: This field defines the GUI plugin class that contains all the code
|
||||
#: that actually does something. Its format is module_path:class_name
|
||||
#: The specified class must be defined in the specified module.
|
||||
actual_plugin = 'calibre_plugins.fanfictiondownloader_plugin.ffdl_plugin:FanFictionDownLoaderPlugin'
|
||||
|
||||
def is_customizable(self):
|
||||
'''
|
||||
This method must return True to enable customization via
|
||||
Preferences->Plugins
|
||||
'''
|
||||
return True
|
||||
|
||||
def config_widget(self):
|
||||
'''
|
||||
Implement this method and :meth:`save_settings` in your plugin to
|
||||
use a custom configuration dialog.
|
||||
|
||||
This method, if implemented, must return a QWidget. The widget can have
|
||||
an optional method validate() that takes no arguments and is called
|
||||
immediately after the user clicks OK. Changes are applied if and only
|
||||
if the method returns True.
|
||||
|
||||
If for some reason you cannot perform the configuration at this time,
|
||||
return a tuple of two strings (message, details), these will be
|
||||
displayed as a warning dialog to the user and the process will be
|
||||
aborted.
|
||||
|
||||
The base class implementation of this method raises NotImplementedError
|
||||
so by default no user configuration is possible.
|
||||
'''
|
||||
# It is important to put this import statement here rather than at the
|
||||
# top of the module as importing the config class will also cause the
|
||||
# GUI libraries to be loaded, which we do not want when using calibre
|
||||
# from the command line
|
||||
from calibre_plugins.fanfictiondownloader_plugin.config import ConfigWidget
|
||||
return ConfigWidget(self.actual_plugin_)
|
||||
|
||||
def save_settings(self, config_widget):
|
||||
'''
|
||||
Save the settings specified by the user with config_widget.
|
||||
|
||||
:param config_widget: The widget returned by :meth:`config_widget`.
|
||||
'''
|
||||
config_widget.save_settings()
|
||||
|
||||
# Apply the changes
|
||||
ac = self.actual_plugin_
|
||||
if ac is not None:
|
||||
ac.apply_settings()
|
||||
|
||||
def load_actual_plugin(self, gui):
|
||||
with self: # so the sys.path was modified while loading the
|
||||
# plug impl.
|
||||
return super(FanFictionDownLoaderBase, self).load_actual_plugin(gui)
|
||||
|
||||
def cli_main(self,argv):
|
||||
|
||||
with self: # so the sys.path was modified appropriately
|
||||
# I believe there's no performance hit loading these here when
|
||||
# CLI--it would load everytime anyway.
|
||||
from StringIO import StringIO
|
||||
from calibre.library import db
|
||||
from calibre_plugins.fanfictiondownloader_plugin.downloader import main as ffdl_main
|
||||
from calibre_plugins.fanfictiondownloader_plugin.prefs import PrefsFacade
|
||||
from calibre.utils.config import prefs as calibre_prefs
|
||||
from optparse import OptionParser
|
||||
|
||||
parser = OptionParser('%prog --run-plugin '+self.name+' -- [options] <storyurl>')
|
||||
parser.add_option('--library-path', '--with-library', default=None, help=_('Path to the calibre library. Default is to use the path stored in the settings.'))
|
||||
# parser.add_option('--dont-notify-gui', default=False, action='store_true',
|
||||
# help=_('Do not notify the running calibre GUI (if any) that the database has'
|
||||
# ' changed. Use with care, as it can lead to database corruption!'))
|
||||
|
||||
pargs = [x for x in argv if x.startswith('--with-library') or x.startswith('--library-path')
|
||||
or not x.startswith('-')]
|
||||
opts, args = parser.parse_args(pargs)
|
||||
|
||||
ffdl_prefs = PrefsFacade(db(path=opts.library_path,
|
||||
read_only=True))
|
||||
|
||||
ffdl_main(argv[1:],
|
||||
parser=parser,
|
||||
passed_defaultsini=StringIO(get_resources("defaults.ini")),
|
||||
passed_personalini=StringIO(ffdl_prefs["personal.ini"]))
|
||||
28
calibre-plugin/about.txt
Normal file
28
calibre-plugin/about.txt
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
<hr />
|
||||
|
||||
<p>Plugin created by Jim Miller, borrowing heavily from Grant Drake's
|
||||
'<a href="http://www.mobileread.com/forums/showthread.php?t=134856">Reading List</a>',
|
||||
'<a href="http://www.mobileread.com/forums/showthread.php?t=126727">Extract ISBN</a>' and
|
||||
'<a href="http://www.mobileread.com/forums/showthread.php?t=134000">Count Pages</a>'
|
||||
plugins.</p>
|
||||
|
||||
<p>
|
||||
Calibre officially distributes plugins from the mobileread.com forum site.
|
||||
The official distro channel for this plugin is there: <a href="http://www.mobileread.com/forums/showthread.php?t=163261">FanFictionDownLoader</a>
|
||||
</p>
|
||||
|
||||
<p> I also monitor the
|
||||
<a href="http://groups.google.com/group/fanfic-downloader">general users
|
||||
group</a> for the downloader. That covers the web application and CLI, too.
|
||||
</p>
|
||||
|
||||
The source for this plugin is available at it's
|
||||
<a href="http://code.google.com/p/fanficdownloader">project home</a>.
|
||||
<hr />
|
||||
|
||||
<p>
|
||||
See the <a href="http://code.google.com/p/fanficdownloader/wiki/FanFictionDownloaderSupportedsites">list of supported sites</a>.
|
||||
</p>
|
||||
<p>
|
||||
Read the <a href="http://code.google.com/p/fanficdownloader/wiki/FanFictionDownloaderFAQs">FAQs</a>.
|
||||
</p>
|
||||
553
calibre-plugin/common_utils.py
Normal file
553
calibre-plugin/common_utils.py
Normal file
|
|
@ -0,0 +1,553 @@
|
|||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Grant Drake <grant.drake@gmail.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
try:
|
||||
from PyQt5 import QtWidgets as QtGui
|
||||
from PyQt5.Qt import (Qt, QIcon, QPixmap, QLabel, QDialog, QHBoxLayout,
|
||||
QTableWidgetItem, QFont, QLineEdit, QComboBox,
|
||||
QVBoxLayout, QDialogButtonBox, QStyledItemDelegate, QDateTime,
|
||||
QTextEdit, QListWidget, QAbstractItemView)
|
||||
except ImportError as e:
|
||||
from PyQt4 import QtGui
|
||||
from PyQt4.Qt import (Qt, QIcon, QPixmap, QLabel, QDialog, QHBoxLayout,
|
||||
QTableWidgetItem, QFont, QLineEdit, QComboBox,
|
||||
QVBoxLayout, QDialogButtonBox, QStyledItemDelegate, QDateTime,
|
||||
QTextEdit, QListWidget, QAbstractItemView)
|
||||
|
||||
from calibre.constants import iswindows
|
||||
from calibre.gui2 import gprefs, error_dialog, UNDEFINED_QDATETIME, info_dialog
|
||||
from calibre.gui2.actions import menu_action_unique_name
|
||||
from calibre.gui2.keyboard import ShortcutConfig
|
||||
from calibre.utils.config import config_dir
|
||||
from calibre.utils.date import now, format_date, qt_to_dt, UNDEFINED_DATE
|
||||
|
||||
# Global definition of our plugin name. Used for common functions that require this.
|
||||
plugin_name = None
|
||||
# Global definition of our plugin resources. Used to share between the xxxAction and xxxBase
|
||||
# classes if you need any zip images to be displayed on the configuration dialog.
|
||||
plugin_icon_resources = {}
|
||||
|
||||
|
||||
def set_plugin_icon_resources(name, resources):
|
||||
'''
|
||||
Set our global store of plugin name and icon resources for sharing between
|
||||
the InterfaceAction class which reads them and the ConfigWidget
|
||||
if needed for use on the customization dialog for this plugin.
|
||||
'''
|
||||
global plugin_icon_resources, plugin_name
|
||||
plugin_name = name
|
||||
plugin_icon_resources = resources
|
||||
|
||||
|
||||
def get_icon(icon_name):
|
||||
'''
|
||||
Retrieve a QIcon for the named image from the zip file if it exists,
|
||||
or if not then from Calibre's image cache.
|
||||
'''
|
||||
if icon_name:
|
||||
pixmap = get_pixmap(icon_name)
|
||||
if pixmap is None:
|
||||
# Look in Calibre's cache for the icon
|
||||
return QIcon(I(icon_name))
|
||||
else:
|
||||
return QIcon(pixmap)
|
||||
return QIcon()
|
||||
|
||||
|
||||
def get_pixmap(icon_name):
|
||||
'''
|
||||
Retrieve a QPixmap for the named image
|
||||
Any icons belonging to the plugin must be prefixed with 'images/'
|
||||
'''
|
||||
global plugin_icon_resources, plugin_name
|
||||
|
||||
if not icon_name.startswith('images/'):
|
||||
# We know this is definitely not an icon belonging to this plugin
|
||||
pixmap = QPixmap()
|
||||
pixmap.load(I(icon_name))
|
||||
return pixmap
|
||||
|
||||
# Check to see whether the icon exists as a Calibre resource
|
||||
# This will enable skinning if the user stores icons within a folder like:
|
||||
# ...\AppData\Roaming\calibre\resources\images\Plugin Name\
|
||||
if plugin_name:
|
||||
local_images_dir = get_local_images_dir(plugin_name)
|
||||
local_image_path = os.path.join(local_images_dir, icon_name.replace('images/', ''))
|
||||
if os.path.exists(local_image_path):
|
||||
pixmap = QPixmap()
|
||||
pixmap.load(local_image_path)
|
||||
return pixmap
|
||||
|
||||
# As we did not find an icon elsewhere, look within our zip resources
|
||||
if icon_name in plugin_icon_resources:
|
||||
pixmap = QPixmap()
|
||||
pixmap.loadFromData(plugin_icon_resources[icon_name])
|
||||
return pixmap
|
||||
return None
|
||||
|
||||
|
||||
def get_local_images_dir(subfolder=None):
|
||||
'''
|
||||
Returns a path to the user's local resources/images folder
|
||||
If a subfolder name parameter is specified, appends this to the path
|
||||
'''
|
||||
images_dir = os.path.join(config_dir, 'resources/images')
|
||||
if subfolder:
|
||||
images_dir = os.path.join(images_dir, subfolder)
|
||||
if iswindows:
|
||||
images_dir = os.path.normpath(images_dir)
|
||||
return images_dir
|
||||
|
||||
|
||||
def create_menu_item(ia, parent_menu, menu_text, image=None, tooltip=None,
|
||||
shortcut=(), triggered=None, is_checked=None):
|
||||
'''
|
||||
Create a menu action with the specified criteria and action
|
||||
Note that if no shortcut is specified, will not appear in Preferences->Keyboard
|
||||
This method should only be used for actions which either have no shortcuts,
|
||||
or register their menus only once. Use create_menu_action_unique for all else.
|
||||
'''
|
||||
if shortcut is not None:
|
||||
if len(shortcut) == 0:
|
||||
shortcut = ()
|
||||
else:
|
||||
shortcut = _(shortcut)
|
||||
ac = ia.create_action(spec=(menu_text, None, tooltip, shortcut),
|
||||
attr=menu_text)
|
||||
if image:
|
||||
ac.setIcon(get_icon(image))
|
||||
if triggered is not None:
|
||||
ac.triggered.connect(triggered)
|
||||
if is_checked is not None:
|
||||
ac.setCheckable(True)
|
||||
if is_checked:
|
||||
ac.setChecked(True)
|
||||
|
||||
parent_menu.addAction(ac)
|
||||
return ac
|
||||
|
||||
|
||||
def create_menu_action_unique(ia, parent_menu, menu_text, image=None, tooltip=None,
|
||||
shortcut=None, triggered=None, is_checked=None, shortcut_name=None,
|
||||
unique_name=None):
|
||||
'''
|
||||
Create a menu action with the specified criteria and action, using the new
|
||||
InterfaceAction.create_menu_action() function which ensures that regardless of
|
||||
whether a shortcut is specified it will appear in Preferences->Keyboard
|
||||
'''
|
||||
orig_shortcut = shortcut
|
||||
kb = ia.gui.keyboard
|
||||
if unique_name is None:
|
||||
unique_name = menu_text
|
||||
if not shortcut == False:
|
||||
full_unique_name = menu_action_unique_name(ia, unique_name)
|
||||
if full_unique_name in kb.shortcuts:
|
||||
shortcut = False
|
||||
else:
|
||||
if shortcut is not None and not shortcut == False:
|
||||
if len(shortcut) == 0:
|
||||
shortcut = None
|
||||
else:
|
||||
shortcut = _(shortcut)
|
||||
|
||||
if shortcut_name is None:
|
||||
shortcut_name = menu_text.replace('&','')
|
||||
|
||||
ac = ia.create_menu_action(parent_menu, unique_name, menu_text, icon=None, shortcut=shortcut,
|
||||
description=tooltip, triggered=triggered, shortcut_name=shortcut_name)
|
||||
if shortcut == False and not orig_shortcut == False:
|
||||
if ac.calibre_shortcut_unique_name in ia.gui.keyboard.shortcuts:
|
||||
kb.replace_action(ac.calibre_shortcut_unique_name, ac)
|
||||
if image:
|
||||
ac.setIcon(get_icon(image))
|
||||
if is_checked is not None:
|
||||
ac.setCheckable(True)
|
||||
if is_checked:
|
||||
ac.setChecked(True)
|
||||
return ac
|
||||
|
||||
|
||||
def swap_author_names(author):
|
||||
if author.find(',') == -1:
|
||||
return author
|
||||
name_parts = author.strip().partition(',')
|
||||
return name_parts[2].strip() + ' ' + name_parts[0]
|
||||
|
||||
|
||||
def get_library_uuid(db):
|
||||
try:
|
||||
library_uuid = db.library_id
|
||||
except:
|
||||
library_uuid = ''
|
||||
return library_uuid
|
||||
|
||||
|
||||
class ImageLabel(QLabel):
|
||||
|
||||
def __init__(self, parent, icon_name, size=16):
|
||||
QLabel.__init__(self, parent)
|
||||
pixmap = get_pixmap(icon_name)
|
||||
self.setPixmap(pixmap)
|
||||
self.setMaximumSize(size, size)
|
||||
self.setScaledContents(True)
|
||||
|
||||
|
||||
class ImageTitleLayout(QHBoxLayout):
|
||||
'''
|
||||
A reusable layout widget displaying an image followed by a title
|
||||
'''
|
||||
def __init__(self, parent, icon_name, title, tooltip=None):
|
||||
QHBoxLayout.__init__(self)
|
||||
title_image_label = QLabel(parent)
|
||||
pixmap = get_pixmap(icon_name)
|
||||
if pixmap is None:
|
||||
pixmap = get_pixmap('library.png')
|
||||
# error_dialog(parent, _('Restart required'),
|
||||
# _('You must restart Calibre before using this plugin!'), show=True)
|
||||
else:
|
||||
title_image_label.setPixmap(pixmap)
|
||||
title_image_label.setMaximumSize(32, 32)
|
||||
title_image_label.setScaledContents(True)
|
||||
self.addWidget(title_image_label)
|
||||
|
||||
title_font = QFont()
|
||||
title_font.setPointSize(16)
|
||||
shelf_label = QLabel(title, parent)
|
||||
shelf_label.setFont(title_font)
|
||||
self.addWidget(shelf_label)
|
||||
self.insertStretch(-1)
|
||||
|
||||
if tooltip:
|
||||
title_image_label.setToolTip(tooltip)
|
||||
shelf_label.setToolTip(tooltip)
|
||||
|
||||
class SizePersistedDialog(QDialog):
|
||||
'''
|
||||
This dialog is a base class for any dialogs that want their size/position
|
||||
restored when they are next opened.
|
||||
'''
|
||||
def __init__(self, parent, unique_pref_name):
|
||||
QDialog.__init__(self, parent)
|
||||
self.unique_pref_name = unique_pref_name
|
||||
self.geom = gprefs.get(unique_pref_name, None)
|
||||
self.finished.connect(self.dialog_closing)
|
||||
|
||||
def resize_dialog(self):
|
||||
if self.geom is None:
|
||||
self.resize(self.sizeHint())
|
||||
else:
|
||||
self.restoreGeometry(self.geom)
|
||||
|
||||
def dialog_closing(self, result):
|
||||
self.geom = bytearray(self.saveGeometry())
|
||||
gprefs[self.unique_pref_name] = self.geom
|
||||
|
||||
|
||||
class ReadOnlyTableWidgetItem(QTableWidgetItem):
|
||||
|
||||
def __init__(self, text):
|
||||
if text is None:
|
||||
text = ''
|
||||
QTableWidgetItem.__init__(self, text, QtGui.QTableWidgetItem.UserType)
|
||||
self.setFlags(Qt.ItemIsSelectable|Qt.ItemIsEnabled)
|
||||
|
||||
|
||||
class RatingTableWidgetItem(QTableWidgetItem):
|
||||
|
||||
def __init__(self, rating, is_read_only=False):
|
||||
QTableWidgetItem.__init__(self, '', QtGui.QTableWidgetItem.UserType)
|
||||
self.setData(Qt.DisplayRole, rating)
|
||||
if is_read_only:
|
||||
self.setFlags(Qt.ItemIsSelectable|Qt.ItemIsEnabled)
|
||||
|
||||
|
||||
class DateTableWidgetItem(QTableWidgetItem):
|
||||
|
||||
def __init__(self, date_read, is_read_only=False, default_to_today=False):
|
||||
if date_read == UNDEFINED_DATE and default_to_today:
|
||||
date_read = now()
|
||||
if is_read_only:
|
||||
QTableWidgetItem.__init__(self, format_date(date_read, None), QtGui.QTableWidgetItem.UserType)
|
||||
self.setFlags(Qt.ItemIsSelectable|Qt.ItemIsEnabled)
|
||||
else:
|
||||
QTableWidgetItem.__init__(self, '', QtGui.QTableWidgetItem.UserType)
|
||||
self.setData(Qt.DisplayRole, QDateTime(date_read))
|
||||
|
||||
|
||||
class NoWheelComboBox(QComboBox):
|
||||
|
||||
def wheelEvent (self, event):
|
||||
# Disable the mouse wheel on top of the combo box changing selection as plays havoc in a grid
|
||||
event.ignore()
|
||||
|
||||
|
||||
class CheckableTableWidgetItem(QTableWidgetItem):
|
||||
|
||||
def __init__(self, checked=False, is_tristate=False):
|
||||
QTableWidgetItem.__init__(self, '')
|
||||
self.setFlags(Qt.ItemFlags(Qt.ItemIsSelectable | Qt.ItemIsUserCheckable | Qt.ItemIsEnabled ))
|
||||
if is_tristate:
|
||||
self.setFlags(self.flags() | Qt.ItemIsTristate)
|
||||
if checked:
|
||||
self.setCheckState(Qt.Checked)
|
||||
else:
|
||||
if is_tristate and checked is None:
|
||||
self.setCheckState(Qt.PartiallyChecked)
|
||||
else:
|
||||
self.setCheckState(Qt.Unchecked)
|
||||
|
||||
def get_boolean_value(self):
|
||||
'''
|
||||
Return a boolean value indicating whether checkbox is checked
|
||||
If this is a tristate checkbox, a partially checked value is returned as None
|
||||
'''
|
||||
if self.checkState() == Qt.PartiallyChecked:
|
||||
return None
|
||||
else:
|
||||
return self.checkState() == Qt.Checked
|
||||
|
||||
|
||||
class TextIconWidgetItem(QTableWidgetItem):
|
||||
|
||||
def __init__(self, text, icon):
|
||||
QTableWidgetItem.__init__(self, text)
|
||||
if icon:
|
||||
self.setIcon(icon)
|
||||
|
||||
|
||||
class ReadOnlyTextIconWidgetItem(ReadOnlyTableWidgetItem):
|
||||
|
||||
def __init__(self, text, icon):
|
||||
ReadOnlyTableWidgetItem.__init__(self, text)
|
||||
if icon:
|
||||
self.setIcon(icon)
|
||||
|
||||
|
||||
class ReadOnlyLineEdit(QLineEdit):
|
||||
|
||||
def __init__(self, text, parent):
|
||||
if text is None:
|
||||
text = ''
|
||||
QLineEdit.__init__(self, text, parent)
|
||||
self.setEnabled(False)
|
||||
|
||||
|
||||
class KeyValueComboBox(QComboBox):
|
||||
|
||||
def __init__(self, parent, values, selected_key):
|
||||
QComboBox.__init__(self, parent)
|
||||
self.values = values
|
||||
self.populate_combo(selected_key)
|
||||
|
||||
def populate_combo(self, selected_key):
|
||||
self.clear()
|
||||
selected_idx = idx = -1
|
||||
for key, value in self.values.iteritems():
|
||||
idx = idx + 1
|
||||
self.addItem(value)
|
||||
if key == selected_key:
|
||||
selected_idx = idx
|
||||
self.setCurrentIndex(selected_idx)
|
||||
|
||||
def selected_key(self):
|
||||
for key, value in self.values.iteritems():
|
||||
if value == unicode(self.currentText()).strip():
|
||||
return key
|
||||
|
||||
|
||||
class CustomColumnComboBox(QComboBox):
|
||||
|
||||
def __init__(self, parent, custom_columns, selected_column, initial_items=['']):
|
||||
QComboBox.__init__(self, parent)
|
||||
self.populate_combo(custom_columns, selected_column, initial_items)
|
||||
|
||||
def populate_combo(self, custom_columns, selected_column, initial_items=['']):
|
||||
self.clear()
|
||||
self.column_names = initial_items
|
||||
if len(initial_items) > 0:
|
||||
self.addItems(initial_items)
|
||||
selected_idx = 0
|
||||
for idx, value in enumerate(initial_items):
|
||||
if value == selected_column:
|
||||
selected_idx = idx
|
||||
for key in sorted(custom_columns.keys()):
|
||||
self.column_names.append(key)
|
||||
self.addItem('%s (%s)'%(key, custom_columns[key]['name']))
|
||||
if key == selected_column:
|
||||
selected_idx = len(self.column_names) - 1
|
||||
self.setCurrentIndex(selected_idx)
|
||||
|
||||
def get_selected_column(self):
|
||||
return self.column_names[self.currentIndex()]
|
||||
|
||||
|
||||
class KeyboardConfigDialog(SizePersistedDialog):
|
||||
'''
|
||||
This dialog is used to allow editing of keyboard shortcuts.
|
||||
'''
|
||||
def __init__(self, gui, group_name):
|
||||
SizePersistedDialog.__init__(self, gui, 'Keyboard shortcut dialog')
|
||||
self.gui = gui
|
||||
self.setWindowTitle('Keyboard shortcuts')
|
||||
layout = QVBoxLayout(self)
|
||||
self.setLayout(layout)
|
||||
|
||||
self.keyboard_widget = ShortcutConfig(self)
|
||||
layout.addWidget(self.keyboard_widget)
|
||||
self.group_name = group_name
|
||||
|
||||
button_box = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
|
||||
button_box.accepted.connect(self.commit)
|
||||
button_box.rejected.connect(self.reject)
|
||||
layout.addWidget(button_box)
|
||||
|
||||
# Cause our dialog size to be restored from prefs or created on first usage
|
||||
self.resize_dialog()
|
||||
self.initialize()
|
||||
|
||||
def initialize(self):
|
||||
self.keyboard_widget.initialize(self.gui.keyboard)
|
||||
self.keyboard_widget.highlight_group(self.group_name)
|
||||
|
||||
def commit(self):
|
||||
self.keyboard_widget.commit()
|
||||
self.accept()
|
||||
|
||||
|
||||
class DateDelegate(QStyledItemDelegate):
|
||||
'''
|
||||
Delegate for dates. Because this delegate stores the
|
||||
format as an instance variable, a new instance must be created for each
|
||||
column. This differs from all the other delegates.
|
||||
'''
|
||||
def __init__(self, parent):
|
||||
QStyledItemDelegate.__init__(self, parent)
|
||||
self.format = 'dd MMM yyyy'
|
||||
|
||||
def displayText(self, val, locale):
|
||||
d = val.toDateTime()
|
||||
if d <= UNDEFINED_QDATETIME:
|
||||
return ''
|
||||
return format_date(qt_to_dt(d, as_utc=False), self.format)
|
||||
|
||||
def createEditor(self, parent, option, index):
|
||||
qde = QStyledItemDelegate.createEditor(self, parent, option, index)
|
||||
qde.setDisplayFormat(self.format)
|
||||
qde.setMinimumDateTime(UNDEFINED_QDATETIME)
|
||||
qde.setSpecialValueText(_('Undefined'))
|
||||
qde.setCalendarPopup(True)
|
||||
return qde
|
||||
|
||||
def setEditorData(self, editor, index):
|
||||
val = index.model().data(index, Qt.DisplayRole).toDateTime()
|
||||
if val is None or val == UNDEFINED_QDATETIME:
|
||||
val = now()
|
||||
editor.setDateTime(val)
|
||||
|
||||
def setModelData(self, editor, model, index):
|
||||
val = editor.dateTime()
|
||||
if val <= UNDEFINED_QDATETIME:
|
||||
model.setData(index, UNDEFINED_QDATETIME, Qt.EditRole)
|
||||
else:
|
||||
model.setData(index, QDateTime(val), Qt.EditRole)
|
||||
|
||||
class PrefsViewerDialog(SizePersistedDialog):
|
||||
|
||||
def __init__(self, gui, namespace):
|
||||
SizePersistedDialog.__init__(self, gui, 'Prefs Viewer dialog')
|
||||
self.setWindowTitle('Preferences for: '+namespace)
|
||||
|
||||
self.gui = gui
|
||||
self.db = gui.current_db
|
||||
self.namespace = namespace
|
||||
self._init_controls()
|
||||
self.resize_dialog()
|
||||
|
||||
self._populate_settings()
|
||||
|
||||
if self.keys_list.count():
|
||||
self.keys_list.setCurrentRow(0)
|
||||
|
||||
def _init_controls(self):
|
||||
layout = QVBoxLayout(self)
|
||||
self.setLayout(layout)
|
||||
|
||||
ml = QHBoxLayout()
|
||||
layout.addLayout(ml, 1)
|
||||
|
||||
self.keys_list = QListWidget(self)
|
||||
self.keys_list.setSelectionMode(QAbstractItemView.SingleSelection)
|
||||
self.keys_list.setFixedWidth(150)
|
||||
self.keys_list.setAlternatingRowColors(True)
|
||||
ml.addWidget(self.keys_list)
|
||||
self.value_text = QTextEdit(self)
|
||||
self.value_text.setTabStopWidth(24)
|
||||
self.value_text.setReadOnly(True)
|
||||
ml.addWidget(self.value_text, 1)
|
||||
|
||||
button_box = QDialogButtonBox(QDialogButtonBox.Ok)
|
||||
button_box.accepted.connect(self.accept)
|
||||
self.clear_button = button_box.addButton('Clear', QDialogButtonBox.ResetRole)
|
||||
self.clear_button.setIcon(get_icon('trash.png'))
|
||||
self.clear_button.setToolTip('Clear all settings for this plugin')
|
||||
self.clear_button.clicked.connect(self._clear_settings)
|
||||
layout.addWidget(button_box)
|
||||
|
||||
def _populate_settings(self):
|
||||
self.keys_list.clear()
|
||||
ns_prefix = self._get_ns_prefix()
|
||||
keys = sorted([k[len(ns_prefix):] for k in self.db.prefs.iterkeys()
|
||||
if k.startswith(ns_prefix)])
|
||||
for key in keys:
|
||||
self.keys_list.addItem(key)
|
||||
self.keys_list.setMinimumWidth(self.keys_list.sizeHintForColumn(0))
|
||||
self.keys_list.currentRowChanged[int].connect(self._current_row_changed)
|
||||
|
||||
def _current_row_changed(self, new_row):
|
||||
if new_row < 0:
|
||||
self.value_text.clear()
|
||||
return
|
||||
key = unicode(self.keys_list.currentItem().text())
|
||||
val = self.db.prefs.get_namespaced(self.namespace, key, '')
|
||||
self.value_text.setPlainText(self.db.prefs.to_raw(val))
|
||||
|
||||
def _get_ns_prefix(self):
|
||||
return 'namespaced:%s:'% self.namespace
|
||||
|
||||
def _clear_settings(self):
|
||||
from calibre.gui2.dialogs.confirm_delete import confirm
|
||||
message = '<p>Are you sure you want to clear your settings in this library for this plugin?</p>' \
|
||||
'<p>Any settings in other libraries or stored in a JSON file in your calibre plugins ' \
|
||||
'folder will not be touched.</p>' \
|
||||
'<p>You must restart calibre afterwards.</p>'
|
||||
if not confirm(message, self.namespace+'_clear_settings', self):
|
||||
return
|
||||
ns_prefix = self._get_ns_prefix()
|
||||
keys = [k for k in self.db.prefs.iterkeys() if k.startswith(ns_prefix)]
|
||||
for k in keys:
|
||||
del self.db.prefs[k]
|
||||
self._populate_settings()
|
||||
d = info_dialog(self, 'Settings deleted',
|
||||
'<p>All settings for this plugin in this library have been cleared.</p>'
|
||||
'<p>Please restart calibre now.</p>',
|
||||
show_copy_button=False)
|
||||
b = d.bb.addButton(_('Restart calibre now'), d.bb.AcceptRole)
|
||||
b.setIcon(QIcon(I('lt.png')))
|
||||
d.do_restart = False
|
||||
def rf():
|
||||
d.do_restart = True
|
||||
b.clicked.connect(rf)
|
||||
d.set_details('')
|
||||
d.exec_()
|
||||
b.clicked.disconnect()
|
||||
self.close()
|
||||
if d.do_restart:
|
||||
self.gui.quit(restart=True)
|
||||
|
||||
1094
calibre-plugin/config.py
Normal file
1094
calibre-plugin/config.py
Normal file
File diff suppressed because it is too large
Load diff
1125
calibre-plugin/dialogs.py
Normal file
1125
calibre-plugin/dialogs.py
Normal file
File diff suppressed because it is too large
Load diff
2125
calibre-plugin/ffdl_plugin.py
Normal file
2125
calibre-plugin/ffdl_plugin.py
Normal file
File diff suppressed because it is too large
Load diff
43
calibre-plugin/ffdl_util.py
Normal file
43
calibre-plugin/ffdl_util.py
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Jim Miller'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from StringIO import StringIO
|
||||
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, exceptions
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.configurable import Configuration
|
||||
from calibre_plugins.fanfictiondownloader_plugin.prefs import (prefs)
|
||||
|
||||
def get_ffdl_personalini():
|
||||
if prefs['includeimages']:
|
||||
# this is a cheat to make it easier for users.
|
||||
return '''[epub]
|
||||
include_images:true
|
||||
keep_summary_html:true
|
||||
make_firstimage_cover:true
|
||||
''' + prefs['personal.ini']
|
||||
else:
|
||||
return prefs['personal.ini']
|
||||
|
||||
def get_ffdl_config(url,fileform="epub",personalini=None):
|
||||
if not personalini:
|
||||
personalini = get_ffdl_personalini()
|
||||
site='unknown'
|
||||
try:
|
||||
site = adapters.getConfigSectionFor(url)
|
||||
except Exception as e:
|
||||
print("Failed trying to get ini config for url(%s): %s, using section [%s] instead"%(url,e,site))
|
||||
configuration = Configuration(site,fileform)
|
||||
configuration.readfp(StringIO(get_resources("plugin-defaults.ini")))
|
||||
configuration.readfp(StringIO(personalini))
|
||||
|
||||
return configuration
|
||||
|
||||
def get_ffdl_adapter(url,fileform="epub",personalini=None):
|
||||
return adapters.getAdapter(get_ffdl_config(url,fileform,personalini),url)
|
||||
|
||||
BIN
calibre-plugin/images/icon.png
Normal file
BIN
calibre-plugin/images/icon.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 24 KiB |
BIN
calibre-plugin/images/icon.xcf
Normal file
BIN
calibre-plugin/images/icon.xcf
Normal file
Binary file not shown.
244
calibre-plugin/jobs.py
Normal file
244
calibre-plugin/jobs.py
Normal file
|
|
@ -0,0 +1,244 @@
|
|||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Jim Miller'
|
||||
__copyright__ = '2011, Grant Drake <grant.drake@gmail.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
import time, traceback
|
||||
|
||||
from calibre.utils.ipc.server import Server
|
||||
from calibre.utils.ipc.job import ParallelJob
|
||||
from calibre.constants import numeric_version as calibre_version
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
#
|
||||
# Functions to perform downloads using worker jobs
|
||||
#
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
def do_download_worker(book_list, options,
|
||||
cpus, notification=lambda x,y:x):
|
||||
'''
|
||||
Master job, to launch child jobs to extract ISBN for a set of books
|
||||
This is run as a worker job in the background to keep the UI more
|
||||
responsive and get around the memory leak issues as it will launch
|
||||
a child job for each book as a worker process
|
||||
'''
|
||||
server = Server(pool_size=cpus)
|
||||
|
||||
logger.info(options['version'])
|
||||
total = 0
|
||||
alreadybad = []
|
||||
# Queue all the jobs
|
||||
logger.info("Adding jobs for URLs:")
|
||||
for book in book_list:
|
||||
logger.info("%s"%book['url'])
|
||||
if book['good']:
|
||||
total += 1
|
||||
args = ['calibre_plugins.fanfictiondownloader_plugin.jobs',
|
||||
'do_download_for_worker',
|
||||
(book,options)]
|
||||
job = ParallelJob('arbitrary_n',
|
||||
"url:(%s) id:(%s)"%(book['url'],book['calibre_id']),
|
||||
done=None,
|
||||
args=args)
|
||||
job._book = book
|
||||
server.add_job(job)
|
||||
else:
|
||||
# was already bad before the subprocess ever started.
|
||||
alreadybad.append(book)
|
||||
|
||||
# This server is an arbitrary_n job, so there is a notifier available.
|
||||
# Set the % complete to a small number to avoid the 'unavailable' indicator
|
||||
notification(0.01, _('Downloading FanFiction Stories'))
|
||||
|
||||
# dequeue the job results as they arrive, saving the results
|
||||
count = 0
|
||||
while True:
|
||||
job = server.changed_jobs_queue.get()
|
||||
# A job can 'change' when it is not finished, for example if it
|
||||
# produces a notification. Ignore these.
|
||||
job.update()
|
||||
if not job.is_finished:
|
||||
continue
|
||||
# A job really finished. Get the information.
|
||||
book_list.remove(job._book)
|
||||
book_list.append(job.result)
|
||||
book_id = job._book['calibre_id']
|
||||
count = count + 1
|
||||
notification(float(count)/total, '%d of %d stories finished downloading'%(count,total))
|
||||
# Add this job's output to the current log
|
||||
logger.info('Logfile for book ID %s (%s)'%(book_id, job._book['title']))
|
||||
logger.info(job.details)
|
||||
|
||||
if count >= total:
|
||||
logger.info("\n"+_("Successful:")+"\n%s\n"%("\n".join([book['url'] for book in
|
||||
filter(lambda x: x['good'], book_list) ] ) ) )
|
||||
logger.info("\n"+_("Unsuccessful:")+"\n%s\n"%("\n".join([book['url'] for book in
|
||||
filter(lambda x: not x['good'], book_list) ] ) ) )
|
||||
break
|
||||
|
||||
server.close()
|
||||
|
||||
# return the book list as the job result
|
||||
return book_list
|
||||
|
||||
def do_download_for_worker(book,options,notification=lambda x,y:x):
|
||||
'''
|
||||
Child job, to download story when run as a worker job
|
||||
'''
|
||||
|
||||
from calibre_plugins.fanfictiondownloader_plugin import FanFictionDownLoaderBase
|
||||
ffdlbase = FanFictionDownLoaderBase(options['plugin_path'])
|
||||
with ffdlbase:
|
||||
|
||||
from calibre_plugins.fanfictiondownloader_plugin.dialogs import (NotGoingToDownload,
|
||||
OVERWRITE, OVERWRITEALWAYS, UPDATE, UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY)
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, writers, exceptions
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.epubutils import get_update_data
|
||||
|
||||
from calibre_plugins.fanfictiondownloader_plugin.ffdl_util import (get_ffdl_adapter, get_ffdl_config)
|
||||
|
||||
try:
|
||||
book['comment'] = _('Download started...')
|
||||
|
||||
configuration = get_ffdl_config(book['url'],
|
||||
options['fileform'],
|
||||
options['personal.ini'])
|
||||
|
||||
if not options['updateepubcover'] and 'epub_for_update' in book and options['collision'] in (UPDATE, UPDATEALWAYS):
|
||||
configuration.set("overrides","never_make_cover","true")
|
||||
|
||||
# images only for epub, html, even if the user mistakenly
|
||||
# turned it on else where.
|
||||
if options['fileform'] not in ("epub","html"):
|
||||
configuration.set("overrides","include_images","false")
|
||||
|
||||
adapter = adapters.getAdapter(configuration,book['url'])
|
||||
adapter.is_adult = book['is_adult']
|
||||
adapter.username = book['username']
|
||||
adapter.password = book['password']
|
||||
adapter.setChaptersRange(book['begin'],book['end'])
|
||||
|
||||
adapter.load_cookiejar(options['cookiejarfile'])
|
||||
logger.debug("cookiejar:%s"%adapter.cookiejar)
|
||||
adapter.set_pagecache(options['pagecache'])
|
||||
|
||||
story = adapter.getStoryMetadataOnly()
|
||||
if 'calibre_series' in book:
|
||||
adapter.setSeries(book['calibre_series'][0],book['calibre_series'][1])
|
||||
|
||||
# set PI version instead of default.
|
||||
if 'version' in options:
|
||||
story.setMetadata('version',options['version'])
|
||||
|
||||
writer = writers.getWriter(options['fileform'],configuration,adapter)
|
||||
|
||||
outfile = book['outfile']
|
||||
|
||||
## No need to download at all. Shouldn't ever get down here.
|
||||
if options['collision'] in (CALIBREONLY):
|
||||
logger.info("Skipping CALIBREONLY 'update' down inside worker--this shouldn't be happening...")
|
||||
book['comment'] = 'Metadata collected.'
|
||||
|
||||
## checks were done earlier, it's new or not dup or newer--just write it.
|
||||
elif options['collision'] in (ADDNEW, SKIP, OVERWRITE, OVERWRITEALWAYS) or \
|
||||
('epub_for_update' not in book and options['collision'] in (UPDATE, UPDATEALWAYS)):
|
||||
|
||||
# preserve logfile even on overwrite.
|
||||
if 'epub_for_update' in book:
|
||||
(urlignore,
|
||||
chaptercountignore,
|
||||
oldchaptersignore,
|
||||
oldimgsignore,
|
||||
oldcoverignore,
|
||||
calibrebookmarkignore,
|
||||
# only logfile set in adapter, so others aren't used.
|
||||
adapter.logfile) = get_update_data(book['epub_for_update'])
|
||||
|
||||
# change the existing entries id to notid so
|
||||
# write_epub writes a whole new set to indicate overwrite.
|
||||
if adapter.logfile:
|
||||
adapter.logfile = adapter.logfile.replace("span id","span notid")
|
||||
|
||||
logger.info("write to %s"%outfile)
|
||||
writer.writeStory(outfilename=outfile, forceOverwrite=True)
|
||||
book['comment'] = 'Download %s completed, %s chapters.'%(options['fileform'],story.getMetadata("numChapters"))
|
||||
|
||||
## checks were done earlier, just update it.
|
||||
elif 'epub_for_update' in book and options['collision'] in (UPDATE, UPDATEALWAYS):
|
||||
|
||||
# update now handled by pre-populating the old images and
|
||||
# chapters in the adapter rather than merging epubs.
|
||||
urlchaptercount = int(story.getMetadata('numChapters').replace(',',''))
|
||||
(url,
|
||||
chaptercount,
|
||||
adapter.oldchapters,
|
||||
adapter.oldimgs,
|
||||
adapter.oldcover,
|
||||
adapter.calibrebookmark,
|
||||
adapter.logfile) = get_update_data(book['epub_for_update'])
|
||||
|
||||
# dup handling from ffdl_plugin needed for anthology updates.
|
||||
if options['collision'] == UPDATE:
|
||||
if chaptercount == urlchaptercount:
|
||||
book['comment']=_("Already contains %d chapters. Reuse as is.")%chaptercount
|
||||
book['outfile'] = book['epub_for_update'] # for anthology merge ops.
|
||||
return book
|
||||
|
||||
# dup handling from ffdl_plugin needed for anthology updates.
|
||||
if chaptercount > urlchaptercount:
|
||||
raise NotGoingToDownload(_("Existing epub contains %d chapters, web site only has %d. Use Overwrite to force update.") % (chaptercount,urlchaptercount),'dialog_error.png')
|
||||
|
||||
if not (options['collision'] == UPDATEALWAYS and chaptercount == urlchaptercount) \
|
||||
and adapter.getConfig("do_update_hook"):
|
||||
chaptercount = adapter.hookForUpdates(chaptercount)
|
||||
|
||||
logger.info("Do update - epub(%d) vs url(%d)" % (chaptercount, urlchaptercount))
|
||||
logger.info("write to %s"%outfile)
|
||||
|
||||
writer.writeStory(outfilename=outfile, forceOverwrite=True)
|
||||
|
||||
book['comment'] = _('Update %s completed, added %s chapters for %s total.')%\
|
||||
(options['fileform'],(urlchaptercount-chaptercount),urlchaptercount)
|
||||
|
||||
if options['smarten_punctuation'] and options['fileform'] == "epub" \
|
||||
and calibre_version >= (0, 9, 39):
|
||||
# for smarten punc
|
||||
from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS
|
||||
from calibre.utils.logging import Log
|
||||
from collections import namedtuple
|
||||
|
||||
# do smarten_punctuation from calibre's polish feature
|
||||
data = {'smarten_punctuation':True}
|
||||
opts = ALL_OPTS.copy()
|
||||
opts.update(data)
|
||||
O = namedtuple('Options', ' '.join(ALL_OPTS.iterkeys()))
|
||||
opts = O(**opts)
|
||||
|
||||
log = Log(level=Log.DEBUG)
|
||||
# report = []
|
||||
polish({outfile:outfile}, opts, log, logger.info) # report.append
|
||||
|
||||
except NotGoingToDownload as d:
|
||||
book['good']=False
|
||||
book['comment']=unicode(d)
|
||||
book['icon'] = d.icon
|
||||
|
||||
except Exception as e:
|
||||
book['good']=False
|
||||
book['comment']=unicode(e)
|
||||
book['icon']='dialog_error.png'
|
||||
book['status'] = 'Error'
|
||||
logger.info("Exception: %s:%s"%(book,unicode(e)))
|
||||
traceback.print_exc()
|
||||
|
||||
#time.sleep(10)
|
||||
return book
|
||||
150
calibre-plugin/prefs.py
Normal file
150
calibre-plugin/prefs.py
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Jim Miller'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import copy
|
||||
|
||||
from calibre.utils.config import JSONConfig
|
||||
from calibre.gui2.ui import get_gui
|
||||
|
||||
from calibre_plugins.fanfictiondownloader_plugin.dialogs import OVERWRITE
|
||||
from calibre_plugins.fanfictiondownloader_plugin.common_utils import get_library_uuid
|
||||
PREFS_NAMESPACE = 'FanFictionDownLoaderPlugin'
|
||||
PREFS_KEY_SETTINGS = 'settings'
|
||||
|
||||
# Set defaults used by all. Library specific settings continue to
|
||||
# take from here.
|
||||
default_prefs = {}
|
||||
default_prefs['personal.ini'] = get_resources('plugin-example.ini')
|
||||
default_prefs['rejecturls'] = ''
|
||||
default_prefs['rejectreasons'] = '''Sucked
|
||||
Boring
|
||||
Dup from another site'''
|
||||
default_prefs['reject_always'] = False
|
||||
|
||||
default_prefs['updatemeta'] = True
|
||||
default_prefs['updatecover'] = False
|
||||
default_prefs['updateepubcover'] = False
|
||||
default_prefs['keeptags'] = False
|
||||
default_prefs['suppressauthorsort'] = False
|
||||
default_prefs['suppresstitlesort'] = False
|
||||
default_prefs['mark'] = False
|
||||
default_prefs['showmarked'] = False
|
||||
default_prefs['autoconvert'] = False
|
||||
default_prefs['urlsfromclip'] = True
|
||||
default_prefs['updatedefault'] = True
|
||||
default_prefs['fileform'] = 'epub'
|
||||
default_prefs['collision'] = OVERWRITE
|
||||
default_prefs['deleteotherforms'] = False
|
||||
default_prefs['adddialogstaysontop'] = False
|
||||
default_prefs['includeimages'] = False
|
||||
default_prefs['lookforurlinhtml'] = False
|
||||
default_prefs['checkforseriesurlid'] = True
|
||||
default_prefs['checkforurlchange'] = True
|
||||
default_prefs['injectseries'] = False
|
||||
default_prefs['smarten_punctuation'] = False
|
||||
|
||||
default_prefs['send_lists'] = ''
|
||||
default_prefs['read_lists'] = ''
|
||||
default_prefs['addtolists'] = False
|
||||
default_prefs['addtoreadlists'] = False
|
||||
default_prefs['addtolistsonread'] = False
|
||||
|
||||
default_prefs['gcnewonly'] = False
|
||||
default_prefs['gc_site_settings'] = {}
|
||||
default_prefs['allow_gc_from_ini'] = True
|
||||
default_prefs['gc_polish_cover'] = False
|
||||
|
||||
default_prefs['countpagesstats'] = []
|
||||
|
||||
default_prefs['errorcol'] = ''
|
||||
default_prefs['custom_cols'] = {}
|
||||
default_prefs['custom_cols_newonly'] = {}
|
||||
default_prefs['allow_custcol_from_ini'] = True
|
||||
|
||||
default_prefs['std_cols_newonly'] = {}
|
||||
|
||||
# This is where all preferences for this plugin *were* stored
|
||||
# Remember that this name (i.e. plugins/fanfictiondownloader_plugin) is also
|
||||
# in a global namespace, so make it as unique as possible.
|
||||
# You should always prefix your config file name with plugins/,
|
||||
# so as to ensure you dont accidentally clobber a calibre config file
|
||||
old_prefs = JSONConfig('plugins/fanfictiondownloader_plugin')
|
||||
|
||||
def set_library_config(library_config,db):
|
||||
db.prefs.set_namespaced(PREFS_NAMESPACE,
|
||||
PREFS_KEY_SETTINGS,
|
||||
library_config)
|
||||
|
||||
def get_library_config(db):
|
||||
library_id = get_library_uuid(db)
|
||||
library_config = None
|
||||
# Check whether this is a configuration needing to be migrated
|
||||
# from json into database. If so: get it, set it, rename it in json.
|
||||
if library_id in old_prefs:
|
||||
#print("get prefs from old_prefs")
|
||||
library_config = old_prefs[library_id]
|
||||
set_library_config(library_config,db)
|
||||
old_prefs["migrated to library db %s"%library_id] = old_prefs[library_id]
|
||||
del old_prefs[library_id]
|
||||
|
||||
if library_config is None:
|
||||
#print("get prefs from db")
|
||||
library_config = db.prefs.get_namespaced(PREFS_NAMESPACE, PREFS_KEY_SETTINGS,
|
||||
copy.deepcopy(default_prefs))
|
||||
return library_config
|
||||
|
||||
# fake out so I don't have to change the prefs calls anywhere. The
|
||||
# Java programmer in me is offended by op-overloading, but it's very
|
||||
# tidy.
|
||||
class PrefsFacade():
|
||||
def _get_db(self):
|
||||
if self.passed_db:
|
||||
return self.passed_db
|
||||
else:
|
||||
# In the GUI plugin we want current db so we detect when
|
||||
# it's changed. CLI plugin calls need to pass db in.
|
||||
return get_gui().current_db
|
||||
|
||||
def __init__(self,passed_db=None):
|
||||
self.default_prefs = default_prefs
|
||||
self.libraryid = None
|
||||
self.current_prefs = None
|
||||
self.passed_db=passed_db
|
||||
|
||||
def _get_prefs(self):
|
||||
libraryid = get_library_uuid(self._get_db())
|
||||
if self.current_prefs == None or self.libraryid != libraryid:
|
||||
#print("self.current_prefs == None(%s) or self.libraryid != libraryid(%s)"%(self.current_prefs == None,self.libraryid != libraryid))
|
||||
self.libraryid = libraryid
|
||||
self.current_prefs = get_library_config(self._get_db())
|
||||
return self.current_prefs
|
||||
|
||||
def __getitem__(self,k):
|
||||
prefs = self._get_prefs()
|
||||
if k not in prefs:
|
||||
# pulls from default_prefs.defaults automatically if not set
|
||||
# in default_prefs
|
||||
return self.default_prefs[k]
|
||||
return prefs[k]
|
||||
|
||||
def __setitem__(self,k,v):
|
||||
prefs = self._get_prefs()
|
||||
prefs[k]=v
|
||||
# self._save_prefs(prefs)
|
||||
|
||||
def __delitem__(self,k):
|
||||
prefs = self._get_prefs()
|
||||
if k in prefs:
|
||||
del prefs[k]
|
||||
|
||||
def save_to_db(self):
|
||||
set_library_config(self._get_prefs(),self._get_db())
|
||||
|
||||
prefs = PrefsFacade()
|
||||
|
||||
1626
calibre-plugin/translations/de.po
Normal file
1626
calibre-plugin/translations/de.po
Normal file
File diff suppressed because it is too large
Load diff
1625
calibre-plugin/translations/es.po
Normal file
1625
calibre-plugin/translations/es.po
Normal file
File diff suppressed because it is too large
Load diff
1624
calibre-plugin/translations/fr.po
Normal file
1624
calibre-plugin/translations/fr.po
Normal file
File diff suppressed because it is too large
Load diff
1519
calibre-plugin/translations/messages.pot
Normal file
1519
calibre-plugin/translations/messages.pot
Normal file
File diff suppressed because it is too large
Load diff
1624
calibre-plugin/translations/pt_BR.po
Normal file
1624
calibre-plugin/translations/pt_BR.po
Normal file
File diff suppressed because it is too large
Load diff
1794
calibre-plugin/translations/zz.po
Normal file
1794
calibre-plugin/translations/zz.po
Normal file
File diff suppressed because it is too large
Load diff
10
cron.yaml
Normal file
10
cron.yaml
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
cron:
|
||||
- description: cleanup job
|
||||
url: /r3m0v3r
|
||||
schedule: every 2 hours
|
||||
|
||||
# There's a bug in the Python 2.7 runtime that prevents this from
|
||||
# working properly. In theory, there should never be orphans anyway.
|
||||
#- description: orphan cleanup job
|
||||
# url: /r3m0v3rOrphans
|
||||
# schedule: every 4 hours
|
||||
73
css/index.css
Normal file
73
css/index.css
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
body
|
||||
{
|
||||
font: 0.9em "Helvetica Neue", Arial, Helvetica, Geneva, sans-serif;
|
||||
}
|
||||
|
||||
#main
|
||||
{
|
||||
width: 60%;
|
||||
margin-left: 20%;
|
||||
background-color: #dae6ff;
|
||||
padding: 2em;
|
||||
}
|
||||
|
||||
#greeting
|
||||
{
|
||||
# margin-bottom: 1em;
|
||||
border-color: #efefef;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#logpassword:hover, #logpasswordtable:hover, #urlbox:hover, #typebox:hover, #helpbox:hover, #yourfile:hover
|
||||
{
|
||||
border: thin solid #fffeff;
|
||||
}
|
||||
|
||||
h1
|
||||
{
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
#logpasswordtable
|
||||
{
|
||||
padding: 1em;
|
||||
}
|
||||
|
||||
#logpassword, #logpasswordtable {
|
||||
// display: none;
|
||||
}
|
||||
|
||||
#urlbox, #typebox, #logpasswordtable, #logpassword, #helpbox, #yourfile
|
||||
{
|
||||
margin: 1em;
|
||||
padding: 1em;
|
||||
border: thin dotted #fffeff;
|
||||
}
|
||||
|
||||
div.field
|
||||
{
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
|
||||
#submitbtn
|
||||
{
|
||||
padding: 1em;
|
||||
}
|
||||
|
||||
#typelabel
|
||||
{
|
||||
}
|
||||
|
||||
#typeoptions
|
||||
{
|
||||
margin-top: 0.5em;
|
||||
}
|
||||
|
||||
#error
|
||||
{
|
||||
color: #f00;
|
||||
}
|
||||
.recent {
|
||||
font-size: large;
|
||||
}
|
||||
1997
defaults.ini
Normal file
1997
defaults.ini
Normal file
File diff suppressed because it is too large
Load diff
59
delete_fic.py
Normal file
59
delete_fic.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
import os
|
||||
import cgi
|
||||
import sys
|
||||
import logging
|
||||
import traceback
|
||||
import StringIO
|
||||
|
||||
from google.appengine.api import users
|
||||
from google.appengine.ext import webapp
|
||||
from google.appengine.ext.webapp import util
|
||||
|
||||
from fanficdownloader.downaloder import *
|
||||
from fanficdownloader.ffnet import *
|
||||
from fanficdownloader.output import *
|
||||
|
||||
from google.appengine.ext import db
|
||||
|
||||
from fanficdownloader.zipdir import *
|
||||
|
||||
from ffstorage import *
|
||||
|
||||
def create_mac(user, fic_id, fic_url):
|
||||
return str(abs(hash(user)+hash(fic_id)))+str(abs(hash(fic_url)))
|
||||
|
||||
def check_mac(user, fic_id, fic_url, mac):
|
||||
return (create_mac(user, fic_id, fic_url) == mac)
|
||||
|
||||
def create_mac_for_fic(user, fic_id):
|
||||
key = db.Key(fic_id)
|
||||
fanfic = db.get(key)
|
||||
if fanfic.user != user:
|
||||
return None
|
||||
else:
|
||||
return create_mac(user, key, fanfic.url)
|
||||
|
||||
class DeleteFicHandler(webapp.RequestHandler):
|
||||
def get(self):
|
||||
user = users.get_current_user()
|
||||
if not user:
|
||||
self.redirect('/login')
|
||||
|
||||
fic_id = self.request.get('fic_id')
|
||||
fic_mac = self.request.get('key_id')
|
||||
|
||||
actual_mac = create_mac_for_fic(user, fic_id)
|
||||
if actual_mac != fic_mac:
|
||||
self.response.out.write("Ooops")
|
||||
else:
|
||||
key = db.Key(fic_id)
|
||||
fanfic = db.get(key)
|
||||
fanfic.delete()
|
||||
self.redirect('/recent')
|
||||
|
||||
|
||||
fics = db.GqlQuery("Select * From DownloadedFanfic WHERE user = :1", user)
|
||||
template_values = dict(fics = fics, nickname = user.nickname())
|
||||
path = os.path.join(os.path.dirname(__file__), 'recent.html')
|
||||
self.response.out.write(template.render(path, template_values))
|
||||
|
||||
333
downloader.py
Normal file
333
downloader.py
Normal file
|
|
@ -0,0 +1,333 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import sys, os
|
||||
from os.path import normpath, expanduser, isfile, join
|
||||
from StringIO import StringIO
|
||||
from optparse import OptionParser
|
||||
import getpass
|
||||
import string
|
||||
import ConfigParser
|
||||
from subprocess import call
|
||||
import pprint
|
||||
|
||||
def insert_into_python_path():
|
||||
"""
|
||||
Inserts this directory into the Python path, making the packages contained
|
||||
within directly importable
|
||||
"""
|
||||
path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
# Insert at the beginning of the Python path to give the packages contained
|
||||
# within this directory the highest priority when importing. This is done
|
||||
# to avoid importing potentially existing different versions of the
|
||||
# packages on the Python path
|
||||
sys.path.insert(0, path)
|
||||
|
||||
import logging
|
||||
if sys.version_info >= (2, 7):
|
||||
# suppresses default logger. Logging is setup in fanficdownload/__init__.py so it works in calibre, too.
|
||||
rootlogger = logging.getLogger()
|
||||
loghandler=logging.NullHandler()
|
||||
loghandler.setFormatter(logging.Formatter("(=====)(levelname)s:%(message)s"))
|
||||
rootlogger.addHandler(loghandler)
|
||||
|
||||
try:
|
||||
from calibre.constants import numeric_version as calibre_version
|
||||
is_calibre = True
|
||||
except:
|
||||
is_calibre = False
|
||||
|
||||
# using try/except directly was masking errors during development.
|
||||
if is_calibre:
|
||||
# running under calibre
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters,writers,exceptions
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.configurable import Configuration
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.epubutils import get_dcsource_chaptercount, get_update_data
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.geturls import get_urls_from_page
|
||||
else:
|
||||
from fanficdownloader import adapters,writers,exceptions
|
||||
from fanficdownloader.configurable import Configuration
|
||||
from fanficdownloader.epubutils import get_dcsource_chaptercount, get_update_data
|
||||
from fanficdownloader.geturls import get_urls_from_page
|
||||
|
||||
|
||||
if sys.version_info < (2, 5):
|
||||
print "This program requires Python 2.5 or newer."
|
||||
sys.exit(1)
|
||||
|
||||
def writeStory(config,adapter,writeformat,metaonly=False,outstream=None):
|
||||
writer = writers.getWriter(writeformat,config,adapter)
|
||||
writer.writeStory(outstream=outstream,metaonly=metaonly)
|
||||
output_filename=writer.getOutputFileName()
|
||||
del writer
|
||||
return output_filename
|
||||
|
||||
def main(argv,
|
||||
parser=None,
|
||||
passed_defaultsini=None,
|
||||
passed_personalini=None):
|
||||
# read in args, anything starting with -- will be treated as --<varible>=<value>
|
||||
if not parser:
|
||||
parser = OptionParser("usage: %prog [options] storyurl")
|
||||
parser.add_option("-f", "--format", dest="format", default="epub",
|
||||
help="write story as FORMAT, epub(default), mobi, text or html", metavar="FORMAT")
|
||||
|
||||
if passed_defaultsini:
|
||||
config_help="read config from specified file(s) in addition to calibre plugin personal.ini, ~/.fanficdownloader/personal.ini, and ./personal.ini"
|
||||
else:
|
||||
config_help="read config from specified file(s) in addition to ~/.fanficdownloader/defaults.ini, ~/.fanficdownloader/personal.ini, ./defaults.ini, and ./personal.ini"
|
||||
parser.add_option("-c", "--config",
|
||||
action="append", dest="configfile", default=None,
|
||||
help=config_help, metavar="CONFIG")
|
||||
parser.add_option("-b", "--begin", dest="begin", default=None,
|
||||
help="Begin with Chapter START", metavar="START")
|
||||
parser.add_option("-e", "--end", dest="end", default=None,
|
||||
help="End with Chapter END", metavar="END")
|
||||
parser.add_option("-o", "--option",
|
||||
action="append", dest="options",
|
||||
help="set an option NAME=VALUE", metavar="NAME=VALUE")
|
||||
parser.add_option("-m", "--meta-only",
|
||||
action="store_true", dest="metaonly",
|
||||
help="Retrieve metadata and stop. Or, if --update-epub, update metadata title page only.",)
|
||||
parser.add_option("-u", "--update-epub",
|
||||
action="store_true", dest="update",
|
||||
help="Update an existing epub with new chapters, give epub filename instead of storyurl.",)
|
||||
parser.add_option("--update-cover",
|
||||
action="store_true", dest="updatecover",
|
||||
help="Update cover in an existing epub, otherwise existing cover (if any) is used on update. Only valid with --update-epub.",)
|
||||
parser.add_option("--force",
|
||||
action="store_true", dest="force",
|
||||
help="Force overwrite of an existing epub, download and overwrite all chapters.",)
|
||||
parser.add_option("-l", "--list",
|
||||
action="store_true", dest="list",
|
||||
help="Get list of valid story URLs from page given.",)
|
||||
parser.add_option("-n", "--normalize-list",
|
||||
action="store_true", dest="normalize",default=False,
|
||||
help="Get list of valid story URLs from page given, but normalized to standard forms.",)
|
||||
parser.add_option("-s", "--sites-list",
|
||||
action="store_true", dest="siteslist",default=False,
|
||||
help="Get list of valid story URLs examples.",)
|
||||
parser.add_option("-d", "--debug",
|
||||
action="store_true", dest="debug",
|
||||
help="Show debug output while downloading.",)
|
||||
|
||||
(options, args) = parser.parse_args(argv)
|
||||
|
||||
if not options.debug:
|
||||
logger = logging.getLogger("fanficdownloader")
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
if not options.siteslist and len(args) != 1:
|
||||
parser.error("incorrect number of arguments")
|
||||
|
||||
if options.siteslist:
|
||||
for (site,examples) in adapters.getSiteExamples():
|
||||
print("\n====%s====\n\nExample URLs:"%site)
|
||||
for u in examples:
|
||||
print(" * %s"%u)
|
||||
return
|
||||
|
||||
if options.update and options.format != 'epub':
|
||||
parser.error("-u/--update-epub only works with epub")
|
||||
|
||||
## Attempt to update an existing epub.
|
||||
chaptercount = None
|
||||
output_filename = None
|
||||
if options.update:
|
||||
try:
|
||||
(url,chaptercount) = get_dcsource_chaptercount(args[0])
|
||||
if not url:
|
||||
print "No story URL found in epub to update."
|
||||
return
|
||||
print "Updating %s, URL: %s" % (args[0],url)
|
||||
output_filename = args[0]
|
||||
except:
|
||||
# if there's an error reading the update file, maybe it's a URL?
|
||||
# we'll look for an existing outputfile down below.
|
||||
url = args[0]
|
||||
else:
|
||||
url = args[0]
|
||||
|
||||
try:
|
||||
configuration = Configuration(adapters.getConfigSectionFor(url),options.format)
|
||||
except exceptions.UnknownSite, e:
|
||||
if options.list or options.normalize:
|
||||
# list for page doesn't have to be a supported site.
|
||||
configuration = Configuration("test1.com",options.format)
|
||||
else:
|
||||
raise e
|
||||
|
||||
conflist = []
|
||||
homepath = join(expanduser("~"),".fanficdownloader")
|
||||
|
||||
if passed_defaultsini:
|
||||
configuration.readfp(passed_defaultsini)
|
||||
|
||||
if isfile(join(homepath,"defaults.ini")):
|
||||
conflist.append(join(homepath,"defaults.ini"))
|
||||
if isfile("defaults.ini"):
|
||||
conflist.append("defaults.ini")
|
||||
|
||||
if passed_personalini:
|
||||
configuration.readfp(passed_personalini)
|
||||
|
||||
if isfile(join(homepath,"personal.ini")):
|
||||
conflist.append(join(homepath,"personal.ini"))
|
||||
if isfile("personal.ini"):
|
||||
conflist.append("personal.ini")
|
||||
|
||||
if options.configfile:
|
||||
conflist.extend(options.configfile)
|
||||
|
||||
logging.debug('reading %s config file(s), if present'%conflist)
|
||||
configuration.read(conflist)
|
||||
|
||||
try:
|
||||
configuration.add_section("overrides")
|
||||
except ConfigParser.DuplicateSectionError:
|
||||
pass
|
||||
|
||||
if options.force:
|
||||
configuration.set("overrides","always_overwrite","true")
|
||||
|
||||
if options.update and chaptercount:
|
||||
configuration.set("overrides","output_filename",output_filename)
|
||||
|
||||
if options.update and not options.updatecover:
|
||||
configuration.set("overrides","never_make_cover","true")
|
||||
|
||||
# images only for epub, even if the user mistakenly turned it
|
||||
# on else where.
|
||||
if options.format not in ("epub","html"):
|
||||
configuration.set("overrides","include_images","false")
|
||||
|
||||
if options.options:
|
||||
for opt in options.options:
|
||||
(var,val) = opt.split('=')
|
||||
configuration.set("overrides",var,val)
|
||||
|
||||
if options.list or options.normalize:
|
||||
retlist = get_urls_from_page(args[0], configuration, normalize=options.normalize)
|
||||
print "\n".join(retlist)
|
||||
return
|
||||
|
||||
try:
|
||||
adapter = adapters.getAdapter(configuration,url)
|
||||
adapter.setChaptersRange(options.begin,options.end)
|
||||
|
||||
# check for updating from URL (vs from file)
|
||||
if options.update and not chaptercount:
|
||||
try:
|
||||
writer = writers.getWriter("epub",configuration,adapter)
|
||||
output_filename=writer.getOutputFileName()
|
||||
(noturl,chaptercount) = get_dcsource_chaptercount(output_filename)
|
||||
print "Updating %s, URL: %s" % (output_filename,url)
|
||||
except:
|
||||
options.update = False
|
||||
pass
|
||||
|
||||
## Check for include_images without no_image_processing. In absence of PIL, give warning.
|
||||
if adapter.getConfig('include_images') and not adapter.getConfig('no_image_processing'):
|
||||
try:
|
||||
from calibre.utils.magick import Image
|
||||
logging.debug("Using calibre.utils.magick")
|
||||
except:
|
||||
try:
|
||||
import Image
|
||||
logging.debug("Using PIL")
|
||||
except:
|
||||
print "You have include_images enabled, but Python Image Library(PIL) isn't found.\nImages will be included full size in original format.\nContinue? (y/n)?"
|
||||
if not sys.stdin.readline().strip().lower().startswith('y'):
|
||||
return
|
||||
|
||||
## three tries, that's enough if both user/pass & is_adult needed,
|
||||
## or a couple tries of one or the other
|
||||
for x in range(0,2):
|
||||
try:
|
||||
adapter.getStoryMetadataOnly()
|
||||
except exceptions.FailedToLogin, f:
|
||||
if f.passwdonly:
|
||||
print "Story requires a password."
|
||||
else:
|
||||
print "Login Failed, Need Username/Password."
|
||||
sys.stdout.write("Username: ")
|
||||
adapter.username = sys.stdin.readline().strip()
|
||||
adapter.password = getpass.getpass(prompt='Password: ')
|
||||
#print("Login: `%s`, Password: `%s`" % (adapter.username, adapter.password))
|
||||
except exceptions.AdultCheckRequired:
|
||||
print "Please confirm you are an adult in your locale: (y/n)?"
|
||||
if sys.stdin.readline().strip().lower().startswith('y'):
|
||||
adapter.is_adult=True
|
||||
|
||||
if options.update and not options.force:
|
||||
urlchaptercount = int(adapter.getStoryMetadataOnly().getMetadata('numChapters'))
|
||||
|
||||
if chaptercount == urlchaptercount and not options.metaonly:
|
||||
print "%s already contains %d chapters." % (output_filename,chaptercount)
|
||||
elif chaptercount > urlchaptercount:
|
||||
print "%s contains %d chapters, more than source: %d." % (output_filename,chaptercount,urlchaptercount)
|
||||
elif chaptercount == 0:
|
||||
print "%s doesn't contain any recognizable chapters, probably from a different source. Not updating." % (output_filename)
|
||||
else:
|
||||
# update now handled by pre-populating the old
|
||||
# images and chapters in the adapter rather than
|
||||
# merging epubs.
|
||||
(url,
|
||||
chaptercount,
|
||||
adapter.oldchapters,
|
||||
adapter.oldimgs,
|
||||
adapter.oldcover,
|
||||
adapter.calibrebookmark,
|
||||
adapter.logfile) = get_update_data(output_filename)
|
||||
|
||||
print "Do update - epub(%d) vs url(%d)" % (chaptercount, urlchaptercount)
|
||||
|
||||
if not (options.update and chaptercount == urlchaptercount) \
|
||||
and adapter.getConfig("do_update_hook"):
|
||||
chaptercount = adapter.hookForUpdates(chaptercount)
|
||||
|
||||
writeStory(configuration,adapter,"epub")
|
||||
|
||||
else:
|
||||
# regular download
|
||||
if options.metaonly:
|
||||
pprint.pprint(adapter.getStoryMetadataOnly().getAllMetadata())
|
||||
|
||||
output_filename=writeStory(configuration,adapter,options.format,options.metaonly)
|
||||
|
||||
if not options.metaonly and adapter.getConfig("post_process_cmd"):
|
||||
metadata = adapter.story.metadata
|
||||
metadata['output_filename']=output_filename
|
||||
call(string.Template(adapter.getConfig("post_process_cmd"))
|
||||
.substitute(metadata), shell=True)
|
||||
|
||||
del adapter
|
||||
|
||||
except exceptions.InvalidStoryURL, isu:
|
||||
print isu
|
||||
except exceptions.StoryDoesNotExist, dne:
|
||||
print dne
|
||||
except exceptions.UnknownSite, us:
|
||||
print us
|
||||
|
||||
if __name__ == "__main__":
|
||||
#import time
|
||||
#start = time.time()
|
||||
insert_into_python_path()
|
||||
main(sys.argv[1:])
|
||||
#print("Total time seconds:%f"%(time.time()-start))
|
||||
89
editconfig.html
Normal file
89
editconfig.html
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
|
||||
<html>
|
||||
<head>
|
||||
<link href="/css/index.css" rel="stylesheet" type="text/css">
|
||||
<title>FanFictionDownLoader - read fanfiction from twilighted.net, fanfiction.net, fictionpress.com, fictionalley.org, ficwad.com, potionsandsnitches.net, harrypotterfanfiction.com, mediaminer.org on Kindle, Nook, Sony Reader, iPad, iPhone, Android, Aldiko, Stanza</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta name="google-site-verification" content="kCFc-G4bka_pJN6Rv8CapPBcwmq0hbAUZPkKWqRsAYU" />
|
||||
<script type="text/javascript">
|
||||
|
||||
var _gaq = _gaq || [];
|
||||
_gaq.push(['_setAccount', 'UA-12136939-1']);
|
||||
_gaq.push(['_trackPageview']);
|
||||
|
||||
(function() {
|
||||
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
||||
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
||||
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
||||
})();
|
||||
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
<div id='main' style="width: 80%; margin-left: 10%;">
|
||||
<h1>
|
||||
<a href="/" style="text-decoration: none; color: black;">FanFictionDownLoader</a>
|
||||
</h1>
|
||||
|
||||
<div style="text-align: center">
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "ca-pub-0320924304307555";
|
||||
/* Standard */
|
||||
google_ad_slot = "8974025478";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
</div>
|
||||
|
||||
<form action="/editconfig" method="post">
|
||||
<input type="hidden" name="update" value="true" />
|
||||
<div id='logpasswordtable'>
|
||||
<h3>Edit Config</h3>
|
||||
<div id='logpassword'>
|
||||
Editing configuration for {{ nickname }}.
|
||||
</div>
|
||||
<div class='fieldandlabel'>
|
||||
<textarea name="config" style="width: 100%; height: 200px;" wrap='off'>{{ config }}</textarea>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id='submitbtn'>
|
||||
<input type="submit" value="Save">
|
||||
</div>
|
||||
</form>
|
||||
|
||||
<div>
|
||||
<h3>Default System configuration</h3>
|
||||
<pre>
|
||||
{{ defaultsini }}
|
||||
</pre>
|
||||
</div>
|
||||
|
||||
<div style='text-align: center'>
|
||||
<img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"
|
||||
alt="Powered by Google App Engine" />
|
||||
<br/><br/>
|
||||
This is a web front-end to <A href="http://code.google.com/p/fanficdownloader/">FanFictionDownLoader</a><br/>
|
||||
Copyright © Fanficdownloader team
|
||||
</div>
|
||||
|
||||
<div style="margin-top: 1em; text-align: center'">
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "pub-2027714004231956";
|
||||
/* FFD */
|
||||
google_ad_slot = "7330682770";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
25
epubmerge.py
Normal file
25
epubmerge.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# epubmerge.py 1.0
|
||||
|
||||
# Copyright 2011, Jim Miller
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
if __name__ == "__main__":
|
||||
print('''
|
||||
The this utility has been split out into it's own project.
|
||||
See: http://code.google.com/p/epubmerge/
|
||||
...for a CLI epubmerge.py program and calibre plugin.
|
||||
''')
|
||||
103
example.ini
Normal file
103
example.ini
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
## This is an example of what your personal configuration might look
|
||||
## like. Uncomment options by removing the '#' in front of them.
|
||||
|
||||
[defaults]
|
||||
## Some sites also require the user to confirm they are adult for
|
||||
## adult content. Uncomment by removing '#' in front of is_adult. In
|
||||
## commandline version, this should go in your personal.ini, not
|
||||
## defaults.ini.
|
||||
#is_adult:true
|
||||
|
||||
## Don't like the numbers at the start of chapter titles on some
|
||||
## sites? You can use strip_chapter_numbers to strip them off. Just
|
||||
## want to make them all look the same? Strip them off, then add them
|
||||
## back on with add_chapter_numbers. Don't like the way it strips
|
||||
## numbers or adds them back? See chapter_title_strip_pattern and
|
||||
## chapter_title_add_pattern.
|
||||
#strip_chapter_numbers:true
|
||||
#add_chapter_numbers:true
|
||||
|
||||
[epub]
|
||||
## include images from img tags in the body and summary of stories.
|
||||
## Images will be converted to jpg for size if possible. Images work
|
||||
## in epub format only. To get mobi or other format with images,
|
||||
## download as epub and use Calibre to convert.
|
||||
#include_images:true
|
||||
|
||||
## If not set, the summary will have all html stripped for safety.
|
||||
## Both this and include_images must be true to get images in the
|
||||
## summary.
|
||||
#keep_summary_html:true
|
||||
|
||||
## If set, the first image found will be made the cover image. If
|
||||
## keep_summary_html is true, any images in summary will be before any
|
||||
## in chapters.
|
||||
#make_firstimage_cover:true
|
||||
|
||||
## Resize images down to width, height, preserving aspect ratio.
|
||||
## Nook size, with margin.
|
||||
#image_max_size: 580, 725
|
||||
|
||||
## Change image to grayscale, if graphics library allows, to save
|
||||
## space.
|
||||
#grayscale_images: false
|
||||
|
||||
|
||||
## Most common, I expect will be using this to save username/passwords
|
||||
## for different sites. Here are a few examples. See defaults.ini
|
||||
## for the full list.
|
||||
|
||||
[www.twilighted.net]
|
||||
#username:YourPenname
|
||||
#password:YourPassword
|
||||
## default is false
|
||||
#collect_series: true
|
||||
|
||||
[www.ficwad.com]
|
||||
#username:YourUsername
|
||||
#password:YourPassword
|
||||
|
||||
[www.twiwrite.net]
|
||||
#username:YourName
|
||||
#password:yourpassword
|
||||
## default is false
|
||||
#collect_series: true
|
||||
|
||||
[www.adastrafanfic.com]
|
||||
## Some sites do not require a login, but do require the user to
|
||||
## confirm they are adult for adult content.
|
||||
#is_adult:true
|
||||
|
||||
[www.thewriterscoffeeshop.com]
|
||||
#username:YourName
|
||||
#password:yourpassword
|
||||
#is_adult:true
|
||||
## default is false
|
||||
#collect_series: true
|
||||
|
||||
[www.fictionalley.org]
|
||||
#is_adult:true
|
||||
|
||||
[www.harrypotterfanfiction.com]
|
||||
#is_adult:true
|
||||
|
||||
[www.fimfiction.net]
|
||||
#is_adult:true
|
||||
#fail_on_password: false
|
||||
|
||||
[www.tthfanfic.org]
|
||||
#is_adult:true
|
||||
## tth is a little unusual--it doesn't require user/pass, but the site
|
||||
## keeps track of which chapters you've read and won't send another
|
||||
## update until it thinks you're up to date. This way, on download,
|
||||
## it thinks you're up to date.
|
||||
#username:YourName
|
||||
#password:yourpassword
|
||||
|
||||
|
||||
## This section will override anything in the system defaults or other
|
||||
## sections here.
|
||||
[overrides]
|
||||
## default varies by site. Set true here to force all sites to
|
||||
## collect series.
|
||||
#collect_series: true
|
||||
BIN
fanficdownloader.zip
Normal file
BIN
fanficdownloader.zip
Normal file
Binary file not shown.
2014
fanficdownloader/BeautifulSoup.py
Normal file
2014
fanficdownloader/BeautifulSoup.py
Normal file
File diff suppressed because it is too large
Load diff
57
fanficdownloader/HtmlTagStack.py
Normal file
57
fanficdownloader/HtmlTagStack.py
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
# coding: utf-8
|
||||
|
||||
import re
|
||||
import codecs
|
||||
|
||||
stack = []
|
||||
|
||||
def get_end_tag(tag):
|
||||
if len(tag) > 0 and tag.find(u'<') > -1 and tag.rfind(u'>') > -1:
|
||||
return re.sub(r'.*<([^\ >]+).*', r'</\1>', tag)
|
||||
return u''
|
||||
|
||||
def get_tag_name(tag):
|
||||
if len(tag) > 0 and tag.find(u'<') > -1 and tag.rfind(u'>') > -1:
|
||||
return re.sub(r'</*([^\ >]+).*', r'\1', tag)
|
||||
return u''
|
||||
|
||||
def push(tag):
|
||||
if len(tag) > 0 and tag.find(u'<') > -1 and tag.rfind(u'>') > -1:
|
||||
stack.append(tag)
|
||||
|
||||
def pop():
|
||||
if len(stack) > 0:
|
||||
return stack.pop()
|
||||
return u''
|
||||
|
||||
def pop_end_tag():
|
||||
return unicode(get_end_tag(pop()))
|
||||
|
||||
def spool_end():
|
||||
html = u''
|
||||
for tag in reversed(stack):
|
||||
html += get_end_tag(tag)
|
||||
return html
|
||||
|
||||
def spool_start():
|
||||
html = u''
|
||||
for item in stack:
|
||||
html += item
|
||||
return html
|
||||
|
||||
def has_elements():
|
||||
return len(stack) > 0
|
||||
|
||||
def get_last():
|
||||
# t = pop()
|
||||
# push(t)
|
||||
# return t
|
||||
if len(stack) > 0:
|
||||
return stack[len(stack)-1]
|
||||
return u''
|
||||
|
||||
def flush():
|
||||
del stack[:]
|
||||
|
||||
def get_stack():
|
||||
return stack
|
||||
19
fanficdownloader/__init__.py
Normal file
19
fanficdownloader/__init__.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
try:
|
||||
# just a way to switch between web service and CLI/PI
|
||||
import google.appengine.api
|
||||
except:
|
||||
try: # just a way to switch between CLI and PI
|
||||
import calibre.constants
|
||||
except:
|
||||
import sys
|
||||
if sys.version_info >= (2, 7):
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
loghandler=logging.StreamHandler()
|
||||
loghandler.setFormatter(logging.Formatter("FFDL:%(levelname)s:%(filename)s(%(lineno)d):%(message)s"))
|
||||
logger.addHandler(loghandler)
|
||||
loghandler.setLevel(logging.DEBUG)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
253
fanficdownloader/adapters/__init__.py
Normal file
253
fanficdownloader/adapters/__init__.py
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import os, re, sys, glob, types
|
||||
from os.path import dirname, basename, normpath
|
||||
import logging
|
||||
import urlparse as up
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .. import exceptions as exceptions
|
||||
from ..configurable import Configuration
|
||||
|
||||
## must import each adapter here.
|
||||
|
||||
import adapter_test1
|
||||
import adapter_fanfictionnet
|
||||
import adapter_castlefansorg
|
||||
import adapter_fictionalleyorg
|
||||
import adapter_fictionpresscom
|
||||
import adapter_ficwadcom
|
||||
import adapter_fimfictionnet
|
||||
import adapter_harrypotterfanfictioncom
|
||||
import adapter_mediaminerorg
|
||||
import adapter_potionsandsnitchesnet
|
||||
import adapter_tenhawkpresentscom
|
||||
import adapter_adastrafanficcom
|
||||
import adapter_twcslibrarynet
|
||||
import adapter_tthfanficorg
|
||||
import adapter_twilightednet
|
||||
import adapter_twiwritenet
|
||||
import adapter_whoficcom
|
||||
import adapter_siyecouk
|
||||
import adapter_archiveofourownorg
|
||||
import adapter_ficbooknet
|
||||
import adapter_portkeyorg
|
||||
import adapter_mugglenetcom
|
||||
import adapter_hpfandomnet
|
||||
import adapter_thequidditchpitchorg
|
||||
import adapter_nfacommunitycom
|
||||
import adapter_midnightwhispersca
|
||||
import adapter_ksarchivecom
|
||||
import adapter_archiveskyehawkecom
|
||||
import adapter_squidgeorgpeja
|
||||
import adapter_libraryofmoriacom
|
||||
import adapter_wraithbaitcom
|
||||
import adapter_checkmatedcom
|
||||
import adapter_chaossycophanthexcom
|
||||
import adapter_dramioneorg
|
||||
import adapter_erosnsapphosycophanthexcom
|
||||
import adapter_lumossycophanthexcom
|
||||
import adapter_occlumencysycophanthexcom
|
||||
import adapter_phoenixsongnet
|
||||
import adapter_walkingtheplankorg
|
||||
import adapter_ashwindersycophanthexcom
|
||||
import adapter_thehexfilesnet
|
||||
import adapter_dokugacom
|
||||
import adapter_iketernalnet
|
||||
import adapter_onedirectionfanfictioncom
|
||||
import adapter_storiesofardacom
|
||||
import adapter_samdeanarchivenu
|
||||
import adapter_destinysgatewaycom
|
||||
import adapter_ncisfictionnet
|
||||
import adapter_stargateatlantisorg
|
||||
import adapter_thealphagatecom
|
||||
import adapter_fanfiktionde
|
||||
import adapter_ponyfictionarchivenet
|
||||
import adapter_sg1heliopoliscom
|
||||
import adapter_ncisficcom
|
||||
import adapter_nationallibrarynet
|
||||
import adapter_themasquenet
|
||||
import adapter_pretendercentrecom
|
||||
import adapter_darksolaceorg
|
||||
import adapter_finestoriescom
|
||||
import adapter_hpfanficarchivecom
|
||||
import adapter_twilightarchivescom
|
||||
import adapter_wizardtalesnet
|
||||
import adapter_nhamagicalworldsus
|
||||
import adapter_hlfictionnet
|
||||
import adapter_grangerenchantedcom
|
||||
import adapter_dracoandginnycom
|
||||
import adapter_scarvesandcoffeenet
|
||||
import adapter_thepetulantpoetesscom
|
||||
import adapter_wolverineandroguecom
|
||||
import adapter_sinfuldesireorg
|
||||
import adapter_merlinficdtwinscouk
|
||||
import adapter_thehookupzonenet
|
||||
import adapter_bloodtiesfancom
|
||||
import adapter_indeathnet
|
||||
import adapter_qafficcom
|
||||
import adapter_efpfanficnet
|
||||
import adapter_potterficscom
|
||||
import adapter_efictionestelielde
|
||||
import adapter_dotmoonnet
|
||||
import adapter_pommedesangcom
|
||||
import adapter_restrictedsectionorg
|
||||
import adapter_imagineeficcom
|
||||
import adapter_buffynfaithnet
|
||||
import adapter_psychficcom
|
||||
import adapter_hennethannunnet
|
||||
import adapter_tokrafandomnetcom
|
||||
import adapter_netraptororg
|
||||
import adapter_asr3slashzoneorg
|
||||
import adapter_nickandgregnet
|
||||
import adapter_potterheadsanonymouscom
|
||||
import adapter_simplyundeniablecom
|
||||
import adapter_scarheadnet
|
||||
import adapter_fictionpadcom
|
||||
import adapter_storiesonlinenet
|
||||
import adapter_trekiverseorg
|
||||
import adapter_literotica
|
||||
import adapter_voracity2eficcom
|
||||
import adapter_spikeluvercom
|
||||
import adapter_bloodshedversecom
|
||||
import adapter_nocturnallightnet
|
||||
import adapter_fanfichu
|
||||
import adapter_fanfictioncsodaidokhu
|
||||
import adapter_fictionmaniatv
|
||||
import adapter_bdsmgeschichten
|
||||
import adapter_tolkienfanfiction
|
||||
import adapter_themaplebookshelf
|
||||
import adapter_fannation
|
||||
import adapter_sheppardweircom
|
||||
import adapter_samandjacknet
|
||||
import adapter_csiforensicscom
|
||||
import adapter_lotrfanfictioncom
|
||||
import adapter_fhsarchivecom
|
||||
|
||||
## This bit of complexity allows adapters to be added by just adding
|
||||
## importing. It eliminates the long if/else clauses we used to need
|
||||
## to pick out the adapter.
|
||||
|
||||
## List of registered site adapters.
|
||||
__class_list = []
|
||||
__domain_map = {}
|
||||
|
||||
def imports():
|
||||
for name, val in globals().items():
|
||||
if isinstance(val, types.ModuleType):
|
||||
yield val.__name__
|
||||
|
||||
for x in imports():
|
||||
if "fanficdownloader.adapters.adapter_" in x:
|
||||
#print x
|
||||
cls = sys.modules[x].getClass()
|
||||
__class_list.append(cls)
|
||||
for site in cls.getAcceptDomains():
|
||||
__domain_map[site]=cls
|
||||
|
||||
def getNormalStoryURL(url):
|
||||
r = getNormalStoryURLSite(url)
|
||||
if r:
|
||||
return r[0]
|
||||
else:
|
||||
return None
|
||||
|
||||
def getNormalStoryURLSite(url):
|
||||
if not getNormalStoryURL.__dummyconfig:
|
||||
getNormalStoryURL.__dummyconfig = Configuration("test1.com","EPUB")
|
||||
# pulling up an adapter is pretty low over-head. If
|
||||
# it fails, it's a bad url.
|
||||
try:
|
||||
adapter = getAdapter(getNormalStoryURL.__dummyconfig,url)
|
||||
url = adapter.url
|
||||
site = adapter.getSiteDomain()
|
||||
del adapter
|
||||
return (url,site)
|
||||
except:
|
||||
return None
|
||||
|
||||
# kludgey function static/singleton
|
||||
getNormalStoryURL.__dummyconfig = None
|
||||
|
||||
def getAdapter(config,url,anyurl=False):
|
||||
|
||||
#logger.debug("trying url:"+url)
|
||||
(cls,fixedurl) = getClassFor(url)
|
||||
#logger.debug("fixedurl:"+fixedurl)
|
||||
if cls:
|
||||
if anyurl:
|
||||
fixedurl = cls.getSiteExampleURLs().split()[0]
|
||||
adapter = cls(config,fixedurl) # raises InvalidStoryURL
|
||||
return adapter
|
||||
# No adapter found.
|
||||
raise exceptions.UnknownSite( url, [cls.getSiteDomain() for cls in __class_list] )
|
||||
|
||||
def getConfigSections():
|
||||
return [cls.getConfigSection() for cls in __class_list]
|
||||
|
||||
def getSiteExamples():
|
||||
l=[]
|
||||
for cls in sorted(__class_list, key=lambda x : x.getConfigSection()):
|
||||
l.append((cls.getConfigSection(),cls.getSiteExampleURLs().split()))
|
||||
return l
|
||||
|
||||
def getConfigSectionFor(url):
|
||||
(cls,fixedurl) = getClassFor(url)
|
||||
if cls:
|
||||
return cls.getConfigSection()
|
||||
|
||||
# No adapter found.
|
||||
raise exceptions.UnknownSite( url, [cls.getSiteDomain() for cls in __class_list] )
|
||||
|
||||
def getClassFor(url):
|
||||
## fix up leading protocol.
|
||||
fixedurl = re.sub(r"(?i)^[htp]+(s?)[:/]+",r"http\1://",url.strip())
|
||||
if fixedurl.startswith("//"):
|
||||
fixedurl = "http:%s"%url
|
||||
if not fixedurl.startswith("http"):
|
||||
fixedurl = "http://%s"%url
|
||||
## remove any trailing '#' locations.
|
||||
fixedurl = re.sub(r"#.*$","",fixedurl)
|
||||
|
||||
parsedUrl = up.urlparse(fixedurl)
|
||||
domain = parsedUrl.netloc.lower()
|
||||
if( domain != parsedUrl.netloc ):
|
||||
fixedurl = fixedurl.replace(parsedUrl.netloc,domain)
|
||||
|
||||
cls = getClassFromList(domain)
|
||||
if not cls and domain.startswith("www."):
|
||||
domain = domain.replace("www.","")
|
||||
#logger.debug("trying site:without www: "+domain)
|
||||
cls = getClassFromList(domain)
|
||||
fixedurl = re.sub(r"^http(s?)://www\.",r"http\1://",fixedurl)
|
||||
if not cls:
|
||||
#logger.debug("trying site:www."+domain)
|
||||
cls = getClassFromList("www."+domain)
|
||||
fixedurl = re.sub(r"^http(s?)://",r"http\1://www.",fixedurl)
|
||||
|
||||
if cls:
|
||||
fixedurl = cls.stripURLParameters(fixedurl)
|
||||
|
||||
return (cls,fixedurl)
|
||||
|
||||
def getClassFromList(domain):
|
||||
try:
|
||||
return __domain_map[domain]
|
||||
except KeyError:
|
||||
pass # return none.
|
||||
238
fanficdownloader/adapters/adapter_adastrafanficcom.py
Normal file
238
fanficdownloader/adapters/adapter_adastrafanficcom.py
Normal file
|
|
@ -0,0 +1,238 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
self.story.setMetadata('siteabbrev','aaff')
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
|
||||
@staticmethod
|
||||
def getSiteDomain():
|
||||
return 'www.adastrafanfic.com'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
def use_pagecache(self):
|
||||
'''
|
||||
adapters that will work with the page cache need to implement
|
||||
this and change it to True.
|
||||
'''
|
||||
return True
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
addurl = "&warning=5"
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if "Content is only suitable for mature adults. May contain explicit language and adult themes. Equivalent of NC-17." in data:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
# problems with some stories, but only in calibre. I suspect
|
||||
# issues with different SGML parsers in python. This is a
|
||||
# nasty hack, but it works.
|
||||
data = data[data.index("<body"):]
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"viewuser.php"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
## <meta name='description' content='<p>Description</p> ...' >
|
||||
## Summary, strangely, is in the content attr of a <meta name='description'> tag
|
||||
## which is escaped HTML. Unfortunately, we can't use it because they don't
|
||||
## escape (') chars in the desc, breakin the tag.
|
||||
#meta_desc = soup.find('meta',{'name':'description'})
|
||||
#metasoup = bs.BeautifulStoneSoup(meta_desc['content'])
|
||||
#self.story.setMetadata('description',stripHTML(metasoup))
|
||||
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
if 'Summary' in label:
|
||||
## Everything until the next span class='label'
|
||||
svalue = ''
|
||||
while value and not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
# sometimes poorly formated desc (<p> w/o </p>) leads
|
||||
# to all labels being included.
|
||||
svalue=svalue[:svalue.find('<span class="label">')]
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
catstext = [cat.string for cat in cats]
|
||||
for cat in catstext:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
charstext = [char.string for char in chars]
|
||||
for char in charstext:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
|
||||
genrestext = [genre.string for genre in genres]
|
||||
self.genre = ', '.join(genrestext)
|
||||
for genre in genrestext:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
if 'Warnings' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
|
||||
warningstext = [warning.string for warning in warnings]
|
||||
self.warning = ', '.join(warningstext)
|
||||
for warning in warningstext:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(value.strip(), "%d %b %Y"))
|
||||
|
||||
if 'Updated' in label:
|
||||
# there's a stray [ at the end.
|
||||
#value = value[0:-1]
|
||||
self.story.setMetadata('dateUpdated', makeDate(value.strip(), "%d %b %Y"))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
data = self._fetchUrl(url)
|
||||
# problems with some stories, but only in calibre. I suspect
|
||||
# issues with different SGML parsers in python. This is a
|
||||
# nasty hack, but it works.
|
||||
data = data[data.index("<body"):]
|
||||
|
||||
soup = bs.BeautifulStoneSoup(data,
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
span = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == span:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,span)
|
||||
|
||||
def getClass():
|
||||
return AdAstraFanficComSiteAdapter
|
||||
|
||||
389
fanficdownloader/adapters/adapter_archiveofourownorg.py
Normal file
389
fanficdownloader/adapters/adapter_archiveofourownorg.py
Normal file
|
|
@ -0,0 +1,389 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
#from .. import BeautifulSoup as bs
|
||||
import bs4 as bs
|
||||
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
|
||||
return ArchiveOfOurOwnOrgAdapter
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["utf8",
|
||||
"Windows-1252"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
|
||||
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
|
||||
|
||||
|
||||
# get storyId from url--url validation guarantees query correct
|
||||
m = re.match(self.getSiteURLPattern(),url)
|
||||
if m:
|
||||
self.story.setMetadata('storyId',m.group('id'))
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/works/'+self.story.getMetadata('storyId'))
|
||||
else:
|
||||
raise exceptions.InvalidStoryURL(url,
|
||||
self.getSiteDomain(),
|
||||
self.getSiteExampleURLs())
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','ao3')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%Y-%b-%d"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'archiveofourown.org'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/works/123456 http://"+cls.getSiteDomain()+"/collections/Some_Archive/works/123456 http://"+cls.getSiteDomain()+"/works/123456/chapters/78901"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
# http://archiveofourown.org/collections/Smallville_Slash_Archive/works/159770
|
||||
# Discard leading zeros from story ID numbers--AO3 doesn't use them in it's own chapter URLs.
|
||||
return r"https?://"+re.escape(self.getSiteDomain())+r"(/collections/[^/]+)?/works/0*(?P<id>\d+)"
|
||||
|
||||
## Login
|
||||
def needToLoginCheck(self, data):
|
||||
if 'This work is only available to registered users of the Archive.' in data \
|
||||
or "The password or user name you entered doesn't match our records" in data:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def performLogin(self, url, data):
|
||||
|
||||
params = {}
|
||||
if self.password:
|
||||
params['user_session[login]'] = self.username
|
||||
params['user_session[password]'] = self.password
|
||||
else:
|
||||
params['user_session[login]'] = self.getConfig("username")
|
||||
params['user_session[password]'] = self.getConfig("password")
|
||||
params['user_session[remember_me]'] = '1'
|
||||
params['commit'] = 'Log in'
|
||||
#params['utf8'] = u'✓'#u'\x2713' # gets along with out it, and it confuses the encoder.
|
||||
params['authenticity_token'] = data.split('input name="authenticity_token" type="hidden" value="')[1].split('" /></div>')[0]
|
||||
|
||||
loginUrl = 'http://' + self.getSiteDomain() + '/user_sessions'
|
||||
logger.info("Will now login to URL (%s) as (%s)" % (loginUrl,
|
||||
params['user_session[login]']))
|
||||
|
||||
d = self._postUrl(loginUrl, params)
|
||||
#logger.info(d)
|
||||
|
||||
if "Successfully logged in" not in d : #Member Account
|
||||
logger.info("Failed to login to URL %s as %s" % (loginUrl,
|
||||
params['user_session[login]']))
|
||||
raise exceptions.FailedToLogin(url,params['user_session[login]'])
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def use_pagecache(self):
|
||||
'''
|
||||
adapters that will work with the page cache need to implement
|
||||
this and change it to True.
|
||||
'''
|
||||
return True
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
addurl = "?view_adult=true"
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
metaurl = self.url+addurl
|
||||
url = self.url+'/navigate'+addurl
|
||||
logger.info("url: "+url)
|
||||
logger.info("metaurl: "+metaurl)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
meta = self._fetchUrl(metaurl)
|
||||
|
||||
if "This work could have adult content. If you proceed you have agreed that you are willing to see such content." in meta:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if "Sorry, we couldn't find the work you were looking for." in data:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
|
||||
if self.needToLoginCheck(data):
|
||||
# need to log in for this one.
|
||||
self.performLogin(url,data)
|
||||
data = self._fetchUrl(url,usecache=False)
|
||||
meta = self._fetchUrl(metaurl,usecache=False)
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
for tag in soup.findAll('div',id='admin-banner'):
|
||||
tag.extract()
|
||||
metasoup = bs.BeautifulSoup(meta)
|
||||
for tag in metasoup.findAll('div',id='admin-banner'):
|
||||
tag.extract()
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r"/works/\d+$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
alist = soup.findAll('a', href=re.compile(r"/users/\w+/pseuds/\w+"))
|
||||
if len(alist) < 1: # ao3 allows for author 'Anonymous' with no author link.
|
||||
self.story.setMetadata('author','Anonymous')
|
||||
self.story.setMetadata('authorUrl','http://archiveofourown.org/')
|
||||
self.story.setMetadata('authorId','0')
|
||||
else:
|
||||
for a in alist:
|
||||
self.story.addToList('authorId',a['href'].split('/')[-1])
|
||||
self.story.addToList('authorUrl',a['href'])
|
||||
self.story.addToList('author',a.text)
|
||||
|
||||
newestChapter = None
|
||||
self.newestChapterNum = None # save for comparing during update.
|
||||
# Scan all chapters to find the oldest and newest, on AO3 it's
|
||||
# possible for authors to insert new chapters out-of-order or
|
||||
# change the dates of earlier ones by editing them--That WILL
|
||||
# break epub update.
|
||||
# Find the chapters:
|
||||
chapters=soup.findAll('a', href=re.compile(r'/works/'+self.story.getMetadata('storyId')+"/chapters/\d+$"))
|
||||
self.story.setMetadata('numChapters',len(chapters))
|
||||
logger.debug("numChapters: (%s)"%self.story.getMetadata('numChapters'))
|
||||
if len(chapters)==1:
|
||||
self.chapterUrls.append((self.story.getMetadata('title'),'http://'+self.host+chapters[0]['href']+addurl))
|
||||
else:
|
||||
for index, chapter in enumerate(chapters):
|
||||
# strip just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+chapter['href']+addurl))
|
||||
# (2013-09-21)
|
||||
date = stripHTML(chapter.findNext('span'))[1:-1]
|
||||
chapterDate = makeDate(date,self.dateformat)
|
||||
if newestChapter == None or chapterDate > newestChapter:
|
||||
newestChapter = chapterDate
|
||||
self.newestChapterNum = index
|
||||
|
||||
a = metasoup.find('blockquote',{'class':'userstuff'})
|
||||
if a != None:
|
||||
self.setDescription(url,a)
|
||||
#self.story.setMetadata('description',a.text)
|
||||
|
||||
a = metasoup.find('dd',{'class':"rating tags"})
|
||||
if a != None:
|
||||
self.story.setMetadata('rating',stripHTML(a.text))
|
||||
|
||||
a = metasoup.find('dd',{'class':"fandom tags"})
|
||||
fandoms = a.findAll('a',{'class':"tag"})
|
||||
for fandom in fandoms:
|
||||
self.story.addToList('fandoms',fandom.string)
|
||||
self.story.addToList('category',fandom.string)
|
||||
|
||||
a = metasoup.find('dd',{'class':"warning tags"})
|
||||
if a != None:
|
||||
warnings = a.findAll('a',{'class':"tag"})
|
||||
for warning in warnings:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
a = metasoup.find('dd',{'class':"freeform tags"})
|
||||
if a != None:
|
||||
genres = a.findAll('a',{'class':"tag"})
|
||||
for genre in genres:
|
||||
self.story.addToList('freeformtags',genre.string)
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
a = metasoup.find('dd',{'class':"category tags"})
|
||||
if a != None:
|
||||
genres = a.findAll('a',{'class':"tag"})
|
||||
for genre in genres:
|
||||
if genre != "Gen":
|
||||
self.story.addToList('ao3categories',genre.string)
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
a = metasoup.find('dd',{'class':"character tags"})
|
||||
if a != None:
|
||||
chars = a.findAll('a',{'class':"tag"})
|
||||
for char in chars:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
a = metasoup.find('dd',{'class':"relationship tags"})
|
||||
if a != None:
|
||||
ships = a.findAll('a',{'class':"tag"})
|
||||
for ship in ships:
|
||||
self.story.addToList('ships',ship.string)
|
||||
|
||||
a = metasoup.find('dd',{'class':"collections"})
|
||||
if a != None:
|
||||
collections = a.findAll('a')
|
||||
for collection in collections:
|
||||
self.story.addToList('collections',collection.string)
|
||||
|
||||
stats = metasoup.find('dl',{'class':'stats'})
|
||||
dt = stats.findAll('dt')
|
||||
dd = stats.findAll('dd')
|
||||
for x in range(0,len(dt)):
|
||||
label = dt[x].text
|
||||
value = dd[x].text
|
||||
|
||||
if 'Words:' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Comments:' in label:
|
||||
self.story.setMetadata('comments', value)
|
||||
|
||||
if 'Kudos:' in label:
|
||||
self.story.setMetadata('kudos', value)
|
||||
|
||||
if 'Hits:' in label:
|
||||
self.story.setMetadata('hits', value)
|
||||
|
||||
if 'Bookmarks:' in label:
|
||||
self.story.setMetadata('bookmarks', value)
|
||||
|
||||
if 'Chapters:' in label:
|
||||
if value.split('/')[0] == value.split('/')[1]:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Completed' in label:
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
|
||||
# Find Series name from series URL.
|
||||
ddseries = metasoup.find('dd',{'class':"series"})
|
||||
|
||||
if ddseries:
|
||||
for i, a in enumerate(ddseries.findAll('a', href=re.compile(r"/series/\d+"))):
|
||||
series_name = stripHTML(a)
|
||||
series_url = 'http://'+self.host+a['href']
|
||||
series_index = int(stripHTML(a.previousSibling).replace(', ','').split(' ')[1]) # "Part # of" or ", Part #"
|
||||
self.story.setMetadata('series%02d'%i,"%s [%s]"%(series_name,series_index))
|
||||
self.story.setMetadata('series%02dUrl'%i,series_url)
|
||||
if i == 0:
|
||||
self.setSeries(series_name, series_index)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
|
||||
def hookForUpdates(self, chaptercount):
    """Trim previously-saved chapters during an epub update.

    AO3 authors can insert chapters out of order or edit the dates of
    earlier ones, which would otherwise break incremental updates.
    extractChapterUrlsAndMetadata() records the index of the newest
    chapter in self.newestChapterNum; everything at or after that index
    in the cached copy is discarded so it gets re-downloaded.

    Returns the number of cached chapters kept.
    """
    if self.oldchapters and len(self.oldchapters) > self.newestChapterNum:
        print("Existing epub has %s chapters\nNewest chapter is %s.  Discarding old chapters from there on."%(len(self.oldchapters), self.newestChapterNum+1))
        # keep only the chapters strictly before the newest one.
        self.oldchapters = self.oldchapters[:self.newestChapterNum]
    return len(self.oldchapters)
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
    """Fetch and assemble the text of a single AO3 chapter.

    Builds a wrapper <div class="story"> and appends, in order: the
    author's head notes, the chapter summary, chapter head notes, the
    chapter body itself, chapter foot notes, and the author's end
    notes.  Each of the note sections can be suppressed via the
    'exclude_notes' config list.
    """
    logger.debug('Getting chapter text from: %s' % url)

    # container the chapter pieces are appended into.
    chapter = bs.BeautifulSoup('<div class="story"></div>').find('div')
    data = self._fetchUrl(url)
    soup = bs.BeautifulSoup(data)

    exclude_notes = self.getConfigList('exclude_notes')

    if 'authorheadnotes' not in exclude_notes:
        headnotes = soup.find('div', {'class' : "preface group"}).find('div', {'class' : "notes module"})
        if headnotes != None:
            headnotes = headnotes.find('blockquote', {'class' : "userstuff"})
            if headnotes != None:
                chapter.append("<b>Author's Note:</b>")
                chapter.append(headnotes)

    if 'chaptersummary' not in exclude_notes:
        chapsumm = soup.find('div', {'id' : "summary"})
        if chapsumm != None:
            chapsumm = chapsumm.find('blockquote')
            chapter.append("<b>Summary for the Chapter:</b>")
            chapter.append(chapsumm)

    if 'chapterheadnotes' not in exclude_notes:
        chapnotes = soup.find('div', {'id' : "notes"})
        if chapnotes != None:
            chapnotes = chapnotes.find('blockquote')
            if chapnotes != None:
                chapter.append("<b>Notes for the Chapter:</b>")
                chapter.append(chapnotes)

    # the chapter body; drop AO3's "Chapter Text" landmark heading.
    text = soup.find('div', {'class' : "userstuff module"})
    chtext = text.find('h3', {'class' : "landmark heading"})
    if chtext:
        chtext.extract()
    chapter.append(text)

    if 'chapterfootnotes' not in exclude_notes:
        chapfoot = soup.find('div', {'class' : "end notes module", 'role' : "complementary"})
        if chapfoot != None:
            chapfoot = chapfoot.find('blockquote')
            chapter.append("<b>Notes for the Chapter:</b>")
            chapter.append(chapfoot)

    if 'authorfootnotes' not in exclude_notes:
        footnotes = soup.find('div', {'id' : "work_endnotes"})
        if footnotes != None:
            footnotes = footnotes.find('blockquote')
            chapter.append("<b>Author's Note:</b>")
            chapter.append(footnotes)

    if None == soup:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

    return self.utf8FromSoup(url, chapter)
|
||||
191
fanficdownloader/adapters/adapter_archiveskyehawkecom.py
Normal file
191
fanficdownloader/adapters/adapter_archiveskyehawkecom.py
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
|
||||
def getClass():
    """Module entry point: return the adapter class for this site."""
    return ArchiveSkyeHawkeComAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class ArchiveSkyeHawkeComAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/story.php?no='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','ash')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%Y-%m-%d"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'archive.skyehawke.com'
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return ['archive.skyehawke.com','www.skyehawke.com']
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://archive.skyehawke.com/story.php?no=1234 http://www.skyehawke.com/archive/story.php?no=1234 http://skyehawke.com/archive/story.php?no=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://")+r"(archive|www)\.skyehawke\.com/(archive/)?story\.php\?no=\d+$"
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
url = self.url
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('div', {'class':"story border"}).find('span',{'class':'left'})
|
||||
title=stripHTML(a).split('"')[1]
|
||||
self.story.setMetadata('title',title)
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
author = a.find('a')
|
||||
self.story.setMetadata('authorId',author['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+author['href'])
|
||||
self.story.setMetadata('author',author.string)
|
||||
|
||||
authorSoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
|
||||
|
||||
chapter=soup.find('select',{'name':'chapter'}).findAll('option')
|
||||
|
||||
for i in range(1,len(chapter)):
|
||||
ch=chapter[i]
|
||||
self.chapterUrls.append((stripHTML(ch),ch['value']))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
box=soup.find('div', {'class': "container borderridge"})
|
||||
sum=box.find('span').text
|
||||
self.setDescription(url,sum)
|
||||
|
||||
boxes=soup.findAll('div', {'class': "container bordersolid"})
|
||||
for box in boxes:
|
||||
if box.find('b') != None and box.find('b').text == "History and Story Information":
|
||||
|
||||
for b in box.findAll('b'):
|
||||
if "words" in b.nextSibling:
|
||||
self.story.setMetadata('numWords', b.text)
|
||||
if "archived" in b.previousSibling:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(b.text), self.dateformat))
|
||||
if "updated" in b.previousSibling:
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(b.text), self.dateformat))
|
||||
if "fandom" in b.nextSibling:
|
||||
self.story.addToList('category', b.text)
|
||||
|
||||
for br in box.findAll('br'):
|
||||
br.replaceWith('split')
|
||||
genre=box.text.split("Genre:")[1].split("split")[0]
|
||||
if not "Unspecified" in genre:
|
||||
self.story.addToList('genre',genre)
|
||||
|
||||
|
||||
if box.find('span') != None and box.find('span').text == "WARNING":
|
||||
|
||||
rating=box.findAll('span')[1]
|
||||
rating.find('br').replaceWith('split')
|
||||
rating=rating.text.replace("This story is rated",'').split('split')[0]
|
||||
self.story.setMetadata('rating',rating)
|
||||
logger.debug(self.story.getMetadata('rating'))
|
||||
|
||||
warnings=box.find('ol')
|
||||
if warnings != None:
|
||||
warnings=warnings.text.replace(']', '').replace('[', '').split(' ')
|
||||
for warning in warnings:
|
||||
self.story.addToList('warnings',warning)
|
||||
|
||||
|
||||
for asoup in authorSoup.findAll('div', {'class':"story bordersolid"}):
|
||||
if asoup.find('a')['href'] == 'story.php?no='+self.story.getMetadata('storyId'):
|
||||
if '[ Completed ]' in asoup.text:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
chars=asoup.findNext('div').text.split('Characters')[1].split(']')[0]
|
||||
for char in chars.split(','):
|
||||
if not "None" in char:
|
||||
self.story.addToList('characters',char)
|
||||
break
|
||||
|
||||
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div',{'class':"chapter bordersolid"}).findNext('div').findNext('div')
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
254
fanficdownloader/adapters/adapter_ashwindersycophanthexcom.py
Normal file
254
fanficdownloader/adapters/adapter_ashwindersycophanthexcom.py
Normal file
|
|
@ -0,0 +1,254 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Module entry point: return the adapter class for this site."""
    return AshwinderSycophantHexComAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class AshwinderSycophantHexComAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','asph')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%m/%d/%Y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'ashwinder.sycophanthex.com'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites.
|
||||
def needToLoginCheck(self, data):
|
||||
if 'This story contains adult content and/or themes.' in data \
|
||||
or "That password doesn't match the one in our database" in data:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def performLogin(self, url):
|
||||
params = {}
|
||||
|
||||
if self.password:
|
||||
params['penname'] = self.username
|
||||
params['password'] = self.password
|
||||
else:
|
||||
params['penname'] = self.getConfig("username")
|
||||
params['password'] = self.getConfig("password")
|
||||
params['rememberme'] = '1'
|
||||
params['sid'] = ''
|
||||
params['intent'] = ''
|
||||
params['submit'] = 'Submit'
|
||||
|
||||
loginUrl = 'http://' + self.getSiteDomain() + '/user.php'
|
||||
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
|
||||
params['penname']))
|
||||
|
||||
d = self._fetchUrl(loginUrl, params)
|
||||
|
||||
if "Logout" not in d : #Member Account
|
||||
logger.info("Failed to login to URL %s as %s" % (loginUrl,
|
||||
params['penname']))
|
||||
raise exceptions.FailedToLogin(url,params['penname'])
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if self.needToLoginCheck(data):
|
||||
# need to log in for this one.
|
||||
self.performLogin(url)
|
||||
data = self._fetchUrl(url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
|
||||
|
||||
try:
|
||||
# in case link points somewhere other than the first chapter
|
||||
a = soup.findAll('option')[1]['value']
|
||||
self.story.setMetadata('storyId',a.split('=',)[1])
|
||||
url = 'http://'+self.host+'/'+a
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url))
|
||||
except:
|
||||
pass
|
||||
|
||||
for info in asoup.findAll('table', {'width' : '100%', 'bordercolor' : re.compile(r'#')}):
|
||||
a = info.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
if a != None:
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
break
|
||||
|
||||
|
||||
# Find the chapters:
|
||||
chapters=soup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+&i=1$'))
|
||||
if len(chapters) == 0:
|
||||
self.chapterUrls.append((self.story.getMetadata('title'),url))
|
||||
else:
|
||||
for chapter in chapters:
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d):
|
||||
try:
|
||||
return d.name
|
||||
except:
|
||||
return ""
|
||||
|
||||
cats = info.findAll('a',href=re.compile('categories.php'))
|
||||
for cat in cats:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
a = info.find('a', href=re.compile(r'reviews.php\?sid='+self.story.getMetadata('storyId')))
|
||||
val = a.nextSibling
|
||||
svalue = ""
|
||||
while not defaultGetattr(val) == 'br':
|
||||
val = val.nextSibling
|
||||
val = val.nextSibling
|
||||
while not defaultGetattr(val) == 'table':
|
||||
svalue += str(val)
|
||||
val = val.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
labels = info.findAll('b')
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = stripHTML(labelspan)
|
||||
|
||||
if 'Rating' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Word Count' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Genres' in label:
|
||||
genres = value.string.split(', ')
|
||||
for genre in genres:
|
||||
if genre != 'none':
|
||||
self.story.addToList('genre',genre)
|
||||
|
||||
if 'Warnings' in label:
|
||||
warnings = value.string.split(', ')
|
||||
for warning in warnings:
|
||||
if warning != ' none':
|
||||
self.story.addToList('warnings',warning)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
# there's a stray [ at the end.
|
||||
#value = value[0:-1]
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
data = self._fetchUrl(url)
|
||||
|
||||
soup = bs.BeautifulSoup(data, selfClosingTags=('br','hr','span','center')) # some chapters seem to be hanging up on those tags, so it is safer to close them
|
||||
|
||||
story = soup.find('div', {"align" : "left"})
|
||||
|
||||
if None == story:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,story)
|
||||
227
fanficdownloader/adapters/adapter_asr3slashzoneorg.py
Normal file
227
fanficdownloader/adapters/adapter_asr3slashzoneorg.py
Normal file
|
|
@ -0,0 +1,227 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2013 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Module entry point: return the adapter class for this site."""
    return Asr3SlashzoneOrgAdapter
|
||||
|
||||
class Asr3SlashzoneOrgAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/archive/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','asr3')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%d/%m/%y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'asr3.slashzone.org'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/archive/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/archive/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# Weirdly, different sites use different warning numbers.
|
||||
# If the title search below fails, there's a good chance
|
||||
# you need a different number. print data at that point
|
||||
# and see what the 'click here to continue' url says.
|
||||
addurl = "&ageconsent=ok&warning=3"
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
m = re.search(r"'viewstory.php\?sid=\d+((?:&ageconsent=ok)?&warning=\d+)'",data)
|
||||
if m != None:
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# We tried the default and still got a warning, so
|
||||
# let's pull the warning number from the 'continue'
|
||||
# link and reload data.
|
||||
addurl = m.group(1)
|
||||
# correct stupid & error in url.
|
||||
addurl = addurl.replace("&","&")
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL 2nd try: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
#print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/archive/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# Rating
|
||||
rate = stripHTML(soup.find('div',{'id':'pagetitle'}))
|
||||
rate = rate[rate.rindex('[')+1:rate.rindex(']')]
|
||||
self.story.setMetadata('rating', rate)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/archive/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
metadiv = soup.find('div',{'class':'content'})
|
||||
smalldiv = metadiv.find('div',{'class':'small'})
|
||||
|
||||
categorys = smalldiv.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
for category in categorys:
|
||||
self.story.addToList('category',category.string)
|
||||
|
||||
chars = smalldiv.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
for char in chars:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
ships = smalldiv.parent.findAll('a',href=re.compile(r'browse\.php\?type=class&type_id=2&classid=1'))
|
||||
for ship in ships:
|
||||
self.story.addToList('ships',ship.string)
|
||||
|
||||
metatext = stripHTML(smalldiv)
|
||||
|
||||
if 'Completed: Yes' in metatext:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
wordstart=metatext.rindex('Word count:')+12
|
||||
words = metatext[wordstart:metatext.index(' ',wordstart)]
|
||||
self.story.setMetadata('numWords', words)
|
||||
|
||||
datesdiv = soup.find('div',{'class':'bottom'})
|
||||
dates = stripHTML(datesdiv).split()
|
||||
# Published: 04/26/2011 Updated: 03/06/2013
|
||||
self.story.setMetadata('datePublished', makeDate(dates[1], self.dateformat))
|
||||
self.story.setMetadata('dateUpdated', makeDate(dates[3], self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/archive/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
# skip 'report this' and 'TOC' links
|
||||
if 'contact.php' not in a['href'] and 'index' not in a['href']:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# remove 'small' leaving only summary.
|
||||
smalldiv.extract()
|
||||
self.setDescription(url,metadiv)
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url))
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
346
fanficdownloader/adapters/adapter_bdsmgeschichten.py
Normal file
346
fanficdownloader/adapters/adapter_bdsmgeschichten.py
Normal file
|
|
@ -0,0 +1,346 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
import urlparse
|
||||
import time
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def _translate_date_german_english(date):
|
||||
fullmon = {"Januar":"01",
|
||||
"Februar":"02",
|
||||
u"März":"03",
|
||||
"April":"04",
|
||||
"Mai":"05",
|
||||
"Juni":"06",
|
||||
"Juli":"07",
|
||||
"August":"08",
|
||||
"September":"09",
|
||||
"Oktober":"10",
|
||||
"November":"11",
|
||||
"Dezember":"12"}
|
||||
for (name,num) in fullmon.items():
|
||||
date = date.replace(name,num)
|
||||
return date
|
||||
|
||||
# Matches the trailing chapter number at the end of a story URL.
_REGEX_TRAILING_DIGIT = re.compile("(\d+)$")

# Matches everything from the final dash of a string to its end.
_REGEX_DASH_TO_END = re.compile("-[^-]+$")

# Matches a trailing chapter-numbering suffix on a heading -- e.g.
# " - Teil 2", "(Kapitel IV)" or " - 3." -- so it can be stripped off to
# recover the bare story title.  Numbers may be decimal or Roman numerals;
# "Teil"/"Kapitel"/"Tag" are German for part/chapter/day.  \u2013 is the
# en-dash.  (Pattern text itself left untouched.)
_REGEX_CHAPTER_TITLE = re.compile(ur"""
\s*
[\u2013-]?
\s*
([\dIVX-]+)?
\.?
\s*
[\[\(]?
\s*
(Teil|Kapitel|Tag)?
\s*
([\dIVX-]+)?
\s*
[\]\)]?
\s*
$
""", re.VERBOSE)

# Step size used by _find_chapters_by_guessing when probing chapter URLs.
_INITIAL_STEP = 5
|
||||
|
||||
class BdsmGeschichtenAdapter(BaseSiteAdapter):
|
||||
|
||||
    def __init__(self, config, url):
        """Set up site-specific state and normalize the story URL."""
        BaseSiteAdapter.__init__(self, config, url)

        # Candidate encodings tried when decoding fetched pages.
        self.decode = ["utf8", "Windows-1252"]

        self.story.setMetadata('siteabbrev','bdsmgesch')

        # If the URL ends in a number, treat it as the chapter number of
        # the page the user handed us; otherwise assume chapter 1.
        chapterMatch = _REGEX_TRAILING_DIGIT.search(url)
        if chapterMatch is None:
            self.maxChapter = 1
        else:
            self.maxChapter = int(chapterMatch.group(1))
        # url = re.sub(_REGEX_TRAILING_DIGIT, "1", url)

        # set storyId from the URL slug (guaranteed to match by URL validation)
        self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(url).group('storyId'))

        # normalize URL to the canonical http://<domain>/<storyId> form
        self._setURL('http://%s/%s' % (self.getSiteDomain(), self.story.getMetadata('storyId')))

        # Site date format once German month names have been translated to
        # numbers, e.g. "12. 03 2013 - 14:30".
        self.dateformat = '%d. %m %Y - %H:%M'
|
||||
|
||||
    @staticmethod
    def getSiteDomain():
        # Primary (bare) domain used to build normalized story URLs.
        return 'bdsm-geschichten.net'
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return ['www.bdsm-geschichten.net', 'www.bdsm-geschichten.net']
|
||||
|
||||
    @classmethod
    def getSiteExampleURLs(cls):
        # Both the www and bare-domain forms are valid.
        return "http://www.bdsm-geschichten.net/title-of-story-1 http://bdsm-geschichten.net/title-of-story-1"

    def getSiteURLPattern(self):
        # Story URLs are just the domain plus a slug; the slug is captured
        # as the storyId.  (The unescaped '.' in the domain is harmless.)
        return r"http://(www\.)?bdsm-geschichten.net/(?P<storyId>[a-zA-Z0-9_-]+)"
|
||||
|
||||
    def extractChapterUrlsAndMetadata(self):
        """Collect story metadata and the chapter list.

        Requires the adult check to have passed; fetched pages are cached
        in self.soupsCache so getChapterText need not re-download them.
        """
        if not (self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(self.url)

        try:
            data1 = self._fetchUrl(self.url)
            soup = bs.BeautifulSoup(data1)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        #strip comments from soup
        [comment.extract() for comment in soup.findAll(text=lambda text:isinstance(text, bs.Comment))]

        # Cache the soups so we won't have to redownload in getChapterText later
        self.soupsCache = {}
        self.soupsCache[self.url] = soup

        # author: the site shows only a display name, which is used for
        # both authorId and author.
        authorDiv = soup.find("div", "author-pane-line author-name")
        authorId = authorDiv.string.strip()
        self.story.setMetadata('authorId', authorId)
        self.story.setMetadata('author', authorId)
        # TODO not really true need to be loggedin for this to work or fetch userid
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+authorId)

        # TODO better metadata
        # Submission date: drop the trailing author part, translate the
        # German month name, then parse with self.dateformat.
        date = soup.find("div", {"class": "submitted"}).string.strip()
        date = re.sub(" —.*", "", date)
        date = _translate_date_german_english(date)
        self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
        title1 = soup.find("h1", {'class': 'title'}).string

        # Taxonomy tags double as categories.
        for tagLink in soup.find("ul", "taxonomy").findAll("a"):
            self.story.addToList('category', tagLink.string)

        ## Retrieve chapter soups, either by probing numbered URLs or by
        ## following the page's prev/next links.
        if self.getConfig('find_chapters') == 'guess':
            self.chapterUrls = []
            self._find_chapters_by_guessing(title1)
        else:
            self._find_chapters_by_parsing(soup)

        # Prefer the first chapter's heading for the story title.
        firstChapterUrl = self.chapterUrls[0][1]
        if firstChapterUrl in self.soupsCache:
            firstChapterSoup = self.soupsCache[firstChapterUrl]
            h1 = firstChapterSoup.find("h1").text
        else:
            h1 = soup.find("h1").text

        # Strip any chapter-numbering suffix from the heading.
        h1 = re.sub(_REGEX_CHAPTER_TITLE, "", h1)
        self.story.setMetadata('title', h1)
        self.story.setMetadata('numChapters', len(self.chapterUrls))
        return
|
||||
|
||||
def _find_chapters_by_parsing(self, soup):
|
||||
|
||||
# store original soup
|
||||
origSoup = soup
|
||||
|
||||
#
|
||||
# find first chapter
|
||||
#
|
||||
firstLink = None
|
||||
firstLinkDiv = soup.find("div", "field-field-erster-teil")
|
||||
if firstLinkDiv is not None:
|
||||
firstLink = "http://%s%s" % (self.getSiteDomain(), firstLinkDiv.findNext("a")['href'])
|
||||
logger.debug("Found first chapter right away <%s>" % firstLink)
|
||||
try:
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(firstLink))
|
||||
self.soupsCache[firstLink] = soup
|
||||
self.chapterUrls.insert(0, (soup.find("h1").text, firstLink))
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise exceptions.StoryDoesNotExist(firstLink)
|
||||
else:
|
||||
logger.debug("DIDN'T find first chapter right away")
|
||||
# parse previous Link until first
|
||||
while True:
|
||||
prevLink = None
|
||||
prevLinkDiv = soup.find("div", "field-field-vorheriger-teil")
|
||||
if prevLinkDiv is not None:
|
||||
prevLink = prevLinkDiv.find("a")
|
||||
if prevLink is None:
|
||||
prevLink = soup.find("a", text=re.compile("<<<")) # <<<
|
||||
if prevLink is None:
|
||||
logger.debug("Couldn't find prev part")
|
||||
break
|
||||
else:
|
||||
logger.debug("Previous Chapter <%s>" % prevLink)
|
||||
if type(prevLink) != bs.Tag or prevLink.name != "a":
|
||||
prevLink = prevLink.findParent("a")
|
||||
if prevLink is None or '#' in prevLink['href']:
|
||||
logger.debug("Couldn't find prev part (false positive) <%s>" % prevLink)
|
||||
break
|
||||
prevLink = prevLink['href']
|
||||
try:
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(prevLink))
|
||||
self.soupsCache[prevLink] = soup
|
||||
prevTtitle = soup.find("h1", {'class': 'title'}).string
|
||||
self.chapterUrls.insert(0, (prevTtitle, prevLink))
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(nextLink)
|
||||
else:
|
||||
raise e
|
||||
firstLink = prevLink
|
||||
|
||||
# if first chapter couldn't be determined, assume the URL originally
|
||||
# passed is the first chapter
|
||||
if firstLink is None:
|
||||
logger.debug("Couldn't set first chapter")
|
||||
firstLink = self.url
|
||||
self.chapterUrls.insert(0, (soup.find("h1").text, firstLink))
|
||||
|
||||
# set first URL
|
||||
logger.debug("Set first link: %s" % firstLink)
|
||||
self._setURL(firstLink)
|
||||
self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(firstLink).group('storyId'))
|
||||
|
||||
#
|
||||
# Parse next chapters
|
||||
#
|
||||
while True:
|
||||
nextLink = None
|
||||
nextLinkDiv = soup.find("div", "field-field-naechster-teil")
|
||||
if nextLinkDiv is not None:
|
||||
nextLink = nextLinkDiv.find("a")
|
||||
if nextLink is None:
|
||||
nextLink = soup.find("a", text=re.compile(">>>"))
|
||||
if nextLink is None:
|
||||
nextLink = soup.find("a", text=re.compile("Fortsetzung"))
|
||||
|
||||
if nextLink is None:
|
||||
logger.debug("Couldn't find next part")
|
||||
break
|
||||
else:
|
||||
if type(nextLink) != bs.Tag or nextLink.name != "a":
|
||||
nextLink = nextLink.findParent("a")
|
||||
if nextLink is None or '#' in nextLink['href']:
|
||||
logger.debug("Couldn't find next part (false positive) <%s>" % nextLink)
|
||||
break
|
||||
nextLink = nextLink['href']
|
||||
|
||||
if not nextLink.startswith('http:'):
|
||||
nextLink = 'http://' + self.getSiteDomain() + nextLink
|
||||
|
||||
for loadedChapter in self.chapterUrls:
|
||||
if loadedChapter[0] == nextLink:
|
||||
logger.debug("ERROR: Repeating chapter <%s> Try to fix it" % nextLink)
|
||||
nextLinkMatch = _REGEX_TRAILING_DIGIT.match(nextLink)
|
||||
if nextLinkMatch is not None:
|
||||
curChap = nextLinkMatch.group(1)
|
||||
nextLink = re.sub(_REGEX_TRAILING_DIGIT, str(int(curChap) + 1), nextLink)
|
||||
else:
|
||||
break
|
||||
try:
|
||||
data = self._fetchUrl(nextLink)
|
||||
soup = bs.BeautifulSoup(data)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(nextLink)
|
||||
else:
|
||||
raise e
|
||||
title2 = soup.find("h1", {'class': 'title'}).string
|
||||
self.chapterUrls.append((title2, nextLink))
|
||||
logger.debug("Grabbing next chapter URL " + nextLink)
|
||||
self.soupsCache[nextLink] = soup
|
||||
# [comment.extract() for comment in soup.findAll(text=lambda text:isinstance(text, bs.Comment))]
|
||||
logger.debug("Chapters: %s" % self.chapterUrls)
|
||||
|
||||
|
||||
def _find_chapters_by_guessing(self, title1):
|
||||
step = _INITIAL_STEP
|
||||
curMax = self.maxChapter + step
|
||||
lastHit = True
|
||||
while True:
|
||||
nextChapterUrl = re.sub(_REGEX_TRAILING_DIGIT, str(curMax), self.url)
|
||||
if nextChapterUrl == self.url:
|
||||
logger.debug("Unable to guess next chapter because URL doesn't end in numbers")
|
||||
break;
|
||||
try:
|
||||
logger.debug("Trying chapter URL " + nextChapterUrl)
|
||||
data = self._fetchUrl(nextChapterUrl)
|
||||
hit = True
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
hit = False
|
||||
else:
|
||||
raise e
|
||||
if hit:
|
||||
logger.debug("Found chapter URL " + nextChapterUrl)
|
||||
self.maxChapter = curMax
|
||||
self.soupsCache[nextChapterUrl] = bs.BeautifulSoup(data)
|
||||
if not lastHit:
|
||||
break
|
||||
lastHit = curMax
|
||||
curMax += step
|
||||
else:
|
||||
lastHit = False
|
||||
curMax -= 1
|
||||
logger.debug(curMax)
|
||||
|
||||
for i in xrange(1, self.maxChapter):
|
||||
nextChapterUrl = re.sub(_REGEX_TRAILING_DIGIT, str(i), self.url)
|
||||
nextChapterTitle = re.sub("1", str(i), title1)
|
||||
self.chapterUrls.append((nextChapterTitle, nextChapterUrl))
|
||||
|
||||
def getChapterText(self, url):
|
||||
|
||||
if url in self.soupsCache:
|
||||
logger.debug('Getting chapter <%s> from cache' % url)
|
||||
soup = self.soupsCache[url]
|
||||
else:
|
||||
logger.debug('Downloading chapter <%s>' % url)
|
||||
data1 = self._fetchUrl(url)
|
||||
soup = bs.BeautifulSoup(data1)
|
||||
#strip comments from soup
|
||||
[comment.extract() for comment in soup.findAll(text=lambda text:isinstance(text, bs.Comment))]
|
||||
|
||||
# get story text
|
||||
storyDiv1 = bs.Tag(soup, "div")
|
||||
for para in soup.find("div", "full-node").find('div', 'content').findAll("p"):
|
||||
storyDiv1.append(para)
|
||||
storyDiv1.append('<br />')
|
||||
storytext = self.utf8FromSoup(url,storyDiv1)
|
||||
|
||||
return storytext
|
||||
|
||||
|
||||
def getClass():
    # Entry point used by the adapter registry to obtain this module's
    # adapter class.
    return BdsmGeschichtenAdapter
|
||||
193
fanficdownloader/adapters/adapter_bloodshedversecom.py
Normal file
193
fanficdownloader/adapters/adapter_bloodshedversecom.py
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
from datetime import timedelta
|
||||
import re
|
||||
import urllib2
|
||||
import urlparse
|
||||
|
||||
from .. import BeautifulSoup
|
||||
from ..htmlcleanup import stripHTML
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
from .. import exceptions
|
||||
|
||||
|
||||
def getClass():
    # Entry point used by the adapter registry to obtain this module's
    # adapter class.
    return BloodshedverseComAdapter
|
||||
|
||||
|
||||
def _get_query_data(url):
|
||||
components = urlparse.urlparse(url)
|
||||
query_data = urlparse.parse_qs(components.query)
|
||||
return dict((key, data[0]) for key, data in query_data.items())
|
||||
|
||||
|
||||
class BloodshedverseComAdapter(BaseSiteAdapter):
|
||||
    # Unique site abbreviation used in story metadata.
    SITE_ABBREVIATION = 'bvc'
    SITE_DOMAIN = 'bloodshedverse.com'

    BASE_URL = 'http://' + SITE_DOMAIN + '/'
    # Canonical chapter-reading URL; %s is the story/chapter number.
    READ_URL_TEMPLATE = BASE_URL + 'stories.php?go=read&no=%s'

    # Site date formats ("Started: 04/26/2011", "Updated: 03/06/2013 9:15"
    # with a separate am/pm marker handled by hand).
    STARTED_DATETIME_FORMAT = '%m/%d/%Y'
    UPDATED_DATETIME_FORMAT = '%m/%d/%Y %I:%M'
|
||||
|
||||
    def __init__(self, config, url):
        """Extract the story number from *url* and normalize the URL."""
        BaseSiteAdapter.__init__(self, config, url)

        # The story id is the "no" query parameter of the original URL.
        query_data = urlparse.parse_qs(self.parsedUrl.query)
        story_no = query_data['no'][0]

        self.story.setMetadata('storyId', story_no)
        # Normalize to the go=read form regardless of which URL was given.
        self._setURL(self.READ_URL_TEMPLATE % story_no)
        self.story.setMetadata('siteabbrev', self.SITE_ABBREVIATION)
|
||||
|
||||
def _customized_fetch_url(self, url, exception=None, parameters=None):
|
||||
if exception:
|
||||
try:
|
||||
data = self._fetchUrl(url, parameters)
|
||||
except urllib2.HTTPError:
|
||||
raise exception(self.url)
|
||||
# Just let self._fetchUrl throw the exception, don't catch and
|
||||
# customize it.
|
||||
else:
|
||||
data = self._fetchUrl(url, parameters)
|
||||
|
||||
return BeautifulSoup.BeautifulSoup(data)
|
||||
|
||||
    @staticmethod
    def getSiteDomain():
        return BloodshedverseComAdapter.SITE_DOMAIN

    @classmethod
    def getSiteExampleURLs(cls):
        # Example story URL shown to users.
        return cls.READ_URL_TEMPLATE % 1234

    def getSiteURLPattern(self):
        # Accept both the go=read and go=chapters forms of a story URL.
        return re.escape(self.BASE_URL + 'stories.php?go=') + r'(read|chapters)\&no=\d+$'

    # Override stripURLParameters so the "no" parameter won't get stripped
    @classmethod
    def stripURLParameters(cls, url):
        return url
|
||||
|
||||
    def extractChapterUrlsAndMetadata(self):
        """Collect the chapter list from the story page, then scrape the
        story's metadata from its entry on the author's page."""
        soup = self._customized_fetch_url(self.url)

        # Since no 404 error code we have to raise the exception ourselves.
        # A title that is just 'by' indicates that there is no author name
        # and no story title available.
        if stripHTML(soup.title) == 'by':
            raise exceptions.StoryDoesNotExist(self.url)

        # Chapter list comes from the chapter-select dropdown.
        for option in soup.find('select', {'name': 'chapter'}):
            title = stripHTML(option)
            url = self.READ_URL_TEMPLATE % option['value']
            self.chapterUrls.append((title, url))

        # Get the URL to the author's page and find the correct story entry to
        # scrape the metadata
        author_url = urlparse.urljoin(self.url, soup.find('a', {'class': 'headline'})['href'])
        soup = self._customized_fetch_url(author_url)

        story_no = self.story.getMetadata('storyId')
        # Ignore first list_box div, it only contains the author information
        for list_box in soup('div', {'class': 'list_box'})[1:]:
            url = list_box.find('a', {'class': 'fictitle'})['href']
            query_data = _get_query_data(url)

            # Found the div containing the story's metadata; break the loop and
            # parse the element
            if query_data['no'] == story_no:
                break
        else:
            # for/else: no entry matched the story number.
            raise exceptions.FailedToDownload(self.url)

        title_anchor = list_box.find('a', {'class': 'fictitle'})
        self.story.setMetadata('title', stripHTML(title_anchor))

        # Author name/id/URL from the anchor following the title.
        author_anchor = title_anchor.findNextSibling('a')
        self.story.setMetadata('author', stripHTML(author_anchor))
        self.story.setMetadata('authorId', _get_query_data(author_anchor['href'])['who'])
        self.story.setMetadata('authorUrl', urlparse.urljoin(self.url, author_anchor['href']))

        # Review count is the first word of the review link text.
        list_review = list_box.find('div', {'class': 'list_review'})
        reviews = stripHTML(list_review.a).split(' ', 1)[0]
        self.story.setMetadata('reviews', reviews)

        summary_div = list_box.find('div', {'class': 'list_summary'})
        if not self.getConfig('keep_summary_html'):
            summary = ''.join(summary_div(text=True))
        else:
            summary = self.utf8FromSoup(author_url, summary_div)

        self.story.setMetadata('description', summary)

        # I'm assuming this to be the category, not sure what else it could be
        first_listinfo = list_box.find('div', {'class': 'list_info'})
        self.story.addToList('category', stripHTML(first_listinfo.a))

        # Remaining metadata appears as "<b>Key:</b> value" pairs.
        for list_info in first_listinfo.findNextSiblings('div', {'class': 'list_info'}):
            for b_tag in list_info('b'):
                key = b_tag.string.strip(': ')
                # Strip colons from the beginning, superfluous spaces and minus
                # characters from the end, and possibly trailing commas from
                # the warnings if only one is present
                value = b_tag.nextSibling.string.strip(': -,')

                if key == 'Genre':
                    for genre in value.split(', '):
                        # Ignore the "none" genre
                        if not genre == 'none':
                            self.story.addToList('genre', genre)

                elif key == 'Rating':
                    self.story.setMetadata('rating', value)

                elif key == 'Complete':
                    self.story.setMetadata('status', 'Completed' if value == 'Yes' else 'In-Progress')

                elif key == 'Warning':
                    for warning in value.split(', '):
                        # The string here starts with ", " before the actual list
                        # of values sometimes, so check for an empty warning
                        # and ignore the "none" warning.
                        if not warning or warning == 'none':
                            continue

                        self.story.addToList('warnings', warning)

                elif key == 'Chapters':
                    self.story.setMetadata('numChapters', int(value))

                elif key == 'Words':
                    # Apparently only numChapters need to be an integer for
                    # some strange reason. Remove possible ',' characters as to
                    # not confuse the codebase down the line
                    self.story.setMetadata('numWords', value.replace(',', ''))

                elif key == 'Started':
                    self.story.setMetadata('datePublished', makeDate(value, self.STARTED_DATETIME_FORMAT))

                elif key == 'Updated':
                    date_string, period = value.rsplit(' ', 1)
                    date = makeDate(date_string, self.UPDATED_DATETIME_FORMAT)

                    # Rather ugly hack to work around Calibre's changing of
                    # Python's locale setting, causing am/pm to not be properly
                    # parsed by strptime() when using a non-english locale
                    if period == 'pm':
                        date += timedelta(hours=12)
                    self.story.setMetadata('dateUpdated', date)

        # Adult gate is enforced only for NC-17 stories on this site.
        if self.story.getMetadata('rating') == 'NC-17' and not (self.is_adult or self.getConfig('is_adult')):
            raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
def getChapterText(self, url):
|
||||
soup = self._customized_fetch_url(url)
|
||||
storytext_div = soup.find('div', {'class': 'storytext'})
|
||||
|
||||
if self.getConfig('strip_text_links'):
|
||||
for anchor in storytext_div('a', {'class': 'FAtxtL'}):
|
||||
navigable_string = BeautifulSoup.NavigableString(anchor.string)
|
||||
anchor.replaceWith(navigable_string)
|
||||
|
||||
return self.utf8FromSoup(url, storytext_div)
|
||||
337
fanficdownloader/adapters/adapter_bloodtiesfancom.py
Normal file
337
fanficdownloader/adapters/adapter_bloodtiesfancom.py
Normal file
|
|
@ -0,0 +1,337 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
# By virtue of being recent and requiring both is_adult and user/pass,
|
||||
# adapter_fanficcastletvnet.py is the best choice for learning to
|
||||
# write adapters--especially for sites that use the eFiction system.
|
||||
# Most sites that have ".../viewstory.php?sid=123" in the story URL
|
||||
# are eFiction.
|
||||
|
||||
# For non-eFiction sites, it can be considerably more complex, but
|
||||
# this is still a good starting point.
|
||||
|
||||
# In general an 'adapter' needs to do these five things:
|
||||
|
||||
# - 'Register' correctly with the downloader
|
||||
# - Site Login (if needed)
|
||||
# - 'Are you adult?' check (if needed--some do one, some the other, some both)
|
||||
# - Grab the chapter list
|
||||
# - Grab the story meta-data (some (non-eFiction) adapters have to get it from the author page)
|
||||
# - Grab the chapter texts
|
||||
|
||||
# Search for XXX comments--that's where things are most likely to need changing.
|
||||
|
||||
# This function is called by the downloader in all adapter_*.py files
|
||||
# in this dir to register the adapter class. So it needs to be
|
||||
# updated to reflect the class below it. That, plus getSiteDomain()
|
||||
# take care of 'Registering'.
|
||||
def getClass():
    # Entry point used by the adapter registry to obtain this module's
    # adapter class.
    return BloodTiesFansComAdapter # XXX
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class BloodTiesFansComAdapter(BaseSiteAdapter): # XXX
|
||||
|
||||
    def __init__(self, config, url):
        """Set up site-specific state and normalize the story URL."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        # XXX Most sites don't have the /fanfic part. Replace all to remove it usually.
        self._setURL('http://' + self.getSiteDomain() + '/fiction/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','btf') # XXX

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%b %d, %Y" # XXX
|
||||
|
||||
    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'bloodties-fans.com' # XXX

    @classmethod
    def getSiteExampleURLs(cls):
        # Example story URL shown to users.
        return "http://"+cls.getSiteDomain()+"/fiction/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        # Only the canonical viewstory URL with a numeric sid is accepted.
        return re.escape("http://"+self.getSiteDomain()+"/fiction/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites.
|
||||
def needToLoginCheck(self, data):
|
||||
if 'Registered Users Only' in data \
|
||||
or 'There is no such account on our website' in data \
|
||||
or "That password doesn't match the one in our database" in data:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def performLogin(self, url):
|
||||
params = {}
|
||||
|
||||
if self.password:
|
||||
params['penname'] = self.username
|
||||
params['password'] = self.password
|
||||
else:
|
||||
params['penname'] = self.getConfig("username")
|
||||
params['password'] = self.getConfig("password")
|
||||
params['cookiecheck'] = '1'
|
||||
params['submit'] = 'Submit'
|
||||
|
||||
loginUrl = 'http://' + self.getSiteDomain() + '/fiction/user.php?action=login'
|
||||
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
|
||||
params['penname']))
|
||||
|
||||
d = self._fetchUrl(loginUrl, params)
|
||||
|
||||
if "Member Account" not in d : #Member Account
|
||||
logger.info("Failed to login to URL %s as %s" % (loginUrl,
|
||||
params['penname']))
|
||||
raise exceptions.FailedToLogin(url,params['penname'])
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Scrape the story index page for chapter links and metadata,
        negotiating the site's adult-content warning pages as needed."""
        if self.is_adult or self.getConfig("is_adult"):
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.

            # Furthermore, there's a couple sites now with more than
            # one warning level for different ratings.  And they're
            # fussy about it.  midnightwhispers has three: 4, 2 & 1.
            # we'll try 1 first.
            addurl = "&ageconsent=ok&warning=4" # XXX
        else:
            addurl=""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        # The actual text that is used to announce you need to be an
        # adult varies from site to site.  Again, print data before
        # the title search to troubleshoot.

        # Since the warning text can change by warning level, let's
        # look for the warning pass url.  nfacommunity uses
        # &warning= -- actually, so do other sites.  Must be an
        # eFiction book.

        # viewstory.php?sid=561&warning=4
        # viewstory.php?sid=561&warning=1
        # viewstory.php?sid=561&warning=2
        #print data
        #m = re.search(r"'viewstory.php\?sid=1882(&warning=4)'",data)
        m = re.search(r"'viewstory.php\?sid=\d+((?:&ageconsent=ok)?&warning=\d+)'",data)
        if m != None:
            if self.is_adult or self.getConfig("is_adult"):
                # We tried the default and still got a warning, so
                # let's pull the warning number from the 'continue'
                # link and reload data.
                addurl = m.group(1)
                # correct stupid & error in url.
                addurl = addurl.replace("&amp;","&")
                url = self.url+'&index=1'+addurl
                logger.debug("URL 2nd try: "+url)

                try:
                    data = self._fetchUrl(url)
                except urllib2.HTTPError, e:
                    if e.code == 404:
                        raise exceptions.StoryDoesNotExist(self.url)
                    else:
                        raise e
            else:
                raise exceptions.AdultCheckRequired(self.url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        # print data

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/fiction/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/fiction/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method: dict-style lookup that returns "" on failure.
        def defaultGetattr(d,k):
            try:
                return d[k]
            except:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = ""
                while not defaultGetattr(value,'class') == 'label':
                    svalue += str(value)
                    value = value.nextSibling
                self.setDescription(url,svalue)
                #self.story.setMetadata('description',stripHTML(svalue))

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                catstext = [cat.string for cat in cats]
                for cat in catstext:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                charstext = [char.string for char in chars]
                for char in charstext:
                    self.story.addToList('characters',char.string)

            ## Not all sites use Genre, but there's no harm to
            ## leaving it in.  Check to make sure the type_id number
            ## is correct, though--it's site specific.
            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
                genrestext = [genre.string for genre in genres]
                self.genre = ', '.join(genrestext)
                for genre in genrestext:
                    self.story.addToList('genre',genre.string)

            ## Not all sites use Warnings, but there's no harm to
            ## leaving it in.  Check to make sure the type_id number
            ## is correct, though--it's site specific.
            ## NOTE(review): uses the same type_id=2 as Genre above --
            ## looks like a copy-paste placeholder; confirm per-site.
            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
                warningstext = [warning.string for warning in warnings]
                self.warning = ', '.join(warningstext)
                for warning in warningstext:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                # there's a stray [ at the end.
                #value = value[0:-1]
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/fiction/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            i=1
            for a in storyas:
                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl',series_url)
                    break
                i+=1

        except:
            # I find it hard to care if the series parsing fails
            pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
296
fanficdownloader/adapters/adapter_buffynfaithnet.py
Normal file
296
fanficdownloader/adapters/adapter_buffynfaithnet.py
Normal file
|
|
@ -0,0 +1,296 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2013 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
import cookielib as cl
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
|
||||
# This function is called by the downloader in all adapter_*.py files
|
||||
# in this dir to register the adapter class. So it needs to be
|
||||
# updated to reflect the class below it. That, plus getSiteDomain()
|
||||
# take care of 'Registering'.
|
||||
def getClass():
|
||||
return BuffyNFaithNetAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class BuffyNFaithNetAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.setHeader()
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query correct
|
||||
m = re.match(self.getSiteURLPattern(),url)
|
||||
if m:
|
||||
self.story.setMetadata('storyId',m.group('id'))
|
||||
|
||||
# normalized story URL. gets rid of chapter if there, left with ch 1 URL on this site
|
||||
nurl = "http://"+self.getSiteDomain()+"/fanfictions/index.php?act=vie&id="+self.story.getMetadata('storyId')
|
||||
self._setURL(nurl)
|
||||
#argh, this mangles the ampersands I need on metadata['storyUrl']
|
||||
#will set it this way
|
||||
self.story.setMetadata('storyUrl',nurl,condremoveentities=False)
|
||||
else:
|
||||
raise exceptions.InvalidStoryURL(url,
|
||||
self.getSiteDomain(),
|
||||
self.getSiteExampleURLs())
|
||||
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','bnfnet')
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'buffynfaith.net'
|
||||
|
||||
@classmethod
|
||||
def stripURLParameters(cls,url):
|
||||
"Only needs to be overriden if URL contains more than one parameter"
|
||||
## This adapter needs at least two parameters left on the URL, act and id
|
||||
return re.sub(r"(\?act=(vie|ovr)&id=\d+)&.*$",r"\1",url)
|
||||
|
||||
def setHeader(self):
|
||||
"buffynfaith.net wants a Referer for images. Used both above and below(after cookieproc added)"
|
||||
self.opener.addheaders.append(('Referer', 'http://'+self.getSiteDomain()+'/'))
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/fanfictions/index.php?act=vie&id=1234 http://"+cls.getSiteDomain()+"/fanfictions/index.php?act=ovr&id=1234 http://"+cls.getSiteDomain()+"/fanfictions/index.php?act=vie&id=1234&ch=2"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
#http://buffynfaith.net/fanfictions/index.php?act=vie&id=963
|
||||
#http://buffynfaith.net/fanfictions/index.php?act=vie&id=949
|
||||
#http://buffynfaith.net/fanfictions/index.php?act=vie&id=949&ch=2
|
||||
p = re.escape("http://"+self.getSiteDomain()+"/fanfictions/index.php?act=")+\
|
||||
r"(vie|ovr)&id=(?P<id>\d+)(&ch=(?P<ch>\d+))?$"
|
||||
return p
|
||||
|
||||
def use_pagecache(self):
|
||||
'''
|
||||
adapters that will work with the page cache need to implement
|
||||
this and change it to True.
|
||||
'''
|
||||
return True
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
dateformat = "%d %B %Y"
|
||||
url = self.url
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
#set a cookie to get past adult check
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
cookie = cl.Cookie(version=0, name='my_age', value='yes',
|
||||
port=None, port_specified=False,
|
||||
domain=self.getSiteDomain(), domain_specified=False, domain_initial_dot=False,
|
||||
path='/', path_specified=True,
|
||||
secure=False,
|
||||
expires=time.time()+10000,
|
||||
discard=False,
|
||||
comment=None,
|
||||
comment_url=None,
|
||||
rest={'HttpOnly': None},
|
||||
rfc2109=False)
|
||||
self.cookiejar.set_cookie(cookie)
|
||||
self.setHeader()
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
#print data
|
||||
|
||||
if "ADULT CONTENT WARNING" in data:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
#stuff in <head>: description
|
||||
svalue = soup.head.find('meta',attrs={'name':'description'})['content']
|
||||
#self.story.setMetadata('description',svalue)
|
||||
self.setDescription(url,svalue)
|
||||
|
||||
#useful stuff in rest of doc, all contained in this:
|
||||
doc = soup.body.find('div', id='my_wrapper')
|
||||
|
||||
#first the site category (more of a genre to me, meh) and title, in this element:
|
||||
mt = doc.find('div',attrs={'class':'maintitle'})
|
||||
self.story.addToList('genre',mt.findAll('a')[1].string)
|
||||
self.story.setMetadata('title',mt.findAll('a')[1].nextSibling[len(' » '):])
|
||||
del mt
|
||||
|
||||
#the actual category, for me, is 'Buffy: The Vampire Slayer'
|
||||
#self.story.addToList('category','Buffy: The Vampire Slayer')
|
||||
#No need to do it here, it is better to set it in in plugin-defaults.ini and defaults.ini
|
||||
|
||||
#then a block that sits in a table cell like so:
|
||||
#(contains a lot of metadata)
|
||||
mblock = doc.find('td', align='left', width = '70%').contents
|
||||
while len(mblock) > 0:
|
||||
i = mblock.pop(0)
|
||||
if 'Author:' in i.string:
|
||||
#drop empty space
|
||||
mblock.pop(0)
|
||||
#get author link
|
||||
a = mblock.pop(0)
|
||||
authre = re.escape('./index.php?act=bio&id=')+'(?P<authid>\d+)'
|
||||
m = re.match(authre,a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
self.story.setMetadata('authorId',m.group('authid'))
|
||||
authurl = u'http://%s/fanfictions/index.php?act=bio&id=%s' % ( self.getSiteDomain(),
|
||||
self.story.getMetadata('authorId'))
|
||||
self.story.setMetadata('authorUrl',authurl,condremoveentities=False)
|
||||
#drop empty space
|
||||
mblock.pop(0)
|
||||
if 'Rating:' in i.string:
|
||||
self.story.setMetadata('rating',mblock.pop(0).strip())
|
||||
if 'Published:' in i.string:
|
||||
date = mblock.pop(0).strip()
|
||||
#get rid of 'st', 'nd', 'rd', 'th' after day number
|
||||
date = date[0:2]+date[4:]
|
||||
self.story.setMetadata('datePublished',makeDate(date, dateformat))
|
||||
if 'Last Updated:' in i.string:
|
||||
date = mblock.pop(0).strip()
|
||||
#get rid of 'st', 'nd', 'rd', 'th' after day number
|
||||
date = date[0:2]+date[4:]
|
||||
self.story.setMetadata('dateUpdated',makeDate(date, dateformat))
|
||||
if 'Genre:' in i.string:
|
||||
genres = mblock.pop(0).strip()
|
||||
genres = genres.split('/')
|
||||
for genre in genres: self.story.addToList('genre',genre)
|
||||
#end ifs
|
||||
#end while
|
||||
|
||||
# Find the chapter selector
|
||||
select = soup.find('select', { 'name' : 'ch' } )
|
||||
|
||||
if select is None:
|
||||
# no selector found, so it's a one-chapter story.
|
||||
#self.chapterUrls.append((self.story.getMetadata('title'),url))
|
||||
self.chapterUrls.append((self.story.getMetadata('title'),url))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = u'http://%s/fanfictions/index.php?act=vie&id=%s&ch=%s' % ( self.getSiteDomain(),
|
||||
self.story.getMetadata('storyId'),
|
||||
o['value'])
|
||||
title = u"%s" % o
|
||||
title = stripHTML(title)
|
||||
ts = title.split(' ',1)
|
||||
title = ts[0]+'. '+ts[1]
|
||||
self.chapterUrls.append((title,url))
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
## Go scrape the rest of the metadata from the author's page.
|
||||
data = self._fetchUrl(self.story.getMetadata('authorUrl'))
|
||||
soup = bs.BeautifulSoup(data)
|
||||
#find the story link and its parent div
|
||||
storya = soup.find('a',{'href':self.story.getMetadata('storyUrl')})
|
||||
storydiv = storya.parent
|
||||
#warnings come under a <spawn> tag. Never seen that before...
|
||||
#appears to just be a line of freeform text, not necessarily a list
|
||||
#optional
|
||||
spawn = storydiv.find('spawn',{'id':'warnings'})
|
||||
if spawn is not None:
|
||||
warns = spawn.nextSibling.strip()
|
||||
self.story.addToList('warnings',warns)
|
||||
#some meta in spans - this should get all, even the ones jammed in a table
|
||||
spans = storydiv.findAll('span')
|
||||
for s in spans:
|
||||
if s.string == 'Ship:':
|
||||
list = s.nextSibling.strip().split()
|
||||
self.story.extendList('ships',list)
|
||||
if s.string == 'Characters:':
|
||||
list = s.nextSibling.strip().split(',')
|
||||
self.story.extendList('characters',list)
|
||||
if s.string == 'Status:':
|
||||
st = s.nextSibling.strip()
|
||||
self.story.setMetadata('status',st)
|
||||
if s.string == 'Words:':
|
||||
st = s.nextSibling.strip()
|
||||
self.story.setMetadata('numWords',st)
|
||||
|
||||
#reviews - is this worth having?
|
||||
#ffnet adapter gathers it, don't know if anything else does
|
||||
#or if it's ever going to be used!
|
||||
a = storydiv.find('a',{'id':'bold-blue'})
|
||||
if a:
|
||||
revs = a.nextSibling.strip()[1:-1]
|
||||
self.story.setMetadata('reviews',st)
|
||||
else:
|
||||
revs = '0'
|
||||
self.story.setMetadata('reviews',st)
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'fanfiction'})
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
#remove all the unnecessary bookmark tags
|
||||
[s.extract() for s in div('div',{'class':"tiny_box2"})]
|
||||
|
||||
#is there a review link?
|
||||
r = div.find('a',href=re.compile(re.escape("./index.php?act=irv")+".*$"))
|
||||
if r is not None:
|
||||
#remove the review link and its parent div
|
||||
r.parent.extract()
|
||||
|
||||
#There might also be a link to the sequel on the last chapter
|
||||
#I'm inclined to keep it in, but the URL needs to be changed from relative to absolute
|
||||
#Shame there isn't proper series metadata available
|
||||
#(I couldn't find it anyway)
|
||||
s = div.find('a',href=re.compile(re.escape("./index.php?act=ovr")+".*$"))
|
||||
if s is not None:
|
||||
s['href'] = 'http://'+self.getSiteDomain()+'/fanfictions'+s['href'][1:]
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
310
fanficdownloader/adapters/adapter_castlefansorg.py
Normal file
310
fanficdownloader/adapters/adapter_castlefansorg.py
Normal file
|
|
@ -0,0 +1,310 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
# By virtue of being recent and requiring both is_adult and user/pass,
|
||||
# adapter_fanficcastletvnet.py is the best choice for learning to
|
||||
# write adapters--especially for sites that use the eFiction system.
|
||||
# Most sites that have ".../viewstory.php?sid=123" in the story URL
|
||||
# are eFiction.
|
||||
|
||||
# For non-eFiction sites, it can be considerably more complex, but
|
||||
# this is still a good starting point.
|
||||
|
||||
# In general an 'adapter' needs to do these five things:
|
||||
|
||||
# - 'Register' correctly with the downloader
|
||||
# - Site Login (if needed)
|
||||
# - 'Are you adult?' check (if needed--some do one, some the other, some both)
|
||||
# - Grab the chapter list
|
||||
# - Grab the story meta-data (some (non-eFiction) adapters have to get it from the author page)
|
||||
# - Grab the chapter texts
|
||||
|
||||
# Search for XXX comments--that's where things are most likely to need changing.
|
||||
|
||||
# This function is called by the downloader in all adapter_*.py files
|
||||
# in this dir to register the adapter class. So it needs to be
|
||||
# updated to reflect the class below it. That, plus getSiteDomain()
|
||||
# take care of 'Registering'.
|
||||
def getClass():
|
||||
return CastleFansOrgAdapter # XXX
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class CastleFansOrgAdapter(BaseSiteAdapter): # XXX
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
# XXX Most sites don't have the /fanfic part. Replace all to remove it usually.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/fanfic/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','cslf') # XXX
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%b %d, %Y" # XXX
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'castlefans.org' # XXX
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/fanfic/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/fanfic/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites.
|
||||
def needToLoginCheck(self, data):
|
||||
if 'Registered Users Only' in data \
|
||||
or 'There is no such account on our website' in data \
|
||||
or "That password doesn't match the one in our database" in data:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def performLogin(self, url):
|
||||
params = {}
|
||||
|
||||
if self.password:
|
||||
params['penname'] = self.username
|
||||
params['password'] = self.password
|
||||
else:
|
||||
params['penname'] = self.getConfig("username")
|
||||
params['password'] = self.getConfig("password")
|
||||
params['cookiecheck'] = '1'
|
||||
params['submit'] = 'Submit'
|
||||
|
||||
loginUrl = 'http://' + self.getSiteDomain() + '/fanfic/user.php?action=login'
|
||||
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
|
||||
params['penname']))
|
||||
|
||||
d = self._fetchUrl(loginUrl, params)
|
||||
|
||||
if "Member Account" not in d : #Member Account
|
||||
logger.info("Failed to login to URL %s as %s" % (loginUrl,
|
||||
params['penname']))
|
||||
raise exceptions.FailedToLogin(url,params['penname'])
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# Weirdly, different sites use different warning numbers.
|
||||
# If the title search below fails, there's a good chance
|
||||
# you need a different number. print data at that point
|
||||
# and see what the 'click here to continue' url says.
|
||||
addurl = "&ageconsent=ok&warning=4" # XXX
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if self.needToLoginCheck(data):
|
||||
# need to log in for this one.
|
||||
self.performLogin(url)
|
||||
data = self._fetchUrl(url)
|
||||
|
||||
# The actual text that is used to announce you need to be an
|
||||
# adult varies from site to site. Again, print data before
|
||||
# the title search to troubleshoot.
|
||||
if "Age Consent Required" in data: # XXX
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
pagetitle = soup.find('div',{'id':'pagetitle'})
|
||||
## Title
|
||||
a = pagetitle.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = pagetitle.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/fanfic/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/fanfic/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
if 'Summary' in label:
|
||||
## Everything until the next span class='label'
|
||||
svalue = ""
|
||||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
catstext = [cat.string for cat in cats]
|
||||
for cat in catstext:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
charstext = [char.string for char in chars]
|
||||
for char in charstext:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
## Not all sites use Genre, but there's no harm to
|
||||
## leaving it in. Check to make sure the type_id number
|
||||
## is correct, though--it's site specific.
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
|
||||
genrestext = [genre.string for genre in genres]
|
||||
self.genre = ', '.join(genrestext)
|
||||
for genre in genrestext:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
## Not all sites use Warnings, but there's no harm to
|
||||
## leaving it in. Check to make sure the type_id number
|
||||
## is correct, though--it's site specific.
|
||||
if 'Warnings' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
|
||||
warningstext = [warning.string for warning in warnings]
|
||||
self.warning = ', '.join(warningstext)
|
||||
for warning in warningstext:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
# there's a stray [ at the end.
|
||||
#value = value[0:-1]
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/fanfic/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
238
fanficdownloader/adapters/adapter_chaossycophanthexcom.py
Normal file
238
fanficdownloader/adapters/adapter_chaossycophanthexcom.py
Normal file
|
|
@ -0,0 +1,238 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
|
||||
return ChaosSycophantHexComAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class ChaosSycophantHexComAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','csph')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%m/%d/%Y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'chaos.sycophanthex.com'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# Weirdly, different sites use different warning numbers.
|
||||
# If the title search below fails, there's a good chance
|
||||
# you need a different number. print data at that point
|
||||
# and see what the 'click here to continue' url says.
|
||||
addurl = "&ageconsent=ok&warning=19"
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
# The actual text that is used to announce you need to be an
|
||||
# adult varies from site to site. Again, print data before
|
||||
# the title search to troubleshoot.
|
||||
if "Age Consent Required" in data:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
pt = soup.find('div', {'id' : 'pagetitle'})
|
||||
a = pt.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = pt.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
rating=pt.text.split('(')[1].split(')')[0]
|
||||
self.story.setMetadata('rating', rating)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
|
||||
value = labels[0].previousSibling
|
||||
svalue = ""
|
||||
while value != None:
|
||||
val = value
|
||||
value = value.previousSibling
|
||||
while not defaultGetattr(val,'class') == 'label':
|
||||
svalue += str(val)
|
||||
val = val.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value.split(' -')[0])
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
for cat in cats:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
for char in chars:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
|
||||
for genre in genres:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
if 'Warnings' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
|
||||
for warning in warnings:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Complete' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value.split(' -')[0]), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
# there's a stray [ at the end.
|
||||
#value = value[0:-1]
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
238
fanficdownloader/adapters/adapter_checkmatedcom.py
Normal file
238
fanficdownloader/adapters/adapter_checkmatedcom.py
Normal file
|
|
@ -0,0 +1,238 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
|
||||
# Module entry point used by the adapter registry to obtain this
# module's adapter class.
def getClass():
    return CheckmatedComAdapter
|
||||
|
||||
|
||||
class CheckmatedComAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
self._setURL('http://' + self.getSiteDomain() + '/story.php?story='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','chm')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%b %d, %Y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'www.checkmated.com'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/story.php?story=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/story.php?story=")+r"\d+$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites. This story is in The Bedchamber
|
||||
def needToLoginCheck(self, data):
|
||||
if 'This story is in The Bedchamber' in data \
|
||||
or 'That username is not in our database' in data \
|
||||
or "That password is not correct, please try again" in data:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def performLogin(self, url):
|
||||
params = {}
|
||||
|
||||
if self.password:
|
||||
params['name'] = self.username
|
||||
params['pass'] = self.password
|
||||
else:
|
||||
params['name'] = self.getConfig("username")
|
||||
params['pass'] = self.getConfig("password")
|
||||
params['login'] = 'yes'
|
||||
params['submit'] = 'login'
|
||||
|
||||
loginUrl = 'http://' + self.getSiteDomain()+'/login.php'
|
||||
d = self._fetchUrl(loginUrl,params)
|
||||
e = self._fetchUrl(url)
|
||||
|
||||
if "Welcome back," not in d : #Member Account
|
||||
logger.info("Failed to login to URL %s as %s" % (loginUrl,
|
||||
params['name']))
|
||||
raise exceptions.FailedToLogin(url,params['name'])
|
||||
return False
|
||||
elif "This story is in The Bedchamber" in e:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Your account does not have sufficient priviliges to read this story.")
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if self.needToLoginCheck(data):
|
||||
# need to log in for this one.
|
||||
self.performLogin(url)
|
||||
data = self._fetchUrl(url)
|
||||
|
||||
# The actual text that is used to announce you need to be an
|
||||
# adult varies from site to site. Again, print data before
|
||||
# the title search to troubleshoot.
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('span', {'class' : 'storytitle'})
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = a.parent.find('a', href=re.compile(r"authors.php\?name\=\w+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
a = soup.find('select', {'name' : 'chapter'})
|
||||
if a == None:
|
||||
self.chapterUrls.append((self.story.getMetadata('title'),url))
|
||||
else:
|
||||
for chapter in a.findAll('option'):
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/story.php?story='+self.story.getMetadata('storyId')+'&chapter='+chapter['value']))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
|
||||
# website does not keep track of word count, and there is no convenient way to calculate it
|
||||
|
||||
summary = soup.find('fieldset')
|
||||
summary.find('legend').extract()
|
||||
summary.name='div'
|
||||
self.setDescription(url,summary)
|
||||
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
table = soup.findAll('div', {'class' : 'text'})[1]
|
||||
for labels in table.findAll('tr'):
|
||||
value = labels.findAll('td')[1]
|
||||
label = labels.findAll('td')[0]
|
||||
|
||||
|
||||
if 'Rating' in stripHTML(label):
|
||||
self.story.setMetadata('rating', stripHTML(value))
|
||||
|
||||
if 'Ship' in stripHTML(label):
|
||||
if value.string != "none/none":
|
||||
self.story.addToList('ships',value.string)
|
||||
for char in value.string.split('/'):
|
||||
if char != 'none':
|
||||
self.story.addToList('characters',char)
|
||||
|
||||
if 'Status' in stripHTML(label):
|
||||
if value.find('img', {'src' : 'img/incomplete.gif'}) == None:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in stripHTML(label):
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in stripHTML(label):
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
a = self._fetchUrl(self.story.getMetadata('authorUrl')+'&cat=stories')
|
||||
for story in bs.BeautifulSoup(a).findAll('table', {'class' : 'storyinfo'}):
|
||||
a = story.find('a', href=re.compile(r"review.php\?s\="+self.story.getMetadata('storyId')+'&act=view'))
|
||||
if a != None:
|
||||
for labels in story.findAll('tr'):
|
||||
value = labels.findAll('td')[1]
|
||||
label = labels.findAll('td')[0]
|
||||
if 'genre' in stripHTML(label):
|
||||
for genre in value.findAll('img'):
|
||||
self.story.addToList('genre',genre['title'])
|
||||
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'resizeableText'})
|
||||
div.find('div', {'class' : 'storyTools'}).extract()
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
56
fanficdownloader/adapters/adapter_csiforensicscom.py
Normal file
56
fanficdownloader/adapters/adapter_csiforensicscom.py
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import re
|
||||
from base_efiction_adapter import BaseEfictionAdapter
|
||||
|
||||
class CSIForensicsComAdapter(BaseEfictionAdapter):
    """eFiction adapter for csi-forensics.com (archive lives at the domain root)."""

    @staticmethod
    def getSiteDomain():
        return 'csi-forensics.com'

    @classmethod
    def getPathToArchive(cls):
        # the eFiction install is not under any sub-path.
        return ''

    @classmethod
    def getSiteAbbrev(cls):
        return 'csiforensics'

    @classmethod
    def getDateFormat(cls):
        return "%d %b %Y"

    def handleMetadataPair(self, key, value):
        """Split Warnings/Categories values into list entries; defer the rest."""
        if key == 'Warnings':
            # a literal 'None' means there are no warnings at all.
            if value == 'None':
                return
            for warning in re.split("\s*,\s*", value):
                self.story.addToList('warnings', warning)
        elif 'Categories' in key:
            # categories arrive as a 'Parent > Child' breadcrumb.
            for category in re.split("\s*>\s*", value):
                self.story.addToList('category', category)
        else:
            super(CSIForensicsComAdapter, self).handleMetadataPair(key, value)
|
||||
|
||||
# Module entry point used by the adapter registry to obtain this
# module's adapter class.
def getClass():
    return CSIForensicsComAdapter
|
||||
|
||||
336
fanficdownloader/adapters/adapter_darksolaceorg.py
Normal file
336
fanficdownloader/adapters/adapter_darksolaceorg.py
Normal file
|
|
@ -0,0 +1,336 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
# Module entry point used by the adapter registry to obtain this
# module's adapter class.
def getClass():
    return DarkSolaceOrgAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class DarkSolaceOrgAdapter(BaseSiteAdapter):
    """Adapter for dark-solace.org (eFiction archive under /elysian/).

    Handles the site's age-consent/warning redirect, optional login,
    metadata scraping from the author's story list (the story page itself
    carries little metadata), and series lookup with pagination.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/elysian/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','dksl')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%B %d, %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'dark-solace.org'

    @classmethod
    def getAcceptDomains(cls):
        # accept story URLs both with and without the www prefix.
        return ['www.dark-solace.org','dark-solace.org']

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://"+cls.getSiteDomain()+"/elysian/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        return re.escape("http://")+"(www\.)?"+re.escape(self.getSiteDomain()+"/elysian/viewstory.php?sid=")+r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True when *data* is a page telling us we must log in first."""
        if 'This story contains adult content not suitable for children' in data \
                or "That password doesn't match the one in our database" in data \
                or "Registered Users Only" in data:
            return True
        else:
            return False

    def performLogin(self, url):
        """Log in to the site; return True on success, raise FailedToLogin otherwise."""
        params = {}

        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['action'] = 'login'
        params['submit'] = 'Submit'

        loginUrl = 'http://www.' + self.getSiteDomain() + '/elysian/user.php'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             params['penname']))

        d = self._postUrl(loginUrl, params)

        if "Member Account" not in d : #User Account Page
            logger.info("Failed to login to URL %s as %s, or have no authorization to access the story" % (loginUrl, params['penname']))
            raise exceptions.FailedToLogin(url,params['penname'])
            return False
        else:
            return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Populate self.story metadata and self.chapterUrls from the site."""

        if self.is_adult or self.getConfig("is_adult"):
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&ageconsent=ok&warning=5"
        else:
            addurl=""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        # If the page still shows an age-consent link, retry with the warning
        # number the site actually wants (or demand is_adult from the user).
        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
        if m != None:
            if self.is_adult or self.getConfig("is_adult"):
                # We tried the default and still got a warning, so
                # let's pull the warning number from the 'continue'
                # link and reload data.
                addurl = m.group(1)
                # correct stupid &amp; error in url.
                addurl = addurl.replace("&amp;","&")
                url = self.url+'&index=1'+addurl
                logger.debug("URL 2nd try: "+url)

                try:
                    data = self._fetchUrl(url)
                except urllib2.HTTPError, e:
                    if e.code == 404:
                        raise exceptions.StoryDoesNotExist(self.url)
                    else:
                        raise e
            else:
                raise exceptions.AdultCheckRequired(self.url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        # print data

        # Now go hunting for all the meta data and the chapter list.

        ## Title and author
        div = soup.find('div', {'id' : 'pagetitle'})

        aut = div.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',aut['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/elysian/'+aut['href'])
        self.story.setMetadata('author',aut.string)
        aut.extract()

        # first a tag in pagetitle is title
        self.story.setMetadata('title',stripHTML(div.find('a')))
        div.find('a').extract()
        # only thing left in div(pagetitle) now should be 'by' and rating.
        rating = stripHTML(div)
        if '[' in rating:
            self.story.setMetadata('rating', rating[rating.index('[')+1:-1])

        for chapa in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+
                                                       self.story.getMetadata('storyId')+'&chapter=\d+')):
            self.chapterUrls.append((stripHTML(chapa),'http://'+self.host+'/elysian/'+chapa['href']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # Most metadata is only shown on the author's story list, so fetch
        # that page and locate this story's entry.
        asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
        storylink = asoup.find('a', href=re.compile(r'viewstory.php\?sid='+
                                                    self.story.getMetadata('storyId')+'($|[^\d])'))
        # author's story list is paginated if there's a pagelinks div.
        # Only need to look in it if the story wasn't on the first page.
        pagelinks = asoup.find('div',{'id':'pagelinks'})
        if pagelinks and storylink==None:
            authpageslist = pagelinks.findAll('a',href=re.compile(r'action=storiesby'))
            for page in authpageslist[1:]: # skip first, already checked above.
                asoup = bs.BeautifulSoup(self._fetchUrl('http://'+self.host+'/elysian/'+page['href']))
                storylink = asoup.find('a', href=re.compile(r'viewstory.php\?sid='+
                                                            self.story.getMetadata('storyId')+'($|[^\d])'))
                if storylink:
                    break

        if not storylink:
            raise exceptions.FailedToDownload("Unable to find story metadata on author's page(s)")

        metalist = storylink.parent.parent
        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method: dict-style lookup that returns "" instead of raising.
        def defaultGetattr(d,k):
            try:
                return d[k]
            except:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = metalist.findAll('span', {'class' : 'label'})
        for labelspan in labels:
            label = labelspan.text
            value = labelspan.nextSibling

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = ""
                while value and not (defaultGetattr(value,'class') == 'label' or "Chapters: " in stripHTML(value)):
                    svalue += str(value)
                    value = value.nextSibling
                self.setDescription(url,svalue)
                #self.story.setMetadata('description',stripHTML(svalue))

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&amp;type_id=1'))
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&amp;type_id=2'))
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                # there's a stray [ at the end.
                #value = value[0:-1]
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        try:
            # Find Series name from series URL.
            a = metalist.find('a', href=re.compile(r"series.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/elysian/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storylink = seriessoup.find('a', href=re.compile(r'viewstory.php\?sid='+
                                                             self.story.getMetadata('storyId')+'($|[^\d])'))
            if storylink and storylink.parent and storylink.parent['class'] != 'title': # in case of links inside story summaries.
                storylink = None

            offset = 0
            # series story list is paginated if there's a pagelinks div.
            # Only need to look in it if the story wasn't on the first page.
            pagelinks = seriessoup.find('div',{'id':'pagelinks'})
            if pagelinks and storylink==None:
                authpageslist = pagelinks.findAll('a',href=re.compile(r'offset='))
                for page in authpageslist[1:]: # skip first, already checked above.
                    seriessoup = bs.BeautifulSoup(self._fetchUrl('http://'+self.host+'/elysian/'+page['href']))
                    storylink = seriessoup.find('a', href=re.compile(r'viewstory.php\?sid='+
                                                                     self.story.getMetadata('storyId')+'($|[^\d])'))
                    if storylink and storylink.parent and storylink.parent['class'] != 'title': # in case of links inside story summaries.
                        storylink = None
                    if storylink:
                        offset = int(page['href'].split('=')[-1]) # offset is last.
                        break

            # for reasons I don't understand, searching for story
            # links by regex wasn't working reliably.  It was missing
            # the javascript links sometimes.  This is cleaner anyway.
            for i, div in enumerate(seriessoup.findAll('div', {'class':'title'})):
                a = div.find('a') # first a is story link.
                # skip 'report this' and 'TOC' links
                if a == storylink:
                    self.setSeries(series_name, 1+i+offset)
                    self.story.setMetadata('seriesUrl',series_url)
                    break

        except Exception, e:
            # NOTE(review): writes to stdout instead of using logger —
            # consider logger.warning for consistency with the rest of the class.
            print("Series parsing failed: %s"%e)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Download one chapter page and return its cleaned-up HTML body."""
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulSoup(self._fetchUrl(url))

        div = soup.find('div', {'id' : 'story'})

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)
|
||||
244
fanficdownloader/adapters/adapter_destinysgatewaycom.py
Normal file
244
fanficdownloader/adapters/adapter_destinysgatewaycom.py
Normal file
|
|
@ -0,0 +1,244 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2012 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
# Module entry point used by the adapter registry to obtain this
# module's adapter class.
def getClass():
    return DestinysGatewayComAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class DestinysGatewayComAdapter(BaseSiteAdapter):
|
||||
|
||||
    def __init__(self, config, url):
        """Set up decoding, story id, normalized URL and site constants."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','dgrfa')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%b %d %Y"
|
||||
|
||||
    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'www.destinysgateway.com'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
|
||||
    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and populate self.chapterUrls plus
        the story metadata (title, author, rating, categories, genres,
        warnings, status, dates, series).

        Raises:
            exceptions.StoryDoesNotExist: server returned 404.
            exceptions.AdultCheckRequired: site demands age confirmation
                and neither self.is_adult nor the is_adult config is set.
            exceptions.FailedToDownload: story not validated by the site.
        """

        if self.is_adult or self.getConfig("is_adult"):
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number. print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&warning=4"
        else:
            addurl=""

        # index=1 makes sure we see the story chapter index. Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        # The 'continue' link in the raw HTML is entity-escaped, hence the
        # &amp; in the pattern; the captured group is un-escaped below.
        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
        if m != None:
            if self.is_adult or self.getConfig("is_adult"):
                # We tried the default and still got a warning, so
                # let's pull the warning number from the 'continue'
                # link and reload data.
                addurl = m.group(1)
                # correct stupid & error in url.
                addurl = addurl.replace("&amp;","&")
                url = self.url+'&index=1'+addurl
                logger.debug("URL 2nd try: "+url)

                try:
                    data = self._fetchUrl(url)
                except urllib2.HTTPError, e:
                    if e.code == 404:
                        raise exceptions.StoryDoesNotExist(self.url)
                    else:
                        raise e
            else:
                raise exceptions.AdultCheckRequired(self.url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        # print data

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method: dict-style lookup that returns "" instead of
        # raising, used to walk siblings until the next 'label' span.
        def defaultGetattr(d,k):
            try:
                return d[k]
            except:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = ""
                while not defaultGetattr(value,'class') == 'label':
                    svalue += str(value)
                    value = value.nextSibling
                self.setDescription(url,svalue)
                #self.story.setMetadata('description',stripHTML(svalue))

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                # there's a stray [ at the end.
                #value = value[0:-1]
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            # series index is this story's 1-based position on the series page.
            i=1
            for a in storyas:
                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl',series_url)
                    break
                i+=1

        except:
            # I find it hard to care if the series parsing fails
            pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
278
fanficdownloader/adapters/adapter_dokugacom.py
Normal file
278
fanficdownloader/adapters/adapter_dokugacom.py
Normal file
|
|
@ -0,0 +1,278 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Entry point used by the adapter registry: return this module's adapter class."""
    adapter_class = DokugaComAdapter
    return adapter_class
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class DokugaComAdapter(BaseSiteAdapter):
|
||||
|
||||
    def __init__(self, config, url):
        """Set up site-specific state: encodings, URL 'section'
        (fanfiction vs spark), normalized story URL, abbreviation and
        the per-section date format."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[3])

        # www.dokuga.com has two 'sections', shown in URL as
        # 'fanfiction' and 'spark' that change how things should be
        # handled.
        # http://www.dokuga.com/fanfiction/story/7528/1
        # http://www.dokuga.com/spark/story/7299/1
        self.section=self.parsedUrl.path.split('/',)[1]

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/'+self.parsedUrl.path.split('/',)[1]+'/story/'+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','dkg')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        # The two sections of the site format dates differently.
        if 'fanfiction' in self.section:
            self.dateformat = "%d %b %Y"
        else:
            self.dateformat = "%m-%d-%y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'www.dokuga.com'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/fanfiction/story/1234/1 http://"+cls.getSiteDomain()+"/spark/story/1234/1"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return r"http://"+self.getSiteDomain()+"/(fanfiction|spark)?/story/\d+/?\d+?$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites.
|
||||
def needToLoginCheck(self, data):
|
||||
if 'The author has disabled anonymous viewing for this story.' in data:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
    def performLogin(self, url,soup):
        """POST the login form for dokuga.com.

        Credentials come from self.username/self.password when set,
        otherwise from the user's config. Hidden inputs from the fetched
        page (soup) are copied so any CSRF/session tokens are included.

        Returns True on success; raises exceptions.FailedToLogin otherwise.
        """
        params = {}

        if self.password:
            params['username'] = self.username
            params['passwd'] = self.password
        else:
            params['username'] = self.getConfig("username")
            params['passwd'] = self.getConfig("password")
        params['Submit'] = 'Submit'

        # copy all hidden input tags to pick up appropriate tokens.
        for tag in soup.findAll('input',{'type':'hidden'}):
            params[tag['name']] = tag['value']

        loginUrl = 'http://' + self.getSiteDomain() + '/fanfiction'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             params['username']))

        d = self._postUrl(loginUrl, params)

        # Site sometimes rejects the first POST with an expired-session
        # page; one retry is enough.
        if "Your session has expired. Please log in again." in d:
            d = self._postUrl(loginUrl, params)

        if "Logout" not in d : #Member Account
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['username']))
            raise exceptions.FailedToLogin(url,params['username'])
            # NOTE(review): unreachable after the raise above.
            return False
        else:
            return True
|
||||
|
||||
    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page, log in if needed, populate
        self.chapterUrls, then scrape the remaining metadata from the
        author's listing page (the story page itself carries very little).

        The two site sections ('fanfiction' vs 'spark') lay out the
        author-page metadata differently, hence the two branches below.

        Raises StoryDoesNotExist on 404 and FailedToDownload for
        unvalidated stories.
        """

        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url,soup)
            # refetch after login so we see the full page.
            data = self._fetchUrl(url)
            soup = bs.BeautifulSoup(data)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
        # print data

        # Now go hunting for all the meta data and the chapter list.

        ## Title and author
        a = soup.find('div', {'align' : 'center'}).find('h3')

        # Find authorid and URL from... author url.
        aut = a.find('a')
        self.story.setMetadata('authorId',aut['href'].split('=')[1])
        alink='http://'+self.host+aut['href']
        self.story.setMetadata('authorUrl','http://'+self.host+aut['href'])
        self.story.setMetadata('author',aut.string)
        # remove the author link so only the title text remains in the h3.
        aut.extract()

        # drop the trailing 4 characters (separator between title and
        # author) left in the h3 text.
        a = a.string[:(len(a.string)-4)]
        self.story.setMetadata('title',stripHTML(a))

        # Find the chapters:
        chapters = soup.find('select').findAll('option')
        if len(chapters)==1:
            self.chapterUrls.append((self.story.getMetadata('title'),'http://'+self.host+'/'+self.section+'/story/'+self.story.getMetadata('storyId')+'/1'))
        else:
            for chapter in chapters:
                # just in case there's tags, like <i> in chapter titles. /fanfiction/story/7406/1
                self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+self.section+'/story/'+self.story.getMetadata('storyId')+'/'+chapter['value']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        asoup = bs.BeautifulSoup(self._fetchUrl(alink))

        if 'fanfiction' in self.section:
            asoup=asoup.find('div', {'id' : 'cb_tabid_52'}).find('div')

            #grab the rest of the metadata from the author's page
            # Walk the divs until we find the one linking to this story;
            # the loop variable 'div' keeps that value after the break.
            for div in asoup.findAll('div'):
                nav=div.find('a', href=re.compile(r'/fanfiction/story/'+self.story.getMetadata('storyId')+"/1$"))
                if nav != None:
                    break
            div=div.nextSibling
            self.setDescription(url,div)

            div=div.nextSibling

            # NOTE(review): the '&' delimiters below may originally have
            # been HTML entities (&amp;/&nbsp;) mangled in transit — confirm
            # against the live author page.
            a=div.text.split('Rating: ')
            if len(a) == 2: self.story.setMetadata('rating', a[1].split('&')[0])

            a=div.text.split('Status: ')
            if len(a)==2:
                iscomp=a[1].split('&')[0]
                if 'Complete' in iscomp:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            a=div.text.split('Category: ')
            if len(a) == 2: self.story.addToList('category', a[1].split('&')[0])

            a=div.text.split('Created: ')
            if len(a) == 2: self.story.setMetadata('datePublished', makeDate(stripHTML(a[1].split('&')[0]), self.dateformat))

            a=div.text.split('Updated: ')
            if len(a) == 2: self.story.setMetadata('dateUpdated', makeDate(stripHTML(a[1]), self.dateformat))

            div=div.nextSibling.nextSibling
            a=div.text.split('Words: ')
            if len(a) == 2: self.story.setMetadata('numWords', a[1].split('&')[0])

            a=div.text.split('Genre: ')
            if len(a) == 2:
                for genre in a[1].split('&')[0].split(', '):
                    self.story.addToList('genre',genre)

        else:
            asoup=asoup.find('div', {'id' : 'maincol'}).find('div', {'class' : 'padding'})
            # Same walk-until-found pattern as the fanfiction branch.
            for div in asoup.findAll('div'):
                nav=div.find('a', href=re.compile(r'/spark/story/'+self.story.getMetadata('storyId')+"/1$"))
                if nav != None:
                    break

            div=div.nextSibling.nextSibling
            self.setDescription(url,div)
            self.story.addToList('category', 'Spark')

            div=div.nextSibling.nextSibling
            a=div.text.split('Rating: ')
            if len(a) == 2: self.story.setMetadata('rating', a[1].split(' - ')[0])

            a=div.text.split('Status: ')
            if len(a)==2:
                iscomp=a[1].split(' - ')[0]
                if 'Complete' in iscomp:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            a=div.text.split('Genre: ')
            if len(a)==2:
                for genre in a[1].split(' - ')[0].split('/'):
                    self.story.addToList('genre',genre)

            div=div.nextSibling.nextSibling

            a=div.text.split('Updated: ')
            if len(a)==2:
                date=a[1].split(' -')[0]
                self.story.setMetadata('dateUpdated', makeDate(date, self.dateformat))

                # does not have published date anywhere
                self.story.setMetadata('datePublished', makeDate(date, self.dateformat))

            a=div.text.split('Words ')
            if len(a)==2: self.story.setMetadata('numWords', a[1])
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'chtext'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
217
fanficdownloader/adapters/adapter_dotmoonnet.py
Normal file
217
fanficdownloader/adapters/adapter_dotmoonnet.py
Normal file
|
|
@ -0,0 +1,217 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2013 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Entry point used by the adapter registry: return this module's adapter class."""
    adapter_class = DotMoonNetAdapter
    return adapter_class
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class DotMoonNetAdapter(BaseSiteAdapter):
|
||||
|
||||
    def __init__(self, config, url):
        """Set up site-specific state: encodings, normalized story URL,
        site abbreviation and the date format used by this archive."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL. www.dotmoon.net/library_view.php?storyid=3
        self._setURL('http://' + self.getSiteDomain() + '/library_view.php?storyid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','dotm')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y-%m-%d"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'www.dotmoon.net'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/library_view.php?storyid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/library_view.php?storyid=")+r"\d+$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites.
|
||||
def needToLoginCheck(self, data):
|
||||
if 'You must be logged in to read adult-rated stories' in data \
|
||||
or 'Password incorrect' in data \
|
||||
or "That username does not exist" in data:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
    def performLogin(self, url):
        """Log in to the site's SMF board (which gates adult stories).

        Credentials come from self.username/self.password when set,
        otherwise from the user's config.

        Returns True on success; raises exceptions.FailedToLogin otherwise.
        """
        params = {}

        if self.password:
            params['user'] = self.username
            params['passwrd'] = self.password
        else:
            params['user'] = self.getConfig("username")
            params['passwrd'] = self.getConfig("password")

        loginUrl = 'http://' + self.getSiteDomain() + '/board/index.php'

        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             params['user']))

        # First request performs the login (sets session cookies); the
        # second fetches the board page to verify we are logged in.
        d = self._fetchUrl(loginUrl+'?action=login2&user='+params['user']+'&passwrd='+params['passwrd'])
        d = self._fetchUrl(loginUrl)

        if "Show unread posts since last visit" not in d : #Member Account
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['user']))
            raise exceptions.FailedToLogin(url,params['user'])
            # NOTE(review): unreachable after the raise above.
            return False
        else:
            return True
|
||||
|
||||
    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page, log in if needed, and populate
        self.chapterUrls plus story metadata (title, author, fandom,
        genres, rating, dates, status, summary).

        Raises StoryDoesNotExist on 404 and FailedToDownload for an
        invalid story ID or a chapterless story.
        """

        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            # refetch after login so we see the full page.
            data = self._fetchUrl(url)

        if "Invalid story ID" in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Invalid story ID.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        # print data

        # Now go hunting for all the meta data and the chapter list.
        # The page contains more than one <body>; the story content is in
        # the second one, and its first table (site chrome) is discarded.
        body=soup.findAll('body')[1]
        body.find('table').extract()

        ## Title
        a = body.find('b')
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url. http://www.dotmoon.net/board/index.php?action=profile;u=1'
        a = body.find('a', href=re.compile(r"index.php\?action=profile;u=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[2])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters: 'library_storyview.php?chapterid=3
        chapters=body.findAll('a', href=re.compile(r"library_storyview.php\?chapterid=\d+$"))
        if len(chapters)==0:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: No php/html chapters found.")
        if len(chapters)==1:
            # single-chapter stories use the story title as chapter title.
            self.chapterUrls.append((self.story.getMetadata('title'),'http://'+self.host+'/'+chapters[0]['href']))
        else:
            for chapter in chapters:
                # just in case there's tags, like <i> in chapter titles.
                self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # other tags

        labels = body.find('table', {'width':'390'}).findAll('td')
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if label != None:
                if 'Fandom' in label:
                    self.story.addToList('category',value.string)

                if 'Setting' in label:
                    self.story.addToList('genre',value.string)

                if 'Genre' in label:
                    self.story.addToList('genre',value.string)

                if 'Style' in label:
                    self.story.addToList('genre',value.string)

                if 'Rating' in label:
                    self.story.addToList('rating',value.string)

                if 'Created' in label:
                    self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

                if 'Updated' in label:
                    self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

                if 'Status' in label:
                    if 'Completed' in value.string:
                        self.story.setMetadata('status', 'Completed')
                    else:
                        self.story.setMetadata('status', 'In-Progress')

        # Summary lives in the second 400px-wide table.
        table=body.findAll('table', {'width':'400'})[1].find('td')
        self.setDescription(url,stripHTML(table).split('Summary: ')[1])
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('blockquote')
|
||||
div.name='div'
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
302
fanficdownloader/adapters/adapter_dracoandginnycom.py
Normal file
302
fanficdownloader/adapters/adapter_dracoandginnycom.py
Normal file
|
|
@ -0,0 +1,302 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2012 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Entry point used by the adapter registry: return this module's adapter class."""
    adapter_class = DracoAndGinnyComAdapter
    return adapter_class
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class DracoAndGinnyComAdapter(BaseSiteAdapter):
|
||||
|
||||
    def __init__(self, config, url):
        """Set up site-specific state: encodings, normalized story URL,
        site abbreviation and the date format used by this archive."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','dcagn')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%b %d, %Y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'www.dracoandginny.com'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites.
|
||||
def needToLoginCheck(self, data):
|
||||
if 'Registered Users Only' in data \
|
||||
or 'There is no such account on our website' in data \
|
||||
or "That password doesn't match the one in our database" in data:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
    def performLogin(self, url):
        """POST the eFiction login form for dracoandginny.com.

        Credentials come from self.username/self.password when set,
        otherwise from the user's config.

        Returns True on success; raises exceptions.FailedToLogin otherwise.
        """
        params = {}

        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['cookiecheck'] = '1'
        params['submit'] = 'Submit'

        loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             params['penname']))

        d = self._fetchUrl(loginUrl, params)

        if "Member Account" not in d : #Member Account
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['penname']))
            raise exceptions.FailedToLogin(url,params['penname'])
            # NOTE(review): unreachable after the raise above.
            return False
        else:
            return True
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# Weirdly, different sites use different warning numbers.
|
||||
# If the title search below fails, there's a good chance
|
||||
# you need a different number. print data at that point
|
||||
# and see what the 'click here to continue' url says.
|
||||
addurl = "&ageconsent=ok&warning=2"
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if self.needToLoginCheck(data):
|
||||
# need to log in for this one.
|
||||
self.performLogin(url)
|
||||
data = self._fetchUrl(url)
|
||||
|
||||
m = re.search(r"'viewstory.php\?sid=\d+((?:&ageconsent=ok)?&warning=\d+)'",data)
|
||||
if m != None:
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# We tried the default and still got a warning, so
|
||||
# let's pull the warning number from the 'continue'
|
||||
# link and reload data.
|
||||
addurl = m.group(1)
|
||||
# correct stupid & error in url.
|
||||
addurl = addurl.replace("&","&")
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL 2nd try: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
content=soup.find('div',{'class' : 'listbox'})
|
||||
|
||||
self.setDescription(url,content.find('blockquote'))
|
||||
|
||||
for genre in content.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')):
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
for warning in content.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')):
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
labels = content.findAll('b')
|
||||
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
if 'Summary' in label:
|
||||
## Everything until the next span class='label'
|
||||
svalue = ""
|
||||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value.split(' |')[0])
|
||||
|
||||
if 'Rating' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
for cat in cats:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
for char in chars:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
|
||||
for genre in genres:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
if 'Warnings' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
|
||||
for warning in warnings:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value.nextSibling).split(' |')[0], self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
# skip 'report this' and 'TOC' links
|
||||
if 'contact.php' not in a['href'] and 'index' not in a['href']:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'class' : 'listbox'})
|
||||
|
||||
if None == div:
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
311
fanficdownloader/adapters/adapter_dramioneorg.py
Normal file
311
fanficdownloader/adapters/adapter_dramioneorg.py
Normal file
|
|
@ -0,0 +1,311 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Module-level factory: return the adapter class defined in this module."""
    return DramioneOrgAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
# Class name has to be unique.  Our convention is camel case the
# sitename with Adapter at the end.  www is skipped.
class DramioneOrgAdapter(BaseSiteAdapter):
    """Adapter for the eFiction archive at dramione.org.

    Scrapes story metadata and the chapter list from the story page and
    downloads individual chapter text.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])


        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','drmn')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d %B %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'dramione.org'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL shown to users for this site."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return a regex matching normalized story URLs for this site."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True when the fetched page text indicates a login is required."""
        if 'Registered Users Only' in data \
                or 'There is no such account on our website' in data \
                or "That password doesn't match the one in our database" in data:
            return True
        else:
            return False

    def performLogin(self, url):
        """POST login credentials to the site; raise FailedToLogin on failure.

        Uses self.username/self.password when a password has been set on
        the adapter, otherwise falls back to the configured credentials.
        """
        params = {}

        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['cookiecheck'] = '1'
        params['submit'] = 'Submit'

        loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                            params['penname']))

        d = self._fetchUrl(loginUrl, params)

        # "Member Account" only appears on the page when logged in.
        if "Member Account" not in d : #Member Account
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['penname']))
            raise exceptions.FailedToLogin(url,params['penname'])
            return False
        else:
            return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Populate self.story metadata and self.chapterUrls from the story page.

        Handles login, the adult-content interstitial, cover/banner
        detection, tag lists, labeled metadata spans, and (best-effort)
        series and review-count parsing.
        """

        if self.is_adult or self.getConfig("is_adult"):
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&warning=5"
        else:
            addurl=""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url+addurl
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        # The actual text that is used to announce you need to be an
        # adult varies from site to site.  Again, print data before
        # the title search to troubleshoot.
        if "Stories that are suitable for ages 16 and older" in data:
            raise exceptions.AdultCheckRequired(self.url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        # print data

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Use banner as cover if found.  The linked banner (if present)
        # takes precedence over an inline <img class="banner">.
        coverurl = ''
        img = soup.find('img',{'class':'banner'})
        if img:
            coverurl = img['src']
            #print "Cover: "+coverurl
        a = soup.find(text="This story has a banner; click to view.")
        if a:
            #print "A: "+ ', '.join("(%s, %s)" %tup for tup in a.parent.attrs)
            coverurl = a.parent['href']
            #print "Cover: "+coverurl
        if coverurl:
            self.setCoverImage(url,coverurl)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # The site marks tag types with css classes tag-1 .. tag-6.
        genres=soup.findAll('a', {'class' : "tag-1"})
        for genre in genres:
            self.story.addToList('genre',genre.string)

        warnings=soup.findAll('a', {'class' : "tag-2"})
        for warning in warnings:
            self.story.addToList('warnings',warning.string)

        themes=soup.findAll('a', {'class' : "tag-3"})
        for theme in themes:
            self.story.addToList('themes',theme.string)

        hermiones=soup.findAll('a', {'class' : "tag-4"})
        for hermione in hermiones:
            self.story.addToList('hermiones',hermione.string)

        dracos=soup.findAll('a', {'class' : "tag-5"})
        for draco in dracos:
            self.story.addToList('dracos',draco.string)

        timelines=soup.findAll('a', {'class' : "tag-6"})
        for timeline in timelines:
            self.story.addToList('timeline',timeline.string)

        # utility method: dict-style lookup that returns "" instead of raising.
        def defaultGetattr(d,k):
            try:
                return d[k]
            except:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            # value is the text node (or tag) immediately after the label span.
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = ""
                while not defaultGetattr(value,'class') == 'label':
                    svalue += str(value)
                    value = value.nextSibling
                self.setDescription(url,svalue)
                #self.story.setMetadata('description',stripHTML(svalue))

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Read' in label:
                self.story.setMetadata('read', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                # strip English ordinal suffixes (1st, 2nd, ...) before parsing.
                value=re.sub(r"(\d+)(st|nd|rd|th)",r"\1",value)
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                value=re.sub(r"(\d+)(st|nd|rd|th)",r"\1",value)
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            # i counts position of this story within the series listing.
            i=1
            for a in storyas:
                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl',series_url)
                    break
                i+=1

        except:
            # I find it hard to care if the series parsing fails
            pass

        try:
            # second reviews.php link under the page title holds the review count.
            self.story.setMetadata('reviews',
                                   stripHTML(soup.find('h2',{'id':'pagetitle'}).
                                             findAll('a', href=re.compile(r'^reviews.php'))[1]))
        except:
            # I find it hard to care if the series parsing fails
            pass

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return its story text as cleaned HTML."""

        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'story'})

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)
|
||||
224
fanficdownloader/adapters/adapter_efictionestelielde.py
Normal file
224
fanficdownloader/adapters/adapter_efictionestelielde.py
Normal file
|
|
@ -0,0 +1,224 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2013 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Module-level factory: return the adapter class defined in this module."""
    return EfictionEstelielDeAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class EfictionEstelielDeAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','eesd')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%B %d, %Y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'efiction.esteliel.de'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+'&index=1'
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title and author
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
pagetitle = soup.find('div',{'id':'pagetitle'})
|
||||
## Title
|
||||
a = pagetitle.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = pagetitle.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
list = soup.find('div', {'class':'listbox'})
|
||||
labelspan=list.find('span',{'class':'label'})
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
|
||||
for genre in genres:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
labels = list.findAll('b')
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
if 'Summary' in label:
|
||||
## Everything until the next span class='label'
|
||||
svalue = ""
|
||||
while 'Rating' not in str(value):
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rating' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Words' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Category' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
for cat in cats:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
for char in chars:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
# there's a stray [ at the end.
|
||||
#value = value[0:-1]
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
if list.find('a', href=re.compile(r"series.php")) != None:
|
||||
for series in asoup.findAll('a', href=re.compile(r"series.php\?seriesid=\d+")):
|
||||
# Find Series name from series URL.
|
||||
series_url = 'http://'+self.host+'/'+series['href']
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
name=seriessoup.find('div', {'id' : 'pagetitle'})
|
||||
name.find('a').extract()
|
||||
self.setSeries(name.text.split(' by[')[0], i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
i=0
|
||||
break
|
||||
i+=1
|
||||
if i == 0:
|
||||
break
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
316
fanficdownloader/adapters/adapter_efpfanficnet.py
Normal file
316
fanficdownloader/adapters/adapter_efpfanficnet.py
Normal file
|
|
@ -0,0 +1,316 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2012 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Module-level factory: return the adapter class defined in this module."""
    return EFPFanFicNet
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class EFPFanFicNet(BaseSiteAdapter):
|
||||
|
||||
    def __init__(self, config, url):
        """Initialize the adapter: normalize the story URL and set site constants."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])


        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','efp')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d/%m/%y"
|
||||
|
||||
    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site's domain name (with www, since this site uses it)."""
        return 'www.efpfanfic.net'
|
||||
|
||||
    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL shown to users for this site."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
    def getSiteURLPattern(self):
        """Return a regex matching normalized story URLs for this site."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites.
|
||||
def needToLoginCheck(self, data):
|
||||
if( 'Fai il login e leggi la storia!' in data or
|
||||
'Questa storia presenta contenuti non adatti ai minori' in data ):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def performLogin(self, url):
|
||||
params = {}
|
||||
|
||||
if self.password:
|
||||
params['penname'] = self.username
|
||||
params['password'] = self.password
|
||||
else:
|
||||
params['penname'] = self.getConfig("username")
|
||||
params['password'] = self.getConfig("password")
|
||||
params['cookiecheck'] = '1'
|
||||
params['submit'] = 'Invia'
|
||||
|
||||
loginUrl = 'http://' + self.getSiteDomain() + '/user.php?sid='+self.story.getMetadata('storyId')
|
||||
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
|
||||
params['penname']))
|
||||
|
||||
d = self._fetchUrl(loginUrl, params)
|
||||
|
||||
if '<a class="menu" href="newaccount.php">' in d : # register for new account link
|
||||
logger.info("Failed to login to URL %s as %s" % (loginUrl,
|
||||
params['penname']))
|
||||
raise exceptions.FailedToLogin(url,params['penname'])
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
url = self.url
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if self.needToLoginCheck(data):
|
||||
# need to log in for this one.
|
||||
self.performLogin(url)
|
||||
data = self._fetchUrl(url)
|
||||
|
||||
# if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
# raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# Find the chapter selector
|
||||
select = soup.find('select', { 'name' : 'sid' } )
|
||||
|
||||
if select is None:
|
||||
# no selector found, so it's a one-chapter story.
|
||||
self.chapterUrls.append((self.story.getMetadata('title'),url))
|
||||
else:
|
||||
allOptions = select.findAll('option', {'value' : re.compile(r'viewstory')})
|
||||
for o in allOptions:
|
||||
url = u'http://%s/%s' % ( self.getSiteDomain(),
|
||||
o['value'])
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
title = stripHTML(o)
|
||||
self.chapterUrls.append((title,url))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
self.story.setMetadata('language','Italian')
|
||||
|
||||
# normalize story URL to first chapter if later chapter URL was given:
|
||||
url = self.chapterUrls[0][1].replace('&i=1','')
|
||||
logger.debug("Normalizing to URL: "+url)
|
||||
self._setURL(url)
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
storya = None
|
||||
authsoup = None
|
||||
storyblock = None
|
||||
authurl = self.story.getMetadata('authorUrl')
|
||||
|
||||
## author can have more than one page of stories.
|
||||
while storyblock == None:
|
||||
|
||||
# no storya, but do have authsoup--we're looping on author pages.
|
||||
if authsoup != None:
|
||||
# last author link with offset should be the 'next' link.
|
||||
authurl = u'http://%s/%s' % ( self.getSiteDomain(),
|
||||
authsoup.findAll('a',href=re.compile(r'viewuser\.php\?uid=\d+&catid=&offset='))[-1]['href'] )
|
||||
|
||||
# Need author page for most of the metadata.
|
||||
logger.debug("fetching author page: (%s)"%authurl)
|
||||
authsoup = bs.BeautifulSoup(self._fetchUrl(authurl))
|
||||
#print("authsoup:%s"%authsoup)
|
||||
|
||||
storyas = authsoup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r'&i=1$'))
|
||||
for storya in storyas:
|
||||
#print("======storya:%s"%storya)
|
||||
storyblock = storya.findParent('div',{'class':'storybloc'})
|
||||
#print("======storyblock:%s"%storyblock)
|
||||
if storyblock != None:
|
||||
continue
|
||||
|
||||
self.setDescription(url,storyblock.find('div', {'class':'introbloc'}))
|
||||
|
||||
noteblock = storyblock.find('div', {'class':'notebloc'})
|
||||
#print("%s"%noteblock)
|
||||
notetext = ("%s" % noteblock).replace("<br />"," |")
|
||||
# <div class="notebloc">Autore: <a href="viewuser.php?uid=243036">Cendrillon89</a> | Pubblicata: 23/10/12 | Aggiornata: 30/10/12 | Rating: Arancione | Genere: Drammatico, Sentimentale | Capitoli: 10 | Completa<br />
|
||||
# Tipo di coppia: Het | Personaggi: Akasuna no Sasori , Akatsuki, Nuovo Personaggio | Note: OOC | Avvertimenti: Tematiche delicate<br />
|
||||
# Categoria: <a href="categories.php?catid=1&parentcatid=1">Anime & Manga</a> > <a href="categories.php?catid=108&parentcatid=108">Naruto</a> | Contesto: Naruto Shippuuden | Leggi le <a href="reviews.php?sid=1331275&a=">3</a> recensioni</div>
|
||||
|
||||
cats = noteblock.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
for cat in cats:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
for item in notetext.split("|"):
|
||||
if ":" in item:
|
||||
(label,value) = item.split(":")
|
||||
label=label.strip()
|
||||
value=value.strip()
|
||||
else:
|
||||
label=value=item.strip()
|
||||
|
||||
if 'Pubblicata' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Aggiornata' in label:
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if label == "Completa":
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
|
||||
if label == "In corso":
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Rating' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Personaggi' in label:
|
||||
for val in value.split(","):
|
||||
self.story.addToList('characters',val)
|
||||
|
||||
if 'Genere' in label:
|
||||
for val in value.split(","):
|
||||
self.story.addToList('genre',val)
|
||||
|
||||
if 'Coppie' in label:
|
||||
for val in value.split(","):
|
||||
self.story.addToList('ships',val)
|
||||
|
||||
if 'Avvertimenti' in label:
|
||||
for val in value.split(","):
|
||||
if val != "None":
|
||||
self.story.addToList('warnings',val)
|
||||
|
||||
# 'extra' metadata for this adapter:
|
||||
|
||||
if 'Tipo di coppia' in label:
|
||||
for val in value.split(","):
|
||||
self.story.addToList('type',val)
|
||||
|
||||
if 'Note' in label:
|
||||
for val in value.split(","):
|
||||
if val != "None":
|
||||
self.story.addToList('notes',val)
|
||||
|
||||
if 'Contesto' in label:
|
||||
self.story.setMetadata('context', value)
|
||||
|
||||
## Note--efp doesn't provide word count.
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?ssid=\d+&i=1"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+&i=1'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId'))+'&i=1':
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Fetch one chapter page and return its story text as cleaned HTML.

    Raises FailedToDownload when the expected 'storia' div is absent.
    """
    logger.debug('Getting chapter text from: %s' % url)

    soup = bs.BeautifulSoup(self._fetchUrl(url))

    div = soup.find('div', {'class' : 'storia'})

    # idiomatic identity test (was: `if None == div:`)
    if div is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    # remove any header and 'o:p' tags.
    for tag in div.findAll("head") + div.findAll("o:p"):
        tag.extract()

    # change any html and body tags to div.
    for tag in div.findAll("html") + div.findAll("body"):
        tag.name='div'

    # remove extra bogus doctype.
    #<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
    return re.sub(r"<!DOCTYPE[^>]+>","",self.utf8FromSoup(url,div))
|
||||
256
fanficdownloader/adapters/adapter_erosnsapphosycophanthexcom.py
Normal file
256
fanficdownloader/adapters/adapter_erosnsapphosycophanthexcom.py
Normal file
|
|
@ -0,0 +1,256 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Return this module's adapter class (adapter-registry entry point)."""
    return ErosnSapphoSycophantHexComAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class ErosnSapphoSycophantHexComAdapter(BaseSiteAdapter):
    """eFiction adapter for erosnsappho.sycophanthex.com."""

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        # Candidate page encodings, tried in order when decoding fetches.
        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','essph')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d/%m/%y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
    # The site domain. Does have www here, if it uses it.
    return 'erosnsappho.sycophanthex.com'

@classmethod
def getSiteExampleURLs(cls):
    """Return an example story URL shown to users for this site."""
    return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

def getSiteURLPattern(self):
    """Return the regexp a candidate URL must match for this adapter."""
    return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
    """Populate self.chapterUrls and story metadata from the site.

    Fetches the story index page (retrying with the site's age-consent
    parameters when an adult warning intercepts), then scrapes title,
    author, rating, chapter links and the labelled metadata block.
    """

    if self.is_adult or self.getConfig("is_adult"):
        # Weirdly, different sites use different warning numbers.
        # If the title search below fails, there's a good chance
        # you need a different number. print data at that point
        # and see what the 'click here to continue' url says.
        addurl = "&ageconsent=ok&warning=18"
    else:
        addurl=""

    # index=1 makes sure we see the story chapter index. Some
    # sites skip that for one-chapter stories.
    url = self.url+'&index=1'+addurl
    logger.debug("URL: "+url)

    try:
        data = self._fetchUrl(url)
    except urllib2.HTTPError, e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        else:
            raise e

    # An adult-warning interstitial embeds a 'continue' link carrying
    # the warning number this story actually needs.
    m = re.search(r"'viewstory.php\?sid=\d+((?:&ageconsent=ok)?&warning=\d+)'",data)
    if m != None:
        if self.is_adult or self.getConfig("is_adult"):
            # We tried the default and still got a warning, so
            # let's pull the warning number from the 'continue'
            # link and reload data.
            addurl = m.group(1)
            # correct stupid &amp; error in url.
            addurl = addurl.replace("&amp;","&")
            url = self.url+'&index=1'+addurl
            logger.debug("URL 2nd try: "+url)

            try:
                data = self._fetchUrl(url)
            except urllib2.HTTPError, e:
                if e.code == 404:
                    raise exceptions.StoryDoesNotExist(self.url)
                else:
                    raise e
        else:
            raise exceptions.AdultCheckRequired(self.url)

    if "Access denied. This story has not been validated by the adminstrators of this site." in data:
        raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = bs.BeautifulSoup(data)
    # print data

    # Now go hunting for all the meta data and the chapter list.

    ## Title
    pt = soup.find('div', {'id' : 'pagetitle'})
    a = pt.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
    self.story.setMetadata('title',stripHTML(a))

    # Find authorid and URL from... author url.
    a = pt.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
    self.story.setMetadata('authorId',a['href'].split('=')[1])
    self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
    self.story.setMetadata('author',a.string)

    # Rating appears in parentheses inside the page-title text.
    rating=pt.text.split('(')[1].split(')')[0]
    self.story.setMetadata('rating', rating)

    # Find the chapters:
    for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
        # just in case there's tags, like <i> in chapter titles.
        self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))

    self.story.setMetadata('numChapters',len(self.chapterUrls))

    # eFiction sites don't help us out a lot with their meta data
    # formating, so it's a little ugly.

    # utility method
    def defaultGetattr(d,k):
        # Tag attribute lookup that returns "" instead of raising.
        try:
            return d[k]
        except:
            return ""

    # <span class="label">Rated:</span> NC-17<br /> etc

    labels = soup.findAll('span',{'class':'label'})

    # Description grab: rewind to the earliest sibling before the first
    # label span, then walk forward collecting nodes until a label span.
    # NOTE(review): nesting reconstructed from a whitespace-stripped
    # source -- the two while loops are read as sequential; confirm
    # against the upstream eFiction adapters.
    value = labels[0].previousSibling
    svalue = ""
    while value != None:
        val = value
        value = value.previousSibling
    while not defaultGetattr(val,'class') == 'label':
        svalue += str(val)
        val = val.nextSibling
    self.setDescription(url,svalue)

    for labelspan in labels:
        value = labelspan.nextSibling
        label = labelspan.string

        if 'Word count' in label:
            self.story.setMetadata('numWords', value.split(' -')[0])

        if 'Categories' in label:
            cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
            for cat in cats:
                self.story.addToList('category',cat.string)

        if 'Characters' in label:
            chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
            for char in chars:
                self.story.addToList('characters',char.string)

        if 'Genre' in label:
            genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
            for genre in genres:
                self.story.addToList('genre',genre.string)

        if 'Warnings' in label:
            warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
            for warning in warnings:
                self.story.addToList('warnings',warning.string)

        if 'Complete' in label:
            if 'Yes' in value:
                self.story.setMetadata('status', 'Completed')
            else:
                self.story.setMetadata('status', 'In-Progress')

        if 'Published' in label:
            self.story.setMetadata('datePublished', makeDate(stripHTML(value.split(' -')[0]), self.dateformat))

        if 'Updated' in label:
            # there's a stray [ at the end.
            #value = value[0:-1]
            self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

    try:
        # Find Series name from series URL.
        a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        series_name = a.string
        series_url = 'http://'+self.host+'/'+a['href']

        # use BeautifulSoup HTML parser to make everything easier to find.
        seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
        storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
        i=1
        for a in storyas:
            # skip 'report this' and 'TOC' links
            if 'contact.php' not in a['href'] and 'index' not in a['href']:
                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl',series_url)
                    break
                i+=1

    except:
        # I find it hard to care if the series parsing fails
        pass
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
185
fanficdownloader/adapters/adapter_fanfichu.py
Normal file
185
fanficdownloader/adapters/adapter_fanfichu.py
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
# coding=utf-8
|
||||
|
||||
import re
|
||||
import urllib2
|
||||
import urlparse
|
||||
|
||||
from .. import BeautifulSoup
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
from .. import exceptions
|
||||
|
||||
|
||||
_SOURCE_CODE_ENCODING = 'utf-8'
|
||||
|
||||
|
||||
def getClass():
    """Return this module's adapter class (adapter-registry entry point)."""
    return FanficHuAdapter
|
||||
|
||||
|
||||
def _get_query_data(url):
|
||||
components = urlparse.urlparse(url)
|
||||
query_data = urlparse.parse_qs(components.query)
|
||||
return dict((key, data[0]) for key, data in query_data.items())
|
||||
|
||||
|
||||
class FanficHuAdapter(BaseSiteAdapter):
    """Adapter for the Hungarian eFiction archive at fanfic.hu."""

    SITE_ABBREVIATION = 'ffh'
    SITE_DOMAIN = 'fanfic.hu'
    SITE_LANGUAGE = 'Hungarian'

    BASE_URL = 'http://' + SITE_DOMAIN + '/merengo/'
    VIEW_STORY_URL_TEMPLATE = BASE_URL + 'viewstory.php?sid=%s'

    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    DATE_FORMAT = '%m/%d/%Y'

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        # URL pattern (getSiteURLPattern) guarantees a numeric sid query.
        query_data = urlparse.parse_qs(self.parsedUrl.query)
        story_id = query_data['sid'][0]

        self.story.setMetadata('storyId', story_id)
        # normalize to the canonical viewstory URL
        self._setURL(self.VIEW_STORY_URL_TEMPLATE % story_id)
        self.story.setMetadata('siteabbrev', self.SITE_ABBREVIATION)
        self.story.setMetadata('language', self.SITE_LANGUAGE)
|
||||
|
||||
def _customized_fetch_url(self, url, exception=None, parameters=None):
    """Fetch *url* and return it parsed into a BeautifulSoup tree.

    If *exception* (an exception class) is given, any HTTPError from
    the fetch is re-raised as exception(self.url) instead.
    """
    if exception:
        try:
            data = self._fetchUrl(url, parameters)
        except urllib2.HTTPError:
            raise exception(self.url)
    # Just let self._fetchUrl throw the exception, don't catch and
    # customize it.
    else:
        data = self._fetchUrl(url, parameters)

    return BeautifulSoup.BeautifulSoup(data)
|
||||
|
||||
@staticmethod
def getSiteDomain():
    return FanficHuAdapter.SITE_DOMAIN

@classmethod
def getSiteExampleURLs(cls):
    """Return an example story URL shown to users for this site."""
    return cls.VIEW_STORY_URL_TEMPLATE % 1234

def getSiteURLPattern(self):
    # the template minus its trailing '%s', plus a numeric story id
    return re.escape(self.VIEW_STORY_URL_TEMPLATE[:-2]) + r'\d+$'
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
    """Populate self.chapterUrls and story metadata.

    Reads the chapter list from the story page's chapter-select form,
    then scrapes the remaining metadata from the author's page,
    locating this story's table by its sid.
    """
    soup = self._customized_fetch_url(self.url + '&i=1')

    # A missing story collapses the page title to bare 'írta' ("written by").
    if soup.title.string.encode(_SOURCE_CODE_ENCODING).strip(' :') == 'írta':
        raise exceptions.StoryDoesNotExist(self.url)

    chapter_options = soup.find('form', action='viewstory.php').select('option')
    # Remove redundant "Fejezetek" option
    chapter_options.pop(0)

    # If there is still more than one entry remove chapter overview entry
    if len(chapter_options) > 1:
        chapter_options.pop(0)

    for option in chapter_options:
        url = urlparse.urljoin(self.url, option['value'])
        self.chapterUrls.append((option.string, url))

    author_url = urlparse.urljoin(self.BASE_URL, soup.find('a', href=lambda href: href and href.startswith('viewuser.php?uid='))['href'])
    soup = self._customized_fetch_url(author_url)

    # Locate this story's table on the author page by its sid.
    story_id = self.story.getMetadata('storyId')
    for table in soup('table', {'class': 'mainnav'}):
        title_anchor = table.find('span', {'class': 'storytitle'}).a
        href = title_anchor['href']
        if href.startswith('javascript:'):
            # some links are wrapped in a javascript: call; pull the
            # real href back out of its last quoted argument.
            href = href.rsplit(' ', 1)[1].strip("'")
        query_data = _get_query_data(href)

        if query_data['sid'] == story_id:
            break
    else:
        # This should never happen, the story must be found on the author's
        # page.
        raise exceptions.FailedToDownload(self.url)

    self.story.setMetadata('title', title_anchor.string)

    rows = table('tr')

    # First row: author link and review-count link among the anchors.
    anchors = rows[0].div('a')
    author_anchor = anchors[1]
    query_data = _get_query_data(author_anchor['href'])
    self.story.setMetadata('author', author_anchor.string)
    self.story.setMetadata('authorId', query_data['uid'])
    self.story.setMetadata('authorUrl', urlparse.urljoin(self.BASE_URL, author_anchor['href']))
    self.story.setMetadata('reviews', anchors[3].string)

    # Second row: summary (kept as HTML or flattened to text).
    if self.getConfig('keep_summary_html'):
        self.story.setMetadata('description', self.utf8FromSoup(author_url, rows[1].td))
    else:
        self.story.setMetadata('description', ''.join(rows[1].td(text=True)))

    # Remaining rows hold Hungarian label/value cell pairs.
    for row in rows[3:]:
        index = 0
        cells = row('td')

        while index < len(cells):
            cell = cells[index]
            key = cell.b.string.encode(_SOURCE_CODE_ENCODING).strip(':')
            try:
                value = cells[index+1].string.encode(_SOURCE_CODE_ENCODING)
            except AttributeError:
                value = None

            # 'Kategória' = category
            if key == 'Kategória':
                for anchor in cells[index+1]('a'):
                    self.story.addToList('category', anchor.string)

            # 'Szereplõk' = characters
            elif key == 'Szereplõk':
                if cells[index+1].string:
                    for name in cells[index+1].string.split(', '):
                        # NOTE(review): list name is singular 'character'
                        # here; sibling adapters use 'characters' -- confirm.
                        self.story.addToList('character', name)

            # 'Korhatár' = age limit; 'nem korhatáros' = no age limit
            elif key == 'Korhatár':
                if value != 'nem korhatáros':
                    self.story.setMetadata('rating', value)

            # 'Figyelmeztetések' = warnings
            elif key == 'Figyelmeztetések':
                for b_tag in cells[index+1]('b'):
                    self.story.addToList('warnings', b_tag.string)

            # 'Jellemzõk' = characteristics/genres
            elif key == 'Jellemzõk':
                for genre in cells[index+1].string.split(', '):
                    self.story.addToList('genre', genre)

            # 'Fejezetek' = chapters
            elif key == 'Fejezetek':
                self.story.setMetadata('numChapters', int(value))

            # 'Megjelenés' = published
            elif key == 'Megjelenés':
                self.story.setMetadata('datePublished', makeDate(value, self.DATE_FORMAT))

            # 'Frissítés' = updated
            elif key == 'Frissítés':
                self.story.setMetadata('dateUpdated', makeDate(value, self.DATE_FORMAT))

            # 'Szavak' = words
            elif key == 'Szavak':
                self.story.setMetadata('numWords', value)

            # 'Befejezett' = completed
            elif key == 'Befejezett':
                # NOTE(review): maps 'Nem' ("No") to Completed -- looks
                # inverted; confirm against the live site before changing.
                self.story.setMetadata('status', 'Completed' if value == 'Nem' else 'In-Progress')

            index += 2

    if self.story.getMetadata('rating') == '18':
        if not (self.is_adult or self.getConfig('is_adult')):
            raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
def getChapterText(self, url):
    """Download one chapter and return its cleaned-up HTML text."""
    # The chapter text lives two levels above the chapter-select form.
    page = self._customized_fetch_url(url)
    cell = page.find('form', action='viewstory.php').parent.parent

    # Strip every nested <div> (navigation chrome) before serializing.
    for nav_div in cell('div'):
        nav_div.extract()

    return self.utf8FromSoup(url, cell)
|
||||
218
fanficdownloader/adapters/adapter_fanfictioncsodaidokhu.py
Normal file
218
fanficdownloader/adapters/adapter_fanfictioncsodaidokhu.py
Normal file
|
|
@ -0,0 +1,218 @@
|
|||
# coding=utf-8
|
||||
|
||||
import re
|
||||
import urllib2
|
||||
import urlparse
|
||||
|
||||
from .. import BeautifulSoup
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
from .. import exceptions
|
||||
|
||||
|
||||
_SOURCE_CODE_ENCODING = 'utf-8'
|
||||
|
||||
|
||||
def getClass():
    """Return this module's adapter class (adapter-registry entry point)."""
    return FanfictionCsodaidokHuAdapter
|
||||
|
||||
|
||||
def _get_query_data(url):
|
||||
components = urlparse.urlparse(url)
|
||||
query_data = urlparse.parse_qs(components.query)
|
||||
return dict((key, data[0]) for key, data in query_data.items())
|
||||
|
||||
|
||||
# yields Tag _and_ NavigableString siblings from the given tag. The
|
||||
# BeautifulSoup findNextSiblings() method for some reasons only returns either
|
||||
# NavigableStrings _or_ Tag objects, not both.
|
||||
def _yield_next_siblings(tag):
|
||||
sibling = tag.nextSibling
|
||||
while sibling:
|
||||
yield sibling
|
||||
sibling = sibling.nextSibling
|
||||
|
||||
|
||||
class FanfictionCsodaidokHuAdapter(BaseSiteAdapter):
    """Adapter for the Hungarian archive at fanfiction.csodaidok.hu."""

    _SITE_DOMAIN = 'fanfiction.csodaidok.hu'
    _BASE_URL = 'http://' + _SITE_DOMAIN + '/'
    _VIEW_STORY_URL_TEMPLATE = _BASE_URL + 'viewstory.php?sid=%s'
    _VIEW_CHAPTER_URL_TEMPLATE = _VIEW_STORY_URL_TEMPLATE + '&chapter=%s'

    # Page title text shown when a story id does not exist.
    _STORY_DOES_NOT_EXIST_PAGE_TITLE = 'Cím: Szerző:'
    _DATE_FORMAT = '%Y.%m.%d'
    _SITE_LANGUAGE = 'Hungarian'

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        # URL pattern (getSiteURLPattern) guarantees a numeric sid query.
        query_data = urlparse.parse_qs(self.parsedUrl.query)
        story_id = query_data['sid'][0]

        self.story.setMetadata('storyId', story_id)
        self._setURL(self._VIEW_STORY_URL_TEMPLATE % story_id)
        # NOTE(review): stores the full domain as 'siteabbrev', unlike
        # other adapters which use a short code -- confirm intended.
        self.story.setMetadata('siteabbrev', self._SITE_DOMAIN)
        self.story.setMetadata('language', self._SITE_LANGUAGE)
|
||||
|
||||
def _customized_fetch_url(self, url, exception=None, parameters=None):
    """Fetch *url* and return it parsed into a BeautifulSoup tree.

    If *exception* (an exception class) is given, any HTTPError from
    the fetch is re-raised as exception(self.url) instead.
    """
    if exception:
        try:
            data = self._fetchUrl(url, parameters)
        except urllib2.HTTPError:
            raise exception(self.url)
    # Just let self._fetchUrl throw the exception, don't catch and
    # customize it.
    else:
        data = self._fetchUrl(url, parameters)

    return BeautifulSoup.BeautifulSoup(data)
|
||||
|
||||
@staticmethod
def getSiteDomain():
    return FanfictionCsodaidokHuAdapter._SITE_DOMAIN

@classmethod
def getSiteExampleURLs(cls):
    """Return an example story URL shown to users for this site."""
    return cls._VIEW_STORY_URL_TEMPLATE % 1234

def getSiteURLPattern(self):
    # the template minus its trailing '%s', plus a numeric story id
    return re.escape(self._VIEW_STORY_URL_TEMPLATE[:-2]) + r'\d+$'
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
    """Populate self.chapterUrls and story metadata.

    Gets the chapter list from the story page's chapter <select>,
    then scrapes the remaining metadata from the author's page,
    locating this story's listbox by its sid.
    """
    soup = self._customized_fetch_url(self.url + '&chapter=1')

    element = soup.find('div', id='pagetitle')
    page_title = ''.join(element(text=True)).encode(_SOURCE_CODE_ENCODING)
    if page_title == self._STORY_DOES_NOT_EXIST_PAGE_TITLE:
        raise exceptions.StoryDoesNotExist(self.url)

    author_url = urlparse.urljoin(self.url, element.a['href'])

    story_id = self.story.getMetadata('storyId')
    element = soup.find('select', {'name': 'chapter'})
    # Multi-chapter stories have a chapter <select>; single-chapter
    # stories don't (handled further down once the title is known).
    if element:
        for option in element('option'):
            title = option.string
            url = self._VIEW_CHAPTER_URL_TEMPLATE % (story_id, option['value'])
            self.chapterUrls.append((title, url))

    soup = self._customized_fetch_url(author_url)
    story_id = self.story.getMetadata('storyId')

    # Find this story's listbox on the author page by its sid.
    for listbox_div in soup('div', {'class': lambda klass: klass and 'listbox' in klass}):
        a = listbox_div.div.a
        if not a['href'].startswith('viewstory.php?sid='):
            continue

        query_data = _get_query_data(a['href'])
        if query_data['sid'] == story_id:
            break
    else:
        raise exceptions.FailedToDownload(self.url)

    title = ''.join(a(text=True))
    self.story.setMetadata('title', title)
    # Single-chapter story: the story page itself is the only chapter.
    if not self.chapterUrls:
        self.chapterUrls.append((title, self.url))

    element = a.findNextSibling('a')
    self.story.setMetadata('author', element.string)
    query_data = _get_query_data(element['href'])
    self.story.setMetadata('authorId', query_data['uid'])
    self.story.setMetadata('authorUrl', author_url)

    # Rating text trails the span after the author link, e.g. " [18".
    element = element.findNextSibling('span')
    rating = element.nextSibling.strip(' [')

    # 'Korhatár nélkül' = no age limit
    if rating.encode(_SOURCE_CODE_ENCODING) != 'Korhatár nélkül':
        self.story.setMetadata('rating', rating)

    # NOTE(review): nesting reconstructed from a whitespace-stripped
    # source -- this adult check is read as unconditional (no is_adult
    # escape hatch visible); confirm against upstream.
    if rating == '18':
        raise exceptions.AdultCheckRequired(self.url)

    element = element.findNextSiblings('a')[1]
    self.story.setMetadata('reviews', element.string)

    sections = listbox_div('div', {'class': lambda klass: klass and klass in ['content', 'tail']})
    for section in sections:
        for element in section('span', {'class': 'classification'}):
            key = element.string.encode(_SOURCE_CODE_ENCODING).strip(' :')
            try:
                value = element.nextSibling.string.encode(_SOURCE_CODE_ENCODING).strip()
            except AttributeError:
                value = None

            # 'Tartalom' = summary/contents
            if key == 'Tartalom':
                contents = []
                keep_summary_html = self.getConfig('keep_summary_html')

                for sibling in _yield_next_siblings(element):
                    if isinstance(sibling, BeautifulSoup.Tag):
                        # stop at the next classification label
                        if sibling.name == 'span' and sibling.get('class', None) == 'classification':
                            break

                        if keep_summary_html:
                            contents.append(self.utf8FromSoup(author_url, sibling))
                        else:
                            contents.append(''.join(sibling(text=True)))
                    else:
                        contents.append(sibling)
                self.story.setMetadata('description', ''.join(contents))

            # 'Kategória' = category
            elif key == 'Kategória':
                for sibling in element.findNextSiblings(['a', 'span']):
                    if sibling.name == 'span':
                        break

                    self.story.addToList('category', sibling.string)

            # 'Szereplők' = characters
            elif key == 'Szereplők':
                for name in value.split(', '):
                    self.story.addToList('characters', name)

            # 'Műfaj' = genre; 'Nincs' = none
            elif key == 'Műfaj':
                if value != 'Nincs':
                    self.story.setMetadata('genre', value)

            # 'Figyelmeztetés' = warning
            elif key == 'Figyelmeztetés':
                if value != 'Nincs':
                    for warning in value.split(', '):
                        self.story.addToList('warnings', warning)

            # 'Kihívás' = challenge
            elif key == 'Kihívás':
                if value != 'Nincs':
                    self.story.setMetadata('challenge', value)

            # 'Sorozat' = series
            elif key == 'Sorozat':
                if value != 'Nincs':
                    self.story.setMetadata('series', value)

            # 'Fejezetek' = chapters
            elif key == 'Fejezetek':
                self.story.setMetadata('numChapters', int(value))

            # 'Befejezett' = completed
            elif key == 'Befejezett':
                # NOTE(review): maps 'Nem' ("No") to Completed -- looks
                # inverted; confirm against the live site before changing.
                self.story.setMetadata('status', 'Completed' if value == 'Nem' else 'In-Progress')

            # 'Szavak száma' = word count
            elif key == 'Szavak száma':
                self.story.setMetadata('numWords', value)

            # 'Feltöltve' = uploaded/published
            elif key == 'Feltöltve':
                self.story.setMetadata('datePublished', makeDate(value, self._DATE_FORMAT))

            # 'Frissítve' = updated
            elif key == 'Frissítve':
                self.story.setMetadata('dateUpdated', makeDate(value, self._DATE_FORMAT))
|
||||
|
||||
def getChapterText(self, url):
    """Return a chapter's HTML: the author's notes block (when
    present) followed by the story text."""
    page = self._customized_fetch_url(url)
    parts = []

    notes = page.find('div', id='notes')
    if notes:
        # Keep the notes, then take the story div immediately after.
        parts.append(self.utf8FromSoup(url, notes))
        story = notes.findNextSibling('div')
    else:
        # No notes: the story div follows the chapter jump menu.
        story = page.find('div', {'class': 'jumpmenu'}).findNextSibling('div')

    parts.append(self.utf8FromSoup(url, story.span))
    return ''.join(parts)
|
||||
343
fanficdownloader/adapters/adapter_fanfictionnet.py
Normal file
343
fanficdownloader/adapters/adapter_fanfictionnet.py
Normal file
|
|
@ -0,0 +1,343 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
from datetime import datetime
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
from urllib import unquote_plus
|
||||
import time
|
||||
|
||||
#from .. import BeautifulSoup as bs
|
||||
import bs4 as bs
|
||||
from .. import exceptions as exceptions
|
||||
from ..htmlcleanup import stripHTML
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
# Known fanfiction.net genre names (presumably used when parsing
# scraped metadata further down in this module -- confirm usage there).
ffnetgenres=["Adventure", "Angst", "Crime", "Drama", "Family", "Fantasy", "Friendship", "General",
             "Horror", "Humor", "Hurt-Comfort", "Mystery", "Parody", "Poetry", "Romance", "Sci-Fi",
             "Spiritual", "Supernatural", "Suspense", "Tragedy", "Western"]
|
||||
|
||||
class FanFictionNetSiteAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on www.fanfiction.net."""

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','ffnet')

        # get storyId from url--url validation guarantees second part is storyId
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])

        # normalized story URL.
        self._setURL("https://"+self.getSiteDomain()\
                     +"/s/"+self.story.getMetadata('storyId')+"/1/")

        # ffnet update emails have the latest chapter URL.
        # Frequently, when they arrive, not all the servers have the
        # latest chapter yet and going back to chapter 1 to pull the
        # chapter list doesn't get the latest. So save and use the
        # original URL given to pull chapter list & metadata.
        # Not used by plugin because URL gets normalized first for
        # eliminating duplicate story urls.
        self.origurl = url
        if "https://m." in self.origurl:
            ## accept m(mobile)url, but use www.
            self.origurl = self.origurl.replace("https://m.","https://www.")

        # Send the (original) story URL as Referer on later fetches.
        self.opener.addheaders.append(('Referer',self.origurl))
|
||||
|
||||
@staticmethod
|
||||
def getSiteDomain():
|
||||
return 'www.fanfiction.net'
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return ['www.fanfiction.net','m.fanfiction.net']
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "https://www.fanfiction.net/s/1234/1/ https://www.fanfiction.net/s/1234/12/ http://www.fanfiction.net/s/1234/1/Story_Title http://m.fanfiction.net/s/1234/1/"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return r"https?://(www|m)?\.fanfiction\.net/s/\d+(/\d+)?(/|/[^/]+)?/?$"
|
||||
|
||||
def _fetchUrl(self,url,parameters=None,extrasleep=1.0,usecache=True):
|
||||
## ffnet(and, I assume, fpcom) tends to fail more if hit too
|
||||
## fast. This is in additional to what ever the
|
||||
## slow_down_sleep_time setting is.
|
||||
return BaseSiteAdapter._fetchUrl(self,url,
|
||||
parameters=parameters,
|
||||
extrasleep=extrasleep,
|
||||
usecache=usecache)
|
||||
|
||||
def use_pagecache(self):
|
||||
'''
|
||||
adapters that will work with the page cache need to implement
|
||||
this and change it to True.
|
||||
'''
|
||||
return True
|
||||
|
||||
def doExtractChapterUrlsAndMetadata(self,get_cover=True):
|
||||
|
||||
# fetch the chapter. From that we will get almost all the
|
||||
# metadata and chapter list
|
||||
|
||||
url = self.origurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
#logger.debug("\n===================\n%s\n===================\n"%data)
|
||||
soup = bs.BeautifulSoup(data, "html5lib")
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if "Unable to locate story" in data:
|
||||
raise exceptions.StoryDoesNotExist(url)
|
||||
|
||||
# some times "Chapter not found...", sometimes "Chapter text not found..."
|
||||
if "not found. Please check to see you are not using an outdated url." in data:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! 'Chapter not found. Please check to see you are not using an outdated url.'" % url)
|
||||
|
||||
if self.getConfig('check_next_chapter'):
|
||||
try:
|
||||
## ffnet used to have a tendency to send out update
|
||||
## notices in email before all their servers were
|
||||
## showing the update on the first chapter. It
|
||||
## generates another server request and doesn't seem
|
||||
## to be needed lately, so now default it to off.
|
||||
try:
|
||||
chapcount = len(soup.find('select', { 'name' : 'chapter' } ).findAll('option'))
|
||||
# get chapter part of url.
|
||||
except:
|
||||
chapcount = 1
|
||||
chapter = url.split('/',)[5]
|
||||
tryurl = "https://%s/s/%s/%d/"%(self.getSiteDomain(),
|
||||
self.story.getMetadata('storyId'),
|
||||
chapcount+1)
|
||||
logger.debug('=Trying newer chapter: %s' % tryurl)
|
||||
newdata = self._fetchUrl(tryurl)
|
||||
if "not found. Please check to see you are not using an outdated url." \
|
||||
not in newdata:
|
||||
logger.debug('=======Found newer chapter: %s' % tryurl)
|
||||
soup = bs.BeautifulSoup(newdata, "html5lib")
|
||||
except:
|
||||
pass
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"^/u/\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('/')[2])
|
||||
self.story.setMetadata('authorUrl','https://'+self.host+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
## Pull some additional data from html.
|
||||
|
||||
## ffnet shows category two ways
|
||||
## 1) class(Book, TV, Game,etc) >> category(Harry Potter, Sailor Moon, etc)
|
||||
## 2) cat1_cat2_Crossover
|
||||
## For 1, use the second link.
|
||||
## For 2, fetch the crossover page and pull the two categories from there.
|
||||
|
||||
categories = soup.find('div',{'id':'pre_story_links'}).findAll('a',{'class':'xcontrast_txt'})
|
||||
#print("xcontrast_txt a:%s"%categories)
|
||||
if len(categories) > 1:
|
||||
# Strangely, the ones with *two* links are the
|
||||
# non-crossover categories. Each is in a category itself
|
||||
# of Book, Movie, etc.
|
||||
self.story.addToList('category',stripHTML(categories[1]))
|
||||
elif 'Crossover' in categories[0]['href']:
|
||||
caturl = "https://%s%s"%(self.getSiteDomain(),categories[0]['href'])
|
||||
catsoup = bs.BeautifulSoup(self._fetchUrl(caturl), "html5lib")
|
||||
for a in catsoup.findAll('a',href=re.compile(r"^/crossovers/.+?/\d+/")):
|
||||
self.story.addToList('category',stripHTML(a))
|
||||
else:
|
||||
# Fall back. I ran across a story with a Crossver
|
||||
# category link to a broken page once.
|
||||
# http://www.fanfiction.net/s/2622060/1/
|
||||
# Naruto + Harry Potter Crossover
|
||||
logger.info("Fall back category collection")
|
||||
for c in stripHTML(categories[0]).replace(" Crossover","").split(' + '):
|
||||
self.story.addToList('category',c)
|
||||
|
||||
|
||||
|
||||
a = soup.find('a', href=re.compile(r'https?://www\.fictionratings\.com/'))
|
||||
rating = a.string
|
||||
if 'Fiction' in rating: # if rating has 'Fiction ', strip that out for consistency with past.
|
||||
rating = rating[8:]
|
||||
|
||||
self.story.setMetadata('rating',rating)
|
||||
|
||||
# after Rating, the same bit of text containing id:123456 contains
|
||||
# Complete--if completed.
|
||||
gui_table1i = soup.find('div',{'id':'content_wrapper_inner'})
|
||||
|
||||
self.story.setMetadata('title', stripHTML(gui_table1i.find('b'))) # title appears to be only(or at least first) bold tag in gui_table1i
|
||||
|
||||
summarydiv = gui_table1i.find('div',{'style':'margin-top:2px'})
|
||||
if summarydiv:
|
||||
self.setDescription(url,stripHTML(summarydiv))
|
||||
|
||||
|
||||
grayspan = gui_table1i.find('span', {'class':'xgray xcontrast_txt'})
|
||||
# for b in grayspan.findAll('button'):
|
||||
# b.extract()
|
||||
metatext = stripHTML(grayspan).replace('Hurt/Comfort','Hurt-Comfort')
|
||||
#logger.debug("metatext:(%s)"%metatext)
|
||||
metalist = metatext.split(" - ")
|
||||
#logger.debug("metalist:(%s)"%metalist)
|
||||
|
||||
# Rated: Fiction K - English - Words: 158,078 - Published: 02-04-11
|
||||
# Rated: Fiction T - English - Adventure/Sci-Fi - Naruto U. - Chapters: 22 - Words: 114,414 - Reviews: 395 - Favs: 779 - Follows: 835 - Updated: 03-21-13 - Published: 04-28-12 - id: 8067258
|
||||
|
||||
# rating is obtained above more robustly.
|
||||
if metalist[0].startswith('Rated:'):
|
||||
metalist=metalist[1:]
|
||||
|
||||
# next is assumed to be language.
|
||||
self.story.setMetadata('language',metalist[0])
|
||||
metalist=metalist[1:]
|
||||
|
||||
# next might be genre.
|
||||
genrelist = metalist[0].split('/') # Hurt/Comfort already changed above.
|
||||
goodgenres=True
|
||||
for g in genrelist:
|
||||
#logger.debug("g:(%s)"%g)
|
||||
if g.strip() not in ffnetgenres:
|
||||
#logger.info("g not in ffnetgenres")
|
||||
goodgenres=False
|
||||
if goodgenres:
|
||||
self.story.extendList('genre',genrelist)
|
||||
metalist=metalist[1:]
|
||||
|
||||
# Updated: <span data-xutime='1368059198'>5/8</span> - Published: <span data-xutime='1278984264'>7/12/2010</span>
|
||||
# Published: <span data-xutime='1384358726'>8m ago</span>
|
||||
dates = soup.findAll('span',{'data-xutime':re.compile(r'^\d+$')})
|
||||
if len(dates) > 1 :
|
||||
# updated get set to the same as published upstream if not found.
|
||||
self.story.setMetadata('dateUpdated',datetime.fromtimestamp(float(dates[0]['data-xutime'])))
|
||||
self.story.setMetadata('datePublished',datetime.fromtimestamp(float(dates[-1]['data-xutime'])))
|
||||
|
||||
donechars = False
|
||||
while len(metalist) > 0:
|
||||
if metalist[0].startswith('Chapters') or metalist[0].startswith('Status') or metalist[0].startswith('id:') or metalist[0].startswith('Updated:') or metalist[0].startswith('Published:'):
|
||||
pass
|
||||
elif metalist[0].startswith('Reviews'):
|
||||
self.story.setMetadata('reviews',metalist[0].split(':')[1].strip())
|
||||
elif metalist[0].startswith('Favs:'):
|
||||
self.story.setMetadata('favs',metalist[0].split(':')[1].strip())
|
||||
elif metalist[0].startswith('Follows:'):
|
||||
self.story.setMetadata('follows',metalist[0].split(':')[1].strip())
|
||||
elif metalist[0].startswith('Words'):
|
||||
self.story.setMetadata('numWords',metalist[0].split(':')[1].strip())
|
||||
elif not donechars:
|
||||
# with 'pairing' support, pairings are bracketed w/o comma after
|
||||
# [Caspian X, Lucy Pevensie] Edmund Pevensie, Peter Pevensie
|
||||
self.story.extendList('characters',metalist[0].replace('[','').replace(']',',').split(','))
|
||||
|
||||
l = metalist[0]
|
||||
while '[' in l:
|
||||
self.story.addToList('ships',l[l.index('[')+1:l.index(']')].replace(', ','/'))
|
||||
l = l[l.index(']')+1:]
|
||||
|
||||
donechars = True
|
||||
metalist=metalist[1:]
|
||||
|
||||
if 'Status: Complete' in metatext:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if get_cover:
|
||||
# Try the larger image first.
|
||||
try:
|
||||
img = soup.find('img',{'class':'lazy cimage'})
|
||||
self.setCoverImage(url,img['data-original'])
|
||||
except:
|
||||
img = soup.find('img',{'class':'cimage'})
|
||||
if img:
|
||||
self.setCoverImage(url,img['src'])
|
||||
|
||||
# Find the chapter selector
|
||||
select = soup.find('select', { 'name' : 'chapter' } )
|
||||
|
||||
if select is None:
|
||||
# no selector found, so it's a one-chapter story.
|
||||
self.chapterUrls.append((self.story.getMetadata('title'),url))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = u'https://%s/s/%s/%s/' % ( self.getSiteDomain(),
|
||||
self.story.getMetadata('storyId'),
|
||||
o['value'])
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
title = u"%s" % o
|
||||
title = re.sub(r'<[^>]+>','',title)
|
||||
self.chapterUrls.append((title,url))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
return
|
||||
|
||||
def getChapterText(self, url):
|
||||
# time.sleep(4.0) ## ffnet(and, I assume, fpcom) tends to fail
|
||||
# ## more if hit too fast. This is in
|
||||
# ## additional to what ever the
|
||||
# ## slow_down_sleep_time setting is.
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
data = self._fetchUrl(url,extrasleep=4.0)
|
||||
|
||||
if "Please email this error message in full to <a href='mailto:support@fanfiction.com'>support@fanfiction.com</a>" in data:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! FanFiction.net Site Error!" % url)
|
||||
|
||||
# some ancient stories have body tags inside them that cause
|
||||
# soup parsing to discard the content. For story text we
|
||||
# don't care about anything before "<div role='main'" and
|
||||
# this kills any body tags.
|
||||
# XXX needed with new BS? -- No, doesn't look like it
|
||||
# divstr = "<div role='main'"
|
||||
# if divstr not in data:
|
||||
# raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
# else:
|
||||
# data = data[data.index(divstr):]
|
||||
# data = data.replace("<body","<notbody").replace("<BODY","<NOTBODY")
|
||||
|
||||
soup = bs.BeautifulSoup(data, "html5lib")
|
||||
|
||||
## Remove the 'share' button.
|
||||
## No longer appears in the story text.
|
||||
# sharediv = soup.find('div', {'class' : 'a2a_kit a2a_default_style'})
|
||||
# if sharediv:
|
||||
# sharediv.extract()
|
||||
|
||||
div = soup.find('div', {'id' : 'storytextp'})
|
||||
|
||||
if None == div:
|
||||
logger.debug('div id=storytextp not found. data:%s'%data)
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
|
||||
def getClass():
    """Adapter-loader hook: return the adapter class defined in this module."""
    adapter_class = FanFictionNetSiteAdapter
    return adapter_class
|
||||
|
||||
212
fanficdownloader/adapters/adapter_fanfiktionde.py
Normal file
212
fanficdownloader/adapters/adapter_fanfiktionde.py
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2012 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Adapter-loader hook: return the adapter class defined in this module."""
    adapter_class = FanFiktionDeAdapter
    return adapter_class
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
# Class name has to be unique.  Our convention is camel case the
# sitename with Adapter at the end.  www is skipped.
class FanFiktionDeAdapter(BaseSiteAdapter):
    """
    Adapter for www.fanfiktion.de (site abbreviation 'ffde').

    German-language archive; requires login for age-restricted stories
    and pulls part of the metadata from the author's story-list page.
    Uses the bundled (BS3-style) BeautifulSoup.
    """

    def __init__(self, config, url):
        """Normalize the story URL and set site-specific defaults."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["utf8",
                       "Windows-1252"] # 1252 is a superset of iso-8859-1.
                                       # Most sites that claim to be
                                       # iso-8859-1 (and some that claim to be
                                       # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/s/'+self.story.getMetadata('storyId') + '/1')

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','ffde')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d.%m.%Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'www.fanfiktion.de'

    @classmethod
    def getSiteExampleURLs(cls):
        """Example URL formats shown to users."""
        return "http://"+cls.getSiteDomain()+"/s/46ccbef30000616306614050 http://"+cls.getSiteDomain()+"/s/46ccbef30000616306614050/1 http://"+cls.getSiteDomain()+"/s/46ccbef30000616306614050/1/story-name"

    def getSiteURLPattern(self):
        """Regex matching story URLs: /s/<hex-id>[/<chapter>]."""
        return re.escape("http://"+self.getSiteDomain()+"/s/")+r"\w+(/\d+)?"

    def use_pagecache(self):
        '''
        adapters that will work with the page cache need to implement
        this and change it to True.
        '''
        return True

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True when the fetched page indicates a login is required."""
        if 'Diese Geschichte wurde als entwicklungsbeeintr' in data \
                or 'There is no such account on our website' in data \
                or "Noch kein registrierter Benutzer?" in data:
            return True
        else:
            return False

    def performLogin(self,url):
        """POST credentials to the SSL login page; raise FailedToLogin on failure."""
        params = {}

        # Prefer credentials set on the adapter; fall back to config.
        if self.password:
            params['nickname'] = self.username
            params['passwd'] = self.password
        else:
            params['nickname'] = self.getConfig("username")
            params['passwd'] = self.getConfig("password")
        params['savelogindata'] = '1'
        params['a'] = 'l'
        params['submit'] = 'Login...'

        loginUrl = 'https://ssl.fanfiktion.de/'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             params['nickname']))
        soup = bs.BeautifulSoup(self._postUrl(loginUrl,params))
        # A 'Logout' link in the response means we are logged in.
        if not soup.find('a', title='Logout'):
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['nickname']))
            raise exceptions.FailedToLogin(url,params['nickname'])
            # NOTE(review): unreachable after the raise above.
            return False
        else:
            return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch story + author pages and populate metadata and chapter list."""

        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url,usecache=False)

        # Night-time age-verification lockout page.
        if "Uhr ist diese Geschichte nur nach einer" in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Auserhalb der Zeit von 23:00 Uhr bis 04:00 Uhr ist diese Geschichte nur nach einer erfolgreichen Altersverifikation zuganglich.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        # print data

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'/s/'+self.story.getMetadata('storyId')+"/"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        head = soup.find('div', {'class' : 'story-left'})
        a = head.find('a')
        self.story.setMetadata('authorId',a['href'].split('/')[2])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',stripHTML(a))

        # Find the chapters:
        for chapter in soup.find('select').findAll('option'):
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/s/'+self.story.getMetadata('storyId')+'/'+chapter['value']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))
        self.story.setMetadata('language','German')

        #find metadata on the story page
        # 'erstellt'/'aktualisiert' are the German created/updated labels.
        headtext = stripHTML(head)
        self.story.setMetadata('datePublished', makeDate(headtext.split('erstellt: ')[1].split('\n')[0], self.dateformat))

        self.story.setMetadata('dateUpdated', makeDate(headtext.split('aktualisiert: ')[1].split('\n')[0], self.dateformat))

        # second colspan=3 td in head.
        genres=stripHTML(head.findAll('td',{'colspan':'3'})[1])
        self.story.extendList('genre',genres[:genres.index('(')].split(', '))
        # for genre in head.text.split(' ')[3].split('/')[0].split(', '):
        #     self.story.addToList('genre',genre)

        # 'fertiggestellt' == finished.
        if 'fertiggestellt' in headtext:
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In Progress')

        #find metadata on the author's page
        asoup = bs.BeautifulSoup(self._fetchUrl("http://"+self.getSiteDomain()+"?a=q&a1=v&t=nickdetailsstories&lbi=stories&ar=0&nick="+self.story.getMetadata('authorId')))
        tr=asoup.findAll('tr')
        # Locate this story's row in the author's story table; 'i' and 'a'
        # from the loop are reused below.
        for i in range(1,len(tr)):
            a = tr[i].find('a')
            if '/s/'+self.story.getMetadata('storyId')+'/1/' in a['href']:
                break
        # Summary is embedded in the row's onmouseover tooltip.
        self.setDescription(url,a['onmouseover'].split("', '")[1])

        td = tr[i].findAll('td')
        self.story.addToList('category',stripHTML(td[2]))
        self.story.setMetadata('rating', stripHTML(td[5]))
        self.story.setMetadata('numWords', stripHTML(td[6]))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter URL and return its story text HTML."""

        logger.debug('Getting chapter text from: %s' % url)
        time.sleep(0.5) ## ffde has "floodlock" protection

        soup = bs.BeautifulSoup(self._fetchUrl(url),
                                selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'storytext'})
        # NOTE(review): this script-stripping loop runs before the None
        # check below; if 'storytext' is missing this raises AttributeError
        # instead of FailedToDownload -- confirm.
        for a in div.findAll('script'):
            a.extract()

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)
|
||||
44
fanficdownloader/adapters/adapter_fannation.py
Normal file
44
fanficdownloader/adapters/adapter_fannation.py
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import re
|
||||
from base_efiction_adapter import BaseEfictionAdapter
|
||||
|
||||
class FanNationAdapter(BaseEfictionAdapter):
    """eFiction-based adapter for fannation.shades-of-moonlight.com."""

    @staticmethod
    def getSiteDomain():
        # Domain of the archive this adapter handles.
        return 'fannation.shades-of-moonlight.com'

    @classmethod
    def getPathToArchive(self):
        # URL path prefix under which the eFiction install lives.
        return '/archive'

    @classmethod
    def getSiteAbbrev(self):
        # Unique site abbreviation used in story metadata.
        return 'fannation'

    def handleMetadataPair(self, key, value):
        """Split the site's 'Romance' field into list entries; defer the rest."""
        if key != 'Romance':
            # Anything else is handled by the generic eFiction parser.
            super(FanNationAdapter, self).handleMetadataPair(key, value)
            return
        for entry in re.split("\s*,\s*", value):
            self.story.addToList('romance', entry)
|
||||
|
||||
def getClass():
    """Adapter-loader hook: return the adapter class defined in this module."""
    adapter_class = FanNationAdapter
    return adapter_class
|
||||
57
fanficdownloader/adapters/adapter_fhsarchivecom.py
Normal file
57
fanficdownloader/adapters/adapter_fhsarchivecom.py
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import re
|
||||
from base_efiction_adapter import BaseEfictionAdapter
|
||||
|
||||
class FHSArchiveComAdapter(BaseEfictionAdapter):
    """eFiction-based adapter for fhsarchive.com (site abbreviation 'fhsa')."""

    @staticmethod
    def getSiteDomain():
        # Domain of the archive this adapter handles.
        return 'fhsarchive.com'

    @classmethod
    def getPathToArchive(self):
        # URL path prefix under which the eFiction install lives.
        return '/autoarchive'

    @classmethod
    def getSiteAbbrev(self):
        # Unique site abbreviation used in story metadata.
        return 'fhsa'

    @classmethod
    def getDateFormat(self):
        # strptime format of dates on this site's pages.
        return "%m/%d/%y"

    def handleMetadataPair(self, key, value):
        """Custom handling of the 'Warnings' field; defer everything else."""
        if key == 'Warnings':
            for val in re.split("\s*,\s*", value):
                # NOTE(review): compares the whole field ('value'), not the
                # current item ('val').  Behaviorally equivalent here (a bare
                # "None" field splits to ['None'] and returns on the first
                # pass), but it reads like a typo -- confirm.
                if value == 'None':
                    return
                else:
                    # toss numbers only.
                    # NOTE(review): filter() over a string removes digit
                    # *characters* from each warning rather than skipping
                    # digit-only entries -- confirm this is the intent.
                    self.story.addToList('warnings', filter(lambda x : not x.isdigit() , val))

        # elif 'Categories' in key:
        #     for val in re.split("\s*>\s*", value):
        #         self.story.addToList('category', val)
        else:
            super(FHSArchiveComAdapter, self).handleMetadataPair(key, value)
|
||||
|
||||
def getClass():
    """Adapter-loader hook: return the adapter class defined in this module."""
    adapter_class = FHSArchiveComAdapter
    return adapter_class
|
||||
|
||||
226
fanficdownloader/adapters/adapter_ficbooknet.py
Normal file
226
fanficdownloader/adapters/adapter_ficbooknet.py
Normal file
|
|
@ -0,0 +1,226 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import datetime
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
from .. import translit
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
|
||||
def getClass():
    """Adapter-loader hook: return the adapter class defined in this module."""
    adapter_class = FicBookNetAdapter
    return adapter_class
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class FicBookNetAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["utf8",
|
||||
"Windows-1252"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/readfic/'+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','fbn')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%d %m %Y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'www.ficbook.net'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/readfic/12345 http://"+cls.getSiteDomain()+"/readfic/93626/246417#part_content"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/readfic/")+r"\d+"
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
url=self.url
|
||||
logger.debug("URL: "+url)
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
table = soup.find('td',{'width':'50%'})
|
||||
|
||||
## Title
|
||||
a = soup.find('h1')
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
logger.debug("Title: (%s)"%self.story.getMetadata('title'))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = table.find('a')
|
||||
self.story.setMetadata('authorId',a.text) # Author's name is unique
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.text)
|
||||
logger.debug("Author: (%s)"%self.story.getMetadata('author'))
|
||||
|
||||
# Find the chapters:
|
||||
chapters = soup.find('div', {'class' : 'part_list'})
|
||||
if chapters != None:
|
||||
chapters=chapters.findAll('a', href=re.compile(r'/readfic/'+self.story.getMetadata('storyId')+"/\d+#part_content$"))
|
||||
self.story.setMetadata('numChapters',len(chapters))
|
||||
for x in range(0,len(chapters)):
|
||||
chapter=chapters[x]
|
||||
churl='http://'+self.host+chapter['href']
|
||||
self.chapterUrls.append((stripHTML(chapter),churl))
|
||||
if x == 0:
|
||||
pubdate = translit.translit(stripHTML(bs.BeautifulSoup(self._fetchUrl(churl)).find('div', {'class' : 'part_added'}).find('span')))
|
||||
if x == len(chapters)-1:
|
||||
update = translit.translit(stripHTML(bs.BeautifulSoup(self._fetchUrl(churl)).find('div', {'class' : 'part_added'}).find('span')))
|
||||
else:
|
||||
self.chapterUrls.append((self.story.getMetadata('title'),url))
|
||||
self.story.setMetadata('numChapters',1)
|
||||
pubdate=translit.translit(stripHTML(soup.find('div', {'class' : 'part_added'}).find('span')))
|
||||
update=pubdate
|
||||
|
||||
logger.debug("numChapters: (%s)"%self.story.getMetadata('numChapters'))
|
||||
|
||||
if not ',' in pubdate:
|
||||
pubdate=datetime.date.today().strftime(self.dateformat)
|
||||
if not ',' in update:
|
||||
update=datetime.date.today().strftime(self.dateformat)
|
||||
pubdate=pubdate.split(',')[0]
|
||||
update=update.split(',')[0]
|
||||
|
||||
fullmon = {"yanvarya":"01", "января":"01",
|
||||
"fievralya":"02", "февраля":"02",
|
||||
"marta":"03", "марта":"03",
|
||||
"aprielya":"04", "апреля":"04",
|
||||
"maya":"05", "мая":"05",
|
||||
"iyunya":"06", "июня":"06",
|
||||
"iyulya":"07", "июля":"07",
|
||||
"avghusta":"08", "августа":"08",
|
||||
"sentyabrya":"09", "сентября":"09",
|
||||
"oktyabrya":"10", "октября":"10",
|
||||
"noyabrya":"11", "ноября":"11",
|
||||
"diekabrya":"12", "декабря":"12" }
|
||||
for (name,num) in fullmon.items():
|
||||
if name in pubdate:
|
||||
pubdate = pubdate.replace(name,num)
|
||||
if name in update:
|
||||
update = update.replace(name,num)
|
||||
|
||||
self.story.setMetadata('dateUpdated', makeDate(update, self.dateformat))
|
||||
self.story.setMetadata('datePublished', makeDate(pubdate, self.dateformat))
|
||||
self.story.setMetadata('language','Russian')
|
||||
|
||||
pr=soup.find('a', href=re.compile(r'/printfic/\w+'))
|
||||
pr='http://'+self.host+pr['href']
|
||||
pr = bs.BeautifulSoup(self._fetchUrl(pr))
|
||||
pr=pr.findAll('div', {'class' : 'part_text'})
|
||||
i=0
|
||||
for part in pr:
|
||||
i=i+len(stripHTML(part).split(' '))
|
||||
self.story.setMetadata('numWords', str(i))
|
||||
|
||||
i=0
|
||||
fandoms = table.findAll('a', href=re.compile(r'/fanfiction/\w+'))
|
||||
for fandom in fandoms:
|
||||
self.story.addToList('category',fandom.string)
|
||||
i=i+1
|
||||
if i > 1:
|
||||
self.story.addToList('genre', 'Кроссовер')
|
||||
|
||||
meta=table.findAll('a', href=re.compile(r'/ratings/'))
|
||||
i=0
|
||||
for m in meta:
|
||||
if i == 0:
|
||||
self.story.setMetadata('rating', m.find('b').text)
|
||||
i=1
|
||||
elif i == 1:
|
||||
if not "," in m.nextSibling:
|
||||
i=2
|
||||
self.story.addToList('genre', m.find('b').text)
|
||||
elif i == 2:
|
||||
self.story.addToList('warnings', m.find('b').text)
|
||||
|
||||
|
||||
if table.find('span', {'style' : 'color: green'}):
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In Progress')
|
||||
|
||||
|
||||
tags = table.findAll('b')
|
||||
for tag in tags:
|
||||
label = translit.translit(tag.text)
|
||||
if 'Piersonazhi:' in label or 'Персонажи:' in label:
|
||||
chars=tag.nextSibling.string.split(', ')
|
||||
for char in chars:
|
||||
self.story.addToList('characters',char)
|
||||
break
|
||||
|
||||
summary=soup.find('span', {'class' : 'urlize'})
|
||||
self.setDescription(url,summary)
|
||||
#self.story.setMetadata('description', summary.text)
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
    """Download one chapter page and return its cleaned-up XHTML text.

    Raises exceptions.FailedToDownload when the chapter <div> cannot be
    found in the fetched page.
    """
    logger.debug('Getting chapter text from: %s' % url)

    soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                 selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

    # Chapters live in one of two differently-classed divs depending on
    # whether public beta-reading is enabled; try both.
    chapter = soup.find('div', {'class' : 'public_beta'})
    if chapter is None:
        chapter = soup.find('div', {'class' : 'public_beta_disabled'})
    if chapter is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    return self.utf8FromSoup(url,chapter)
|
||||
241
fanficdownloader/adapters/adapter_fictionalleyorg.py
Normal file
241
fanficdownloader/adapters/adapter_fictionalleyorg.py
Normal file
|
|
@ -0,0 +1,241 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
    """Validate *url* against the site pattern and record site identity.

    Extracts the 'auth' and 'id' URL groups into story metadata; raises
    exceptions.InvalidStoryURL when the URL does not match.
    """
    BaseSiteAdapter.__init__(self, config, url)
    self.story.setMetadata('siteabbrev','fa')
    self.decode = ["Windows-1252",
                   "utf8"] # 1252 is a superset of iso-8859-1.
                           # Most sites that claim to be
                           # iso-8859-1 (and some that claim to be
                           # utf8) are really windows-1252.
    self.is_adult=False

    # get storyId from url--url validation guarantees query correct
    m = re.match(self.getSiteURLPattern(),url)
    if m:
        self.story.setMetadata('authorId',m.group('auth'))
        self.story.setMetadata('storyId',m.group('id'))

        # normalized story URL.
        self._setURL(url)
    else:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())
|
||||
|
||||
@staticmethod
def getSiteDomain():
    """Primary domain this adapter handles."""
    domain = 'www.fictionalley.org'
    return domain
|
||||
|
||||
@classmethod
def getSiteExampleURLs(cls):
    """Space-separated sample story URLs shown to the user."""
    base = "http://" + cls.getSiteDomain()
    return base + "/authors/drt/DA.html " + base + "/authors/drt/JOTP01a.html"
|
||||
|
||||
def getSiteURLPattern(self):
    """Regexp matching story/chapter URLs; captures 'auth' and 'id' groups."""
    # http://www.fictionalley.org/authors/drt/DA.html
    # http://www.fictionalley.org/authors/drt/JOTP01a.html
    # Raw string: '\.' in a plain string is an invalid escape (warning
    # under Python 3.6+); the pattern itself is unchanged.
    return re.escape("http://"+self.getSiteDomain())+r"/authors/(?P<auth>[a-zA-Z0-9_]+)/(?P<id>[a-zA-Z0-9_]+)\.html"
|
||||
|
||||
def _postFetchWithIAmOld(self,url):
    """Fetch *url*, POSTing the age-check answer first when the user has
    asserted adulthood (is_adult flag or config)."""
    adult_ok = self.is_adult or self.getConfig("is_adult")
    if not adult_ok:
        return self._fetchUrl(url)
    logger.info("Attempting to get cookie for %s" % url)
    ## posting on list doesn't work, but doesn't hurt, either.
    form_fields = {'iamold':'Yes',
                   'action':'ageanswer'}
    return self._postUrl(url, form_fields)
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
    """Populate story metadata and the chapter URL list.

    Handles both entry URL forms (chapter-list page or one-shot chapter
    page), performs the site's adult check, then scrapes the remaining
    metadata from the author's page.
    """
    ## could be either chapter list page or one-shot text page.
    url = self.url
    logger.debug("URL: "+url)

    try:
        data = self._postFetchWithIAmOld(url)
    except urllib2.HTTPError, e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        else:
            raise e

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = bs.BeautifulSoup(data)

    chapterdata = data
    # If chapter list page, get the first chapter to look for adult check
    chapterlinklist = soup.findAll('a',{'class':'chapterlink'})
    if chapterlinklist:
        chapterdata = self._postFetchWithIAmOld(chapterlinklist[0]['href'])

    if "Are you over seventeen years old" in chapterdata:
        raise exceptions.AdultCheckRequired(self.url)

    if not chapterlinklist:
        # no chapter list, chapter URL: change to list link.
        # second a tag inside div breadcrumbs
        storya = soup.find('div',{'class':'breadcrumbs'}).findAll('a')[1]
        self._setURL(storya['href'])
        url=self.url
        logger.debug("Normalizing to URL: "+url)
        ## title's right there...
        self.story.setMetadata('title',stripHTML(storya))
        data = self._fetchUrl(url)
        soup = bs.BeautifulSoup(data)
        chapterlinklist = soup.findAll('a',{'class':'chapterlink'})
    else:
        ## still need title from somewhere. If chapterlinklist,
        ## then chapterdata contains a chapter, find title the
        ## same way.
        chapsoup = bs.BeautifulSoup(chapterdata)
        storya = chapsoup.find('div',{'class':'breadcrumbs'}).findAll('a')[1]
        self.story.setMetadata('title',stripHTML(storya))
        del chapsoup

    del chapterdata

    ## authorid already set in __init__.
    ## <h1 class="title" align="center">Just Off The Platform II by <a href="http://www.fictionalley.org/authors/drt/">DrT</a></h1>
    authora=soup.find('h1',{'class':'title'}).find('a')
    self.story.setMetadata('author',authora.string)
    self.story.setMetadata('authorUrl',authora['href'])

    if len(chapterlinklist) == 1:
        # one-shot: use the story title as the chapter title.
        self.chapterUrls.append((self.story.getMetadata('title'),chapterlinklist[0]['href']))
    else:
        # Find the chapters:
        for chapter in chapterlinklist:
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),chapter['href']))

    self.story.setMetadata('numChapters',len(self.chapterUrls))

    ## Go scrape the rest of the metadata from the author's page,
    ## where each story is listed as <dl><dt><a href=storyUrl>title</a>
    ## by <a class="pen_name">author</a></dt><dd>rating/genre/dates/
    ## word-count lines plus the summary</dd></dl>.
    data = self._fetchUrl(self.story.getMetadata('authorUrl'))
    soup = bs.BeautifulSoup(data)

    storya = soup.find('a',{'href':self.story.getMetadata('storyUrl')})
    storydd = storya.findNext('dd')

    # The <dd> flattens to lines like:
    # Rating: PG - Spoilers: None - 2525 hits - 736 words
    # Genre: Humor - Main character(s): H, R - Ships: None - Era: Multiple Eras
    # <summary text>
    # Published: September 25, 2004 (...) - Updated: September 25, 2004

    ## change to text and regexp find.
    metastr = stripHTML(storydd).replace('\n',' ').replace('\t',' ')

    m = re.match(r".*?Rating: (.+?) -.*?",metastr)
    if m:
        self.story.setMetadata('rating', m.group(1))

    m = re.match(r".*?Genre: (.+?) -.*?",metastr)
    if m:
        for g in m.group(1).split(','):
            self.story.addToList('genre',g)

    m = re.match(r".*?Published: ([a-zA-Z]+ \d\d?, \d\d\d\d).*?",metastr)
    if m:
        self.story.setMetadata('datePublished',makeDate(m.group(1), "%B %d, %Y"))

    m = re.match(r".*?Updated: ([a-zA-Z]+ \d\d?, \d\d\d\d).*?",metastr)
    if m:
        self.story.setMetadata('dateUpdated',makeDate(m.group(1), "%B %d, %Y"))

    m = re.match(r".*? (\d+) words Genre.*?",metastr)
    if m:
        self.story.setMetadata('numWords', m.group(1))

    for small in storydd.findAll('small'):
        small.extract() ## removes the <small> tags, leaving only the summary.
    self.setDescription(url,storydd)
    #self.story.setMetadata('description',stripHTML(storydd))

    return
|
||||
|
||||
def getChapterText(self, url):
    """Download one chapter and return its story text as XHTML.

    FictionAlley marks the story body only with HTML comments, so the
    comments are rewritten into a synthetic element pair before parsing.
    """
    logger.debug('Getting chapter text from: %s' % url)

    data = self._fetchUrl(url)
    # find <!-- headerend --> & <!-- footerstart --> and
    # replaced with matching div pair for easier parsing.
    # Yes, it's an evil kludge, but what can ya do? Using
    # something other than div prevents soup from pairing
    # our div with poor html inside the story text.
    data = data.replace('<!-- headerend -->','<crazytagstringnobodywouldstumbleonaccidently id="storytext">').replace('<!-- footerstart -->','</crazytagstringnobodywouldstumbleonaccidently>')

    # problems with some stories confusing Soup. This is a nasty
    # hack, but it works.
    data = data[data.index("<crazytagstringnobodywouldstumbleonaccidently"):]

    soup = bs.BeautifulStoneSoup(data,
                                 selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
    body = soup.findAll('body') ## some stories use a nested body and body
                                ## tag, in which case we don't
                                ## need crazytagstringnobodywouldstumbleonaccidently
                                ## and use the second one instead.
    if len(body)>1:
        text = body[1]
        text.name='div' # force to be a div to avoid multiple body tags.
    else:
        text = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storytext'})
        text.name='div' # change to div tag.

    if not data or not text:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    # not sure how, but we can get html, etc tags still in some
    # stories. That breaks later updates because it confuses
    # epubutils.py
    for tag in text.findAll('head'):
        tag.extract()

    for tag in text.findAll('body') + text.findAll('html'):
        tag.name = 'div'

    return self.utf8FromSoup(url,text)
|
||||
|
||||
def getClass():
    """Adapter-registry entry point: return this module's adapter class."""
    return FictionAlleyOrgSiteAdapter
|
||||
|
||||
178
fanficdownloader/adapters/adapter_fictionmaniatv.py
Normal file
178
fanficdownloader/adapters/adapter_fictionmaniatv.py
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
import re
|
||||
import urllib2
|
||||
import urlparse
|
||||
|
||||
from .. import BeautifulSoup
|
||||
from ..BeautifulSoup import NavigableString
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
from .. import exceptions
|
||||
|
||||
|
||||
def getClass():
    """Adapter-registry entry point: return this module's adapter class."""
    return FictionManiaTVAdapter
|
||||
|
||||
|
||||
def _get_query_data(url):
    """Return *url*'s query parameters as a flat dict, keeping only the
    first value of each parameter."""
    query = urlparse.urlparse(url).query
    flattened = {}
    for name, values in urlparse.parse_qs(query).items():
        flattened[name] = values[0]
    return flattened
|
||||
|
||||
# yields Tag _and_ NavigableString siblings from the given tag. The
|
||||
# BeautifulSoup findNextSiblings() method for some reasons only returns either
|
||||
# NavigableStrings _or_ Tag objects, not both.
|
||||
def _yield_next_siblings(tag):
|
||||
sibling = tag.nextSibling
|
||||
while sibling:
|
||||
yield sibling
|
||||
sibling = sibling.nextSibling
|
||||
|
||||
|
||||
class FictionManiaTVAdapter(BaseSiteAdapter):
|
||||
SITE_ABBREVIATION = 'fmt'
|
||||
SITE_DOMAIN = 'fictionmania.tv'
|
||||
|
||||
BASE_URL = 'http://' + SITE_DOMAIN + '/stories/'
|
||||
READ_TEXT_STORY_URL_TEMPLATE = BASE_URL + 'readtextstory.html?storyID=%s'
|
||||
DETAILS_URL_TEMPLATE = BASE_URL + 'details.html?storyID=%s'
|
||||
|
||||
DATETIME_FORMAT = '%m/%d/%Y'
|
||||
ALTERNATIVE_DATETIME_FORMAT = '%m/%d/%y'
|
||||
|
||||
def __init__(self, config, url):
    """Extract the storyID query parameter and normalize the story URL
    to the read-text form."""
    BaseSiteAdapter.__init__(self, config, url)

    query_data = urlparse.parse_qs(self.parsedUrl.query)
    story_id = query_data['storyID'][0]

    self.story.setMetadata('storyId', story_id)
    self._setURL(self.READ_TEXT_STORY_URL_TEMPLATE % story_id)
    self.story.setMetadata('siteabbrev', self.SITE_ABBREVIATION)

    # Always single chapters, probably should use the Anthology feature to
    # merge chapters of a story
    self.story.setMetadata('numChapters', 1)
|
||||
|
||||
def _customized_fetch_url(self, url, exception=None, parameters=None):
    """Fetch *url* and parse the response into a BeautifulSoup tree.

    When *exception* is given, an HTTPError is re-raised as
    exception(self.url); otherwise _fetchUrl's own error propagates.
    """
    if not exception:
        # Just let self._fetchUrl throw the exception, don't catch and
        # customize it.
        raw = self._fetchUrl(url, parameters)
    else:
        try:
            raw = self._fetchUrl(url, parameters)
        except urllib2.HTTPError:
            raise exception(self.url)
    return BeautifulSoup.BeautifulSoup(raw)
|
||||
|
||||
@staticmethod
def getSiteDomain():
    """Primary domain this adapter handles."""
    return FictionManiaTVAdapter.SITE_DOMAIN
|
||||
|
||||
@classmethod
def getSiteExampleURLs(cls):
    """A sample story URL built from the read-text template."""
    sample_story_id = 1234
    return cls.READ_TEXT_STORY_URL_TEMPLATE % sample_story_id
|
||||
|
||||
def getSiteURLPattern(self):
    """Regexp matching both the read-text and details URL forms."""
    # Raw string: '\.', '\?' and '\d' in a plain string are invalid
    # escapes (warnings under Python 3.6+); the pattern is unchanged.
    return re.escape(self.BASE_URL) + r'(readtextstory|details)\.html\?storyID=\d+$'
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
    """Scrape the story's details page and map its key/value table rows
    into story metadata.

    Each table row is '<b>Key:</b>' in the first cell and the value in
    the second; unknown keys are silently ignored.
    """
    url = self.DETAILS_URL_TEMPLATE % self.story.getMetadata('storyId')
    soup = self._customized_fetch_url(url)

    keep_summary_html = self.getConfig('keep_summary_html')
    for row in soup.find('table')('tr'):
        cells = row('td')
        key = cells[0].b.string.strip(':')
        try:
            value = cells[1].string
        except AttributeError:
            # cell contains markup rather than a bare string.
            value = None

        if key == 'Story Name-Title':
            self.story.setMetadata('title', value)
            self.chapterUrls.append((value, self.url))

        elif key == 'File Name':
            self.story.setMetadata('fileName', value)

        elif key == 'File Size':
            self.story.setMetadata('fileSize', value)

        elif key == 'Author':
            element = cells[1].a
            self.story.setMetadata('author', element.string)
            query_data = _get_query_data(element['href'])
            self.story.setMetadata('authorId', query_data['word'])
            self.story.setMetadata('authorUrl', urlparse.urljoin(url, element['href']))

        elif key == 'Date Added':
            # the site uses both 4-digit and 2-digit year formats.
            try:
                date = makeDate(value, self.DATETIME_FORMAT)
            except ValueError:
                date = makeDate(value, self.ALTERNATIVE_DATETIME_FORMAT)
            self.story.setMetadata('datePublished', date)

        elif key == 'Old Name':
            self.story.setMetadata('oldName', value)

        elif key == 'New Name':
            self.story.setMetadata('newName', value)

        elif key == 'Other Key Names':
            for name in value.split(', '):
                self.story.addToList('characters', name)

        # I have no clue how the rating system works, if you are reading
        # transgender fanfiction, you are probably an adult.
        elif key == 'Rating':
            self.story.setMetadata('rating', value)

        elif key == 'Complete':
            self.story.setMetadata('status', 'Complete' if value == 'Complete' else 'In-Progress')

        elif key == 'Categories':
            for element in cells[1]('a'):
                self.story.addToList('category', element.string)

        elif key == 'Key Words':
            for element in cells[1]('a'):
                self.story.addToList('keyWords', element.string)

        elif key == 'Main Characters Age':
            element = cells[1].a
            self.story.setMetadata('mainCharactersAge', element.string)

        elif key == 'Synopsis':
            element = cells[1]

            # Replace td with div to avoid possible strange formatting in
            # the ebook later on
            element.name = 'div'

            if keep_summary_html:
                self.story.setMetadata('description', unicode(element))
            else:
                self.story.setMetadata('description', ''.join(element(text=True)))

        elif key == 'Reads':
            self.story.setMetadata('readings', value)
|
||||
|
||||
def getChapterText(self, url):
    """Download one chapter and return it as simple HTML."""
    soup = self._customized_fetch_url(url)
    element = soup.find('pre')
    element.name = 'div'

    # The story's content is contained in a <pre> tag, probably taken 1:1
    # from the source text file. A simple replacement of all newline
    # characters with a break line tag should take care of formatting.

    # While wrapping in paragraphs would be possible, it's too much work,
    # I'd rather display the story 1:1 like it was found in the pre tag.
    content = unicode(element)
    content = content.replace('\n', '<br />')

    if self.getConfig('non_breaking_spaces'):
        # NOTE(review): this line appears garbled in this copy of the
        # source -- presumably it substitutes non-breaking spaces for
        # runs of spaces; verify the replacement text against upstream.
        content = content.replace(' ', ' ')
    return content
|
||||
194
fanficdownloader/adapters/adapter_fictionpadcom.py
Normal file
194
fanficdownloader/adapters/adapter_fictionpadcom.py
Normal file
|
|
@ -0,0 +1,194 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2013 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
import time
|
||||
import json
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
#from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class FictionPadSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
    """Validate *url* against the site pattern and normalize it to the
    https://fictionpad.com/author/<author>/stories/<id> form.

    Raises exceptions.InvalidStoryURL when the URL does not match.
    """
    BaseSiteAdapter.__init__(self, config, url)
    self.story.setMetadata('siteabbrev','fpad')
    self.dateformat = "%Y-%m-%dT%H:%M:%SZ"
    self.is_adult=False
    self.username = None
    self.password = None
    # get storyId from url--url validation guarantees query correct
    m = re.match(self.getSiteURLPattern(),url)
    if m:
        self.story.setMetadata('storyId',m.group('id'))

        # normalized story URL.
        self._setURL("https://"+self.getSiteDomain()
                     +"/author/"+m.group('author')
                     +"/stories/"+self.story.getMetadata('storyId'))
    else:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())
|
||||
|
||||
@staticmethod
def getSiteDomain():
    """Primary domain this adapter handles."""
    site = 'fictionpad.com'
    return site
|
||||
|
||||
@classmethod
def getSiteExampleURLs(cls):
    """A sample story URL for this site."""
    example = "https://fictionpad.com/author/Author/stories/1234/Some-Title"
    return example
|
||||
|
||||
def getSiteURLPattern(self):
    """Regexp for story URLs; captures 'author' and numeric 'id' groups."""
    # e.g. http://fictionpad.com/author/Serdd/stories/4275
    pattern = r"http(s)?://(www\.)?fictionpad\.com/author/(?P<author>[^/]+)/stories/(?P<id>\d+)"
    return pattern
|
||||
|
||||
# <form method="post" action="/signin">
|
||||
# <input name="authenticity_token" type="hidden" value="u+cfdXh46dRnwVnSlmE2B2BFmHgu760paqgBG6KQeos=" />
|
||||
# <input type="hidden" name="remember" value="1">
|
||||
# <strong class="help-start text-center">or with FictionPad</strong>
|
||||
# <label class="control-label hidden-placeholder">Pseudonym or Email Address</label>
|
||||
# <input name="login" class="input-block-level" type="text" placeholder="Pseudonym or Email Address" maxlength="50" required autofocus>
|
||||
# <label class="control-label hidden-placeholder">Password</label>
|
||||
# <input name="password" class="input-block-level" type="password" placeholder="Password" minlength="6" required>
|
||||
# <button type="submit" class="btn btn-primary btn-block">Sign In</button>
|
||||
# <p class="help-end">
|
||||
# <a href="/passwordreset">Forgot your password?</a>
|
||||
# </p>
|
||||
# </form>
|
||||
def performLogin(self):
    """Sign in to fictionpad.com using explicit credentials when set,
    falling back to the username/password config entries.

    Raises exceptions.FailedToLogin when the site rejects the login.
    """
    params = {}

    if self.password:
        params['login'] = self.username
        params['password'] = self.password
    else:
        params['login'] = self.getConfig("username")
        params['password'] = self.getConfig("password")
    params['remember'] = '1'

    loginUrl = 'http://' + self.getSiteDomain() + '/signin'
    logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                         params['login']))

    ## need to pull empty login page first to get authenticity_token
    soup = bs.BeautifulSoup(self._fetchUrl(loginUrl))
    params['authenticity_token']=soup.find('input', {'name':'authenticity_token'})['value']

    data = self._postUrl(loginUrl, params)

    if "Invalid email/pseudonym and password combination." in data:
        logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                         params['login']))
        raise exceptions.FailedToLogin(loginUrl,params['login'])
|
||||
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
# fetch the chapter. From that we will get almost all the
|
||||
# metadata and chapter list
|
||||
|
||||
url=self.url
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
if "This is a mature story. Please sign in to read it." in data:
|
||||
self.performLogin()
|
||||
data = self._fetchUrl(url)
|
||||
|
||||
find = "wordyarn.config.page = "
|
||||
data = data[data.index(find)+len(find):]
|
||||
data = data[:data.index("</script>")]
|
||||
data = data[:data.rindex(";")]
|
||||
data = data.replace('tables:','"tables":')
|
||||
tables = json.loads(data)['tables']
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
# looks like only one author per story allowed.
|
||||
author = tables['users'][0]
|
||||
story = tables['stories'][0]
|
||||
story_ver = tables['story_versions'][0]
|
||||
print("story:%s"%story)
|
||||
|
||||
self.story.setMetadata('authorId',author['id'])
|
||||
self.story.setMetadata('author',author['display_name'])
|
||||
self.story.setMetadata('authorUrl','https://'+self.host+'/author/'+author['display_name']+'/stories')
|
||||
|
||||
self.story.setMetadata('title',story_ver['title'])
|
||||
self.setDescription(url,story_ver['description'])
|
||||
|
||||
if not ('assets/story_versions/covers' in story_ver['profile_image_url@2x']):
|
||||
self.setCoverImage(url,story_ver['profile_image_url@2x'])
|
||||
|
||||
self.story.setMetadata('datePublished',makeDate(story['published_at'], self.dateformat))
|
||||
self.story.setMetadata('dateUpdated',makeDate(story['published_at'], self.dateformat))
|
||||
|
||||
self.story.setMetadata('followers',story['followers_count'])
|
||||
self.story.setMetadata('comments',story['comments_count'])
|
||||
self.story.setMetadata('views',story['views_count'])
|
||||
self.story.setMetadata('likes',int(story['likes'])) # no idea why they floated these.
|
||||
if 'dislikes' in story:
|
||||
self.story.setMetadata('dislikes',int(story['dislikes']))
|
||||
|
||||
if story_ver['is_complete']:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
self.story.setMetadata('rating', story_ver['maturity_level'])
|
||||
self.story.setMetadata('numWords', unicode(story_ver['word_count']))
|
||||
|
||||
for i in tables['fandoms']:
|
||||
self.story.addToList('category',i['name'])
|
||||
|
||||
for i in tables['genres']:
|
||||
self.story.addToList('genre',i['name'])
|
||||
|
||||
for i in tables['characters']:
|
||||
self.story.addToList('characters',i['name'])
|
||||
|
||||
for c in tables['chapters']:
|
||||
chtitle = "Chapter %d"%c['number']
|
||||
if c['title']:
|
||||
chtitle += " - %s"%c['title']
|
||||
self.chapterUrls.append((chtitle,c['body_url']))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
def getChapterText(self, url):
    """Download one chapter body (a bare HTML fragment URL from the JSON
    model) and return it wrapped in a story div; empty chapters get a
    placeholder."""
    logger.debug('Getting chapter text from: %s' % url)
    if not url:
        data = u"<em>This chapter has no text.</em>"
    else:
        data = self._fetchUrl(url)
    soup = bs.BeautifulSoup(u"<div id='story'>"+data+u"</div>")
    return self.utf8FromSoup(url,soup)
|
||||
|
||||
def getClass():
    """Adapter-registry entry point: return this module's adapter class."""
    return FictionPadSiteAdapter
|
||||
|
||||
51
fanficdownloader/adapters/adapter_fictionpresscom.py
Normal file
51
fanficdownloader/adapters/adapter_fictionpresscom.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
import time
|
||||
|
||||
## They're from the same people and pretty much identical.
|
||||
from adapter_fanfictionnet import FanFictionNetSiteAdapter
|
||||
|
||||
class FictionPressComSiteAdapter(FanFictionNetSiteAdapter):
    """Adapter for fictionpress.com.

    FictionPress is run by the same people as fanfiction.net and the
    two sites are essentially identical, so this subclasses the ffnet
    adapter and only overrides the site-identity bits.
    """

    def __init__(self, config, url):
        FanFictionNetSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','fpcom')

    @staticmethod
    def getSiteDomain():
        return 'www.fictionpress.com'

    @classmethod
    def getAcceptDomains(cls):
        return ['www.fictionpress.com','m.fictionpress.com']

    @classmethod
    def getSiteExampleURLs(cls):
        return "https://www.fictionpress.com/s/1234/1/ https://www.fictionpress.com/s/1234/12/ http://www.fictionpress.com/s/1234/1/Story_Title http://m.fictionpress.com/s/1234/1/"

    def getSiteURLPattern(self):
        # Group the optional subdomain WITH its dot: the previous
        # r"(www|m)?\." still required a leading dot when no subdomain
        # was present, so bare http://fictionpress.com/s/... URLs could
        # never match. All previously-accepted URLs still match.
        return r"https?://((www|m)\.)?fictionpress\.com/s/\d+(/\d+)?(/|/[a-zA-Z0-9_-]+)?/?$"
|
||||
|
||||
def getClass():
    """Adapter-registry entry point: return this module's adapter class."""
    return FictionPressComSiteAdapter
|
||||
|
||||
237
fanficdownloader/adapters/adapter_ficwadcom.py
Normal file
237
fanficdownloader/adapters/adapter_ficwadcom.py
Normal file
|
|
@ -0,0 +1,237 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
import time
|
||||
import httplib, urllib
|
||||
|
||||
#from .. import BeautifulSoup as bs
|
||||
import bs4 as bs
|
||||
|
||||
from .. import exceptions as exceptions
|
||||
from ..htmlcleanup import stripHTML
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class FicwadComSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
    """Record site identity and pull the storyId out of the URL path.

    URL-pattern validation guarantees the second path segment is the
    story id.
    """
    BaseSiteAdapter.__init__(self, config, url)
    self.story.setMetadata('siteabbrev','fw')

    # get storyId from url--url validation guarantees second part is storyId
    self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])

    # placeholder credentials; real ones come from config or caller.
    self.username = "NoneGiven"
    self.password = ""
|
||||
|
||||
@staticmethod
def getSiteDomain():
    """Primary domain this adapter handles."""
    domain = 'ficwad.com'
    return domain
|
||||
|
||||
@classmethod
def getSiteExampleURLs(cls):
    """A sample story URL for this site."""
    example = "http://ficwad.com/story/1234"
    return example
|
||||
|
||||
def getSiteURLPattern(self):
    """Regexp matching canonical story-list URLs (http://ficwad.com/story/<id>)."""
    # Raw string on the path part: '\d' in a plain string is an invalid
    # escape (warning under Python 3.6+); the pattern is unchanged.
    return re.escape(r"http://"+self.getSiteDomain())+r"/story/\d+?$"
|
||||
|
||||
def performLogin(self,url):
    """Log in to ficwad.com with explicit credentials when set, else the
    username/password config entries.

    Returns True on success; raises exceptions.FailedToLogin on failure.
    """
    params = {}

    if self.password:
        params['username'] = self.username
        params['password'] = self.password
    else:
        params['username'] = self.getConfig("username")
        params['password'] = self.getConfig("password")

    loginUrl = 'http://' + self.getSiteDomain() + '/account/login'
    logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                         params['username']))
    d = self._postUrl(loginUrl,params,usecache=False)

    if "Login attempt failed..." in d:
        logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                         params['username']))
        # (removed an unreachable 'return False' that followed this raise)
        raise exceptions.FailedToLogin(url,params['username'])
    else:
        return True
|
||||
|
||||
def use_pagecache(self):
    """Opt in to the shared page cache.

    Adapters that work correctly with the page cache override this to
    return True; this one does.
    """
    return True
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
# fetch the chapter. From that we will get almost all the
|
||||
# metadata and chapter list
|
||||
|
||||
url = self.url
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
# non-existent/removed story urls get thrown to the front page.
|
||||
if "<h2>Welcome to FicWad</h2>" in data:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
soup = bs.BeautifulSoup(data, "html5lib")
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
# if blocked, attempt login.
|
||||
if soup.find("div",{"class":"blocked"}):
|
||||
if self.performLogin(url): # performLogin raises
|
||||
# FailedToLogin if it fails.
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url,usecache=False), "html5lib")
|
||||
|
||||
divstory = soup.find('div',id='story')
|
||||
storya = divstory.find('a',href=re.compile("^/story/\d+$"))
|
||||
if storya : # if there's a story link in the divstory header, this is a chapter page.
|
||||
# normalize story URL on chapter list.
|
||||
self.story.setMetadata('storyId',storya['href'].split('/',)[2])
|
||||
url = "http://"+self.getSiteDomain()+storya['href']
|
||||
logger.debug("Normalizing to URL: "+url)
|
||||
self._setURL(url)
|
||||
try:
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url), "html5lib")
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
# if blocked, attempt login.
|
||||
if soup.find("div",{"class":"blocked"}):
|
||||
if self.performLogin(url): # performLogin raises
|
||||
# FailedToLogin if it fails.
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url,usecache=False), "html5lib")
|
||||
|
||||
# title - first h4 tag will be title.
|
||||
titleh4 = soup.find('div',{'class':'storylist'}).find('h4')
|
||||
self.story.setMetadata('title', stripHTML(titleh4.a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('span',{'class':'author'}).find('a', href=re.compile(r"^/author/\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('/')[2])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# description
|
||||
storydiv = soup.find("div",{"id":"story"})
|
||||
self.setDescription(url,storydiv.find("blockquote",{'class':'summary'}).p)
|
||||
#self.story.setMetadata('description', storydiv.find("blockquote",{'class':'summary'}).p.string)
|
||||
|
||||
# most of the meta data is here:
|
||||
metap = storydiv.find("p",{"class":"meta"})
|
||||
self.story.addToList('category',metap.find("a",href=re.compile(r"^/category/\d+")).string)
|
||||
|
||||
# warnings
|
||||
# <span class="req"><a href="/help/38" title="Medium Spoilers">[!!] </a> <a href="/help/38" title="Rape/Sexual Violence">[R] </a> <a href="/help/38" title="Violence">[V] </a> <a href="/help/38" title="Child/Underage Sex">[Y] </a></span>
|
||||
spanreq = metap.find("span",{"class":"story-warnings"})
|
||||
if spanreq: # can be no warnings.
|
||||
for a in spanreq.findAll("a"):
|
||||
self.story.addToList('warnings',a['title'])
|
||||
|
||||
## perhaps not the most efficient way to parse this, using
|
||||
## regexps for each rather than something more complex, but
|
||||
## IMO, it's more readable and amenable to change.
|
||||
metastr = stripHTML(str(metap)).replace('\n',' ').replace('\t',' ')
|
||||
#print "metap: (%s)"%metastr
|
||||
|
||||
m = re.match(r".*?Rating: (.+?) -.*?",metastr)
|
||||
if m:
|
||||
self.story.setMetadata('rating', m.group(1))
|
||||
|
||||
m = re.match(r".*?Genres: (.+?) -.*?",metastr)
|
||||
if m:
|
||||
for g in m.group(1).split(','):
|
||||
self.story.addToList('genre',g)
|
||||
|
||||
m = re.match(r".*?Characters: (.*?) -.*?",metastr)
|
||||
if m:
|
||||
for g in m.group(1).split(','):
|
||||
if g:
|
||||
self.story.addToList('characters',g)
|
||||
|
||||
m = re.match(r".*?Published: ([0-9-]+?) -.*?",metastr)
|
||||
if m:
|
||||
self.story.setMetadata('datePublished',makeDate(m.group(1), "%Y-%m-%d"))
|
||||
|
||||
# Updated can have more than one space after it. <shrug>
|
||||
m = re.match(r".*?Updated: ([0-9-]+?) +-.*?",metastr)
|
||||
if m:
|
||||
self.story.setMetadata('dateUpdated',makeDate(m.group(1), "%Y-%m-%d"))
|
||||
|
||||
m = re.match(r".*? - ([0-9,]+?) words.*?",metastr)
|
||||
if m:
|
||||
self.story.setMetadata('numWords',m.group(1))
|
||||
|
||||
if metastr.endswith("Complete"):
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
# get the chapter list first this time because that's how we
|
||||
# detect the need to login.
|
||||
storylistul = soup.find('ul',{'class':'storylist'})
|
||||
if not storylistul:
|
||||
# no list found, so it's a one-chapter story.
|
||||
self.chapterUrls.append((self.story.getMetadata('title'),url))
|
||||
else:
|
||||
chapterlistlis = storylistul.findAll('li')
|
||||
for chapterli in chapterlistlis:
|
||||
if "blocked" in chapterli['class']:
|
||||
# paranoia check. We should already be logged in by now.
|
||||
raise exceptions.FailedToLogin(url,self.username)
|
||||
else:
|
||||
#print "chapterli.h4.a (%s)"%chapterli.h4.a
|
||||
self.chapterUrls.append((chapterli.h4.a.string,
|
||||
u'http://%s%s'%(self.getSiteDomain(),
|
||||
chapterli.h4.a['href'])))
|
||||
#print "self.chapterUrls:%s"%self.chapterUrls
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
return
|
||||
|
||||
|
||||
def getChapterText(self, url):
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url), "html5lib")
|
||||
|
||||
span = soup.find('div', {'id' : 'storytext'})
|
||||
|
||||
if None == span:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,span)
|
||||
|
||||
def getClass():
    # Module-level factory hook: the adapter loader imports this module and
    # calls getClass() to obtain the adapter class implemented here.
    return FicwadComSiteAdapter
|
||||
|
||||
338
fanficdownloader/adapters/adapter_fimfictionnet.py
Normal file
338
fanficdownloader/adapters/adapter_fimfictionnet.py
Normal file
|
|
@ -0,0 +1,338 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
import cookielib as cl
|
||||
import json
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    # Module-level factory hook: the adapter loader imports this module and
    # calls getClass() to obtain the adapter class implemented here.
    return FimFictionNetSiteAdapter
|
||||
|
||||
class FimFictionNetSiteAdapter(BaseSiteAdapter):
    """Adapter for downloading stories from www.fimfiction.net.

    Scrapes the story title page for metadata and the chapter list, and
    fetches individual chapter pages on demand.  Page structure is matched
    against the post-2014-10-27 site layout (see comments below).
    """

    def __init__(self, config, url):
        # Normalize the incoming URL to http://www.fimfiction.net/story/<id>/
        # so caching and example URLs all agree on one canonical form.
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','fimficnet')
        self.story.setMetadata('storyId', self.parsedUrl.path.split('/',)[2])
        self._setURL("http://"+self.getSiteDomain()+"/story/"+self.story.getMetadata('storyId')+"/")
        # Set True once the user has confirmed the adult-content check.
        self.is_adult = False

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d %b %Y"

    @staticmethod
    def getSiteDomain():
        # Canonical domain used to build all story/chapter URLs.
        return 'www.fimfiction.net'

    @classmethod
    def getAcceptDomains(cls):
        # mobile.fimifction.com isn't actually a valid domain, but we can still get the story id from URLs anyway
        return ['www.fimfiction.net','mobile.fimfiction.net', 'www.fimfiction.com', 'mobile.fimfiction.com']

    @classmethod
    def getSiteExampleURLs(cls):
        # Space-separated list shown to users as accepted URL shapes.
        return "http://www.fimfiction.net/story/1234/story-title-here http://www.fimfiction.net/story/1234/ http://www.fimfiction.com/story/1234/1/ http://mobile.fimfiction.net/story/1234/1/story-title-here/chapter-title-here"

    def getSiteURLPattern(self):
        # Regex used by the framework to decide whether this adapter
        # handles a given URL.
        return r"https?://(www|mobile)\.fimfiction\.(net|com)/story/\d+/?.*"

    def use_pagecache(self):
        '''
        adapters that will work with the page cache need to implement
        this and change it to True.
        '''
        return True

    def doExtractChapterUrlsAndMetadata(self,get_cover=True):
        """Fetch the story title page and populate metadata + chapter list.

        Raises StoryDoesNotExist on 404/invalid story, AdultCheckRequired
        when the mature-content interstitial appears, and FailedToDownload /
        FailedToLogin for password-protected stories depending on config.
        """
        if self.is_adult or self.getConfig("is_adult"):
            # Pre-set the site's 'view_mature' cookie so mature-rated
            # stories are served without the interstitial page.
            cookie = cl.Cookie(version=0, name='view_mature', value='true',
                               port=None, port_specified=False,
                               domain=self.getSiteDomain(), domain_specified=False, domain_initial_dot=False,
                               path='/story', path_specified=True,
                               secure=False,
                               expires=time.time()+10000,
                               discard=False,
                               comment=None,
                               comment_url=None,
                               rest={'HttpOnly': None},
                               rfc2109=False)
            self.cookiejar.set_cookie(cookie)

        ##---------------------------------------------------------------------------------------------------
        ## Get the story's title page. Check if it exists.

        try:
            # don't use cache if manual is_adult--should only happen
            # if it's an adult story and they don't have is_adult in ini.
            data = self.do_fix_blockquotes(self._fetchUrl(self.url,
                                                          usecache=(not self.is_adult)))
            soup = bs.BeautifulSoup(data)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        # Server-side error text the site emits for bad story ids.
        if "Warning: mysql_fetch_array(): supplied argument is not a valid MySQL result resource" in data:
            raise exceptions.StoryDoesNotExist(self.url)

        if "This story has been marked as having adult content. Please click below to confirm you are of legal age to view adult material in your country." in data:
            raise exceptions.AdultCheckRequired(self.url)

        if self.password:
            # Re-fetch the page POSTing the story password.
            params = {}
            params['password'] = self.password
            data = self._postUrl(self.url, params)
            soup = bs.BeautifulSoup(data)

        # If the password form is still present, the password was missing
        # or wrong.
        if not (soup.find('form', {'id' : 'password_form'}) == None):
            if self.getConfig('fail_on_password'):
                raise exceptions.FailedToDownload("%s requires story password and fail_on_password is true."%self.url)
            else:
                raise exceptions.FailedToLogin(self.url,"Story requires individual password",passwdonly=True)

        ##----------------------------------------------------------------------------------------------------
        ## Extract metadata

        storyContentBox = soup.find('div', {'class':'story_content_box'})

        # Title
        title = storyContentBox.find('a', {'class':re.compile(r'.*\bstory_name\b.*')})
        self.story.setMetadata('title',stripHTML(title))

        # Author
        author = storyContentBox.find('span', {'class':'author'})
        self.story.setMetadata("author", stripHTML(author))
        #No longer seems to be a way to access Fimfiction's internal author ID
        self.story.setMetadata("authorId", self.story.getMetadata("author"))
        self.story.setMetadata("authorUrl", "http://%s/user/%s" % (self.getSiteDomain(), stripHTML(author)))

        #Rating text is replaced with full words for historical compatibility after the site changed
        #on 2014-10-27
        # NOTE(review): chained single-letter replace assumes the scraped
        # rating is exactly one of "E"/"T"/"M"; a longer rating string would
        # be mangled -- confirm against the live page markup.
        rating = stripHTML(storyContentBox.find('a', {'class':re.compile(r'.*\bcontent-rating-.*')}))
        rating = rating.replace("E", "Everyone").replace("T", "Teen").replace("M", "Mature")
        self.story.setMetadata("rating", rating)

        # Chapters
        for chapter in storyContentBox.findAll('a',{'class':'chapter_link'}):
            self.chapterUrls.append((stripHTML(chapter), 'http://'+self.host+chapter['href']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # Status
        # In the case of Fimfiction, possible statuses are 'Completed', 'Incomplete', 'On Hiatus' and 'Cancelled'
        # For the sake of bringing it in line with the other adapters, 'Incomplete' becomes 'In-Progress'
        # and 'Complete' becomes 'Completed'. 'Cancelled' and 'On Hiatus' are passed through, it's easy now for users
        # to change/remove if they want with replace_metadata
        status = stripHTML(storyContentBox.find('span', {'class':re.compile(r'.*\bcompleted-status-.*')}))
        status = status.replace("Incomplete", "In-Progress").replace("Complete", "Completed")
        self.story.setMetadata("status", status)

        # Genres and Warnings
        # warnings were folded into general categories in the 2014-10-27 site update
        categories = storyContentBox.findAll('a', {'class':re.compile(r'.*\bstory_category\b.*')})
        for category in categories:
            category = stripHTML(category)
            if category == "Gore" or category == "Sex":
                self.story.addToList('warnings', category)
            else:
                self.story.addToList('genre', category)

        # Word count
        wordCountText = stripHTML(storyContentBox.find('li', {'class':'bottom'}).find('div', {'class':'word_count'}))
        # Strip thousands separators / non-digits before storing.
        self.story.setMetadata("numWords", re.sub(r'[^0-9]', '', wordCountText))

        # Cover image
        storyImage = storyContentBox.find('div', {'class':'story_image'})
        if storyImage:
            coverurl = storyImage.find('a')['href']
            if coverurl.startswith('//'): # fix for img urls missing 'http:'
                coverurl = "http:"+coverurl
            if get_cover:
                self.setCoverImage(self.url,coverurl)

            coverSource = storyImage.find('a', {'class':'source'})
            if coverSource:
                self.story.setMetadata('coverSourceUrl', coverSource['href'])
                #There's no text associated with the cover source link, so just
                #reuse the URL. Makes it clear it's an external link leading
                #outside of the fanfic site, at least.
                self.story.setMetadata('coverSource', coverSource['href'])

        # fimf has started including extra stuff inside the description div.
        # Keep only what follows the first <hr /> separator.
        descdivstr = u"%s"%storyContentBox.find("div", {"class":"description"})
        hrstr=u"<hr />"
        descdivstr = u'<div class="description">'+descdivstr[descdivstr.index(hrstr)+len(hrstr):]
        self.setDescription(self.url,descdivstr)

        # Find the newest and oldest chapter dates
        storyData = storyContentBox.find('div', {'class':'story_data'})
        oldestChapter = None
        newestChapter = None
        self.newestChapterNum = None # save for comparing during update.
        # Scan all chapters to find the oldest and newest, on
        # FiMFiction it's possible for authors to insert new chapters
        # out-of-order or change the dates of earlier ones by editing
        # them--That WILL break epub update.
        for index, chapterDate in enumerate(storyData.findAll('span', {'class':'date'})):
            chapterDate = self.ordinal_date_string_to_date(chapterDate.contents[1])
            if oldestChapter == None or chapterDate < oldestChapter:
                oldestChapter = chapterDate
            if newestChapter == None or chapterDate > newestChapter:
                newestChapter = chapterDate
                self.newestChapterNum = index

        if newestChapter is None:
            #this will only be true when updating metadata for stories that have 0 chapters
            #there is a "last modified" date given on the page, extract it and use that.
            moddatetag = storyContentBox.find('span', {'class':'last_modified'})
            if not moddatetag is None:
                newestChapter = self.ordinal_date_string_to_date(moddatetag('span')[1].text)

        # Date updated
        self.story.setMetadata("dateUpdated", newestChapter)

        # Date published
        # falls back to oldest chapter date for stories that haven't been officially published yet
        pubdatetag = storyContentBox.find('span', {'class':'date_approved'})
        if pubdatetag is None:
            if oldestChapter is None:
                #this will only be true when updating metadata for stories that have 0 chapters
                #and that have never been officially published - a rare occurrence. Fall back to last
                #modified date as the publication date, it's all that we've got.
                self.story.setMetadata("datePublished", newestChapter)
            else:
                self.story.setMetadata("datePublished", oldestChapter)
        else:
            pubDate = self.ordinal_date_string_to_date(pubdatetag('span')[1].text)
            self.story.setMetadata("datePublished", pubDate)

        # Characters
        chars = storyContentBox.find("div", {"class":"extra_story_data"})
        for character in chars.findAll("a", {"class":"character_icon"}):
            self.story.addToList("characters", character['title'])

        # Likes and dislikes
        storyToolbar = soup.find('div', {'class':'story-toolbar'})
        likes = storyToolbar.find('span', {'class':'likes'})
        if not likes is None:
            self.story.setMetadata("likes", stripHTML(likes))
        dislikes = storyToolbar.find('span', {'class':'dislikes'})
        if not dislikes is None:
            self.story.setMetadata("dislikes", stripHTML(dislikes))

        # Highest view for a chapter and total views
        viewSpan = storyToolbar.find('span', {'title':re.compile(r'.*\btotal views\b.*')})
        self.story.setMetadata("views", re.sub(r'[^0-9]', '', stripHTML(viewSpan)))
        self.story.setMetadata("total_views", re.sub(r'[^0-9]', '', viewSpan['title']))

        # Comment count
        commentSpan = storyToolbar.find('span', {'title':re.compile(r'.*\bcomments\b.*')})
        self.story.setMetadata("comment_count", re.sub(r'[^0-9]', '', stripHTML(commentSpan)))

        # Short description
        descriptionMeta = soup.find('meta', {'property':'og:description'})
        self.story.setMetadata("short_description", stripHTML(descriptionMeta['content']))

        #groups
        # When the group list is truncated behind a "view all" button,
        # fetch the full list via the site's ajax endpoint instead.
        if soup.find('button', {'id':'button-view-all-groups'}):
            groupResponse = self._fetchUrl("http://www.fimfiction.net/ajax/groups/story_groups_list.php?story=%s" % (self.story.getMetadata("storyId")))
            groupData = json.loads(groupResponse)
            groupList = bs.BeautifulSoup(groupData["content"])
        else:
            groupList = soup.find('ul', {'id':'story-groups-list'})

        if not (groupList == None):
            for groupName in groupList.findAll('a'):
                self.story.addToList("groupsUrl", 'http://'+self.host+groupName["href"])
                # Commas would collide with the list separator, use ';'.
                self.story.addToList("groups",stripHTML(groupName).replace(',', ';'))

        #sequels
        sequelStoryHeader = soup.find('h1', {'class':'header-stories'}, text="Sequels")
        if not sequelStoryHeader == None:
            sequelContainer = sequelStoryHeader.parent.parent
            for sequel in sequelContainer.findAll('a', {'class':'story_link'}):
                self.story.addToList("sequelsUrl", 'http://'+self.host+sequel["href"])
                self.story.addToList("sequels", stripHTML(sequel).replace(',', ';'))

        #The link to the prequel is embedded in the description text, so erring
        #on the side of caution and wrapping this whole thing in a try block.
        #If anything goes wrong this probably wasn't a valid prequel link.
        try:
            description = soup.find('div', {'class':'description'})
            firstHR = description.find("hr")
            nextSib = firstHR.nextSibling
            if "This story is a sequel to" in nextSib.string:
                link = nextSib.nextSibling
                if link.name == "a":
                    self.story.setMetadata("prequelUrl", 'http://'+self.host+link["href"])
                    self.story.setMetadata("prequel", stripHTML(link))
        except:
            pass

    def ordinal_date_string_to_date(self, datestring):
        # Convert e.g. "3rd Jan 2014" -> "3 Jan 2014" then parse with
        # self.dateformat ("%d %b %Y").
        datestripped=re.sub(r"(\d+)(st|nd|rd|th)", r"\1", datestring.strip())
        return makeDate(datestripped, self.dateformat)

    def hookForUpdates(self,chaptercount):
        # Called during epub update: discard previously-downloaded chapters
        # from the newest-dated chapter onward, since FiMFiction authors can
        # insert/edit chapters out of order (see scan above).
        if self.oldchapters and len(self.oldchapters) > self.newestChapterNum:
            print("Existing epub has %s chapters\nNewest chapter is %s. Discarding old chapters from there on."%(len(self.oldchapters), self.newestChapterNum+1))
            self.oldchapters = self.oldchapters[:self.newestChapterNum]
        return len(self.oldchapters)

    def do_fix_blockquotes(self,data):
        # Optionally rewrite the site's invalid <p><blockquote> nesting into
        # valid <blockquote><p> before parsing, controlled by config.
        if self.getConfig('fix_fimf_blockquotes'):
            # <p class="double"><blockquote>
            # </blockquote></p>
            # include > in re groups so there's always something in the group.
            data = re.sub(r'<p([^>]*>\s*)<blockquote([^>]*>)',r'<blockquote\2<p\1',data)
            data = re.sub(r'</blockquote(>\s*)</p>',r'</p\1</blockquote>',data)
        return data

    def getChapterText(self, url):
        """Fetch one chapter page and return its 'chapter_content' div,
        handling per-story passwords and the blockquote fixup."""
        logger.debug('Getting chapter text from: %s' % url)

        data = self._fetchUrl(url)

        soup = bs.BeautifulSoup(data)
        if not (soup.find('form', {'id' : 'password_form'}) == None):
            if self.password:
                params = {}
                params['password'] = self.password
                data = self._postUrl(url, params)
            else:
                # NOTE(review): best-effort only -- parsing continues with
                # the password page, so the element check below will fail.
                print("Chapter %s needed password but no password was present" % url)

        data = self.do_fix_blockquotes(data)

        soup = bs.BeautifulSoup(data,selfClosingTags=('br','hr')).find('div', {'class' : 'chapter_content'})
        if soup == None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,soup)
|
||||
288
fanficdownloader/adapters/adapter_finestoriescom.py
Normal file
288
fanficdownloader/adapters/adapter_finestoriescom.py
Normal file
|
|
@ -0,0 +1,288 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2013 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    # Module-level factory hook: the adapter loader imports this module and
    # calls getClass() to obtain the adapter class implemented here.
    return FineStoriesComAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class FineStoriesComAdapter(BaseSiteAdapter):
    """Adapter for downloading stories from finestories.com.

    Metadata comes partly from the story-info page and partly from the
    author's listing page (the info page alone is not detailed enough);
    chapter text may be split across several pages which are stitched
    together in getChapterText.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2].split(':')[0])
        if 'storyInfo' in self.story.getMetadata('storyId'):
            # URL was the /library/storyInfo.php?id=NNN form; take the query id.
            self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/s/storyInfo.php?id='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','fnst')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y-%m-%d"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain. Does have www here, if it uses it.
        return 'finestories.com'

    @classmethod
    def getSiteExampleURLs(cls):
        # Space-separated list shown to users as accepted URL shapes.
        return "http://"+cls.getSiteDomain()+"/s/1234 http://"+cls.getSiteDomain()+"/s/1234:4010 http://"+cls.getSiteDomain()+"/library/storyInfo.php?id=1234"

    def getSiteURLPattern(self):
        # Accepts /s/1234, /s/1234:4010(;N) chapter forms and the
        # /library/storyInfo.php?id=1234 info-page form.
        return re.escape("http://"+self.getSiteDomain())+r"/(s|library)?/(storyInfo.php\?id=)?\d+(:\d+)?(;\d+)?$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        # True when the fetched page text indicates we are not logged in
        # (registration prompt or bad-credential messages).
        if 'Free Registration' in data \
                or "Invalid Password!" in data \
                or "Invalid User Name!" in data:
            return True
        else:
            return False

    def performLogin(self, url):
        """POST credentials to the site's login form.

        Uses self.username/self.password when a password was supplied,
        otherwise the configured credentials.  Raises FailedToLogin when
        the resulting page does not show a logged-in account.
        """
        params = {}

        if self.password:
            params['theusername'] = self.username
            params['thepassword'] = self.password
        else:
            params['theusername'] = self.getConfig("username")
            params['thepassword'] = self.getConfig("password")
        params['rememberMe'] = '1'
        params['page'] = 'http://'+self.getSiteDomain()+'/'
        params['submit'] = 'Login'

        loginUrl = 'http://' + self.getSiteDomain() + '/login.php'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             params['theusername']))

        d = self._fetchUrl(loginUrl, params)

        if "My Account" not in d : #Member Account
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['theusername']))
            raise exceptions.FailedToLogin(url,params['theusername'])
            # NOTE(review): unreachable -- the raise above always fires first.
            return False
        else:
            return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Populate title, author, chapter list and the remaining metadata.

        Fetches the story-info page (logging in if needed), then walks the
        author's paged story listing to find this story's row, which holds
        category, word count, dates and status.
        """
        # index=1 makes sure we see the story chapter index. Some
        # sites skip that for one-chapter stories.
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        # print data

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'/s/'+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"/a/\w+"))
        self.story.setMetadata('authorId',a['href'].split('/')[2])
        self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
        self.story.setMetadata('author',a.text)

        # Find the chapters:
        chapters = soup.findAll('a', href=re.compile(r'/s/'+self.story.getMetadata('storyId')+":\d+$"))
        if len(chapters) != 0:
            for chapter in chapters:
                # just in case there's tags, like <i> in chapter titles.
                self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+chapter['href']))
        else:
            # One-chapter story: the story page itself is the only chapter.
            self.chapterUrls.append((self.story.getMetadata('title'),'http://'+self.host+'/s/'+self.story.getMetadata('storyId')))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # surprisingly, the detailed page does not give enough details, so go to author's page

        # Page through the author's story list ('skip' is the paging offset)
        # until the 'lc2' cell linking to this story is found; 'lc2' is then
        # the anchor row for the remaining metadata lookups below.
        skip=0
        i=0
        while i == 0:
            asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')+"&skip="+str(skip)))

            a = asoup.findAll('td', {'class' : 'lc2'})
            for lc2 in a:
                if lc2.find('a')['href'] == '/s/'+self.story.getMetadata('storyId'):
                    i=1
                    break
                if a[len(a)-1] == lc2:
                    skip=skip+10

        for cat in lc2.findAll('div', {'class' : 'typediv'}):
            self.story.addToList('category',cat.text)

        self.story.setMetadata('numWords', lc2.findNext('td', {'class' : 'num'}).text)

        lc4 = lc2.findNext('td', {'class' : 'lc4'})

        # Series link, if any: "Series Name (position)".
        try:
            a = lc4.find('a', href=re.compile(r"/library/show_series.php\?id=\d+"))
            i = a.parent.text.split('(')[1].split(')')[0]
            self.setSeries(a.text, i)
            self.story.setMetadata('seriesUrl','http://'+self.host+a['href'])
        except:
            pass
        # Universe link, if any, is treated as another category.
        try:
            a = lc4.find('a', href=re.compile(r"/library/universe.php\?id=\d+"))
            self.story.addToList("category",a.text)
        except:
            pass

        # Drop tooltip/help spans before using the cell text as description.
        for a in lc4.findAll('span', {'class' : 'help'}):
            a.extract()

        self.setDescription('http://'+self.host+'/s/'+self.story.getMetadata('storyId'),lc4.text.split('[More Info')[0])

        # Remaining labelled fields: each <b>Label</b> is followed by its
        # value as a text sibling.
        for b in lc4.findAll('b'):
            label = b.text
            value = b.nextSibling

            if 'For Age' in label:
                self.story.setMetadata('rating', value)

            if 'Tags' in label:
                for genre in value.split(', '):
                    self.story.addToList('genre',genre)

            if 'Posted' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value.split('/ (')[0]), self.dateformat))
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value.split('/ (')[0]), self.dateformat))

            if 'Concluded' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value.split('/ (')[0]), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value.split('/ (')[0]), self.dateformat))

        # An 'ab' span marks an in-progress story (with a last-activity date);
        # its absence means the story is complete.
        status = lc4.find('span', {'class' : 'ab'})
        if status != None:
            self.story.setMetadata('status', 'In-Progress')
            if "Last Activity" in status.text:
                self.story.setMetadata('dateUpdated', makeDate(status.text.split('Activity: ')[1].split(')')[0], self.dateformat))
        else:
            self.story.setMetadata('status', 'Completed')


    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch a chapter, merging multi-page chapters and stripping
        navigation/heading chrome, and return the cleaned 'story' div."""
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulSoup(self._fetchUrl(url),
                                selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'story'})

        # some big chapters are split over several pages
        pager = div.find('span', {'class' : 'pager'})
        if pager != None:
            urls=pager.findAll('a')
            # Last pager link is "next"/navigation, not a page -- drop it.
            urls=urls[:len(urls)-1]

            for ur in urls:
                soup = bs.BeautifulSoup(self._fetchUrl("http://"+self.getSiteDomain()+ur['href']),
                                        selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

                div1 = soup.find('div', {'id' : 'story'})

                # appending next section
                last=div.findAll('p')
                next=div1.find('span', {'class' : 'conTag'}).nextSibling

                last[len(last)-1]=last[len(last)-1].append(next)
                div.append(div1)

        # removing all the left-over stuff
        for a in div.findAll('span'):
            a.extract()

        for a in div.findAll('h1'):
            a.extract()
        for a in div.findAll('h2'):
            a.extract()
        for a in div.findAll('h3'):
            a.extract()
        for a in div.findAll('h4'):
            a.extract()
        for a in div.findAll('br'):
            a.extract()
        for a in div.findAll('div', {'class' : 'date'}):
            a.extract()

        # Remove the end-of-chapter form and everything after it.
        a = div.find('form')
        if a != None:
            b = a.nextSibling
            while b != None:
                a.extract()
                a=b
                b=b.nextSibling

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)
|
||||
311
fanficdownloader/adapters/adapter_grangerenchantedcom.py
Normal file
311
fanficdownloader/adapters/adapter_grangerenchantedcom.py
Normal file
|
|
@ -0,0 +1,311 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2012 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    # Module-level factory hook: the adapter loader imports this module and
    # calls getClass() to obtain the adapter class implemented here.
    return GrangerEnchantedCom
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class GrangerEnchantedCom(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
self.section=self.parsedUrl.path.split('/',)[1]
|
||||
|
||||
# normalized story URL.
|
||||
if "malfoymanor" in self.parsedUrl.netloc:
|
||||
self._setURL('http://malfoymanor.' + self.getSiteDomain() + '/themanor/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
self.story.addToList("category","The Manor")
|
||||
else:
|
||||
self._setURL('http://' + self.getSiteDomain() + '/enchant/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','gech')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%d/%b/%Y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'grangerenchanted.com'
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return ['grangerenchanted.com','malfoymanor.grangerenchanted.com']
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://grangerenchanted.com/enchant/viewstory.php?sid=1234 http://malfoymanor.grangerenchanted.com/themanor/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return r"http://(malfoymanor.)?grangerenchanted.com/(enchant|themanor)?/viewstory.php\?sid=\d+$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites.
|
||||
def needToLoginCheck(self, data):
|
||||
if 'Registered Users Only' in data \
|
||||
or 'There is no such account on our website' in data \
|
||||
or "That password doesn't match the one in our database" in data:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def performLogin(self, url):
|
||||
params = {}
|
||||
|
||||
if self.password:
|
||||
params['penname'] = self.username
|
||||
params['password'] = self.password
|
||||
else:
|
||||
params['penname'] = self.getConfig("username")
|
||||
params['password'] = self.getConfig("password")
|
||||
params['cookiecheck'] = '1'
|
||||
params['submit'] = 'Submit'
|
||||
|
||||
if "enchant" in self.section:
|
||||
loginUrl = 'http://grangerenchanted.com/enchant/user.php?action=login'
|
||||
else:
|
||||
loginUrl = 'http://malfoymanor.grangerenchanted.com/themanor/user.php?action=login'
|
||||
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
|
||||
params['penname']))
|
||||
|
||||
d = self._fetchUrl(loginUrl, params)
|
||||
|
||||
if "Member Account" not in d : #Member Account
|
||||
logger.info("Failed to login to URL %s as %s" % (loginUrl,
|
||||
params['penname']))
|
||||
raise exceptions.FailedToLogin(url,params['penname'])
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# Weirdly, different sites use different warning numbers.
|
||||
# If the title search below fails, there's a good chance
|
||||
# you need a different number. print data at that point
|
||||
# and see what the 'click here to continue' url says.
|
||||
addurl = "&ageconsent=ok&warning=1"
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if self.needToLoginCheck(data):
|
||||
# need to log in for this one.
|
||||
self.performLogin(url)
|
||||
data = self._fetchUrl(url)
|
||||
|
||||
m = re.search(r"'viewstory.php\?sid=\d+((?:&ageconsent=ok)?&warning=\d+)'",data)
|
||||
if m != None:
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# We tried the default and still got a warning, so
|
||||
# let's pull the warning number from the 'continue'
|
||||
# link and reload data.
|
||||
addurl = m.group(1)
|
||||
# correct stupid & error in url.
|
||||
addurl = addurl.replace("&","&")
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL 2nd try: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+self.section+'/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
if 'Summary' in label:
|
||||
## Everything until the next span class='label'
|
||||
svalue = ""
|
||||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Read' in label:
|
||||
self.story.setMetadata('read', value)
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
for cat in cats:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
for char in chars:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=4'))
|
||||
for genre in genres:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
if 'Warnings' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
|
||||
for warning in warnings:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/'+self.section+'/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
# skip 'report this' and 'TOC' links
|
||||
if 'contact.php' not in a['href'] and 'index' not in a['href']:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
try:
|
||||
self.story.setMetadata('reviews',
|
||||
stripHTML(soup.find('div',{'id':'sort'}).
|
||||
findAll('a', href=re.compile(r'^reviews.php'))[1]))
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story1'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
203
fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py
Normal file
203
fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py
Normal file
|
|
@ -0,0 +1,203 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
    """Adapter for harrypotterfanfiction.com (non-eFiction layout).

    Story URLs use a 'psid' query parameter; chapter links use
    'chapterid'.  Metadata is scraped from a 'storymaininfo' table
    via regexes over its stripped text.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','hp')
        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only psid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?psid='+self.story.getMetadata('storyId'))

    @staticmethod
    def getSiteDomain():
        return 'www.harrypotterfanfiction.com'

    @classmethod
    def getAcceptDomains(cls):
        # accept both with and without www.
        return ['www.harrypotterfanfiction.com','harrypotterfanfiction.com']

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://www.harrypotterfanfiction.com/viewstory.php?psid=1234"

    def getSiteURLPattern(self):
        return re.escape("http://")+r"(www\.)?"+re.escape("harrypotterfanfiction.com/viewstory.php?psid=")+r"\d+$"

    def needToLoginCheck(self, data):
        """Return True when the fetched page text is a login/denied page.

        NOTE(review): not referenced elsewhere in this class as visible
        here; kept from the common adapter template.
        """
        if 'Registered Users Only' in data \
           or 'There is no such account on our website' in data \
           or "That password doesn't match the one in our database" in data:
            return True
        else:
            return False

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and populate chapter list and metadata."""

        # index=1 shows the chapter index even for one-chapter stories.
        url = self.url+'&index=1'
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        ## Title
        a = soup.find('a', href=re.compile(r'\?psid='+self.story.getMetadata('storyId')))
        self.story.setMetadata('title',stripHTML(a))
        ## javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?psid=290995'
        # Adult stories wrap the title link in a javascript confirm();
        # detect it and bail unless the user opted in.
        if "This story may contain adult themes." in a['href'] and not (self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(self.url)

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?showuid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        ## hpcom doesn't give us total words--but it does give
        ## us words/chapter.  I'd rather add than fetch and
        ## parse another page.
        words=0
        for tr in soup.find('table',{'class':'text'}).findAll('tr'):
            # third cell of each chapter row is that chapter's word count.
            tdstr = tr.findAll('td')[2].string
            if tdstr and tdstr.isdigit():
                words+=int(tdstr)
        self.story.setMetadata('numWords',str(words))

        # Find the chapters:
        tablelist = soup.find('table',{'class':'text'})
        for chapter in tablelist.findAll('a', href=re.compile(r'\?chapterid=\d+')):
            #javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?chapterid=433441&i=1'
            # just in case there's tags, like <i> in chapter titles.
            # strip the javascript wrapper down to the bare query string.
            chpt=re.sub(r'^.*?(\?chapterid=\d+).*?',r'\1',chapter['href'])
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/viewstory.php'+chpt))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        ## Finding the metadata is a bit of a pain.  Desc is the only thing this color.
        desctable= soup.find('table',{'bgcolor':'#f0e8e8'})
        self.setDescription(url,desctable)
        #self.story.setMetadata('description',stripHTML(desctable))

        ## Finding the metadata is a bit of a pain.  Most of the meta
        ## data is in a center.table without a bgcolor.
        #for center in soup.findAll('center'):
        table = soup.find('table',{'class':'storymaininfo'})
        if table:
            # Flatten the whole table to one line of text and regex it.
            metastr = stripHTML(str(table)).replace('\n',' ').replace('\t',' ')
            # Rated: 12+ Story Reviews: 3
            # Chapters: 3
            # Characters: Andromeda, Ted, Bellatrix, R. Lestrange, Lucius, Narcissa, OC
            # Genre(s): Fluff, Romance, Young Adult Era: OtherPairings: Other Pairing, Lucius/Narcissa
            # Status: Completed
            # First Published: 2010.09.02
            # Last Published Chapter: 2010.09.28
            # Last Updated: 2010.09.28
            # Favorite Story Of: 1 users
            # Warnings: Scenes of a Mild Sexual Nature

            m = re.match(r".*?Status: Completed.*?",metastr)
            if m:
                self.story.setMetadata('status','Completed')
            else:
                self.story.setMetadata('status','In-Progress')

            m = re.match(r".*?Rating: (.+?) Story Reviews.*?",metastr)
            if m:
                self.story.setMetadata('rating', m.group(1))

            m = re.match(r".*?Genre\(s\): (.+?) Era.*?",metastr)
            if m:
                for g in m.group(1).split(','):
                    self.story.addToList('genre',g)

            m = re.match(r".*?Characters: (.+?) Genre.*?",metastr)
            if m:
                for g in m.group(1).split(','):
                    self.story.addToList('characters',g)

            m = re.match(r".*?Warnings: (.+).*?",metastr)
            if m:
                for w in m.group(1).split(','):
                    # NOTE(review): 'Now Warnings' looks like a possible
                    # typo for 'No Warnings' -- confirm against the site's
                    # actual output before changing.
                    if w != 'Now Warnings':
                        self.story.addToList('warnings',w)

            m = re.match(r".*?First Published: ([0-9\.]+).*?",metastr)
            if m:
                self.story.setMetadata('datePublished',makeDate(m.group(1), "%Y.%m.%d"))

            # Updated can have more than one space after it. <shrug>
            m = re.match(r".*?Last Updated: ([0-9\.]+).*?",metastr)
            if m:
                self.story.setMetadata('dateUpdated',makeDate(m.group(1), "%Y.%m.%d"))

    def getChapterText(self, url):
        """Fetch one chapter page and return its cleaned-up story text."""

        logger.debug('Getting chapter text from: %s' % url)

        ## most adapters use BeautifulStoneSoup here, but non-Stone
        ## allows nested div tags.
        soup = bs.BeautifulSoup(self._fetchUrl(url),
                                selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'fluidtext'})

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,div)
|
||||
|
||||
def getClass():
    """Adapter factory entry point: return the adapter class for this module."""
    return HarryPotterFanFictionComSiteAdapter
|
||||
|
||||
172
fanficdownloader/adapters/adapter_hennethannunnet.py
Normal file
172
fanficdownloader/adapters/adapter_hennethannunnet.py
Normal file
|
|
@ -0,0 +1,172 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2013 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Adapter factory entry point: return the adapter class for this module."""
    return HennethAnnunNetAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
# Class name has to be unique.  Our convention is camel case the
# sitename with Adapter at the end.  www is skipped.
class HennethAnnunNetAdapter(BaseSiteAdapter):
    """Adapter for www.henneth-annun.net (ColdFusion .cfm pages).

    Stories are addressed by 'stid'; chapters by 'spordinal'.
    Metadata labels are scraped from <b>/<strong> tags inside the
    'storyinformation' div.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/stories/chapter.cfm?stid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','htan')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'www.henneth-annun.net'

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://"+cls.getSiteDomain()+"/stories/chapter.cfm?stid=1234"

    def getSiteURLPattern(self):
        # accepts both chapter.cfm and chapter_view.cfm forms.
        return "http://"+self.getSiteDomain()+"/stories/chapter(_view)?.cfm\?stid="+r"\d+$"

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and populate chapter list and metadata."""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if "We're sorry. This story is not available." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: This story is not available.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        # print data

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('h2', {'id':'page_heading'})
        self.story.setMetadata('title',stripHTML(a))

        # Find the chapters: chapter_view.cfm?stid=6663&spordinal=1"
        for chapter in soup.findAll('a', href=re.compile(r'chapter_view.cfm\?stid='+self.story.getMetadata('storyId')+"&spordinal=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/stories/'+chapter['href']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # second cell of the 'foot' row carries the total word count.
        self.story.setMetadata('numWords', soup.find('tr', {'class':'foot'}).findAll('td')[1].text)

        self.setDescription(url,soup.find('div', {'id':'summary'}))

        # <span class="label">Rated:</span> NC-17<br /> etc
        # First pass: labels in <b> tags (Completion/Rating/Era/Genre).
        info = soup.find('div', {'id':'storyinformation'})
        labels=info.findAll('b')
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Completion' in label:
                if 'Complete' in value.string:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Rating' in label:
                self.story.setMetadata('rating', value.string)

            if 'Era:' in label:
                self.story.addToList('category',value.string)

            if 'Genre' in label:
                self.story.addToList('genre',value.string)

        # Second pass: labels in <strong> tags (Author/Post/Updated).
        labels=info.findAll('strong')
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Author' in label:
                # skip one sibling to reach the author <a> tag.
                value=value.nextSibling
                self.story.setMetadata('authorId',value['href'].split('=')[1])
                self.story.setMetadata('authorUrl','http://'+self.host+'/'+value['href'])
                self.story.setMetadata('author',value.string)

            if 'Post' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated:' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # character bios are linked individually; collect them all.
        for char in soup.findAll('a', href=re.compile(r"/resources/bios_view.cfm\?scid=\d+")):
            self.story.addToList('characters',stripHTML(char))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return its cleaned-up story text."""

        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'class' : 'block chapter'})

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,div)
|
||||
232
fanficdownloader/adapters/adapter_hlfictionnet.py
Normal file
232
fanficdownloader/adapters/adapter_hlfictionnet.py
Normal file
|
|
@ -0,0 +1,232 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2012 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Adapter factory entry point: return the adapter class for this module."""
    return HLFictionNetAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class HLFictionNetAdapter(BaseSiteAdapter):
|
||||
|
||||
    def __init__(self, config, url):
        """Set up site defaults and normalize the story URL from *url*."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','hlf')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%y"
|
||||
|
||||
    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site's domain (no www for this site)."""
        # The site domain.  Does have www here, if it uses it.
        return 'hlfiction.net'
|
||||
|
||||
    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL shown to users for this site."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
    def getSiteURLPattern(self):
        """Return the regex matching this site's canonical story URLs."""
        # re.escape handles the literal '?' in the query string.
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title and author
|
||||
a = soup.find('div', {'id' : 'pagetitle'})
|
||||
|
||||
aut = a.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',aut['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+aut['href'])
|
||||
self.story.setMetadata('author',aut.string)
|
||||
aut.extract()
|
||||
|
||||
self.story.setMetadata('title',stripHTML(a)[:(len(a.string)-3)])
|
||||
|
||||
# Find the chapters:
|
||||
chapters=soup.find('select')
|
||||
if chapters != None:
|
||||
for chapter in chapters.findAll('option'):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/viewstory.php?sid='+self.story.getMetadata('storyId')+'&chapter='+chapter['value']))
|
||||
else:
|
||||
self.chapterUrls.append((self.story.getMetadata('title'),url))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
|
||||
|
||||
for list in asoup.findAll('div', {'class' : re.compile('listbox\s+')}):
|
||||
a = list.find('a')
|
||||
if ('viewstory.php?sid='+self.story.getMetadata('storyId')) in a['href']:
|
||||
break
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
labels = list.findAll('span', {'class' : 'classification'})
|
||||
for labelspan in labels:
|
||||
label = labelspan.string
|
||||
value = labelspan.nextSibling
|
||||
|
||||
if 'Summary' in label:
|
||||
## Everything until the next span class='label'
|
||||
svalue = ""
|
||||
while not defaultGetattr(value,'class') == 'classification':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value[:len(value)-2])
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'categories.php\?catid=\d+'))
|
||||
for cat in cats:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
for char in value.string.split(', '):
|
||||
if not 'None' in char:
|
||||
self.story.addToList('characters',char)
|
||||
|
||||
if 'Genre' in label:
|
||||
for genre in value.string.split(', '):
|
||||
if not 'None' in genre:
|
||||
self.story.addToList('genre',genre)
|
||||
|
||||
if 'Warnings' in label:
|
||||
for warning in value.string.split(', '):
|
||||
if not 'None' in warning:
|
||||
self.story.addToList('warnings',warning)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
# there's a stray [ at the end.
|
||||
#value = value[0:-1]
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = list.find('a', href=re.compile(r"series.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
# skip 'report this' and 'TOC' links
|
||||
if 'contact.php' not in a['href'] and 'index' not in a['href']:
|
||||
if ('viewstory.php?sid='+self.story.getMetadata('storyId')) in a['href']:
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
    """Download a single chapter page and return its cleaned-up story text.

    Fetches *url*, locates the <div id="story"> element wrapping the
    chapter body, and hands it to utf8FromSoup() for cleanup/encoding.
    Raises exceptions.FailedToDownload when that wrapper div is absent.
    """
    logger.debug('Getting chapter text from: %s' % url)

    # Parse with explicit self-closing tags; otherwise the parser
    # swallows the <br/> and <hr/> tags.
    page = self._fetchUrl(url)
    soup = bs.BeautifulStoneSoup(page,
                                 selfClosingTags=('br','hr'))

    story_div = soup.find('div', {'id' : 'story'})
    if story_div is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
    return self.utf8FromSoup(url,story_div)
|
||||
233
fanficdownloader/adapters/adapter_hpfandomnet.py
Normal file
233
fanficdownloader/adapters/adapter_hpfandomnet.py
Normal file
|
|
@ -0,0 +1,233 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
# This function is called by the downloader in all adapter_*.py files
|
||||
# in this dir to register the adapter class. So it needs to be
|
||||
# updated to reflect the class below it. That, plus getSiteDomain()
|
||||
# take care of 'Registering'.
|
||||
def getClass():
    """Registration hook called by the downloader for each adapter_*.py
    module: return the adapter class for www.hpfandom.net."""
    return HPFandomNetAdapterAdapter # XXX
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class HPFandomNetAdapterAdapter(BaseSiteAdapter): # XXX
    """Adapter for the eFiction archive at www.hpfandom.net.

    Story URLs look like http://www.hpfandom.net/eff/viewstory.php?sid=NNNN.
    Unusually for an eFiction site, most metadata is scraped from the
    author's page rather than the story page, and each chapter link
    carries a *different* sid value.
    """

    def __init__(self, config, url):
        # Initialize the adapter: normalize the story URL and set the
        # site-specific defaults (encodings, abbreviation, date format).
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        # XXX Most sites don't have the /eff part. Replace all to remove it usually.
        self._setURL('http://' + self.getSiteDomain() + '/eff/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','hpfdm') # XXX

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y.%m.%d" # XXX

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain. Does have www here, if it uses it.
        return 'www.hpfandom.net' # XXX

    @classmethod
    def getSiteExampleURLs(cls):
        """Return a sample story URL shown to users for this site."""
        return "http://"+cls.getSiteDomain()+"/eff/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Regex that accepted story URLs must match (sid-only query string)."""
        return re.escape("http://"+self.getSiteDomain()+"/eff/viewstory.php?sid=")+r"\d+$"

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Populate self.story metadata and self.chapterUrls.

        Fetches the story page for chapter links, then the author's page
        for title, summary, rating, word count, status, dates, category,
        characters, genre and warnings.  Raises StoryDoesNotExist on a
        404, AdultCheckRequired when the adult-confirm javascript link is
        present and the user hasn't opted in, and FailedToDownload when
        the story can't be found on the author's page.
        """
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            # Translate a plain 404 into the adapter-level "no such story".
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        # print data

        # Now go hunting for all the meta data and the chapter list.

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/eff/'+a['href'])
        self.story.setMetadata('author',a.string)

        ## Going to get the rest from the author page.
        authdata = self._fetchUrl(self.story.getMetadata('authorUrl'))
        # fix a typo in the site HTML so I can find the Characters list.
        authdata = authdata.replace('<td width=10%">','<td width="10%">')

        # hpfandom.net only seems to indicate adult-only by javascript on the story/chapter links.
        if "javascript:if (confirm('Slash/het fiction which incorporates sexual situations to a somewhat graphic degree and some violence. ')) location = 'viewstory.php?sid=%s'"%self.story.getMetadata('storyId') in authdata \
                and not (self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(self.url)

        authsoup = bs.BeautifulSoup(authdata)

        # The review-count link is unique per story; its enclosing table
        # is the metadata block for this story on the author's page.
        reviewsa = authsoup.find('a', href="reviews.php?sid="+self.story.getMetadata('storyId')+"&a=")
        # <table><tr><td><p><b><a ...>
        metablock = reviewsa.findParent("table")
        #print("metablock:%s"%metablock)

        ## Title
        titlea = metablock.find('a', href=re.compile("viewstory.php"))
        #print("titlea:%s"%titlea)
        if titlea == None:
            raise exceptions.FailedToDownload("Story URL (%s) not found on author's page, can't use chapter URLs"%url)
        self.story.setMetadata('title',stripHTML(titlea))

        # Find the chapters: !!! hpfandom.net differs from every other
        # eFiction site--the sid on viewstory for chapters is
        # *different* for each chapter
        for chapter in soup.findAll('a', {'href':re.compile(r"viewstory.php\?sid=\d+&i=\d+")}):
            m = re.match(r'.*?(viewstory.php\?sid=\d+&i=\d+).*?',chapter['href'])
            # just in case there's tags, like <i> in chapter titles.
            #print("====chapter===%s"%m.group(1))
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/eff/'+m.group(1)))

        # One-shot stories have no per-chapter links; use the story URL itself.
        if len(self.chapterUrls) == 0:
            self.chapterUrls.append((stripHTML(self.story.getMetadata('title')),url))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method
        # NOTE(review): defaultGetattr is never called in this method --
        # apparently left over from the eFiction adapter template. Harmless.
        def defaultGetattr(d,k):
            try:
                return d[k]
            except:
                return ""

        # Summary lives in its own cell; rename the tag so setDescription
        # stores it as inline content.
        summary = metablock.find("td",{"class":"summary"})
        summary.name='span'
        self.setDescription(url,summary)

        # words & completed in first row of metablock.
        firstrow = stripHTML(metablock.find('tr'))
        # A Mother's Love xx Going Grey 1 (G+) by Kiristeen | Reviews - 18 | Words: 27468 | Completed: Yes
        m = re.match(r".*?\((?P<rating>[^)]+)\).*?Words: (?P<words>\d+).*?Completed: (?P<status>Yes|No)",firstrow)
        if m != None:
            if m.group('rating') != None:
                self.story.setMetadata('rating', m.group('rating'))

            if m.group('words') != None:
                self.story.setMetadata('numWords', m.group('words'))

            if m.group('status') != None:
                if 'Yes' in m.group('status'):
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

        # Remaining metadata is laid out as label/value cell pairs:
        # <tr><td width="10%" valign="top">Chapters:</td><td width="40%" valign="top">4</td>
        # <td width="10%" valign="top">Published:</td><td width="40%" valign="top">2010.09.29</td></tr>
        # <tr><td width="10%" valign="top">Completed:</td><td width="40%" valign="top">Yes</td><td width="10%" valign="top">Updated:</td><td width="40%" valign="top">2010.10.03</td></tr>
        labels = metablock.findAll('td',{'width':'10%'})
        for td in labels:
            label = td.string
            # Value is the text of the 40%-width cell right after the label cell.
            value = td.nextSibling.string
            #print("\nlabel:%s\nvalue:%s\n"%(label,value))

            if 'Category' in label and value:
                cats = td.parent.findAll('a',href=re.compile(r'categories.php'))
                catstext = [cat.string for cat in cats]
                # NOTE(review): 'cat' here is already a string pulled from
                # catstext, so 'cat.string' looks suspect -- verify against
                # upstream; iterating 'cats' directly may have been intended.
                for cat in catstext:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label and value: # this site can have Character label with no
                                                # values, apparently. Others as a precaution.
                for char in value.split(', '):
                    self.story.addToList('characters',char.strip())

            if 'Genre' in label and value:
                for genre in value.split(','):
                    self.story.addToList('genre',genre.strip())

            if 'Warnings' in label and value:
                for warning in value.split(','):
                    if warning.strip() != 'none':
                        self.story.addToList('warnings',warning.strip())

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return its story text.

        The site has no wrapper element around the chapter text, but
        width=100% tables sit just above and below it; those are turned
        into a synthetic <div name="storybody"> wrapper before parsing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        data = self._fetchUrl(url)
        # There's no good wrapper around the chapter text. :-/
        # There are, however, tables with width=100% just above and below the real text.
        data = re.sub(r'<table width="100%">.*?</table>','<div name="storybody">',
                      data,count=1,flags=re.DOTALL)

        data = re.sub(r'<table width="100%">.*?</table>','</div>',
                      data,count=1,flags=re.DOTALL)

        soup = bs.BeautifulStoneSoup(data,selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find("div",{'name':'storybody'})
        #print("\n\ndiv:%s\n\n"%div)

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
        return self.utf8FromSoup(url,div)
|
||||
224
fanficdownloader/adapters/adapter_hpfanficarchivecom.py
Normal file
224
fanficdownloader/adapters/adapter_hpfanficarchivecom.py
Normal file
|
|
@ -0,0 +1,224 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2012 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Registration hook called by the downloader for each adapter_*.py
    module: return the adapter class for www.hpfanficarchive.com."""
    return HPFanficArchiveComAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class HPFanficArchiveComAdapter(BaseSiteAdapter):
    """Adapter for the eFiction archive at www.hpfanficarchive.com.

    Story URLs look like
    http://www.hpfanficarchive.com/stories/viewstory.php?sid=NNNN.
    Metadata is scraped from <span class="label"> / sibling-text pairs on
    the story page; series membership is resolved from the series page.
    """

    def __init__(self, config, url):
        # Initialize the adapter: normalize the story URL and set the
        # site-specific defaults (encodings, abbreviation, date format).
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/stories/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','hpffa')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%B %d, %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain. Does have www here, if it uses it.
        return 'www.hpfanficarchive.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return a sample story URL shown to users for this site."""
        return "http://"+cls.getSiteDomain()+"/stories/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Regex that accepted story URLs must match (sid-only query string)."""
        return re.escape("http://"+self.getSiteDomain()+"/stories/viewstory.php?sid=")+r"\d+$"

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Populate self.story metadata and self.chapterUrls.

        Fetches the story page, extracts title/author/chapters, then
        walks the <span class="label"> metadata pairs.  Raises
        StoryDoesNotExist on a 404 and FailedToDownload for
        admin-unvalidated stories.  Series lookup failures are ignored.
        """

        # index=1 makes sure we see the story chapter index. Some
        # sites skip that for one-chapter stories.
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            # Translate a plain 404 into the adapter-level "no such story".
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        # print data

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/stories/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/stories/'+chapter['href']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method
        # Tolerant attribute lookup for BeautifulSoup nodes: NavigableStrings
        # have no ['class'], so missing/inapplicable keys come back as "".
        def defaultGetattr(d,k):
            try:
                return d[k]
            except:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            # Collect everything between this label span and the next one
            # as the label's value (may include markup).
            val = labelspan.nextSibling
            value = unicode('')
            while val and not defaultGetattr(val,'class') == 'label':
                value += unicode(val)
                val = val.nextSibling
            label = labelspan.string
            #print("label:%s\nvalue:%s"%(label,value))

            if 'Summary' in label:
                self.setDescription(url,value)

            if 'Rated' in label:
                self.story.setMetadata('rating', stripHTML(value))

            if 'Word count' in label:
                self.story.setMetadata('numWords', stripHTML(value))

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Pairing' in label:
                ships = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=4'))
                for ship in ships:
                    self.story.addToList('ships',ship.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in stripHTML(value):
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/stories/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
            for a in storyas:
                # skip 'report this' and 'TOC' links
                if 'contact.php' not in a['href'] and 'index' not in a['href']:
                    if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl',series_url)
                        break
                    i+=1

        except:
            # I find it hard to care if the series parsing fails
            pass

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return the <div id="story"> contents,
        cleaned up via utf8FromSoup().  Raises FailedToDownload when the
        wrapper div is missing."""
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulSoup(self._fetchUrl(url),
                                selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'story'})

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)
|
||||
283
fanficdownloader/adapters/adapter_iketernalnet.py
Normal file
283
fanficdownloader/adapters/adapter_iketernalnet.py
Normal file
|
|
@ -0,0 +1,283 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Registration hook called by the downloader for each adapter_*.py
    module: return the adapter class for www.ik-eternal.net."""
    return IkEternalNetAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class IkEternalNetAdapter(BaseSiteAdapter):
    """Adapter for the eFiction archive at www.ik-eternal.net.

    Story URLs look like http://www.ik-eternal.net/viewstory.php?sid=NNNN.
    This site may require a login and/or an age-consent "warning" query
    parameter before serving the story index.
    """

    def __init__(self, config, url):
        # Initialize the adapter: normalize the story URL and set the
        # site-specific defaults (encodings, abbreviation, date format).
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','ike')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%B %d, %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain. Does have www here, if it uses it.
        return 'www.ik-eternal.net'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return a sample story URL shown to users for this site."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Regex that accepted story URLs must match (sid-only query string)."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True when *data* (a fetched page) shows one of the site's
        registered-users-only or bad-credentials messages."""
        if 'Registered Users Only' in data \
                or 'There is no such account on our website' in data \
                or "That password doesn't match the one in our database" in data:
            return True
        else:
            return False

    def performLogin(self, url):
        """POST the eFiction login form.

        Credentials come from self.username/self.password when set,
        otherwise from the config.  Raises exceptions.FailedToLogin when
        the resulting page lacks the 'Member Account' marker; returns
        True on success.
        """
        params = {}

        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['cookiecheck'] = '1'
        params['submit'] = 'Submit'

        loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             params['penname']))

        d = self._fetchUrl(loginUrl, params)

        if "Member Account" not in d : #Member Account
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['penname']))
            raise exceptions.FailedToLogin(url,params['penname'])
            # NOTE(review): unreachable -- the raise above always fires first.
            return False
        else:
            return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Populate self.story metadata and self.chapterUrls.

        Handles login, the adult-warning query parameter (retrying with
        the warning level the site's 'continue' link asks for), then
        scrapes title, author, chapters and the labeled metadata block.
        Raises StoryDoesNotExist, AdultCheckRequired, or FailedToDownload
        as appropriate.
        """

        if self.is_adult or self.getConfig("is_adult"):
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number. print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&warning=1"
        else:
            addurl=""

        # index=1 makes sure we see the story chapter index. Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            # Translate a plain 404 into the adapter-level "no such story".
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        # The actual text that is used to announce you need to be an
        # adult varies from site to site. Again, print data before
        # the title search to troubleshoot.

        # Since the warning text can change by warning level, let's
        # look for the warning pass url. ksarchive uses
        # &warning= -- actually, so do other sites. Must be an
        # eFiction book.

        # viewstory.php?sid=1882&warning=4
        # viewstory.php?sid=1654&ageconsent=ok&warning=5
        #print data
        #m = re.search(r"'viewstory.php\?sid=1882(&warning=4)'",data)
        m = re.search(r"'viewstory.php\?sid=\d+((?:&ageconsent=ok)?&warning=\d+)'",data)
        if m != None:
            if self.is_adult or self.getConfig("is_adult"):
                # We tried the default and still got a warning, so
                # let's pull the warning number from the 'continue'
                # link and reload data.
                addurl = m.group(1)
                # correct stupid & error in url.
                # NOTE(review): this replace is a no-op as written; the
                # upstream intent is clearly to unescape the HTML entity
                # '&amp;' back to '&' -- this line looks mangled by
                # HTML-unescaping of the source; verify against upstream.
                addurl = addurl.replace("&","&")
                url = self.url+'&index=1'+addurl
                logger.debug("URL 2nd try: "+url)

                try:
                    data = self._fetchUrl(url)
                except urllib2.HTTPError, e:
                    if e.code == 404:
                        raise exceptions.StoryDoesNotExist(self.url)
                    else:
                        raise e
            else:
                raise exceptions.AdultCheckRequired(self.url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        # NOTE(review): ('p') is a plain string, not a 1-tuple; it presumably
        # works because iterating 'p' yields 'p', but ('p',) would be clearer.
        soup = bs.BeautifulSoup(data,selfClosingTags=('p')) #poor formatting of the paragraphs in the title page
        # print data

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            # Carry addurl along so chapter fetches pass the adult warning too.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method
        # Tolerant attribute lookup for BeautifulSoup nodes: NavigableStrings
        # have no ['class'], so missing/inapplicable keys come back as "".
        def defaultGetattr(d,k):
            try:
                return d[k]
            except:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        asoup = soup.find('div', {'class': 'listbox'})
        # Flatten the badly-formed <p> tags into <br> so label/value pairs
        # stay siblings.
        for a in asoup.findAll('p'):
            a.name='br'
        labels = asoup.findAll('span',{'class':'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = ""
                while not defaultGetattr(value,'class') == 'label':
                    svalue += str(value)
                    value = value.nextSibling
                self.setDescription(url,svalue)
                #self.story.setMetadata('description',stripHTML(svalue))

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                # there's a stray [ at the end.
                #value = value[0:-1]
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return the <div id="story"> contents,
        cleaned up via utf8FromSoup().  Raises FailedToDownload when the
        wrapper div is missing."""
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'story'})

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)
|
||||
290
fanficdownloader/adapters/adapter_imagineeficcom.py
Normal file
290
fanficdownloader/adapters/adapter_imagineeficcom.py
Normal file
|
|
@ -0,0 +1,290 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2013 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
|
||||
return ImagineEFicComAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class ImagineEFicComAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','ime')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%Y.%m.%d"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'imagine.e-fic.com'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites.
|
||||
def needToLoginCheck(self, data):
|
||||
if 'Registered Users Only' in data \
|
||||
or 'There is no such account on our website' in data \
|
||||
or "That password doesn't match the one in our database" in data:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def performLogin(self, url):
|
||||
params = {}
|
||||
|
||||
if self.password:
|
||||
params['penname'] = self.username
|
||||
params['password'] = self.password
|
||||
else:
|
||||
params['penname'] = self.getConfig("username")
|
||||
params['password'] = self.getConfig("password")
|
||||
params['cookiecheck'] = '1'
|
||||
params['submit'] = 'Submit'
|
||||
|
||||
loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
|
||||
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
|
||||
params['penname']))
|
||||
|
||||
d = self._fetchUrl(loginUrl, params)
|
||||
|
||||
if "Member Account" not in d : #Member Account
|
||||
logger.info("Failed to login to URL %s as %s" % (loginUrl,
|
||||
params['penname']))
|
||||
raise exceptions.FailedToLogin(url,params['penname'])
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# Weirdly, different sites use different warning numbers.
|
||||
# If the title search below fails, there's a good chance
|
||||
# you need a different number. print data at that point
|
||||
# and see what the 'click here to continue' url says.
|
||||
addurl = "&ageconsent=ok&warning=4"
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if self.needToLoginCheck(data):
|
||||
# need to log in for this one.
|
||||
self.performLogin(url)
|
||||
data = self._fetchUrl(url)
|
||||
|
||||
m = re.search(r"'viewstory.php\?sid=\d+((?:&ageconsent=ok)?&warning=\d+)'",data)
|
||||
if m != None:
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# We tried the default and still got a warning, so
|
||||
# let's pull the warning number from the 'continue'
|
||||
# link and reload data.
|
||||
addurl = m.group(1)
|
||||
# correct stupid & error in url.
|
||||
addurl = addurl.replace("&","&")
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL 2nd try: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
if 'Summary' in label:
|
||||
## Everything until the next span class='label'
|
||||
svalue = ""
|
||||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
for cat in cats:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
for char in chars:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
|
||||
for genre in genres:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
if 'Warnings' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
|
||||
for warning in warnings:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
# skip 'report this' and 'TOC' links
|
||||
if 'contact.php' not in a['href'] and 'index' not in a['href']:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
200
fanficdownloader/adapters/adapter_indeathnet.py
Normal file
200
fanficdownloader/adapters/adapter_indeathnet.py
Normal file
|
|
@ -0,0 +1,200 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2012 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
|
||||
return InDeathNetAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class InDeathNetAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
|
||||
# get storyId from url--url validation guarantees query correct
|
||||
m = re.match(self.getSiteURLPattern(),url)
|
||||
if m:
|
||||
self.story.setMetadata('storyId',m.group('id'))
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://www.' + self.getSiteDomain() + '/blog/archive/'+self.story.getMetadata('storyId')+'-'+m.group('name')+'/')
|
||||
else:
|
||||
raise exceptions.InvalidStoryURL(url,
|
||||
self.getSiteDomain(),
|
||||
self.getSiteExampleURLs())
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','idn')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%d %B %Y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'indeath.net'
|
||||
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/blog/archive/123-story-in-death/"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
# http://www.indeath.net/blog/archive/169-ransom-in-death/
|
||||
return re.escape("http://")+re.escape(self.getSiteDomain())+r"/blog/(archive/)?(?P<id>\d+)\-(?P<name>[a-z0-9\-]*)/?$"
|
||||
|
||||
|
||||
def getDateFromComponents(self, postmonth, postday):
|
||||
ym = re.search("Entries\ in\ (?P<mon>January|February|March|April|May|June|July|August|September|October|November|December)\ (?P<year>\d{4})",postmonth)
|
||||
d = re.search("(?P<day>\d{2})\ (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)",postday)
|
||||
postdate = makeDate(d.group('day')+' '+ym.group('mon')+' '+ym.group('year'),self.dateformat)
|
||||
return postdate
|
||||
|
||||
def getAuthorData(self):
|
||||
|
||||
mainUrl = self.url.replace("/archive","")
|
||||
|
||||
try:
|
||||
maindata = self._fetchUrl(mainUrl)
|
||||
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.meta)
|
||||
else:
|
||||
raise e
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
mainsoup = bs.BeautifulSoup(maindata)
|
||||
|
||||
# find first entry
|
||||
e = mainsoup.find('div',{'class':"entry"})
|
||||
|
||||
# get post author as author
|
||||
d = e.find('div',{'class':"desc"})
|
||||
a = d.find('strong')
|
||||
self.story.setMetadata('author',a.contents[0].string.strip())
|
||||
|
||||
# Don't seem to be able to get author pages anymore
|
||||
self.story.setMetadata('authorUrl','http://www.indeath.net/')
|
||||
self.story.setMetadata('authorId','0')
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
url = self.url
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.meta)
|
||||
else:
|
||||
raise e
|
||||
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
h = soup.find('a', id="blog_title")
|
||||
t = h.find('span')
|
||||
self.story.setMetadata('title',stripHTML(t.contents[0]).strip())
|
||||
|
||||
s = t.find('div')
|
||||
if s != None:
|
||||
self.setDescription(url,s)
|
||||
|
||||
# Get Author from main blog page since it's not reliably on the archive page
|
||||
self.getAuthorData()
|
||||
|
||||
# Find the chapters:
|
||||
chapters=soup.findAll('a', title="View entry", href=re.compile(r'http://www.indeath.net/blog/'+self.story.getMetadata('storyId')+"/entry\-(\d+)\-([^/]*)/$"))
|
||||
|
||||
#reverse the list since newest at the top
|
||||
chapters.reverse()
|
||||
|
||||
# Get date published & updated from first & last entries
|
||||
posttable=soup.find('div', id="main_column")
|
||||
|
||||
postmonths=posttable.findAll('th', text=re.compile(r'Entries\ in\ '))
|
||||
postmonths.reverse()
|
||||
|
||||
postdates=posttable.findAll('span', _class="desc", text=re.compile('\d{2}\ (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'))
|
||||
postdates.reverse()
|
||||
|
||||
self.story.setMetadata('datePublished',self.getDateFromComponents(postmonths[0],postdates[0]))
|
||||
self.story.setMetadata('dateUpdated',self.getDateFromComponents(postmonths[len(postmonths)-1],postdates[len(postdates)-1]))
|
||||
|
||||
# Process List of Chapters
|
||||
self.story.setMetadata('numChapters',len(chapters))
|
||||
logger.debug("numChapters: (%s)"%self.story.getMetadata('numChapters'))
|
||||
for x in range(0,len(chapters)):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
chapter=chapters[x]
|
||||
if len(chapters)==1:
|
||||
self.chapterUrls.append((self.story.getMetadata('title'),chapter['href']))
|
||||
else:
|
||||
ct = stripHTML(chapter)
|
||||
tnew = re.match("(?i)"+self.story.getMetadata('title')+r" - (?P<newtitle>.*)$",ct)
|
||||
if tnew:
|
||||
chaptertitle = tnew.group('newtitle')
|
||||
else:
|
||||
chaptertitle = ct
|
||||
self.chapterUrls.append((chaptertitle,chapter['href']))
|
||||
|
||||
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
#chapter=bs.BeautifulSoup('<div class="story"></div>')
|
||||
data = self._fetchUrl(url)
|
||||
soup = bs.BeautifulSoup(data,selfClosingTags=('br','hr','span','center'))
|
||||
|
||||
chapter = soup.find("div", "entry_content")
|
||||
|
||||
if None == chapter:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,chapter)
|
||||
|
||||
315
fanficdownloader/adapters/adapter_ksarchivecom.py
Normal file
315
fanficdownloader/adapters/adapter_ksarchivecom.py
Normal file
|
|
@ -0,0 +1,315 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
# Search for XXX comments--that's where things are most likely to need changing.
|
||||
|
||||
# This function is called by the downloader in all adapter_*.py files
|
||||
# in this dir to register the adapter class. So it needs to be
|
||||
# updated to reflect the class below it. That, plus getSiteDomain()
|
||||
# take care of 'Registering'.
|
||||
def getClass():
|
||||
return KSArchiveComAdapter # XXX
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class KSArchiveComAdapter(BaseSiteAdapter): # XXX
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
# XXX Most sites don't have the /fanfic part. Replace all to remove it usually.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','ksa') # XXX
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%b/%d/%Y" # XXX
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return ['www.ksarchive.com','ksarchive.com']
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'ksarchive.com' # XXX
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return "http://(www.)?"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# Weirdly, different sites use different warning numbers.
|
||||
# If the title search below fails, there's a good chance
|
||||
# you need a different number. print data at that point
|
||||
# and see what the 'click here to continue' url says.
|
||||
|
||||
# Furthermore, there's a couple sites now with more than
|
||||
# one warning level for different ratings. And they're
|
||||
# fussy about it. midnightwhispers has three: 10, 3 & 5.
|
||||
# we'll try 5 first.
|
||||
addurl = "&ageconsent=ok&warning=2" # XXX
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
# The actual text that is used to announce you need to be an
|
||||
# adult varies from site to site. Again, print data before
|
||||
# the title search to troubleshoot.
|
||||
|
||||
# Since the warning text can change by warning level, let's
|
||||
# look for the warning pass url. ksarchive uses
|
||||
# &warning= -- actually, so do other sites. Must be an
|
||||
# eFiction book.
|
||||
|
||||
# viewstory.php?sid=1882&warning=4
|
||||
# viewstory.php?sid=1654&ageconsent=ok&warning=5
|
||||
#print data
|
||||
#m = re.search(r"'viewstory.php\?sid=1882(&warning=4)'",data)
|
||||
m = re.search(r"'viewstory.php\?sid=\d+((?:&ageconsent=ok)?&warning=\d+)'",data)
|
||||
if m != None:
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# We tried the default and still got a warning, so
|
||||
# let's pull the warning number from the 'continue'
|
||||
# link and reload data.
|
||||
addurl = m.group(1)
|
||||
# correct stupid & error in url.
|
||||
addurl = addurl.replace("&","&")
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL 2nd try: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a)) # title's inside a <b> tag.
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',stripHTML(a))
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = stripHTML(labelspan)
|
||||
|
||||
if 'Summary' in label:
|
||||
## Everything until the next span class='label'
|
||||
svalue = ""
|
||||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
# poor HTML(unclosed <p> for one) can cause run on
|
||||
# over the next label.
|
||||
if '<span class="label">' in svalue:
|
||||
svalue = svalue[0:svalue.find('<span class="label">')]
|
||||
break
|
||||
else:
|
||||
value = value.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
catstext = [stripHTML(cat) for cat in cats]
|
||||
for cat in catstext:
|
||||
# ran across one story with an empty <a href="browse.php?type=categories&catid=1"></a>
|
||||
# tag in the desc once.
|
||||
if cat and cat.strip() in ('Poetry','Essays'):
|
||||
self.story.addToList('category',stripHTML(cat))
|
||||
|
||||
if 'Characters' in label:
|
||||
self.story.addToList('characters','Kirk')
|
||||
self.story.addToList('characters','Spock')
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
charstext = [stripHTML(char) for char in chars]
|
||||
for char in charstext:
|
||||
self.story.addToList('characters',stripHTML(char))
|
||||
|
||||
## Not all sites use Genre, but there's no harm to
|
||||
## leaving it in. Check to make sure the type_id number
|
||||
## is correct, though--it's site specific.
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
|
||||
genrestext = [stripHTML(genre) for genre in genres]
|
||||
self.genre = ', '.join(genrestext)
|
||||
for genre in genrestext:
|
||||
self.story.addToList('genre',stripHTML(genre))
|
||||
|
||||
## In addition to Genre (which is very site specific) KSA
|
||||
## has 'Story Type', which is much more what most sites
|
||||
## call genre.
|
||||
if 'Story Type' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=5')) # XXX
|
||||
genrestext = [stripHTML(genre) for genre in genres]
|
||||
self.genre = ', '.join(genrestext)
|
||||
for genre in genrestext:
|
||||
self.story.addToList('genre',stripHTML(genre))
|
||||
|
||||
## Not all sites use Warnings, but there's no harm to
|
||||
## leaving it in. Check to make sure the type_id number
|
||||
## is correct, though--it's site specific.
|
||||
if 'Warnings' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
|
||||
warningstext = [stripHTML(warning) for warning in warnings]
|
||||
self.warning = ', '.join(warningstext)
|
||||
for warning in warningstext:
|
||||
self.story.addToList('warnings',stripHTML(warning))
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
# there's a stray [ at the end.
|
||||
#value = value[0:-1]
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = stripHTML(a)
|
||||
series_url = 'http://'+self.host+'/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
# skip 'report this' and 'TOC' links
|
||||
if 'contact.php' not in a['href'] and 'index' not in a['href']:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
data = self._fetchUrl(url)
|
||||
soup = bs.BeautifulStoneSoup(data,
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
if "A fatal MySQL error was encountered" in data:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Database error on the site reported!" % url)
|
||||
else:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
251
fanficdownloader/adapters/adapter_libraryofmoriacom.py
Normal file
251
fanficdownloader/adapters/adapter_libraryofmoriacom.py
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
|
||||
def getClass():
|
||||
return LibraryOfMoriaComAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class LibraryOfMoriaComAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/a/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','lom')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%B %d, %Y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'www.libraryofmoria.com'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/a/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/a/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
addurl = "&ageconsent=ok&warning=3"
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
m = re.search(r"'viewstory.php\?sid=\d+((?:&ageconsent=ok)?&warning=\d+)'",data)
|
||||
if m != None:
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# We tried the default and still got a warning, so
|
||||
# let's pull the warning number from the 'continue'
|
||||
# link and reload data.
|
||||
addurl = m.group(1)
|
||||
# correct stupid & error in url.
|
||||
addurl = addurl.replace("&","&")
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL 2nd try: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/a/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/a/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
if 'Summary' in label:
|
||||
## Everything until the next span class='label'
|
||||
svalue = ""
|
||||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
catstext = [cat.string for cat in cats]
|
||||
for cat in catstext:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
charstext = [char.string for char in chars]
|
||||
for char in charstext:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
if 'Type' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
|
||||
for genre in genres:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
if 'Warning' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=5'))
|
||||
for warning in warnings:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
# there's a stray [ at the end.
|
||||
#value = value[0:-1]
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/a/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
# skip 'report this' and 'TOC' links
|
||||
if 'contact.php' not in a['href'] and 'index' not in a['href']:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
267
fanficdownloader/adapters/adapter_literotica.py
Normal file
267
fanficdownloader/adapters/adapter_literotica.py
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2013 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
import urlparse
|
||||
import time
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class LiteroticaSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["utf8",
|
||||
"Windows-1252"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
|
||||
self.story.setMetadata('siteabbrev','litero')
|
||||
|
||||
# normalize to first chapter. Not sure if they ever have more than 2 digits.
|
||||
storyId = self.parsedUrl.path.split('/',)[2]
|
||||
# replace later chapters with first chapter but don't remove numbers
|
||||
# from the URL that disambiguate stories with the same title.
|
||||
storyId = re.sub("-ch-?\d\d", "", storyId)
|
||||
self.story.setMetadata('storyId', storyId)
|
||||
|
||||
## accept m(mobile)url, but use www.
|
||||
url = re.sub("^(www|german|spanish|french|dutch|italian|romanian|portuguese|other)\.i",
|
||||
"\1",
|
||||
url)
|
||||
|
||||
## strip ?page=...
|
||||
url = re.sub("\?page=.*$", "", url)
|
||||
|
||||
## set url
|
||||
self._setURL(url)
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = '%m/%d/%y'
|
||||
|
||||
@staticmethod
|
||||
def getSiteDomain():
|
||||
return 'literotica.com'
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return ['www.literotica.com',
|
||||
'www.i.literotica.com',
|
||||
'german.literotica.com',
|
||||
'german.i.literotica.com',
|
||||
'spanish.literotica.com',
|
||||
'spanish.i.literotica.com',
|
||||
'french.literotica.com',
|
||||
'french.i.literotica.com',
|
||||
'dutch.literotica.com',
|
||||
'dutch.i.literotica.com',
|
||||
'italian.literotica.com',
|
||||
'italian.i.literotica.com',
|
||||
'romanian.literotica.com',
|
||||
'romanian.i.literotica.com',
|
||||
'portuguese.literotica.com',
|
||||
'portuguese.i.literotica.com',
|
||||
'other.literotica.com',
|
||||
'other.i.literotica.com']
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://www.literotica.com/s/story-title https://www.literotica.com/s/story-title http://portuguese.literotica.com/s/story-title http://german.literotica.com/s/story-title"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return r"https?://(www|german|spanish|french|dutch|italian|romanian|portuguese|other)(\.i)?\.literotica\.com/s/([a-zA-Z0-9_-]+)"
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
"""
|
||||
NOTE: Some stories can have versions,
|
||||
e.g. /my-story-ch-05-version-10
|
||||
NOTE: If two stories share the same title, a running index is added,
|
||||
e.g.: /my-story-ch-02-1
|
||||
Strategy:
|
||||
* Go to author's page, search for the current story link,
|
||||
* If it's in a tr.root-story => One-part story
|
||||
* , get metadata and be done
|
||||
* If it's in a tr.sl => Chapter in series
|
||||
* Search up from there until we find a tr.ser-ttl (this is the
|
||||
story)
|
||||
* Gather metadata
|
||||
* Search down from there for all tr.sl until the next
|
||||
tr.ser-ttl, foreach
|
||||
* Chapter link is there
|
||||
"""
|
||||
|
||||
if not (self.is_adult or self.getConfig("is_adult")):
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
logger.debug("Chapter/Story URL: <%s> " % self.url)
|
||||
try:
|
||||
data1 = self._fetchUrl(self.url)
|
||||
soup1 = bs.BeautifulSoup(data1)
|
||||
#strip comments from soup
|
||||
[comment.extract() for comment in soup1.findAll(text=lambda text:isinstance(text, bs.Comment))]
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
# author
|
||||
a = soup1.find("span", "b-story-user-y")
|
||||
self.story.setMetadata('authorId', urlparse.parse_qs(a.a['href'].split('?')[1])['uid'][0])
|
||||
authorurl = a.a['href']
|
||||
if authorurl.startswith('//'):
|
||||
authorurl = self.parsedUrl.scheme+':'+authorurl
|
||||
self.story.setMetadata('authorUrl', authorurl)
|
||||
self.story.setMetadata('author', a.text)
|
||||
|
||||
# get the author page
|
||||
try:
|
||||
dataAuth = self._fetchUrl(authorurl)
|
||||
soupAuth = bs.BeautifulSoup(dataAuth)
|
||||
#strip comments from soup
|
||||
[comment.extract() for comment in soupAuth.findAll(text=lambda text:isinstance(text, bs.Comment))]
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(authorurl)
|
||||
else:
|
||||
raise e
|
||||
|
||||
## Find link to url in author's page
|
||||
## site has started using //domain.name/asdf urls remove https?: from front
|
||||
storyLink = soupAuth.find('a', href=self.url[self.url.index(':')+1:])
|
||||
|
||||
if storyLink is not None:
|
||||
urlTr = storyLink.parent.parent
|
||||
if urlTr['class'] == "sl":
|
||||
isSingleStory = False
|
||||
else:
|
||||
isSingleStory = True
|
||||
else:
|
||||
raise exceptions.FailedToDownload("Couldn't find story <%s> on author's page <%s>" % (url, authorurl))
|
||||
|
||||
if isSingleStory:
|
||||
self.story.setMetadata('title', storyLink.text)
|
||||
self.story.setMetadata('description', urlTr.findAll("td")[1].text)
|
||||
self.story.addToList('eroticatags', urlTr.findAll("td")[2].text)
|
||||
date = urlTr.findAll('td')[-1].text
|
||||
self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
|
||||
self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat))
|
||||
self.chapterUrls = [(storyLink.text, self.url)]
|
||||
else:
|
||||
seriesTr = urlTr.previousSibling
|
||||
while seriesTr['class'] != 'ser-ttl':
|
||||
seriesTr = seriesTr.previousSibling
|
||||
m = re.match("^(?P<title>.*?):\s(?P<numChapters>\d+)\sPart\sSeries$", seriesTr.find("strong").text)
|
||||
self.story.setMetadata('title', m.group('title'))
|
||||
|
||||
## Walk the chapters
|
||||
chapterTr = seriesTr.nextSibling
|
||||
self.chapterUrls = []
|
||||
dates = []
|
||||
descriptions = []
|
||||
while chapterTr is not None and chapterTr['class'] == 'sl':
|
||||
descriptions.append(chapterTr.findAll("td")[1].text)
|
||||
chapterLink = chapterTr.find("td", "fc").find("a")
|
||||
self.chapterUrls.append((chapterLink.text, "http:" + chapterLink["href"]))
|
||||
self.story.addToList('eroticatags', chapterTr.findAll("td")[2].text)
|
||||
dates.append(makeDate(chapterTr.findAll('td')[-1].text, self.dateformat))
|
||||
chapterTr = chapterTr.nextSibling
|
||||
|
||||
## Set description to joint chapter descriptions
|
||||
self.story.setMetadata('description', " / ".join(descriptions))
|
||||
|
||||
## Set the oldest date as publication date, the newest as update date
|
||||
dates.sort()
|
||||
self.story.setMetadata('datePublished', dates[0])
|
||||
self.story.setMetadata('dateUpdated', dates[-1])
|
||||
|
||||
# normalize on first chapter URL.
|
||||
self._setURL(self.chapterUrls[0][1])
|
||||
|
||||
self.story.setMetadata('numChapters', len(self.chapterUrls))
|
||||
|
||||
# set storyId to 'title-author' to avoid duplicates
|
||||
# self.story.setMetadata('storyId',
|
||||
# re.sub("[^a-z0-9]", "", self.story.getMetadata('title').lower())
|
||||
# + "-"
|
||||
# + re.sub("[^a-z0-9]", "", self.story.getMetadata('author').lower()))
|
||||
|
||||
return
|
||||
|
||||
def getChapterText(self, url):
|
||||
logger.debug('Getting chapter text from <%s>' % url)
|
||||
data1 = self._fetchUrl(url)
|
||||
# brute force approach to replace the wrapping <p> tag. If
|
||||
# done by changing tag name, it causes problems with nested
|
||||
# <p> tags.
|
||||
data1 = data1.replace('<div class="b-story-body-x x-r15"><div><p>','<div class="b-story-body-x x-r15"><div>')
|
||||
soup1 = bs.BeautifulSoup(data1)
|
||||
|
||||
#strip comments from soup
|
||||
[comment.extract() for comment in soup1.findAll(text=lambda text:isinstance(text, bs.Comment))]
|
||||
|
||||
# get story text
|
||||
story1 = soup1.find('div', 'b-story-body-x').div
|
||||
#print("story1:%s"%story1)
|
||||
# story1.name='div'
|
||||
story1.append('<br />')
|
||||
storytext = self.utf8FromSoup(url,story1)
|
||||
|
||||
# find num pages
|
||||
pgs = int(soup1.find("span", "b-pager-caption-t r-d45").string.split(' ')[0])
|
||||
logger.debug("pages: "+str(pgs))
|
||||
|
||||
# get all the pages
|
||||
for i in xrange(2, pgs+1):
|
||||
try:
|
||||
logger.debug("fetching page "+str(i))
|
||||
time.sleep(0.5)
|
||||
data2 = self._fetchUrl(url, {'page': i})
|
||||
# brute force approach to replace the wrapping <p> tag. If
|
||||
# done by changing tag name, it causes problems with nested
|
||||
# <p> tags.
|
||||
data2 = data2.replace('<div class="b-story-body-x x-r15"><div><p>','<div class="b-story-body-x x-r15"><div>')
|
||||
soup2 = bs.BeautifulSoup(data2)
|
||||
[comment.extract() for comment in soup2.findAll(text=lambda text:isinstance(text, bs.Comment))]
|
||||
story2 = soup2.find('div', 'b-story-body-x').div
|
||||
# story2.name='div'
|
||||
story2.append('<br />')
|
||||
storytext += self.utf8FromSoup(url,story2)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(url)
|
||||
else:
|
||||
raise e
|
||||
return storytext
|
||||
|
||||
|
||||
def getClass():
|
||||
return LiteroticaSiteAdapter
|
||||
|
||||
|
||||
|
||||
36
fanficdownloader/adapters/adapter_lotrfanfictioncom.py
Normal file
36
fanficdownloader/adapters/adapter_lotrfanfictioncom.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
from base_efiction_adapter import BaseEfictionAdapter
|
||||
|
||||
class TheLOTRFanFictionSiteAdapter(BaseEfictionAdapter):
|
||||
|
||||
@staticmethod
|
||||
def getSiteDomain():
|
||||
return 'lotrfanfiction.com'
|
||||
|
||||
@classmethod
|
||||
def getSiteAbbrev(seluuf):
|
||||
return 'lotrff'
|
||||
|
||||
@classmethod
|
||||
def getDateFormat(self):
|
||||
return "%d/%m/%y"
|
||||
|
||||
def getClass():
|
||||
return TheLOTRFanFictionSiteAdapter
|
||||
238
fanficdownloader/adapters/adapter_lumossycophanthexcom.py
Normal file
238
fanficdownloader/adapters/adapter_lumossycophanthexcom.py
Normal file
|
|
@ -0,0 +1,238 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
|
||||
return LumosSycophantHexComAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class LumosSycophantHexComAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','lsph')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%m/%d/%Y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'lumos.sycophanthex.com'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# Weirdly, different sites use different warning numbers.
|
||||
# If the title search below fails, there's a good chance
|
||||
# you need a different number. print data at that point
|
||||
# and see what the 'click here to continue' url says.
|
||||
addurl = "&ageconsent=ok&warning=19"
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
# The actual text that is used to announce you need to be an
|
||||
# adult varies from site to site. Again, print data before
|
||||
# the title search to troubleshoot.
|
||||
if "Age Consent Required" in data:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
pt = soup.find('div', {'id' : 'pagetitle'})
|
||||
a = pt.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = pt.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
rating=pt.text.split('(')[1].split(')')[0]
|
||||
self.story.setMetadata('rating', rating)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
|
||||
value = labels[0].previousSibling
|
||||
svalue = ""
|
||||
while value != None:
|
||||
val = value
|
||||
value = value.previousSibling
|
||||
while not defaultGetattr(val,'class') == 'label':
|
||||
svalue += str(val)
|
||||
val = val.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value.split(' -')[0])
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
for cat in cats:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
for char in chars:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
|
||||
for genre in genres:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
if 'Warnings' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
|
||||
for warning in warnings:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Complete' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value.split(' -')[0]), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
# there's a stray [ at the end.
|
||||
#value = value[0:-1]
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
237
fanficdownloader/adapters/adapter_mediaminerorg.py
Normal file
237
fanficdownloader/adapters/adapter_mediaminerorg.py
Normal file
|
|
@ -0,0 +1,237 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
self.story.setMetadata('siteabbrev','mm')
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
|
||||
# get storyId from url--url validation guarantees query correct
|
||||
m = re.match(self.getSiteURLPattern(),url)
|
||||
if m:
|
||||
self.story.setMetadata('storyId',m.group('id'))
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/fanfic/view_st.php/'+self.story.getMetadata('storyId'))
|
||||
else:
|
||||
raise exceptions.InvalidStoryURL(url,
|
||||
self.getSiteDomain(),
|
||||
self.getSiteExampleURLs())
|
||||
|
||||
@staticmethod
|
||||
def getSiteDomain():
|
||||
return 'www.mediaminer.org'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/fanfic/view_st.php/123456 http://"+cls.getSiteDomain()+"/fanfic/view_ch.php/1234123/123444#fic_c"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
## http://www.mediaminer.org/fanfic/view_st.php/76882
|
||||
## http://www.mediaminer.org/fanfic/view_ch.php/167618/594087#fic_c
|
||||
return re.escape("http://"+self.getSiteDomain())+\
|
||||
"/fanfic/view_(st|ch)\.php/"+r"(?P<id>\d+)(/\d+(#fic_c)?)?$"
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
url = self.url
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
|
||||
# [ A - All Readers ], strip '[' ']'
|
||||
## Above title because we remove the smtxt font to get title.
|
||||
smtxt = soup.find("font",{"class":"smtxt"})
|
||||
if not smtxt:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
rating = smtxt.string[1:-1]
|
||||
self.story.setMetadata('rating',rating)
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"/fanfic/src.php/u/\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('/')[-1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
## Title - Good grief. Title varies by chaptered, 1chapter and 'type=one shot'--and even 'one-shot's can have titled chapter.
|
||||
## But, if colspan=2, there's no chapter title.
|
||||
## <td class="ffh">Atmosphere: Chapter 1</b> <font class="smtxt">[ P - Pre-Teen ]</font></td>
|
||||
## <td colspan=2 class="ffh">Hearts of Ice <font class="smtxt">[ P - Pre-Teen ]</font></td>
|
||||
## <td colspan=2 class="ffh">Suzaku no Princess <font class="smtxt">[ P - Pre-Teen ]</font></td>
|
||||
## <td class="ffh">The Kraut, The Bartender, and The Drunkard: Chapter 1</b> <font class="smtxt">[ P - Pre-Teen ]</font></td>
|
||||
## <td class="ffh">Betrayal and Justice: A Cold Heart</b> <font size="-1">( Chapter 1 )</font> <font class="smtxt">[ A - All Readers ]</font></td>
|
||||
## <td class="ffh">Question and Answer: Question and Answer</b> <font size="-1">( One-Shot )</font> <font class="smtxt">[ A - All Readers ]</font></td>
|
||||
title = soup.find('td',{'class':'ffh'})
|
||||
for font in title.findAll('font'):
|
||||
font.extract() # removes 'font' tags from inside the td.
|
||||
if title.has_key('colspan'):
|
||||
titlet = stripHTML(title)
|
||||
else:
|
||||
## No colspan, it's part chapter title--even if it's a one-shot.
|
||||
titlet = ':'.join(stripHTML(title).split(':')[:-1]) # strip trailing 'Chapter X' or chapter title
|
||||
self.story.setMetadata('title',titlet)
|
||||
## The story title is difficult to reliably parse from the
|
||||
## story pages. Getting it from the author page is, but costs
|
||||
## another fetch.
|
||||
# authsoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
|
||||
# titlea = authsoup.find('a',{'href':'/fanfic/view_st.php/'+self.story.getMetadata('storyId')})
|
||||
# self.story.setMetadata('title',titlea.text)
|
||||
|
||||
# save date from first for later.
|
||||
firstdate=None
|
||||
|
||||
# Find the chapters
|
||||
select = soup.find('select',{'name':'cid'})
|
||||
if not select:
|
||||
self.chapterUrls.append(( self.story.getMetadata('title'),self.url))
|
||||
else:
|
||||
for option in select.findAll("option"):
|
||||
chapter = stripHTML(option.string)
|
||||
## chapter can be: Chapter 7 [Jan 23, 2011]
|
||||
## or: Vigilant Moonlight ( Chapter 1 ) [Jan 30, 2004]
|
||||
## or even: Prologue ( Prologue ) [Jul 31, 2010]
|
||||
m = re.match(r'^(.*?) (\( .*? \) )?\[(.*?)\]$',chapter)
|
||||
chapter = m.group(1)
|
||||
# save date from first for later.
|
||||
if not firstdate:
|
||||
firstdate = m.group(3)
|
||||
self.chapterUrls.append((chapter,'http://'+self.host+'/fanfic/view_ch.php/'+self.story.getMetadata('storyId')+'/'+option['value']))
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# category
|
||||
# <a href="/fanfic/src.php/a/567">Ranma 1/2</a>
|
||||
for a in soup.findAll('a',href=re.compile(r"^/fanfic/src.php/a/")):
|
||||
self.story.addToList('category',a.string)
|
||||
|
||||
# genre
|
||||
# <a href="/fanfic/src.php/a/567">Ranma 1/2</a>
|
||||
for a in soup.findAll('a',href=re.compile(r"^/fanfic/src.php/g/")):
|
||||
self.story.addToList('genre',a.string)
|
||||
|
||||
# if firstdate, then the block below will only have last updated.
|
||||
if firstdate:
|
||||
self.story.setMetadata('datePublished', makeDate(firstdate, "%b %d, %Y"))
|
||||
# Everything else is in <tr bgcolor="#EEEED4">
|
||||
|
||||
metastr = stripHTML(soup.find("tr",{"bgcolor":"#EEEED4"})).replace('\n',' ').replace('\r',' ').replace('\t',' ')
|
||||
# Latest Revision: August 03, 2010
|
||||
m = re.match(r".*?(?:Latest Revision|Uploaded On): ([a-zA-Z]+ \d\d, \d\d\d\d)",metastr)
|
||||
if m:
|
||||
self.story.setMetadata('dateUpdated', makeDate(m.group(1), "%B %d, %Y"))
|
||||
if not firstdate:
|
||||
self.story.setMetadata('datePublished',
|
||||
self.story.getMetadataRaw('dateUpdated'))
|
||||
|
||||
else:
|
||||
self.story.setMetadata('dateUpdated',
|
||||
self.story.getMetadataRaw('datePublished'))
|
||||
|
||||
# Words: 123456
|
||||
m = re.match(r".*?\| Words: (\d+) \|",metastr)
|
||||
if m:
|
||||
self.story.setMetadata('numWords', m.group(1))
|
||||
|
||||
# Summary: ....
|
||||
m = re.match(r".*?Summary: (.*)$",metastr)
|
||||
if m:
|
||||
self.setDescription(url, m.group(1))
|
||||
#self.story.setMetadata('description', m.group(1))
|
||||
|
||||
# completed
|
||||
m = re.match(r".*?Status: Completed.*?",metastr)
|
||||
if m:
|
||||
self.story.setMetadata('status','Completed')
|
||||
else:
|
||||
self.story.setMetadata('status','In-Progress')
|
||||
|
||||
return
|
||||
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
data=self._fetchUrl(url)
|
||||
soup = bs.BeautifulStoneSoup(data,
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
anchor = soup.find('a',{'name':'fic_c'})
|
||||
|
||||
if None == anchor:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
## find divs with align=left, those are paragraphs in newer stories.
|
||||
divlist = anchor.findAllNext('div',{'align':'left'})
|
||||
if divlist:
|
||||
for div in divlist:
|
||||
div.name='p' # convert to <p> mediaminer uses div with
|
||||
# a margin for paragraphs.
|
||||
anchor.append(div) # cheat! stuff all the content
|
||||
# divs into anchor just as a
|
||||
# holder.
|
||||
del div['style']
|
||||
del div['align']
|
||||
anchor.name='div'
|
||||
return self.utf8FromSoup(url,anchor)
|
||||
|
||||
else:
|
||||
logger.debug('Using kludgey text find for older mediaminer story.')
|
||||
## Some older mediaminer stories are unparsable with BeautifulSoup.
|
||||
## Really nasty formatting. Sooo... Cheat! Parse it ourselves a bit first.
|
||||
## Story stuff falls between:
|
||||
data = "<div id='HERE'>" + data[data.find('<a name="fic_c">'):] +"</div>"
|
||||
soup = bs.BeautifulStoneSoup(data,
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
for tag in soup.findAll('td',{'class':'ffh'}) + \
|
||||
soup.findAll('div',{'class':'acl'}) + \
|
||||
soup.findAll('div',{'class':'footer smtxt'}) + \
|
||||
soup.findAll('table',{'class':'tbbrdr'}):
|
||||
tag.extract() # remove tag from soup.
|
||||
|
||||
return self.utf8FromSoup(url,soup)
|
||||
|
||||
|
||||
def getClass():
|
||||
return MediaMinerOrgSiteAdapter
|
||||
|
||||
294
fanficdownloader/adapters/adapter_merlinficdtwinscouk.py
Normal file
294
fanficdownloader/adapters/adapter_merlinficdtwinscouk.py
Normal file
|
|
@ -0,0 +1,294 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2012 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
|
||||
return MerlinFicDtwinsCoUk
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class MerlinFicDtwinsCoUk(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','mrfd')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%b %d, %Y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'merlinfic.dtwins.co.uk'
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites.
|
||||
def needToLoginCheck(self, data):
|
||||
if 'Registered Users Only' in data \
|
||||
or 'There is no such account on our website' in data \
|
||||
or "That password doesn't match the one in our database" in data:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def performLogin(self, url):
|
||||
params = {}
|
||||
|
||||
if self.password:
|
||||
params['penname'] = self.username
|
||||
params['password'] = self.password
|
||||
else:
|
||||
params['penname'] = self.getConfig("username")
|
||||
params['password'] = self.getConfig("password")
|
||||
params['cookiecheck'] = '1'
|
||||
params['submit'] = 'Submit'
|
||||
|
||||
loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
|
||||
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
|
||||
params['penname']))
|
||||
|
||||
d = self._fetchUrl(loginUrl, params)
|
||||
|
||||
if "Member Account" not in d : #Member Account
|
||||
logger.info("Failed to login to URL %s as %s" % (loginUrl,
|
||||
params['penname']))
|
||||
raise exceptions.FailedToLogin(url,params['penname'])
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# Weirdly, different sites use different warning numbers.
|
||||
# If the title search below fails, there's a good chance
|
||||
# you need a different number. print data at that point
|
||||
# and see what the 'click here to continue' url says.
|
||||
addurl = "&ageconsent=ok&warning=4"
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if self.needToLoginCheck(data):
|
||||
# need to log in for this one.
|
||||
self.performLogin(url)
|
||||
data = self._fetchUrl(url)
|
||||
|
||||
m = re.search(r"'viewstory.php\?sid=\d+((?:&ageconsent=ok)?&warning=\d+)'",data)
|
||||
if m != None:
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# We tried the default and still got a warning, so
|
||||
# let's pull the warning number from the 'continue'
|
||||
# link and reload data.
|
||||
addurl = m.group(1)
|
||||
# correct stupid & error in url.
|
||||
addurl = addurl.replace("&","&")
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL 2nd try: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
if 'Summary' in label:
|
||||
## Everything until the next span class='label'
|
||||
svalue = ""
|
||||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
for cat in cats:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
for char in chars:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
if 'Pairing' in label:
|
||||
ships = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
|
||||
for ship in ships:
|
||||
self.story.addToList('ships',ship.string)
|
||||
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=3'))
|
||||
for genre in genres:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
if 'Warnings' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
|
||||
for warning in warnings:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
# skip 'report this' and 'TOC' links
|
||||
if 'contact.php' not in a['href'] and 'index' not in a['href']:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
290
fanficdownloader/adapters/adapter_midnightwhispersca.py
Normal file
290
fanficdownloader/adapters/adapter_midnightwhispersca.py
Normal file
|
|
@ -0,0 +1,290 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
# Search for XXX comments--that's where things are most likely to need changing.
|
||||
|
||||
# This function is called by the downloader in all adapter_*.py files
|
||||
# in this dir to register the adapter class. So it needs to be
|
||||
# updated to reflect the class below it. That, plus getSiteDomain()
|
||||
# take care of 'Registering'.
|
||||
def getClass():
|
||||
return MidnightwhispersCaAdapter # XXX
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class MidnightwhispersCaAdapter(BaseSiteAdapter): # XXX
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
# XXX Most sites don't have the /fanfic part. Replace all to remove it usually.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','mw') # XXX
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%B %d, %Y" # XXX
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'www.midnightwhispers.ca' # XXX
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# Weirdly, different sites use different warning numbers.
|
||||
# If the title search below fails, there's a good chance
|
||||
# you need a different number. print data at that point
|
||||
# and see what the 'click here to continue' url says.
|
||||
|
||||
# Furthermore, there's a couple sites now with more than
|
||||
# one warning level for different ratings. And they're
|
||||
# fussy about it. midnightwhispers has three: 10, 3 & 5.
|
||||
# we'll try 5 first.
|
||||
addurl = "&ageconsent=ok&warning=5" # XXX
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
# The actual text that is used to announce you need to be an
|
||||
# adult varies from site to site. Again, print data before
|
||||
# the title search to troubleshoot.
|
||||
|
||||
# Since the warning text can change by warning level, let's
|
||||
# look for the warning pass url. nfacommunity uses
|
||||
# &warning= -- actually, so do other sites. Must be an
|
||||
# eFiction book.
|
||||
|
||||
# viewstory.php?sid=1882&warning=4
|
||||
# viewstory.php?sid=1654&ageconsent=ok&warning=5
|
||||
#print data
|
||||
#m = re.search(r"'viewstory.php\?sid=1882(&warning=4)'",data)
|
||||
m = re.search(r"'viewstory.php\?sid=\d+((?:&ageconsent=ok)?&warning=\d+)'",data)
|
||||
if m != None:
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# We tried the default and still got a warning, so
|
||||
# let's pull the warning number from the 'continue'
|
||||
# link and reload data.
|
||||
addurl = m.group(1)
|
||||
# correct stupid & error in url.
|
||||
addurl = addurl.replace("&","&")
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL 2nd try: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a)) # title's inside a <b> tag.
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
if 'Summary' in label:
|
||||
## Everything until the next span class='label'
|
||||
svalue = ""
|
||||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
catstext = [cat.string for cat in cats]
|
||||
for cat in catstext:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
charstext = [char.string for char in chars]
|
||||
for char in charstext:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
## Not all sites use Genre, but there's no harm to
|
||||
## leaving it in. Check to make sure the type_id number
|
||||
## is correct, though--it's site specific.
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
|
||||
genrestext = [genre.string for genre in genres]
|
||||
self.genre = ', '.join(genrestext)
|
||||
for genre in genrestext:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
## Not all sites use Warnings, but there's no harm to
|
||||
## leaving it in. Check to make sure the type_id number
|
||||
## is correct, though--it's site specific.
|
||||
if 'Warnings' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
|
||||
warningstext = [warning.string for warning in warnings]
|
||||
self.warning = ', '.join(warningstext)
|
||||
for warning in warningstext:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
# there's a stray [ at the end.
|
||||
#value = value[0:-1]
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
# skip 'report this' and 'TOC' links
|
||||
if 'contact.php' not in a['href'] and 'index' not in a['href']:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
data = self._fetchUrl(url)
|
||||
soup = bs.BeautifulStoneSoup(data,
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
if "A fatal MySQL error was encountered" in data:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Database error on the site reported!" % url)
|
||||
else:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
336
fanficdownloader/adapters/adapter_mugglenetcom.py
Normal file
336
fanficdownloader/adapters/adapter_mugglenetcom.py
Normal file
|
|
@ -0,0 +1,336 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2011 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
# This function is called by the downloader in all adapter_*.py files
|
||||
# in this dir to register the adapter class. So it needs to be
|
||||
# updated to reflect the class below it. That, plus getSiteDomain()
|
||||
# take care of 'Registering'.
|
||||
def getClass():
    """Return the adapter class defined in this module (adapter registration hook)."""
    return MuggleNetComAdapter # XXX
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class MuggleNetComAdapter(BaseSiteAdapter): # XXX
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','mgln') # XXX
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%m/%d/%y" # XXX
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain.
|
||||
return 'fanfiction.mugglenet.com'
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return ['fanfiction.mugglenet.com','fanfic.mugglenet.com']
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return re.escape("http://")+r"fanfic(tion)?\.mugglenet\.com"+re.escape("/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites.
|
||||
def needToLoginCheck(self, data):
|
||||
if "class='errortext'>Registered Users Only" in data \
|
||||
or 'There is no such account on our website' in data \
|
||||
or "That password doesn't match the one in our database" in data:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def performLogin(self, url):
|
||||
params = {}
|
||||
|
||||
if self.password:
|
||||
params['penname'] = self.username
|
||||
params['password'] = self.password
|
||||
else:
|
||||
params['penname'] = self.getConfig("username")
|
||||
params['password'] = self.getConfig("password")
|
||||
params['cookiecheck'] = '1'
|
||||
params['submit'] = 'Submit'
|
||||
|
||||
loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login&sid='+self.story.getMetadata('storyId')
|
||||
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
|
||||
params['penname']))
|
||||
|
||||
d = self._fetchUrl(loginUrl, params)
|
||||
|
||||
if "Member Account" not in d : #Member Account
|
||||
logger.info("Failed to login to URL %s as %s" % (loginUrl,
|
||||
params['penname']))
|
||||
raise exceptions.FailedToLogin(url,params['penname'])
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# Weirdly, different sites use different warning numbers.
|
||||
# If the title search below fails, there's a good chance
|
||||
# you need a different number. print data at that point
|
||||
# and see what the 'click here to continue' url says.
|
||||
# http://fanfiction.mugglenet.com/viewstory.php?sid=91079&ageconsent=ok&warning=3
|
||||
addurl = "&ageconsent=ok&warning=3" # XXX &warning=5
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if self.needToLoginCheck(data):
|
||||
# need to log in for this one.
|
||||
self.performLogin(url)
|
||||
data = self._fetchUrl(url)
|
||||
#print("\nurl:%s\ndata:\n%s\n"%(url,data))
|
||||
|
||||
# The actual text that is used to announce you need to be an
|
||||
# adult varies from site to site. Again, print data before
|
||||
# the title search to troubleshoot.
|
||||
|
||||
# Since the warning text can change by warning level, let's
|
||||
# look for the warning pass url. nfacommunity uses
|
||||
# &warning= -- actually, so do other sites. Must be an
|
||||
# eFiction book.
|
||||
|
||||
# viewstory.php?sid=1882&warning=4
|
||||
# viewstory.php?sid=1654&ageconsent=ok&warning=5
|
||||
#print data
|
||||
#m = re.search(r"'viewstory.php\?sid=1882(&warning=4)'",data)
|
||||
m = re.search(r"'viewstory.php\?sid=%s((?:&ageconsent=ok)?&warning=\d+)'"%self.story.getMetadata('storyId'),data)
|
||||
if m != None:
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
# We tried the default and still got a warning, so
|
||||
# let's pull the warning number from the 'continue'
|
||||
# link and reload data.
|
||||
addurl = m.group(1)
|
||||
# correct stupid & error in url.
|
||||
addurl = addurl.replace("&","&")
|
||||
url = self.url+'&index=1'+addurl
|
||||
logger.debug("URL 2nd try: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',stripHTML(a))
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
|
||||
self.story.setMetadata('author',a.string)
|
||||
|
||||
# Find the chapters:
|
||||
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
|
||||
# Not good enough-- content can contain a ('), which ends the content prematurely.
|
||||
# metadesc = soup.find('meta',{'name':'description'})
|
||||
# print("removeAllEntities(metadesc['content']):\n%s\n"%removeAllEntities(metadesc['content']))
|
||||
start='<span class="label">Summary: </span>'
|
||||
end='<span class="label">Rated:</span>'
|
||||
summarydata = data[data.index(start)+len(start):data.index(end)]
|
||||
#print("summarydata:\n%s\n"%summarydata)
|
||||
self.setDescription(url,bs.BeautifulSoup(summarydata))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
# not good enough--poorly formated summary html will break it.
|
||||
# if 'Summary' in label:
|
||||
# ## Everything until the next span class='label'
|
||||
# svalue = ""
|
||||
# while not defaultGetattr(value,'class') == 'label':
|
||||
# svalue += str(value)
|
||||
# value = value.nextSibling
|
||||
# self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
||||
if 'Word count' in label:
|
||||
self.story.setMetadata('numWords', value)
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
catstext = [cat.string for cat in cats]
|
||||
for cat in catstext:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
|
||||
charstext = [char.string for char in chars]
|
||||
for char in charstext:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
## Not all sites use Genre, but there's no harm to
|
||||
## leaving it in. Check to make sure the type_id number
|
||||
## is correct, though--it's site specific.
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
|
||||
genrestext = [genre.string for genre in genres]
|
||||
self.genre = ', '.join(genrestext)
|
||||
for genre in genrestext:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
## Not all sites use Warnings, but there's no harm to
|
||||
## leaving it in. Check to make sure the type_id number
|
||||
## is correct, though--it's site specific.
|
||||
if 'Warnings' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
|
||||
warningstext = [warning.string for warning in warnings]
|
||||
self.warning = ', '.join(warningstext)
|
||||
for warning in warningstext:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
else:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
|
||||
if 'Published' in label:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
if 'Updated' in label:
|
||||
# there's a stray [ at the end.
|
||||
#value = value[0:-1]
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
|
||||
series_name = a.string
|
||||
series_url = 'http://'+self.host+'/'+a['href']
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
|
||||
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
|
||||
i=1
|
||||
for a in storyas:
|
||||
# skip 'report this' and 'TOC' links
|
||||
if 'contact.php' not in a['href'] and 'index' not in a['href']:
|
||||
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl',series_url)
|
||||
break
|
||||
i+=1
|
||||
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
213
fanficdownloader/adapters/adapter_nationallibrarynet.py
Normal file
213
fanficdownloader/adapters/adapter_nationallibrarynet.py
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2012 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Return the adapter class defined in this module (adapter registration hook)."""
    return NationalLibraryNetAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class NationalLibraryNetAdapter(BaseSiteAdapter):
    """eFiction-style adapter for national-library.net.

    The site has no login or adult gate; metadata is scraped from <b>
    label tags on the story page plus the author-results listing.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        # if left empty, site doesn't return any message at all.
        self.username = "NoneGiven"
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only storyid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?storyid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','ntlb')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m-%d-%y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        return 'national-library.net'

    @classmethod
    def getAcceptDomains(cls):
        return ['www.national-library.net','national-library.net']

    @classmethod
    def getSiteExampleURLs(cls):
        # ONLY the stories archived on or after June 17, 2006 and that are hosted on the website:
        return "http://"+cls.getSiteDomain()+"/viewstory.php?storyid=1234"

    def getSiteURLPattern(self):
        return re.escape("http://")+"(www\.)?"+re.escape(self.getSiteDomain()+"/viewstory.php?storyid=")+r"\d+$"


    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):

        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title -- the page's <h1> heading.
        a = soup.find('h1')
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"authorresults.php\?author=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters: use only the first <p> that contains
        # chapter links (break after it) to avoid duplicate listings.
        for p in soup.findAll('p'):
            chapters = p.findAll('a', href=re.compile(r'viewstory.php\?storyid='+self.story.getMetadata('storyId')+"&chapnum=\d+$"))
            if len(chapters) > 0:
                for chapter in chapters:
                    # just in case there's tags, like <i> in chapter titles.
                    self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']))
                break

        self.story.setMetadata('numChapters',len(self.chapterUrls))
        # the archive only hosts completed stories.
        self.story.setMetadata('status', 'Completed')

        # metadata labels are plain <b> tags; skip the first two
        # (title/author) and read the text following each label.
        labels = soup.findAll('b')
        for x in range(2,len(labels)):
            value = labels[x].nextSibling
            label = labels[x].string

            if 'Summary' in label:
                self.setDescription(url,value)

            if 'Rating' in label:
                self.story.setMetadata('rating', stripHTML(value.nextSibling))

            if 'Word Count' in label:
                self.story.setMetadata('numWords', value.string)

            if 'Category' in label:
                for cat in value.string.split(', '):
                    self.story.addToList('category',cat)
            if 'Crossover Shows' in label:
                for cat in value.string.split(', '):
                    if "No Show" not in cat:
                        self.story.addToList('category',cat)

            if 'Character' in label:
                for char in value.string.split(', '):
                    self.story.addToList('characters',char)

            if 'Pairing' in label:
                for char in value.string.split(', '):
                    self.story.addToList('ships',char)

            if 'Warnings' in label:
                for warning in value.string.split(', '):
                    self.story.addToList('warnings',warning)

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Series' in label:
                # series index comes from the text after the link,
                # stripped of its leading ", " ([2:]).
                self.setSeries(stripHTML(value.nextSibling), value.nextSibling.nextSibling.string[2:])
                self.story.setMetadata('seriesUrl','http://'+self.host+'/'+value.nextSibling['href'])

        # Genre and archive dates only appear on the author-results page.
        asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
        story=asoup.find('a', href=re.compile(r'viewstory.php\?storyid='+self.story.getMetadata('storyId')))

        a=story.findNext(text=re.compile('Genre')).parent.nextSibling.string.split(', ')
        for genre in a:
            # NOTE(review): setMetadata in a loop keeps only the last
            # genre; addToList('genre', ...) looks intended -- confirm.
            self.story.setMetadata('genre', genre)

        # 'Archived' date is both published and updated date.
        a=story.findNext(text=re.compile('Archived'))
        self.story.setMetadata('datePublished', makeDate(stripHTML(a.parent.nextSibling), self.dateformat))
        self.story.setMetadata('dateUpdated', makeDate(stripHTML(a.parent.nextSibling), self.dateformat))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page, strip site chrome, and return the remaining <div>."""
        logger.debug('Getting chapter text from: %s' % url)

        # Declare br/hr self-closing, otherwise soup eats the br/hr tags.
        soup = bs.BeautifulSoup(self._fetchUrl(url),
                                selfClosingTags=('br','hr'))

        # grab the first div before mutating the tree below.
        div = soup.find('div')

        # bit messy since higly inconsistent: remove headers, footers,
        # navigation links and centered decoration around the text.
        for p in soup.findAll('p', {'align' : 'center'}):
            p.extract()
        p = soup.findAll('p')
        # first three <p>s are page header/metadata.
        for x in range(0,3):
            p[x].extract()
        if "Chapters: " in stripHTML(p[3]):
            p[3].extract()
        # drop the second-to-last paragraph (footer).
        for x in range(len(p)-2,len(p)-1):
            p[x].extract()

        for p in soup.findAll('h1'):
            p.extract()
        for p in soup.findAll('h3'):
            p.extract()
        for p in soup.findAll('a'):
            p.extract()

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)
|
||||
219
fanficdownloader/adapters/adapter_ncisficcom.py
Normal file
219
fanficdownloader/adapters/adapter_ncisficcom.py
Normal file
|
|
@ -0,0 +1,219 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2012 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Software: eFiction
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    """Return the adapter class defined in this module (adapter registration hook)."""
    return NCISFicComAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class NCISFicComAdapter(BaseSiteAdapter):
    """eFiction-style adapter for ncisfic.com.

    Very close to the national-library.net adapter: <b>-label metadata
    on the story page, with genre/status/dates taken from the
    author-results listing.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        # if left empty, site doesn't return any message at all.
        self.username = "NoneGiven"
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only storyid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?storyid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','ncisf')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m-%d-%y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        return 'ncisfic.com'

    @classmethod
    def getAcceptDomains(cls):
        return ['www.ncisfic.com','ncisfic.com']

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://"+cls.getSiteDomain()+"/viewstory.php?storyid=1234"

    def getSiteURLPattern(self):
        return re.escape("http://")+"(www\.)?"+re.escape(self.getSiteDomain()+"/viewstory.php?storyid=")+r"\d+$"


    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):

        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title -- the page's <h1> heading.
        a = soup.find('h1')
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"authorresults.php\?author=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters: use only the first <p> that contains
        # chapter links (break after it) to avoid duplicate listings.
        for p in soup.findAll('p'):
            chapters = p.findAll('a', href=re.compile(r'viewstory.php\?storyid='+self.story.getMetadata('storyId')+"&chapnum=\d+$"))
            if len(chapters) > 0:
                for chapter in chapters:
                    # just in case there's tags, like <i> in chapter titles.
                    self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']))
                break

        self.story.setMetadata('numChapters',len(self.chapterUrls))
        # default; overwritten from the author listing further down.
        self.story.setMetadata('status', 'Completed')

        # metadata labels are plain <b> tags; skip the first two
        # (title/author) and read the text following each label.
        labels = soup.findAll('b')
        for x in range(2,len(labels)):
            value = labels[x].nextSibling
            label = labels[x].string

            if 'Summary' in label:
                self.setDescription(url,value)

            if 'Rating' in label:
                self.story.setMetadata('rating', stripHTML(value.nextSibling))

            if 'Word Count' in label:
                self.story.setMetadata('numWords', value.string)

            if 'Category' in label:
                for cat in value.string.split(', '):
                    self.story.addToList('category',cat)
            if 'Crossover Shows' in label:
                for cat in value.string.split(', '):
                    if "No Show" not in cat:
                        self.story.addToList('category',cat)

            if 'Character' in label:
                for char in value.string.split(', '):
                    self.story.addToList('characters',char)

            if 'Pairing' in label:
                for char in value.string.split(', '):
                    self.story.addToList('ships',char)

            if 'Warnings' in label:
                for warning in value.string.split(', '):
                    self.story.addToList('warnings',warning)

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Series' in label:
                # the site uses a "No Series" placeholder.
                if "No Series" not in value.nextSibling.string:
                    # series index comes from the text after the link,
                    # stripped of its leading ", " ([2:]).
                    self.setSeries(stripHTML(value.nextSibling), value.nextSibling.nextSibling.string[2:])
                    self.story.setMetadata('seriesUrl','http://'+self.host+'/'+value.nextSibling['href'])

        # Status, genre and archive dates only appear on the
        # author-results page.
        asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
        story=asoup.find('a', href=re.compile(r'viewstory.php\?storyid='+self.story.getMetadata('storyId')))

        a=story.findNext('font')
        if 'Complete' in a.string:
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        a=story.findNext(text=re.compile('Genre')).parent.nextSibling.string.split(', ')
        for genre in a:
            # NOTE(review): setMetadata in a loop keeps only the last
            # genre; addToList('genre', ...) looks intended -- confirm.
            self.story.setMetadata('genre', genre)

        # 'Archived' date is both published and updated date.
        a=story.findNext(text=re.compile('Archived'))
        self.story.setMetadata('datePublished', makeDate(stripHTML(a.parent.nextSibling), self.dateformat))
        self.story.setMetadata('dateUpdated', makeDate(stripHTML(a.parent.nextSibling), self.dateformat))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page, strip site chrome, and return the remaining <div>."""
        logger.debug('Getting chapter text from: %s' % url)

        # Declare br/hr self-closing, otherwise soup eats the br/hr tags.
        soup = bs.BeautifulSoup(self._fetchUrl(url),
                                selfClosingTags=('br','hr'))

        # grab the first div before mutating the tree below.
        div = soup.find('div')

        # bit messy since higly inconsistent: remove headers, footers,
        # navigation links and centered decoration around the text.
        for p in soup.findAll('p', {'align' : 'center'}):
            p.extract()
        p = soup.findAll('p')
        # first three <p>s are page header/metadata.
        for x in range(0,3):
            p[x].extract()
        if "Chapters: " in stripHTML(p[3]):
            p[3].extract()
        # drop the second-to-last paragraph (footer).
        for x in range(len(p)-2,len(p)-1):
            p[x].extract()

        for p in soup.findAll('h1'):
            p.extract()
        for p in soup.findAll('h3'):
            p.extract()
        for p in soup.findAll('a'):
            p.extract()

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)
|
||||
210
fanficdownloader/adapters/adapter_ncisfictionnet.py
Normal file
210
fanficdownloader/adapters/adapter_ncisfictionnet.py
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2012 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
    # Module-level factory hook: returns the adapter class defined here.
    return NCISFictionNetAdapter
|
||||
|
||||
# Class name has to be unique. Our convention is camel case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class NCISFictionNetAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
self.decode = ["iso-8859-1",
|
||||
"Windows-1252"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
|
||||
|
||||
|
||||
# normalized story URL.
|
||||
self._setURL("http://"+self.getSiteDomain()\
|
||||
+"/chapters.php?stid="+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','ncisfn')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%d/%m/%Y"
|
||||
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'www.ncisfiction.net'
|
||||
|
||||
## Changed from www.ncisfiction.com to www.ncisfiction.net Oct
|
||||
## 2012 due to the ncisfiction.com domain expiring. Still accept
|
||||
## .com domains for existing updates, etc.
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return ['www.ncisfiction.net','www.ncisfiction.com']
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
return "http://"+cls.getSiteDomain()+"/story.php?stid=01234 http://"+cls.getSiteDomain()+"/chapters.php?stid=1234"
|
||||
|
||||
def getSiteURLPattern(self):
|
||||
return r'http://www\.ncisfiction\.(net|com)/(chapters|story)?.php\?stid=\d+'
|
||||
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url
|
||||
logger.debug("URL: "+url)
|
||||
|
||||
try:
|
||||
data = self._fetchUrl(url)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title and author
|
||||
a = soup.find('div', {'class' : 'main_title'})
|
||||
|
||||
aut = a.find('a')
|
||||
self.story.setMetadata('authorId',aut['href'].split('=')[1])
|
||||
self.story.setMetadata('authorUrl','http://'+self.host+'/'+aut['href'])
|
||||
self.story.setMetadata('author',aut.string)
|
||||
|
||||
aut.extract()
|
||||
self.story.setMetadata('title',stripHTML(a)[:len(stripHTML(a))-2])
|
||||
|
||||
# Find the chapters:
|
||||
i=0
|
||||
chapters=soup.findAll('table', {'class' : 'story_table'})
|
||||
for chapter in chapters:
|
||||
ch=chapter.find('a')
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
self.chapterUrls.append((stripHTML(ch),'http://'+self.host+'/'+ch['href']))
|
||||
if i == 0:
|
||||
self.story.setMetadata('datePublished', makeDate(stripHTML(chapter.find('td')).split('Added: ')[1], self.dateformat))
|
||||
if i == len(chapters)-1:
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(chapter.find('td')).split('Added: ')[1], self.dateformat))
|
||||
i=i+1
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formating, so it's a little ugly.
|
||||
|
||||
info = soup.find('table', {'class' : 'story_info'})
|
||||
|
||||
# no convenient way to calculate word count as it is logged differently for stories with and without series
|
||||
|
||||
labels = info.findAll('tr')
|
||||
for tr in labels:
|
||||
value = tr.find('td')
|
||||
label = tr.find('th').string
|
||||
|
||||
if 'Summary' in label:
|
||||
self.setDescription(url,value)
|
||||
|
||||
if 'Rating' in label:
|
||||
self.story.setMetadata('rating', value.string)
|
||||
|
||||
if 'Category' in label:
|
||||
cats = value.findAll('a')
|
||||
for cat in cats:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
if 'Characters' in label:
|
||||
chars = value.findAll('a')
|
||||
for char in chars:
|
||||
self.story.addToList('characters',char.string)
|
||||
|
||||
if 'Pairing' in label:
|
||||
ships = value.findAll('a')
|
||||
for ship in ships:
|
||||
self.story.addToList('ships',ship.string)
|
||||
|
||||
if 'Genre' in label:
|
||||
genres = value.findAll('a')
|
||||
for genre in genres:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
if 'Warnings' in label:
|
||||
warnings = value.findAll('a')
|
||||
for warning in warnings:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Status' in label:
|
||||
if 'not completed' in value.text:
|
||||
self.story.setMetadata('status', 'In-Progress')
|
||||
else:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
|
||||
try:
|
||||
# Find Series name from series URL.
|
||||
a = soup.find('div',{'class' : 'sub_header'})
|
||||
series_name = a.find('a').string
|
||||
i = a.text.split('#')[1]
|
||||
self.setSeries(series_name, i)
|
||||
self.story.setMetadata('seriesUrl','http://'+self.host+'/'+a.find('a')['href'])
|
||||
except:
|
||||
# I find it hard to care if the series parsing fails
|
||||
pass
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
div = soup.find('div', {'class' : 'story_text'})
|
||||
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue