Foundations of an eFiction base adapter

* works for fannation and themaplebookshop
* metadata parsing must be more extensible
* missing documentation
* proper handling of warnings / is_adult checks
* ...
This commit is contained in:
doe 2014-08-06 15:33:21 +02:00
commit 0c51160924
239 changed files with 70079 additions and 0 deletions

78
allrecent.html Normal file
View file

@ -0,0 +1,78 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html>
<head>
<link href="/css/index.css" rel="stylesheet" type="text/css">
<title>FanFictionDownLoader (fanfiction.net, fanficauthors, fictionalley, ficwad to epub and HTML)</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-12136939-1']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
</head>
<body>
<div id='main'>
<h1>
<a href="/" style="text-decoration: none; color: black;">FanFictionDownLoader</a>
</h1>
<script type="text/javascript"><!--
google_ad_client = "ca-pub-0320924304307555";
/* Standard */
google_ad_slot = "8974025478";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
<!-- <div id='yourfile'> -->
{{yourfile}}
<!-- </div> -->
<div id='helpbox'>
{% for fic in fics %}
<p>
<a href="{{ fic.url }}" title="Link to original story"><span class="recent"><i>{{ fic.title }}</i></span></a>
by <a href="{{ fic.authorUrl }}">{{ fic.author }}</a> <b>Download Count:</b> {{ fic.count }} <br />
<b>Word Count:</b> {{ fic.numWords }} <b>Chapter Count:</b> {{ fic.numChapters }}<br />
{% if fic.category %} <b>Categories:</b> {{ fic.category }} <br /> {% endif %}
{% if fic.genre %} <b>Genres:</b> {{ fic.genre }} <br /> {% endif %}
{% if fic.language %} <b>Language:</b> {{ fic.language }} <br /> {% endif %}
{% if fic.series %} <b>Series:</b> {{ fic.series }} <br /> {% endif %}
{% if fic.characters %} <b>Characters:</b> {{ fic.characters }} <br /> {% endif %}
{% if fic.status %} <b>Status:</b> {{ fic.status }} <br /> {% endif %}
{% if fic.datePublished %} <b>Published:</b> {{ fic.datePublished }} <br /> {% endif %}
{% if fic.dateUpdated %} <b>Last Updated:</b> {{ fic.dateUpdated }} <br /> {% endif %}
{% if fic.dateCreated %} <b>Last Downloaded:</b> {{ fic.dateCreated }} <br /> {% endif %}
{% if fic.rating %} <b>Rating:</b> {{ fic.rating }} <br /> {% endif %}
{% if fic.warnings %} <b>Warnings:</b> {{ fic.warnings }} <br /> {% endif %}
{% if fic.description %} <b>Summary:</b> {{ fic.description }} <br /> {% endif %}
</p>
{% endfor %}
</div>
<script type="text/javascript"><!--
google_ad_client = "ca-pub-0320924304307555";
/* Standard */
google_ad_slot = "8974025478";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</div>
</body>
</html>

46
app.yaml Normal file
View file

@ -0,0 +1,46 @@
# ffd-retief-hrd fanfictiondownloader
application: fanfictiondownloader
version: 2-0-01
runtime: python27
api_version: 1
threadsafe: true
handlers:
- url: /r3m0v3r.*
script: utils.remover.app
login: admin
- url: /tally.*
script: utils.tally.app
login: admin
- url: /fdownloadtask
script: main.app
login: admin
- url: /css
static_dir: css
- url: /js
static_dir: js
- url: /static
static_dir: static
- url: /favicon\.ico
static_files: static/favicon.ico
upload: static/favicon\.ico
- url: /.*
script: main.app
#builtins:
#- datastore_admin: on
libraries:
- name: django
version: "1.2"
- name: PIL
version: "1.1.7"

123
calibre-plugin/__init__.py Normal file
View file

@ -0,0 +1,123 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Jim Miller'
__docformat__ = 'restructuredtext en'
import sys
# Module-level logger configuration. It only runs when the interpreter
# reports version 2.7 or newer; otherwise no `logger` name is defined here.
if sys.version_info >= (2, 7):
    import logging
    logger = logging.getLogger(__name__)
    # Attach a StreamHandler whose records are prefixed with "FFDL:" and
    # carry the level, source file name and line number.
    loghandler=logging.StreamHandler()
    loghandler.setFormatter(logging.Formatter("FFDL:%(levelname)s:%(filename)s(%(lineno)d):%(message)s"))
    logger.addHandler(loghandler)
    # Both the handler and the logger are opened up to DEBUG.
    loghandler.setLevel(logging.DEBUG)
    logger.setLevel(logging.DEBUG)
# pulls in translation files for _() strings
try:
    load_translations()
except NameError:
    pass # load_translations() added in calibre 1.9
# The class that all Interface Action plugin wrappers must inherit from
from calibre.customize import InterfaceActionBase
## Apparently the name for this class doesn't matter--it was still
## 'demo' for the first few versions.
class FanFictionDownLoaderBase(InterfaceActionBase):
    '''
    Thin descriptor class for the plugin.

    It only carries metadata plus the dotted path (``actual_plugin``) of
    the class that implements the GUI behaviour.  Keeping the two apart
    means the GUI modules are imported lazily, inside the methods that
    need them, rather than when this module is loaded.
    '''
    name                    = 'FanFictionDownLoader'
    description             = _('UI plugin to download FanFiction stories from various sites.')
    supported_platforms     = ['windows', 'osx', 'linux']
    author                  = 'Jim Miller'
    version                 = (2, 0, 1)
    minimum_calibre_version = (1, 13, 0)

    #: Location of the real GUI plugin class, written as
    #: ``module_path:class_name``.
    actual_plugin           = 'calibre_plugins.fanfictiondownloader_plugin.ffdl_plugin:FanFictionDownLoaderPlugin'

    def is_customizable(self):
        '''
        Report that this plugin can be configured from
        Preferences->Plugins.

        :return: always ``True``.
        '''
        return True

    def config_widget(self):
        '''
        Build the configuration widget shown in the customization dialog.

        :return: a ``ConfigWidget`` constructed from ``self.actual_plugin_``.
        '''
        # Imported here, not at module level: pulling in the config module
        # also loads the GUI libraries, which is unwanted when calibre is
        # used from the command line.
        from calibre_plugins.fanfictiondownloader_plugin.config import ConfigWidget
        widget = ConfigWidget(self.actual_plugin_)
        return widget

    def save_settings(self, config_widget):
        '''
        Persist what the user entered and push it to the live plugin.

        :param config_widget: The widget returned by :meth:`config_widget`.
        '''
        config_widget.save_settings()

        # Apply the changes, but only when a plugin instance is present.
        live_plugin = self.actual_plugin_
        if live_plugin is None:
            return
        live_plugin.apply_settings()

    def cli_main(self,argv):
        '''
        Entry point used for ``--run-plugin`` on the command line.

        :param argv: argument list; ``argv[1:]`` is forwarded to the
            downloader's ``main``.
        '''
        # I believe there's no performance hit loading these here when
        # CLI--it would load everytime anyway.
        from StringIO import StringIO
        from calibre.library import db
        from calibre_plugins.fanfictiondownloader_plugin.downloader import main as ffdl_main
        from calibre_plugins.fanfictiondownloader_plugin.prefs import PrefsFacade
        from calibre.utils.config import prefs as calibre_prefs
        from optparse import OptionParser

        usage = '%prog --run-plugin '+self.name+' -- [options] <storyurl>'
        parser = OptionParser(usage)
        parser.add_option('--library-path', '--with-library', default=None, help=_('Path to the calibre library. Default is to use the path stored in the settings.'))
#        parser.add_option('--dont-notify-gui', default=False, action='store_true',
#            help=_('Do not notify the running calibre GUI (if any) that the database has'
#            ' changed. Use with care, as it can lead to database corruption!'))

        # Only the library-path options and the non-dash arguments are
        # parsed at this stage (presumably so options meant for the
        # downloader are not rejected here -- confirm).
        library_flags = ('--with-library', '--library-path')
        pargs = []
        for arg in argv:
            if arg.startswith(library_flags) or not arg.startswith('-'):
                pargs.append(arg)
        opts = parser.parse_args(pargs)[0]

        library = db(path=opts.library_path,
                     read_only=True)
        ffdl_prefs = PrefsFacade(library)

        defaults_ini = StringIO(get_resources("defaults.ini"))
        personal_ini = StringIO(ffdl_prefs["personal.ini"])
        ffdl_main(argv[1:],
                  parser=parser,
                  passed_defaultsini=defaults_ini,
                  passed_personalini=personal_ini)

28
calibre-plugin/about.txt Normal file
View file

@ -0,0 +1,28 @@
<hr />
<p>Plugin created by Jim Miller, borrowing heavily from Grant Drake's
'<a href="http://www.mobileread.com/forums/showthread.php?t=134856">Reading List</a>',
'<a href="http://www.mobileread.com/forums/showthread.php?t=126727">Extract ISBN</a>' and
'<a href="http://www.mobileread.com/forums/showthread.php?t=134000">Count Pages</a>'
plugins.</p>
<p>
Calibre officially distributes plugins from the mobileread.com forum site.
The official distro channel for this plugin is there: <a href="http://www.mobileread.com/forums/showthread.php?t=163261">FanFictionDownLoader</a>
</p>
<p> I also monitor the
<a href="http://groups.google.com/group/fanfic-downloader">general users
group</a> for the downloader. That covers the web application and CLI, too.
</p>
The source for this plugin is available at its
<a href="http://code.google.com/p/fanficdownloader">project home</a>.
<hr />
<p>
See the <a href="http://code.google.com/p/fanficdownloader/wiki/FanFictionDownloaderSupportedsites">list of supported sites</a>.
</p>
<p>
Read the <a href="http://code.google.com/p/fanficdownloader/wiki/FanFictionDownloaderFAQs">FAQs</a>.
</p>

View file

@ -0,0 +1,553 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Grant Drake <grant.drake@gmail.com>'
__docformat__ = 'restructuredtext en'
import os
try:
from PyQt5 import QtWidgets as QtGui
from PyQt5.Qt import (Qt, QIcon, QPixmap, QLabel, QDialog, QHBoxLayout,
QTableWidgetItem, QFont, QLineEdit, QComboBox,
QVBoxLayout, QDialogButtonBox, QStyledItemDelegate, QDateTime,
QTextEdit, QListWidget, QAbstractItemView)
except ImportError as e:
from PyQt4 import QtGui
from PyQt4.Qt import (Qt, QIcon, QPixmap, QLabel, QDialog, QHBoxLayout,
QTableWidgetItem, QFont, QLineEdit, QComboBox,
QVBoxLayout, QDialogButtonBox, QStyledItemDelegate, QDateTime,
QTextEdit, QListWidget, QAbstractItemView)
from calibre.constants import iswindows
from calibre.gui2 import gprefs, error_dialog, UNDEFINED_QDATETIME, info_dialog
from calibre.gui2.actions import menu_action_unique_name
from calibre.gui2.keyboard import ShortcutConfig
from calibre.utils.config import config_dir
from calibre.utils.date import now, format_date, qt_to_dt, UNDEFINED_DATE
# Global definition of our plugin name. Used for common functions that require this.
plugin_name = None
# Global definition of our plugin resources. Used to share between the xxxAction and xxxBase
# classes if you need any zip images to be displayed on the configuration dialog.
plugin_icon_resources = {}
def set_plugin_icon_resources(name, resources):
    '''
    Record the plugin name and its icon resources in the module globals.

    The values are stored in ``plugin_name`` and ``plugin_icon_resources``
    so that other code in this module (for instance the pixmap lookup)
    can read them later.
    '''
    global plugin_icon_resources, plugin_name
    plugin_name, plugin_icon_resources = name, resources
def get_icon(icon_name):
    '''
    Build a QIcon for ``icon_name``.

    An empty/None name yields an empty QIcon.  Otherwise the pixmap
    returned by get_pixmap() is wrapped; when that lookup returns None
    the icon is created from Calibre's own image path instead.
    '''
    if not icon_name:
        return QIcon()
    pixmap = get_pixmap(icon_name)
    if pixmap is not None:
        return QIcon(pixmap)
    # Look in Calibre's cache for the icon
    return QIcon(I(icon_name))
def get_pixmap(icon_name):
    '''
    Return a QPixmap for ``icon_name``, or None when nothing matches.

    Names that belong to this plugin must start with 'images/'; any other
    name is loaded directly from Calibre's image path.
    '''
    global plugin_icon_resources, plugin_name

    def _pixmap_from_file(path):
        loaded = QPixmap()
        loaded.load(path)
        return loaded

    if not icon_name.startswith('images/'):
        # We know this is definitely not an icon belonging to this plugin
        return _pixmap_from_file(I(icon_name))

    # A user-supplied override takes precedence.  This enables skinning if
    # the user stores icons within a folder like:
    # ...\AppData\Roaming\calibre\resources\images\Plugin Name\
    if plugin_name:
        override_path = os.path.join(get_local_images_dir(plugin_name),
                                     icon_name.replace('images/', ''))
        if os.path.exists(override_path):
            return _pixmap_from_file(override_path)

    # Nothing on disk: fall back to the resources bundled in the zip.
    if icon_name not in plugin_icon_resources:
        return None
    bundled = QPixmap()
    bundled.loadFromData(plugin_icon_resources[icon_name])
    return bundled
def get_local_images_dir(subfolder=None):
    '''
    Build the path of the resources/images folder under ``config_dir``.

    When ``subfolder`` is given it is appended as a further path
    component.  On Windows the result is passed through
    os.path.normpath().
    '''
    components = [config_dir, 'resources/images']
    if subfolder:
        components.append(subfolder)
    images_dir = os.path.join(*components)
    return os.path.normpath(images_dir) if iswindows else images_dir
def create_menu_item(ia, parent_menu, menu_text, image=None, tooltip=None,
                     shortcut=(), triggered=None, is_checked=None):
    '''
    Create an action through ``ia.create_action`` and add it to
    ``parent_menu``.

    Note that if no shortcut is specified, the action will not appear in
    Preferences->Keyboard.  Use this only for actions without shortcuts or
    whose menus are registered once; create_menu_action_unique covers the
    other cases.

    Returns the created action.
    '''
    if shortcut is not None:
        # An empty shortcut collapses to (); anything else is translated.
        shortcut = _(shortcut) if len(shortcut) != 0 else ()

    action = ia.create_action(spec=(menu_text, None, tooltip, shortcut),
                              attr=menu_text)
    if image:
        action.setIcon(get_icon(image))
    if triggered is not None:
        action.triggered.connect(triggered)
    if is_checked is not None:
        action.setCheckable(True)
        if is_checked:
            action.setChecked(True)

    parent_menu.addAction(action)
    return action
def create_menu_action_unique(ia, parent_menu, menu_text, image=None, tooltip=None,
                              shortcut=None, triggered=None, is_checked=None, shortcut_name=None,
                              unique_name=None):
    '''
    Create a menu action via InterfaceAction.create_menu_action(), which
    makes the action show up in Preferences->Keyboard whether or not a
    shortcut is given.

    Passing ``shortcut=False`` skips all shortcut handling.  Returns the
    created action.
    '''
    caller_disabled_shortcut = (shortcut == False)
    kb = ia.gui.keyboard
    if unique_name is None:
        unique_name = menu_text

    if not caller_disabled_shortcut:
        full_unique_name = menu_action_unique_name(ia, unique_name)
        if full_unique_name in kb.shortcuts:
            # Already registered under this name: do not register the
            # shortcut again; the action is swapped in further down.
            shortcut = False
        elif shortcut is not None:
            shortcut = None if len(shortcut) == 0 else _(shortcut)

    if shortcut_name is None:
        shortcut_name = menu_text.replace('&','')

    action = ia.create_menu_action(parent_menu, unique_name, menu_text, icon=None, shortcut=shortcut,
                                   description=tooltip, triggered=triggered, shortcut_name=shortcut_name)

    if shortcut == False and not caller_disabled_shortcut:
        if action.calibre_shortcut_unique_name in ia.gui.keyboard.shortcuts:
            kb.replace_action(action.calibre_shortcut_unique_name, action)

    if image:
        action.setIcon(get_icon(image))
    if is_checked is not None:
        action.setCheckable(True)
        if is_checked:
            action.setChecked(True)
    return action
def swap_author_names(author):
    '''
    Turn "Last, First" into "First Last".

    A string without a comma is returned unchanged.  Only the first comma
    is used as the split point.
    '''
    if ',' not in author:
        return author
    before_comma, _sep, after_comma = author.strip().partition(',')
    return after_comma.strip() + ' ' + before_comma
def get_library_uuid(db):
    '''
    Return ``db.library_id``, or an empty string when it cannot be read.

    Only ordinary errors are treated as "no id available".  The former
    bare ``except:`` also swallowed KeyboardInterrupt and SystemExit,
    which should propagate.
    '''
    try:
        library_uuid = db.library_id
    except Exception:
        library_uuid = ''
    return library_uuid
class ImageLabel(QLabel):
    '''
    A QLabel showing the pixmap for ``icon_name``, capped at
    ``size`` x ``size`` with scaled contents.
    '''

    def __init__(self, parent, icon_name, size=16):
        QLabel.__init__(self, parent)
        pixmap = get_pixmap(icon_name)
        # get_pixmap() returns None for an 'images/...' name it cannot
        # find; leave the label empty in that case instead of handing
        # None to setPixmap().
        if pixmap is not None:
            self.setPixmap(pixmap)
        self.setMaximumSize(size, size)
        self.setScaledContents(True)
class ImageTitleLayout(QHBoxLayout):
    '''
    A reusable layout widget displaying an image followed by a title
    '''

    def __init__(self, parent, icon_name, title, tooltip=None):
        '''
        :param parent: parent widget for the two labels.
        :param icon_name: name passed to get_pixmap() for the image.
        :param title: text of the title label.
        :param tooltip: optional tooltip set on both labels.
        '''
        QHBoxLayout.__init__(self)
        title_image_label = QLabel(parent)
        pixmap = get_pixmap(icon_name)
        if pixmap is None:
            # Fall back to 'library.png'.  Previously this fallback was
            # loaded but never applied to the label.
            pixmap = get_pixmap('library.png')
            # error_dialog(parent, _('Restart required'),
            #              _('You must restart Calibre before using this plugin!'), show=True)
        if pixmap is not None:
            title_image_label.setPixmap(pixmap)
        title_image_label.setMaximumSize(32, 32)
        title_image_label.setScaledContents(True)
        self.addWidget(title_image_label)

        title_font = QFont()
        title_font.setPointSize(16)
        shelf_label = QLabel(title, parent)
        shelf_label.setFont(title_font)
        self.addWidget(shelf_label)
        self.insertStretch(-1)

        if tooltip:
            title_image_label.setToolTip(tooltip)
            shelf_label.setToolTip(tooltip)
class SizePersistedDialog(QDialog):
    '''
    Base class for dialogs whose geometry should be remembered.

    The geometry is read from ``gprefs`` under ``unique_pref_name`` when
    the dialog is built and written back when the dialog finishes.
    '''

    def __init__(self, parent, unique_pref_name):
        QDialog.__init__(self, parent)
        self.unique_pref_name = unique_pref_name
        self.geom = gprefs.get(unique_pref_name, None)
        self.finished.connect(self.dialog_closing)

    def resize_dialog(self):
        '''Restore the stored geometry, or use the size hint if none.'''
        if self.geom is not None:
            self.restoreGeometry(self.geom)
            return
        self.resize(self.sizeHint())

    def dialog_closing(self, result):
        '''Slot for ``finished``: store the current geometry in gprefs.'''
        current_geometry = bytearray(self.saveGeometry())
        self.geom = current_geometry
        gprefs[self.unique_pref_name] = current_geometry
class ReadOnlyTableWidgetItem(QTableWidgetItem):
    '''Table item that can be selected but not edited.'''

    def __init__(self, text):
        # None is shown as an empty cell.
        shown_text = '' if text is None else text
        QTableWidgetItem.__init__(self, shown_text, QtGui.QTableWidgetItem.UserType)
        self.setFlags(Qt.ItemIsSelectable|Qt.ItemIsEnabled)
class RatingTableWidgetItem(QTableWidgetItem):
    '''Table item that stores ``rating`` as its display-role data.'''

    def __init__(self, rating, is_read_only=False):
        QTableWidgetItem.__init__(self, '', QtGui.QTableWidgetItem.UserType)
        self.setData(Qt.DisplayRole, rating)
        # Read-only items keep only the selectable and enabled flags.
        if is_read_only:
            self.setFlags(Qt.ItemIsSelectable|Qt.ItemIsEnabled)
class DateTableWidgetItem(QTableWidgetItem):
    '''
    Table item for a date.

    A read-only item shows the formatted date as text; an editable item
    stores a QDateTime as display-role data.
    '''

    def __init__(self, date_read, is_read_only=False, default_to_today=False):
        # Optionally replace the "undefined" sentinel with the current time.
        if default_to_today and date_read == UNDEFINED_DATE:
            date_read = now()

        if not is_read_only:
            QTableWidgetItem.__init__(self, '', QtGui.QTableWidgetItem.UserType)
            self.setData(Qt.DisplayRole, QDateTime(date_read))
            return

        QTableWidgetItem.__init__(self, format_date(date_read, None), QtGui.QTableWidgetItem.UserType)
        self.setFlags(Qt.ItemIsSelectable|Qt.ItemIsEnabled)
class NoWheelComboBox(QComboBox):
    '''Combo box that ignores mouse-wheel events.'''

    def wheelEvent (self, event):
        # Disable the mouse wheel on top of the combo box changing selection as plays havoc in a grid
        event.ignore()
class CheckableTableWidgetItem(QTableWidgetItem):
    '''Table item rendered as a (optionally tristate) checkbox.'''

    def __init__(self, checked=False, is_tristate=False):
        QTableWidgetItem.__init__(self, '')
        self.setFlags(Qt.ItemFlags(Qt.ItemIsSelectable | Qt.ItemIsUserCheckable | Qt.ItemIsEnabled ))
        if is_tristate:
            self.setFlags(self.flags() | Qt.ItemIsTristate)

        # A truthy value is checked; None on a tristate item is partially
        # checked; everything else is unchecked.
        if checked:
            initial_state = Qt.Checked
        elif is_tristate and checked is None:
            initial_state = Qt.PartiallyChecked
        else:
            initial_state = Qt.Unchecked
        self.setCheckState(initial_state)

    def get_boolean_value(self):
        '''
        Return True/False for checked/unchecked.
        A partially checked (tristate) item is reported as None.
        '''
        state = self.checkState()
        if state == Qt.PartiallyChecked:
            return None
        return state == Qt.Checked
class TextIconWidgetItem(QTableWidgetItem):
    '''Table item with text and an optional icon.'''

    def __init__(self, text, icon):
        QTableWidgetItem.__init__(self, text)
        # The icon is only set when a truthy value is supplied.
        if icon:
            self.setIcon(icon)
class ReadOnlyTextIconWidgetItem(ReadOnlyTableWidgetItem):
    '''ReadOnlyTableWidgetItem with an optional icon.'''

    def __init__(self, text, icon):
        ReadOnlyTableWidgetItem.__init__(self, text)
        # The icon is only set when a truthy value is supplied.
        if icon:
            self.setIcon(icon)
class ReadOnlyLineEdit(QLineEdit):
    '''Line edit created in the disabled state.'''

    def __init__(self, text, parent):
        # None is shown as an empty field.
        shown_text = '' if text is None else text
        QLineEdit.__init__(self, shown_text, parent)
        self.setEnabled(False)
class KeyValueComboBox(QComboBox):
    '''
    Combo box backed by a key -> display-text mapping.

    The display texts are what the user sees; selected_key() maps the
    current text back to its key.
    '''

    def __init__(self, parent, values, selected_key):
        QComboBox.__init__(self, parent)
        self.values = values
        self.populate_combo(selected_key)

    def populate_combo(self, selected_key):
        '''Refill the combo from ``self.values`` and select ``selected_key``.'''
        self.clear()
        # Stays at -1 when selected_key is not among the keys.
        match_position = -1
        for position, (key, label) in enumerate(self.values.iteritems()):
            self.addItem(label)
            if key == selected_key:
                match_position = position
        self.setCurrentIndex(match_position)

    def selected_key(self):
        '''Return the key whose value equals the current text, else None.'''
        shown = unicode(self.currentText()).strip()
        for key, label in self.values.iteritems():
            if label == shown:
                return key
        return None
class CustomColumnComboBox(QComboBox):
    '''
    Combo box listing some fixed leading entries followed by the custom
    columns, sorted by key and shown as "key (name)".
    '''

    def __init__(self, parent, custom_columns, selected_column, initial_items=None):
        '''
        :param parent: parent widget.
        :param custom_columns: mapping of column key to a dict that has a
            'name' entry.
        :param selected_column: entry to select initially.
        :param initial_items: leading entries; defaults to a single ''.
        '''
        QComboBox.__init__(self, parent)
        self.populate_combo(custom_columns, selected_column, initial_items)

    def populate_combo(self, custom_columns, selected_column, initial_items=None):
        '''
        Refill the combo box and select ``selected_column``.

        Index 0 is selected when ``selected_column`` is not found.
        '''
        self.clear()
        if initial_items is None:
            initial_items = ['']
        # Work on a private copy.  The previous code aliased the argument
        # (and therefore the shared mutable default list) and appended the
        # column keys to it, so later instances inherited stale entries.
        self.column_names = list(initial_items)
        if len(self.column_names) > 0:
            self.addItems(self.column_names)
        selected_idx = 0
        for idx, value in enumerate(self.column_names):
            if value == selected_column:
                selected_idx = idx
        for key in sorted(custom_columns.keys()):
            self.column_names.append(key)
            self.addItem('%s (%s)'%(key, custom_columns[key]['name']))
            if key == selected_column:
                selected_idx = len(self.column_names) - 1
        self.setCurrentIndex(selected_idx)

    def get_selected_column(self):
        '''Return the entry of ``column_names`` at the current index.'''
        return self.column_names[self.currentIndex()]
class KeyboardConfigDialog(SizePersistedDialog):
    '''
    Dialog wrapping calibre's ShortcutConfig widget so the shortcuts of
    one group can be edited.
    '''

    def __init__(self, gui, group_name):
        SizePersistedDialog.__init__(self, gui, 'Keyboard shortcut dialog')
        self.gui = gui
        self.setWindowTitle('Keyboard shortcuts')
        main_layout = QVBoxLayout(self)
        self.setLayout(main_layout)

        self.keyboard_widget = ShortcutConfig(self)
        main_layout.addWidget(self.keyboard_widget)
        self.group_name = group_name

        buttons = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
        buttons.accepted.connect(self.commit)
        buttons.rejected.connect(self.reject)
        main_layout.addWidget(buttons)

        # Cause our dialog size to be restored from prefs or created on first usage
        self.resize_dialog()
        self.initialize()

    def initialize(self):
        '''Load the gui's keyboard data and highlight our group.'''
        widget = self.keyboard_widget
        widget.initialize(self.gui.keyboard)
        widget.highlight_group(self.group_name)

    def commit(self):
        '''Commit the edited shortcuts, then accept the dialog.'''
        self.keyboard_widget.commit()
        self.accept()
class DateDelegate(QStyledItemDelegate):
    '''
    Item delegate for date cells.

    The display format is held on the instance, so every column needs
    its own delegate object.
    '''

    def __init__(self, parent):
        QStyledItemDelegate.__init__(self, parent)
        self.format = 'dd MMM yyyy'

    def displayText(self, val, locale):
        '''Return the formatted date, or '' for an undefined date.'''
        moment = val.toDateTime()
        if moment <= UNDEFINED_QDATETIME:
            return ''
        return format_date(qt_to_dt(moment, as_utc=False), self.format)

    def createEditor(self, parent, option, index):
        '''Create the default editor and configure it for dates.'''
        editor = QStyledItemDelegate.createEditor(self, parent, option, index)
        editor.setDisplayFormat(self.format)
        editor.setMinimumDateTime(UNDEFINED_QDATETIME)
        editor.setSpecialValueText(_('Undefined'))
        editor.setCalendarPopup(True)
        return editor

    def setEditorData(self, editor, index):
        '''Load the cell value into the editor; undefined becomes now().'''
        current = index.model().data(index, Qt.DisplayRole).toDateTime()
        if current is None or current == UNDEFINED_QDATETIME:
            current = now()
        editor.setDateTime(current)

    def setModelData(self, editor, model, index):
        '''Write the editor value back, keeping undefined as undefined.'''
        edited = editor.dateTime()
        if edited <= UNDEFINED_QDATETIME:
            new_value = UNDEFINED_QDATETIME
        else:
            new_value = QDateTime(edited)
        model.setData(index, new_value, Qt.EditRole)
class PrefsViewerDialog(SizePersistedDialog):
    '''
    Dialog listing the library preferences stored under one namespace,
    showing the raw value of the selected key and offering to clear
    them all.
    '''

    def __init__(self, gui, namespace):
        '''
        :param gui: calibre main window; ``gui.current_db`` supplies the prefs.
        :param namespace: namespace whose keys are shown.
        '''
        SizePersistedDialog.__init__(self, gui, 'Prefs Viewer dialog')
        self.setWindowTitle('Preferences for: '+namespace)

        self.gui = gui
        self.db = gui.current_db
        self.namespace = namespace
        self._init_controls()
        self.resize_dialog()

        self._populate_settings()

        if self.keys_list.count():
            self.keys_list.setCurrentRow(0)

    def _init_controls(self):
        '''Build the key list, the value viewer and the button box.'''
        layout = QVBoxLayout(self)
        self.setLayout(layout)

        ml = QHBoxLayout()
        layout.addLayout(ml, 1)

        self.keys_list = QListWidget(self)
        self.keys_list.setSelectionMode(QAbstractItemView.SingleSelection)
        self.keys_list.setFixedWidth(150)
        self.keys_list.setAlternatingRowColors(True)
        # Connect exactly once.  This used to be done in
        # _populate_settings(), which runs again after clearing and so
        # stacked a duplicate connection on every call.
        self.keys_list.currentRowChanged[int].connect(self._current_row_changed)
        ml.addWidget(self.keys_list)
        self.value_text = QTextEdit(self)
        self.value_text.setTabStopWidth(24)
        self.value_text.setReadOnly(True)
        ml.addWidget(self.value_text, 1)

        button_box = QDialogButtonBox(QDialogButtonBox.Ok)
        button_box.accepted.connect(self.accept)
        self.clear_button = button_box.addButton('Clear', QDialogButtonBox.ResetRole)
        self.clear_button.setIcon(get_icon('trash.png'))
        self.clear_button.setToolTip('Clear all settings for this plugin')
        self.clear_button.clicked.connect(self._clear_settings)
        layout.addWidget(button_box)

    def _populate_settings(self):
        '''Refill the key list with the sorted keys of our namespace.'''
        self.keys_list.clear()
        ns_prefix = self._get_ns_prefix()
        keys = sorted([k[len(ns_prefix):] for k in self.db.prefs.iterkeys()
                       if k.startswith(ns_prefix)])
        for key in keys:
            self.keys_list.addItem(key)
        self.keys_list.setMinimumWidth(self.keys_list.sizeHintForColumn(0))

    def _current_row_changed(self, new_row):
        '''Show the raw value of the newly selected key.'''
        if new_row < 0:
            # No selection (e.g. the list was just cleared).
            self.value_text.clear()
            return
        key = unicode(self.keys_list.currentItem().text())
        val = self.db.prefs.get_namespaced(self.namespace, key, '')
        self.value_text.setPlainText(self.db.prefs.to_raw(val))

    def _get_ns_prefix(self):
        '''Return the key prefix used for this namespace in db.prefs.'''
        return 'namespaced:%s:'% self.namespace

    def _clear_settings(self):
        '''
        After confirmation, delete every pref of this namespace, then
        offer to restart calibre and close the dialog.
        '''
        from calibre.gui2.dialogs.confirm_delete import confirm
        message = '<p>Are you sure you want to clear your settings in this library for this plugin?</p>' \
                  '<p>Any settings in other libraries or stored in a JSON file in your calibre plugins ' \
                  'folder will not be touched.</p>' \
                  '<p>You must restart calibre afterwards.</p>'
        if not confirm(message, self.namespace+'_clear_settings', self):
            return
        ns_prefix = self._get_ns_prefix()
        # Collect the keys first; the prefs are modified while deleting.
        keys = [k for k in self.db.prefs.iterkeys() if k.startswith(ns_prefix)]
        for k in keys:
            del self.db.prefs[k]
        self._populate_settings()
        d = info_dialog(self, 'Settings deleted',
                        '<p>All settings for this plugin in this library have been cleared.</p>'
                        '<p>Please restart calibre now.</p>',
                        show_copy_button=False)
        b = d.bb.addButton(_('Restart calibre now'), d.bb.AcceptRole)
        b.setIcon(QIcon(I('lt.png')))
        d.do_restart = False
        def rf():
            d.do_restart = True
        b.clicked.connect(rf)
        d.set_details('')
        d.exec_()
        b.clicked.disconnect()
        self.close()
        if d.do_restart:
            self.gui.quit(restart=True)

1094
calibre-plugin/config.py Normal file

File diff suppressed because it is too large Load diff

1125
calibre-plugin/dialogs.py Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,43 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Jim Miller'
__docformat__ = 'restructuredtext en'
from StringIO import StringIO
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, exceptions
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.configurable import Configuration
from calibre_plugins.fanfictiondownloader_plugin.prefs import (prefs)
def get_ffdl_personalini():
    '''
    Return the personal.ini text taken from ``prefs``.

    When the 'includeimages' pref is set, an [epub] section switching on
    the image-related options is put in front of the user's text.
    '''
    if not prefs['includeimages']:
        return prefs['personal.ini']

    # this is a cheat to make it easier for users.
    image_section = '''[epub]
include_images:true
keep_summary_html:true
make_firstimage_cover:true
'''
    return image_section + prefs['personal.ini']
def get_ffdl_config(url,fileform="epub",personalini=None):
    '''
    Build a Configuration for ``url`` and ``fileform``.

    The plugin defaults are read first, then the personal ini text
    (``personalini`` if given and non-empty, otherwise the one from
    get_ffdl_personalini()).  If no config section can be determined for
    the url, the section name 'unknown' is used and a message is printed.
    '''
    ini_text = personalini or get_ffdl_personalini()

    site='unknown'
    try:
        site = adapters.getConfigSectionFor(url)
    except Exception as e:
        print("Failed trying to get ini config for url(%s): %s, using section [%s] instead"%(url,e,site))

    configuration = Configuration(site,fileform)
    for ini_source in (get_resources("plugin-defaults.ini"), ini_text):
        configuration.readfp(StringIO(ini_source))
    return configuration
def get_ffdl_adapter(url,fileform="epub",personalini=None):
    '''
    Return the adapter for ``url``, configured via get_ffdl_config()
    with the same arguments.
    '''
    configuration = get_ffdl_config(url,fileform,personalini)
    return adapters.getAdapter(configuration,url)

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

245
calibre-plugin/jobs.py Normal file
View file

@ -0,0 +1,245 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2014, Jim Miller'
__copyright__ = '2011, Grant Drake <grant.drake@gmail.com>'
__docformat__ = 'restructuredtext en'
import logging
logger = logging.getLogger(__name__)
import time, os, traceback
from StringIO import StringIO
from calibre.utils.ipc.server import Server
from calibre.utils.ipc.job import ParallelJob
from calibre.constants import numeric_version as calibre_version
from calibre_plugins.fanfictiondownloader_plugin.dialogs import (NotGoingToDownload,
OVERWRITE, OVERWRITEALWAYS, UPDATE, UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY)
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, writers, exceptions
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.epubutils import get_update_data
from calibre_plugins.fanfictiondownloader_plugin.ffdl_util import (get_ffdl_adapter, get_ffdl_config)
# ------------------------------------------------------------------------------
#
# Functions to perform downloads using worker jobs
#
# ------------------------------------------------------------------------------
def do_download_worker(book_list, options,
                       cpus, notification=lambda x,y:x):
    '''
    Master job: queue one child download job per good book and collect
    the results.

    This is run as a worker job in the background to keep the UI more
    responsive and get around the memory leak issues as it will launch
    a child job for each book as a worker process.

    :param book_list: list of book dicts.  Entries with a truthy 'good'
        are downloaded and replaced in the list by the job's result;
        the others are left in the list untouched.
    :param options: dict passed on to every child job; 'version' is logged.
    :param cpus: pool size handed to the job Server.
    :param notification: callable(fraction, message) used for progress.
    :return: ``book_list``, updated in place.
    '''
    server = Server(pool_size=cpus)
    # Close the server even when something below raises.
    try:
        logger.info(options['version'])
        total = 0

        # Queue all the jobs
        logger.info("Adding jobs for URLs:")
        for book in book_list:
            logger.info("%s"%book['url'])
            if not book['good']:
                # was already bad before the subprocess ever started.
                continue
            total += 1
            args = ['calibre_plugins.fanfictiondownloader_plugin.jobs',
                    'do_download_for_worker',
                    (book,options)]
            job = ParallelJob('arbitrary_n',
                              "url:(%s) id:(%s)"%(book['url'],book['calibre_id']),
                              done=None,
                              args=args)
            job._book = book
            server.add_job(job)

        # This server is an arbitrary_n job, so there is a notifier available.
        # Set the % complete to a small number to avoid the 'unavailable' indicator
        notification(0.01, 'Downloading FanFiction Stories')

        # dequeue the job results as they arrive, saving the results.
        # Looping on the count (instead of `while True` with a break after
        # a finished job) means that with nothing queued we do not block
        # forever on a queue that will never receive anything.
        count = 0
        while count < total:
            job = server.changed_jobs_queue.get()
            # A job can 'change' when it is not finished, for example if it
            # produces a notification. Ignore these.
            job.update()
            if not job.is_finished:
                continue
            # A job really finished. Swap the queued book for its result.
            book_list.remove(job._book)
            book_list.append(job.result)
            book_id = job._book['calibre_id']
            count = count + 1
            notification(float(count)/total, '%d of %d stories finished downloading'%(count,total))
            # Add this job's output to the current log
            logger.info('Logfile for book ID %s (%s)'%(book_id, job._book['title']))
            logger.info(job.details)

        logger.info("\nSuccessful:\n%s\n"%("\n".join(
            [book['url'] for book in book_list if book['good']])))
        logger.info("\nUnsuccessful:\n%s\n"%("\n".join(
            [book['url'] for book in book_list if not book['good']])))
    finally:
        server.close()

    # return the book list as the job result
    return book_list
def do_download_for_worker(book,options,notification=lambda x,y:x):
    '''
    Child job: download or update the single story described by
    ``book`` when run as a worker job.

    book -- dict describing the story.  Reads 'url', 'is_adult',
            'username', 'password', 'begin', 'end', 'outfile' and,
            when present, 'calibre_series' and 'epub_for_update'.
            Result fields ('comment', and on failure 'good', 'icon',
            'status') are written back into the same dict.
    options -- dict of plugin options.  Reads 'fileform',
            'personal.ini', 'updateepubcover', 'collision',
            'smarten_punctuation' and, when present, 'version'.
    notification -- progress callback; accepted for the worker job
            interface but not called here.

    Returns ``book``.  Exceptions are not propagated: they are
    recorded in ``book`` so the parent job can report them.
    '''
    try:
        book['comment'] = 'Download started...'

        configuration = get_ffdl_config(book['url'],
                                        options['fileform'],
                                        options['personal.ini'])

        if not options['updateepubcover'] and 'epub_for_update' in book and options['collision'] in (UPDATE, UPDATEALWAYS):
            configuration.set("overrides","never_make_cover","true")

        # images only for epub, even if the user mistakenly turned it
        # on else where.
        if options['fileform'] not in ("epub","html"):
            configuration.set("overrides","include_images","false")

        adapter = adapters.getAdapter(configuration,book['url'])
        adapter.is_adult = book['is_adult']
        adapter.username = book['username']
        adapter.password = book['password']
        adapter.setChaptersRange(book['begin'],book['end'])

        story = adapter.getStoryMetadataOnly()
        if 'calibre_series' in book:
            adapter.setSeries(book['calibre_series'][0],book['calibre_series'][1])

        # set PI version instead of default.
        if 'version' in options:
            story.setMetadata('version',options['version'])

        writer = writers.getWriter(options['fileform'],configuration,adapter)

        outfile = book['outfile']

        ## No need to download at all.  Shouldn't ever get down here.
        # Compare with ==: "x in (CALIBREONLY)" is NOT a tuple membership
        # test (no trailing comma), it is a substring test on the constant.
        if options['collision'] == CALIBREONLY:
            logger.info("Skipping CALIBREONLY 'update' down inside worker--this shouldn't be happening...")
            book['comment'] = 'Metadata collected.'

        ## checks were done earlier, it's new or not dup or newer--just write it.
        elif options['collision'] in (ADDNEW, SKIP, OVERWRITE, OVERWRITEALWAYS) or \
                ('epub_for_update' not in book and options['collision'] in (UPDATE, UPDATEALWAYS)):

            # preserve logfile even on overwrite.
            if 'epub_for_update' in book:
                (urlignore,
                 chaptercountignore,
                 oldchaptersignore,
                 oldimgsignore,
                 oldcoverignore,
                 calibrebookmarkignore,
                 # only logfile set in adapter, so others aren't used.
                 adapter.logfile) = get_update_data(book['epub_for_update'])

                # change the existing entries id to notid so
                # write_epub writes a whole new set to indicate overwrite.
                if adapter.logfile:
                    adapter.logfile = adapter.logfile.replace("span id","span notid")

            logger.info("write to %s"%outfile)
            writer.writeStory(outfilename=outfile, forceOverwrite=True)
            book['comment'] = 'Download %s completed, %s chapters.'%(options['fileform'],story.getMetadata("numChapters"))

        ## checks were done earlier, just update it.
        elif 'epub_for_update' in book and options['collision'] in (UPDATE, UPDATEALWAYS):

            # update now handled by pre-populating the old images and
            # chapters in the adapter rather than merging epubs.
            # numChapters may carry thousands separators, strip them.
            urlchaptercount = int(story.getMetadata('numChapters').replace(',',''))
            (url,
             chaptercount,
             adapter.oldchapters,
             adapter.oldimgs,
             adapter.oldcover,
             adapter.calibrebookmark,
             adapter.logfile) = get_update_data(book['epub_for_update'])

            # dup handling from ffdl_plugin needed for anthology updates.
            if options['collision'] == UPDATE:
                if chaptercount == urlchaptercount:
                    book['comment']="Already contains %d chapters. Reuse as is."%chaptercount
                    book['outfile'] = book['epub_for_update'] # for anthology merge ops.
                    return book

            # dup handling from ffdl_plugin needed for anthology updates.
            if chaptercount > urlchaptercount:
                raise NotGoingToDownload("Existing epub contains %d chapters, web site only has %d. Use Overwrite to force update." % (chaptercount,urlchaptercount),'dialog_error.png')

            if not (options['collision'] == UPDATEALWAYS and chaptercount == urlchaptercount) \
                    and adapter.getConfig("do_update_hook"):
                chaptercount = adapter.hookForUpdates(chaptercount)

            logger.info("Do update - epub(%d) vs url(%d)" % (chaptercount, urlchaptercount))
            logger.info("write to %s"%outfile)

            writer.writeStory(outfilename=outfile, forceOverwrite=True)

            book['comment'] = 'Update %s completed, added %s chapters for %s total.'%\
                (options['fileform'],(urlchaptercount-chaptercount),urlchaptercount)

        if options['smarten_punctuation'] and options['fileform'] == "epub" \
                and calibre_version >= (0, 9, 39):
            # do smarten_punctuation from calibre's polish feature
            from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS
            from calibre.utils.logging import Log
            from collections import namedtuple

            # polish() takes its options as an object with one attribute
            # per entry of ALL_OPTS; build that from the defaults with
            # only smarten_punctuation switched on.
            data = {'smarten_punctuation':True}
            opts = ALL_OPTS.copy()
            opts.update(data)
            O = namedtuple('Options', ' '.join(ALL_OPTS.iterkeys()))
            opts = O(**opts)

            log = Log(level=Log.DEBUG)
            # report = []
            # polish the output file in place (input path == output path).
            polish({outfile:outfile}, opts, log, logger.info) # report.append

    except NotGoingToDownload as d:
        # Deliberate refusal: carries its own message and icon.
        book['good']=False
        book['comment']=unicode(d)
        book['icon'] = d.icon

    except Exception as e:
        # Anything else: record it on the book instead of killing the worker.
        book['good']=False
        book['comment']=unicode(e)
        book['icon']='dialog_error.png'
        book['status'] = 'Error'
        logger.info("Exception: %s:%s"%(book,unicode(e)))
        traceback.print_exc()

    #time.sleep(10)
    return book

150
calibre-plugin/prefs.py Normal file
View file

@ -0,0 +1,150 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Jim Miller'
__docformat__ = 'restructuredtext en'
import copy
from calibre.utils.config import JSONConfig
from calibre.gui2.ui import get_gui
from calibre_plugins.fanfictiondownloader_plugin.dialogs import OVERWRITE
from calibre_plugins.fanfictiondownloader_plugin.common_utils import get_library_uuid
# Namespace and key under which the settings are stored in a library's
# database prefs.
PREFS_NAMESPACE = 'FanFictionDownLoaderPlugin'
PREFS_KEY_SETTINGS = 'settings'

# Defaults used by all libraries.  Library specific settings continue
# to fall back to these for any key they do not set themselves.
default_prefs = {
    'personal.ini': get_resources('plugin-example.ini'),
    'rejecturls': '',
    'rejectreasons': '''Sucked
Boring
Dup from another site''',
    'reject_always': False,

    'updatemeta': True,
    'updatecover': False,
    'updateepubcover': False,
    'keeptags': False,
    'suppressauthorsort': False,
    'suppresstitlesort': False,
    'mark': False,
    'showmarked': False,
    'autoconvert': False,
    'urlsfromclip': True,
    'updatedefault': True,
    'fileform': 'epub',
    'collision': OVERWRITE,
    'deleteotherforms': False,
    'adddialogstaysontop': False,
    'includeimages': False,
    'lookforurlinhtml': False,
    'checkforseriesurlid': True,
    'checkforurlchange': True,
    'injectseries': False,
    'smarten_punctuation': False,

    'send_lists': '',
    'read_lists': '',
    'addtolists': False,
    'addtoreadlists': False,
    'addtolistsonread': False,

    'gcnewonly': False,
    'gc_site_settings': {},
    'allow_gc_from_ini': True,
    'gc_polish_cover': False,

    'countpagesstats': [],

    'errorcol': '',
    'custom_cols': {},
    'custom_cols_newonly': {},
    'allow_custcol_from_ini': True,

    'std_cols_newonly': {},
}

# Legacy store: this JSON file is where all preferences for this plugin
# *were* kept before they moved into the library database.
# The name (i.e. plugins/fanfictiondownloader_plugin) lives in a global
# namespace, so it is kept as unique as possible, and it is prefixed
# with plugins/ so it cannot clobber a calibre config file.
old_prefs = JSONConfig('plugins/fanfictiondownloader_plugin')
def set_library_config(library_config,db):
    '''
    Store ``library_config`` in ``db``'s namespaced prefs, under this
    plugin's namespace and settings key.
    '''
    library_prefs = db.prefs
    library_prefs.set_namespaced(PREFS_NAMESPACE, PREFS_KEY_SETTINGS, library_config)
def get_library_config(db):
    '''
    Return the plugin settings for the library behind ``db``.

    Settings still held in the old JSON store under this library's
    uuid are first migrated: written to the database, then re-keyed
    in the JSON store so they are not picked up again.  Otherwise the
    settings are read from the database, with a deep copy of
    default_prefs as the fallback value.
    '''
    uuid = get_library_uuid(db)

    if uuid in old_prefs:
        # One-time migration from json into the library database:
        # get it, set it, rename it in json.
        migrated = old_prefs[uuid]
        set_library_config(migrated, db)
        old_prefs["migrated to library db %s"%uuid] = old_prefs[uuid]
        del old_prefs[uuid]
        if migrated is not None:
            return migrated

    # Nothing (usable) to migrate: read from the database.
    fallback = copy.deepcopy(default_prefs)
    return db.prefs.get_namespaced(PREFS_NAMESPACE, PREFS_KEY_SETTINGS, fallback)
# fake out so I don't have to change the prefs calls anywhere. The
# Java programmer in me is offended by op-overloading, but it's very
# tidy.
class PrefsFacade():
    '''
    Dict-style facade over the per-library plugin settings, so callers
    can keep using prefs[key] without caring where settings live.

    Item writes and deletes only change the cached settings; call
    save_to_db() to store them in the library database.
    '''
    def _get_db(self):
        '''Return the db passed to the constructor, else the GUI's current db.'''
        if self.passed_db:
            return self.passed_db
        else:
            # In the GUI plugin we want current db so we detect when
            # it's changed.  CLI plugin calls need to pass db in.
            return get_gui().current_db

    def __init__(self,passed_db=None):
        self.default_prefs = default_prefs
        # uuid of the library that current_prefs was loaded for.
        self.libraryid = None
        # cached settings; loaded lazily by _get_prefs().
        self.current_prefs = None
        self.passed_db=passed_db

    def _get_prefs(self):
        '''Return the cached settings, reloading when the library has changed.'''
        libraryid = get_library_uuid(self._get_db())
        if self.current_prefs is None or self.libraryid != libraryid:
            #print("self.current_prefs is None(%s) or self.libraryid != libraryid(%s)"%(self.current_prefs is None,self.libraryid != libraryid))
            self.libraryid = libraryid
            self.current_prefs = get_library_config(self._get_db())
        return self.current_prefs

    def __getitem__(self,k):
        prefs = self._get_prefs()
        if k not in prefs:
            # not set for this library: fall back to the shared
            # defaults (raises KeyError if k is unknown there, too).
            return self.default_prefs[k]
        return prefs[k]

    def __setitem__(self,k,v):
        prefs = self._get_prefs()
        prefs[k]=v
        # self._save_prefs(prefs)

    def __delitem__(self,k):
        # deleting a key that is not set is silently ignored.
        prefs = self._get_prefs()
        if k in prefs:
            del prefs[k]

    def save_to_db(self):
        '''Write the current settings to the library database.'''
        set_library_config(self._get_prefs(),self._get_db())
# Module-level facade created without a db, so each access resolves the
# GUI's current library.
prefs = PrefsFacade()

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

10
cron.yaml Normal file
View file

@ -0,0 +1,10 @@
cron:
- description: cleanup job
url: /r3m0v3r
schedule: every 2 hours
# There's a bug in the Python 2.7 runtime that prevents this from
# working properly. In theory, there should never be orphans anyway.
#- description: orphan cleanup job
# url: /r3m0v3rOrphans
# schedule: every 4 hours

73
css/index.css Normal file
View file

@ -0,0 +1,73 @@
body
{
    font: 0.9em "Helvetica Neue", Arial, Helvetica, Geneva, sans-serif;
}

#main
{
    width: 60%;
    margin-left: 20%;
    background-color: #dae6ff;
    padding: 2em;
}

#greeting
{
    /* margin-bottom: 1em; */
    border-color: #efefef;
}

#logpassword:hover, #logpasswordtable:hover, #urlbox:hover, #typebox:hover, #helpbox:hover, #yourfile:hover
{
    border: thin solid #fffeff;
}

h1
{
    text-decoration: none;
}

#logpasswordtable
{
    padding: 1em;
}

#logpassword, #logpasswordtable {
    /* display: none; */
}

#urlbox, #typebox, #logpasswordtable, #logpassword, #helpbox, #yourfile
{
    margin: 1em;
    padding: 1em;
    border: thin dotted #fffeff;
}

div.field
{
    margin-bottom: 0.5em;
}

#submitbtn
{
    padding: 1em;
}

#typelabel
{
}

#typeoptions
{
    margin-top: 0.5em;
}

#error
{
    color: #f00;
}

.recent {
    font-size: large;
}

1884
defaults.ini Normal file

File diff suppressed because it is too large Load diff

59
delete_fic.py Normal file
View file

@ -0,0 +1,59 @@
import os
import cgi
import sys
import logging
import traceback
import StringIO
from google.appengine.api import users
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util
from fanficdownloader.downaloder import *
from fanficdownloader.ffnet import *
from fanficdownloader.output import *
from google.appengine.ext import db
from fanficdownloader.zipdir import *
from ffstorage import *
def create_mac(user, fic_id, fic_url):
    """Return the check string for a (user, fic_id, fic_url) triple.

    The string is two decimal numbers concatenated: the absolute value
    of hash(user) + hash(fic_id), then the absolute value of
    hash(fic_url).  It is built on the builtin hash(), not on a
    cryptographic primitive.
    """
    owner_part = abs(hash(user) + hash(fic_id))
    url_part = abs(hash(fic_url))
    return str(owner_part) + str(url_part)
def check_mac(user, fic_id, fic_url, mac):
    """Return True when ``mac`` equals the check string for the triple."""
    expected = create_mac(user, fic_id, fic_url)
    return expected == mac
def create_mac_for_fic(user, fic_id):
    """Return the check string for the stored fic ``fic_id``.

    ``fic_id`` is the datastore key string of the fic.  Returns None
    when no entity exists for that key or when it belongs to a
    different user, so callers comparing against a supplied check
    string simply see a mismatch.
    """
    key = db.Key(fic_id)
    fanfic = db.get(key)
    # db.get() yields None for a key with no entity; treat that like a
    # fic the user does not own instead of failing on fanfic.user.
    if fanfic is None or fanfic.user != user:
        return None
    return create_mac(user, key, fanfic.url)
class DeleteFicHandler(webapp.RequestHandler):
    """Request handler that deletes one of the current user's stored fics.

    Expects the query parameters 'fic_id' (datastore key string) and
    'key_id' (check string from create_mac_for_fic).
    """
    def get(self):
        user = users.get_current_user()
        if not user:
            self.redirect('/login')
            # Stop here: everything below needs a logged-in user
            # (ownership check, user.nickname()).
            return

        fic_id = self.request.get('fic_id')
        fic_mac = self.request.get('key_id')

        # Only delete when the supplied check string matches the one
        # computed for this user and fic.
        actual_mac = create_mac_for_fic(user, fic_id)
        if actual_mac != fic_mac:
            self.response.out.write("Ooops")
        else:
            key = db.Key(fic_id)
            fanfic = db.get(key)
            fanfic.delete()
            self.redirect('/recent')

        # Render the user's list of fics.
        fics = db.GqlQuery("Select * From DownloadedFanfic WHERE user = :1", user)
        template_values = dict(fics = fics, nickname = user.nickname())
        path = os.path.join(os.path.dirname(__file__), 'recent.html')
        self.response.out.write(template.render(path, template_values))

319
downloader.py Normal file
View file

@ -0,0 +1,319 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys, os
from os.path import normpath, expanduser, isfile, join
from StringIO import StringIO
from optparse import OptionParser
import getpass
import string
import ConfigParser
from subprocess import call
import pprint
import logging
if sys.version_info >= (2, 7):
    # suppresses default logger.  Logging is setup in fanficdownload/__init__.py so it works in calibre, too.
    # A NullHandler is attached to the root logger here.
    rootlogger = logging.getLogger()
    loghandler=logging.NullHandler()
    loghandler.setFormatter(logging.Formatter("(=====)(levelname)s:%(message)s"))
    rootlogger.addHandler(loghandler)
# Detect whether we are running inside calibre by probing for one of
# its modules; any failure is treated as "not calibre".
try:
    from calibre.constants import numeric_version as calibre_version
    is_calibre = True
except:
    is_calibre = False
# using try/except directly was masking errors during development.
if is_calibre:
    # running under calibre
    from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters,writers,exceptions
    from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.configurable import Configuration
    from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.epubutils import get_dcsource_chaptercount, get_update_data
    from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.geturls import get_urls_from_page
else:
    # standalone CLI: same names, imported from the plain package.
    from fanficdownloader import adapters,writers,exceptions
    from fanficdownloader.configurable import Configuration
    from fanficdownloader.epubutils import get_dcsource_chaptercount, get_update_data
    from fanficdownloader.geturls import get_urls_from_page
# NOTE(review): this version check only runs after the imports above
# have already succeeded.
if sys.version_info < (2, 5):
    print "This program requires Python 2.5 or newer."
    sys.exit(1)
def writeStory(config,adapter,writeformat,metaonly=False,outstream=None):
    '''
    Write the adapter's story in ``writeformat`` and return the output
    file name reported by the writer.

    ``metaonly`` and ``outstream`` are passed straight through to the
    writer's writeStory().
    '''
    story_writer = writers.getWriter(writeformat, config, adapter)
    story_writer.writeStory(outstream=outstream, metaonly=metaonly)
    # the writer is a local and is released when this function returns.
    return story_writer.getOutputFileName()
def main(argv,
parser=None,
passed_defaultsini=None,
passed_personalini=None):
# read in args, anything starting with -- will be treated as --<varible>=<value>
if not parser:
parser = OptionParser("usage: %prog [options] storyurl")
parser.add_option("-f", "--format", dest="format", default="epub",
help="write story as FORMAT, epub(default), mobi, text or html", metavar="FORMAT")
if passed_defaultsini:
config_help="read config from specified file(s) in addition to calibre plugin personal.ini, ~/.fanficdownloader/personal.ini, and ./personal.ini"
else:
config_help="read config from specified file(s) in addition to ~/.fanficdownloader/defaults.ini, ~/.fanficdownloader/personal.ini, ./defaults.ini, and ./personal.ini"
parser.add_option("-c", "--config",
action="append", dest="configfile", default=None,
help=config_help, metavar="CONFIG")
parser.add_option("-b", "--begin", dest="begin", default=None,
help="Begin with Chapter START", metavar="START")
parser.add_option("-e", "--end", dest="end", default=None,
help="End with Chapter END", metavar="END")
parser.add_option("-o", "--option",
action="append", dest="options",
help="set an option NAME=VALUE", metavar="NAME=VALUE")
parser.add_option("-m", "--meta-only",
action="store_true", dest="metaonly",
help="Retrieve metadata and stop. Or, if --update-epub, update metadata title page only.",)
parser.add_option("-u", "--update-epub",
action="store_true", dest="update",
help="Update an existing epub with new chapters, give epub filename instead of storyurl.",)
parser.add_option("--update-cover",
action="store_true", dest="updatecover",
help="Update cover in an existing epub, otherwise existing cover (if any) is used on update. Only valid with --update-epub.",)
parser.add_option("--force",
action="store_true", dest="force",
help="Force overwrite of an existing epub, download and overwrite all chapters.",)
parser.add_option("-l", "--list",
action="store_true", dest="list",
help="Get list of valid story URLs from page given.",)
parser.add_option("-n", "--normalize-list",
action="store_true", dest="normalize",default=False,
help="Get list of valid story URLs from page given, but normalized to standard forms.",)
parser.add_option("-s", "--sites-list",
action="store_true", dest="siteslist",default=False,
help="Get list of valid story URLs examples.",)
parser.add_option("-d", "--debug",
action="store_true", dest="debug",
help="Show debug output while downloading.",)
(options, args) = parser.parse_args(argv)
if not options.debug:
logger = logging.getLogger("fanficdownloader")
logger.setLevel(logging.INFO)
if not options.siteslist and len(args) != 1:
parser.error("incorrect number of arguments")
if options.siteslist:
for (site,examples) in adapters.getSiteExamples():
print("\n====%s====\n\nExample URLs:"%site)
for u in examples:
print(" * %s"%u)
return
if options.update and options.format != 'epub':
parser.error("-u/--update-epub only works with epub")
## Attempt to update an existing epub.
chaptercount = None
output_filename = None
if options.update:
try:
(url,chaptercount) = get_dcsource_chaptercount(args[0])
if not url:
print "No story URL found in epub to update."
return
print "Updating %s, URL: %s" % (args[0],url)
output_filename = args[0]
except:
# if there's an error reading the update file, maybe it's a URL?
# we'll look for an existing outputfile down below.
url = args[0]
else:
url = args[0]
try:
configuration = Configuration(adapters.getConfigSectionFor(url),options.format)
except exceptions.UnknownSite, e:
if options.list or options.normalize:
# list for page doesn't have to be a supported site.
configuration = Configuration("test1.com",options.format)
else:
raise e
conflist = []
homepath = join(expanduser("~"),".fanficdownloader")
if passed_defaultsini:
configuration.readfp(passed_defaultsini)
if isfile(join(homepath,"defaults.ini")):
conflist.append(join(homepath,"defaults.ini"))
if isfile("defaults.ini"):
conflist.append("defaults.ini")
if passed_personalini:
configuration.readfp(passed_personalini)
if isfile(join(homepath,"personal.ini")):
conflist.append(join(homepath,"personal.ini"))
if isfile("personal.ini"):
conflist.append("personal.ini")
if options.configfile:
conflist.extend(options.configfile)
logging.debug('reading %s config file(s), if present'%conflist)
configuration.read(conflist)
try:
configuration.add_section("overrides")
except ConfigParser.DuplicateSectionError:
pass
if options.force:
configuration.set("overrides","always_overwrite","true")
if options.update and chaptercount:
configuration.set("overrides","output_filename",output_filename)
if options.update and not options.updatecover:
configuration.set("overrides","never_make_cover","true")
# images only for epub, even if the user mistakenly turned it
# on else where.
if options.format not in ("epub","html"):
configuration.set("overrides","include_images","false")
if options.options:
for opt in options.options:
(var,val) = opt.split('=')
configuration.set("overrides",var,val)
if options.list or options.normalize:
retlist = get_urls_from_page(args[0], configuration, normalize=options.normalize)
print "\n".join(retlist)
return
try:
adapter = adapters.getAdapter(configuration,url)
adapter.setChaptersRange(options.begin,options.end)
# check for updating from URL (vs from file)
if options.update and not chaptercount:
try:
writer = writers.getWriter("epub",configuration,adapter)
output_filename=writer.getOutputFileName()
(noturl,chaptercount) = get_dcsource_chaptercount(output_filename)
print "Updating %s, URL: %s" % (output_filename,url)
except:
options.update = False
pass
## Check for include_images without no_image_processing. In absence of PIL, give warning.
if adapter.getConfig('include_images') and not adapter.getConfig('no_image_processing'):
try:
from calibre.utils.magick import Image
logging.debug("Using calibre.utils.magick")
except:
try:
import Image
logging.debug("Using PIL")
except:
print "You have include_images enabled, but Python Image Library(PIL) isn't found.\nImages will be included full size in original format.\nContinue? (y/n)?"
if not sys.stdin.readline().strip().lower().startswith('y'):
return
## three tries, that's enough if both user/pass & is_adult needed,
## or a couple tries of one or the other
for x in range(0,2):
try:
adapter.getStoryMetadataOnly()
except exceptions.FailedToLogin, f:
if f.passwdonly:
print "Story requires a password."
else:
print "Login Failed, Need Username/Password."
sys.stdout.write("Username: ")
adapter.username = sys.stdin.readline().strip()
adapter.password = getpass.getpass(prompt='Password: ')
#print("Login: `%s`, Password: `%s`" % (adapter.username, adapter.password))
except exceptions.AdultCheckRequired:
print "Please confirm you are an adult in your locale: (y/n)?"
if sys.stdin.readline().strip().lower().startswith('y'):
adapter.is_adult=True
if options.update and not options.force:
urlchaptercount = int(adapter.getStoryMetadataOnly().getMetadata('numChapters'))
if chaptercount == urlchaptercount and not options.metaonly:
print "%s already contains %d chapters." % (output_filename,chaptercount)
elif chaptercount > urlchaptercount:
print "%s contains %d chapters, more than source: %d." % (output_filename,chaptercount,urlchaptercount)
elif chaptercount == 0:
print "%s doesn't contain any recognizable chapters, probably from a different source. Not updating." % (output_filename)
else:
# update now handled by pre-populating the old
# images and chapters in the adapter rather than
# merging epubs.
(url,
chaptercount,
adapter.oldchapters,
adapter.oldimgs,
adapter.oldcover,
adapter.calibrebookmark,
adapter.logfile) = get_update_data(output_filename)
print "Do update - epub(%d) vs url(%d)" % (chaptercount, urlchaptercount)
if not (options.update and chaptercount == urlchaptercount) \
and adapter.getConfig("do_update_hook"):
chaptercount = adapter.hookForUpdates(chaptercount)
writeStory(configuration,adapter,"epub")
else:
# regular download
if options.metaonly:
pprint.pprint(adapter.getStoryMetadataOnly().getAllMetadata())
output_filename=writeStory(configuration,adapter,options.format,options.metaonly)
if not options.metaonly and adapter.getConfig("post_process_cmd"):
metadata = adapter.story.metadata
metadata['output_filename']=output_filename
call(string.Template(adapter.getConfig("post_process_cmd"))
.substitute(metadata), shell=True)
del adapter
except exceptions.InvalidStoryURL, isu:
print isu
except exceptions.StoryDoesNotExist, dne:
print dne
except exceptions.UnknownSite, us:
print us
if __name__ == "__main__":
    # Run as a script: pass everything after the program name to main().
    #import time
    #start = time.time()
    main(sys.argv[1:])
    #print("Total time seconds:%f"%(time.time()-start))

89
editconfig.html Normal file
View file

@ -0,0 +1,89 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html>
<head>
<link href="/css/index.css" rel="stylesheet" type="text/css">
<title>FanFictionDownLoader - read fanfiction from twilighted.net, fanfiction.net, fictionpress.com, fictionalley.org, ficwad.com, potionsandsnitches.net, harrypotterfanfiction.com, mediaminer.org on Kindle, Nook, Sony Reader, iPad, iPhone, Android, Aldiko, Stanza</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="google-site-verification" content="kCFc-G4bka_pJN6Rv8CapPBcwmq0hbAUZPkKWqRsAYU" />
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-12136939-1']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
</head>
<body>
<div id='main' style="width: 80%; margin-left: 10%;">
<h1>
<a href="/" style="text-decoration: none; color: black;">FanFictionDownLoader</a>
</h1>
<div style="text-align: center">
<script type="text/javascript"><!--
google_ad_client = "ca-pub-0320924304307555";
/* Standard */
google_ad_slot = "8974025478";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</div>
<form action="/editconfig" method="post">
<input type="hidden" name="update" value="true" />
<div id='logpasswordtable'>
<h3>Edit Config</h3>
<div id='logpassword'>
Editing configuration for {{ nickname }}.
</div>
<div class='fieldandlabel'>
<textarea name="config" style="width: 100%; height: 200px;" wrap='off'>{{ config }}</textarea>
</div>
</div>
<div id='submitbtn'>
<input type="submit" value="Save">
</div>
</form>
<div>
<h3>Default System configuration</h3>
<pre>
{{ defaultsini }}
</pre>
</div>
<div style='text-align: center'>
<img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"
alt="Powered by Google App Engine" />
<br/><br/>
This is a web front-end to <A href="http://code.google.com/p/fanficdownloader/">FanFictionDownLoader</a><br/>
Copyright &copy; Fanficdownloader team
</div>
<div style="margin-top: 1em; text-align: center">
<script type="text/javascript"><!--
google_ad_client = "pub-2027714004231956";
/* FFD */
google_ad_slot = "7330682770";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</div>
</div>
</body>
</html>

25
epubmerge.py Normal file
View file

@ -0,0 +1,25 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# epubmerge.py 1.0
# Copyright 2011, Jim Miller
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if __name__ == "__main__":
    # This file is only a stub: running it prints where the real
    # epubmerge now lives.
    print('''
This utility has been split out into its own project.
See: http://code.google.com/p/epubmerge/
...for a CLI epubmerge.py program and calibre plugin.
''')

103
example.ini Normal file
View file

@ -0,0 +1,103 @@
## This is an example of what your personal configuration might look
## like. Uncomment options by removing the '#' in front of them.
[defaults]
## Some sites also require the user to confirm they are adult for
## adult content. Uncomment by removing '#' in front of is_adult. In
## commandline version, this should go in your personal.ini, not
## defaults.ini.
#is_adult:true
## Don't like the numbers at the start of chapter titles on some
## sites? You can use strip_chapter_numbers to strip them off. Just
## want to make them all look the same? Strip them off, then add them
## back on with add_chapter_numbers. Don't like the way it strips
## numbers or adds them back? See chapter_title_strip_pattern and
## chapter_title_add_pattern.
#strip_chapter_numbers:true
#add_chapter_numbers:true
[epub]
## include images from img tags in the body and summary of stories.
## Images will be converted to jpg for size if possible. Images work
## in epub format only. To get mobi or other format with images,
## download as epub and use Calibre to convert.
#include_images:true
## If not set, the summary will have all html stripped for safety.
## Both this and include_images must be true to get images in the
## summary.
#keep_summary_html:true
## If set, the first image found will be made the cover image. If
## keep_summary_html is true, any images in summary will be before any
## in chapters.
#make_firstimage_cover:true
## Resize images down to width, height, preserving aspect ratio.
## Nook size, with margin.
#image_max_size: 580, 725
## Change image to grayscale, if graphics library allows, to save
## space.
#grayscale_images: false
## The most common use, I expect, will be saving usernames and
## passwords for different sites. Here are a few examples. See
## defaults.ini for the full list.
[www.twilighted.net]
#username:YourPenname
#password:YourPassword
## default is false
#collect_series: true
[www.ficwad.com]
#username:YourUsername
#password:YourPassword
[www.twiwrite.net]
#username:YourName
#password:yourpassword
## default is false
#collect_series: true
[www.adastrafanfic.com]
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content.
#is_adult:true
[www.thewriterscoffeeshop.com]
#username:YourName
#password:yourpassword
#is_adult:true
## default is false
#collect_series: true
[www.fictionalley.org]
#is_adult:true
[www.harrypotterfanfiction.com]
#is_adult:true
[www.fimfiction.net]
#is_adult:true
#fail_on_password: false
[www.tthfanfic.org]
#is_adult:true
## tth is a little unusual--it doesn't require user/pass, but the site
## keeps track of which chapters you've read and won't send another
## update until it thinks you're up to date. This way, on download,
## it thinks you're up to date.
#username:YourName
#password:yourpassword
## This section will override anything in the system defaults or other
## sections here.
[overrides]
## default varies by site. Set true here to force all sites to
## collect series.
#collect_series: true

BIN
fanficdownloader.zip Normal file

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,57 @@
# coding: utf-8
import re
import codecs
# Module-level list of tag strings shared by all the helpers below;
# the end of the list is the top of the stack.
stack = []
def get_end_tag(tag):
    """Return the closing tag for the last tag opened in ``tag``.

    Returns an empty string unless ``tag`` is non-empty and contains
    both '<' and '>'.
    """
    looks_like_tag = len(tag) > 0 and u'<' in tag and u'>' in tag
    if not looks_like_tag:
        return u''
    return re.sub(r'.*<([^\ >]+).*', r'</\1>', tag)
def get_tag_name(tag):
    """Return ``tag`` with its markup reduced to the bare tag name.

    Works for opening and closing tags alike.  Returns an empty string
    unless ``tag`` is non-empty and contains both '<' and '>'.
    """
    looks_like_tag = len(tag) > 0 and u'<' in tag and u'>' in tag
    if not looks_like_tag:
        return u''
    return re.sub(r'</*([^\ >]+).*', r'\1', tag)
def push(tag):
    """Push ``tag`` onto the stack.

    Anything that is empty or lacks '<' or '>' is ignored.
    """
    if len(tag) == 0:
        return
    if tag.find(u'<') == -1 or tag.rfind(u'>') == -1:
        return
    stack.append(tag)
def pop():
    """Remove and return the top tag, or an empty string if the stack is empty."""
    return stack.pop() if len(stack) > 0 else u''
def pop_end_tag():
    """Pop the top tag and return its closing tag as unicode."""
    opening = pop()
    closing = get_end_tag(opening)
    return unicode(closing)
def spool_end():
    """Return the closing tags for everything on the stack, innermost first.

    The stack itself is left untouched.
    """
    return u''.join(get_end_tag(opening) for opening in reversed(stack))
def spool_start():
    """Return every tag on the stack concatenated, outermost first.

    The stack itself is left untouched.
    """
    return u''.join(stack)
def has_elements():
    """Return True when at least one tag is on the stack."""
    return bool(stack)
def get_last():
    """Return the top tag without removing it, or an empty string if the stack is empty."""
    if len(stack) == 0:
        return u''
    return stack[-1]
def flush():
    """Empty the stack in place; the same list object is kept."""
    stack[:] = []
def get_stack():
    # Returns the module-level list itself, not a copy: changes made
    # by the caller are seen by every other helper.
    return stack

View file

@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# Logging is only configured here for plain command line use; the
# import probes below detect the other two environments.
try:
    # just a way to switch between web service and CLI/PI
    import google.appengine.api
except ImportError:
    try: # just a way to switch between CLI and PI
        import calibre.constants
    except ImportError:
        # Neither App Engine nor calibre: running from the command
        # line, so attach our own stream handler.
        import sys
        if sys.version_info >= (2, 7):
            import logging
            logger = logging.getLogger(__name__)
            loghandler=logging.StreamHandler()
            loghandler.setFormatter(logging.Formatter("FFDL:%(levelname)s:%(filename)s(%(lineno)d):%(message)s"))
            logger.addHandler(loghandler)
            loghandler.setLevel(logging.DEBUG)
            logger.setLevel(logging.DEBUG)

View file

@ -0,0 +1,248 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os, re, sys, glob, types
from os.path import dirname, basename, normpath
import logging
import urlparse as up
logger = logging.getLogger(__name__)
from .. import exceptions as exceptions
from ..configurable import Configuration
## must import each adapter here.
import adapter_test1
import adapter_fanfictionnet
import adapter_castlefansorg
import adapter_fictionalleyorg
import adapter_fictionpresscom
import adapter_ficwadcom
import adapter_fimfictionnet
import adapter_harrypotterfanfictioncom
import adapter_mediaminerorg
import adapter_potionsandsnitchesnet
import adapter_tenhawkpresentscom
import adapter_adastrafanficcom
import adapter_thewriterscoffeeshopcom
import adapter_tthfanficorg
import adapter_twilightednet
import adapter_twiwritenet
import adapter_whoficcom
import adapter_siyecouk
import adapter_archiveofourownorg
import adapter_ficbooknet
import adapter_portkeyorg
import adapter_mugglenetcom
import adapter_hpfandomnet
import adapter_thequidditchpitchorg
import adapter_nfacommunitycom
import adapter_midnightwhispersca
import adapter_ksarchivecom
import adapter_archiveskyehawkecom
import adapter_squidgeorgpeja
import adapter_libraryofmoriacom
import adapter_wraithbaitcom
import adapter_checkmatedcom
import adapter_chaossycophanthexcom
import adapter_dramioneorg
import adapter_erosnsapphosycophanthexcom
import adapter_lumossycophanthexcom
import adapter_occlumencysycophanthexcom
import adapter_phoenixsongnet
import adapter_walkingtheplankorg
import adapter_ashwindersycophanthexcom
import adapter_thehexfilesnet
import adapter_dokugacom
import adapter_iketernalnet
import adapter_onedirectionfanfictioncom
import adapter_storiesofardacom
import adapter_samdeanarchivenu
import adapter_destinysgatewaycom
import adapter_ncisfictionnet
import adapter_stargateatlantisorg
import adapter_thealphagatecom
import adapter_fanfiktionde
import adapter_ponyfictionarchivenet
import adapter_sg1heliopoliscom
import adapter_ncisficcom
import adapter_nationallibrarynet
import adapter_themasquenet
import adapter_pretendercentrecom
import adapter_darksolaceorg
import adapter_finestoriescom
import adapter_hpfanficarchivecom
import adapter_twilightarchivescom
import adapter_wizardtalesnet
import adapter_nhamagicalworldsus
import adapter_hlfictionnet
import adapter_grangerenchantedcom
import adapter_dracoandginnycom
import adapter_scarvesandcoffeenet
import adapter_thepetulantpoetesscom
import adapter_wolverineandroguecom
import adapter_sinfuldesireorg
import adapter_merlinficdtwinscouk
import adapter_thehookupzonenet
import adapter_bloodtiesfancom
import adapter_indeathnet
import adapter_qafficcom
import adapter_efpfanficnet
import adapter_potterficscom
import adapter_efictionestelielde
import adapter_dotmoonnet
import adapter_pommedesangcom
import adapter_restrictedsectionorg
import adapter_imagineeficcom
import adapter_buffynfaithnet
import adapter_psychficcom
import adapter_hennethannunnet
import adapter_tokrafandomnetcom
import adapter_netraptororg
import adapter_asr3slashzoneorg
import adapter_nickandgregnet
import adapter_potterheadsanonymouscom
import adapter_simplyundeniablecom
import adapter_scarheadnet
import adapter_fictionpadcom
import adapter_storiesonlinenet
import adapter_trekiverseorg
import adapter_literotica
import adapter_voracity2eficcom
import adapter_spikeluvercom
import adapter_bloodshedversecom
import adapter_nocturnallightnet
import adapter_fanfichu
import adapter_fanfictioncsodaidokhu
import adapter_fictionmaniatv
import adapter_bdsmgeschichten
import adapter_tolkienfanfiction
import adapter_themaplebookshelf
import adapter_fannation
## This bit of complexity allows adapters to be added by just adding
## importing. It eliminates the long if/else clauses we used to need
## to pick out the adapter.
## List of registered site adapters.
__class_list = []
__domain_map = {}
def imports():
    """Yield the ``__name__`` of every module object bound in this module's globals."""
    for candidate in globals().values():
        if not isinstance(candidate, types.ModuleType):
            continue
        yield candidate.__name__
# Register every imported adapter module: remember its adapter class and
# map each domain that class accepts back to the class.
for x in imports():
    if "fanficdownloader.adapters.adapter_" not in x:
        continue
    cls = sys.modules[x].getClass()
    __class_list.append(cls)
    for site in cls.getAcceptDomains():
        __domain_map[site] = cls
def getNormalStoryURL(url):
    """Return the normalized story URL for *url*, or None.

    None is returned when getNormalStoryURLSite() yields nothing.
    """
    found = getNormalStoryURLSite(url)
    return found[0] if found else None
def getNormalStoryURLSite(url):
    """Return ``(normalized_url, site_domain)`` for *url*, or None.

    An adapter is built for the URL using a shared dummy configuration; any
    ordinary exception raised while doing so means the URL is not usable and
    None is returned.
    """
    # The dummy configuration is created lazily and cached as an attribute
    # of getNormalStoryURL (a function-static).
    if not getNormalStoryURL.__dummyconfig:
        getNormalStoryURL.__dummyconfig = Configuration("test1.com","EPUB")
    # pulling up an adapter is pretty low over-head. If
    # it fails, it's a bad url.
    try:
        adapter = getAdapter(getNormalStoryURL.__dummyconfig,url)
        return (adapter.url, adapter.getSiteDomain())
    except Exception:
        # Deliberately best-effort, but KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        return None
# kludgey function static/singleton
getNormalStoryURL.__dummyconfig = None
def getAdapter(config,url,anyurl=False):
    """Instantiate the site adapter registered for *url*.

    With *anyurl* true, the adapter is built from the class's first example
    URL instead of the given one. Raises exceptions.UnknownSite when no
    adapter class matches; the adapter constructor may raise InvalidStoryURL.
    """
    (adapter_cls, fixedurl) = getClassFor(url)
    if not adapter_cls:
        # No adapter found.
        raise exceptions.UnknownSite( url, [known.getSiteDomain() for known in __class_list] )
    if anyurl:
        fixedurl = adapter_cls.getSiteExampleURLs().split()[0]
    return adapter_cls(config,fixedurl) # raises InvalidStoryURL
def getConfigSections():
    """Return the config section name of every registered adapter class."""
    sections = []
    for adapter_cls in __class_list:
        sections.append(adapter_cls.getConfigSection())
    return sections
def getSiteExamples():
    """Return ``(config_section, [example_url, ...])`` pairs sorted by section."""
    ordered = sorted(__class_list, key=lambda adapter_cls : adapter_cls.getConfigSection())
    return [(adapter_cls.getConfigSection(), adapter_cls.getSiteExampleURLs().split())
            for adapter_cls in ordered]
def getConfigSectionFor(url):
    """Return the config section of the adapter class matching *url*.

    Raises exceptions.UnknownSite when no registered adapter matches.
    """
    (adapter_cls, _fixedurl) = getClassFor(url)
    if not adapter_cls:
        # No adapter found.
        raise exceptions.UnknownSite( url, [known.getSiteDomain() for known in __class_list] )
    return adapter_cls.getConfigSection()
def getClassFor(url):
    """Find the adapter class for *url* and return ``(cls, fixedurl)``.

    The URL is normalized first: surrounding whitespace is stripped, a
    missing or mangled protocol becomes http/https, any '#...' fragment is
    dropped and the host is lower-cased. The host is looked up as given,
    then without a leading 'www.', then with 'www.' prepended; *fixedurl*
    is rewritten to match whichever form was tried last. *cls* is None
    when no adapter is registered for the host.
    """
    ## fix up leading protocol.
    fixedurl = re.sub(r"(?i)^[htp]+(s?)[:/]+",r"http\1://",url.strip())
    # Build on the stripped value; the original re-used the raw *url* here,
    # which put leading/trailing whitespace back into the result.
    if fixedurl.startswith("//"):
        fixedurl = "http:%s"%fixedurl
    if not fixedurl.startswith("http"):
        fixedurl = "http://%s"%fixedurl
    ## remove any trailing '#' locations.
    fixedurl = re.sub(r"#.*$","",fixedurl)

    parsedUrl = up.urlparse(fixedurl)
    domain = parsedUrl.netloc.lower()
    if( domain != parsedUrl.netloc ):
        fixedurl = fixedurl.replace(parsedUrl.netloc,domain)

    cls = getClassFromList(domain)
    if not cls and domain.startswith("www."):
        # Drop only the leading 'www.'; str.replace would also remove any
        # later occurrence inside the host name.
        domain = domain[len("www."):]
        #logger.debug("trying site:without www: "+domain)
        cls = getClassFromList(domain)
        fixedurl = re.sub(r"^http(s?)://www\.",r"http\1://",fixedurl)
    if not cls:
        #logger.debug("trying site:www."+domain)
        cls = getClassFromList("www."+domain)
        fixedurl = re.sub(r"^http(s?)://",r"http\1://www.",fixedurl)

    if cls:
        fixedurl = cls.stripURLParameters(fixedurl)

    return (cls,fixedurl)
def getClassFromList(domain):
    """Return the adapter class registered for *domain*, or None if absent."""
    return __domain_map.get(domain)

View file

@ -0,0 +1,230 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
    """Site adapter for stories hosted on www.adastrafanfic.com."""

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','aaff')
        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

    @staticmethod
    def getSiteDomain():
        return 'www.adastrafanfic.com'

    @classmethod
    def getSiteExampleURLs(self):
        return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and fill in metadata and chapter URLs.

        Raises exceptions.StoryDoesNotExist on HTTP 404 and
        exceptions.AdultCheckRequired when the site shows its adult-content
        notice; other HTTP errors propagate unchanged.
        """
        if self.is_adult or self.getConfig("is_adult"):
            addurl = "&warning=5"
        else:
            addurl=""

        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

        if "Content is only suitable for mature adults. May contain explicit language and adult themes. Equivalent of NC-17." in data:
            raise exceptions.AdultCheckRequired(self.url)

        # problems with some stories, but only in calibre. I suspect
        # issues with different SGML parsers in python. This is a
        # nasty hack, but it works.
        data = data[data.index("<body"):]

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        ## <meta name='description' content='&lt;p&gt;Description&lt;/p&gt; ...' >
        ## Summary, strangely, is in the content attr of a <meta name='description'> tag
        ## which is escaped HTML. Unfortunately, we can't use it because they don't
        ## escape (') chars in the desc, breaking the tag.
        #meta_desc = soup.find('meta',{'name':'description'})
        #metasoup = bs.BeautifulStoneSoup(meta_desc['content'])
        #self.story.setMetadata('description',stripHTML(metasoup))

        def defaultGetattr(d,k):
            # d[k], or "" for nodes that cannot be indexed by k.
            try:
                return d[k]
            except Exception:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = ''
                while value and not defaultGetattr(value,'class') == 'label':
                    svalue += str(value)
                    value = value.nextSibling
                # sometimes poorly formatted desc (<p> w/o </p>) leads
                # to all labels being included.
                # Only truncate when the marker is present: find() returns
                # -1 otherwise and slicing to -1 would drop the last char.
                cut = svalue.find('<span class="label">')
                if cut != -1:
                    svalue = svalue[:cut]
                self.setDescription(url,svalue)
                #self.story.setMetadata('description',stripHTML(svalue))

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
                genrestext = [genre.string for genre in genres]
                self.genre = ', '.join(genrestext)
                for genre in genrestext:
                    self.story.addToList('genre',genre)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
                warningstext = [warning.string for warning in warnings]
                self.warning = ', '.join(warningstext)
                for warning in warningstext:
                    self.story.addToList('warnings',warning)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(value.strip(), "%d %b %Y"))

            if 'Updated' in label:
                # there's a stray [ at the end.
                #value = value[0:-1]
                self.story.setMetadata('dateUpdated', makeDate(value.strip(), "%d %b %Y"))

        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            i=1
            for a in storyas:
                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl',series_url)
                    break
                i+=1
        except Exception:
            # Series info is optional: parsing failures (including "no
            # series link at all") are ignored on purpose.
            pass

    def getChapterText(self, url):
        """Return the chapter at *url*, taken from its <div id="story">.

        Raises exceptions.FailedToDownload when that element is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        data = self._fetchUrl(url)
        # problems with some stories, but only in calibre. I suspect
        # issues with different SGML parsers in python. This is a
        # nasty hack, but it works.
        data = data[data.index("<body"):]

        soup = bs.BeautifulStoneSoup(data,
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        span = soup.find('div', {'id' : 'story'})

        if span is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,span)
def getClass():
    """Return the adapter class defined by this module."""
    return AdAstraFanficComSiteAdapter

View file

@ -0,0 +1,376 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class defined by this module."""
    return ArchiveOfOurOwnOrgAdapter
logger = logging.getLogger(__name__)
class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
    """Site adapter for works hosted on archiveofourown.org."""

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["utf8",
                       "Windows-1252"] # 1252 is a superset of iso-8859-1.
                                       # Most sites that claim to be
                                       # iso-8859-1 (and some that claim to be
                                       # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--the site URL pattern captures it as 'id'.
        m = re.match(self.getSiteURLPattern(),url)
        if m:
            self.story.setMetadata('storyId',m.group('id'))

            # normalized story URL.
            self._setURL('http://' + self.getSiteDomain() + '/works/'+self.story.getMetadata('storyId'))
        else:
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','ao3')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y-%b-%d"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain. Does have www here, if it uses it.
        return 'archiveofourown.org'

    @classmethod
    def getSiteExampleURLs(self):
        return "http://"+self.getSiteDomain()+"/works/123456 http://"+self.getSiteDomain()+"/collections/Some_Archive/works/123456 http://"+self.getSiteDomain()+"/works/123456/chapters/78901"

    def getSiteURLPattern(self):
        # http://archiveofourown.org/collections/Smallville_Slash_Archive/works/159770
        # Discard leading zeros from story ID numbers--AO3 doesn't use them in its own chapter URLs.
        return r"https?://"+re.escape(self.getSiteDomain())+r"(/collections/[^/]+)?/works/0*(?P<id>\d+)"

    ## Login
    def needToLoginCheck(self, data):
        """Return True when *data* shows a login-required or bad-login message."""
        return ('This work is only available to registered users of the Archive.' in data
                or "The password or user name you entered doesn't match our records" in data)

    def performLogin(self, url, data):
        """Post the login form; return True or raise exceptions.FailedToLogin.

        The authenticity token is cut out of the already fetched page *data*.
        """
        params = {}
        if self.password:
            params['user_session[login]'] = self.username
            params['user_session[password]'] = self.password
        else:
            params['user_session[login]'] = self.getConfig("username")
            params['user_session[password]'] = self.getConfig("password")
        params['user_session[remember_me]'] = '1'
        params['commit'] = 'Log in'
        #params['utf8'] = u'✓'#u'\x2713' # gets along with out it, and it confuses the encoder.
        params['authenticity_token'] = data.split('input name="authenticity_token" type="hidden" value="')[1].split('" /></div>')[0]

        loginUrl = 'http://' + self.getSiteDomain() + '/user_sessions'
        logger.info("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                            params['user_session[login]']))

        d = self._postUrl(loginUrl, params)
        #logger.info(d)

        if "Successfully logged in" not in d : #Member Account
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['user_session[login]']))
            raise exceptions.FailedToLogin(url,params['user_session[login]'])
        return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the navigate and work pages and fill in metadata and chapters.

        Raises exceptions.AdultCheckRequired, exceptions.StoryDoesNotExist
        or exceptions.FailedToLogin as the fetched pages dictate; other
        HTTP errors propagate unchanged.
        """
        if self.is_adult or self.getConfig("is_adult"):
            addurl = "?view_adult=true"
        else:
            addurl=""

        metaurl = self.url+addurl
        url = self.url+'/navigate'+addurl
        logger.info("url: "+url)
        logger.info("metaurl: "+metaurl)

        try:
            data = self._fetchUrl(url)
            meta = self._fetchUrl(metaurl)

            if "This work could have adult content. If you proceed you have agreed that you are willing to see such content." in meta:
                raise exceptions.AdultCheckRequired(self.url)

        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

        if "Sorry, we couldn&#x27;t find the work you were looking for." in data:
            raise exceptions.StoryDoesNotExist(self.url)

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url,data)
            data = self._fetchUrl(url)
            meta = self._fetchUrl(metaurl)

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        metasoup = bs.BeautifulSoup(meta)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r"^/works/\d+$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        alist = soup.findAll('a', href=re.compile(r"^/users/\w+/pseuds/\w+"))
        if len(alist) < 1: # ao3 allows for author 'Anonymous' with no author link.
            self.story.setMetadata('author','Anonymous')
            self.story.setMetadata('authorUrl','http://archiveofourown.org/')
            self.story.setMetadata('authorId','0')
        else:
            for a in alist:
                self.story.addToList('authorId',a['href'].split('/')[2])
                self.story.addToList('authorUrl','http://'+self.host+a['href'])
                self.story.addToList('author',a.text)

        newestChapter = None
        # save for comparing during update; stays None for a
        # single-chapter work because the dated loop below is skipped.
        self.newestChapterNum = None
        # Scan all chapters to find the oldest and newest, on AO3 it's
        # possible for authors to insert new chapters out-of-order or
        # change the dates of earlier ones by editing them--That WILL
        # break epub update.
        # Find the chapters:
        chapters=soup.findAll('a', href=re.compile(r'/works/'+self.story.getMetadata('storyId')+r"/chapters/\d+$"))
        self.story.setMetadata('numChapters',len(chapters))
        logger.debug("numChapters: (%s)"%self.story.getMetadata('numChapters'))
        if len(chapters)==1:
            self.chapterUrls.append((self.story.getMetadata('title'),'http://'+self.host+chapters[0]['href']+addurl))
        else:
            for index, chapter in enumerate(chapters):
                # strip just in case there's tags, like <i> in chapter titles.
                self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+chapter['href']+addurl))
                # (2013-09-21)
                date = stripHTML(chapter.findNext('span'))[1:-1]
                chapterDate = makeDate(date,self.dateformat)
                if newestChapter is None or chapterDate > newestChapter:
                    newestChapter = chapterDate
                    self.newestChapterNum = index

        a = metasoup.find('blockquote',{'class':'userstuff'})
        if a is not None:
            self.setDescription(url,a)
            #self.story.setMetadata('description',a.text)

        a = metasoup.find('dd',{'class':"rating tags"})
        if a is not None:
            self.story.setMetadata('rating',stripHTML(a.text))

        a = metasoup.find('dd',{'class':"fandom tags"})
        # guarded like the other tag groups; it used to be dereferenced
        # unconditionally.
        if a is not None:
            fandoms = a.findAll('a',{'class':"tag"})
            for fandom in fandoms:
                self.story.addToList('fandoms',fandom.string)
                self.story.addToList('category',fandom.string)

        a = metasoup.find('dd',{'class':"warning tags"})
        if a is not None:
            warnings = a.findAll('a',{'class':"tag"})
            for warning in warnings:
                self.story.addToList('warnings',warning.string)

        a = metasoup.find('dd',{'class':"freeform tags"})
        if a is not None:
            genres = a.findAll('a',{'class':"tag"})
            for genre in genres:
                self.story.addToList('freeformtags',genre.string)
                self.story.addToList('genre',genre.string)

        a = metasoup.find('dd',{'class':"category tags"})
        if a is not None:
            genres = a.findAll('a',{'class':"tag"})
            for genre in genres:
                # NOTE(review): this compares the tag object itself with
                # "Gen", not its text, so it probably never filters
                # anything -- confirm intent before changing it.
                if genre != "Gen":
                    self.story.addToList('ao3categories',genre.string)
                    self.story.addToList('genre',genre.string)

        a = metasoup.find('dd',{'class':"character tags"})
        if a is not None:
            chars = a.findAll('a',{'class':"tag"})
            for char in chars:
                self.story.addToList('characters',char.string)

        a = metasoup.find('dd',{'class':"relationship tags"})
        if a is not None:
            ships = a.findAll('a',{'class':"tag"})
            for ship in ships:
                self.story.addToList('ships',ship.string)

        a = metasoup.find('dd',{'class':"collections"})
        if a is not None:
            collections = a.findAll('a')
            for collection in collections:
                self.story.addToList('collections',collection.string)

        stats = metasoup.find('dl',{'class':'stats'})
        dt = stats.findAll('dt')
        dd = stats.findAll('dd')
        for x in range(0,len(dt)):
            label = dt[x].text
            value = dd[x].text

            if 'Words:' in label:
                self.story.setMetadata('numWords', value)

            if 'Comments:' in label:
                self.story.setMetadata('comments', value)

            if 'Kudos:' in label:
                self.story.setMetadata('kudos', value)

            if 'Hits:' in label:
                self.story.setMetadata('hits', value)

            if 'Bookmarks:' in label:
                self.story.setMetadata('bookmarks', value)

            if 'Chapters:' in label:
                # "n/m": complete when both numbers are equal.
                if value.split('/')[0] == value.split('/')[1]:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

            if 'Completed' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Find Series name from series URL.
        ddseries = metasoup.find('dd',{'class':"series"})

        if ddseries:
            for i, a in enumerate(ddseries.findAll('a', href=re.compile(r"/series/\d+"))):
                series_name = stripHTML(a)
                series_url = 'http://'+self.host+a['href']
                series_index = int(stripHTML(a.previousSibling).replace(', ','').split(' ')[1]) # "Part # of" or ", Part #"
                self.story.setMetadata('series%02d'%i,"%s [%s]"%(series_name,series_index))
                self.story.setMetadata('series%02dUrl'%i,series_url)
                if i == 0:
                    self.setSeries(series_name, series_index)
                    self.story.setMetadata('seriesUrl',series_url)

    def hookForUpdates(self,chaptercount):
        """Discard stored chapters from the newest-dated chapter onward.

        Returns the number of old chapters kept. *chaptercount* is not used
        by this implementation.
        """
        # newestChapterNum is None for single-chapter works; without this
        # guard the message below evaluated None+1.
        if (self.oldchapters
            and self.newestChapterNum is not None
            and len(self.oldchapters) > self.newestChapterNum):
            print("Existing epub has %s chapters\nNewest chapter is %s. Discarding old chapters from there on."%(len(self.oldchapters), self.newestChapterNum+1))
            self.oldchapters = self.oldchapters[:self.newestChapterNum]
        # NOTE(review): oldchapters is presumably a list or None; an unset
        # value now counts as zero kept chapters -- confirm against caller.
        return len(self.oldchapters) if self.oldchapters else 0

    def _appendNotes(self, chapter, heading, content):
        # Append heading plus content to chapter, skipping both when the
        # content element was not found.
        if content is not None:
            chapter.append(heading)
            chapter.append(content)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Return the chapter at *url* with the note sections not excluded.

        Note sections named in the 'exclude_notes' config list are left out.
        Raises exceptions.FailedToDownload when the chapter body element is
        missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        chapter=bs.BeautifulSoup('<div class="story"></div>').find('div')
        data = self._fetchUrl(url)
        soup = bs.BeautifulSoup(data,selfClosingTags=('br','hr'))

        # The body is the one required element; check it before use. The
        # old check tested `soup`, and only after `text` had been used.
        text = soup.find('div', {'class' : "userstuff module"})
        if text is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        exclude_notes=self.getConfigList('exclude_notes')

        if 'authorheadnotes' not in exclude_notes:
            preface = soup.find('div', {'class' : "preface group"})
            if preface is not None:
                headnotes = preface.find('div', {'class' : "notes module"})
                if headnotes is not None:
                    self._appendNotes(chapter, "<b>Author's Note:</b>",
                                      headnotes.find('blockquote', {'class' : "userstuff"}))

        if 'chaptersummary' not in exclude_notes:
            chapsumm = soup.find('div', {'id' : "summary"})
            if chapsumm is not None:
                self._appendNotes(chapter, "<b>Summary for the Chapter:</b>",
                                  chapsumm.find('blockquote'))

        if 'chapterheadnotes' not in exclude_notes:
            chapnotes = soup.find('div', {'id' : "notes"})
            if chapnotes is not None:
                self._appendNotes(chapter, "<b>Notes for the Chapter:</b>",
                                  chapnotes.find('blockquote'))

        chtext = text.find('h3', {'class' : "landmark heading"})
        if chtext:
            chtext.extract()
        chapter.append(text)

        if 'chapterfootnotes' not in exclude_notes:
            chapfoot = soup.find('div', {'class' : "end notes module", 'role' : "complementary"})
            if chapfoot is not None:
                self._appendNotes(chapter, "<b>Notes for the Chapter:</b>",
                                  chapfoot.find('blockquote'))

        if 'authorfootnotes' not in exclude_notes:
            footnotes = soup.find('div', {'id' : "work_endnotes"})
            if footnotes is not None:
                self._appendNotes(chapter, "<b>Author's Note:</b>",
                                  footnotes.find('blockquote'))

        return self.utf8FromSoup(url,chapter)

View file

@ -0,0 +1,193 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class defined by this module."""
    return ArchiveSkyeHawkeComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class ArchiveSkyeHawkeComAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
self.is_adult=False
# get storyId from url--url validation guarantees query is only sid=1234
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/story.php?no='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','ash')
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%Y-%m-%d"
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'archive.skyehawke.com'
@classmethod
def getAcceptDomains(cls):
return ['archive.skyehawke.com','www.skyehawke.com']
@classmethod
def getSiteExampleURLs(self):
return "http://archive.skyehawke.com/story.php?no=1234 http://www.skyehawke.com/archive/story.php?no=1234 http://skyehawke.com/archive/story.php?no=1234"
def getSiteURLPattern(self):
return re.escape("http://")+r"(archive|www)\.skyehawke\.com/(archive/)?story\.php\?no=\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
url = self.url
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
data = self._fetchUrl(url)
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# print data
# Now go hunting for all the meta data and the chapter list.
## Title
a = soup.find('div', {'class':"story border"}).find('span',{'class':'left'})
title=stripHTML(a).split('"')[1]
self.story.setMetadata('title',title)
# Find authorid and URL from... author url.
author = a.find('a')
self.story.setMetadata('authorId',author['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+author['href'])
self.story.setMetadata('author',author.string)
authorSoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
chapter=soup.find('select',{'name':'chapter'}).findAll('option')
for i in range(1,len(chapter)):
ch=chapter[i]
self.chapterUrls.append((stripHTML(ch),ch['value']))
self.story.setMetadata('numChapters',len(self.chapterUrls))
# eFiction sites don't help us out a lot with their meta data
# formating, so it's a little ugly.
box=soup.find('div', {'class': "container borderridge"})
sum=box.find('span').text
self.setDescription(url,sum)
boxes=soup.findAll('div', {'class': "container bordersolid"})
for box in boxes:
if box.find('b') != None and box.find('b').text == "History and Story Information":
for b in box.findAll('b'):
if "words" in b.nextSibling:
self.story.setMetadata('numWords', b.text)
if "archived" in b.previousSibling:
self.story.setMetadata('datePublished', makeDate(stripHTML(b.text), self.dateformat))
if "updated" in b.previousSibling:
self.story.setMetadata('dateUpdated', makeDate(stripHTML(b.text), self.dateformat))
if "fandom" in b.nextSibling:
self.story.addToList('category', b.text)
for br in box.findAll('br'):
br.replaceWith('split')
genre=box.text.split("Genre:")[1].split("split")[0]
if not "Unspecified" in genre:
self.story.addToList('genre',genre)
if box.find('span') != None and box.find('span').text == "WARNING":
rating=box.findAll('span')[1]
rating.find('br').replaceWith('split')
rating=rating.text.replace("This story is rated",'').split('split')[0]
self.story.setMetadata('rating',rating)
logger.debug(self.story.getMetadata('rating'))
warnings=box.find('ol')
if warnings != None:
warnings=warnings.text.replace(']', '').replace('[', '').split(' ')
for warning in warnings:
self.story.addToList('warnings',warning)
for asoup in authorSoup.findAll('div', {'class':"story bordersolid"}):
if asoup.find('a')['href'] == 'story.php?no='+self.story.getMetadata('storyId'):
if '[ Completed ]' in asoup.text:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
chars=asoup.findNext('div').text.split('Characters')[1].split(']')[0]
for char in chars.split(','):
if not "None" in char:
self.story.addToList('characters',char)
break
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Download one chapter page and return its story text.

    url -- address of the chapter page.

    Returns whatever self.utf8FromSoup() produces for the chapter's
    text <div>.
    Raises exceptions.FailedToDownload when the expected markup is not
    present in the downloaded page.
    """
    logger.debug('Getting chapter text from: %s' % url)

    soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                 selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

    # The text is taken from the second <div> following the
    # "chapter bordersolid" <div>.  Walk there one step at a time so a
    # missing element ends up as FailedToDownload below instead of an
    # AttributeError on None.
    div = soup.find('div',{'class':"chapter bordersolid"})
    for _ in range(2):
        if div is None:
            break
        div = div.findNext('div')

    if div is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,253 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class defined in this module."""
    return AshwinderSycophantHexComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class AshwinderSycophantHexComAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on ashwinder.sycophanthex.com.

    Normalizes the story URL, logs in when the site asks for it, reads
    the chapter list from the story page and the remaining metadata
    from the author's page, and downloads chapter text.
    """

    def __init__(self, config, url):
        """Set up decoding, credentials, story id and normalized URL.

        config -- configuration object handed on to BaseSiteAdapter.
        url -- story URL as supplied by the caller.
        """
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','asph')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain (this site does not use www)."""
        return 'ashwinder.sycophanthex.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example of a story URL accepted by this adapter."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regular expression a story URL has to match."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True when the page text in data asks for a login."""
        return ('This story contains adult content and/or themes.' in data
                or "That password doesn't match the one in our database" in data)

    def performLogin(self, url):
        """Post the login form of the site.

        Credentials set on the adapter are used when self.password is
        non-empty, otherwise the configured username/password.

        url -- story URL, only used for the error report.

        Returns True on success.
        Raises exceptions.FailedToLogin when the response does not
        contain "Logout".
        """
        params = {}

        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['rememberme'] = '1'
        params['sid'] = ''
        params['intent'] = ''
        params['submit'] = 'Submit'

        loginUrl = 'http://' + self.getSiteDomain() + '/user.php'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              params['penname']))

        d = self._fetchUrl(loginUrl, params)

        if "Logout" not in d:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                              params['penname']))
            raise exceptions.FailedToLogin(url,params['penname'])
        return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Collect the chapter list and the story metadata.

        Fills self.chapterUrls and self.story from the story page and
        from the matching entry on the author's page.

        Raises exceptions.StoryDoesNotExist on HTTP 404,
        exceptions.FailedToLogin when a required login fails and
        exceptions.FailedToDownload when the site denies access or the
        story cannot be found on the author's page.
        """
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            raise

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)
        asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))

        try:
            # in case link points somewhere other than the first chapter
            a = soup.findAll('option')[1]['value']
            self.story.setMetadata('storyId',a.split('=',)[1])
            url = 'http://'+self.host+'/'+a
            soup = bs.BeautifulSoup(self._fetchUrl(url))
        except Exception:
            # Deliberately best effort: pages without a usable chapter
            # drop-down are handled by the code below.
            logger.debug("No chapter drop-down redirect used for %s" % url)

        # Locate this story's entry on the author's page; 'info' is
        # used for the rest of the metadata below.
        storyLink = re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$")
        for info in asoup.findAll('table', {'width' : '100%', 'bordercolor' : re.compile(r'#')}):
            a = info.find('a', href=storyLink)
            if a is not None:
                self.story.setMetadata('title',stripHTML(a))
                break
        else:
            # Without the entry there is no title and no metadata
            # table; fail clearly instead of parsing an unrelated one.
            raise exceptions.FailedToDownload("Error collecting metadata: story %s not found on author page %s" % (self.url, self.story.getMetadata('authorUrl')))

        # Find the chapters:
        chapters=soup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+&i=1$'))
        if not chapters:
            self.chapterUrls.append((self.story.getMetadata('title'),url))
        else:
            for chapter in chapters:
                # just in case there's tags, like <i> in chapter titles.
                self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formatting, so it's a little ugly.

        def nodeName(node):
            # Tag name of a soup node, "" for nodes without one.
            return getattr(node, 'name', "")

        for cat in info.findAll('a',href=re.compile('categories.php')):
            self.story.addToList('category',cat.string)

        # The summary is everything between the first <br> after the
        # reviews link and the next <table>.
        a = info.find('a', href=re.compile(r'reviews.php\?sid='+self.story.getMetadata('storyId')))
        val = a.nextSibling
        svalue = ""
        while not nodeName(val) == 'br':
            val = val.nextSibling
        val = val.nextSibling
        while not nodeName(val) == 'table':
            svalue += str(val)
            val = val.nextSibling
        self.setDescription(url,svalue)

        # Each <b> label is followed by its value as next sibling.
        for labelspan in info.findAll('b'):
            value = labelspan.nextSibling
            label = stripHTML(labelspan)

            if 'Rating' in label:
                self.story.setMetadata('rating', value)

            if 'Word Count' in label:
                self.story.setMetadata('numWords', value)

            if 'Genres' in label:
                for genre in value.string.split(', '):
                    if genre != 'none':
                        self.story.addToList('genre',genre)

            if 'Warnings' in label:
                for warning in value.string.split(', '):
                    if warning != ' none':
                        self.story.addToList('warnings',warning)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Download one chapter page and return its story text.

        url -- address of the chapter page.

        Raises exceptions.FailedToDownload when the text <div> is not
        found.
        """
        logger.debug('Getting chapter text from: %s' % url)

        data = self._fetchUrl(url)

        soup = bs.BeautifulSoup(data, selfClosingTags=('br','hr','span','center')) # some chapters seem to be hanging up on those tags, so it is safer to close them

        story = soup.find('div', {"align" : "left"})

        if story is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,story)

View file

@ -0,0 +1,226 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class defined in this module."""
    return Asr3SlashzoneOrgAdapter
class Asr3SlashzoneOrgAdapter(BaseSiteAdapter):
    """Adapter for stories hosted under asr3.slashzone.org/archive/.

    Normalizes the story URL, passes the site's age-consent warning
    when the user is flagged as adult, and extracts chapter list,
    metadata, series information and chapter text.
    """

    def __init__(self, config, url):
        """Set up decoding, story id, normalized URL and date format.

        config -- configuration object handed on to BaseSiteAdapter.
        url -- story URL as supplied by the caller.
        """
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/archive/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','asr3')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d/%m/%y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain."""
        return 'asr3.slashzone.org'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example of a story URL accepted by this adapter."""
        return "http://"+cls.getSiteDomain()+"/archive/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regular expression a story URL has to match."""
        return re.escape("http://"+self.getSiteDomain()+"/archive/viewstory.php?sid=")+r"\d+$"

    def _fetchStoryPage(self, url):
        """Fetch url; an HTTP 404 becomes StoryDoesNotExist(self.url)."""
        try:
            return self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            raise

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Collect the chapter list and the story metadata.

        Fills self.chapterUrls and self.story from the story index
        page (and, best effort, the series page).

        Raises exceptions.StoryDoesNotExist on HTTP 404,
        exceptions.AdultCheckRequired when the site shows an age
        warning and the user is not flagged as adult, and
        exceptions.FailedToDownload when the site denies access.
        """
        adult = self.is_adult or self.getConfig("is_adult")

        if adult:
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&ageconsent=ok&warning=3"
        else:
            addurl=""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)

        data = self._fetchStoryPage(url)

        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
        if m is not None:
            if not adult:
                raise exceptions.AdultCheckRequired(self.url)
            # We tried the default and still got a warning, so
            # let's pull the warning number from the 'continue'
            # link and reload data.
            addurl = m.group(1)
            # correct stupid &amp; error in url.
            addurl = addurl.replace("&amp;","&")
            url = self.url+'&index=1'+addurl
            logger.debug("URL 2nd try: "+url)

            data = self._fetchStoryPage(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/archive/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Rating: the text inside the last [...] of the page title.
        rate = stripHTML(soup.find('div',{'id':'pagetitle'}))
        rate = rate[rate.rindex('[')+1:rate.rindex(']')]
        self.story.setMetadata('rating', rate)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/archive/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formatting, so it's a little ugly.

        metadiv = soup.find('div',{'class':'content'})
        smalldiv = metadiv.find('div',{'class':'small'})

        for category in smalldiv.parent.findAll('a',href=re.compile(r'browse.php\?type=categories')):
            self.story.addToList('category',category.string)

        for char in smalldiv.parent.findAll('a',href=re.compile(r'browse.php\?type=characters')):
            self.story.addToList('characters',char.string)

        for ship in smalldiv.parent.findAll('a',href=re.compile(r'browse\.php\?type=class&type_id=2&classid=1')):
            self.story.addToList('ships',ship.string)

        metatext = stripHTML(smalldiv)

        if 'Completed: Yes' in metatext:
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        # 12 == len('Word count: '); the count runs up to the next
        # space, or to the end of the text when it is the last token.
        wordstart = metatext.rindex('Word count:')+12
        wordend = metatext.find(' ',wordstart)
        if wordend == -1:
            wordend = len(metatext)
        self.story.setMetadata('numWords', metatext[wordstart:wordend])

        datesdiv = soup.find('div',{'class':'bottom'})
        dates = stripHTML(datesdiv).split()
        # Token layout: "Published: <date> Updated: <date>"; the dates
        # are parsed with self.dateformat.
        self.story.setMetadata('datePublished', makeDate(dates[1], self.dateformat))
        self.story.setMetadata('dateUpdated', makeDate(dates[3], self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/archive/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
            for a in storyas:
                # skip 'report this' and 'TOC' links
                if 'contact.php' not in a['href'] and 'index' not in a['href']:
                    if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl',series_url)
                        break
                    i+=1
        except Exception:
            # Series information is optional (most stories have no
            # series link at all), so a failure here is not fatal.
            logger.debug("No series information extracted for %s" % self.url)

        # remove 'small' leaving only summary.
        smalldiv.extract()
        self.setDescription(url,metadiv)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Download one chapter page and return its story text.

        url -- address of the chapter page.

        Raises exceptions.FailedToDownload when the text <div> is not
        found.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulSoup(self._fetchUrl(url))

        div = soup.find('div', {'id' : 'story'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,170 @@
# -*- coding: utf-8 -*-
# Copyright 2014 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
import urlparse
import time
from .. import BeautifulSoup as bs
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def _translate_date_german_english(date):
fullmon = {"Januar":"01",
"Februar":"02",
u"März":"03",
"April":"04",
"Mai":"05",
"Juni":"06",
"Juli":"07",
"August":"08",
"September":"09",
"Oktober":"10",
"November":"11",
"Dezember":"12"}
for (name,num) in fullmon.items():
date = date.replace(name,num)
return date
class BdsmGeschichtenAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on bdsm-geschichten.net.

    A story is a chain of pages linked through "next part" links; the
    chain is followed while collecting metadata and the parsed pages
    are cached for getChapterText().
    """

    def __init__(self, config, url):
        """Set up decoding, story id, normalized URL and date format.

        config -- configuration object handed on to BaseSiteAdapter.
        url -- story URL as supplied by the caller; a trailing part
               number is replaced by 1.
        """
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["utf8", "Windows-1252"]

        self.story.setMetadata('siteabbrev','bdsmgesch')

        # Parsed pages by URL, filled by extractChapterUrlsAndMetadata().
        # Created here so getChapterText() also works without it.
        self.soupsCache = {}

        # Replace possible chapter numbering
        url = re.sub(r"-\d+$", "-1", url)

        # set storyId
        self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(url).group('storyId'))

        # normalize URL
        self._setURL('http://%s/%s' % (self.getSiteDomain(), self.story.getMetadata('storyId')))

        # Applied after German month names were replaced by numbers.
        self.dateformat = '%d. %m %Y - %H:%M'

    @staticmethod
    def getSiteDomain():
        """Return the canonical site domain (without www)."""
        return 'bdsm-geschichten.net'

    @classmethod
    def getAcceptDomains(cls):
        """Return the domains accepted for this site, with and without www."""
        return ['www.bdsm-geschichten.net', 'bdsm-geschichten.net']

    @classmethod
    def getSiteExampleURLs(cls):
        """Return space separated examples of accepted story URLs."""
        return "http://www.bdsm-geschichten.net/title-of-story-1 http://bdsm-geschichten.net/title-of-story-1"

    def getSiteURLPattern(self):
        """Return the story URL regular expression (group 'storyId')."""
        return r"http://(www\.)?bdsm-geschichten\.net/(?P<storyId>[a-zA-Z0-9_-]+)"

    @staticmethod
    def _stripComments(soup):
        """Remove every HTML comment node from soup, in place."""
        for comment in soup.findAll(text=lambda text:isinstance(text, bs.Comment)):
            comment.extract()

    def extractChapterUrlsAndMetadata(self):
        """Collect metadata from the first page and follow all parts.

        Fills self.story, self.chapterUrls and self.soupsCache.

        Raises exceptions.AdultCheckRequired unless the user is
        flagged as adult and exceptions.StoryDoesNotExist when a page
        answers with HTTP 404.
        """
        if not (self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(self.url)

        try:
            data1 = self._fetchUrl(self.url)
            soup = bs.BeautifulSoup(data1)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            raise

        #strip comments from soup
        self._stripComments(soup)

        # Cache the soups so we won't have to redownload in getChapterText later
        self.soupsCache = {}
        self.soupsCache[self.url] = soup

        # author
        authorDiv = soup.find("div", "author-pane-line author-name")
        authorId = authorDiv.string.strip()
        self.story.setMetadata('authorId', authorId)
        self.story.setMetadata('author', authorId)
        # TODO not really true need to be loggedin for this to work or fetch userid
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+authorId)

        # TODO better metadata
        date = soup.find("div", {"class": "submitted"}).string.strip()
        date = re.sub(" &#151;.*", "", date)
        date = _translate_date_german_english(date)
        self.story.setMetadata('datePublished', makeDate(date, self.dateformat))

        title1 = soup.find("h1", {'class': 'title'}).string
        storyTitle = re.sub(" Teil .*$", "", title1)
        self.chapterUrls = [(title1, self.url)]
        self.story.setMetadata('title', storyTitle)

        for tagLink in soup.find("ul", "taxonomy").findAll("a"):
            self.story.addToList('category', tagLink.string)

        ## Retrieve chapter soups
        nextLinkDiv = soup.find("div", "field-field-naechster-teil")
        while nextLinkDiv is not None:
            nextLink = 'http://' + self.getSiteDomain() + nextLinkDiv.find("a")['href']
            if nextLink in self.soupsCache:
                # A part linking back to an already fetched part would
                # otherwise keep this loop downloading forever.
                logger.warning("Chapter link loop detected at " + nextLink)
                break

            logger.debug("Grabbing next chapter URL " + nextLink)
            try:
                data2 = self._fetchUrl(nextLink)
            except urllib2.HTTPError as e:
                if e.code == 404:
                    raise exceptions.StoryDoesNotExist(nextLink)
                raise

            soup2 = bs.BeautifulSoup(data2)
            self.soupsCache[nextLink] = soup2
            self._stripComments(soup2)
            nextLinkDiv = soup2.find("div", "field-field-naechster-teil")
            title2 = soup2.find("h1", {'class': 'title'}).string
            self.chapterUrls.append((title2, nextLink))

        self.story.setMetadata('numChapters', len(self.chapterUrls))
        return

    def getChapterText(self, url):
        """Return the story text of the chapter at url.

        The parsed page is taken from self.soupsCache when available
        and downloaded otherwise.
        """
        if url in self.soupsCache:
            logger.debug('Getting chapter <%s> from cache' % url)
            soup = self.soupsCache[url]
        else:
            logger.debug('Downloading chapter <%s>' % url)
            data1 = self._fetchUrl(url)
            soup = bs.BeautifulSoup(data1)

            #strip comments from soup
            self._stripComments(soup)

        # get story text: only the <p> elements of the content div are
        # kept, collected in a fresh <div>.
        storyDiv1 = bs.Tag(soup, "div")
        for para in soup.find("div", "full-node").find('div', 'content').findAll("p"):
            storyDiv1.append(para)
            # NOTE(review): indentation was lost in the listing this was
            # restored from; the break is assumed to follow every
            # paragraph rather than only the last one -- confirm.
            storyDiv1.append('<br />')

        storytext = self.utf8FromSoup(url,storyDiv1)

        return storytext
def getClass():
    """Return the adapter class defined in this module."""
    return BdsmGeschichtenAdapter

View file

@ -0,0 +1,193 @@
from datetime import timedelta
import re
import urllib2
import urlparse
from .. import BeautifulSoup
from ..htmlcleanup import stripHTML
from base_adapter import BaseSiteAdapter, makeDate
from .. import exceptions
def getClass():
    """Return the adapter class defined in this module."""
    return BloodshedverseComAdapter
def _get_query_data(url):
components = urlparse.urlparse(url)
query_data = urlparse.parse_qs(components.query)
return dict((key, data[0]) for key, data in query_data.items())
class BloodshedverseComAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on bloodshedverse.com.

    The chapter list comes from the story's read page, all other
    metadata from the story's entry on the author's page.
    """

    SITE_ABBREVIATION = 'bvc'
    SITE_DOMAIN = 'bloodshedverse.com'

    BASE_URL = 'http://' + SITE_DOMAIN + '/'
    READ_URL_TEMPLATE = BASE_URL + 'stories.php?go=read&no=%s'

    STARTED_DATETIME_FORMAT = '%m/%d/%Y'
    UPDATED_DATETIME_FORMAT = '%m/%d/%Y %I:%M'

    def __init__(self, config, url):
        """Read the story number from url and normalize the story URL.

        config -- configuration object handed on to BaseSiteAdapter.
        url -- story URL as supplied by the caller.
        """
        BaseSiteAdapter.__init__(self, config, url)

        query_data = urlparse.parse_qs(self.parsedUrl.query)
        story_no = query_data['no'][0]

        self.story.setMetadata('storyId', story_no)
        self._setURL(self.READ_URL_TEMPLATE % story_no)
        self.story.setMetadata('siteabbrev', self.SITE_ABBREVIATION)

    def _customized_fetch_url(self, url, exception=None, parameters=None):
        """Fetch url and return it parsed as BeautifulSoup.

        url -- address to fetch.
        exception -- optional exception class; when given, an
                     urllib2.HTTPError is replaced by
                     exception(self.url).
        parameters -- passed on to self._fetchUrl().
        """
        if exception:
            try:
                data = self._fetchUrl(url, parameters)
            except urllib2.HTTPError:
                raise exception(self.url)
        # Just let self._fetchUrl throw the exception, don't catch and
        # customize it.
        else:
            data = self._fetchUrl(url, parameters)

        return BeautifulSoup.BeautifulSoup(data)

    @staticmethod
    def getSiteDomain():
        """Return the site domain."""
        return BloodshedverseComAdapter.SITE_DOMAIN

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example of a story URL accepted by this adapter."""
        return cls.READ_URL_TEMPLATE % 1234

    def getSiteURLPattern(self):
        """Return the regular expression a story URL has to match."""
        return re.escape(self.BASE_URL + 'stories.php?go=') + r'(read|chapters)\&no=\d+$'

    # Override stripURLParameters so the "no" parameter won't get stripped
    @classmethod
    def stripURLParameters(cls, url):
        """Return url unchanged."""
        return url

    def extractChapterUrlsAndMetadata(self):
        """Collect the chapter list and the story metadata.

        Raises exceptions.StoryDoesNotExist when the read page carries
        no story, exceptions.FailedToDownload when the story is not
        listed on the author's page and exceptions.AdultCheckRequired
        for NC-17 stories unless the user is flagged as adult.
        """
        soup = self._customized_fetch_url(self.url)

        # Since no 404 error code we have to raise the exception ourselves.
        # A title that is just 'by' indicates that there is no author name
        # and no story title available.
        if stripHTML(soup.title) == 'by':
            raise exceptions.StoryDoesNotExist(self.url)

        # Ask for the <option> tags explicitly: iterating the <select>
        # itself also yields the text nodes between them, which have
        # no 'value'.
        for option in soup.find('select', {'name': 'chapter'}).findAll('option'):
            title = stripHTML(option)
            url = self.READ_URL_TEMPLATE % option['value']
            self.chapterUrls.append((title, url))

        # Get the URL to the author's page and find the correct story entry to
        # scrape the metadata
        author_url = urlparse.urljoin(self.url, soup.find('a', {'class': 'headline'})['href'])
        soup = self._customized_fetch_url(author_url)

        story_no = self.story.getMetadata('storyId')
        # Ignore first list_box div, it only contains the author information
        for list_box in soup('div', {'class': 'list_box'})[1:]:
            url = list_box.find('a', {'class': 'fictitle'})['href']
            query_data = _get_query_data(url)

            # Found the div containing the story's metadata; break the loop and
            # parse the element
            if query_data['no'] == story_no:
                break
        else:
            raise exceptions.FailedToDownload(self.url)

        title_anchor = list_box.find('a', {'class': 'fictitle'})
        self.story.setMetadata('title', stripHTML(title_anchor))

        author_anchor = title_anchor.findNextSibling('a')
        self.story.setMetadata('author', stripHTML(author_anchor))
        self.story.setMetadata('authorId', _get_query_data(author_anchor['href'])['who'])
        self.story.setMetadata('authorUrl', urlparse.urljoin(self.url, author_anchor['href']))

        list_review = list_box.find('div', {'class': 'list_review'})
        reviews = stripHTML(list_review.a).split(' ', 1)[0]
        self.story.setMetadata('reviews', reviews)

        summary_div = list_box.find('div', {'class': 'list_summary'})
        if not self.getConfig('keep_summary_html'):
            summary = ''.join(summary_div(text=True))
        else:
            summary = self.utf8FromSoup(author_url, summary_div)

        self.story.setMetadata('description', summary)

        # I'm assuming this to be the category, not sure what else it could be
        first_listinfo = list_box.find('div', {'class': 'list_info'})
        self.story.addToList('category', stripHTML(first_listinfo.a))

        for list_info in first_listinfo.findNextSiblings('div', {'class': 'list_info'}):
            for b_tag in list_info('b'):
                key = b_tag.string.strip(': ')
                # Strip colons from the beginning, superfluous spaces and minus
                # characters from the end, and possibly trailing commas from
                # the warnings if only one is present
                value = b_tag.nextSibling.string.strip(': -,')

                if key == 'Genre':
                    for genre in value.split(', '):
                        # Ignore the "none" genre
                        if genre != 'none':
                            self.story.addToList('genre', genre)

                elif key == 'Rating':
                    self.story.setMetadata('rating', value)

                elif key == 'Complete':
                    self.story.setMetadata('status', 'Completed' if value == 'Yes' else 'In-Progress')

                elif key == 'Warning':
                    for warning in value.split(', '):
                        # The string here starts with ", " before the actual list
                        # of values sometimes, so check for an empty warning
                        # and ignore the "none" warning.
                        if not warning or warning == 'none':
                            continue

                        self.story.addToList('warnings', warning)

                elif key == 'Chapters':
                    self.story.setMetadata('numChapters', int(value))

                elif key == 'Words':
                    # Apparently only numChapters need to be an integer for
                    # some strange reason. Remove possible ',' characters as to
                    # not confuse the codebase down the line
                    self.story.setMetadata('numWords', value.replace(',', ''))

                elif key == 'Started':
                    self.story.setMetadata('datePublished', makeDate(value, self.STARTED_DATETIME_FORMAT))

                elif key == 'Updated':
                    date_string, period = value.rsplit(' ', 1)
                    date = makeDate(date_string, self.UPDATED_DATETIME_FORMAT)

                    # Rather ugly hack to work around Calibre's changing of
                    # Python's locale setting, causing am/pm to not be properly
                    # parsed by strptime() when using a non-english locale
                    if period == 'pm':
                        date += timedelta(hours=12)
                    self.story.setMetadata('dateUpdated', date)

        if self.story.getMetadata('rating') == 'NC-17' and not (self.is_adult or self.getConfig('is_adult')):
            raise exceptions.AdultCheckRequired(self.url)

    def getChapterText(self, url):
        """Download one chapter page and return its story text.

        url -- address of the chapter page.

        With the 'strip_text_links' option set, anchors of class
        FAtxtL inside the text are replaced by their plain text.

        Raises exceptions.FailedToDownload when the text <div> is not
        found.
        """
        soup = self._customized_fetch_url(url)
        storytext_div = soup.find('div', {'class': 'storytext'})

        if storytext_div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        if self.getConfig('strip_text_links'):
            for anchor in storytext_div('a', {'class': 'FAtxtL'}):
                navigable_string = BeautifulSoup.NavigableString(anchor.string)
                anchor.replaceWith(navigable_string)

        return self.utf8FromSoup(url, storytext_div)

View file

@ -0,0 +1,336 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
# By virtue of being recent and requiring both is_adult and user/pass,
# adapter_fanficcastletvnet.py is the best choice for learning to
# write adapters--especially for sites that use the eFiction system.
# Most sites that have ".../viewstory.php?sid=123" in the story URL
# are eFiction.
# For non-eFiction sites, it can be considerably more complex, but
# this is still a good starting point.
# In general an 'adapter' needs to do these six things:
# - 'Register' correctly with the downloader
# - Site Login (if needed)
# - 'Are you adult?' check (if needed--some do one, some the other, some both)
# - Grab the chapter list
# - Grab the story meta-data (some (non-eFiction) adapters have to get it from the author page)
# - Grab the chapter texts
# Search for XXX comments--that's where things are most likely to need changing.
# This function is called by the downloader in all adapter_*.py files
# in this dir to register the adapter class. So it needs to be
# updated to reflect the class below it. That, plus getSiteDomain()
# take care of 'Registering'.
def getClass():
    """Return the adapter class defined in this module."""
    return BloodTiesFansComAdapter # XXX
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class BloodTiesFansComAdapter(BaseSiteAdapter): # XXX
def __init__(self, config, url):
    """Set up decoding, story id, normalized URL and date format.

    config -- configuration object handed on to BaseSiteAdapter.
    url -- story URL as supplied by the caller.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really
    # windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.is_adult = False

    # get storyId from url--url validation guarantees query is only sid=1234
    sid_parts = self.parsedUrl.query.split('=')
    self.story.setMetadata('storyId', sid_parts[1])

    # normalized story URL.
    # XXX Most sites don't have the /fiction part.  Replace all to remove it usually.
    normalized = 'http://%s/fiction/viewstory.php?sid=%s' % (
        self.getSiteDomain(), self.story.getMetadata('storyId'))
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'btf') # XXX

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%b %d, %Y" # XXX
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'bloodties-fans.com' # XXX
@classmethod
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/fiction/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/fiction/viewstory.php?sid=")+r"\d+$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
    """Return True when the page text in data asks for a (new) login."""
    login_markers = (
        'Registered Users Only',
        'There is no such account on our website',
        "That password doesn't match the one in our database",
    )
    for marker in login_markers:
        if marker in data:
            return True
    return False
def performLogin(self, url):
    """Post the login form of the site.

    Credentials set on the adapter are used when self.password is
    non-empty, otherwise the configured username/password.

    url -- story URL, only used for the error report.

    Returns True on success.
    Raises exceptions.FailedToLogin when the response does not contain
    "Member Account".
    """
    params = {}

    if self.password:
        params['penname'] = self.username
        params['password'] = self.password
    else:
        params['penname'] = self.getConfig("username")
        params['password'] = self.getConfig("password")
    params['cookiecheck'] = '1'
    params['submit'] = 'Submit'

    loginUrl = 'http://' + self.getSiteDomain() + '/fiction/user.php?action=login'
    logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                          params['penname']))

    d = self._fetchUrl(loginUrl, params)

    if "Member Account" not in d:
        logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                          params['penname']))
        # The original had an unreachable 'return False' after this
        # raise; failure is only ever reported through the exception.
        raise exceptions.FailedToLogin(url,params['penname'])
    return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
    """Fetch the story index page and fill in the chapter list and metadata.

    Appends (title, url) tuples to self.chapterUrls and records title,
    author, rating, word count, categories, characters, genre, warnings,
    status, dates and (best effort) series information on self.story.

    Raises:
        exceptions.StoryDoesNotExist -- the story page returned HTTP 404.
        exceptions.AdultCheckRequired -- the page carries an age-warning
            'continue' link and neither self.is_adult nor the is_adult
            config value is set.
        exceptions.FailedToDownload -- the site says the story has not
            been validated.
    """
    def fetchStoryPage(pageurl):
        # A 404 means the story does not exist; any other HTTP error
        # propagates unchanged (bare raise keeps the traceback).
        try:
            return self._fetchUrl(pageurl)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            raise

    # utility method
    def defaultGetattr(d,k):
        # d[k] when d is a tag carrying attribute k; "" for text nodes,
        # None, or tags without that attribute.
        try:
            return d[k]
        except (KeyError, TypeError):
            return ""

    is_adult = self.is_adult or self.getConfig("is_adult")
    if is_adult:
        # Weirdly, different sites use different warning numbers.
        # If the title search below fails, there's a good chance
        # you need a different number.  print data at that point
        # and see what the 'click here to continue' url says.
        # Furthermore, there's a couple sites now with more than
        # one warning level for different ratings.  And they're
        # fussy about it.  midnightwhispers has three: 4, 2 & 1.
        # We try 4 first; if the site still shows a warning, the number
        # from its 'continue' link is used for a second fetch below.
        addurl = "&ageconsent=ok&warning=4" # XXX
    else:
        addurl = ""

    # index=1 makes sure we see the story chapter index.  Some
    # sites skip that for one-chapter stories.
    url = self.url+'&index=1'+addurl
    logger.debug("URL: "+url)
    data = fetchStoryPage(url)

    # The actual text that is used to announce you need to be an
    # adult varies from site to site, and can change by warning level,
    # so look for the warning pass url instead, e.g.
    #   viewstory.php?sid=561&amp;warning=4
    #   viewstory.php?sid=561&amp;warning=1
    #   viewstory.php?sid=561&amp;warning=2
    m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
    if m is not None:
        if not is_adult:
            raise exceptions.AdultCheckRequired(self.url)
        # We tried the default and still got a warning, so
        # let's pull the warning number from the 'continue'
        # link (undoing the &amp; entity) and reload data.
        addurl = m.group(1).replace("&amp;","&")
        url = self.url+'&index=1'+addurl
        logger.debug("URL 2nd try: "+url)
        data = fetchStoryPage(url)

    if "Access denied. This story has not been validated by the adminstrators of this site." in data:
        raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = bs.BeautifulSoup(data)

    # Now go hunting for all the meta data and the chapter list.

    ## Title
    a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
    self.story.setMetadata('title',stripHTML(a))

    # Find authorid and URL from... author url.
    a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
    self.story.setMetadata('authorId',a['href'].split('=')[1])
    self.story.setMetadata('authorUrl','http://'+self.host+'/fiction/'+a['href'])
    self.story.setMetadata('author',a.string)

    # Find the chapters:
    for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
        # just in case there's tags, like <i> in chapter titles.
        self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/fiction/'+chapter['href']+addurl))

    self.story.setMetadata('numChapters',len(self.chapterUrls))

    # eFiction sites don't help us out a lot with their meta data
    # formating, so it's a little ugly.

    # <span class="label">Rated:</span> NC-17<br /> etc
    labels = soup.findAll('span',{'class':'label'})
    for labelspan in labels:
        value = labelspan.nextSibling
        label = labelspan.string
        if label is None:
            # The label span wraps other tags, so there is no plain
            # text to match against.
            continue

        if 'Summary' in label:
            ## Everything until the next span class='label', or the end
            ## of the sibling chain if no further label follows.
            svalue = ""
            while value is not None and defaultGetattr(value,'class') != 'label':
                svalue += str(value)
                value = value.nextSibling
            self.setDescription(url,svalue)

        if 'Rated' in label:
            self.story.setMetadata('rating', value)

        if 'Word count' in label:
            self.story.setMetadata('numWords', value)

        if 'Categories' in label:
            cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
            catstext = [cat.string for cat in cats]
            for cat in catstext:
                self.story.addToList('category',cat.string)

        if 'Characters' in label:
            chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
            charstext = [char.string for char in chars]
            for char in charstext:
                self.story.addToList('characters',char.string)

        ## Not all sites use Genre, but there's no harm to
        ## leaving it in.  Check to make sure the type_id number
        ## is correct, though--it's site specific.
        if 'Genre' in label:
            genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
            genrestext = [genre.string for genre in genres]
            self.genre = ', '.join(genrestext)
            for genre in genrestext:
                self.story.addToList('genre',genre.string)

        ## Not all sites use Warnings, but there's no harm to
        ## leaving it in.  Check to make sure the type_id number
        ## is correct, though--it's site specific.
        if 'Warnings' in label:
            warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
            warningstext = [warning.string for warning in warnings]
            self.warning = ', '.join(warningstext)
            for warning in warningstext:
                self.story.addToList('warnings',warning.string)

        if 'Completed' in label:
            if 'Yes' in value:
                self.story.setMetadata('status', 'Completed')
            else:
                self.story.setMetadata('status', 'In-Progress')

        if 'Published' in label:
            self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

        if 'Updated' in label:
            self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

    # Series lookup is best effort: a failure here must not abort the
    # download, so it is logged and otherwise ignored.
    try:
        # Find Series name from series URL.
        a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        if a is not None:
            series_name = a.string
            series_url = 'http://'+self.host+'/fiction/'+a['href']

            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            # The story's 1-based position among the series' story links
            # is used as its index in the series.
            for i, storya in enumerate(storyas, 1):
                if storya['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl',series_url)
                    break
    except Exception as e:
        logger.debug("Series parsing failed, ignoring: %s", e)
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Fetch one chapter page and return self.utf8FromSoup() of its <div id="story">.

    Raises exceptions.FailedToDownload when the page has no such div.
    """
    logger.debug('Getting chapter text from: %s' % url)

    page = self._fetchUrl(url)
    # br and hr are declared self-closing, otherwise soup eats them.
    soup = bs.BeautifulStoneSoup(page, selfClosingTags=('br','hr'))

    storydiv = soup.find('div', {'id' : 'story'})
    if storydiv is not None:
        return self.utf8FromSoup(url,storydiv)
    raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

View file

@ -0,0 +1,291 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
import cookielib as cl
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
# This function is called by the downloader in all adapter_*.py files
# in this dir to register the adapter class. So it needs to be
# updated to reflect the class below it. That, plus getSiteDomain()
# take care of 'Registering'.
def getClass():
    """Return the adapter class of this module, for registration by the downloader."""
    adapter_class = BuffyNFaithNetAdapter
    return adapter_class
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class BuffyNFaithNetAdapter(BaseSiteAdapter):
def __init__(self, config, url):
    """Set up headers, decoding order, default credentials and the normalized story URL.

    Raises exceptions.InvalidStoryURL when url does not match
    getSiteURLPattern().
    """
    BaseSiteAdapter.__init__(self, config, url)
    self.setHeader()

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult = False

    # get storyId from url--url validation guarantees query correct
    matched = re.match(self.getSiteURLPattern(),url)
    if not matched:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId',matched.group('id'))

    # normalized story URL. gets rid of chapter if there, left with ch 1 URL on this site
    normalized = "http://"+self.getSiteDomain()+"/fanfictions/index.php?act=vie&id="+self.story.getMetadata('storyId')
    self._setURL(normalized)
    # _setURL mangles the ampersands needed in metadata['storyUrl'],
    # so the value is set again here with entity removal turned off.
    self.story.setMetadata('storyUrl',normalized,condremoveentities=False)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev','bnfnet')
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'buffynfaith.net'
@classmethod
def stripURLParameters(cls,url):
"Only needs to be overriden if URL contains more than one parameter"
## This adapter needs at least two parameters left on the URL, act and id
return re.sub(r"(\?act=(vie|ovr)&id=\d+)&.*$",r"\1",url)
def setHeader(self):
    """Append a Referer header for the site root to self.opener.

    buffynfaith.net wants a Referer for images.  Called from __init__ and
    again after the opener is rebuilt with a cookie processor.
    """
    referer = 'http://'+self.getSiteDomain()+'/'
    self.opener.addheaders.append(('Referer', referer))
@classmethod
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/fanfictions/index.php?act=vie&id=1234 http://buffynfaith.net/fanfictions/index.php?act=ovr&id=1234 http://buffynfaith.net/fanfictions/index.php?act=vie&id=1234&ch=2"
def getSiteURLPattern(self):
    """Return a regex for this site's story URLs with named groups 'id' and (optional) 'ch'.

    Accepted forms:
      http://buffynfaith.net/fanfictions/index.php?act=vie&id=963
      http://buffynfaith.net/fanfictions/index.php?act=ovr&id=949
      http://buffynfaith.net/fanfictions/index.php?act=vie&id=949&ch=2
    """
    literal_prefix = re.escape("http://"+self.getSiteDomain()+"/fanfictions/index.php?act=")
    query_part = r"(vie|ovr)&id=(?P<id>\d+)(&ch=(?P<ch>\d+))?$"
    return literal_prefix+query_part
def extractChapterUrlsAndMetadata(self):
    """Scrape the chapter list and metadata from the story page and the author's page.

    The story page supplies description, site category, title, author,
    rating, dates, genres and the chapter selector; the author's page
    supplies warnings, ships, characters, status, word count and the
    review count.  Results are stored on self.story and self.chapterUrls.

    Raises:
        exceptions.StoryDoesNotExist -- the story page returned HTTP 404.
        exceptions.AdultCheckRequired -- the page shows
            "ADULT CONTENT WARNING".
    """
    dateformat = "%d %B %Y"

    def parseSiteDate(text):
        # Drop the 'st', 'nd', 'rd', 'th' after the day number, whether
        # the day has one digit or two, then parse with dateformat.
        cleaned = re.sub(r'^(\d+)(?:st|nd|rd|th)', r'\1', text.strip())
        return makeDate(cleaned, dateformat)

    url = self.url
    logger.debug("URL: "+url)

    #set a cookie to get past adult check
    if self.is_adult or self.getConfig("is_adult"):
        cookieproc = urllib2.HTTPCookieProcessor()
        cookie = cl.Cookie(version=0, name='my_age', value='yes',
                           port=None, port_specified=False,
                           domain=self.getSiteDomain(), domain_specified=False, domain_initial_dot=False,
                           path='/', path_specified=True,
                           secure=False,
                           expires=time.time()+10000,
                           discard=False,
                           comment=None,
                           comment_url=None,
                           rest={'HttpOnly': None},
                           rfc2109=False)
        cookieproc.cookiejar.set_cookie(cookie)
        self.opener = urllib2.build_opener(cookieproc)
        # The new opener has lost the Referer header; add it back.
        self.setHeader()

    try:
        data = self._fetchUrl(url)
    except urllib2.HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        raise

    if "ADULT CONTENT WARNING" in data:
        raise exceptions.AdultCheckRequired(self.url)

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = bs.BeautifulSoup(data)

    # Now go hunting for all the meta data and the chapter list.

    #stuff in <head>: description
    svalue = soup.head.find('meta',attrs={'name':'description'})['content']
    self.setDescription(url,svalue)

    #useful stuff in rest of doc, all contained in this:
    doc = soup.body.find('div', id='my_wrapper')

    #first the site category (more of a genre to me, meh) and title, in this element:
    mt = doc.find('div',attrs={'class':'maintitle'})
    sitecat = mt.findAll('a')[1]
    self.story.addToList('genre',sitecat.string)
    # The title is the text after the second link, minus its leading separator.
    self.story.setMetadata('title',sitecat.nextSibling[len('&nbsp;&raquo;&nbsp;'):])

    #the actual category ('Buffy: The Vampire Slayer') is not set here;
    #it is better to set it in plugin-defaults.ini and defaults.ini

    #then a block that sits in a table cell like so:
    #(contains a lot of metadata)
    mblock = doc.find('td', align='left', width = '70%').contents
    while mblock:
        i = mblock.pop(0)
        # Nodes without a single text child have .string None; treat
        # them as empty labels instead of crashing on the 'in' tests.
        label = i.string or u''

        if 'Author:' in label:
            #drop empty space
            mblock.pop(0)
            #get author link
            a = mblock.pop(0)
            authre = re.escape('./index.php?act=bio&id=')+r'(?P<authid>\d+)'
            m = re.match(authre,a['href'])
            self.story.setMetadata('author',a.string)
            self.story.setMetadata('authorId',m.group('authid'))
            authurl = u'http://%s/fanfictions/index.php?act=bio&id=%s' % ( self.getSiteDomain(),
                                                                            self.story.getMetadata('authorId'))
            self.story.setMetadata('authorUrl',authurl,condremoveentities=False)
            #drop empty space
            mblock.pop(0)

        if 'Rating:' in label:
            self.story.setMetadata('rating',mblock.pop(0).strip())

        if 'Published:' in label:
            self.story.setMetadata('datePublished',parseSiteDate(mblock.pop(0)))

        if 'Last Updated:' in label:
            self.story.setMetadata('dateUpdated',parseSiteDate(mblock.pop(0)))

        if 'Genre:' in label:
            for genre in mblock.pop(0).strip().split('/'):
                self.story.addToList('genre',genre)

    # Find the chapter selector
    select = soup.find('select', { 'name' : 'ch' } )

    if select is None:
        # no selector found, so it's a one-chapter story.
        self.chapterUrls.append((self.story.getMetadata('title'),url))
    else:
        for o in select.findAll('option'):
            chapter_url = u'http://%s/fanfictions/index.php?act=vie&id=%s&ch=%s' % ( self.getSiteDomain(),
                                                                                      self.story.getMetadata('storyId'),
                                                                                      o['value'])
            title = stripHTML(u"%s" % o)
            # Put '. ' after the first word of the option text; text
            # without a space is kept as it is.
            ts = title.split(' ',1)
            if len(ts) == 2:
                title = ts[0]+'. '+ts[1]
            self.chapterUrls.append((title,chapter_url))

    self.story.setMetadata('numChapters',len(self.chapterUrls))

    ## Go scrape the rest of the metadata from the author's page.
    data = self._fetchUrl(self.story.getMetadata('authorUrl'))
    soup = bs.BeautifulSoup(data)

    #find the story link and its parent div
    storya = soup.find('a',{'href':self.story.getMetadata('storyUrl')})
    storydiv = storya.parent

    #warnings come under a <spawn> tag. Never seen that before...
    #appears to just be a line of freeform text, not necessarily a list
    #optional
    spawn = storydiv.find('spawn',{'id':'warnings'})
    if spawn is not None:
        warns = spawn.nextSibling.strip()
        self.story.addToList('warnings',warns)

    #some meta in spans - this should get all, even the ones jammed in a table
    for s in storydiv.findAll('span'):
        if s.string == 'Ship:':
            ships = s.nextSibling.strip().split()
            self.story.extendList('ships',ships)
        if s.string == 'Characters:':
            characters = s.nextSibling.strip().split(',')
            self.story.extendList('characters',characters)
        if s.string == 'Status:':
            self.story.setMetadata('status',s.nextSibling.strip())
        if s.string == 'Words:':
            self.story.setMetadata('numWords',s.nextSibling.strip())

    #reviews: the text after the link with the surrounding characters
    #removed, or '0' when there is no such link.
    a = storydiv.find('a',{'id':'bold-blue'})
    if a is not None:
        revs = a.nextSibling.strip()[1:-1]
    else:
        revs = '0'
    # The previous code stored the status/word-count local here instead
    # of the review count.
    self.story.setMetadata('reviews',revs)
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Fetch one chapter page and return self.utf8FromSoup() of its cleaned <div id="fanfiction">.

    Bookmark boxes and the review link's parent are removed, and a sequel
    link (if any) is rewritten from a relative to an absolute URL.

    Raises exceptions.FailedToDownload when the page has no such div.
    """
    logger.debug('Getting chapter text from: %s' % url)

    page = self._fetchUrl(url)
    # br and hr are declared self-closing, otherwise soup eats them.
    soup = bs.BeautifulSoup(page, selfClosingTags=('br','hr'))

    storydiv = soup.find('div', {'id' : 'fanfiction'})
    if storydiv is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

    #remove all the unnecessary bookmark tags
    for bookmark in storydiv.findAll('div',{'class':"tiny_box2"}):
        bookmark.extract()

    #is there a review link? if so remove it and its parent div
    review_link = storydiv.find('a',href=re.compile(re.escape("./index.php?act=irv")+".*$"))
    if review_link is not None:
        review_link.parent.extract()

    #There might also be a link to the sequel on the last chapter.
    #It is kept, but its URL is changed from relative to absolute.
    #(No proper series metadata was found on the site.)
    sequel_link = storydiv.find('a',href=re.compile(re.escape("./index.php?act=ovr")+".*$"))
    if sequel_link is not None:
        sequel_link['href'] = 'http://'+self.getSiteDomain()+'/fanfictions'+sequel_link['href'][1:]

    return self.utf8FromSoup(url,storydiv)

View file

@ -0,0 +1,309 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
# By virtue of being recent and requiring both is_adult and user/pass,
# adapter_fanficcastletvnet.py is the best choice for learning to
# write adapters--especially for sites that use the eFiction system.
# Most sites that have ".../viewstory.php?sid=123" in the story URL
# are eFiction.
# For non-eFiction sites, it can be considerably more complex, but
# this is still a good starting point.
# In general an 'adapter' needs to do these six things:
# - 'Register' correctly with the downloader
# - Site Login (if needed)
# - 'Are you adult?' check (if needed--some do one, some the other, some both)
# - Grab the chapter list
# - Grab the story meta-data (some (non-eFiction) adapters have to get it from the author page)
# - Grab the chapter texts
# Search for XXX comments--that's where things are most likely to need changing.
# This function is called by the downloader in all adapter_*.py files
# in this dir to register the adapter class. So it needs to be
# updated to reflect the class below it. That, plus getSiteDomain()
# take care of 'Registering'.
def getClass():
    """Return the adapter class of this module, for registration by the downloader."""
    adapter_class = CastleFansOrgAdapter # XXX
    return adapter_class
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class CastleFansOrgAdapter(BaseSiteAdapter): # XXX
def __init__(self, config, url):
    """Set up decoding order, default credentials, story id, normalized URL, site abbreviation and date format."""
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult = False

    # url validation guarantees the query is only sid=1234, so the story
    # id is whatever follows the '='.
    story_id = self.parsedUrl.query.split('=',)[1]
    self.story.setMetadata('storyId',story_id)

    # normalized story URL.
    # XXX Most sites don't have the /fanfic part.  Replace all to remove it usually.
    normalized = 'http://' + self.getSiteDomain() + '/fanfic/viewstory.php?sid='+self.story.getMetadata('storyId')
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev','cslf') # XXX

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%b %d, %Y" # XXX
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'castlefans.org' # XXX
@classmethod
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/fanfic/viewstory.php?sid=1234"
def getSiteURLPattern(self):
    """Return a regex matching this site's story URLs (viewstory.php with a numeric sid)."""
    literal_prefix = "http://"+self.getSiteDomain()+"/fanfic/viewstory.php?sid="
    story_id_digits = r"\d+$"
    return re.escape(literal_prefix)+story_id_digits
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
    """Return True when the page text contains one of the login-related messages.

    data -- the text of a fetched page.
    """
    login_markers = (
        'Registered Users Only',
        'There is no such account on our website',
        "That password doesn't match the one in our database",
    )
    for marker in login_markers:
        if marker in data:
            return True
    return False
def performLogin(self, url):
    """Submit the site's login form through self._fetchUrl.

    Credentials are self.username/self.password when a password has been
    set on the adapter, otherwise the "username"/"password" config values.

    url -- the story URL being fetched; only used in the FailedToLogin error.

    Returns True on success.
    Raises exceptions.FailedToLogin when the response does not contain
    "Member Account".
    """
    params = {}
    if self.password:
        params['penname'] = self.username
        params['password'] = self.password
    else:
        params['penname'] = self.getConfig("username")
        params['password'] = self.getConfig("password")
    params['cookiecheck'] = '1'
    params['submit'] = 'Submit'

    loginUrl = 'http://' + self.getSiteDomain() + '/fanfic/user.php?action=login'
    logger.debug("Will now login to URL (%s) as (%s)", loginUrl,
                 params['penname'])

    d = self._fetchUrl(loginUrl, params)

    if "Member Account" not in d : #Member Account
        logger.info("Failed to login to URL %s as %s", loginUrl,
                    params['penname'])
        # Failure is always reported by raising; the old 'return False'
        # after this raise could never run and has been dropped.
        raise exceptions.FailedToLogin(url,params['penname'])
    return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
    """Fetch the story index page and fill in the chapter list and metadata.

    Logs in first when the page asks for it.  Appends (title, url) tuples
    to self.chapterUrls and records title, author, rating, word count,
    categories, characters, genre, warnings, status, dates and (best
    effort) series information on self.story.

    Raises:
        exceptions.StoryDoesNotExist -- the story page returned HTTP 404.
        exceptions.AdultCheckRequired -- the page shows
            "Age Consent Required".
        exceptions.FailedToDownload -- the site says the story has not
            been validated.
        exceptions.FailedToLogin -- propagated from performLogin().
    """
    # utility method
    def defaultGetattr(d,k):
        # d[k] when d is a tag carrying attribute k; "" for text nodes,
        # None, or tags without that attribute.
        try:
            return d[k]
        except (KeyError, TypeError):
            return ""

    if self.is_adult or self.getConfig("is_adult"):
        # Weirdly, different sites use different warning numbers.
        # If the title search below fails, there's a good chance
        # you need a different number.  print data at that point
        # and see what the 'click here to continue' url says.
        addurl = "&ageconsent=ok&warning=4" # XXX
    else:
        addurl = ""

    # index=1 makes sure we see the story chapter index.  Some
    # sites skip that for one-chapter stories.
    url = self.url+'&index=1'+addurl
    logger.debug("URL: "+url)

    try:
        data = self._fetchUrl(url)
    except urllib2.HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        # Any other HTTP error propagates with its original traceback.
        raise

    if self.needToLoginCheck(data):
        # need to log in for this one.
        self.performLogin(url)
        data = self._fetchUrl(url)

    # The actual text that is used to announce you need to be an
    # adult varies from site to site.  Again, print data before
    # the title search to troubleshoot.
    if "Age Consent Required" in data: # XXX
        raise exceptions.AdultCheckRequired(self.url)

    if "Access denied. This story has not been validated by the adminstrators of this site." in data:
        raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = bs.BeautifulSoup(data)

    # Now go hunting for all the meta data and the chapter list.
    pagetitle = soup.find('div',{'id':'pagetitle'})

    ## Title
    a = pagetitle.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
    self.story.setMetadata('title',stripHTML(a))

    # Find authorid and URL from... author url.
    a = pagetitle.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
    self.story.setMetadata('authorId',a['href'].split('=')[1])
    self.story.setMetadata('authorUrl','http://'+self.host+'/fanfic/'+a['href'])
    self.story.setMetadata('author',a.string)

    # Find the chapters:
    for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
        # just in case there's tags, like <i> in chapter titles.
        self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/fanfic/'+chapter['href']+addurl))

    self.story.setMetadata('numChapters',len(self.chapterUrls))

    # eFiction sites don't help us out a lot with their meta data
    # formating, so it's a little ugly.

    # <span class="label">Rated:</span> NC-17<br /> etc
    labels = soup.findAll('span',{'class':'label'})
    for labelspan in labels:
        value = labelspan.nextSibling
        label = labelspan.string
        if label is None:
            # The label span wraps other tags, so there is no plain
            # text to match against.
            continue

        if 'Summary' in label:
            ## Everything until the next span class='label', or the end
            ## of the sibling chain if no further label follows.
            svalue = ""
            while value is not None and defaultGetattr(value,'class') != 'label':
                svalue += str(value)
                value = value.nextSibling
            self.setDescription(url,svalue)

        if 'Rated' in label:
            self.story.setMetadata('rating', value)

        if 'Word count' in label:
            self.story.setMetadata('numWords', value)

        if 'Categories' in label:
            cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
            catstext = [cat.string for cat in cats]
            for cat in catstext:
                self.story.addToList('category',cat.string)

        if 'Characters' in label:
            chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
            charstext = [char.string for char in chars]
            for char in charstext:
                self.story.addToList('characters',char.string)

        ## Not all sites use Genre, but there's no harm to
        ## leaving it in.  Check to make sure the type_id number
        ## is correct, though--it's site specific.
        if 'Genre' in label:
            genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
            genrestext = [genre.string for genre in genres]
            self.genre = ', '.join(genrestext)
            for genre in genrestext:
                self.story.addToList('genre',genre.string)

        ## Not all sites use Warnings, but there's no harm to
        ## leaving it in.  Check to make sure the type_id number
        ## is correct, though--it's site specific.
        if 'Warnings' in label:
            warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
            warningstext = [warning.string for warning in warnings]
            self.warning = ', '.join(warningstext)
            for warning in warningstext:
                self.story.addToList('warnings',warning.string)

        if 'Completed' in label:
            if 'Yes' in value:
                self.story.setMetadata('status', 'Completed')
            else:
                self.story.setMetadata('status', 'In-Progress')

        if 'Published' in label:
            self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

        if 'Updated' in label:
            self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

    # Series lookup is best effort: a failure here must not abort the
    # download, so it is logged and otherwise ignored.
    try:
        # Find Series name from series URL.
        a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        if a is not None:
            series_name = a.string
            series_url = 'http://'+self.host+'/fanfic/'+a['href']

            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            # The story's 1-based position among the series' story links
            # is used as its index in the series.
            for i, storya in enumerate(storyas, 1):
                if storya['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl',series_url)
                    break
    except Exception as e:
        logger.debug("Series parsing failed, ignoring: %s", e)
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Fetch one chapter page and return self.utf8FromSoup() of its <div id="story">.

    Raises exceptions.FailedToDownload when the page has no such div.
    """
    logger.debug('Getting chapter text from: %s' % url)

    page = self._fetchUrl(url)
    # br and hr are declared self-closing, otherwise soup eats them.
    soup = bs.BeautifulStoneSoup(page, selfClosingTags=('br','hr'))

    storydiv = soup.find('div', {'id' : 'story'})
    if storydiv is not None:
        return self.utf8FromSoup(url,storydiv)
    raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

View file

@ -0,0 +1,237 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class defined in this module."""
    adapter_class = ChaosSycophantHexComAdapter
    return adapter_class
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class ChaosSycophantHexComAdapter(BaseSiteAdapter):
def __init__(self, config, url):
    """Set up decoding order, default credentials, story id, normalized URL, site abbreviation and date format."""
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult = False

    # url validation guarantees the query is only sid=1234, so the story
    # id is whatever follows the '='.
    story_id = self.parsedUrl.query.split('=',)[1]
    self.story.setMetadata('storyId',story_id)

    # normalized story URL.
    normalized = 'http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId')
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev','csph')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%m/%d/%Y"
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'chaos.sycophanthex.com'
@classmethod
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234"
def getSiteURLPattern(self):
    """Return a regex matching this site's story URLs (viewstory.php with a numeric sid)."""
    literal_prefix = "http://"+self.getSiteDomain()+"/viewstory.php?sid="
    story_id_digits = r"\d+$"
    return re.escape(literal_prefix)+story_id_digits
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):
# Weirdly, different sites use different warning numbers.
# If the title search below fails, there's a good chance
# you need a different number. print data at that point
# and see what the 'click here to continue' url says.
addurl = "&ageconsent=ok&warning=19"
else:
addurl=""
# index=1 makes sure we see the story chapter index. Some
# sites skip that for one-chapter stories.
url = self.url+'&index=1'+addurl
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
# The actual text that is used to announce you need to be an
# adult varies from site to site. Again, print data before
# the title search to troubleshoot.
if "Age Consent Required" in data:
raise exceptions.AdultCheckRequired(self.url)
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# print data
# Now go hunting for all the meta data and the chapter list.
## Title
pt = soup.find('div', {'id' : 'pagetitle'})
a = pt.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
self.story.setMetadata('title',stripHTML(a))
# Find authorid and URL from... author url.
a = pt.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
rating=pt.text.split('(')[1].split(')')[0]
self.story.setMetadata('rating', rating)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
# eFiction sites don't help us out a lot with their meta data
# formating, so it's a little ugly.
# utility method
def defaultGetattr(d,k):
try:
return d[k]
except:
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.findAll('span',{'class':'label'})
value = labels[0].previousSibling
svalue = ""
while value != None:
val = value
value = value.previousSibling
while not defaultGetattr(val,'class') == 'label':
svalue += str(val)
val = val.nextSibling
self.setDescription(url,svalue)
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
if 'Word count' in label:
self.story.setMetadata('numWords', value.split(' -')[0])
if 'Categories' in label:
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
for warning in warnings:
self.story.addToList('warnings',warning.string)
if 'Complete' in label:
if 'Yes' in value:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if 'Published' in label:
self.story.setMetadata('datePublished', makeDate(stripHTML(value.split(' -')[0]), self.dateformat))
if 'Updated' in label:
# there's a stray [ at the end.
#value = value[0:-1]
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
try:
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
self.story.setMetadata('seriesUrl',series_url)
break
i+=1
except:
# I find it hard to care if the series parsing fails
pass
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Download one chapter page and return its story text.

    url -- address of the chapter page to fetch.

    Returns whatever self.utf8FromSoup produces for the page's
    <div id="story"> element.  Raises exceptions.FailedToDownload when
    the page contains no such element.
    """
    logger.debug('Getting chapter text from: %s' % url)

    page = self._fetchUrl(url)
    # br/hr are declared self-closing, otherwise soup eats the br/hr tags.
    parsed = bs.BeautifulStoneSoup(page, selfClosingTags=('br', 'hr'))
    story_div = parsed.find('div', {'id': 'story'})

    if story_div is not None:
        return self.utf8FromSoup(url, story_div)
    raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

View file

@ -0,0 +1,238 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class that this module provides."""
    return CheckmatedComAdapter
class CheckmatedComAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on www.checkmated.com.

    Story URLs look like ``http://www.checkmated.com/story.php?story=1234``.
    Pages that report "This story is in The Bedchamber" are retried after
    logging in.
    """

    def __init__(self, config, url):
        """Initialise decoding, credentials and the normalized story URL.

        config and url are handed unchanged to BaseSiteAdapter.__init__.
        """
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252", "utf8"]
        self.username = ""  # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # The story id is the text after the first '=' of the query string
        # (the accepted URL pattern is ...story.php?story=1234).
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/story.php?story=' + self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'chm')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%b %d, %Y"

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain, including the www prefix the site uses."""
        return 'www.checkmated.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example of a story URL this adapter accepts."""
        return "http://" + cls.getSiteDomain() + "/story.php?story=1234"

    def getSiteURLPattern(self):
        """Return the regular expression that valid story URLs must match."""
        return re.escape("http://" + self.getSiteDomain() + "/story.php?story=") + r"\d+$"

    def needToLoginCheck(self, data):
        """Return True when page text *data* shows that a login is needed.

        Matches the site's restricted-story notice as well as its
        bad-username and bad-password messages.
        """
        return ('This story is in The Bedchamber' in data
                or 'That username is not in our database' in data
                or "That password is not correct, please try again" in data)

    def performLogin(self, url):
        """Log in to the site and confirm that *url* is then readable.

        Credentials come from self.username/self.password when a password
        is set on the adapter, otherwise from the configuration.

        Returns True on success.  Raises exceptions.FailedToLogin when the
        login response lacks the welcome text, and
        exceptions.FailedToDownload when the story is still restricted
        after a successful login.
        """
        params = {}
        if self.password:
            params['name'] = self.username
            params['pass'] = self.password
        else:
            params['name'] = self.getConfig("username")
            params['pass'] = self.getConfig("password")
        params['login'] = 'yes'
        params['submit'] = 'login'

        loginUrl = 'http://' + self.getSiteDomain() + '/login.php'
        login_page = self._fetchUrl(loginUrl, params)

        if "Welcome back," not in login_page:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                              params['name']))
            raise exceptions.FailedToLogin(url, params['name'])

        # Only re-request the story once the login is known to have worked.
        story_page = self._fetchUrl(url)
        if "This story is in The Bedchamber" in story_page:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Your account does not have sufficient priviliges to read this story.")
        return True

    ## Getting the chapter list and the meta data.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and fill in chapter URLs and metadata.

        Populates self.chapterUrls and the story's title, author, rating,
        ships, characters, status, dates and genre.  The genre is read from
        the author's story list, which costs one extra request.

        Raises exceptions.StoryDoesNotExist on HTTP 404 and
        exceptions.FailedToDownload when the site denies access.
        """
        url = self.url
        logger.debug("URL: " + url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            raise

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        story_id = self.story.getMetadata('storyId')

        ## Title
        title_span = soup.find('span', {'class': 'storytitle'})
        self.story.setMetadata('title', stripHTML(title_span))

        # Find authorid and URL from... author url.
        author_a = title_span.parent.find('a', href=re.compile(r"authors.php\?name\=\w+"))
        self.story.setMetadata('authorId', author_a['href'].split('=')[1])
        self.story.setMetadata('authorUrl', 'http://' + self.host + '/' + author_a['href'])
        self.story.setMetadata('author', author_a.string)

        # Chapters come from the chapter <select>; when there is none the
        # story page itself is recorded as the only chapter.
        chapter_select = soup.find('select', {'name': 'chapter'})
        if chapter_select is None:
            self.chapterUrls.append((self.story.getMetadata('title'), url))
        else:
            for chapter in chapter_select.findAll('option'):
                self.chapterUrls.append((stripHTML(chapter), 'http://' + self.host + '/story.php?story=' + story_id + '&chapter=' + chapter['value']))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # website does not keep track of word count, and there is no
        # convenient way to calculate it

        # The summary is the first <fieldset>, minus its <legend>.
        summary = soup.find('fieldset')
        summary.find('legend').extract()
        summary.name = 'div'
        self.setDescription(url, summary)

        # Metadata rows: first cell is the label, second cell the value.
        table = soup.findAll('div', {'class': 'text'})[1]
        for row in table.findAll('tr'):
            cells = row.findAll('td')
            value = cells[1]
            label = stripHTML(cells[0])

            if 'Rating' in label:
                self.story.setMetadata('rating', stripHTML(value))

            if 'Ship' in label:
                if value.string != "none/none":
                    self.story.addToList('ships', value.string)
                    for char in value.string.split('/'):
                        if char != 'none':
                            self.story.addToList('characters', char)

            if 'Status' in label:
                # The site marks unfinished stories with an image.
                if value.find('img', {'src': 'img/incomplete.gif'}) is None:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Genre is only taken from the author's story list: find the entry
        # whose review link points at this story.
        author_page = self._fetchUrl(self.story.getMetadata('authorUrl') + '&cat=stories')
        review_re = re.compile(r"review.php\?s\=" + story_id + '&act=view')
        for story in bs.BeautifulSoup(author_page).findAll('table', {'class': 'storyinfo'}):
            if story.find('a', href=review_re) is None:
                continue
            for row in story.findAll('tr'):
                cells = row.findAll('td')
                if 'genre' in stripHTML(cells[0]):
                    for genre in cells[1].findAll('img'):
                        self.story.addToList('genre', genre['title'])

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Download one chapter page and return its story text.

        Returns whatever self.utf8FromSoup produces for the page's
        <div id="resizeableText"> element, with the embedded storyTools
        div removed when present.  Raises exceptions.FailedToDownload when
        the text element is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        # br/hr are declared self-closing, otherwise soup eats the br/hr tags.
        soup = bs.BeautifulSoup(self._fetchUrl(url),
                                selfClosingTags=('br', 'hr'))

        div = soup.find('div', {'id': 'resizeableText'})
        # Check for the container before touching it, so a missing element
        # is reported as FailedToDownload rather than an AttributeError.
        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        tools = div.find('div', {'class': 'storyTools'})
        if tools is not None:
            tools.extract()

        return self.utf8FromSoup(url, div)

View file

@ -0,0 +1,335 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class that this module provides."""
    return DarkSolaceOrgAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class DarkSolaceOrgAdapter(BaseSiteAdapter):
    """Adapter for stories hosted under dark-solace.org/elysian/.

    Story URLs look like
    ``http://dark-solace.org/elysian/viewstory.php?sid=1234`` (with or
    without a leading ``www.``).  Most metadata is read from the author's
    story list rather than from the story page itself.
    """

    def __init__(self, config, url):
        """Initialise decoding, credentials and the normalized story URL.

        config and url are handed unchanged to BaseSiteAdapter.__init__.
        """
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252", "utf8"]
        self.username = "NoneGiven"  # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/elysian/viewstory.php?sid=' + self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'dksl')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%B %d, %Y"

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the canonical site domain (without www)."""
        return 'dark-solace.org'

    @classmethod
    def getAcceptDomains(cls):
        """Return every domain spelling this adapter accepts."""
        return ['www.dark-solace.org', 'dark-solace.org']

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example of a story URL this adapter accepts."""
        return "http://" + cls.getSiteDomain() + "/elysian/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regular expression that valid story URLs must match."""
        return re.escape("http://") + r"(www\.)?" + re.escape(self.getSiteDomain() + "/elysian/viewstory.php?sid=") + r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True when page text *data* shows that a login is needed."""
        return ('This story contains adult content not suitable for children' in data
                or "That password doesn't match the one in our database" in data
                or "Registered Users Only" in data)

    def performLogin(self, url):
        """Log in to the site.

        Credentials come from self.username/self.password when a password
        is set on the adapter, otherwise from the configuration.  *url* is
        only used in the FailedToLogin error.

        Returns True on success; raises exceptions.FailedToLogin when the
        response does not contain "Member Account".
        """
        params = {}
        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['action'] = 'login'
        params['submit'] = 'Submit'

        loginUrl = 'http://www.' + self.getSiteDomain() + '/elysian/user.php'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              params['penname']))

        d = self._postUrl(loginUrl, params)

        if "Member Account" not in d:  # User Account Page
            logger.info("Failed to login to URL %s as %s, or have no authorization to access the story" % (loginUrl, params['penname']))
            raise exceptions.FailedToLogin(url, params['penname'])
        return True

    def _fetchStoryData(self, url):
        """Fetch *url*, turning an HTTP 404 into StoryDoesNotExist."""
        try:
            return self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            raise

    def _findSeriesStoryLink(self, seriessoup, story_link_re):
        """Return this story's title link on a series page, or None.

        Links whose parent element is not of class 'title' are ignored,
        in case of links inside story summaries.
        """
        storylink = seriessoup.find('a', href=story_link_re)
        if storylink and storylink.parent and storylink.parent.get('class') != 'title':
            return None
        return storylink

    def _extractSeries(self, metalist, story_link_re):
        """Record series name, position and URL when the story has a series.

        Series information is best-effort: any failure is logged and the
        download carries on without series metadata.
        """
        series_a = metalist.find('a', href=re.compile(r"series.php\?seriesid=\d+"))
        if series_a is None:
            # Story is not part of a series; nothing to do.
            return

        try:
            series_name = series_a.string
            series_url = 'http://' + self.host + '/elysian/' + series_a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storylink = self._findSeriesStoryLink(seriessoup, story_link_re)
            offset = 0

            # series story list is paginated if there's a pagelinks div.
            # Only need to look in it if the story wasn't on the first page.
            pagelinks = seriessoup.find('div', {'id': 'pagelinks'})
            if pagelinks and storylink is None:
                pages = pagelinks.findAll('a', href=re.compile(r'offset='))
                for page in pages[1:]:  # skip first, already checked above.
                    seriessoup = bs.BeautifulSoup(self._fetchUrl('http://' + self.host + '/elysian/' + page['href']))
                    storylink = self._findSeriesStoryLink(seriessoup, story_link_re)
                    if storylink:
                        offset = int(page['href'].split('=')[-1])  # offset is last.
                        break

            if storylink is None:
                # Never found the story; without this guard a title div
                # with no link would compare equal to None below.
                return

            # for reasons I don't understand, searching for story
            # links by regex wasn't working reliably.  It was missing
            # the javascript links sometimes.  This is cleaner anyway.
            for i, div in enumerate(seriessoup.findAll('div', {'class': 'title'})):
                # first a is story link; skips 'report this' and 'TOC' links.
                if div.find('a') == storylink:
                    self.setSeries(series_name, 1 + i + offset)
                    self.story.setMetadata('seriesUrl', series_url)
                    break
        except Exception as e:
            logger.warning("Series parsing failed: %s" % e)

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and fill in chapter URLs and metadata.

        Populates self.chapterUrls and the story's title, author, rating,
        description, word count, categories, characters, genre, warnings,
        status, dates and series.

        Raises exceptions.StoryDoesNotExist on HTTP 404,
        exceptions.AdultCheckRequired when the site shows an age warning
        and adult content has not been enabled, and
        exceptions.FailedToDownload when access is denied or the story is
        not listed on the author's page(s).
        """
        adult_ok = self.is_adult or self.getConfig("is_adult")

        if adult_ok:
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&ageconsent=ok&warning=5"
        else:
            addurl = ""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url + '&index=1' + addurl
        logger.debug("URL: " + url)
        data = self._fetchStoryData(url)

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'", data)
        if m is not None:
            if not adult_ok:
                raise exceptions.AdultCheckRequired(self.url)
            # We tried the default and still got a warning, so
            # let's pull the warning number from the 'continue'
            # link and reload data.  Also correct the &amp; in the url.
            addurl = m.group(1).replace("&amp;", "&")
            url = self.url + '&index=1' + addurl
            logger.debug("URL 2nd try: " + url)
            data = self._fetchStoryData(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        story_id = self.story.getMetadata('storyId')

        ## Title and author
        div = soup.find('div', {'id': 'pagetitle'})

        aut = div.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId', aut['href'].split('=')[1])
        self.story.setMetadata('authorUrl', 'http://' + self.host + '/elysian/' + aut['href'])
        self.story.setMetadata('author', aut.string)
        aut.extract()

        # first a tag in pagetitle is title
        self.story.setMetadata('title', stripHTML(div.find('a')))
        div.find('a').extract()

        # only thing left in div(pagetitle) now should be 'by' and rating.
        rating = stripHTML(div)
        if '[' in rating:
            self.story.setMetadata('rating', rating[rating.index('[') + 1:-1])

        chapter_re = re.compile(r'viewstory.php\?sid=' + story_id + r'&chapter=\d+')
        for chapa in soup.findAll('a', href=chapter_re):
            self.chapterUrls.append((stripHTML(chapa), 'http://' + self.host + '/elysian/' + chapa['href']))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # The remaining metadata lives next to the story's link on the
        # author's page.  The trailing group stops sid=12 matching sid=123.
        story_link_re = re.compile(r'viewstory.php\?sid=' + story_id + r'($|[^\d])')
        asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
        storylink = asoup.find('a', href=story_link_re)

        # author's story list is paginated if there's a pagelinks div.
        # Only need to look in it if the story wasn't on the first page.
        pagelinks = asoup.find('div', {'id': 'pagelinks'})
        if pagelinks and storylink is None:
            authpageslist = pagelinks.findAll('a', href=re.compile(r'action=storiesby'))
            for page in authpageslist[1:]:  # skip first, already checked above.
                asoup = bs.BeautifulSoup(self._fetchUrl('http://' + self.host + '/elysian/' + page['href']))
                storylink = asoup.find('a', href=story_link_re)
                if storylink:
                    break

        if not storylink:
            raise exceptions.FailedToDownload("Unable to find story metadata on author's page(s)")

        metalist = storylink.parent.parent

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method
        def defaultGetattr(d, k):
            """Return d[k], or "" when d has no such item."""
            try:
                return d[k]
            except Exception:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = metalist.findAll('span', {'class': 'label'})
        for labelspan in labels:
            label = labelspan.text
            value = labelspan.nextSibling

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = ""
                node = value
                while node and not (defaultGetattr(node, 'class') == 'label' or "Chapters: " in stripHTML(node)):
                    svalue += str(node)
                    node = node.nextSibling
                self.setDescription(url, svalue)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category', cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters', char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=1'))
                for genre in genres:
                    self.story.addToList('genre', genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=2'))
                for warning in warnings:
                    self.story.addToList('warnings', warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        self._extractSeries(metalist, story_link_re)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Download one chapter page and return its story text.

        Returns whatever self.utf8FromSoup produces for the page's
        <div id="story"> element; raises exceptions.FailedToDownload when
        that element is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulSoup(self._fetchUrl(url))

        div = soup.find('div', {'id': 'story'})
        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url, div)

View file

@ -0,0 +1,243 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class that this module provides."""
    return DestinysGatewayComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class DestinysGatewayComAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on www.destinysgateway.com.

    Story URLs look like
    ``http://www.destinysgateway.com/viewstory.php?sid=1234``.
    """

    def __init__(self, config, url):
        """Initialise decoding, credentials and the normalized story URL.

        config and url are handed unchanged to BaseSiteAdapter.__init__.
        """
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252", "utf8"]
        self.username = "NoneGiven"  # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid=' + self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'dgrfa')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%b %d %Y"

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain, including the www prefix the site uses."""
        return 'www.destinysgateway.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example of a story URL this adapter accepts."""
        return "http://" + cls.getSiteDomain() + "/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regular expression that valid story URLs must match."""
        return re.escape("http://" + self.getSiteDomain() + "/viewstory.php?sid=") + r"\d+$"

    def _fetchStoryData(self, url):
        """Fetch *url*, turning an HTTP 404 into StoryDoesNotExist."""
        try:
            return self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            raise

    def _extractSeries(self, soup):
        """Record series name, position and URL when the story has a series.

        Series information is best-effort: any failure is logged at debug
        level and the download carries on without series metadata.
        """
        series_a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        if series_a is None:
            # Story is not part of a series; nothing to do.
            return

        try:
            series_name = series_a.string
            series_url = 'http://' + self.host + '/' + series_a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            wanted = 'viewstory.php?sid=' + self.story.getMetadata('storyId')
            # Position in the series is the 1-based rank of our link.
            for position, story_a in enumerate(storyas, 1):
                if story_a['href'] == wanted:
                    self.setSeries(series_name, position)
                    self.story.setMetadata('seriesUrl', series_url)
                    break
        except Exception as e:
            logger.debug("Series parsing failed: %s" % e)

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and fill in chapter URLs and metadata.

        Populates self.chapterUrls and the story's title, author,
        description, rating, word count, categories, genre, warnings,
        status, dates and series.

        Raises exceptions.StoryDoesNotExist on HTTP 404,
        exceptions.AdultCheckRequired when the site shows an age warning
        and adult content has not been enabled, and
        exceptions.FailedToDownload when the site denies access.
        """
        adult_ok = self.is_adult or self.getConfig("is_adult")

        if adult_ok:
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&warning=4"
        else:
            addurl = ""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url + '&index=1' + addurl
        logger.debug("URL: " + url)
        data = self._fetchStoryData(url)

        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'", data)
        if m is not None:
            if not adult_ok:
                raise exceptions.AdultCheckRequired(self.url)
            # We tried the default and still got a warning, so
            # let's pull the warning number from the 'continue'
            # link and reload data.  Also correct the &amp; in the url.
            addurl = m.group(1).replace("&amp;", "&")
            url = self.url + '&index=1' + addurl
            logger.debug("URL 2nd try: " + url)
            data = self._fetchStoryData(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        story_id = self.story.getMetadata('storyId')

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid=' + story_id + "$"))
        self.story.setMetadata('title', stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl', 'http://' + self.host + '/' + a['href'])
        self.story.setMetadata('author', a.string)

        # Find the chapters:
        chapter_re = re.compile(r'viewstory.php\?sid=' + story_id + r"&chapter=\d+$")
        for chapter in soup.findAll('a', href=chapter_re):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter), 'http://' + self.host + '/' + chapter['href'] + addurl))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method
        def defaultGetattr(d, k):
            """Return d[k], or "" when d has no such item."""
            try:
                return d[k]
            except Exception:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span', {'class': 'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = ""
                node = value
                # Stop at the end of the sibling chain too, so a summary
                # that is the last label does not crash on None.
                while node is not None and not defaultGetattr(node, 'class') == 'label':
                    svalue += str(node)
                    node = node.nextSibling
                self.setDescription(url, svalue)

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category', cat.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=1'))
                for genre in genres:
                    self.story.addToList('genre', genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=2'))
                for warning in warnings:
                    self.story.addToList('warnings', warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        self._extractSeries(soup)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Download one chapter page and return its story text.

        Returns whatever self.utf8FromSoup produces for the page's
        <div id="story"> element; raises exceptions.FailedToDownload when
        that element is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        # br/hr are declared self-closing, otherwise soup eats the br/hr tags.
        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br', 'hr'))

        div = soup.find('div', {'id': 'story'})
        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url, div)

View file

@ -0,0 +1,278 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class that this module provides."""
    return DokugaComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class DokugaComAdapter(BaseSiteAdapter):
def __init__(self, config, url):
    """Initialise decoding, credentials, section and normalized URL.

    config and url are handed unchanged to BaseSiteAdapter.__init__.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really
    # windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.username = "NoneGiven"  # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult = False

    # www.dokuga.com has two 'sections', shown in URL as
    # 'fanfiction' and 'spark' that change how things should be
    # handled.
    # http://www.dokuga.com/fanfiction/story/7528/1
    # http://www.dokuga.com/spark/story/7299/1
    # The path splits into ['', section, 'story', storyId, ...].
    path_parts = self.parsedUrl.path.split('/')
    self.story.setMetadata('storyId', path_parts[3])
    self.section = path_parts[1]

    # normalized story URL.
    normalized = 'http://' + self.getSiteDomain() + '/' + path_parts[1] + '/story/' + self.story.getMetadata('storyId')
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'dkg')

    # The date format will vary from site to site (and here between
    # the two sections).
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d %b %Y" if 'fanfiction' in self.section else "%m-%d-%y"
@staticmethod  # must be @staticmethod, don't remove it.
def getSiteDomain():
    """Return the site domain, including the www prefix the site uses."""
    domain = 'www.dokuga.com'
    return domain
@classmethod
def getSiteExampleURLs(self):
    """Return example story URLs, one per site section, space separated."""
    examples = [
        "http://" + self.getSiteDomain() + "/fanfiction/story/1234/1",
        "http://" + self.getSiteDomain() + "/spark/story/1234/1",
    ]
    return " ".join(examples)
def getSiteURLPattern(self):
return r"http://"+self.getSiteDomain()+"/(fanfiction|spark)?/story/\d+/?\d+?$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
if 'The author has disabled anonymous viewing for this story.' in data:
return True
else:
return False
def performLogin(self, url,soup):
params = {}
if self.password:
params['username'] = self.username
params['passwd'] = self.password
else:
params['username'] = self.getConfig("username")
params['passwd'] = self.getConfig("password")
params['Submit'] = 'Submit'
# copy all hidden input tags to pick up appropriate tokens.
for tag in soup.findAll('input',{'type':'hidden'}):
params[tag['name']] = tag['value']
loginUrl = 'http://' + self.getSiteDomain() + '/fanfiction'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['username']))
d = self._postUrl(loginUrl, params)
if "Your session has expired. Please log in again." in d:
d = self._postUrl(loginUrl, params)
if "Logout" not in d : #Member Account
logger.info("Failed to login to URL %s as %s" % (loginUrl,
params['username']))
raise exceptions.FailedToLogin(url,params['username'])
return False
else:
return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
# index=1 makes sure we see the story chapter index. Some
# sites skip that for one-chapter stories.
url = self.url
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
if self.needToLoginCheck(data):
# need to log in for this one.
self.performLogin(url,soup)
data = self._fetchUrl(url)
soup = bs.BeautifulSoup(data)
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
# print data
# Now go hunting for all the meta data and the chapter list.
## Title and author
a = soup.find('div', {'align' : 'center'}).find('h3')
# Find authorid and URL from... author url.
aut = a.find('a')
self.story.setMetadata('authorId',aut['href'].split('=')[1])
alink='http://'+self.host+aut['href']
self.story.setMetadata('authorUrl','http://'+self.host+aut['href'])
self.story.setMetadata('author',aut.string)
aut.extract()
a = a.string[:(len(a.string)-4)]
self.story.setMetadata('title',stripHTML(a))
# Find the chapters:
chapters = soup.find('select').findAll('option')
if len(chapters)==1:
self.chapterUrls.append((self.story.getMetadata('title'),'http://'+self.host+'/'+self.section+'/story/'+self.story.getMetadata('storyId')+'/1'))
else:
for chapter in chapters:
# just in case there's tags, like <i> in chapter titles. /fanfiction/story/7406/1
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+self.section+'/story/'+self.story.getMetadata('storyId')+'/'+chapter['value']))
self.story.setMetadata('numChapters',len(self.chapterUrls))
asoup = bs.BeautifulSoup(self._fetchUrl(alink))
if 'fanfiction' in self.section:
asoup=asoup.find('div', {'id' : 'cb_tabid_52'}).find('div')
#grab the rest of the metadata from the author's page
for div in asoup.findAll('div'):
nav=div.find('a', href=re.compile(r'/fanfiction/story/'+self.story.getMetadata('storyId')+"/1$"))
if nav != None:
break
div=div.nextSibling
self.setDescription(url,div)
div=div.nextSibling
a=div.text.split('Rating: ')
if len(a) == 2: self.story.setMetadata('rating', a[1].split('&')[0])
a=div.text.split('Status: ')
if len(a)==2:
iscomp=a[1].split('&')[0]
if 'Complete' in iscomp:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
a=div.text.split('Category: ')
if len(a) == 2: self.story.addToList('category', a[1].split('&')[0])
a=div.text.split('Created: ')
if len(a) == 2: self.story.setMetadata('datePublished', makeDate(stripHTML(a[1].split('&')[0]), self.dateformat))
a=div.text.split('Updated: ')
if len(a) == 2: self.story.setMetadata('dateUpdated', makeDate(stripHTML(a[1]), self.dateformat))
div=div.nextSibling.nextSibling
a=div.text.split('Words: ')
if len(a) == 2: self.story.setMetadata('numWords', a[1].split('&')[0])
a=div.text.split('Genre: ')
if len(a) == 2:
for genre in a[1].split('&')[0].split(', '):
self.story.addToList('genre',genre)
else:
asoup=asoup.find('div', {'id' : 'maincol'}).find('div', {'class' : 'padding'})
for div in asoup.findAll('div'):
nav=div.find('a', href=re.compile(r'/spark/story/'+self.story.getMetadata('storyId')+"/1$"))
if nav != None:
break
div=div.nextSibling.nextSibling
self.setDescription(url,div)
self.story.addToList('category', 'Spark')
div=div.nextSibling.nextSibling
a=div.text.split('Rating: ')
if len(a) == 2: self.story.setMetadata('rating', a[1].split(' - ')[0])
a=div.text.split('Status: ')
if len(a)==2:
iscomp=a[1].split(' - ')[0]
if 'Complete' in iscomp:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
a=div.text.split('Genre: ')
if len(a)==2:
for genre in a[1].split(' - ')[0].split('/'):
self.story.addToList('genre',genre)
div=div.nextSibling.nextSibling
a=div.text.split('Updated: ')
if len(a)==2:
date=a[1].split(' -')[0]
self.story.setMetadata('dateUpdated', makeDate(date, self.dateformat))
# does not have published date anywhere
self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
a=div.text.split('Words ')
if len(a)==2: self.story.setMetadata('numWords', a[1])
# grab the text for an individual chapter.
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
div = soup.find('div', {'id' : 'chtext'})
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,217 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class this module provides."""
    return DotMoonNetAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class DotMoonNetAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on www.dotmoon.net.

    Stories are addressed as library_view.php?storyid=<id>; login goes
    through the site's board at /board/index.php.
    """

    def __init__(self, config, url):
        """Derive the story id and normalized URL from *url*."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only storyid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=',)[1])

        # normalized story URL. www.dotmoon.net/library_view.php?storyid=3
        self._setURL('http://' + self.getSiteDomain() + '/library_view.php?storyid=' + self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'dotm')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y-%m-%d"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain, including 'www'."""
        return 'www.dotmoon.net'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://" + cls.getSiteDomain() + "/library_view.php?storyid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that story URLs for this site must match."""
        return re.escape("http://" + self.getSiteDomain() + "/library_view.php?storyid=") + r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True if the page text contains one of the login-required messages."""
        return ('You must be logged in to read adult-rated stories' in data
                or 'Password incorrect' in data
                or "That username does not exist" in data)

    def performLogin(self, url):
        """Log in through the board; return True or raise FailedToLogin.

        Credentials come from self.username/self.password when a password
        has been set on the adapter, otherwise from the configuration.
        """
        # Function-scope import: urlencode is only needed for this request.
        import urllib

        params = {}

        if self.password:
            params['user'] = self.username
            params['passwrd'] = self.password
        else:
            params['user'] = self.getConfig("username")
            params['passwrd'] = self.getConfig("password")

        loginUrl = 'http://' + self.getSiteDomain() + '/board/index.php'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             params['user']))

        # URL-encode the credentials so characters such as '&', '#' or
        # spaces in a user name or password do not corrupt the query
        # string.  A list of pairs keeps the original parameter order.
        query = urllib.urlencode([('action', 'login2'),
                                  ('user', params['user']),
                                  ('passwrd', params['passwrd'])])
        self._fetchUrl(loginUrl + '?' + query)

        # Re-fetch the board index; this text is used as the sign of a
        # logged-in session.
        d = self._fetchUrl(loginUrl)

        if "Show unread posts since last visit" not in d:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['user']))
            raise exceptions.FailedToLogin(url, params['user'])
        return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page; fill chapter list and metadata.

        Raises:
            exceptions.StoryDoesNotExist: the story page returned HTTP 404.
            exceptions.FailedToLogin: login was required and failed.
            exceptions.FailedToDownload: the site reported an invalid
                story id, or no chapter links were found.
        """
        url = self.url
        logger.debug("URL: " + url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        if "Invalid story ID" in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() + " says: Invalid story ID.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        # The second <body> element of the parsed page is used, with its
        # first table removed before searching.
        body = soup.findAll('body')[1]
        body.find('table').extract()

        ## Title
        a = body.find('b')
        self.story.setMetadata('title', stripHTML(a))

        # Find authorid and URL from... author url. http://www.dotmoon.net/board/index.php?action=profile;u=1'
        a = body.find('a', href=re.compile(r"index.php\?action=profile;u=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[2])
        self.story.setMetadata('authorUrl', 'http://' + self.host + '/' + a['href'])
        self.story.setMetadata('author', a.string)

        # Find the chapters: 'library_storyview.php?chapterid=3
        chapters = body.findAll('a', href=re.compile(r"library_storyview.php\?chapterid=\d+$"))
        if len(chapters) == 0:
            raise exceptions.FailedToDownload(self.getSiteDomain() + " says: No php/html chapters found.")
        if len(chapters) == 1:
            self.chapterUrls.append((self.story.getMetadata('title'), 'http://' + self.host + '/' + chapters[0]['href']))
        else:
            for chapter in chapters:
                # just in case there's tags, like <i> in chapter titles.
                self.chapterUrls.append((stripHTML(chapter), 'http://' + self.host + '/' + chapter['href']))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # other tags: each label cell is followed by its value cell.
        labels = body.find('table', {'width': '390'}).findAll('td')
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if label is None:
                continue

            if 'Fandom' in label:
                self.story.addToList('category', value.string)

            if 'Setting' in label:
                self.story.addToList('genre', value.string)

            if 'Genre' in label:
                self.story.addToList('genre', value.string)

            if 'Style' in label:
                self.story.addToList('genre', value.string)

            if 'Rating' in label:
                self.story.addToList('rating', value.string)

            if 'Created' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

            if 'Status' in label:
                if 'Completed' in value.string:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

        table = body.findAll('table', {'width': '400'})[1].find('td')
        self.setDescription(url, stripHTML(table).split('Summary: ')[1])

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch *url* and return its first blockquote, renamed to a div.

        Raises exceptions.FailedToDownload when no blockquote is present.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br', 'hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('blockquote')

        # Check for a missing element *before* touching it; renaming
        # first would raise AttributeError on None instead of the
        # intended FailedToDownload.
        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        div.name = 'div'

        return self.utf8FromSoup(url, div)

View file

@ -0,0 +1,301 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class this module provides."""
    return DracoAndGinnyComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class DracoAndGinnyComAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on www.dracoandginny.com.

    Stories are addressed as viewstory.php?sid=<id>.
    """

    def __init__(self, config, url):
        """Derive the story id and normalized URL from *url*."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid=' + self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'dcagn')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%b %d, %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain, including 'www'."""
        return 'www.dracoandginny.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://" + cls.getSiteDomain() + "/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that story URLs for this site must match."""
        return re.escape("http://" + self.getSiteDomain() + "/viewstory.php?sid=") + r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True if the page text contains one of the login-required messages."""
        return ('Registered Users Only' in data
                or 'There is no such account on our website' in data
                or "That password doesn't match the one in our database" in data)

    def performLogin(self, url):
        """Submit the login form; return True or raise FailedToLogin.

        Credentials come from self.username/self.password when a password
        has been set on the adapter, otherwise from the configuration.
        """
        params = {}

        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['cookiecheck'] = '1'
        params['submit'] = 'Submit'

        loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             params['penname']))

        d = self._fetchUrl(loginUrl, params)

        if "Member Account" not in d:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['penname']))
            raise exceptions.FailedToLogin(url, params['penname'])
        return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page; fill chapter list and metadata.

        Raises:
            exceptions.StoryDoesNotExist: the story page returned HTTP 404.
            exceptions.FailedToLogin: login was required and failed.
            exceptions.AdultCheckRequired: the page shows a warning link
                and neither self.is_adult nor the is_adult config is set.
            exceptions.FailedToDownload: the site denied access.
        """
        if self.is_adult or self.getConfig("is_adult"):
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&ageconsent=ok&warning=2"
        else:
            addurl = ""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url + '&index=1' + addurl
        logger.debug("URL: " + url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'", data)
        if m is not None:
            if self.is_adult or self.getConfig("is_adult"):
                # We tried the default and still got a warning, so
                # let's pull the warning number from the 'continue'
                # link and reload data.
                addurl = m.group(1)
                # correct stupid &amp; error in url.
                addurl = addurl.replace("&amp;", "&")
                url = self.url + '&index=1' + addurl
                logger.debug("URL 2nd try: " + url)

                try:
                    data = self._fetchUrl(url)
                except urllib2.HTTPError as e:
                    if e.code == 404:
                        raise exceptions.StoryDoesNotExist(self.url)
                    else:
                        raise
            else:
                raise exceptions.AdultCheckRequired(self.url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() + " says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid=' + self.story.getMetadata('storyId') + "$"))
        self.story.setMetadata('title', stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl', 'http://' + self.host + '/' + a['href'])
        self.story.setMetadata('author', a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid=' + self.story.getMetadata('storyId') + r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter), 'http://' + self.host + '/' + chapter['href'] + addurl))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method
        def defaultGetattr(d, k):
            """Return d[k], or "" when the lookup fails for any reason."""
            try:
                return d[k]
            except Exception:
                return ""

        content = soup.find('div', {'class': 'listbox'})

        self.setDescription(url, content.find('blockquote'))

        for genre in content.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=1')):
            self.story.addToList('genre', genre.string)

        for warning in content.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=2')):
            self.story.addToList('warnings', warning.string)

        labels = content.findAll('b')

        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            # .string is None when the <b> holds nested markup rather
            # than a single text node; such an element is not a label.
            if label is None:
                continue

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = ""
                # Also stop at the end of the sibling chain instead of
                # dereferencing None.
                while value is not None and defaultGetattr(value, 'class') != 'label':
                    svalue += str(value)
                    value = value.nextSibling
                self.setDescription(url, svalue)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value.split(' |')[0])

            if 'Rating' in label:
                self.story.setMetadata('rating', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category', cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters', char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=1'))
                for genre in genres:
                    self.story.addToList('genre', genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=2'))
                for warning in warnings:
                    self.story.addToList('warnings', warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value.nextSibling).split(' |')[0], self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Series information is optional: a story without a series link
        # is skipped, and any failure while reading the series page is
        # logged and ignored rather than aborting the download.
        a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        if a is not None:
            try:
                # Find Series name from series URL.
                series_name = a.string
                series_url = 'http://' + self.host + '/' + a['href']

                # use BeautifulSoup HTML parser to make everything easier to find.
                seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
                # can't use ^viewstory...$ in case of higher rated stories with javascript href.
                storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
                i = 1
                for a in storyas:
                    # skip 'report this' and 'TOC' links
                    if 'contact.php' not in a['href'] and 'index' not in a['href']:
                        if a['href'] == ('viewstory.php?sid=' + self.story.getMetadata('storyId')):
                            self.setSeries(series_name, i)
                            self.story.setMetadata('seriesUrl', series_url)
                            break
                        i += 1
            except Exception:
                logger.debug("Ignoring failure while parsing series information", exc_info=True)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch *url* and return the chapter's content element.

        The div with class 'listbox' is preferred; the div with id
        'story' is the fallback.  Raises exceptions.FailedToDownload
        when neither is present.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br', 'hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'class': 'listbox'})
        if div is None:
            div = soup.find('div', {'id': 'story'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url, div)

View file

@ -0,0 +1,310 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class this module provides."""
    return DramioneOrgAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class DramioneOrgAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on dramione.org.

    Stories are addressed as viewstory.php?sid=<id>.  Besides the common
    metadata, this adapter records the site-specific 'themes',
    'hermiones', 'dracos', 'timeline', 'read' and 'reviews' entries.
    """

    def __init__(self, config, url):
        """Derive the story id and normalized URL from *url*."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid=' + self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'drmn')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d %B %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain (this site does not use 'www')."""
        return 'dramione.org'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://" + cls.getSiteDomain() + "/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that story URLs for this site must match."""
        return re.escape("http://" + self.getSiteDomain() + "/viewstory.php?sid=") + r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True if the page text contains one of the login-required messages."""
        return ('Registered Users Only' in data
                or 'There is no such account on our website' in data
                or "That password doesn't match the one in our database" in data)

    def performLogin(self, url):
        """Submit the login form; return True or raise FailedToLogin.

        Credentials come from self.username/self.password when a password
        has been set on the adapter, otherwise from the configuration.
        """
        params = {}

        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['cookiecheck'] = '1'
        params['submit'] = 'Submit'

        loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             params['penname']))

        d = self._fetchUrl(loginUrl, params)

        if "Member Account" not in d:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['penname']))
            raise exceptions.FailedToLogin(url, params['penname'])
        return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page; fill chapter list, cover and metadata.

        Raises:
            exceptions.StoryDoesNotExist: the story page returned HTTP 404.
            exceptions.FailedToLogin: login was required and failed.
            exceptions.AdultCheckRequired: the page shows the site's
                age-restriction notice.
            exceptions.FailedToDownload: the site denied access.
        """
        if self.is_adult or self.getConfig("is_adult"):
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&warning=5"
        else:
            addurl = ""

        # Unlike some other eFiction adapters, no index=1 is appended here.
        url = self.url + addurl
        logger.debug("URL: " + url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        # The actual text that is used to announce you need to be an
        # adult varies from site to site.  Again, print data before
        # the title search to troubleshoot.
        if "Stories that are suitable for ages 16 and older" in data:
            raise exceptions.AdultCheckRequired(self.url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() + " says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid=' + self.story.getMetadata('storyId') + "$"))
        self.story.setMetadata('title', stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl', 'http://' + self.host + '/' + a['href'])
        self.story.setMetadata('author', a.string)

        # Use banner as cover if found.  A 'click to view' banner link,
        # when present, takes precedence over an inline banner image.
        coverurl = ''
        img = soup.find('img', {'class': 'banner'})
        if img is not None:
            coverurl = img['src']
        a = soup.find(text="This story has a banner; click to view.")
        if a:
            coverurl = a.parent['href']
        if coverurl:
            self.setCoverImage(url, coverurl)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid=' + self.story.getMetadata('storyId') + r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter), 'http://' + self.host + '/' + chapter['href'] + addurl))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # Links are tagged with a CSS class per classification; each
        # class feeds one metadata list.
        tagclasses = (("tag-1", 'genre'),
                      ("tag-2", 'warnings'),
                      ("tag-3", 'themes'),
                      ("tag-4", 'hermiones'),
                      ("tag-5", 'dracos'),
                      ("tag-6", 'timeline'))
        for cssclass, listname in tagclasses:
            for taglink in soup.findAll('a', {'class': cssclass}):
                self.story.addToList(listname, taglink.string)

        # utility method
        def defaultGetattr(d, k):
            """Return d[k], or "" when the lookup fails for any reason."""
            try:
                return d[k]
            except Exception:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span', {'class': 'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            # .string is None when the span holds nested markup rather
            # than a single text node; such a span cannot be matched.
            if label is None:
                continue

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = ""
                # Also stop at the end of the sibling chain instead of
                # dereferencing None.
                while value is not None and defaultGetattr(value, 'class') != 'label':
                    svalue += str(value)
                    value = value.nextSibling
                self.setDescription(url, svalue)

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Read' in label:
                self.story.setMetadata('read', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category', cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters', char.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                # strip ordinal suffixes ("1st", "22nd") so the date parses.
                value = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", value)
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                value = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", value)
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Series information is optional: a story without a series link
        # is skipped, and any failure while reading the series page is
        # logged and ignored rather than aborting the download.
        a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        if a is not None:
            try:
                # Find Series name from series URL.
                series_name = a.string
                series_url = 'http://' + self.host + '/' + a['href']

                # use BeautifulSoup HTML parser to make everything easier to find.
                seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
                storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
                i = 1
                for a in storyas:
                    if a['href'] == ('viewstory.php?sid=' + self.story.getMetadata('storyId')):
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl', series_url)
                        break
                    i += 1
            except Exception:
                logger.debug("Ignoring failure while parsing series information", exc_info=True)

        # The review count is optional as well; take it from the second
        # reviews.php link in the page title heading when available.
        try:
            self.story.setMetadata('reviews',
                                   stripHTML(soup.find('h2', {'id': 'pagetitle'}).
                                             findAll('a', href=re.compile(r'^reviews.php'))[1]))
        except Exception:
            logger.debug("Ignoring failure while parsing review count", exc_info=True)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch *url* and return the content of its div with id 'story'.

        Raises exceptions.FailedToDownload when that div is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br', 'hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id': 'story'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url, div)

View file

@ -0,0 +1,223 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
# Module-level hook: returns the adapter class implemented in this file.
# NOTE(review): presumably called by the adapter loader -- confirm against caller.
def getClass():
return EfictionEstelielDeAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class EfictionEstelielDeAdapter(BaseSiteAdapter):
def __init__(self, config, url):
    """Initialise the adapter for one efiction.esteliel.de story URL.

    Stores the decode order, placeholder credentials, the story id taken
    from the URL query, the normalized story URL, the site abbreviation
    and the date format used when parsing dates.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # Windows-1252 is a superset of iso-8859-1.  Most sites that claim
    # to be iso-8859-1 (and some that claim to be utf8) are really
    # windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # If the user name is left empty, the site doesn't return any
    # message at all.
    self.username = "NoneGiven"
    self.password = ""
    self.is_adult = False

    # URL validation guarantees the query is only sid=1234.
    query_parts = self.parsedUrl.query.split('=')
    self.story.setMetadata('storyId', query_parts[1])

    # Normalized story URL.
    domain = self.getSiteDomain()
    story_id = self.story.getMetadata('storyId')
    self._setURL('http://' + domain + '/viewstory.php?sid=' + story_id)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'eesd')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%B %d, %Y"
# Returns the host name of the site this adapter handles.
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain, including a leading www only when the site uses one.
return 'efiction.esteliel.de'
@classmethod
def getSiteExampleURLs(cls):
    """Return an example story URL for this site."""
    domain = cls.getSiteDomain()
    return "".join(["http://", domain, "/viewstory.php?sid=1234"])
def getSiteURLPattern(self):
    """Return the regex source that a story URL on this site must match."""
    story_url_prefix = "http://" + self.getSiteDomain() + "/viewstory.php?sid="
    # Literal prefix followed by a purely numeric story id.
    return re.escape(story_url_prefix) + r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
    """Fetch the story index page and fill in the chapter list and metadata.

    Reads title, author and chapter links from the page, then the
    labelled entries of the ``listbox`` div (summary, rating, words,
    category, characters, completion status, dates), and finally makes a
    best-effort attempt to work out series membership.

    Raises:
        exceptions.StoryDoesNotExist: the index page returned HTTP 404.
        exceptions.FailedToDownload: the site reports that the story has
            not been validated.
    """
    # index=1 makes sure we see the story chapter index.  Some
    # sites skip that for one-chapter stories.
    url = self.url + '&index=1'
    logger.debug("URL: " + url)

    try:
        data = self._fetchUrl(url)
    except urllib2.HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        # Bare raise re-raises the HTTPError with its original traceback.
        raise

    if "Access denied. This story has not been validated by the adminstrators of this site." in data:
        raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = bs.BeautifulSoup(data)
    story_id = self.story.getMetadata('storyId')

    pagetitle = soup.find('div', {'id': 'pagetitle'})

    ## Title
    a = pagetitle.find('a', href=re.compile(r'viewstory.php\?sid=' + story_id + "$"))
    self.story.setMetadata('title', stripHTML(a))

    # Find authorid and URL from... author url.
    a = pagetitle.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
    self.story.setMetadata('authorId', a['href'].split('=')[1])
    self.story.setMetadata('authorUrl', 'http://' + self.host + '/' + a['href'])
    self.story.setMetadata('author', a.string)

    # Find the chapters:
    chapter_link = re.compile(r'viewstory.php\?sid=' + story_id + r"&chapter=\d+$")
    for chapter in soup.findAll('a', href=chapter_link):
        # stripHTML just in case there's tags, like <i>, in chapter titles.
        self.chapterUrls.append((stripHTML(chapter), 'http://' + self.host + '/' + chapter['href']))
    self.story.setMetadata('numChapters', len(self.chapterUrls))

    # eFiction sites don't help us out a lot with their meta data
    # formatting, so it's a little ugly.
    listbox = soup.find('div', {'class': 'listbox'})

    # Genre links are looked up next to the first <span class="label">.
    first_label = listbox.find('span', {'class': 'label'})
    if first_label is not None:
        for genre in first_label.parent.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=1')):
            self.story.addToList('genre', genre.string)

    for labelspan in listbox.findAll('b'):
        value = labelspan.nextSibling
        # Fall back to '' so the substring tests below cannot hit None.
        label = labelspan.string or ''

        if 'Summary' in label:
            # Everything up to the first sibling that mentions 'Rating';
            # stop at the end of the sibling chain instead of
            # dereferencing None when no such node follows.
            parts = []
            while value is not None and 'Rating' not in str(value):
                parts.append(str(value))
                value = value.nextSibling
            self.setDescription(url, ''.join(parts))

        if 'Rating' in label:
            self.story.setMetadata('rating', value)

        if 'Words' in label:
            self.story.setMetadata('numWords', value)

        if 'Category' in label:
            for cat in labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=categories')):
                self.story.addToList('category', cat.string)

        if 'Characters' in label:
            for char in labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=characters')):
                self.story.addToList('characters', char.string)

        if 'Completed' in label:
            if 'Yes' in value:
                self.story.setMetadata('status', 'Completed')
            else:
                self.story.setMetadata('status', 'In-Progress')

        if 'Published' in label:
            self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

        if 'Updated' in label:
            self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

    # Series membership is best-effort: any failure is logged and ignored.
    try:
        if listbox.find('a', href=re.compile(r"series.php")) is not None:
            # The original iterated over an undefined name here, so the
            # lookup always failed silently.  The guard above inspects
            # the listbox, so the same element is searched for the links.
            for series in listbox.findAll('a', href=re.compile(r"series.php\?seriesid=\d+")):
                series_url = 'http://' + self.host + '/' + series['href']
                seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
                storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
                found = False
                for position, a in enumerate(storyas, 1):
                    if a['href'] == ('viewstory.php?sid=' + story_id):
                        # Series name is the page title text with its
                        # first link removed, up to ' by['.
                        name = seriessoup.find('div', {'id' : 'pagetitle'})
                        name.find('a').extract()
                        self.setSeries(name.text.split(' by[')[0], position)
                        self.story.setMetadata('seriesUrl', series_url)
                        found = True
                        break
                if found:
                    break
    except Exception:
        # I find it hard to care if the series parsing fails
        logger.debug("Series parsing failed for %s" % url, exc_info=True)
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Download one chapter page and return its converted story text.

    The ``<div id="story">`` element of the parsed page is passed to
    ``self.utf8FromSoup``.  Raises ``exceptions.FailedToDownload`` when
    that element is missing.
    """
    logger.debug('Getting chapter text from: %s' % url)

    raw_page = self._fetchUrl(url)
    # br/hr are declared self-closing, otherwise soup eats those tags.
    parsed = bs.BeautifulSoup(raw_page, selfClosingTags=('br','hr'))
    story_div = parsed.find('div', {'id' : 'story'})

    if story_div is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    return self.utf8FromSoup(url, story_div)

View file

@ -0,0 +1,315 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
# Module-level hook: returns the adapter class implemented in this file.
# NOTE(review): presumably called by the adapter loader -- confirm against caller.
def getClass():
return EFPFanFicNet
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class EFPFanFicNet(BaseSiteAdapter):
def __init__(self, config, url):
    """Initialise the adapter for one www.efpfanfic.net story URL.

    Records decode order, placeholder credentials, the story id from the
    URL query, the normalized story URL, the site abbreviation and the
    date format used for parsing dates.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really
    # windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # An empty user name makes the site return no message at all.
    self.username = "NoneGiven"
    self.password = ""
    self.is_adult = False

    # URL validation guarantees the query is only sid=1234.
    sid_text = self.parsedUrl.query.split('=')[1]
    self.story.setMetadata('storyId', sid_text)

    # Normalized story URL.
    site = self.getSiteDomain()
    stored_id = self.story.getMetadata('storyId')
    self._setURL('http://' + site + '/viewstory.php?sid=' + stored_id)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'efp')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d/%m/%y"
# Returns the host name of the site this adapter handles.
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain, including a leading www only when the site uses one.
return 'www.efpfanfic.net'
@classmethod
def getSiteExampleURLs(cls):
    """Return an example story URL for this site."""
    pieces = ["http://", cls.getSiteDomain(), "/viewstory.php?sid=1234"]
    return "".join(pieces)
def getSiteURLPattern(self):
    """Return the regex source that a story URL on this site must match."""
    literal_part = "http://" + self.getSiteDomain() + "/viewstory.php?sid="
    numeric_id = r"\d+$"
    return re.escape(literal_part) + numeric_id
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
    """Return True when ``data`` contains one of the site's login prompts."""
    login_markers = (
        'Fai il login e leggi la storia!',
        'Questa storia presenta contenuti non adatti ai minori',
    )
    for marker in login_markers:
        if marker in data:
            return True
    return False
def performLogin(self, url):
    """Post the login form for this story and report success.

    Uses ``self.username`` / ``self.password`` when a password has been
    set on the adapter, otherwise the ``username`` / ``password`` config
    values.

    Args:
        url: the story URL being fetched; only used in the exception.

    Returns:
        True when the response does not contain the new-account link.

    Raises:
        exceptions.FailedToLogin: the response still contains the
            new-account link, which is treated as a failed login.
    """
    params = {}
    if self.password:
        params['penname'] = self.username
        params['password'] = self.password
    else:
        params['penname'] = self.getConfig("username")
        params['password'] = self.getConfig("password")
    params['cookiecheck'] = '1'
    params['submit'] = 'Invia'

    loginUrl = 'http://' + self.getSiteDomain() + '/user.php?sid='+self.story.getMetadata('storyId')
    logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                          params['penname']))

    d = self._fetchUrl(loginUrl, params)

    # register for new account link
    if '<a class="menu" href="newaccount.php">' in d:
        logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                          params['penname']))
        # The original had an unreachable `return False` after this raise.
        raise exceptions.FailedToLogin(url, params['penname'])
    return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
    """Fetch the story page and the author's listing and fill in metadata.

    Title, author and chapter list come from the story page (logging in
    first when the page asks for it).  Most other metadata is parsed
    from the story's ``storybloc`` entry on the author's story listing,
    which may span several pages.  Series membership is best-effort.

    Raises:
        exceptions.StoryDoesNotExist: the story page returned HTTP 404.
        exceptions.FailedToLogin: raised by ``performLogin``.
        exceptions.FailedToDownload: the story could not be located on
            the author's listing pages.
    """
    url = self.url
    logger.debug("URL: " + url)

    try:
        data = self._fetchUrl(url)
    except urllib2.HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        # Bare raise re-raises the HTTPError with its original traceback.
        raise

    if self.needToLoginCheck(data):
        # need to log in for this one.
        self.performLogin(url)
        data = self._fetchUrl(url)

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = bs.BeautifulSoup(data)

    ## Title
    a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
    self.story.setMetadata('title', stripHTML(a))

    # Find authorid and URL from... author url.
    a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
    self.story.setMetadata('authorId', a['href'].split('=')[1])
    self.story.setMetadata('authorUrl', 'http://'+self.host+'/'+a['href'])
    self.story.setMetadata('author', a.string)

    # Find the chapter selector
    select = soup.find('select', { 'name' : 'sid' } )
    if select is None:
        # no selector found, so it's a one-chapter story.
        self.chapterUrls.append((self.story.getMetadata('title'), url))
    else:
        for o in select.findAll('option', {'value' : re.compile(r'viewstory')}):
            chapter_url = u'http://%s/%s' % (self.getSiteDomain(), o['value'])
            # stripHTML just in case there's tags, like <i>, in chapter titles.
            self.chapterUrls.append((stripHTML(o), chapter_url))

    self.story.setMetadata('numChapters', len(self.chapterUrls))
    self.story.setMetadata('language', 'Italian')

    # normalize story URL to first chapter if later chapter URL was given:
    url = self.chapterUrls[0][1].replace('&i=1','')
    logger.debug("Normalizing to URL: "+url)
    self._setURL(url)
    self.story.setMetadata('storyId', self.parsedUrl.query.split('=')[1])

    # eFiction sites don't help us out a lot with their meta data
    # formatting, so it's a little ugly.
    story_link = re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r'&i=1$')
    next_link = re.compile(r'viewuser\.php\?uid=\d+&catid=&offset=')
    authurl = self.story.getMetadata('authorUrl')
    authsoup = None
    storyblock = None
    visited = set()

    ## author can have more than one page of stories.
    while storyblock is None:
        if authsoup is not None:
            # Story not on the previous page.  The last author link
            # with an offset should be the 'next' link.
            offsets = authsoup.findAll('a', href=next_link)
            if not offsets:
                raise exceptions.FailedToDownload("Error finding story %s on author page: %s" % (self.url, authurl))
            authurl = u'http://%s/%s' % (self.getSiteDomain(), offsets[-1]['href'])

        # A page we already looked at means the listing has been
        # exhausted; stop instead of looping forever.
        if authurl in visited:
            raise exceptions.FailedToDownload("Error finding story %s on author page: %s" % (self.url, authurl))
        visited.add(authurl)

        # Need author page for most of the metadata.
        logger.debug("fetching author page: (%s)"%authurl)
        authsoup = bs.BeautifulSoup(self._fetchUrl(authurl))

        for storya in authsoup.findAll('a', href=story_link):
            storyblock = storya.findParent('div', {'class':'storybloc'})
            if storyblock is not None:
                # Keep the first match; the original `continue` let a
                # later link overwrite it (possibly with None).
                break

    self.setDescription(url, storyblock.find('div', {'class':'introbloc'}))
    noteblock = storyblock.find('div', {'class':'notebloc'})
    notetext = ("%s" % noteblock).replace("<br />"," |")

    # <div class="notebloc">Autore: <a href="viewuser.php?uid=243036">Cendrillon89</a> | Pubblicata: 23/10/12 | Aggiornata: 30/10/12 | Rating: Arancione | Genere: Drammatico, Sentimentale | Capitoli: 10 | Completa<br />
    # Tipo di coppia: Het | Personaggi: Akasuna no Sasori , Akatsuki, Nuovo Personaggio | Note: OOC | Avvertimenti: Tematiche delicate<br />
    # Categoria: <a href="categories.php?catid=1&amp;parentcatid=1">Anime & Manga</a> > <a href="categories.php?catid=108&amp;parentcatid=108">Naruto</a> | Contesto: Naruto Shippuuden | Leggi le <a href="reviews.php?sid=1331275&amp;a=">3</a> recensioni</div>

    # The sample above links categories through categories.php?catid=,
    # so that form is accepted in addition to the browse.php one.
    cats = noteblock.findAll('a', href=re.compile(r'browse.php\?type=categories|categories.php\?catid='))
    for cat in cats:
        self.story.addToList('category', cat.string)

    for item in notetext.split("|"):
        if ":" in item:
            # Split on the first colon only: values may contain colons.
            (label, value) = item.split(":", 1)
            label = label.strip()
            value = value.strip()
        else:
            label = value = item.strip()

        if 'Pubblicata' in label:
            self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

        if 'Aggiornata' in label:
            self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        if label == "Completa":
            self.story.setMetadata('status', 'Completed')

        if label == "In corso":
            self.story.setMetadata('status', 'In-Progress')

        if 'Rating' in label:
            self.story.setMetadata('rating', value)

        if 'Personaggi' in label:
            for val in value.split(","):
                self.story.addToList('characters', val)

        if 'Genere' in label:
            for val in value.split(","):
                self.story.addToList('genre', val)

        if 'Coppie' in label:
            for val in value.split(","):
                self.story.addToList('ships', val)

        if 'Avvertimenti' in label:
            for val in value.split(","):
                if val != "None":
                    self.story.addToList('warnings', val)

        # 'extra' metadata for this adapter:
        if 'Tipo di coppia' in label:
            for val in value.split(","):
                self.story.addToList('type', val)

        if 'Note' in label:
            for val in value.split(","):
                if val != "None":
                    self.story.addToList('notes', val)

        if 'Contesto' in label:
            self.story.setMetadata('context', value)

    ## Note--efp doesn't provide word count.

    # Series membership is best-effort: any failure is logged and ignored.
    try:
        # Find Series name from series URL.
        a = soup.find('a', href=re.compile(r"viewseries.php\?ssid=\d+&i=1"))
        series_name = a.string
        series_url = 'http://'+self.host+'/'+a['href']

        seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
        # can't use ^viewstory...$ in case of higher rated stories with javascript href.
        storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+&i=1'))
        wanted = ('viewstory.php?sid='+self.story.getMetadata('storyId'))+'&i=1'
        for position, a in enumerate(storyas, 1):
            if a['href'] == wanted:
                self.setSeries(series_name, position)
                self.story.setMetadata('seriesUrl', series_url)
                break
    except Exception:
        # I find it hard to care if the series parsing fails
        logger.debug("Series parsing failed for %s" % url, exc_info=True)
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Download one chapter page and return its cleaned story text.

    Works on the ``<div class="storia">`` element: embedded ``head`` and
    ``o:p`` tags are removed, embedded ``html`` / ``body`` tags are
    renamed to ``div``, and any DOCTYPE declaration is stripped from the
    converted text.  Raises ``exceptions.FailedToDownload`` when the
    element is missing.
    """
    logger.debug('Getting chapter text from: %s' % url)

    parsed = bs.BeautifulSoup(self._fetchUrl(url))
    story_div = parsed.find('div', {'class' : 'storia'})
    if story_div is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    # remove any header and 'o:p' tags.
    heads = story_div.findAll("head")
    office_paras = story_div.findAll("o:p")
    for unwanted in heads + office_paras:
        unwanted.extract()

    # change any html and body tags to div.
    html_tags = story_div.findAll("html")
    body_tags = story_div.findAll("body")
    for wrapper in html_tags + body_tags:
        wrapper.name = 'div'

    # remove extra bogus doctype.
    #<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
    converted = self.utf8FromSoup(url, story_div)
    return re.sub(r"<!DOCTYPE[^>]+>", "", converted)

View file

@ -0,0 +1,255 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
# Module-level hook: returns the adapter class implemented in this file.
# NOTE(review): presumably called by the adapter loader -- confirm against caller.
def getClass():
return ErosnSapphoSycophantHexComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class ErosnSapphoSycophantHexComAdapter(BaseSiteAdapter):
def __init__(self, config, url):
    """Initialise the adapter for one erosnsappho.sycophanthex.com story URL.

    Records decode order, placeholder credentials, the story id from the
    URL query, the normalized story URL, the site abbreviation and the
    date format used for parsing dates.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really
    # windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # With an empty user name the site doesn't return any message.
    self.username = "NoneGiven"
    self.password = ""
    self.is_adult = False

    # URL validation guarantees the query is only sid=1234.
    id_from_query = self.parsedUrl.query.split('=')[1]
    self.story.setMetadata('storyId', id_from_query)

    # Normalized story URL.
    host_name = self.getSiteDomain()
    canonical_id = self.story.getMetadata('storyId')
    self._setURL('http://' + host_name + '/viewstory.php?sid=' + canonical_id)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'essph')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d/%m/%y"
# Returns the host name of the site this adapter handles.
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain, including a leading www only when the site uses one.
return 'erosnsappho.sycophanthex.com'
@classmethod
def getSiteExampleURLs(cls):
    """Return an example story URL for this site."""
    host_name = cls.getSiteDomain()
    return "".join(("http://", host_name, "/viewstory.php?sid=1234"))
def getSiteURLPattern(self):
    """Return the regex source that a story URL on this site must match."""
    fixed_prefix = "http://" + self.getSiteDomain() + "/viewstory.php?sid="
    escaped_prefix = re.escape(fixed_prefix)
    # Only a numeric story id may follow the prefix.
    return escaped_prefix + r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
# Fetches the story index page (with age-consent parameters when the
# adult flag is set), then fills in title, author, rating, chapter list,
# summary, labelled metadata and -- best-effort -- series membership.
# Raises StoryDoesNotExist on HTTP 404, AdultCheckRequired when the page
# shows a warning link and the adult flag is not set, and
# FailedToDownload when the site reports the story is not validated.
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):
# Weirdly, different sites use different warning numbers.
# If the title search below fails, there's a good chance
# you need a different number. print data at that point
# and see what the 'click here to continue' url says.
addurl = "&ageconsent=ok&warning=18"
else:
addurl=""
# index=1 makes sure we see the story chapter index. Some
# sites skip that for one-chapter stories.
url = self.url+'&index=1'+addurl
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
# Look for a 'continue' link that still carries a warning parameter.
m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
if m != None:
if self.is_adult or self.getConfig("is_adult"):
# We tried the default and still got a warning, so
# let's pull the warning number from the 'continue'
# link and reload data.
addurl = m.group(1)
# correct stupid &amp; error in url.
addurl = addurl.replace("&amp;","&")
url = self.url+'&index=1'+addurl
logger.debug("URL 2nd try: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
else:
raise exceptions.AdultCheckRequired(self.url)
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# print data
# Now go hunting for all the meta data and the chapter list.
## Title
pt = soup.find('div', {'id' : 'pagetitle'})
a = pt.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
self.story.setMetadata('title',stripHTML(a))
# Find authorid and URL from... author url.
a = pt.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Rating is the text between the first '(' and the next ')' in the page title.
# NOTE(review): raises IndexError when the title text has no '(' -- confirm this cannot happen.
rating=pt.text.split('(')[1].split(')')[0]
self.story.setMetadata('rating', rating)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
# addurl is appended so chapter fetches carry the same consent parameters.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
# eFiction sites don't help us out a lot with their meta data
# formatting, so it's a little ugly.
# utility method: d[k], or "" when the lookup raises for any reason.
def defaultGetattr(d,k):
try:
return d[k]
except:
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.findAll('span',{'class':'label'})
# Summary: walk back from the first label to the earliest preceding
# sibling, then collect nodes forward until a node whose class is 'label'.
# NOTE(review): labels[0] raises IndexError when the page has no label
# spans, and the nesting of the two while loops below cannot be told
# from this view -- verify against the properly indented source.
value = labels[0].previousSibling
svalue = ""
while value != None:
val = value
value = value.previousSibling
while not defaultGetattr(val,'class') == 'label':
svalue += str(val)
val = val.nextSibling
self.setDescription(url,svalue)
# Each label span is followed by its value as the next sibling node.
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
if 'Word count' in label:
self.story.setMetadata('numWords', value.split(' -')[0])
if 'Categories' in label:
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
for warning in warnings:
self.story.addToList('warnings',warning.string)
if 'Complete' in label:
if 'Yes' in value:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if 'Published' in label:
# Only the text before ' -' is parsed as the date.
self.story.setMetadata('datePublished', makeDate(stripHTML(value.split(' -')[0]), self.dateformat))
if 'Updated' in label:
# there's a stray [ at the end.
#value = value[0:-1]
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
# Series lookup is best-effort: every exception below is swallowed.
try:
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
# i counts the story's 1-based position within the series listing.
i=1
for a in storyas:
# skip 'report this' and 'TOC' links
if 'contact.php' not in a['href'] and 'index' not in a['href']:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
self.story.setMetadata('seriesUrl',series_url)
break
i+=1
except:
# I find it hard to care if the series parsing fails
pass
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Download one chapter page and return its converted story text.

    Passes the page's ``<div id="story">`` element to
    ``self.utf8FromSoup``; raises ``exceptions.FailedToDownload`` when
    the element is not present.
    """
    logger.debug('Getting chapter text from: %s' % url)

    markup = self._fetchUrl(url)
    # br/hr are declared self-closing, otherwise soup eats those tags.
    tree = bs.BeautifulStoneSoup(markup, selfClosingTags=('br','hr'))
    story_div = tree.find('div', {'id' : 'story'})

    if story_div is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    return self.utf8FromSoup(url, story_div)

View file

@ -0,0 +1,185 @@
# coding=utf-8
import re
import urllib2
import urlparse
from .. import BeautifulSoup
from base_adapter import BaseSiteAdapter, makeDate
from .. import exceptions
_SOURCE_CODE_ENCODING = 'utf-8'
# Module-level hook: returns the adapter class implemented in this file.
# NOTE(review): presumably called by the adapter loader -- confirm against caller.
def getClass():
return FanficHuAdapter
def _get_query_data(url):
components = urlparse.urlparse(url)
query_data = urlparse.parse_qs(components.query)
return dict((key, data[0]) for key, data in query_data.items())
class FanficHuAdapter(BaseSiteAdapter):
SITE_ABBREVIATION = 'ffh'
SITE_DOMAIN = 'fanfic.hu'
SITE_LANGUAGE = 'Hungarian'
BASE_URL = 'http://' + SITE_DOMAIN + '/merengo/'
VIEW_STORY_URL_TEMPLATE = BASE_URL + 'viewstory.php?sid=%s'
DATE_FORMAT = '%m/%d/%Y'
def __init__(self, config, url):
    """Initialise the adapter for one fanfic.hu story URL.

    Takes the story id from the ``sid`` query parameter, normalizes the
    story URL from it and records the site abbreviation and language.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # First value of the sid parameter is the story id.
    sid_values = urlparse.parse_qs(self.parsedUrl.query)['sid']
    story_id = sid_values[0]

    self.story.setMetadata('storyId', story_id)
    normalized_url = self.VIEW_STORY_URL_TEMPLATE % story_id
    self._setURL(normalized_url)

    fixed_metadata = (
        ('siteabbrev', self.SITE_ABBREVIATION),
        ('language', self.SITE_LANGUAGE),
    )
    for key, constant in fixed_metadata:
        self.story.setMetadata(key, constant)
def _customized_fetch_url(self, url, exception=None, parameters=None):
    """Fetch ``url`` and return the parsed page.

    When ``exception`` is given, an ``urllib2.HTTPError`` from the fetch
    is replaced by ``exception(self.url)``; otherwise the error from
    ``self._fetchUrl`` propagates unchanged.
    """
    try:
        data = self._fetchUrl(url, parameters)
    except urllib2.HTTPError:
        if not exception:
            # No customization requested: let the original error through.
            raise
        raise exception(self.url)
    return BeautifulSoup.BeautifulSoup(data)
# Returns the host name of the site this adapter handles, taken from
# the SITE_DOMAIN class constant.
@staticmethod
def getSiteDomain():
return FanficHuAdapter.SITE_DOMAIN
@classmethod
def getSiteExampleURLs(cls):
    """Return an example story URL built from the story URL template."""
    example_story_id = 1234
    return cls.VIEW_STORY_URL_TEMPLATE % example_story_id
def getSiteURLPattern(self):
    """Return the regex source that a story URL on this site must match."""
    # Drop the trailing two-character '%s' placeholder from the template.
    literal_prefix = self.VIEW_STORY_URL_TEMPLATE[:-2]
    return re.escape(literal_prefix) + r'\d+$'
def extractChapterUrlsAndMetadata(self):
    """Fill in the chapter list and metadata for the story.

    The chapter list comes from the chapter selection form on the story
    page; everything else is read from the story's table on the
    author's page.

    Raises:
        exceptions.StoryDoesNotExist: the story page title is only
            'írta' (after stripping spaces and colons).
        exceptions.FailedToDownload: the story is not listed on the
            author's page.
        exceptions.AdultCheckRequired: the rating is '18' and the adult
            flag is not set.
    """
    soup = self._customized_fetch_url(self.url + '&i=1')

    if soup.title.string.encode(_SOURCE_CODE_ENCODING).strip(' :') == 'írta':
        raise exceptions.StoryDoesNotExist(self.url)

    # NOTE(review): `.select` is presumably attribute-style lookup of the
    # form's <select> child, which is then called to list its options --
    # confirm against the bundled BeautifulSoup version.
    chapter_options = soup.find('form', action='viewstory.php').select('option')
    # Remove redundant "Fejezetek" option
    chapter_options.pop(0)

    # If there is still more than one entry remove chapter overview entry
    if len(chapter_options) > 1:
        chapter_options.pop(0)

    for option in chapter_options:
        url = urlparse.urljoin(self.url, option['value'])
        self.chapterUrls.append((option.string, url))

    author_url = urlparse.urljoin(self.BASE_URL, soup.find('a', href=lambda href: href and href.startswith('viewuser.php?uid='))['href'])
    soup = self._customized_fetch_url(author_url)

    # Locate this story's table on the author's page.
    story_id = self.story.getMetadata('storyId')
    for table in soup('table', {'class': 'mainnav'}):
        title_anchor = table.find('span', {'class': 'storytitle'}).a
        href = title_anchor['href']
        if href.startswith('javascript:'):
            # The real link is the last space-separated, quoted token.
            href = href.rsplit(' ', 1)[1].strip("'")
        query_data = _get_query_data(href)

        if query_data['sid'] == story_id:
            break
    else:
        # This should never happen, the story must be found on the author's
        # page.
        raise exceptions.FailedToDownload(self.url)

    self.story.setMetadata('title', title_anchor.string)

    rows = table('tr')

    anchors = rows[0].div('a')
    author_anchor = anchors[1]
    query_data = _get_query_data(author_anchor['href'])
    self.story.setMetadata('author', author_anchor.string)
    self.story.setMetadata('authorId', query_data['uid'])
    self.story.setMetadata('authorUrl', urlparse.urljoin(self.BASE_URL, author_anchor['href']))
    self.story.setMetadata('reviews', anchors[3].string)

    if self.getConfig('keep_summary_html'):
        self.story.setMetadata('description', self.utf8FromSoup(author_url, rows[1].td))
    else:
        self.story.setMetadata('description', ''.join(rows[1].td(text=True)))

    # From the fourth row on, cells come in (label, value) pairs.
    for row in rows[3:]:
        cells = row('td')

        for index in range(0, len(cells), 2):
            key = cells[index].b.string.encode(_SOURCE_CODE_ENCODING).strip(':')
            value_cell = cells[index + 1]
            try:
                value = value_cell.string.encode(_SOURCE_CODE_ENCODING)
            except AttributeError:
                # .string is None for this cell.
                value = None

            if key == 'Kategória':
                for anchor in value_cell('a'):
                    self.story.addToList('category', anchor.string)

            elif key == 'Szereplõk':
                if value_cell.string:
                    for name in value_cell.string.split(', '):
                        # 'characters' is the list key the other
                        # adapters use; the original wrote 'character'.
                        self.story.addToList('characters', name)

            elif key == 'Korhatár':
                if value != 'nem korhatáros':
                    self.story.setMetadata('rating', value)

            elif key == 'Figyelmeztetések':
                for b_tag in value_cell('b'):
                    self.story.addToList('warnings', b_tag.string)

            elif key == 'Jellemzõk':
                # Same None guard as for the characters cell.
                if value_cell.string:
                    for genre in value_cell.string.split(', '):
                        self.story.addToList('genre', genre)

            elif key == 'Fejezetek':
                self.story.setMetadata('numChapters', int(value))

            elif key == 'Megjelenés':
                self.story.setMetadata('datePublished', makeDate(value, self.DATE_FORMAT))

            elif key == 'Frissítés':
                self.story.setMetadata('dateUpdated', makeDate(value, self.DATE_FORMAT))

            elif key == 'Szavak':
                self.story.setMetadata('numWords', value)

            elif key == 'Befejezett':
                # 'Befejezett' means "finished" and 'Nem' means "no", so
                # 'Nem' marks an unfinished story; the original mapping
                # was inverted.
                # NOTE(review): confirm against a live story page.
                self.story.setMetadata('status', 'In-Progress' if value == 'Nem' else 'Completed')

    if self.story.getMetadata('rating') == '18':
        if not (self.is_adult or self.getConfig('is_adult')):
            raise exceptions.AdultCheckRequired(self.url)
def getChapterText(self, url):
    """Fetch a chapter page and return its converted story text.

    The element two levels above the ``viewstory.php`` form is used as
    the chapter container; every ``div`` found in it is removed before
    the container is handed to ``self.utf8FromSoup``.
    """
    page = self._customized_fetch_url(url)
    chapter_form = page.find('form', action='viewstory.php')
    container = chapter_form.parent.parent

    nested_divs = container('div')
    for unwanted in nested_divs:
        unwanted.extract()

    return self.utf8FromSoup(url, container)

View file

@ -0,0 +1,218 @@
# coding=utf-8
import re
import urllib2
import urlparse
from .. import BeautifulSoup
from base_adapter import BaseSiteAdapter, makeDate
from .. import exceptions
_SOURCE_CODE_ENCODING = 'utf-8'
# Module-level hook: returns the adapter class implemented in this file.
# NOTE(review): presumably called by the adapter loader -- confirm against caller.
def getClass():
return FanfictionCsodaidokHuAdapter
def _get_query_data(url):
components = urlparse.urlparse(url)
query_data = urlparse.parse_qs(components.query)
return dict((key, data[0]) for key, data in query_data.items())
# yields Tag _and_ NavigableString siblings from the given tag. The
# BeautifulSoup findNextSiblings() method for some reasons only returns either
# NavigableStrings _or_ Tag objects, not both.
def _yield_next_siblings(tag):
sibling = tag.nextSibling
while sibling:
yield sibling
sibling = sibling.nextSibling
class FanfictionCsodaidokHuAdapter(BaseSiteAdapter):
    """Site adapter for the Hungarian archive fanfiction.csodaidok.hu.

    The chapter list is read from the story page; title, author, rating
    and the remaining metadata are read from the story's entry on the
    author's page.
    """

    _SITE_DOMAIN = 'fanfiction.csodaidok.hu'
    _BASE_URL = 'http://' + _SITE_DOMAIN + '/'
    _VIEW_STORY_URL_TEMPLATE = _BASE_URL + 'viewstory.php?sid=%s'
    _VIEW_CHAPTER_URL_TEMPLATE = _VIEW_STORY_URL_TEMPLATE + '&chapter=%s'
    # Page title that is treated as "story does not exist".
    _STORY_DOES_NOT_EXIST_PAGE_TITLE = 'Cím: Szerző:'
    _DATE_FORMAT = '%Y.%m.%d'
    _SITE_LANGUAGE = 'Hungarian'

    def __init__(self, config, url):
        """Take the story id from the ``sid`` query parameter of ``url``
        and normalize the story URL."""
        BaseSiteAdapter.__init__(self, config, url)

        query_data = urlparse.parse_qs(self.parsedUrl.query)
        story_id = query_data['sid'][0]

        self.story.setMetadata('storyId', story_id)
        self._setURL(self._VIEW_STORY_URL_TEMPLATE % story_id)
        self.story.setMetadata('siteabbrev', self._SITE_DOMAIN)
        self.story.setMetadata('language', self._SITE_LANGUAGE)

    def _customized_fetch_url(self, url, exception=None, parameters=None):
        """Fetch ``url`` and return the parsed BeautifulSoup document.

        When ``exception`` is given, an urllib2.HTTPError raised by the
        fetch is replaced by ``exception(self.url)``. Otherwise whatever
        self._fetchUrl raises propagates unchanged.
        """
        if exception:
            try:
                data = self._fetchUrl(url, parameters)
            except urllib2.HTTPError:
                raise exception(self.url)
        else:
            # Just let self._fetchUrl throw the exception, don't catch and
            # customize it.
            data = self._fetchUrl(url, parameters)

        return BeautifulSoup.BeautifulSoup(data)

    @staticmethod
    def getSiteDomain():
        """Return the domain served by this adapter."""
        return FanfictionCsodaidokHuAdapter._SITE_DOMAIN

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL."""
        return cls._VIEW_STORY_URL_TEMPLATE % 1234

    def getSiteURLPattern(self):
        """Return the regular expression that valid story URLs match."""
        # [:-2] drops the trailing '%s' placeholder of the template.
        return re.escape(self._VIEW_STORY_URL_TEMPLATE[:-2]) + r'\d+$'

    def extractChapterUrlsAndMetadata(self):
        """Fill self.chapterUrls and the story metadata.

        Raises:
            exceptions.StoryDoesNotExist: the story page carries the
                "does not exist" title.
            exceptions.FailedToDownload: the story is not listed on the
                author's page.
            exceptions.AdultCheckRequired: the story is rated '18' and
                neither self.is_adult nor the 'is_adult' setting is set.
        """
        soup = self._customized_fetch_url(self.url + '&chapter=1')

        element = soup.find('div', id='pagetitle')
        page_title = ''.join(element(text=True)).encode(_SOURCE_CODE_ENCODING)
        if page_title == self._STORY_DOES_NOT_EXIST_PAGE_TITLE:
            raise exceptions.StoryDoesNotExist(self.url)

        author_url = urlparse.urljoin(self.url, element.a['href'])
        story_id = self.story.getMetadata('storyId')

        # Multi-chapter stories offer a <select name="chapter"> element.
        element = soup.find('select', {'name': 'chapter'})
        if element:
            for option in element('option'):
                title = option.string
                url = self._VIEW_CHAPTER_URL_TEMPLATE % (story_id, option['value'])
                self.chapterUrls.append((title, url))

        # Everything else is taken from the story's box on the author page.
        soup = self._customized_fetch_url(author_url)

        for listbox_div in soup('div', {'class': lambda klass: klass and 'listbox' in klass}):
            a = listbox_div.div.a
            if not a['href'].startswith('viewstory.php?sid='):
                continue

            query_data = _get_query_data(a['href'])
            if query_data['sid'] == story_id:
                break
        else:
            # No listbox belongs to this story.
            raise exceptions.FailedToDownload(self.url)

        title = ''.join(a(text=True))
        self.story.setMetadata('title', title)
        if not self.chapterUrls:
            # No chapter selector: the story consists of a single page.
            self.chapterUrls.append((title, self.url))

        element = a.findNextSibling('a')
        self.story.setMetadata('author', element.string)

        query_data = _get_query_data(element['href'])
        self.story.setMetadata('authorId', query_data['uid'])
        self.story.setMetadata('authorUrl', author_url)

        element = element.findNextSibling('span')
        rating = element.nextSibling.strip(' [')
        if rating.encode(_SOURCE_CODE_ENCODING) != 'Korhatár nélkül':
            self.story.setMetadata('rating', rating)

        # Only refuse adult stories when the user has not confirmed being
        # an adult; previously '18' rated stories could never be fetched.
        if rating == '18':
            if not (getattr(self, 'is_adult', False) or self.getConfig('is_adult')):
                raise exceptions.AdultCheckRequired(self.url)

        element = element.findNextSiblings('a')[1]
        self.story.setMetadata('reviews', element.string)

        sections = listbox_div('div', {'class': lambda klass: klass and klass in ['content', 'tail']})
        for section in sections:
            for element in section('span', {'class': 'classification'}):
                key = element.string.encode(_SOURCE_CODE_ENCODING).strip(' :')
                try:
                    value = element.nextSibling.string.encode(_SOURCE_CODE_ENCODING).strip()
                except AttributeError:
                    # The next sibling has no usable string.
                    value = None

                if key == 'Tartalom':
                    # Summary: everything up to the next classification span.
                    contents = []
                    keep_summary_html = self.getConfig('keep_summary_html')

                    for sibling in _yield_next_siblings(element):
                        if isinstance(sibling, BeautifulSoup.Tag):
                            if sibling.name == 'span' and sibling.get('class', None) == 'classification':
                                break

                            if keep_summary_html:
                                contents.append(self.utf8FromSoup(author_url, sibling))
                            else:
                                contents.append(''.join(sibling(text=True)))
                        else:
                            contents.append(sibling)

                    self.story.setMetadata('description', ''.join(contents))

                elif key == 'Kategória':
                    # Category links follow until the next span.
                    for sibling in element.findNextSiblings(['a', 'span']):
                        if sibling.name == 'span':
                            break

                        self.story.addToList('category', sibling.string)

                elif key == 'Szereplők':
                    for name in value.split(', '):
                        self.story.addToList('characters', name)

                elif key == 'Műfaj':
                    if value != 'Nincs':
                        self.story.setMetadata('genre', value)

                elif key == 'Figyelmeztetés':
                    if value != 'Nincs':
                        for warning in value.split(', '):
                            self.story.addToList('warnings', warning)

                elif key == 'Kihívás':
                    if value != 'Nincs':
                        self.story.setMetadata('challenge', value)

                elif key == 'Sorozat':
                    if value != 'Nincs':
                        self.story.setMetadata('series', value)

                elif key == 'Fejezetek':
                    self.story.setMetadata('numChapters', int(value))

                elif key == 'Befejezett':
                    # 'Befejezett' means "finished"; 'Nem' ("no") therefore
                    # marks a story that is still in progress.
                    self.story.setMetadata('status', 'In-Progress' if value == 'Nem' else 'Completed')

                elif key == 'Szavak száma':
                    self.story.setMetadata('numWords', value)

                elif key == 'Feltöltve':
                    self.story.setMetadata('datePublished', makeDate(value, self._DATE_FORMAT))

                elif key == 'Frissítve':
                    self.story.setMetadata('dateUpdated', makeDate(value, self._DATE_FORMAT))

    def getChapterText(self, url):
        """Return the chapter at ``url``: the optional notes block followed
        by the story text."""
        soup = self._customized_fetch_url(url)
        contents = []

        notes_div = soup.find('div', id='notes')
        if notes_div:
            contents.append(self.utf8FromSoup(url, notes_div))
            story_div = notes_div.findNextSibling('div')
        else:
            # Without notes the story <div> follows the jump menu.
            element = soup.find('div', {'class': 'jumpmenu'})
            story_div = element.findNextSibling('div')

        contents.append(self.utf8FromSoup(url, story_div.span))
        return ''.join(contents)

View file

@ -0,0 +1,329 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
from datetime import datetime
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from urllib import unquote_plus
import time
from .. import BeautifulSoup as bs
from .. import exceptions as exceptions
from ..htmlcleanup import stripHTML
from base_adapter import BaseSiteAdapter, makeDate
# Genre names recognised when parsing the fanfiction.net metadata line
# ('Hurt/Comfort' is rewritten to 'Hurt-Comfort' before the lookup).
ffnetgenres = [
    "Adventure",
    "Angst",
    "Crime",
    "Drama",
    "Family",
    "Fantasy",
    "Friendship",
    "General",
    "Horror",
    "Humor",
    "Hurt-Comfort",
    "Mystery",
    "Parody",
    "Poetry",
    "Romance",
    "Sci-Fi",
    "Spiritual",
    "Supernatural",
    "Suspense",
    "Tragedy",
    "Western",
]
class FanFictionNetSiteAdapter(BaseSiteAdapter):
    """Site adapter for www.fanfiction.net (m.fanfiction.net URLs are
    accepted and mapped to www)."""

    def __init__(self, config, url):
        """Derive the story id from the URL path and remember the URL that
        was originally requested."""
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','ffnet')

        # get storyId from url--url validation guarantees second part is storyId
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])

        # normalized story URL.
        self._setURL("https://"+self.getSiteDomain()
                     +"/s/"+self.story.getMetadata('storyId')+"/1/")

        # ffnet update emails have the latest chapter URL.
        # Frequently, when they arrive, not all the servers have the
        # latest chapter yet and going back to chapter 1 to pull the
        # chapter list doesn't get the latest. So save and use the
        # original URL given to pull chapter list & metadata.
        self.origurl = url
        if "https://m." in self.origurl:
            ## accept m(mobile)url, but use www.
            self.origurl = self.origurl.replace("https://m.","https://www.")

        self.opener.addheaders.append(('Referer',self.origurl))

    @staticmethod
    def getSiteDomain():
        """Return the canonical site domain."""
        return 'www.fanfiction.net'

    @classmethod
    def getAcceptDomains(cls):
        """Return every domain whose URLs this adapter accepts."""
        return ['www.fanfiction.net','m.fanfiction.net']

    @classmethod
    def getSiteExampleURLs(cls):
        """Return example story URLs, separated by spaces."""
        return "https://www.fanfiction.net/s/1234/1/ https://www.fanfiction.net/s/1234/12/ http://www.fanfiction.net/s/1234/1/Story_Title http://m.fanfiction.net/s/1234/1/"

    def getSiteURLPattern(self):
        """Return the regular expression that valid story URLs match."""
        return r"https?://(www|m)?\.fanfiction\.net/s/\d+(/\d+)?(/|/[^/]+)?/?$"

    def _fetchUrl(self, url, parameters=None):
        """Fetch ``url`` through the base class after an extra one second
        pause.

        ``parameters`` is passed through to the base implementation so the
        override stays call-compatible with it.
        """
        time.sleep(1.0) ## ffnet(and, I assume, fpcom) tends to fail
                        ## more if hit too fast. This is in
                        ## additional to what ever the
                        ## slow_down_sleep_time setting is.
        return BaseSiteAdapter._fetchUrl(self, url, parameters)

    def extractChapterUrlsAndMetadata(self):
        """Fill self.chapterUrls and the story metadata from the story page.

        Raises:
            exceptions.StoryDoesNotExist: on HTTP 404 or when the page
                reports that the story cannot be located.
            exceptions.FailedToDownload: when the page reports that the
                chapter was not found.
        """
        # fetch the chapter. From that we will get almost all the
        # metadata and chapter list
        url = self.origurl
        logger.debug("URL: "+url)

        # use BeautifulSoup HTML parser to make everything easier to find.
        try:
            data = self._fetchUrl(url)
            soup = bs.BeautifulSoup(data)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(url)
            raise

        if "Unable to locate story" in data:
            raise exceptions.StoryDoesNotExist(url)

        # some times "Chapter not found...", sometimes "Chapter text not found..."
        if "not found. Please check to see you are not using an outdated url." in data:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! 'Chapter not found. Please check to see you are not using an outdated url.'" % url)

        if self.getConfig('check_next_chapter'):
            try:
                ## ffnet used to have a tendency to send out update
                ## notices in email before all their servers were
                ## showing the update on the first chapter. It
                ## generates another server request and doesn't seem
                ## to be needed lately, so now default it to off.
                try:
                    chapcount = len(soup.find('select', { 'name' : 'chapter' } ).findAll('option'))
                except Exception:
                    # No chapter selector: a single chapter so far.
                    chapcount = 1
                tryurl = "https://%s/s/%s/%d/"%(self.getSiteDomain(),
                                                self.story.getMetadata('storyId'),
                                                chapcount+1)
                logger.debug('=Trying newer chapter: %s' % tryurl)
                newdata = self._fetchUrl(tryurl)
                if "not found. Please check to see you are not using an outdated url." \
                        not in newdata:
                    logger.debug('=======Found newer chapter: %s' % tryurl)
                    soup = bs.BeautifulSoup(newdata)
            except Exception:
                # Best effort only: keep the page we already have.
                logger.debug('=Check for newer chapter failed, continuing.')

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"^/u/\d+"))
        self.story.setMetadata('authorId',a['href'].split('/')[2])
        self.story.setMetadata('authorUrl','https://'+self.host+a['href'])
        self.story.setMetadata('author',a.string)

        ## Pull some additional data from html.

        ## ffnet shows category two ways
        ## 1) class(Book, TV, Game,etc) >> category(Harry Potter, Sailor Moon, etc)
        ## 2) cat1_cat2_Crossover
        ## For 1, use the second link.
        ## For 2, fetch the crossover page and pull the two categories from there.
        categories = soup.find('div',{'id':'pre_story_links'}).findAll('a',{'class':'xcontrast_txt'})
        if len(categories) > 1:
            # Strangely, the ones with *two* links are the
            # non-crossover categories. Each is in a category itself
            # of Book, Movie, etc.
            self.story.addToList('category',stripHTML(categories[1]))
        elif 'Crossover' in categories[0]['href']:
            caturl = "https://%s%s"%(self.getSiteDomain(),categories[0]['href'])
            catsoup = bs.BeautifulSoup(self._fetchUrl(caturl))
            found = False
            for a in catsoup.findAll('a',href=re.compile(r"^/crossovers/.+?/\d+/")):
                self.story.addToList('category',stripHTML(a))
                found = True
            if not found:
                # Fall back, used only when the crossover page yielded no
                # category links. I ran across a story with a Crossover
                # category link to a broken page once.
                # http://www.fanfiction.net/s/2622060/1/
                # Naruto + Harry Potter Crossover
                logger.info("Fall back category collection")
                for c in stripHTML(categories[0]).replace(" Crossover","").split(' + '):
                    self.story.addToList('category',c)

        a = soup.find('a', href=re.compile(r'https?://www\.fictionratings\.com/'))
        rating = a.string
        if 'Fiction' in rating: # if rating has 'Fiction ', strip that out for consistency with past.
            rating = rating[8:]

        self.story.setMetadata('rating',rating)

        # after Rating, the same bit of text containing id:123456 contains
        # Complete--if completed.
        gui_table1i = soup.find('div',{'id':'content_wrapper_inner'})

        self.story.setMetadata('title', stripHTML(gui_table1i.find('b'))) # title appears to be only(or at least first) bold tag in gui_table1i

        summarydiv = gui_table1i.find('div',{'style':'margin-top:2px'})
        if summarydiv:
            self.setDescription(url,stripHTML(summarydiv))

        grayspan = gui_table1i.find('span', {'class':'xgray xcontrast_txt'})
        metatext = stripHTML(grayspan).replace('Hurt/Comfort','Hurt-Comfort')
        metalist = metatext.split(" - ")

        # Rated: Fiction K - English - Words: 158,078 - Published: 02-04-11
        # Rated: Fiction T - English - Adventure/Sci-Fi - Naruto U. - Chapters: 22 - Words: 114,414 - Reviews: 395 - Favs: 779 - Follows: 835 - Updated: 03-21-13 - Published: 04-28-12 - id: 8067258

        # rating is obtained above more robustly.
        if metalist[0].startswith('Rated:'):
            metalist=metalist[1:]

        # next is assumed to be language.
        self.story.setMetadata('language',metalist[0])
        metalist=metalist[1:]

        # next might be genre.
        genrelist = metalist[0].split('/') # Hurt/Comfort already changed above.
        goodgenres=True
        for g in genrelist:
            if g.strip() not in ffnetgenres:
                goodgenres=False
        if goodgenres:
            self.story.extendList('genre',genrelist)
            metalist=metalist[1:]

        # Updated: <span data-xutime='1368059198'>5/8</span> - Published: <span data-xutime='1278984264'>7/12/2010</span>
        # Published: <span data-xutime='1384358726'>8m ago</span>
        dates = soup.findAll('span',{'data-xutime':re.compile(r'^\d+$')})
        if len(dates) > 1 :
            # updated get set to the same as published upstream if not found.
            self.story.setMetadata('dateUpdated',datetime.fromtimestamp(float(dates[0]['data-xutime'])))
        self.story.setMetadata('datePublished',datetime.fromtimestamp(float(dates[-1]['data-xutime'])))

        donechars = False
        while len(metalist) > 0:
            if metalist[0].startswith('Chapters') or metalist[0].startswith('Status') or metalist[0].startswith('id:') or metalist[0].startswith('Updated:') or metalist[0].startswith('Published:'):
                pass
            elif metalist[0].startswith('Reviews'):
                self.story.setMetadata('reviews',metalist[0].split(':')[1].strip())
            elif metalist[0].startswith('Favs:'):
                self.story.setMetadata('favs',metalist[0].split(':')[1].strip())
            elif metalist[0].startswith('Follows:'):
                self.story.setMetadata('follows',metalist[0].split(':')[1].strip())
            elif metalist[0].startswith('Words'):
                self.story.setMetadata('numWords',metalist[0].split(':')[1].strip())
            elif not donechars:
                # with 'pairing' support, pairings are bracketed w/o comma after
                # [Caspian X, Lucy Pevensie] Edmund Pevensie, Peter Pevensie
                self.story.extendList('characters',metalist[0].replace('[','').replace(']',',').split(','))

                l = metalist[0]
                while '[' in l:
                    self.story.addToList('ships',l[l.index('[')+1:l.index(']')].replace(', ','/'))
                    l = l[l.index(']')+1:]

                donechars = True
            metalist=metalist[1:]

        if 'Status: Complete' in metatext:
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        # Try the larger image first.
        try:
            img = soup.find('img',{'class':'lazy cimage'})
            self.setCoverImage(url,img['data-original'])
        except Exception:
            img = soup.find('img',{'class':'cimage'})
            if img:
                self.setCoverImage(url,img['src'])

        # Find the chapter selector
        select = soup.find('select', { 'name' : 'chapter' } )

        if select is None:
            # no selector found, so it's a one-chapter story.
            self.chapterUrls.append((self.story.getMetadata('title'),url))
        else:
            allOptions = select.findAll('option')
            for o in allOptions:
                url = u'https://%s/s/%s/%s/' % ( self.getSiteDomain(),
                                                 self.story.getMetadata('storyId'),
                                                 o['value'])
                # just in case there's tags, like <i> in chapter titles.
                title = u"%s" % o
                title = re.sub(r'<[^>]+>','',title)
                self.chapterUrls.append((title,url))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        return

    def getChapterText(self, url):
        """Return the text of the chapter at ``url``.

        Raises:
            exceptions.FailedToDownload: when the site reports an error or
                a required element is missing from the page.
        """
        time.sleep(4.0) ## ffnet(and, I assume, fpcom) tends to fail
                        ## more if hit too fast. This is in
                        ## additional to what ever the
                        ## slow_down_sleep_time setting is.
        logger.debug('Getting chapter text from: %s' % url)
        data = self._fetchUrl(url)

        if "Please email this error message in full to <a href='mailto:support@fanfiction.com'>support@fanfiction.com</a>" in data:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! FanFiction.net Site Error!" % url)

        # some ancient stories have body tags inside them that cause
        # soup parsing to discard the content. For story text we
        # don't care about anything before "<div role='main'", and
        # renaming the remaining body tags keeps the parser from
        # discarding content.
        divstr = "<div role='main'"
        if divstr not in data:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
        data = data[data.index(divstr):]
        # str.replace returns a new string; the result has to be kept.
        data = data.replace("<body","<notbody").replace("<BODY","<NOTBODY")

        soup = bs.BeautifulSoup(data)

        ## Remove the 'share' button.
        sharediv = soup.find('div', {'class' : 'a2a_kit a2a_default_style'})
        if sharediv:
            sharediv.extract()

        div = soup.find('div', {'id' : 'storytextp'})

        if div is None:
            logger.debug('div id=storytextp not found. data:%s'%data)
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)
def getClass():
    """Return the adapter class implemented by this module."""
    return FanFictionNetSiteAdapter

View file

@ -0,0 +1,202 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented by this module."""
    return FanFiktionDeAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class FanFiktionDeAdapter(BaseSiteAdapter):
    """Site adapter for www.fanfiktion.de (German)."""

    def __init__(self, config, url):
        """Derive the story id from the URL path and normalize the story URL."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["utf8",
                       "Windows-1252"] # 1252 is a superset of iso-8859-1.
                                       # Most sites that claim to be
                                       # iso-8859-1 (and some that claim to be
                                       # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--it is the path segment after /s/.
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/s/'+self.story.getMetadata('storyId') + '/1')

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','ffde')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d.%m.%Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain."""
        # The site domain. Does have www here, if it uses it.
        return 'www.fanfiktion.de'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return example story URLs, separated by spaces."""
        return "http://"+cls.getSiteDomain()+"/s/46ccbef30000616306614050 http://"+cls.getSiteDomain()+"/s/46ccbef30000616306614050/1 http://"+cls.getSiteDomain()+"/s/46ccbef30000616306614050/1/story-name"

    def getSiteURLPattern(self):
        """Return the regular expression that valid story URLs match."""
        return re.escape("http://"+self.getSiteDomain()+"/s/")+r"\w+(/\d+)?"

    def needToLoginCheck(self, data):
        """Return True when the page text ``data`` indicates that a login
        is required."""
        return ('Diese Geschichte wurde als entwicklungsbeeintr' in data
                or 'There is no such account on our website' in data
                or "Noch kein registrierter Benutzer?" in data)

    def performLogin(self,url):
        """Log in to the site.

        The credentials stored on the adapter are used when a password is
        set there, the configured 'username'/'password' otherwise.

        Returns:
            True when the login succeeded.

        Raises:
            exceptions.FailedToLogin: when the site does not confirm the
                login.
        """
        params = {}

        if self.password:
            params['nickname'] = self.username
            params['passwd'] = self.password
        else:
            params['nickname'] = self.getConfig("username")
            params['passwd'] = self.getConfig("password")
        params['savelogindata'] = '1'
        params['a'] = 'l'
        params['submit'] = 'Login...'

        loginUrl = 'https://ssl.fanfiktion.de/'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              params['nickname']))
        d = self._postUrl(loginUrl,params)

        if "Login erfolgreich" not in d : #Member Account
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                              params['nickname']))
            raise exceptions.FailedToLogin(url,params['nickname'])
        return True

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fill self.chapterUrls and the story metadata.

        Data comes from the story page and from the story's row in the
        listing on the author's page.

        Raises:
            exceptions.StoryDoesNotExist: on HTTP 404.
            exceptions.FailedToLogin: when a required login fails.
            exceptions.FailedToDownload: when the site only offers the
                story after an age verification.
        """
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            raise

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        if "Uhr ist diese Geschichte nur nach einer" in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Auserhalb der Zeit von 23:00 Uhr bis 04:00 Uhr ist diese Geschichte nur nach einer erfolgreichen Altersverifikation zuganglich.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.
        story_id = self.story.getMetadata('storyId')

        ## Title
        a = soup.find('a', href=re.compile(r'/s/'+story_id+"/"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        head = soup.find('div', {'class' : 'story-metadata-left-top'})
        a = head.find('a')
        self.story.setMetadata('authorId',a['href'].split('/')[2])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',stripHTML(a))

        # Find the chapters:
        for chapter in soup.find('select').findAll('option'):
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/s/'+story_id+'/'+chapter['value']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))
        self.story.setMetadata('language','German')

        #find metadata on the story page
        self.story.setMetadata('datePublished', makeDate(head.text.split('erstellt: ')[1].split('\n')[0], self.dateformat))
        self.story.setMetadata('dateUpdated', makeDate(head.text.split('letztes Update: ')[1].split('\n')[0], self.dateformat))
        for genre in head.text.split('&nbsp;&nbsp;&nbsp;')[3].split('/')[0].split(', '):
            self.story.addToList('genre',genre)

        if 'fertiggestellt' in head.text:
            self.story.setMetadata('status', 'Completed')
        else:
            # Same spelling as the other adapters use for this status.
            self.story.setMetadata('status', 'In-Progress')

        #find metadata on the author's page
        asoup = bs.BeautifulSoup(self._fetchUrl("http://"+self.getSiteDomain()+"?a=q&a1=v&t=nickdetailsstories&lbi=stories&ar=0&nick="+self.story.getMetadata('authorId')))
        tr=asoup.findAll('tr')
        # The first row is skipped. NOTE(review): when no row links to
        # this story the last row is used, as before -- confirm whether
        # that case should fail instead.
        row = None
        for row in tr[1:]:
            a = row.find('a')
            if '/s/'+story_id+'/1/' in a['href']:
                break

        self.setDescription(url,a['onmouseover'].split("', '")[1])

        td = row.findAll('td')
        self.story.addToList('category',stripHTML(td[1]))
        self.story.setMetadata('rating', stripHTML(td[4]))
        self.story.setMetadata('numWords', stripHTML(td[5]))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Return the text of the chapter at ``url``.

        Raises:
            exceptions.FailedToDownload: when the page has no element
                with id 'storytext'.
        """
        logger.debug('Getting chapter text from: %s' % url)
        time.sleep(0.5) ## ffde has "floodlock" protection
        soup = bs.BeautifulSoup(self._fetchUrl(url),
                                selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'storytext'})
        # Check before touching div; a missing element used to raise
        # AttributeError before this check was reached.
        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        for script in div.findAll('script'):
            script.extract()

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
# Copyright 2014 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from base_efiction_adapter import BaseEfictionAdapter
class FanNationAdapter(BaseEfictionAdapter):
    """Adapter for fannation.shades-of-moonlight.com, built on
    BaseEfictionAdapter; it only supplies the site specific values."""

    @staticmethod
    def getSiteDomain():
        """Return the site domain."""
        return 'fannation.shades-of-moonlight.com'

    def getPathToArchive(self):
        """Return the path value used for this site's archive."""
        return '/archive'

    def getSiteAbbrev(self):
        """Return the site abbreviation."""
        return 'fannation'
def getClass():
    """Return the adapter class implemented by this module."""
    return FanNationAdapter

View file

@ -0,0 +1,226 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import datetime
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import translit
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented by this module."""
    return FicBookNetAdapter
logger = logging.getLogger(__name__)
class FicBookNetAdapter(BaseSiteAdapter):
    """Site adapter for www.ficbook.net (Russian)."""

    def __init__(self, config, url):
        """Derive the story id from the URL path and normalize the story URL."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["utf8",
                       "Windows-1252"] # 1252 is a superset of iso-8859-1.
                                       # Most sites that claim to be
                                       # iso-8859-1 (and some that claim to be
                                       # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--it is the path segment after /readfic/.
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/readfic/'+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','fbn')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d %m %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain."""
        # The site domain. Does have www here, if it uses it.
        return 'www.ficbook.net'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return example story URLs, separated by spaces."""
        return "http://"+cls.getSiteDomain()+"/readfic/12345 http://"+cls.getSiteDomain()+"/readfic/93626/246417#part_content"

    def getSiteURLPattern(self):
        """Return the regular expression that valid story URLs match."""
        return re.escape("http://"+self.getSiteDomain()+"/readfic/")+r"\d+"

    def _getPartDate(self, soup):
        """Return the transliterated text of the 'part_added' date found
        in the parsed page ``soup``."""
        return translit.translit(stripHTML(soup.find('div', {'class' : 'part_added'}).find('span')))

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fill self.chapterUrls and the story metadata.

        Raises:
            exceptions.StoryDoesNotExist: on HTTP 404.
            exceptions.FailedToDownload: when the page has a chapter list
                without any chapter link.
        """
        url=self.url
        logger.debug("URL: "+url)
        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            raise

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.
        table = soup.find('td',{'width':'50%'})

        ## Title
        a = soup.find('h1')
        self.story.setMetadata('title',stripHTML(a))
        logger.debug("Title: (%s)"%self.story.getMetadata('title'))

        # Find authorid and URL from... author url.
        a = table.find('a')
        self.story.setMetadata('authorId',a.text) # Author's name is unique
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.text)
        logger.debug("Author: (%s)"%self.story.getMetadata('author'))

        # Find the chapters:
        chapters = soup.find('div', {'class' : 'part_list'})
        if chapters is not None:
            chapters=chapters.findAll('a', href=re.compile(r'/readfic/'+self.story.getMetadata('storyId')+r"/\d+#part_content$"))
            if not chapters:
                # Without this the dates below would be unbound.
                raise exceptions.FailedToDownload("Error downloading story: %s! No chapters found in chapter list!" % url)
            self.story.setMetadata('numChapters',len(chapters))
            for chapter in chapters:
                churl='http://'+self.host+chapter['href']
                self.chapterUrls.append((stripHTML(chapter),churl))
            # Published date comes from the first chapter page, updated
            # date from the last one; a single chapter is fetched once.
            pubdate = self._getPartDate(bs.BeautifulSoup(self._fetchUrl(self.chapterUrls[0][1])))
            if len(chapters) == 1:
                update = pubdate
            else:
                update = self._getPartDate(bs.BeautifulSoup(self._fetchUrl(self.chapterUrls[-1][1])))
        else:
            # No chapter list: the story page itself is the only chapter.
            self.chapterUrls.append((self.story.getMetadata('title'),url))
            self.story.setMetadata('numChapters',1)
            pubdate=self._getPartDate(soup)
            update=pubdate

        logger.debug("numChapters: (%s)"%self.story.getMetadata('numChapters'))

        # Dates without a comma are replaced by today's date.
        if ',' not in pubdate:
            pubdate=datetime.date.today().strftime(self.dateformat)
        if ',' not in update:
            update=datetime.date.today().strftime(self.dateformat)
        pubdate=pubdate.split(',')[0]
        update=update.split(',')[0]

        # Month names (transliterated and Cyrillic) to month numbers.
        fullmon = {"yanvarya":"01", "января":"01",
                   "fievralya":"02", "февраля":"02",
                   "marta":"03", "марта":"03",
                   "aprielya":"04", "апреля":"04",
                   "maya":"05", "мая":"05",
                   "iyunya":"06", "июня":"06",
                   "iyulya":"07", "июля":"07",
                   "avghusta":"08", "августа":"08",
                   "sentyabrya":"09", "сентября":"09",
                   "oktyabrya":"10", "октября":"10",
                   "noyabrya":"11", "ноября":"11",
                   "diekabrya":"12", "декабря":"12" }
        for (name,num) in fullmon.items():
            if name in pubdate:
                pubdate = pubdate.replace(name,num)
            if name in update:
                update = update.replace(name,num)

        self.story.setMetadata('dateUpdated', makeDate(update, self.dateformat))
        self.story.setMetadata('datePublished', makeDate(pubdate, self.dateformat))
        self.story.setMetadata('language','Russian')

        # Word count: count space separated words of the print version.
        pr=soup.find('a', href=re.compile(r'/printfic/\w+'))
        pr='http://'+self.host+pr['href']
        pr = bs.BeautifulSoup(self._fetchUrl(pr))
        pr=pr.findAll('div', {'class' : 'part_text'})
        words=0
        for part in pr:
            words=words+len(stripHTML(part).split(' '))
        self.story.setMetadata('numWords', str(words))

        fandoms = table.findAll('a', href=re.compile(r'/fanfiction/\w+'))
        for fandom in fandoms:
            self.story.addToList('category',fandom.string)
        if len(fandoms) > 1:
            # More than one fandom: tag the story as a crossover.
            self.story.addToList('genre', 'Кроссовер')

        # The /ratings/ links hold, in order: the rating, the genres and
        # the warnings. The genre that is not followed by a comma is the
        # last genre.
        meta=table.findAll('a', href=re.compile(r'/ratings/'))
        i=0
        for m in meta:
            if i == 0:
                self.story.setMetadata('rating', m.find('b').text)
                i=1
            elif i == 1:
                if "," not in m.nextSibling:
                    i=2
                self.story.addToList('genre', m.find('b').text)
            elif i == 2:
                self.story.addToList('warnings', m.find('b').text)

        if table.find('span', {'style' : 'color: green'}):
            self.story.setMetadata('status', 'Completed')
        else:
            # Same spelling as the other adapters use for this status.
            self.story.setMetadata('status', 'In-Progress')

        tags = table.findAll('b')
        for tag in tags:
            label = translit.translit(tag.text)
            if 'Piersonazhi:' in label or 'Персонажи:' in label:
                chars=tag.nextSibling.string.split(', ')
                for char in chars:
                    self.story.addToList('characters',char)
                break

        summary=soup.find('span', {'class' : 'urlize'})
        self.setDescription(url,summary)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Return the text of the chapter at ``url``.

        Raises:
            exceptions.FailedToDownload: when neither the 'public_beta'
                nor the 'public_beta_disabled' element is present.
        """
        logger.debug('Getting chapter text from: %s' % url)
        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        chapter = soup.find('div', {'class' : 'public_beta'})
        if chapter is None:
            chapter = soup.find('div', {'class' : 'public_beta_disabled'})

        if chapter is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,chapter)

View file

@ -0,0 +1,241 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on www.fictionalley.org.

    Accepted URLs have the form
    ``http://www.fictionalley.org/authors/<auth>/<id>.html`` and may point
    either at a chapter list page or directly at a chapter / one-shot text
    page; extractChapterUrlsAndMetadata() copes with both.
    """

    def __init__(self, config, url):
        """Record author id and story id taken from url.

        Raises exceptions.InvalidStoryURL when url does not match
        getSiteURLPattern().
        """
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','fa')
        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.is_adult=False

        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(),url)
        if not m:
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())

        self.story.setMetadata('authorId',m.group('auth'))
        self.story.setMetadata('storyId',m.group('id'))
        # normalized story URL.
        self._setURL(url)

    @staticmethod
    def getSiteDomain():
        """Return the host name this adapter handles."""
        return 'www.fictionalley.org'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return a space separated string of example story URLs."""
        return "http://"+cls.getSiteDomain()+"/authors/drt/DA.html http://"+cls.getSiteDomain()+"/authors/drt/JOTP01a.html"

    def getSiteURLPattern(self):
        """Return the regexp (with groups 'auth' and 'id') for story URLs."""
        # http://www.fictionalley.org/authors/drt/DA.html
        # http://www.fictionalley.org/authors/drt/JOTP01a.html
        return re.escape("http://"+self.getSiteDomain())+r"/authors/(?P<auth>[a-zA-Z0-9_]+)/(?P<id>[a-zA-Z0-9_]+)\.html"

    def _postFetchWithIAmOld(self,url):
        """Return the data for url, answering the age question if allowed.

        When is_adult is set on the adapter or in the configuration the
        age-answer form fields are POSTed to url; otherwise url is fetched
        normally.
        """
        if self.is_adult or self.getConfig("is_adult"):
            params={'iamold':'Yes',
                    'action':'ageanswer'}
            logger.info("Attempting to get cookie for %s" % url)
            ## posting on list doesn't work, but doesn't hurt, either.
            data = self._postUrl(url,params)
        else:
            data = self._fetchUrl(url)
        return data

    def extractChapterUrlsAndMetadata(self):
        """Collect the chapter list and story metadata.

        Fills self.chapterUrls and sets title, author, authorUrl,
        numChapters and, from the author's page, rating, genre, dates,
        numWords and the description.

        Raises exceptions.StoryDoesNotExist on HTTP 404 and
        exceptions.AdultCheckRequired when the age question is shown.
        """
        ## could be either chapter list page or one-shot text page.
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._postFetchWithIAmOld(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        chapterdata = data
        # If chapter list page, get the first chapter to look for adult check
        chapterlinklist = soup.findAll('a',{'class':'chapterlink'})
        if chapterlinklist:
            chapterdata = self._postFetchWithIAmOld(chapterlinklist[0]['href'])

        if "Are you over seventeen years old" in chapterdata:
            raise exceptions.AdultCheckRequired(self.url)

        if not chapterlinklist:
            # no chapter list, chapter URL: change to list link.
            # second a tag inside div breadcrumbs
            storya = soup.find('div',{'class':'breadcrumbs'}).findAll('a')[1]
            self._setURL(storya['href'])
            url=self.url
            logger.debug("Normalizing to URL: "+url)
            ## title's right there...
            self.story.setMetadata('title',stripHTML(storya))
            data = self._fetchUrl(url)
            soup = bs.BeautifulSoup(data)
            chapterlinklist = soup.findAll('a',{'class':'chapterlink'})
        else:
            ## still need title from somewhere.  If chapterlinklist,
            ## then chapterdata contains a chapter, find title the
            ## same way.
            chapsoup = bs.BeautifulSoup(chapterdata)
            storya = chapsoup.find('div',{'class':'breadcrumbs'}).findAll('a')[1]
            self.story.setMetadata('title',stripHTML(storya))
            del chapsoup

        del chapterdata

        ## authorid already set.
        ## <h1 class="title" align="center">Just Off The Platform II by <a href="http://www.fictionalley.org/authors/drt/">DrT</a></h1>
        authora=soup.find('h1',{'class':'title'}).find('a')
        self.story.setMetadata('author',authora.string)
        self.story.setMetadata('authorUrl',authora['href'])

        if len(chapterlinklist) == 1:
            # a single chapter is listed under the story title.
            self.chapterUrls.append((self.story.getMetadata('title'),chapterlinklist[0]['href']))
        else:
            # Find the chapters:
            for chapter in chapterlinklist:
                # just in case there's tags, like <i> in chapter titles.
                self.chapterUrls.append((stripHTML(chapter),chapter['href']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        ## Go scrape the rest of the metadata from the author's page.
        data = self._fetchUrl(self.story.getMetadata('authorUrl'))
        soup = bs.BeautifulSoup(data)

        # <dl><dt><a class = "Rid story" href = "http://www.fictionalley.org/authors/aafro_man_ziegod/TMH.html">
        # [Rid] The Magical Hottiez</a> by <a class = "pen_name" href = "http://www.fictionalley.org/authors/aafro_man_ziegod/">Aafro Man Ziegod</a> </small></dt>
        # <dd><small class = "storyinfo"><a href = "http://www.fictionalley.org/ratings.html" target = "_new">Rating:</a> PG-13 - Spoilers: PS/SS, CoS, PoA, GoF, QTTA, FB - 4264 hits - 5060 words<br />
        # Genre: Humor, Romance - Main character(s): None - Ships: None - Era: Multiple Eras<br /></small>
        # Chaos ensues after Witch Weekly, seeking to increase readers, decides to create a boyband out of five seemingly talentless wizards: Harry Potter, Draco Malfoy, Ron Weasley, Neville Longbottom, and Oliver "Toss Your Knickers Here" Wood.<br />
        # <small class = "storyinfo">Published: June 3, 2002 (between Goblet of Fire and Order of Phoenix) - Updated: June 3, 2002</small>
        # </dd></dl>

        storya = soup.find('a',{'href':self.story.getMetadata('storyUrl')})
        storydd = storya.findNext('dd')

        # Rating: PG - Spoilers: None - 2525 hits - 736 words
        # Genre: Humor - Main character(s): H, R - Ships: None - Era: Multiple Eras
        # Harry and Ron are back at it again! They reeeeeeally don't want to be back, because they know what's awaiting them. "VH1 Goes Inside..." is back! Why? 'Cos there are soooo many more couples left to pick on.
        # Published: September 25, 2004 (between Order of Phoenix and Half-Blood Prince) - Updated: September 25, 2004

        ## change to text and regexp find.
        metastr = stripHTML(storydd).replace('\n',' ').replace('\t',' ')

        m = re.match(r".*?Rating: (.+?) -.*?",metastr)
        if m:
            self.story.setMetadata('rating', m.group(1))

        m = re.match(r".*?Genre: (.+?) -.*?",metastr)
        if m:
            for g in m.group(1).split(','):
                self.story.addToList('genre',g)

        m = re.match(r".*?Published: ([a-zA-Z]+ \d\d?, \d\d\d\d).*?",metastr)
        if m:
            self.story.setMetadata('datePublished',makeDate(m.group(1), "%B %d, %Y"))

        m = re.match(r".*?Updated: ([a-zA-Z]+ \d\d?, \d\d\d\d).*?",metastr)
        if m:
            self.story.setMetadata('dateUpdated',makeDate(m.group(1), "%B %d, %Y"))

        m = re.match(r".*? (\d+) words Genre.*?",metastr)
        if m:
            self.story.setMetadata('numWords', m.group(1))

        for small in storydd.findAll('small'):
            small.extract() ## removes the <small> tags, leaving only the summary.
        self.setDescription(url,storydd)
        #self.story.setMetadata('description',stripHTML(storydd))

        return

    def getChapterText(self, url):
        """Fetch url and return the story text of that chapter.

        Raises exceptions.FailedToDownload when the page lacks the
        ``<!-- headerend -->`` marker or no story container can be found.
        """
        logger.debug('Getting chapter text from: %s' % url)

        data = self._fetchUrl(url)
        # find <!-- headerend --> & <!-- footerstart --> and
        # replaced with matching div pair for easier parsing.
        # Yes, it's an evil kludge, but what can ya do?  Using
        # something other than div prevents soup from pairing
        # our div with poor html inside the story text.
        data = data.replace('<!-- headerend -->','<crazytagstringnobodywouldstumbleonaccidently id="storytext">').replace('<!-- footerstart -->','</crazytagstringnobodywouldstumbleonaccidently>')

        # problems with some stories confusing Soup.  This is a nasty
        # hack, but it works.
        start = data.find("<crazytagstringnobodywouldstumbleonaccidently")
        if start < 0:
            # No header marker on the page: report it as a failed download
            # rather than leaking a ValueError from str.index().
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)
        data = data[start:]

        soup = bs.BeautifulStoneSoup(data,
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
        body = soup.findAll('body') ## some stories use a nested body and body
                                    ## tag, in which case we don't
                                    ## need crazytagstringnobodywouldstumbleonaccidently
                                    ## and use the second one instead.
        if len(body)>1:
            text = body[1]
        else:
            text = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storytext'})

        # Check before touching text: find() returns None when nothing
        # matched, and renaming None would raise AttributeError.
        if text is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        # force to be a div (avoids multiple body tags / the made-up tag).
        text.name='div'

        # not sure how, but we can get html, etc tags still in some
        # stories.  That breaks later updates because it confuses
        # epubutils.py
        for tag in text.findAll('head'):
            tag.extract()

        for tag in text.findAll('body') + text.findAll('html'):
            tag.name = 'div'

        return self.utf8FromSoup(url,text)
def getClass():
    """Return the adapter class defined by this module."""
    adapter_class = FictionAlleyOrgSiteAdapter
    return adapter_class

View file

@ -0,0 +1,178 @@
import re
import urllib2
import urlparse
from .. import BeautifulSoup
from ..BeautifulSoup import NavigableString
from base_adapter import BaseSiteAdapter, makeDate
from .. import exceptions
def getClass():
    """Return the adapter class defined by this module."""
    adapter_class = FictionManiaTVAdapter
    return adapter_class
def _get_query_data(url):
components = urlparse.urlparse(url)
query_data = urlparse.parse_qs(components.query)
return dict((key, data[0]) for key, data in query_data.items())
# yields Tag _and_ NavigableString siblings from the given tag. The
# BeautifulSoup findNextSiblings() method for some reasons only returns either
# NavigableStrings _or_ Tag objects, not both.
def _yield_next_siblings(tag):
sibling = tag.nextSibling
while sibling:
yield sibling
sibling = sibling.nextSibling
class FictionManiaTVAdapter(BaseSiteAdapter):
    """Adapter for fictionmania.tv.

    Every story is treated as a single chapter: metadata comes from the
    story's details page, the text from the ``readtextstory`` page.
    """

    SITE_ABBREVIATION = 'fmt'
    SITE_DOMAIN = 'fictionmania.tv'

    BASE_URL = 'http://' + SITE_DOMAIN + '/stories/'
    READ_TEXT_STORY_URL_TEMPLATE = BASE_URL + 'readtextstory.html?storyID=%s'
    DETAILS_URL_TEMPLATE = BASE_URL + 'details.html?storyID=%s'

    # 'Date Added' is tried with a four digit year first, then two digits.
    DATETIME_FORMAT = '%m/%d/%Y'
    ALTERNATIVE_DATETIME_FORMAT = '%m/%d/%y'

    # Details-table labels whose cell text is stored verbatim under the
    # given metadata name.
    _PLAIN_METADATA_KEYS = {
        'File Name': 'fileName',
        'File Size': 'fileSize',
        'Old Name': 'oldName',
        'New Name': 'newName',
        'Rating': 'rating',
        'Reads': 'readings',
    }

    def __init__(self, config, url):
        """Read the storyID query parameter and normalise the story URL."""
        BaseSiteAdapter.__init__(self, config, url)

        query_data = urlparse.parse_qs(self.parsedUrl.query)
        story_id = query_data['storyID'][0]

        self.story.setMetadata('storyId', story_id)
        self._setURL(self.READ_TEXT_STORY_URL_TEMPLATE % story_id)
        self.story.setMetadata('siteabbrev', self.SITE_ABBREVIATION)

        # Always single chapters, probably should use the Anthology feature to
        # merge chapters of a story
        self.story.setMetadata('numChapters', 1)

    def _customized_fetch_url(self, url, exception=None, parameters=None):
        """Fetch url and return it parsed as a BeautifulSoup document.

        When exception is given, a urllib2.HTTPError from the fetch is
        replaced by ``exception(self.url)``; otherwise the HTTPError
        propagates unchanged.
        """
        if exception:
            try:
                data = self._fetchUrl(url, parameters)
            except urllib2.HTTPError:
                raise exception(self.url)
        # Just let self._fetchUrl throw the exception, don't catch and
        # customize it.
        else:
            data = self._fetchUrl(url, parameters)

        return BeautifulSoup.BeautifulSoup(data)

    @staticmethod
    def getSiteDomain():
        """Return the host name this adapter handles."""
        return FictionManiaTVAdapter.SITE_DOMAIN

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL."""
        return cls.READ_TEXT_STORY_URL_TEMPLATE % 1234

    def getSiteURLPattern(self):
        """Return the regexp matching readtextstory/details story URLs."""
        return re.escape(self.BASE_URL) + r'(readtextstory|details)\.html\?storyID=\d+$'

    def extractChapterUrlsAndMetadata(self):
        """Read the details page table and store its rows as metadata.

        The 'Story Name-Title' row also registers the single chapter in
        self.chapterUrls.
        """
        url = self.DETAILS_URL_TEMPLATE % self.story.getMetadata('storyId')
        soup = self._customized_fetch_url(url)

        keep_summary_html = self.getConfig('keep_summary_html')
        for row in soup.find('table')('tr'):
            cells = row('td')
            # Skip rows that are not "<b>Label:</b> | value" pairs; they
            # carry no metadata and would otherwise crash the loop.
            if len(cells) < 2 or cells[0].b is None or not cells[0].b.string:
                continue
            key = cells[0].b.string.strip(':')
            try:
                value = cells[1].string
            except AttributeError:
                value = None

            if key in self._PLAIN_METADATA_KEYS:
                self.story.setMetadata(self._PLAIN_METADATA_KEYS[key], value)

            elif key == 'Story Name-Title':
                self.story.setMetadata('title', value)
                self.chapterUrls.append((value, self.url))

            elif key == 'Author':
                element = cells[1].a
                self.story.setMetadata('author', element.string)
                query_data = _get_query_data(element['href'])
                self.story.setMetadata('authorId', query_data['word'])
                self.story.setMetadata('authorUrl', urlparse.urljoin(url, element['href']))

            elif key == 'Date Added':
                try:
                    date = makeDate(value, self.DATETIME_FORMAT)
                except ValueError:
                    date = makeDate(value, self.ALTERNATIVE_DATETIME_FORMAT)
                self.story.setMetadata('datePublished', date)

            elif key == 'Other Key Names':
                # value is None when the cell holds no single string.
                if value:
                    for name in value.split(', '):
                        self.story.addToList('characters', name)

            elif key == 'Complete':
                # 'Completed' is the status value the other adapters store.
                self.story.setMetadata('status', 'Completed' if value == 'Complete' else 'In-Progress')

            elif key == 'Categories':
                for element in cells[1]('a'):
                    self.story.addToList('category', element.string)

            elif key == 'Key Words':
                for element in cells[1]('a'):
                    self.story.addToList('keyWords', element.string)

            elif key == 'Main Characters Age':
                element = cells[1].a
                self.story.setMetadata('mainCharactersAge', element.string)

            elif key == 'Synopsis':
                element = cells[1]
                # Replace td with div to avoid possible strange formatting in
                # the ebook later on
                element.name = 'div'

                if keep_summary_html:
                    self.story.setMetadata('description', unicode(element))
                else:
                    self.story.setMetadata('description', ''.join(element(text=True)))

    def getChapterText(self, url):
        """Fetch url and return the story text as an HTML string.

        Raises exceptions.FailedToDownload when the page has no ``<pre>``
        element.
        """
        soup = self._customized_fetch_url(url)
        element = soup.find('pre')
        if element is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)
        element.name = 'div'

        # The story's content is contained in a <pre> tag, probably taken 1:1
        # from the source text file. A simple replacement of all newline
        # characters with a break line tag should take care of formatting.

        # While wrapping in paragraphs would be possible, it's too much work,
        # I'd rather display the story 1:1 like it was found in the pre tag.
        content = unicode(element)
        content = content.replace('\n', '<br />')

        if self.getConfig('non_breaking_spaces'):
            content = content.replace(' ', '&nbsp;')

        return content

View file

@ -0,0 +1,194 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
import time
import json
from .. import BeautifulSoup as bs
#from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
class FictionPadSiteAdapter(BaseSiteAdapter):
    """Adapter for fictionpad.com.

    Story metadata and the chapter list are read from the JSON assigned to
    ``wordyarn.config.page`` in the story page's script.
    """

    def __init__(self, config, url):
        """Record the story id from url and normalise the story URL.

        Raises exceptions.InvalidStoryURL when url does not match
        getSiteURLPattern().
        """
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','fpad')
        self.dateformat = "%Y-%m-%dT%H:%M:%SZ"
        self.is_adult=False
        self.username = None
        self.password = None

        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(),url)
        if not m:
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())

        self.story.setMetadata('storyId',m.group('id'))

        # normalized story URL.
        self._setURL("https://"+self.getSiteDomain()
                     +"/author/"+m.group('author')
                     +"/stories/"+self.story.getMetadata('storyId'))

    @staticmethod
    def getSiteDomain():
        """Return the host name this adapter handles."""
        return 'fictionpad.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL."""
        return "https://fictionpad.com/author/Author/stories/1234/Some-Title"

    def getSiteURLPattern(self):
        """Return the regexp (with groups 'author' and 'id') for story URLs."""
        # http://fictionpad.com/author/Serdd/stories/4275
        return r"http(s)?://(www\.)?fictionpad\.com/author/(?P<author>[^/]+)/stories/(?P<id>\d+)"

    # <form method="post" action="/signin">
    #   <input name="authenticity_token" type="hidden" value="u+cfdXh46dRnwVnSlmE2B2BFmHgu760paqgBG6KQeos=" />
    #   <input type="hidden" name="remember" value="1">
    #   <strong class="help-start text-center">or with FictionPad</strong>
    #   <label class="control-label hidden-placeholder">Pseudonym or Email Address</label>
    #   <input name="login" class="input-block-level" type="text" placeholder="Pseudonym or Email Address" maxlength="50" required autofocus>
    #   <label class="control-label hidden-placeholder">Password</label>
    #   <input name="password" class="input-block-level" type="password" placeholder="Password" minlength="6" required>
    #   <button type="submit" class="btn btn-primary btn-block">Sign In</button>
    #   <p class="help-end">
    #     <a href="/passwordreset">Forgot your password?</a>
    #   </p>
    # </form>
    def performLogin(self):
        """Sign in with the adapter's or the configured credentials.

        Credentials set on the adapter win over the 'username'/'password'
        configuration values. Raises exceptions.FailedToLogin when the
        site reports an invalid login.
        """
        params = {}

        if self.password:
            params['login'] = self.username
            params['password'] = self.password
        else:
            params['login'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['remember'] = '1'

        loginUrl = 'http://' + self.getSiteDomain() + '/signin'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              params['login']))

        ## need to pull empty login page first to get authenticity_token
        soup = bs.BeautifulSoup(self._fetchUrl(loginUrl))
        params['authenticity_token']=soup.find('input', {'name':'authenticity_token'})['value']

        data = self._postUrl(loginUrl, params)

        if "Invalid email/pseudonym and password combination." in data:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                              params['login']))
            raise exceptions.FailedToLogin(loginUrl,params['login'])

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and fill in metadata and self.chapterUrls.

        Logs in first when the page says the story is mature. Raises
        exceptions.StoryDoesNotExist on HTTP 404.
        """
        # fetch the chapter.  From that we will get almost all the
        # metadata and chapter list
        url=self.url
        logger.debug("URL: "+url)

        # Only the network calls can raise HTTPError, so only they are
        # guarded.
        try:
            data = self._fetchUrl(url)
            if "This is a mature story. Please sign in to read it." in data:
                self.performLogin()
                data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(url)
            # bare raise keeps the original traceback.
            raise

        # Cut the object literal assigned to wordyarn.config.page out of
        # the script and quote its bare 'tables' key so json can load it.
        find = "wordyarn.config.page = "
        data = data[data.index(find)+len(find):]
        data = data[:data.index("</script>")]
        data = data[:data.rindex(";")]
        data = data.replace('tables:','"tables":')
        tables = json.loads(data)['tables']

        # looks like only one author per story allowed.
        author = tables['users'][0]
        story = tables['stories'][0]
        story_ver = tables['story_versions'][0]
        logger.debug("story:%s"%story)

        self.story.setMetadata('authorId',author['id'])
        self.story.setMetadata('author',author['display_name'])
        self.story.setMetadata('authorUrl','https://'+self.host+'/author/'+author['display_name']+'/stories')

        self.story.setMetadata('title',story_ver['title'])
        self.setDescription(url,story_ver['description'])

        if 'assets/story_versions/covers' not in story_ver['profile_image_url@2x']:
            self.setCoverImage(url,story_ver['profile_image_url@2x'])

        self.story.setMetadata('datePublished',makeDate(story['published_at'], self.dateformat))
        # NOTE(review): dateUpdated is also read from 'published_at' --
        # confirm whether the page data offers a separate update timestamp.
        self.story.setMetadata('dateUpdated',makeDate(story['published_at'], self.dateformat))

        self.story.setMetadata('followers',story['followers_count'])
        self.story.setMetadata('comments',story['comments_count'])
        self.story.setMetadata('views',story['views_count'])
        self.story.setMetadata('likes',int(story['likes'])) # no idea why they floated these.
        if 'dislikes' in story:
            self.story.setMetadata('dislikes',int(story['dislikes']))

        if story_ver['is_complete']:
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        self.story.setMetadata('rating', story_ver['maturity_level'])
        self.story.setMetadata('numWords', unicode(story_ver['word_count']))

        for i in tables['fandoms']:
            self.story.addToList('category',i['name'])

        for i in tables['genres']:
            self.story.addToList('genre',i['name'])

        for i in tables['characters']:
            self.story.addToList('characters',i['name'])

        for c in tables['chapters']:
            chtitle = "Chapter %d"%c['number']
            if c['title']:
                chtitle += " - %s"%c['title']
            self.chapterUrls.append((chtitle,c['body_url']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

    def getChapterText(self, url):
        """Return the chapter text for url wrapped in a story div.

        A chapter without a URL yields a placeholder sentence instead of
        a fetch.
        """
        logger.debug('Getting chapter text from: %s' % url)
        if not url:
            data = u"<em>This chapter has no text.</em>"
        else:
            data = self._fetchUrl(url)
        soup = bs.BeautifulSoup(u"<div id='story'>"+data+u"</div>")
        return self.utf8FromSoup(url,soup)
def getClass():
    """Return the adapter class defined by this module."""
    adapter_class = FictionPadSiteAdapter
    return adapter_class

View file

@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
import time
## They're from the same people and pretty much identical.
from adapter_fanfictionnet import FanFictionNetSiteAdapter
class FictionPressComSiteAdapter(FanFictionNetSiteAdapter):
    """Adapter for fictionpress.com, built on the fanfiction.net adapter.

    Only the site abbreviation, the domains and the URL pattern differ
    from the parent class.
    """

    def __init__(self, config, url):
        """Initialise the parent adapter, then set the site abbreviation."""
        FanFictionNetSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','fpcom')

    @staticmethod
    def getSiteDomain():
        """Return the primary host name of the site."""
        return 'www.fictionpress.com'

    @classmethod
    def getAcceptDomains(cls):
        """Return the host names accepted in story URLs."""
        accepted = ['www.fictionpress.com']
        accepted.append('m.fictionpress.com')
        return accepted

    @classmethod
    def getSiteExampleURLs(cls):
        """Return a space separated string of example story URLs."""
        examples = (
            "https://www.fictionpress.com/s/1234/1/",
            "https://www.fictionpress.com/s/1234/12/",
            "http://www.fictionpress.com/s/1234/1/Story_Title",
            "http://m.fictionpress.com/s/1234/1/",
        )
        return " ".join(examples)

    def getSiteURLPattern(self):
        """Return the regexp matching story URLs on the site."""
        pattern = r"https?://(www|m)?\.fictionpress\.com/s/\d+(/\d+)?(/|/[a-zA-Z0-9_-]+)?/?$"
        return pattern
def getClass():
    """Return the adapter class defined by this module."""
    adapter_class = FictionPressComSiteAdapter
    return adapter_class

View file

@ -0,0 +1,223 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
import time
import httplib, urllib
from .. import BeautifulSoup as bs
from .. import exceptions as exceptions
from ..htmlcleanup import stripHTML
from base_adapter import BaseSiteAdapter, makeDate
class FicwadComSiteAdapter(BaseSiteAdapter):
    """Adapter for ficwad.com.

    Accepts ``http://ficwad.com/story/<id>`` URLs, which may be either a
    story (chapter list) page or a single chapter page.
    """

    def __init__(self, config, url):
        """Record the story id taken from the URL path."""
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','fw')

        # get storyId from url--url validation guarantees second part is storyId
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])

        self.username = "NoneGiven"
        self.password = ""

    @staticmethod
    def getSiteDomain():
        """Return the host name this adapter handles."""
        return 'ficwad.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL."""
        return "http://ficwad.com/story/1234"

    def getSiteURLPattern(self):
        """Return the regexp matching story URLs on the site."""
        return re.escape(r"http://"+self.getSiteDomain())+r"/story/\d+?$"

    def performLogin(self,url):
        """Log in with the adapter's or the configured credentials.

        Credentials set on the adapter win over the 'username'/'password'
        configuration values. Returns True on success; raises
        exceptions.FailedToLogin (reporting url) when the site rejects
        the login.
        """
        params = {}

        if self.password:
            params['username'] = self.username
            params['password'] = self.password
        else:
            params['username'] = self.getConfig("username")
            params['password'] = self.getConfig("password")

        loginUrl = 'http://' + self.getSiteDomain() + '/account/login'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              params['username']))
        d = self._postUrl(loginUrl,params)

        if "Login attempt failed..." in d:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                              params['username']))
            raise exceptions.FailedToLogin(url,params['username'])

        return True

    def _fetchOr404(self, url):
        """Fetch url, turning an HTTP 404 into StoryDoesNotExist."""
        try:
            return self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

    def extractChapterUrlsAndMetadata(self):
        """Collect story metadata and the chapter list.

        A chapter URL is first normalised to its story page. Logs in when
        the page contains a blocked entry. Raises
        exceptions.StoryDoesNotExist for HTTP 404 or when the site serves
        its front page instead of a story.
        """
        # fetch the chapter.  From that we will get almost all the
        # metadata and chapter list
        url = self.url
        logger.debug("URL: "+url)

        data = self._fetchOr404(url)
        # non-existent/removed story urls get thrown to the front page.
        if "<h2>Welcome to FicWad</h2>" in data:
            raise exceptions.StoryDoesNotExist(self.url)
        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        h3 = soup.find('h3')
        storya = h3.find('a',href=re.compile(r"^/story/\d+$"))
        if storya : # if there's a story link in the h3 header, this is a chapter page.
            # normalize story URL on chapter list.
            self.story.setMetadata('storyId',storya['href'].split('/',)[2])
            url = "http://"+self.getSiteDomain()+storya['href']
            logger.debug("Normalizing to URL: "+url)
            self._setURL(url)
            soup = bs.BeautifulSoup(self._fetchOr404(url))

        # if blocked, attempt login.
        if soup.find("li",{"class":"blocked"}):
            if self.performLogin(url): # performLogin raises
                                       # FailedToLogin if it fails.
                soup = bs.BeautifulSoup(self._fetchUrl(url))

        # title - first h4 tag will be title.
        titleh4 = soup.find('h4')
        self.story.setMetadata('title', stripHTML(titleh4.a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"^/author/\d+"))
        self.story.setMetadata('authorId',a['href'].split('/')[2])
        self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
        self.story.setMetadata('author',a.string)

        # description
        storydiv = soup.find("div",{"id":"story"})
        self.setDescription(url,storydiv.find("blockquote",{'class':'summary'}).p)
        #self.story.setMetadata('description', storydiv.find("blockquote",{'class':'summary'}).p.string)

        # most of the meta data is here:
        metap = storydiv.find("p",{"class":"meta"})
        self.story.addToList('category',metap.find("a",href=re.compile(r"^/category/\d+")).string)

        # warnings
        # <span class="req"><a href="/help/38" title="Medium Spoilers">[!!] </a> <a href="/help/38" title="Rape/Sexual Violence">[R] </a> <a href="/help/38" title="Violence">[V] </a> <a href="/help/38" title="Child/Underage Sex">[Y] </a></span>
        spanreq = metap.find("span",{"class":"req"})
        if spanreq: # can be no warnings.
            for a in spanreq.findAll("a"):
                self.story.addToList('warnings',a['title'])

        ## perhaps not the most efficient way to parse this, using
        ## regexps for each rather than something more complex, but
        ## IMO, it's more readable and amenable to change.
        metastr = stripHTML(str(metap)).replace('\n',' ').replace('\t',' ')
        #print "metap: (%s)"%metastr

        m = re.match(r".*?Rating: (.+?) -.*?",metastr)
        if m:
            self.story.setMetadata('rating', m.group(1))

        m = re.match(r".*?Genres: (.+?) -.*?",metastr)
        if m:
            for g in m.group(1).split(','):
                self.story.addToList('genre',g)

        m = re.match(r".*?Characters: (.*?) -.*?",metastr)
        if m:
            for g in m.group(1).split(','):
                if g:
                    self.story.addToList('characters',g)

        m = re.match(r".*?Published: ([0-9/]+?) -.*?",metastr)
        if m:
            self.story.setMetadata('datePublished',makeDate(m.group(1), "%Y/%m/%d"))

        # Updated can have more than one space after it. <shrug>
        m = re.match(r".*?Updated: ([0-9/]+?) +-.*?",metastr)
        if m:
            self.story.setMetadata('dateUpdated',makeDate(m.group(1), "%Y/%m/%d"))

        m = re.match(r".*? - ([0-9/]+?) words.*?",metastr)
        if m:
            self.story.setMetadata('numWords',m.group(1))

        if metastr.endswith("Complete"):
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        # get the chapter list first this time because that's how we
        # detect the need to login.
        storylistul = soup.find('ul',{'id':'storylist'})
        if not storylistul:
            # no list found, so it's a one-chapter story.
            self.chapterUrls.append((self.story.getMetadata('title'),url))
        else:
            chapterlistlis = storylistul.findAll('li')
            for chapterli in chapterlistlis:
                # .get(): an <li> without a class attribute is simply not
                # blocked; indexing would raise KeyError for it.
                if "blocked" in chapterli.get('class', ''):
                    # paranoia check.  We should already be logged in by now.
                    raise exceptions.FailedToLogin(url,self.username)
                else:
                    #print "chapterli.h4.a (%s)"%chapterli.h4.a
                    self.chapterUrls.append((chapterli.h4.a.string,
                                             u'http://%s%s'%(self.getSiteDomain(),
                                                             chapterli.h4.a['href'])))
        #print "self.chapterUrls:%s"%self.chapterUrls
        self.story.setMetadata('numChapters',len(self.chapterUrls))

        return

    def getChapterText(self, url):
        """Fetch url and return the text found in ``div#storytext``.

        Raises exceptions.FailedToDownload when that div is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)
        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        span = soup.find('div', {'id' : 'storytext'})

        if span is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,span)
def getClass():
    """Return the adapter class defined by this module."""
    adapter_class = FicwadComSiteAdapter
    return adapter_class

View file

@ -0,0 +1,304 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
import cookielib as cl
import json
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class defined by this module."""
    adapter_class = FimFictionNetSiteAdapter
    return adapter_class
class FimFictionNetSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
    """Record the story id from the URL path and normalise the story URL."""
    BaseSiteAdapter.__init__(self, config, url)
    self.story.setMetadata('siteabbrev','fimficnet')

    # The story id is the path segment right after "/story/".
    path_parts = self.parsedUrl.path.split('/')
    self.story.setMetadata('storyId', path_parts[2])

    normalized = "".join(("http://", self.getSiteDomain(), "/story/",
                          self.story.getMetadata('storyId'), "/"))
    self._setURL(normalized)
    self.is_adult = False

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d %b %Y"
@staticmethod
def getSiteDomain():
    """Return the primary host name of the site."""
    domain = 'www.fimfiction.net'
    return domain
@classmethod
def getAcceptDomains(cls):
    """Return the host names accepted in story URLs."""
    # mobile.fimfiction.com isn't actually a valid domain, but we can still get the story id from URLs anyway
    accepted = []
    for tld in ('net', 'com'):
        accepted.append('www.fimfiction.' + tld)
        accepted.append('mobile.fimfiction.' + tld)
    return accepted
@classmethod
def getSiteExampleURLs(cls):
    """Return a space separated string of example story URLs."""
    examples = (
        "http://www.fimfiction.net/story/1234/story-title-here",
        "http://www.fimfiction.net/story/1234/",
        "http://www.fimfiction.com/story/1234/1/",
        "http://mobile.fimfiction.net/story/1234/1/story-title-here/chapter-title-here",
    )
    return " ".join(examples)
def getSiteURLPattern(self):
    """Return the regexp matching story URLs on any accepted host."""
    hosts = r"(www|mobile)\.fimfiction\.(net|com)"
    return r"https?://" + hosts + r"/story/\d+/?.*"
def extractChapterUrlsAndMetadata(self):
    """Collect the chapter list and story metadata for a fimfiction story.

    Two sources are combined: the JSON story API (author, status, rating,
    genres, word count, cover image, likes/views) and the HTML story
    index page (title, chapter links, warnings, description, dates,
    characters, groups, sequels and prequel).

    Raises:
        exceptions.StoryDoesNotExist: on HTTP 404, on the site's MySQL
            error page, or when the API reports "Invalid story id".
        exceptions.AdultCheckRequired: when the page asks for an age
            confirmation.
        exceptions.FailedToDownload: when a story password is needed and
            the 'fail_on_password' config option is set.
        exceptions.FailedToLogin: when a story password is needed
            otherwise.
    """
    if self.is_adult or self.getConfig("is_adult"):
        # Install an opener carrying a 'view_mature=true' cookie.
        # NOTE(review): presumably this is what lets the site serve
        # mature stories without the confirmation page -- confirm.
        cookieproc = urllib2.HTTPCookieProcessor()
        cookie = cl.Cookie(version=0, name='view_mature', value='true',
                           port=None, port_specified=False,
                           domain=self.getSiteDomain(), domain_specified=False, domain_initial_dot=False,
                           path='/story', path_specified=True,
                           secure=False,
                           expires=time.time()+10000,
                           discard=False,
                           comment=None,
                           comment_url=None,
                           rest={'HttpOnly': None},
                           rfc2109=False)
        cookieproc.cookiejar.set_cookie(cookie)
        self.opener = urllib2.build_opener(cookieproc)

    try:
        apiResponse = urllib2.urlopen("http://www.fimfiction.net/api/story.php?story=%s" % (self.story.getMetadata("storyId"))).read()
        apiData = json.loads(apiResponse)

        # Unfortunately, we still need to load the story index
        # page to parse the characters. And chapters, now, too.
        data = self.do_fix_blockquotes(self._fetchUrl(self.url))
    except urllib2.HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        raise

    if "Warning: mysql_fetch_array(): supplied argument is not a valid MySQL result resource" in data:
        raise exceptions.StoryDoesNotExist(self.url)

    # Checking for "/images/missing_story.png" was dropped on purpose: it
    # can cause problems if a missing story is referenced in a comment,
    # and shouldn't be needed anyway.

    if "This story has been marked as having adult content. Please click below to confirm you are of legal age to view adult material in your country." in data:
        raise exceptions.AdultCheckRequired(self.url)

    if self.password:
        # Submit the story password; the response replaces the page we
        # scrape below, so the soup is only built after this step.
        params = {'password': self.password}
        data = self.do_fix_blockquotes(self._postUrl(self.url,params))

    if "Enter the password the author set for this story to view it." in data:
        if self.getConfig('fail_on_password'):
            raise exceptions.FailedToDownload("%s requires story password and fail_on_password is true."%self.url)
        else:
            raise exceptions.FailedToLogin(self.url,"Story requires individual password",passwdonly=True)

    if "Invalid story id" in apiData.values():
        raise exceptions.StoryDoesNotExist(self.url)

    soup = bs.BeautifulSoup(data)
    storyMetadata = apiData["story"]

    ## Title
    # Taken from the page rather than the API; the API title was
    # previously found to be missing for some stories.
    a = soup.find('a', href=re.compile(r'^/story/'+self.story.getMetadata('storyId')))
    self.story.setMetadata('title',stripHTML(a))

    self.story.setMetadata("author", storyMetadata["author"]["name"])
    self.story.setMetadata("authorId", storyMetadata["author"]["id"])
    self.story.setMetadata("authorUrl", "http://%s/user/%s" % (self.getSiteDomain(), storyMetadata["author"]["name"]))

    ## Chapters
    # Taken from the page's chapter links; the API chapter list used to
    # need trimming to 'chapter_count' to drop 'bad' trailing entries.
    for chapter in soup.findAll('a',{'class':'chapter_link'}):
        self.chapterUrls.append((stripHTML(chapter), 'http://'+self.host+chapter['href']))
    self.story.setMetadata('numChapters',len(self.chapterUrls))

    # In the case of fimfiction.net, possible statuses are 'Completed', 'Incomplete', 'On Hiatus' and 'Cancelled'
    # For the sake of bringing it in line with the other adapters, 'Incomplete' becomes 'In-Progress'
    # and 'Complete' becomes 'Completed'. 'Cancelled' seems an important enough (not to mention more strictly true)
    # status to leave unchanged.
    # Nov2012 - 'On Hiatus' is now passed, too. It's easy now for users to change/remove if they want
    # with replace_metadata
    status = storyMetadata["status"].replace("Incomplete", "In-Progress").replace("Complete", "Completed")
    self.story.setMetadata("status", status)
    self.story.setMetadata("rating", storyMetadata["content_rating_text"])

    ## Warnings aren't included in the API.
    bottomli = soup.find('li',{'class':'bottom'})
    if bottomli:
        # the first span in bottom is the rating, obtained above.
        for warning in bottomli.findAll('span')[1:]:
            self.story.addToList('warnings',warning.string)

    categories = storyMetadata["categories"]
    for category in categories:
        if categories[category]:
            self.story.addToList("genre", category)

    self.story.setMetadata("numWords", str(storyMetadata["words"]))

    # fimfic is the first site with an explicit cover image.
    if "image" in storyMetadata:
        if "full_image" in storyMetadata:
            coverurl = storyMetadata["full_image"]
        else:
            coverurl = storyMetadata["image"]
        if coverurl.startswith('//'): # fix for img urls missing 'http:'
            coverurl = "http:"+coverurl
        self.setCoverImage(self.url,coverurl)

    # fimf has started including extra stuff inside the description div;
    # keep only what follows the first <hr />.  When there is no <hr />
    # the div is used as-is instead of failing on index().
    descdivstr = u"%s"%soup.find("div", {"class":"description"})
    hrstr=u"<hr />"
    if hrstr in descdivstr:
        descdivstr = u'<div class="description">'+descdivstr[descdivstr.index(hrstr)+len(hrstr):]
    self.setDescription(self.url,descdivstr)

    # Can't trust dates from API anymore I'm told, so they are read from
    # the chapter list on the page instead.
    oldestChapter = None
    newestChapter = None
    self.newestChapterNum = None # save for comparing during update.
    # Scan all chapters to find the oldest and newest, on
    # FiMFiction it's possible for authors to insert new chapters
    # out-of-order or change the dates of earlier ones by editing
    # them--That WILL break epub update.
    for index, chapterDate in enumerate(soup.findAll('span', {'class':'date'})):
        # Strip ordinal suffixes ("1st" -> "1") so the date format parses.
        date=re.sub(r"(\d+)(st|nd|rd|th)",r"\1",chapterDate.contents[1].strip())
        chapterDate = makeDate(date,self.dateformat)
        if oldestChapter is None or chapterDate < oldestChapter:
            oldestChapter = chapterDate
        if newestChapter is None or chapterDate > newestChapter:
            newestChapter = chapterDate
            self.newestChapterNum = index

    self.story.setMetadata("dateUpdated", newestChapter)

    pubdatetag = soup.find('span', {'class':'date_approved'})
    if pubdatetag is None:
        self.story.setMetadata("datePublished", oldestChapter)
    else:
        pubdateraw = pubdatetag('span')[1].text
        datestripped=re.sub(r"(\d+)(st|nd|rd|th)",r"\1",pubdateraw.strip())
        pubDate = makeDate(datestripped,self.dateformat)
        self.story.setMetadata("datePublished", pubDate)

    chars = soup.find("div", {"class":"inner_data"})
    if chars is not None:
        character_icons = chars.findAll("img", {"class":"character_icon"})
    else:
        character_icons = []
    # fimfic stopped putting the char name on or around the char
    # icon now for some reason. Pull it from the image name with
    # some heuristics.
    for character in [character_icon["src"] for character_icon in character_icons]:
        # //static.fimfiction.net/images/characters/twilight_sparkle.png
        # 5th split /, remove last four, replace _, capitalize every word(title())
        char = character.split('/')[5][:-4].replace('_',' ').title()
        if char == 'Oc':
            char = "OC"
        if char == 'Cmc':
            char = "Cutie Mark Crusaders"
        self.story.addToList("characters", char)

    # extra site specific metadata
    extralist = ["likes","dislikes","views","total_views","short_description"]
    for metakey in extralist:
        if metakey in storyMetadata:
            value = storyMetadata[metakey]
            if not isinstance(value,basestring):
                value = unicode(value)
            self.story.setMetadata(metakey, value)

    ## Groups and sequels code from FaceDeer
    allGroupLists = soup.findAll('ul', {'id':'story_group_list'})
    for groupList in allGroupLists:
        for groupName in groupList.findAll('a', {'href':re.compile('^/group/')}):
            self.story.addToList("groupsUrl", 'http://'+self.host+groupName["href"])
            self.story.addToList("groups",stripHTML(groupName).replace(',', ';'))

    sequelStoryHeader = soup.find('h1', {'class':'header-stories'}, text="Sequels")
    if sequelStoryHeader is not None:
        sequelContainer = sequelStoryHeader.parent.parent
        for sequel in sequelContainer.findAll('a', {'class':'story_link'}):
            self.story.addToList("sequelsUrl", 'http://'+self.host+sequel["href"])
            self.story.addToList("sequels", stripHTML(sequel).replace(',', ';'))

    #The link to the prequel is embedded in the description text, so erring
    #on the side of caution and wrapping this whole thing in a try block.
    #If anything goes wrong this probably wasn't a valid prequel link.
    try:
        description = soup.find('div', {'class':'description'})
        firstHR = description.find("hr")
        nextSib = firstHR.nextSibling
        if "This story is a sequel to" in nextSib.string:
            link = nextSib.nextSibling
            if link.name == "a":
                self.story.setMetadata("prequelUrl", 'http://'+self.host+link["href"])
                self.story.setMetadata("prequel", stripHTML(link))
    except Exception:
        # Best effort only; a missing prequel is the normal case.
        logger.debug("No prequel link found for %s" % self.url)
def hookForUpdates(self,chaptercount):
    """Trim previously downloaded chapters from the newest-dated one on.

    ``self.newestChapterNum`` is the index of the chapter with the most
    recent date.  If the existing chapter list reaches that index,
    everything from it onward is discarded.

    Args:
        chaptercount: chapter count supplied by the caller; returned
            unchanged when there is no old chapter list at all.

    Returns:
        The number of old chapters kept.
    """
    if self.oldchapters is None:
        # Nothing to trim; len(None) would raise.
        return chaptercount

    # newestChapterNum stays None when no chapter dates were found (or is
    # absent if metadata was never extracted); skip trimming in that case
    # instead of failing on None+1.
    newest = getattr(self, 'newestChapterNum', None)
    if self.oldchapters and newest is not None and len(self.oldchapters) > newest:
        print("Existing epub has %s chapters\nNewest chapter is %s. Discarding old chapters from there on."%(len(self.oldchapters), newest+1))
        self.oldchapters = self.oldchapters[:newest]
    return len(self.oldchapters)
def do_fix_blockquotes(self,data):
    """Swap ``<p><blockquote>`` nesting to ``<blockquote><p>``.

    Only applied when the 'fix_fimf_blockquotes' config option is set;
    otherwise ``data`` is returned untouched.
    """
    if not self.getConfig('fix_fimf_blockquotes'):
        return data

    # <p class="double"><blockquote>
    # </blockquote></p>
    # include > in re groups so there's always something in the group.
    opening = re.compile(r'<p([^>]*>\s*)<blockquote([^>]*>)')
    closing = re.compile(r'</blockquote(>\s*)</p>')
    reopened = opening.sub(r'<blockquote\2<p\1', data)
    return closing.sub(r'</p\1</blockquote>', reopened)
def getChapterText(self, url):
    """Fetch one chapter page and return its 'chapter_content' div as text.

    Raises exceptions.FailedToDownload when that div is not present.
    """
    logger.debug('Getting chapter text from: %s' % url)

    html = self.do_fix_blockquotes(self._fetchUrl(url))
    page = bs.BeautifulSoup(html, selfClosingTags=('br','hr'))
    content = page.find('div', {'class' : 'chapter_content'})

    if content is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    return self.utf8FromSoup(url,content)

View file

@ -0,0 +1,288 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented for finestories.com."""
    adapter_class = FineStoriesComAdapter
    return adapter_class
# Class name has to be unique. Our convention is the camel-cased
# site name with 'Adapter' appended; a leading 'www' is skipped.
class FineStoriesComAdapter(BaseSiteAdapter):
def __init__(self, config, url):
    """Record login defaults, story id, normalized URL and date format.

    The story id comes from the URL path (``/s/<id>[:chapter]``) or, for
    ``storyInfo.php?id=<id>`` URLs, from the query string.
    """
    BaseSiteAdapter.__init__(self, config, url)

    self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult=False

    # get storyId from url
    path_piece = self.parsedUrl.path.split('/')[2]
    self.story.setMetadata('storyId', path_piece.split(':')[0])
    if 'storyInfo' in self.story.getMetadata('storyId'):
        query_value = self.parsedUrl.query.split('=')[1]
        self.story.setMetadata('storyId', query_value)

    # normalized story URL.
    normalized = 'http://' + self.getSiteDomain() + '/s/storyInfo.php?id='+self.story.getMetadata('storyId')
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev','fnst')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%Y-%m-%d"
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'finestories.com'
@classmethod
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/s/1234 http://"+self.getSiteDomain()+"/s/1234:4010 http://"+self.getSiteDomain()+"/library/storyInfo.php?id=1234"
def getSiteURLPattern(self):
    """Return the regex source that a finestories.com story URL must match.

    Matches ``/s/<id>``, ``/s/<id>:<chapter>`` (optionally ``;<n>``) and
    ``/library/storyInfo.php?id=<id>`` under the site domain.
    """
    # The '.' of 'storyInfo.php' is escaped so it only matches a literal
    # dot rather than any character.
    return re.escape("http://"+self.getSiteDomain())+r"/(s|library)?/(storyInfo\.php\?id=)?\d+(:\d+)?(;\d+)?$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
    """Return True when ``data`` contains one of the site's login prompts."""
    login_markers = ('Free Registration',
                     "Invalid Password!",
                     "Invalid User Name!")
    return any(marker in data for marker in login_markers)
def performLogin(self, url):
    """Post credentials to the site's login page.

    Credentials set on the adapter are used when ``self.password`` is
    non-empty; otherwise the 'username'/'password' config values are.

    Args:
        url: the story URL being fetched; only used in the error raised
            on failure.

    Returns:
        True when the response contains "My Account".

    Raises:
        exceptions.FailedToLogin: when the response lacks "My Account".
    """
    params = {}

    if self.password:
        params['theusername'] = self.username
        params['thepassword'] = self.password
    else:
        params['theusername'] = self.getConfig("username")
        params['thepassword'] = self.getConfig("password")
    params['rememberMe'] = '1'
    params['page'] = 'http://'+self.getSiteDomain()+'/'
    params['submit'] = 'Login'

    loginUrl = 'http://' + self.getSiteDomain() + '/login.php'
    logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                         params['theusername']))

    d = self._fetchUrl(loginUrl, params)

    if "My Account" not in d : #Member Account
        logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                         params['theusername']))
        # The original 'return False' after this raise was unreachable.
        raise exceptions.FailedToLogin(url,params['theusername'])
    return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
    """Collect the chapter list and story metadata for a finestories story.

    Title, author and chapter links come from the story info page.  The
    remaining metadata (categories, word count, series, universe,
    description, rating, tags, dates, status) comes from the story's
    entry on the author's page, which is paged through via '&skip='.

    Raises:
        exceptions.StoryDoesNotExist: on HTTP 404 for the story page.
        exceptions.FailedToLogin: via performLogin when login fails.
        exceptions.FailedToDownload: when the site denies access, or the
            story cannot be found on the author's page.
    """
    url = self.url
    logger.debug("URL: "+url)

    try:
        data = self._fetchUrl(url)
    except urllib2.HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        raise

    if self.needToLoginCheck(data):
        # need to log in for this one.
        self.performLogin(url)
        data = self._fetchUrl(url)

    if "Access denied. This story has not been validated by the adminstrators of this site." in data:
        raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = bs.BeautifulSoup(data)

    # Now go hunting for all the meta data and the chapter list.

    ## Title
    a = soup.find('a', href=re.compile(r'/s/'+self.story.getMetadata('storyId')+"$"))
    self.story.setMetadata('title',stripHTML(a))

    # Find authorid and URL from... author url.
    a = soup.find('a', href=re.compile(r"/a/\w+"))
    self.story.setMetadata('authorId',a['href'].split('/')[2])
    self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
    self.story.setMetadata('author',a.text)

    # Find the chapters:
    chapters = soup.findAll('a', href=re.compile(r'/s/'+self.story.getMetadata('storyId')+r":\d+$"))
    if len(chapters) != 0:
        for chapter in chapters:
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+chapter['href']))
    else:
        # No chapter links: treat the story page itself as the one chapter.
        self.chapterUrls.append((self.story.getMetadata('title'),'http://'+self.host+'/s/'+self.story.getMetadata('storyId')))

    self.story.setMetadata('numChapters',len(self.chapterUrls))

    # surprisingly, the detailed page does not give enough details, so go to author's page
    story_href = '/s/'+self.story.getMetadata('storyId')
    skip=0
    lc2 = None
    while lc2 is None:
        asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')+"&skip="+str(skip)))

        cells = asoup.findAll('td', {'class' : 'lc2'})
        if not cells:
            # An empty page means we ran past the author's last story;
            # without this check the same page would be fetched forever.
            raise exceptions.FailedToDownload("Story %s not found on author page %s" % (self.url, self.story.getMetadata('authorUrl')))
        for cell in cells:
            if cell.find('a')['href'] == story_href:
                lc2 = cell
                break
        else:
            # Not on this page; move on to the next ten entries.
            skip=skip+10

    for cat in lc2.findAll('div', {'class' : 'typediv'}):
        self.story.addToList('category',cat.text)

    self.story.setMetadata('numWords', lc2.findNext('td', {'class' : 'num'}).text)

    lc4 = lc2.findNext('td', {'class' : 'lc4'})

    # Series and universe are optional; failures here are not fatal.
    try:
        a = lc4.find('a', href=re.compile(r"/library/show_series.php\?id=\d+"))
        # The series position is the text between the parentheses.
        i = a.parent.text.split('(')[1].split(')')[0]
        self.setSeries(a.text, i)
        self.story.setMetadata('seriesUrl','http://'+self.host+a['href'])
    except Exception:
        logger.debug("No series information found for %s" % self.url)

    try:
        a = lc4.find('a', href=re.compile(r"/library/universe.php\?id=\d+"))
        self.story.addToList("category",a.text)
    except Exception:
        logger.debug("No universe information found for %s" % self.url)

    for a in lc4.findAll('span', {'class' : 'help'}):
        a.extract()

    self.setDescription('http://'+self.host+'/s/'+self.story.getMetadata('storyId'),lc4.text.split('[More Info')[0])

    for b in lc4.findAll('b'):
        label = b.text
        value = b.nextSibling

        if 'For Age' in label:
            self.story.setMetadata('rating', value)

        if 'Tags' in label:
            for genre in value.split(', '):
                self.story.addToList('genre',genre)

        if 'Posted' in label:
            # Posted date doubles as the update date until a later label
            # (Concluded / Updated / Last Activity) overrides it.
            self.story.setMetadata('datePublished', makeDate(stripHTML(value.split('/ (')[0]), self.dateformat))
            self.story.setMetadata('dateUpdated', makeDate(stripHTML(value.split('/ (')[0]), self.dateformat))

        if 'Concluded' in label:
            self.story.setMetadata('dateUpdated', makeDate(stripHTML(value.split('/ (')[0]), self.dateformat))

        if 'Updated' in label:
            self.story.setMetadata('dateUpdated', makeDate(stripHTML(value.split('/ (')[0]), self.dateformat))

    # A span of class 'ab' marks the story as still in progress.
    status = lc4.find('span', {'class' : 'ab'})
    if status is not None:
        self.story.setMetadata('status', 'In-Progress')
        if "Last Activity" in status.text:
            self.story.setMetadata('dateUpdated', makeDate(status.text.split('Activity: ')[1].split(')')[0], self.dateformat))
    else:
        self.story.setMetadata('status', 'Completed')
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Fetch a chapter, stitch its continuation pages together and clean it.

    Args:
        url: URL of the first page of the chapter.

    Returns:
        Whatever ``self.utf8FromSoup`` produces for the cleaned 'story' div.

    Raises:
        exceptions.FailedToDownload: when the page has no div with
            id 'story'.
    """
    logger.debug('Getting chapter text from: %s' % url)

    soup = bs.BeautifulSoup(self._fetchUrl(url),
                            selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

    div = soup.find('div', {'id' : 'story'})

    # Check before first use: previously this test came after div had
    # already been dereferenced, so it could never fire.
    if div is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    # some big chapters are split over several pages
    pager = div.find('span', {'class' : 'pager'})
    if pager is not None:
        # Every pager link except the last one is fetched.
        # NOTE(review): presumably the last link is a 'next' link that
        # duplicates a numbered page -- confirm against the site.
        page_links = pager.findAll('a')[:-1]

        for page_link in page_links:
            page_soup = bs.BeautifulSoup(self._fetchUrl("http://"+self.getSiteDomain()+page_link['href']),
                                         selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

            div1 = page_soup.find('div', {'id' : 'story'})

            # appending next section: the node following the 'conTag'
            # span goes onto the end of the last paragraph collected so
            # far, then the rest of the page is appended.
            paragraphs = div.findAll('p')
            continuation = div1.find('span', {'class' : 'conTag'}).nextSibling
            paragraphs[-1].append(continuation)

            div.append(div1)

    # removing all the left-over stuff
    for tag_name in ('span', 'h1', 'h2', 'h3', 'h4', 'br'):
        for leftover in div.findAll(tag_name):
            leftover.extract()
    for leftover in div.findAll('div', {'class' : 'date'}):
        leftover.extract()

    # Drop the form and the siblings that follow it; the node reached
    # when the sibling chain ends is left in place, as before.
    node = div.find('form')
    if node is not None:
        following = node.nextSibling
        while following is not None:
            node.extract()
            node = following
            following = following.nextSibling

    return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,310 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented for grangerenchanted.com."""
    adapter_class = GrangerEnchantedCom
    return adapter_class
# Class name has to be unique. Our convention is the camel-cased
# site name with 'Adapter' appended; a leading 'www' is skipped.
class GrangerEnchantedCom(BaseSiteAdapter):
def __init__(self, config, url):
    """Record decoding, login defaults, story id, section and normalized URL.

    Stories on the 'malfoymanor' host are normalized to the
    ``/themanor/`` section and tagged with the "The Manor" category;
    everything else is normalized to ``/enchant/``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult=False

    # get storyId from url--url validation guarantees query is only sid=1234
    self.story.setMetadata('storyId',self.parsedUrl.query.split('=')[1])
    self.section=self.parsedUrl.path.split('/')[1]

    # normalized story URL.
    on_manor = "malfoymanor" in self.parsedUrl.netloc
    if on_manor:
        prefix = 'http://malfoymanor.' + self.getSiteDomain() + '/themanor/viewstory.php?sid='
    else:
        prefix = 'http://' + self.getSiteDomain() + '/enchant/viewstory.php?sid='
    self._setURL(prefix+self.story.getMetadata('storyId'))
    if on_manor:
        self.story.addToList("category","The Manor")

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev','gech')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d/%b/%Y"
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'grangerenchanted.com'
@classmethod
def getAcceptDomains(cls):
return ['grangerenchanted.com','malfoymanor.grangerenchanted.com']
@classmethod
def getSiteExampleURLs(self):
return "http://grangerenchanted.com/enchant/viewstory.php?sid=1234 http://malfoymanor.grangerenchanted.com/themanor/viewstory.php?sid=1234"
def getSiteURLPattern(self):
    """Return the regex source that a grangerenchanted story URL must match.

    Matches ``viewstory.php?sid=<digits>`` under the ``enchant`` or
    ``themanor`` section, on the main host or the ``malfoymanor.``
    subdomain.
    """
    # Dots in the host name and file name are escaped so they match only
    # a literal '.', not any character.
    return r"http://(malfoymanor\.)?grangerenchanted\.com/(enchant|themanor)?/viewstory\.php\?sid=\d+$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
    """Return True when ``data`` contains one of the site's login prompts."""
    login_markers = ('Registered Users Only',
                     'There is no such account on our website',
                     "That password doesn't match the one in our database")
    return any(marker in data for marker in login_markers)
def performLogin(self, url):
    """Post credentials to the login page of the story's section.

    Credentials set on the adapter are used when ``self.password`` is
    non-empty; otherwise the 'username'/'password' config values are.
    The login URL depends on whether ``self.section`` contains
    "enchant".

    Args:
        url: the story URL being fetched; only used in the error raised
            on failure.

    Returns:
        True when the response contains "Member Account".

    Raises:
        exceptions.FailedToLogin: when the response lacks
            "Member Account".
    """
    params = {}

    if self.password:
        params['penname'] = self.username
        params['password'] = self.password
    else:
        params['penname'] = self.getConfig("username")
        params['password'] = self.getConfig("password")
    params['cookiecheck'] = '1'
    params['submit'] = 'Submit'

    if "enchant" in self.section:
        loginUrl = 'http://grangerenchanted.com/enchant/user.php?action=login'
    else:
        loginUrl = 'http://malfoymanor.grangerenchanted.com/themanor/user.php?action=login'
    logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                         params['penname']))

    d = self._fetchUrl(loginUrl, params)

    if "Member Account" not in d : #Member Account
        logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                         params['penname']))
        # The original 'return False' after this raise was unreachable.
        raise exceptions.FailedToLogin(url,params['penname'])
    return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
    """Collect the chapter list and story metadata for a grangerenchanted story.

    Fetches the story page (logging in and/or confirming the adult
    warning when required), then reads title, author, chapter links and
    the labelled metadata fields from it.  Series position and review
    count are looked up on a best-effort basis.

    Raises:
        exceptions.StoryDoesNotExist: on HTTP 404.
        exceptions.FailedToLogin: via performLogin when login fails.
        exceptions.AdultCheckRequired: when the page shows a warning link
            and neither ``self.is_adult`` nor the 'is_adult' config is set.
        exceptions.FailedToDownload: when the site denies access.
    """
    if self.is_adult or self.getConfig("is_adult"):
        # Weirdly, different sites use different warning numbers.
        # If the title search below fails, there's a good chance
        # you need a different number. print data at that point
        # and see what the 'click here to continue' url says.
        addurl = "&ageconsent=ok&warning=1"
    else:
        addurl=""

    # Note: only the second attempt further down adds '&index=1' (which
    # makes sure we see the story chapter index; some sites skip that for
    # one-chapter stories).
    url = self.url+addurl
    logger.debug("URL: "+url)

    try:
        data = self._fetchUrl(url)
    except urllib2.HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        raise

    if self.needToLoginCheck(data):
        # need to log in for this one.
        self.performLogin(url)
        data = self._fetchUrl(url)

    m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
    if m is not None:
        if self.is_adult or self.getConfig("is_adult"):
            # We tried the default and still got a warning, so
            # let's pull the warning number from the 'continue'
            # link and reload data.
            addurl = m.group(1)
            # correct stupid &amp; error in url.
            addurl = addurl.replace("&amp;","&")
            url = self.url+'&index=1'+addurl
            logger.debug("URL 2nd try: "+url)

            try:
                data = self._fetchUrl(url)
            except urllib2.HTTPError as e:
                if e.code == 404:
                    raise exceptions.StoryDoesNotExist(self.url)
                raise
        else:
            raise exceptions.AdultCheckRequired(self.url)

    if "Access denied. This story has not been validated by the adminstrators of this site." in data:
        raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = bs.BeautifulSoup(data)

    # Now go hunting for all the meta data and the chapter list.

    ## Title
    a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
    self.story.setMetadata('title',stripHTML(a))

    # Find authorid and URL from... author url.
    a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
    self.story.setMetadata('authorId',a['href'].split('=')[1])
    # NOTE(review): unlike the chapter and series URLs below, this one
    # does not include self.section -- confirm that is intended.
    self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
    self.story.setMetadata('author',a.string)

    # Find the chapters:
    for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
        # just in case there's tags, like <i> in chapter titles.
        self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+self.section+'/'+chapter['href']+addurl))

    self.story.setMetadata('numChapters',len(self.chapterUrls))

    # eFiction sites don't help us out a lot with their meta data
    # formatting, so it's a little ugly.

    # utility method: item lookup that yields "" for anything that cannot
    # be indexed by k (text nodes, None, tags lacking the attribute).
    def defaultGetattr(d,k):
        try:
            return d[k]
        except Exception:
            return ""

    # <span class="label">Rated:</span> NC-17<br /> etc
    labels = soup.findAll('span',{'class':'label'})
    for labelspan in labels:
        value = labelspan.nextSibling
        label = labelspan.string

        if 'Summary' in label:
            ## Everything until the next span class='label'.  Also stop
            ## when the siblings run out, so a summary that is the last
            ## field no longer fails on None.nextSibling.
            svalue = ""
            while value is not None and not defaultGetattr(value,'class') == 'label':
                svalue += str(value)
                value = value.nextSibling
            self.setDescription(url,svalue)

        if 'Rated' in label:
            self.story.setMetadata('rating', value)

        if 'Word count' in label:
            self.story.setMetadata('numWords', value)

        if 'Read' in label:
            self.story.setMetadata('read', value)

        if 'Categories' in label:
            cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
            for cat in cats:
                self.story.addToList('category',cat.string)

        if 'Characters' in label:
            chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
            for char in chars:
                self.story.addToList('characters',char.string)

        if 'Genre' in label:
            genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=4'))
            for genre in genres:
                self.story.addToList('genre',genre.string)

        if 'Warnings' in label:
            warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
            for warning in warnings:
                self.story.addToList('warnings',warning.string)

        if 'Completed' in label:
            if 'Yes' in value:
                self.story.setMetadata('status', 'Completed')
            else:
                self.story.setMetadata('status', 'In-Progress')

        if 'Published' in label:
            self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

        if 'Updated' in label:
            self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

    try:
        # Find Series name from series URL.
        a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        series_name = a.string
        series_url = 'http://'+self.host+'/'+self.section+'/'+a['href']

        # use BeautifulSoup HTML parser to make everything easier to find.
        seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
        # can't use ^viewstory...$ in case of higher rated stories with javascript href.
        storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
        # Position in the series = 1-based rank among the story links.
        i=1
        for a in storyas:
            # skip 'report this' and 'TOC' links
            if 'contact.php' not in a['href'] and 'index' not in a['href']:
                if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl',series_url)
                    break
                i+=1
    except Exception:
        # I find it hard to care if the series parsing fails
        logger.debug("No series information found for %s" % self.url)

    try:
        self.story.setMetadata('reviews',
                               stripHTML(soup.find('div',{'id':'sort'}).
                                         findAll('a', href=re.compile(r'^reviews.php'))[1]))
    except Exception:
        # Review count is optional too; ignore it when it can't be found.
        logger.debug("No review count found for %s" % self.url)
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Fetch one chapter page and return its 'story1' div as text.

    Raises exceptions.FailedToDownload when that div is not present.
    """
    logger.debug('Getting chapter text from: %s' % url)

    html = self._fetchUrl(url)
    # selfClosingTags: otherwise soup eats the br/hr tags.
    page = bs.BeautifulSoup(html, selfClosingTags=('br','hr'))
    story_div = page.find('div', {'id' : 'story1'})

    if story_div is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    return self.utf8FromSoup(url,story_div)

View file

@ -0,0 +1,202 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
    """Record site abbreviation, decoding, story id and normalized URL."""
    BaseSiteAdapter.__init__(self, config, url)
    self.story.setMetadata('siteabbrev','hp')

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.is_adult=False

    # get storyId from url--url validation guarantees query is only psid=1234
    query_value = self.parsedUrl.query.split('=')[1]
    self.story.setMetadata('storyId', query_value)

    # normalized story URL.
    normalized = 'http://' + self.getSiteDomain() + '/viewstory.php?psid='+self.story.getMetadata('storyId')
    self._setURL(normalized)
@staticmethod
def getSiteDomain():
return 'www.harrypotterfanfiction.com'
@classmethod
def getAcceptDomains(cls):
return ['www.harrypotterfanfiction.com','harrypotterfanfiction.com']
@classmethod
def getSiteExampleURLs(self):
return "http://www.harrypotterfanfiction.com/viewstory.php?psid=1234"
def getSiteURLPattern(self):
    """Return the regex source that a story URL for this site must match.

    The 'www.' prefix is optional and the URL must end in
    ``viewstory.php?psid=<digits>``.
    """
    pieces = [
        re.escape("http://"),
        r"(www\.)?",
        re.escape("harrypotterfanfiction.com/viewstory.php?psid="),
        r"\d+$",
    ]
    return "".join(pieces)
def needToLoginCheck(self, data):
    """Return True when ``data`` contains one of the site's login prompts."""
    login_markers = ('Registered Users Only',
                     'There is no such account on our website',
                     "That password doesn't match the one in our database")
    return any(marker in data for marker in login_markers)
def extractChapterUrlsAndMetadata(self):
    """Fetch the story index page and fill in chapter URLs and metadata.

    Requests ``self.url`` with ``&index=1`` appended and records, on
    ``self.story`` and ``self.chapterUrls``: title, author, the sum of
    the per-chapter word counts, the chapter list, the description and
    the fields parsed from the ``storymaininfo`` table (status, rating,
    genre, characters, warnings, published/updated dates).

    Raises:
        exceptions.StoryDoesNotExist: the index request returned HTTP 404.
        exceptions.FailedToDownload: the site says the story has not been
            validated, or the title link / chapter table is missing.
        exceptions.AdultCheckRequired: the title link carries the site's
            adult-themes confirmation and neither ``self.is_adult`` nor
            the ``is_adult`` config option is set.
    """
    url = self.url+'&index=1'
    logger.debug("URL: "+url)

    try:
        data = self._fetchUrl(url)
    except urllib2.HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        # Bare raise re-raises the same error with its original traceback.
        raise

    if "Access denied. This story has not been validated by the adminstrators of this site." in data:
        raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = bs.BeautifulSoup(data)

    ## Title
    title_link = soup.find('a', href=re.compile(r'\?psid='+self.story.getMetadata('storyId')))
    if title_link is None:
        raise exceptions.FailedToDownload("Error parsing story page: %s!  Missing required element!" % url)
    self.story.setMetadata('title',stripHTML(title_link))

    ## Adult stories wrap the link in a confirm() prompt, e.g.
    ## javascript:if (confirm('Please note. This story may contain adult themes. ...')) location = '?psid=290995'
    if "This story may contain adult themes." in title_link['href'] and not (self.is_adult or self.getConfig("is_adult")):
        raise exceptions.AdultCheckRequired(self.url)

    # Find authorid and URL from... author url.
    author_link = soup.find('a', href=re.compile(r"viewuser.php\?showuid=\d+"))
    self.story.setMetadata('authorId',author_link['href'].split('=')[1])
    self.story.setMetadata('authorUrl','http://'+self.host+'/'+author_link['href'])
    self.story.setMetadata('author',author_link.string)

    # The chapter table feeds both the word count and the chapter list.
    tablelist = soup.find('table',{'class':'text'})
    if tablelist is None:
        raise exceptions.FailedToDownload("Error parsing story page: %s!  Missing required element!" % url)

    ## hpcom doesn't give us total words--but it does give
    ## us words/chapter.  I'd rather add than fetch and
    ## parse another page.
    words = 0
    for tr in tablelist.findAll('tr'):
        cells = tr.findAll('td')
        if len(cells) < 3:
            # Rows without a third cell carry no word count.
            continue
        tdstr = cells[2].string
        if tdstr and tdstr.isdigit():
            words += int(tdstr)
    self.story.setMetadata('numWords',str(words))

    # Find the chapters:
    for chapter in tablelist.findAll('a', href=re.compile(r'\?chapterid=\d+')):
        # Adult chapters use a javascript href such as
        #   javascript:if (confirm('...')) location = '?chapterid=433441&i=1'
        # so take the query from ?chapterid= up to (not including) any
        # closing quote instead of keeping the quote in the URL.
        chpt = re.search(r"\?chapterid=\d+[^']*",chapter['href']).group(0)
        # stripHTML: just in case there's tags, like <i> in chapter titles.
        self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/viewstory.php'+chpt))

    self.story.setMetadata('numChapters',len(self.chapterUrls))

    ## Finding the metadata is a bit of a pain.  Desc is the only thing this color.
    desctable = soup.find('table',{'bgcolor':'#f0e8e8'})
    self.setDescription(url,desctable)

    ## Most of the remaining metadata is in the storymaininfo table.
    table = soup.find('table',{'class':'storymaininfo'})
    if table:
        # Flatten the table to one line of text, for example:
        #   Rating: 12+ Story Reviews: 3
        #   Chapters: 3
        #   Characters: Andromeda, Ted, Bellatrix, R. Lestrange, Lucius, Narcissa, OC
        #   Genre(s): Fluff, Romance, Young Adult Era: OtherPairings: Other Pairing, Lucius/Narcissa
        #   Status: Completed
        #   First Published: 2010.09.02
        #   Last Published Chapter: 2010.09.28
        #   Last Updated: 2010.09.28
        #   Favorite Story Of: 1 users
        #   Warnings: Scenes of a Mild Sexual Nature
        metastr = stripHTML(str(table)).replace('\n',' ').replace('\t',' ')

        if re.match(r".*?Status: Completed.*?",metastr):
            self.story.setMetadata('status','Completed')
        else:
            self.story.setMetadata('status','In-Progress')

        m = re.match(r".*?Rating: (.+?) Story Reviews.*?",metastr)
        if m:
            self.story.setMetadata('rating', m.group(1))

        m = re.match(r".*?Genre\(s\): (.+?) Era.*?",metastr)
        if m:
            for g in m.group(1).split(','):
                self.story.addToList('genre',g)

        m = re.match(r".*?Characters: (.+?) Genre.*?",metastr)
        if m:
            for g in m.group(1).split(','):
                self.story.addToList('characters',g)

        m = re.match(r".*?Warnings: (.+).*?",metastr)
        if m:
            for w in m.group(1).split(','):
                # NOTE(review): 'Now Warnings' looks like a typo for
                # 'No Warnings' -- confirm against the site's text.
                if w != 'Now Warnings':
                    self.story.addToList('warnings',w)

        m = re.match(r".*?First Published: ([0-9\.]+).*?",metastr)
        if m:
            self.story.setMetadata('datePublished',makeDate(m.group(1), "%Y.%m.%d"))

        # Updated can have more than one space after it.  <shrug>
        m = re.match(r".*?Last Updated: ([0-9\.]+).*?",metastr)
        if m:
            self.story.setMetadata('dateUpdated',makeDate(m.group(1), "%Y.%m.%d"))
def getChapterText(self, url):
    """Download one chapter page and return its story text.

    The text is taken from the ``<div id="fluidtext">`` element and passed
    through ``self.utf8FromSoup``.

    Raises:
        exceptions.FailedToDownload: the page has no ``fluidtext`` div.
    """
    logger.debug('Getting chapter text from: %s' % url)

    ## most adapters use BeautifulStoneSoup here, but non-Stone
    ## allows nested div tags.
    page = self._fetchUrl(url)
    # selfClosingTags: otherwise soup eats the br/hr tags.
    soup = bs.BeautifulSoup(page, selfClosingTags=('br','hr'))

    story_div = soup.find('div', {'id' : 'fluidtext'})
    if story_div is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

    return self.utf8FromSoup(url,story_div)
def getClass():
    """Return the adapter class defined by this module."""
    return HarryPotterFanFictionComSiteAdapter

View file

@ -0,0 +1,172 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class defined by this module."""
    return HennethAnnunNetAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class HennethAnnunNetAdapter(BaseSiteAdapter):
    """Site adapter for stories hosted on www.henneth-annun.net."""

    def __init__(self, config, url):
        """Record the story id, normalized URL and site-specific settings."""
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only stid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/stories/chapter.cfm?stid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','htan')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain (this site uses the www prefix)."""
        return 'www.henneth-annun.net'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://"+cls.getSiteDomain()+"/stories/chapter.cfm?stid=1234"

    def getSiteURLPattern(self):
        """Return the regex that accepted story URLs must match.

        Both ``chapter.cfm`` and ``chapter_view.cfm`` are accepted.  The
        literal parts are escaped so that the dots in the domain and in
        ``.cfm`` only match a dot.
        """
        return re.escape("http://"+self.getSiteDomain()+"/stories/chapter")+r"(_view)?\.cfm\?stid=\d+$"

    ## Getting the chapter list and the meta data.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and fill in chapter URLs and metadata.

        Raises:
            exceptions.StoryDoesNotExist: the request returned HTTP 404.
            exceptions.FailedToDownload: the site reports the story as
                not available.
        """
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # Bare raise re-raises the same error with its original traceback.
            raise

        if "We're sorry. This story is not available." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: This story is not available.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        heading = soup.find('h2', {'id':'page_heading'})
        self.story.setMetadata('title',stripHTML(heading))

        # Find the chapters: chapter_view.cfm?stid=6663&amp;spordinal=1"
        chapter_re = re.compile(r'chapter_view.cfm\?stid='+self.story.getMetadata('storyId')+r"&spordinal=\d+$")
        for chapter in soup.findAll('a', href=chapter_re):
            # stripHTML: just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/stories/'+chapter['href']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # Word count is read from the second cell of the table's 'foot' row.
        self.story.setMetadata('numWords', soup.find('tr', {'class':'foot'}).findAll('td')[1].text)

        self.setDescription(url,soup.find('div', {'id':'summary'}))

        info = soup.find('div', {'id':'storyinformation'})

        # Labels in <b> tags; the value is the node right after the label.
        for labelspan in info.findAll('b'):
            value = labelspan.nextSibling
            label = labelspan.string
            if not label or value is None:
                # .string is None when the tag has nested markup; nothing
                # usable to match against.
                continue

            if 'Completion' in label:
                if 'Complete' in value.string:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Rating' in label:
                self.story.setMetadata('rating', value.string)

            if 'Era:' in label:
                self.story.addToList('category',value.string)

            if 'Genre' in label:
                self.story.addToList('genre',value.string)

        # Labels in <strong> tags: author and dates.
        for labelspan in info.findAll('strong'):
            value = labelspan.nextSibling
            label = labelspan.string
            if not label or value is None:
                continue

            if 'Author' in label:
                # The author link is one node further along.
                value=value.nextSibling
                self.story.setMetadata('authorId',value['href'].split('=')[1])
                self.story.setMetadata('authorUrl','http://'+self.host+'/'+value['href'])
                self.story.setMetadata('author',value.string)

            if 'Post' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated:' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        for char in soup.findAll('a', href=re.compile(r"/resources/bios_view.cfm\?scid=\d+")):
            self.story.addToList('characters',stripHTML(char))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Download one chapter page and return its story text.

        The text is taken from the ``<div class="block chapter">`` element
        and passed through ``self.utf8FromSoup``.

        Raises:
            exceptions.FailedToDownload: the page has no such div.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'class' : 'block chapter'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,231 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class defined by this module."""
    return HLFictionNetAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class HLFictionNetAdapter(BaseSiteAdapter):
    """Site adapter for stories hosted on hlfiction.net."""

    def __init__(self, config, url):
        """Record the story id, normalized URL and site-specific settings."""
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','hlf')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain (this site does not use www)."""
        return 'hlfiction.net'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex that accepted story URLs must match."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    ## Getting the chapter list and the meta data.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story and author pages; fill in chapters and metadata.

        Title, author and chapter list come from the story page.  The
        remaining metadata is read from this story's ``listbox`` entry on
        the author's page.  Series information is best-effort: a failure
        while reading the series page is logged and ignored.

        Raises:
            exceptions.StoryDoesNotExist: the story request returned 404.
            exceptions.FailedToDownload: the site says the story has not
                been validated, or the story has no entry on the author's
                page.
        """
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # Bare raise re-raises the same error with its original traceback.
            raise

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title and author
        pagetitle = soup.find('div', {'id' : 'pagetitle'})

        aut = pagetitle.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',aut['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+aut['href'])
        self.story.setMetadata('author',aut.string)
        # Remove the author link so only the title text is left in the div.
        aut.extract()

        # NOTE(review): the -3 presumably trims the " by" that preceded the
        # author link -- confirm against the site's markup.
        self.story.setMetadata('title',stripHTML(pagetitle)[:(len(pagetitle.string)-3)])

        # Find the chapters: multi-chapter stories have a <select> of them.
        chapters=soup.find('select')
        if chapters is not None:
            for chapter in chapters.findAll('option'):
                # stripHTML: just in case there's tags, like <i> in chapter titles.
                self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/viewstory.php?sid='+self.story.getMetadata('storyId')+'&chapter='+chapter['value']))
        else:
            self.chapterUrls.append((self.story.getMetadata('title'),url))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # Matches this story's link only: the lookahead stops sid=12 from
        # also matching sid=123.  search() (not match) so javascript:
        # confirm hrefs that embed the link are still recognized.
        story_href_re = re.compile(r'viewstory\.php\?sid='+re.escape(self.story.getMetadata('storyId'))+r'(?!\d)')

        # Locate this story's entry on the author's page.
        asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
        listbox = None
        for candidate in asoup.findAll('div', {'class' : re.compile(r'listbox\s+')}):
            link = candidate.find('a')
            if link is not None and story_href_re.search(link.get('href','')):
                listbox = candidate
                break
        if listbox is None:
            # Without the entry there is no metadata to read; using some
            # other story's listbox would record wrong data.
            raise exceptions.FailedToDownload("Story URL (%s) not found on author's page, can't get metadata"%url)

        # eFiction sites don't help us out a lot with their meta data
        # formatting, so it's a little ugly.

        # utility method
        def defaultGetattr(d,k):
            """Return d[k], or "" when d has no such key / isn't a mapping."""
            try:
                return d[k]
            except Exception:
                return ""

        # <span class="classification">Rated:</span> NC-17<br /> etc
        labels = listbox.findAll('span', {'class' : 'classification'})
        for labelspan in labels:
            label = labelspan.string
            value = labelspan.nextSibling
            if not label:
                # .string is None when the span has nested markup.
                continue

            if 'Summary' in label:
                ## Everything until the next span class='classification'
                ## (or the end of the siblings).
                svalue = ""
                node = value
                while node is not None and not defaultGetattr(node,'class') == 'classification':
                    svalue += str(node)
                    node = node.nextSibling
                self.setDescription(url,svalue)

            if 'Rated' in label:
                # Drop the last two characters of the text after the label.
                self.story.setMetadata('rating', value[:len(value)-2])

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'categories.php\?catid=\d+'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                for char in value.string.split(', '):
                    if not 'None' in char:
                        self.story.addToList('characters',char)

            if 'Genre' in label:
                for genre in value.string.split(', '):
                    if not 'None' in genre:
                        self.story.addToList('genre',genre)

            if 'Warnings' in label:
                for warning in value.string.split(', '):
                    if not 'None' in warning:
                        self.story.addToList('warnings',warning)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Find Series name from series URL.  Stories without a series
        # simply have no such link.
        series_link = listbox.find('a', href=re.compile(r"series.php\?seriesid=\d+"))
        if series_link is not None:
            try:
                series_name = series_link.string
                series_url = 'http://'+self.host+'/'+series_link['href']

                # use BeautifulSoup HTML parser to make everything easier to find.
                seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
                # can't use ^viewstory...$ in case of higher rated stories with javascript href.
                storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
                i=1
                for storya in storyas:
                    # skip 'report this' and 'TOC' links
                    if 'contact.php' not in storya['href'] and 'index' not in storya['href']:
                        if story_href_re.search(storya['href']):
                            self.setSeries(series_name, i)
                            self.story.setMetadata('seriesUrl',series_url)
                            break
                        i+=1
            except Exception as e:
                # Series info is optional; a failure here must not fail
                # the download.
                logger.debug("Series parsing failed for %s: %s" % (url, e))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Download one chapter page and return its story text.

        The text is taken from the ``<div id="story">`` element and passed
        through ``self.utf8FromSoup``.

        Raises:
            exceptions.FailedToDownload: the page has no ``story`` div.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'story'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,232 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
# This function is called by the downloader in all adapter_*.py files
# in this dir to register the adapter class. So it needs to be
# updated to reflect the class below it. That, plus getSiteDomain()
# take care of 'Registering'.
def getClass():
    """Return the adapter class defined by this module."""
    return HPFandomNetAdapterAdapter # XXX
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class HPFandomNetAdapterAdapter(BaseSiteAdapter): # XXX
    """Site adapter for stories hosted on www.hpfandom.net (under /eff/)."""

    def __init__(self, config, url):
        """Record the story id, normalized URL and site-specific settings."""
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        # XXX Most sites don't have the /eff part.  Replace all to remove it usually.
        self._setURL('http://' + self.getSiteDomain() + '/eff/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','hpfdm') # XXX

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y.%m.%d" # XXX

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain (this site uses the www prefix)."""
        return 'www.hpfandom.net' # XXX

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://"+cls.getSiteDomain()+"/eff/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex that accepted story URLs must match."""
        return re.escape("http://"+self.getSiteDomain()+"/eff/viewstory.php?sid=")+r"\d+$"

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story and author pages; fill in chapters and metadata.

        The author and chapter links come from the story page; everything
        else (title, summary, rating, word count, status, dates, ...) is
        read from this story's table on the author's page.

        Raises:
            exceptions.StoryDoesNotExist: the story request returned 404.
            exceptions.AdultCheckRequired: the author page links the story
                through the site's adult confirm() prompt and neither
                ``self.is_adult`` nor the ``is_adult`` config is set.
            exceptions.FailedToDownload: the story cannot be found on the
                author's page.
        """
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # Bare raise re-raises the same error with its original traceback.
            raise

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/eff/'+a['href'])
        self.story.setMetadata('author',a.string)

        ## Going to get the rest from the author page.
        authdata = self._fetchUrl(self.story.getMetadata('authorUrl'))
        # fix a typo in the site HTML so I can find the Characters list.
        authdata = authdata.replace('<td width=10%">','<td width="10%">')

        # hpfandom.net only seems to indicate adult-only by javascript on the story/chapter links.
        if "javascript:if (confirm('Slash/het fiction which incorporates sexual situations to a somewhat graphic degree and some violence. ')) location = 'viewstory.php?sid=%s'"%self.story.getMetadata('storyId') in authdata \
                and not (self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(self.url)

        authsoup = bs.BeautifulSoup(authdata)

        # The story's reviews link sits inside its metadata table:
        # <table><tr><td><p><b><a ...>
        reviewsa = authsoup.find('a', href="reviews.php?sid="+self.story.getMetadata('storyId')+"&a=")
        metablock = None
        if reviewsa is not None:
            metablock = reviewsa.findParent("table")

        ## Title
        titlea = None
        if metablock is not None:
            titlea = metablock.find('a', href=re.compile("viewstory.php"))
        if titlea is None:
            raise exceptions.FailedToDownload("Story URL (%s) not found on author's page, can't use chapter URLs"%url)
        self.story.setMetadata('title',stripHTML(titlea))

        # Find the chapters: !!! hpfandom.net differs from every other
        # eFiction site--the sid on viewstory for chapters is
        # *different* for each chapter
        for chapter in soup.findAll('a', {'href':re.compile(r"viewstory.php\?sid=\d+&i=\d+")}):
            m = re.match(r'.*?(viewstory.php\?sid=\d+&i=\d+).*?',chapter['href'])
            # stripHTML: just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/eff/'+m.group(1)))

        # No chapter links found: treat the story page itself as the only chapter.
        if len(self.chapterUrls) == 0:
            self.chapterUrls.append((stripHTML(self.story.getMetadata('title')),url))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formatting, so it's a little ugly.

        summary = metablock.find("td",{"class":"summary"})
        if summary is not None:
            # Rename the cell's tag before handing it over as the description.
            summary.name='span'
            self.setDescription(url,summary)

        # words & completed in first row of metablock.
        firstrow = stripHTML(metablock.find('tr'))
        # A Mother's Love xx Going Grey 1 (G+) by Kiristeen | Reviews - 18 | Words: 27468 | Completed: Yes
        m = re.match(r".*?\((?P<rating>[^)]+)\).*?Words: (?P<words>\d+).*?Completed: (?P<status>Yes|No)",firstrow)
        if m is not None:
            # All three groups are mandatory in the pattern, so they are
            # always set once it matches.
            self.story.setMetadata('rating', m.group('rating'))
            self.story.setMetadata('numWords', m.group('words'))
            if 'Yes' in m.group('status'):
                self.story.setMetadata('status', 'Completed')
            else:
                self.story.setMetadata('status', 'In-Progress')

        # <tr><td width="10%" valign="top">Chapters:</td><td width="40%" valign="top">4</td>
        # <td width="10%" valign="top">Published:</td><td width="40%" valign="top">2010.09.29</td></tr>
        # <tr><td width="10%" valign="top">Completed:</td><td width="40%" valign="top">Yes</td><td width="10%" valign="top">Updated:</td><td width="40%" valign="top">2010.10.03</td></tr>
        labels = metablock.findAll('td',{'width':'10%'})
        for td in labels:
            label = td.string
            if not label:
                # .string is None when the cell has nested markup.
                continue
            valuetd = td.nextSibling
            value = None
            if valuetd is not None:
                value = valuetd.string

            # NOTE(review): if the category cell holds several links its
            # .string may be None, which would skip this branch -- confirm
            # against the site's markup.
            if 'Category' in label and value:
                cats = td.parent.findAll('a',href=re.compile(r'categories.php'))
                for cat in cats:
                    if cat.string is not None:
                        self.story.addToList('category',cat.string)

            if 'Characters' in label and value: # this site can have Character label with no
                                                # values, apparently.  Others as a precaution.
                for char in value.split(','):
                    self.story.addToList('characters',char.strip())

            if 'Genre' in label and value:
                for genre in value.split(','):
                    self.story.addToList('genre',genre.strip())

            if 'Warnings' in label and value:
                for warning in value.split(','):
                    if warning.strip() != 'none':
                        self.story.addToList('warnings',warning.strip())

            if 'Published' in label and value:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label and value:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Download one chapter page and return its story text.

        The page has no wrapper element around the text, so the first two
        ``<table width="100%">`` blocks are replaced by an opening and a
        closing ``<div name="storybody">`` and that div is extracted.

        Raises:
            exceptions.FailedToDownload: the synthesized div is not found.
        """
        logger.debug('Getting chapter text from: %s' % url)

        data = self._fetchUrl(url)
        # There's no good wrapper around the chapter text. :-/
        # There are, however, tables with width=100% just above and below the real text.
        data = re.sub(r'<table width="100%">.*?</table>','<div name="storybody">',
                      data,count=1,flags=re.DOTALL)
        data = re.sub(r'<table width="100%">.*?</table>','</div>',
                      data,count=1,flags=re.DOTALL)

        soup = bs.BeautifulStoneSoup(data,selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find("div",{'name':'storybody'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,223 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class defined by this module."""
    return HPFanficArchiveComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class HPFanficArchiveComAdapter(BaseSiteAdapter):
    """Site adapter for stories hosted on www.hpfanficarchive.com (under /stories/)."""

    def __init__(self, config, url):
        """Record the story id, normalized URL and site-specific settings."""
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/stories/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','hpffa')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%B %d, %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain (this site uses the www prefix)."""
        return 'www.hpfanficarchive.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://"+cls.getSiteDomain()+"/stories/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex that accepted story URLs must match."""
        return re.escape("http://"+self.getSiteDomain()+"/stories/viewstory.php?sid=")+r"\d+$"

    ## Getting the chapter list and the meta data.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and fill in chapter URLs and metadata.

        Series information is best-effort: a failure while reading the
        series page is logged and ignored.

        Raises:
            exceptions.StoryDoesNotExist: the story request returned 404.
            exceptions.FailedToDownload: the site says the story has not
                been validated.
        """
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # Bare raise re-raises the same error with its original traceback.
            raise

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/stories/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        chapter_re = re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")
        for chapter in soup.findAll('a', href=chapter_re):
            # stripHTML: just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/stories/'+chapter['href']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formatting, so it's a little ugly.

        # utility method
        def defaultGetattr(d,k):
            """Return d[k], or "" when d has no such key / isn't a mapping."""
            try:
                return d[k]
            except Exception:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            label = labelspan.string
            if not label:
                # .string is None when the span has nested markup.
                continue

            # The value is everything between this label and the next one.
            val = labelspan.nextSibling
            value = unicode('')
            while val and not defaultGetattr(val,'class') == 'label':
                value += unicode(val)
                val = val.nextSibling

            if 'Summary' in label:
                self.setDescription(url,value)

            if 'Rated' in label:
                self.story.setMetadata('rating', stripHTML(value))

            if 'Word count' in label:
                self.story.setMetadata('numWords', stripHTML(value))

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Pairing' in label:
                ships = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=4'))
                for ship in ships:
                    self.story.addToList('ships',ship.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in stripHTML(value):
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Find Series name from series URL.  Stories without a series
        # simply have no such link.
        series_link = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        if series_link is not None:
            try:
                series_name = series_link.string
                series_url = 'http://'+self.host+'/stories/'+series_link['href']

                # use BeautifulSoup HTML parser to make everything easier to find.
                seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
                # can't use ^viewstory...$ in case of higher rated stories with javascript href.
                storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
                i=1
                for storya in storyas:
                    # skip 'report this' and 'TOC' links
                    if 'contact.php' not in storya['href'] and 'index' not in storya['href']:
                        if storya['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                            self.setSeries(series_name, i)
                            self.story.setMetadata('seriesUrl',series_url)
                            break
                        i+=1
            except Exception as e:
                # Series info is optional; a failure here must not fail
                # the download.
                logger.debug("Series parsing failed for %s: %s" % (url, e))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Download one chapter page and return its story text.

        The text is taken from the ``<div id="story">`` element and passed
        through ``self.utf8FromSoup``.

        Raises:
            exceptions.FailedToDownload: the page has no ``story`` div.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulSoup(self._fetchUrl(url),
                                selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'story'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,282 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class that this module provides."""
    adapter_class = IkEternalNetAdapter
    return adapter_class
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class IkEternalNetAdapter(BaseSiteAdapter):
def __init__(self, config, url):
    """Set up the adapter for one www.ik-eternal.net story URL.

    Stores the story id taken from the URL query string, the normalized
    story URL, the site abbreviation and the site's date format.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # if left empty, site doesn't return any message at all.
    self.username = "NoneGiven"
    self.password = ""
    self.is_adult = False

    # get storyId from url--url validation guarantees query is only sid=1234
    query_parts = self.parsedUrl.query.split('=')
    self.story.setMetadata('storyId', query_parts[1])

    # normalized story URL (built from the stored metadata value).
    site_root = 'http://' + self.getSiteDomain()
    self._setURL(site_root + '/viewstory.php?sid=' + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'ike')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%B %d, %Y"
@staticmethod  # must be @staticmethod, don't remove it.
def getSiteDomain():
    """Return the site's host name."""
    # The site domain. Does have www here, if it uses it.
    domain = 'www.ik-eternal.net'
    return domain
@classmethod
def getSiteExampleURLs(self):
    """Return an example story URL for this site."""
    return "http://%s/viewstory.php?sid=1234" % self.getSiteDomain()
def getSiteURLPattern(self):
    """Return a regex matching this site's story URLs (numeric ``sid`` only)."""
    literal_prefix = "http://" + self.getSiteDomain() + "/viewstory.php?sid="
    return re.escape(literal_prefix) + r"\d+$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
    """Return True when ``data`` contains a login-required or failed-login message."""
    login_markers = (
        'Registered Users Only',
        'There is no such account on our website',
        "That password doesn't match the one in our database",
    )
    return any(marker in data for marker in login_markers)
def performLogin(self, url):
    """Post the login form and report success.

    Uses self.username/self.password when a password has been set on the
    adapter, otherwise the "username"/"password" config values.

    url -- the story URL being fetched; only used in the error raised.

    Returns True when the response contains "Member Account".
    Raises exceptions.FailedToLogin otherwise.
    """
    params = {}
    if self.password:
        params['penname'] = self.username
        params['password'] = self.password
    else:
        params['penname'] = self.getConfig("username")
        params['password'] = self.getConfig("password")
    params['cookiecheck'] = '1'
    params['submit'] = 'Submit'

    loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
    logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                          params['penname']))

    d = self._fetchUrl(loginUrl, params)

    # The page shown after a successful login contains "Member Account".
    if "Member Account" not in d:
        logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                         params['penname']))
        # The original had an unreachable `return False` after this raise.
        raise exceptions.FailedToLogin(url, params['penname'])
    return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
# Fetches the story index page (self.url + '&index=1'), logging in and/or
# retrying with the site's warning parameters when needed, then fills in
# self.story metadata and self.chapterUrls from the parsed page.
# Raises StoryDoesNotExist on HTTP 404, AdultCheckRequired when a warning
# link is present and is_adult is not set, and FailedToDownload when the
# site reports the story as not validated.
# NOTE(review): indentation was lost in this copy of the file; nesting
# described in the comments below is inferred from statement order --
# confirm against the original source.
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):
# Weirdly, different sites use different warning numbers.
# If the title search below fails, there's a good chance
# you need a different number. print data at that point
# and see what the 'click here to continue' url says.
addurl = "&warning=1"
else:
addurl=""
# index=1 makes sure we see the story chapter index. Some
# sites skip that for one-chapter stories.
url = self.url+'&index=1'+addurl
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
# A 404 is reported as a missing story; other HTTP errors propagate.
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
if self.needToLoginCheck(data):
# need to log in for this one.
self.performLogin(url)
data = self._fetchUrl(url)
# The actual text that is used to announce you need to be an
# adult varies from site to site. Again, print data before
# the title search to troubleshoot.
# Since the warning text can change by warning level, let's
# look for the warning pass url. ksarchive uses
# &amp;warning= -- actually, so do other sites. Must be an
# eFiction book.
# viewstory.php?sid=1882&amp;warning=4
# viewstory.php?sid=1654&amp;ageconsent=ok&amp;warning=5
#print data
#m = re.search(r"'viewstory.php\?sid=1882(&amp;warning=4)'",data)
m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
if m != None:
if self.is_adult or self.getConfig("is_adult"):
# We tried the default and still got a warning, so
# let's pull the warning number from the 'continue'
# link and reload data.
addurl = m.group(1)
# the link is HTML-escaped; turn &amp; back into & for the URL.
addurl = addurl.replace("&amp;","&")
url = self.url+'&index=1'+addurl
logger.debug("URL 2nd try: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
else:
raise exceptions.AdultCheckRequired(self.url)
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
# use BeautifulSoup HTML parser to make everything easier to find.
# NOTE(review): ('p') is a plain string, not a one-element tuple --
# confirm this is what selfClosingTags is meant to receive.
soup = bs.BeautifulSoup(data,selfClosingTags=('p')) #poor formatting of the paragraphs in the title page
# print data
# Now go hunting for all the meta data and the chapter list.
## Title
# The title is the text of the link whose href ends with this story's sid.
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
self.story.setMetadata('title',stripHTML(a))
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
# addurl is appended so chapter fetches carry the same warning parameters.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
# eFiction sites don't help us out a lot with their meta data
# formatting, so it's a little ugly.
# utility method: d[k], or "" when the lookup fails for any reason.
def defaultGetattr(d,k):
try:
return d[k]
except:
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
asoup = soup.find('div', {'class': 'listbox'})
# Rename every <p> inside the listbox to <br>.
for a in asoup.findAll('p'):
a.name='br'
labels = asoup.findAll('span',{'class':'label'})
for labelspan in labels:
# value is whatever node directly follows the label span.
value = labelspan.nextSibling
label = labelspan.string
if 'Summary' in label:
## Everything until the next span class='label'
# NOTE(review): if no later sibling has class 'label', value
# presumably ends up None and value.nextSibling would raise -- confirm.
svalue = ""
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
if 'Word count' in label:
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
for warning in warnings:
self.story.addToList('warnings',warning.string)
if 'Completed' in label:
if 'Yes' in value:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if 'Published' in label:
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
if 'Updated' in label:
# there's a stray [ at the end.
#value = value[0:-1]
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
# Fetch a single chapter page and hand its story <div> to utf8FromSoup.
def getChapterText(self, url):
    """Download ``url`` and return ``self.utf8FromSoup`` of its ``<div id="story">``.

    Raises exceptions.FailedToDownload when the page has no such div.
    """
    logger.debug('Getting chapter text from: %s' % url)
    page = self._fetchUrl(url)
    # br/hr are declared self-closing, otherwise soup eats the br/hr tags.
    parsed = bs.BeautifulStoneSoup(page, selfClosingTags=('br', 'hr'))
    story_div = parsed.find('div', {'id': 'story'})
    if story_div is not None:
        return self.utf8FromSoup(url, story_div)
    raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

View file

@ -0,0 +1,289 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class that this module provides."""
    adapter_class = ImagineEFicComAdapter
    return adapter_class
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class ImagineEFicComAdapter(BaseSiteAdapter):
def __init__(self, config, url):
    """Set up the adapter for one imagine.e-fic.com story URL.

    Stores the story id taken from the URL query string, the normalized
    story URL, the site abbreviation and the site's date format.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # if left empty, site doesn't return any message at all.
    self.username = "NoneGiven"
    self.password = ""
    self.is_adult = False

    # get storyId from url--url validation guarantees query is only sid=1234
    query_parts = self.parsedUrl.query.split('=')
    self.story.setMetadata('storyId', query_parts[1])

    # normalized story URL (built from the stored metadata value).
    site_root = 'http://' + self.getSiteDomain()
    self._setURL(site_root + '/viewstory.php?sid=' + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'ime')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%Y.%m.%d"
@staticmethod  # must be @staticmethod, don't remove it.
def getSiteDomain():
    """Return the site's host name."""
    # The site domain. Does have www here, if it uses it.
    domain = 'imagine.e-fic.com'
    return domain
@classmethod
def getSiteExampleURLs(self):
    """Return an example story URL for this site."""
    return "http://%s/viewstory.php?sid=1234" % self.getSiteDomain()
def getSiteURLPattern(self):
    """Return a regex matching this site's story URLs (numeric ``sid`` only)."""
    literal_prefix = "http://" + self.getSiteDomain() + "/viewstory.php?sid="
    return re.escape(literal_prefix) + r"\d+$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
    """Return True when ``data`` contains a login-required or failed-login message."""
    login_markers = (
        'Registered Users Only',
        'There is no such account on our website',
        "That password doesn't match the one in our database",
    )
    return any(marker in data for marker in login_markers)
def performLogin(self, url):
    """Post the login form and report success.

    Uses self.username/self.password when a password has been set on the
    adapter, otherwise the "username"/"password" config values.

    url -- the story URL being fetched; only used in the error raised.

    Returns True when the response contains "Member Account".
    Raises exceptions.FailedToLogin otherwise.
    """
    params = {}
    if self.password:
        params['penname'] = self.username
        params['password'] = self.password
    else:
        params['penname'] = self.getConfig("username")
        params['password'] = self.getConfig("password")
    params['cookiecheck'] = '1'
    params['submit'] = 'Submit'

    loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
    logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                          params['penname']))

    d = self._fetchUrl(loginUrl, params)

    # The page shown after a successful login contains "Member Account".
    if "Member Account" not in d:
        logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                         params['penname']))
        # The original had an unreachable `return False` after this raise.
        raise exceptions.FailedToLogin(url, params['penname'])
    return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
# Fetches the story index page (self.url + '&index=1'), logging in and/or
# retrying with the site's warning parameters when needed, then fills in
# self.story metadata, self.chapterUrls and (best effort) series info.
# Raises StoryDoesNotExist on HTTP 404, AdultCheckRequired when a warning
# link is present and is_adult is not set, and FailedToDownload when the
# site reports the story as not validated.
# NOTE(review): indentation was lost in this copy of the file; nesting
# described in the comments below is inferred from statement order --
# confirm against the original source.
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):
# Weirdly, different sites use different warning numbers.
# If the title search below fails, there's a good chance
# you need a different number. print data at that point
# and see what the 'click here to continue' url says.
addurl = "&ageconsent=ok&warning=4"
else:
addurl=""
# index=1 makes sure we see the story chapter index. Some
# sites skip that for one-chapter stories.
url = self.url+'&index=1'+addurl
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
# A 404 is reported as a missing story; other HTTP errors propagate.
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
if self.needToLoginCheck(data):
# need to log in for this one.
self.performLogin(url)
data = self._fetchUrl(url)
# Look for the 'continue' link of an adult-warning page; group 1 holds
# the (HTML-escaped) extra query parameters that pass the warning.
m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
if m != None:
if self.is_adult or self.getConfig("is_adult"):
# We tried the default and still got a warning, so
# let's pull the warning number from the 'continue'
# link and reload data.
addurl = m.group(1)
# the link is HTML-escaped; turn &amp; back into & for the URL.
addurl = addurl.replace("&amp;","&")
url = self.url+'&index=1'+addurl
logger.debug("URL 2nd try: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
else:
raise exceptions.AdultCheckRequired(self.url)
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# print data
# Now go hunting for all the meta data and the chapter list.
## Title
# The title is the text of the link whose href ends with this story's sid.
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
self.story.setMetadata('title',stripHTML(a))
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
# addurl is appended so chapter fetches carry the same warning parameters.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
# eFiction sites don't help us out a lot with their meta data
# formatting, so it's a little ugly.
# utility method: d[k], or "" when the lookup fails for any reason.
def defaultGetattr(d,k):
try:
return d[k]
except:
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
# value is whatever node directly follows the label span.
value = labelspan.nextSibling
label = labelspan.string
if 'Summary' in label:
## Everything until the next span class='label'
# NOTE(review): if no later sibling has class 'label', value
# presumably ends up None and value.nextSibling would raise -- confirm.
svalue = ""
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
if 'Word count' in label:
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
for warning in warnings:
self.story.addToList('warnings',warning.string)
if 'Completed' in label:
if 'Yes' in value:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if 'Published' in label:
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
if 'Updated' in label:
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
# Series lookup is best effort: any failure in this try block is ignored.
try:
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
# i counts story links on the series page to find this story's position.
i=1
for a in storyas:
# skip 'report this' and 'TOC' links
if 'contact.php' not in a['href'] and 'index' not in a['href']:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
self.story.setMetadata('seriesUrl',series_url)
break
i+=1
except:
# I find it hard to care if the series parsing fails
# NOTE(review): the bare except also hides unrelated errors -- consider narrowing.
pass
# Fetch a single chapter page and hand its story <div> to utf8FromSoup.
def getChapterText(self, url):
    """Download ``url`` and return ``self.utf8FromSoup`` of its ``<div id="story">``.

    Raises exceptions.FailedToDownload when the page has no such div.
    """
    logger.debug('Getting chapter text from: %s' % url)
    page = self._fetchUrl(url)
    # br/hr are declared self-closing, otherwise soup eats the br/hr tags.
    parsed = bs.BeautifulStoneSoup(page, selfClosingTags=('br', 'hr'))
    story_div = parsed.find('div', {'id': 'story'})
    if story_div is not None:
        return self.utf8FromSoup(url, story_div)
    raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

View file

@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class that this module provides."""
    adapter_class = InDeathNetAdapter
    return adapter_class
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class InDeathNetAdapter(BaseSiteAdapter):
def __init__(self, config, url):
    """Set up the adapter for one indeath.net blog-archive URL.

    Stores the story id parsed from the URL, the normalized archive URL,
    the site abbreviation and the site's date format.

    Raises exceptions.InvalidStoryURL when ``url`` does not match
    getSiteURLPattern().
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # if left empty, site doesn't return any message at all.
    self.username = "NoneGiven"
    self.password = ""
    self.is_adult = False

    # get storyId from url--url validation guarantees query correct
    url_match = re.match(self.getSiteURLPattern(), url)
    if not url_match:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())
    self.story.setMetadata('storyId', url_match.group('id'))

    # normalized story URL: always the www host and the /archive/ form.
    archive_root = 'http://www.' + self.getSiteDomain() + '/blog/archive/'
    self._setURL(archive_root + self.story.getMetadata('storyId') + '-' + url_match.group('name') + '/')

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'idn')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d %B %Y"
@staticmethod  # must be @staticmethod, don't remove it.
def getSiteDomain():
    """Return the site's host name."""
    # The site domain. Does have www here, if it uses it.
    domain = 'indeath.net'
    return domain
@classmethod
def getSiteExampleURLs(self):
    """Return an example story URL for this site."""
    return "http://%s/blog/archive/123-story-in-death/" % self.getSiteDomain()
def getSiteURLPattern(self):
    """Return a regex matching this site's blog (archive) URLs.

    Named groups: ``id`` (numeric blog id) and ``name`` (slug).  The
    ``www.`` host prefix is optional: the normalized URL this adapter
    builds, and the example below, both use it, and the previous pattern
    rejected them.  The prefix group is non-capturing so the numbered
    groups are unchanged.
    """
    # http://www.indeath.net/blog/archive/169-ransom-in-death/
    return (re.escape("http://") + r"(?:www\.)?" + re.escape(self.getSiteDomain())
            + r"/blog/(archive/)?(?P<id>\d+)\-(?P<name>[a-z0-9\-]*)/?$")
def getDateFromComponents(self, postmonth, postday):
    """Build a date from a month heading and a day label.

    postmonth -- text containing 'Entries in <Month name> <4-digit year>'
    postday   -- text containing '<2-digit day> <3-letter month>'

    Returns makeDate('<day> <Month name> <year>', self.dateformat).
    """
    month_year = re.search(r"Entries\ in\ (?P<mon>January|February|March|April|May|June|July|August|September|October|November|December)\ (?P<year>\d{4})", postmonth)
    day_of_month = re.search(r"(?P<day>\d{2})\ (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)", postday)
    # The day comes from postday; month name and year come from postmonth.
    date_text = day_of_month.group('day') + ' ' + month_year.group('mon') + ' ' + month_year.group('year')
    return makeDate(date_text, self.dateformat)
def getAuthorData(self):
    """Fetch the main blog page and record the first entry's poster as author.

    The main page URL is self.url with "/archive" removed.  Sets the
    'author', 'authorUrl' and 'authorId' metadata; the last two are fixed
    placeholder values.

    Raises exceptions.StoryDoesNotExist on HTTP 404; other HTTP errors
    propagate unchanged.
    """
    mainUrl = self.url.replace("/archive", "")
    try:
        maindata = self._fetchUrl(mainUrl)
    except urllib2.HTTPError as err:
        if err.code == 404:
            # Was self.meta, which nothing in this adapter sets; the other
            # adapters pass the story URL here.
            raise exceptions.StoryDoesNotExist(self.url)
        raise

    # use BeautifulSoup HTML parser to make everything easier to find.
    mainsoup = bs.BeautifulSoup(maindata)

    # find first entry
    entry = mainsoup.find('div', {'class': "entry"})

    # get post author as author
    desc = entry.find('div', {'class': "desc"})
    poster = desc.find('strong')
    self.story.setMetadata('author', poster.contents[0].string.strip())

    # Don't seem to be able to get author pages anymore
    self.story.setMetadata('authorUrl', 'http://www.indeath.net/')
    self.story.setMetadata('authorId', '0')
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
    """Fetch the blog archive page and fill in story metadata and chapters.

    Sets title, optional description, author data (via getAuthorData),
    datePublished/dateUpdated, numChapters, and appends one
    (title, url) pair per blog entry to self.chapterUrls, oldest first.

    Raises exceptions.StoryDoesNotExist on HTTP 404; other HTTP errors
    propagate unchanged.
    """
    url = self.url
    try:
        data = self._fetchUrl(url)
    except urllib2.HTTPError as err:
        if err.code == 404:
            # Was self.meta, which nothing in this adapter sets; the other
            # adapters pass the story URL here.
            raise exceptions.StoryDoesNotExist(self.url)
        raise

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = bs.BeautifulSoup(data)

    # Now go hunting for all the meta data and the chapter list.

    ## Title
    h = soup.find('a', id="blog_title")
    t = h.find('span')
    self.story.setMetadata('title', stripHTML(t.contents[0]).strip())

    # A <div> inside the title span, when present, is used as the description.
    s = t.find('div')
    if s is not None:
        self.setDescription(url, s)

    # Get Author from main blog page since it's not reliably on the archive page
    self.getAuthorData()

    # Find the chapters:
    chapters = soup.findAll('a', title="View entry", href=re.compile(r'http://www.indeath.net/blog/' + self.story.getMetadata('storyId') + r"/entry\-(\d+)\-([^/]*)/$"))
    # reverse the list since newest at the top
    chapters.reverse()

    # Get date published & updated from first & last entries
    posttable = soup.find('div', id="main_column")
    postmonths = posttable.findAll('th', text=re.compile(r'Entries\ in\ '))
    postmonths.reverse()
    postdates = posttable.findAll('span', _class="desc", text=re.compile(r'\d{2}\ (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'))
    postdates.reverse()
    self.story.setMetadata('datePublished', self.getDateFromComponents(postmonths[0], postdates[0]))
    self.story.setMetadata('dateUpdated', self.getDateFromComponents(postmonths[-1], postdates[-1]))

    # Process List of Chapters
    self.story.setMetadata('numChapters', len(chapters))
    logger.debug("numChapters: (%s)" % self.story.getMetadata('numChapters'))

    title = self.story.getMetadata('title')
    # Entry titles often repeat the story title as "<title> - <chapter>".
    # The title is escaped so regex metacharacters in it (e.g. "?" or "(")
    # are matched literally instead of breaking or skewing the pattern.
    prefixed = re.compile("(?i)" + re.escape(title) + r" - (?P<newtitle>.*)$")
    single_entry = len(chapters) == 1
    for chapter in chapters:
        if single_entry:
            # A one-entry story uses the story title as the chapter title.
            chaptertitle = title
        else:
            # just in case there's tags, like <i> in chapter titles.
            ct = stripHTML(chapter)
            tnew = prefixed.match(ct)
            chaptertitle = tnew.group('newtitle') if tnew else ct
        self.chapterUrls.append((chaptertitle, chapter['href']))
# Fetch a single blog entry and hand its content <div> to utf8FromSoup.
def getChapterText(self, url):
    """Download ``url`` and return ``self.utf8FromSoup`` of its ``entry_content`` div.

    Raises exceptions.FailedToDownload when the page has no such div.
    """
    logger.debug('Getting chapter text from: %s' % url)
    page = self._fetchUrl(url)
    parsed = bs.BeautifulSoup(page, selfClosingTags=('br', 'hr', 'span', 'center'))
    entry = parsed.find("div", "entry_content")
    if entry is not None:
        return self.utf8FromSoup(url, entry)
    raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

View file

@ -0,0 +1,314 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
# Search for XXX comments--that's where things are most likely to need changing.
# This function is called by the downloader in all adapter_*.py files
# in this dir to register the adapter class. So it needs to be
# updated to reflect the class below it. That, plus getSiteDomain()
# take care of 'Registering'.
def getClass():
    """Return the adapter class that this module provides."""
    adapter_class = KSArchiveComAdapter  # XXX
    return adapter_class
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class KSArchiveComAdapter(BaseSiteAdapter): # XXX
def __init__(self, config, url):
    """Set up the adapter for one ksarchive.com story URL.

    Stores the story id taken from the URL query string, the normalized
    story URL, the site abbreviation and the site's date format.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    self.is_adult = False

    # get storyId from url--url validation guarantees query is only sid=1234
    query_parts = self.parsedUrl.query.split('=')
    self.story.setMetadata('storyId', query_parts[1])

    # normalized story URL.
    # XXX Most sites don't have the /fanfic part. Replace all to remove it usually.
    site_root = 'http://' + self.getSiteDomain()
    self._setURL(site_root + '/viewstory.php?sid=' + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'ksa')  # XXX

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%b/%d/%Y"  # XXX
@classmethod
def getAcceptDomains(cls):
    """Return the host names this adapter accepts (with and without www)."""
    accepted = ['www.ksarchive.com', 'ksarchive.com']
    return accepted
@staticmethod  # must be @staticmethod, don't remove it.
def getSiteDomain():
    """Return the site's host name."""
    # The site domain. Does have www here, if it uses it.
    domain = 'ksarchive.com'  # XXX
    return domain
@classmethod
def getSiteExampleURLs(self):
    """Return an example story URL for this site."""
    return "http://%s/viewstory.php?sid=1234" % self.getSiteDomain()
def getSiteURLPattern(self):
    """Return a regex matching this site's story URLs, with or without ``www.``.

    The dot after ``www`` is escaped; the previous ``(www.)?`` accepted any
    character in that position (e.g. ``wwwXksarchive.com``).
    """
    return r"http://(www\.)?" + re.escape(self.getSiteDomain() + "/viewstory.php?sid=") + r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
# Fetches the story index page (self.url + '&index=1'), retrying with the
# site's warning parameters when needed, then fills in self.story metadata,
# self.chapterUrls and (best effort) series info.
# Raises StoryDoesNotExist on HTTP 404, AdultCheckRequired when a warning
# link is present and is_adult is not set, and FailedToDownload when the
# site reports the story as not validated.
# NOTE(review): indentation was lost in this copy of the file; nesting
# described in the comments below is inferred from statement order --
# confirm against the original source.
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):
# Weirdly, different sites use different warning numbers.
# If the title search below fails, there's a good chance
# you need a different number. print data at that point
# and see what the 'click here to continue' url says.
# Furthermore, there's a couple sites now with more than
# one warning level for different ratings. And they're
# fussy about it. midnightwhispers has three: 10, 3 & 5.
# NOTE(review): an earlier comment here said "we'll try 5 first",
# but the value actually sent below is warning=2.
addurl = "&ageconsent=ok&warning=2" # XXX
else:
addurl=""
# index=1 makes sure we see the story chapter index. Some
# sites skip that for one-chapter stories.
url = self.url+'&index=1'+addurl
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
# A 404 is reported as a missing story; other HTTP errors propagate.
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
# The actual text that is used to announce you need to be an
# adult varies from site to site. Again, print data before
# the title search to troubleshoot.
# Since the warning text can change by warning level, let's
# look for the warning pass url. ksarchive uses
# &amp;warning= -- actually, so do other sites. Must be an
# eFiction book.
# viewstory.php?sid=1882&amp;warning=4
# viewstory.php?sid=1654&amp;ageconsent=ok&amp;warning=5
#print data
#m = re.search(r"'viewstory.php\?sid=1882(&amp;warning=4)'",data)
m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
if m != None:
if self.is_adult or self.getConfig("is_adult"):
# We tried the default and still got a warning, so
# let's pull the warning number from the 'continue'
# link and reload data.
addurl = m.group(1)
# the link is HTML-escaped; turn &amp; back into & for the URL.
addurl = addurl.replace("&amp;","&")
url = self.url+'&index=1'+addurl
logger.debug("URL 2nd try: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
else:
raise exceptions.AdultCheckRequired(self.url)
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# print data
# Now go hunting for all the meta data and the chapter list.
## Title
# The title is the text of the link whose href ends with this story's sid.
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
self.story.setMetadata('title',stripHTML(a)) # title's inside a <b> tag.
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',stripHTML(a))
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
# addurl is appended so chapter fetches carry the same warning parameters.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
# eFiction sites don't help us out a lot with their meta data
# formatting, so it's a little ugly.
# utility method: d[k], or "" when the lookup fails for any reason.
def defaultGetattr(d,k):
try:
return d[k]
except:
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
# value is whatever node directly follows the label span.
value = labelspan.nextSibling
label = stripHTML(labelspan)
if 'Summary' in label:
## Everything until the next span class='label'
# NOTE(review): if no later sibling has class 'label' and the
# run-on check below never fires, value presumably ends up None and
# value.nextSibling would raise -- confirm.
svalue = ""
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
# poor HTML(unclosed <p> for one) can cause run on
# over the next label.
if '<span class="label">' in svalue:
svalue = svalue[0:svalue.find('<span class="label">')]
break
else:
value = value.nextSibling
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
if 'Word count' in label:
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
catstext = [stripHTML(cat) for cat in cats]
for cat in catstext:
# ran across one story with an empty <a href="browse.php?type=categories&amp;catid=1"></a>
# tag in the desc once.
# Only 'Poetry' and 'Essays' are recorded as categories here.
if cat and cat.strip() in ('Poetry','Essays'):
self.story.addToList('category',stripHTML(cat))
if 'Characters' in label:
# Kirk and Spock are always added, ahead of any listed characters.
self.story.addToList('characters','Kirk')
self.story.addToList('characters','Spock')
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
charstext = [stripHTML(char) for char in chars]
for char in charstext:
self.story.addToList('characters',stripHTML(char))
## Not all sites use Genre, but there's no harm to
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Genre' in label:
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
genrestext = [stripHTML(genre) for genre in genres]
self.genre = ', '.join(genrestext)
for genre in genrestext:
self.story.addToList('genre',stripHTML(genre))
## In addition to Genre (which is very site specific) KSA
## has 'Story Type', which is much more what most sites
## call genre.
if 'Story Type' in label:
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=5')) # XXX
genrestext = [stripHTML(genre) for genre in genres]
# NOTE(review): this overwrites the self.genre set by the 'Genre' branch.
self.genre = ', '.join(genrestext)
for genre in genrestext:
self.story.addToList('genre',stripHTML(genre))
## Not all sites use Warnings, but there's no harm to
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Warnings' in label:
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warningstext = [stripHTML(warning) for warning in warnings]
self.warning = ', '.join(warningstext)
for warning in warningstext:
self.story.addToList('warnings',stripHTML(warning))
if 'Completed' in label:
if 'Yes' in value:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if 'Published' in label:
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
if 'Updated' in label:
# there's a stray [ at the end.
#value = value[0:-1]
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
# Series lookup is best effort: any failure in this try block is ignored.
try:
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = stripHTML(a)
series_url = 'http://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
# i counts story links on the series page to find this story's position.
i=1
for a in storyas:
# skip 'report this' and 'TOC' links
if 'contact.php' not in a['href'] and 'index' not in a['href']:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
self.story.setMetadata('seriesUrl',series_url)
break
i+=1
except:
# I find it hard to care if the series parsing fails
# NOTE(review): the bare except also hides unrelated errors -- consider narrowing.
pass
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Fetch one chapter page and return its story text.

    The chapter body is the ``div`` with id ``story``.  Raises
    exceptions.FailedToDownload when that element is absent; the message
    distinguishes a site-reported MySQL error from any other cause.
    """
    logger.debug('Getting chapter text from: %s' % url)

    data = self._fetchUrl(url)

    # Declaring br/hr as self-closing; otherwise soup eats the br/hr tags.
    soup = bs.BeautifulStoneSoup(data, selfClosingTags=('br','hr'))
    storydiv = soup.find('div', {'id' : 'story'})

    if storydiv is not None:
        return self.utf8FromSoup(url,storydiv)

    if "A fatal MySQL error was encountered" in data:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Database error on the site reported!" % url)

    raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

View file

@ -0,0 +1,250 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented by this module."""
    adapter_class = LibraryOfMoriaComAdapter
    return adapter_class
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class LibraryOfMoriaComAdapter(BaseSiteAdapter):
    """Site adapter for stories served from /a/ on www.libraryofmoria.com.

    Story metadata and the chapter list are read from the story page;
    chapter text is taken from the ``div`` with id ``story``.
    """

    def __init__(self, config, url):
        """Initialise site defaults and normalise the story URL.

        config and url are handed unchanged to BaseSiteAdapter.__init__.
        """
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/a/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','lom')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%B %d, %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain, including www because the site uses it."""
        return 'www.libraryofmoria.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://"+cls.getSiteDomain()+"/a/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that a valid story URL must match."""
        return re.escape("http://"+self.getSiteDomain()+"/a/viewstory.php?sid=")+r"\d+$"

    def _fetchStoryPage(self, url):
        """Fetch url, turning an HTTP 404 into StoryDoesNotExist.

        Any other urllib2.HTTPError is re-raised unchanged.
        """
        try:
            return self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            raise

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Populate self.story metadata and self.chapterUrls from the site.

        Raises:
            exceptions.StoryDoesNotExist: the story page returned HTTP 404.
            exceptions.AdultCheckRequired: the site shows a warning link
                and neither self.is_adult nor the is_adult config is set.
            exceptions.FailedToDownload: the site says the story has not
                been validated.
        """
        adult = self.is_adult or self.getConfig("is_adult")
        if adult:
            addurl = "&ageconsent=ok&warning=3"
        else:
            addurl=""

        # NOTE: unlike the retry below, the first request does not
        # append '&index=1'.
        url = self.url+addurl
        logger.debug("URL: "+url)
        data = self._fetchStoryPage(url)

        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
        if m is not None:
            if not adult:
                raise exceptions.AdultCheckRequired(self.url)
            # We tried the default and still got a warning, so
            # let's pull the warning number from the 'continue'
            # link and reload data.
            addurl = m.group(1)
            # correct stupid &amp; error in url.
            addurl = addurl.replace("&amp;","&")
            # index=1 makes sure we see the story chapter index.  Some
            # sites skip that for one-chapter stories.
            url = self.url+'&index=1'+addurl
            logger.debug("URL 2nd try: "+url)
            data = self._fetchStoryPage(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.
        storyId = self.story.getMetadata('storyId')

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+storyId+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/a/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+storyId+r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/a/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formatting, so it's a little ugly.

        def defaultGetattr(d,k):
            """Return d[k], or "" when d has no such item or is not indexable by k."""
            try:
                return d[k]
            except Exception:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Summary' in label:
                ## Everything until the next span class='label', or the
                ## end of the siblings when Summary is the last label.
                svalue = ""
                while value is not None and defaultGetattr(value,'class') != 'label':
                    svalue += str(value)
                    value = value.nextSibling
                self.setDescription(url,svalue)

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Type' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Warning' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=5'))
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        self._extractSeries(soup)

    def _extractSeries(self, soup):
        """Best-effort: record series name, position and URL if the story has one.

        Series data is optional, so any failure here is logged and ignored
        rather than aborting the download.
        """
        serieslink = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        if serieslink is None:
            # Story is not part of a series.
            return
        try:
            series_name = serieslink.string
            series_url = 'http://'+self.host+'/a/'+serieslink['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            wanted = 'viewstory.php?sid='+self.story.getMetadata('storyId')
            i=1
            for storya in storyas:
                # skip 'report this' and 'TOC' links; they don't count
                # towards the position in the series.
                if 'contact.php' in storya['href'] or 'index' in storya['href']:
                    continue
                if storya['href'] == wanted:
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl',series_url)
                    break
                i+=1
        except Exception as e:
            # I find it hard to care if the series parsing fails
            logger.debug("Series parsing failed for %s: %s" % (self.url, e))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return the text of its div#story.

        Raises exceptions.FailedToDownload when that element is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'story'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,257 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
import urlparse
import time
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
class LiteroticaSiteAdapter(BaseSiteAdapter):
    """Site adapter for stories on literotica.com and its language subdomains.

    Metadata is taken from the author's page rather than the story page;
    chapter text is assembled from every page of a chapter.
    """

    # Subdomains stories are served from; each also has a '.i' variant.
    _SUBDOMAINS = ('www', 'german', 'spanish', 'french', 'dutch',
                   'italian', 'romanian', 'portuguese', 'other')

    def __init__(self, config, url):
        """Derive the storyId from the URL path and normalise the URL.

        config and url are handed unchanged to BaseSiteAdapter.__init__.
        """
        BaseSiteAdapter.__init__(self, config, url)
        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["utf8",
                       "Windows-1252"]

        self.story.setMetadata('siteabbrev','litero')

        # normalize to first chapter.  Not sure if they ever have more than 2 digits.
        storyId = self.parsedUrl.path.split('/',)[2]
        # replace later chapters with first chapter but don't remove numbers
        # from the URL that disambiguate stories with the same title.
        storyId = re.sub(r"-ch-?\d\d", "", storyId)
        self.story.setMetadata('storyId', storyId)

        ## accept the '.i' url variant, but use the plain subdomain.
        ## The pattern must allow for the scheme in front of the host and
        ## the replacement must be a raw string so \1 is a backreference
        ## rather than the character 0x01.
        url = re.sub(r"^((?:https?://)?(?:" + "|".join(self._SUBDOMAINS) + r"))\.i\.",
                     r"\1.",
                     url)

        ## strip ?page=...
        url = re.sub(r"\?page=.*$", "", url)

        ## set url
        self._setURL(url)

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = '%m/%d/%y'

    @staticmethod
    def getSiteDomain():
        """Return the base site domain."""
        return 'literotica.com'

    @classmethod
    def getAcceptDomains(cls):
        """Return every accepted host: each subdomain and its '.i' variant."""
        domains = []
        for sub in cls._SUBDOMAINS:
            domains.append(sub + '.literotica.com')
            domains.append(sub + '.i.literotica.com')
        return domains

    @classmethod
    def getSiteExampleURLs(cls):
        """Return space-separated example story URLs."""
        return "http://www.literotica.com/s/story-title https://www.literotica.com/s/story-title http://portuguese.literotica.com/s/story-title http://german.literotica.com/s/story-title"

    def getSiteURLPattern(self):
        """Return the regex source that a valid story URL must match."""
        return r"https?://(" + "|".join(self._SUBDOMAINS) + r")(\.i)?\.literotica\.com/s/([a-zA-Z0-9_-]+)"

    def _makeSoup(self, data):
        """Parse data with BeautifulSoup and remove all HTML comments."""
        soup = bs.BeautifulSoup(data)
        for comment in soup.findAll(text=lambda text:isinstance(text, bs.Comment)):
            comment.extract()
        return soup

    def _fetchSoupOr404(self, url, parameters=None):
        """Fetch url and return comment-free soup.

        An HTTP 404 is raised as StoryDoesNotExist(url); other HTTP errors
        propagate unchanged.
        """
        try:
            if parameters is None:
                data = self._fetchUrl(url)
            else:
                data = self._fetchUrl(url, parameters)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(url)
            raise
        return self._makeSoup(data)

    def extractChapterUrlsAndMetadata(self):
        """
        NOTE: Some stories can have versions,
              e.g. /my-story-ch-05-version-10
        NOTE: If two stories share the same title, a running index is added,
              e.g.: /my-story-ch-02-1
        Strategy:
            * Go to author's page, search for the current story link,
            * If it's in a tr.root-story => One-part story
                * , get metadata and be done
            * If it's in a tr.sl => Chapter in series
                * Search up from there until we find a tr.ser-ttl (this is the
                  story)
                * Gather metadata
                * Search down from there for all tr.sl until the next
                  tr.ser-ttl, foreach
                    * Chapter link is there

        Raises:
            exceptions.AdultCheckRequired: is_adult is not set.
            exceptions.StoryDoesNotExist: story or author page gave HTTP 404.
            exceptions.FailedToDownload: the story could not be located or
                parsed on the author's page.
        """
        if not (self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(self.url)

        logger.debug("Chapter/Story URL: <%s> " % self.url)

        soup1 = self._fetchSoupOr404(self.url)

        # author
        a = soup1.find("span", "b-story-user-y")
        self.story.setMetadata('authorId', urlparse.parse_qs(a.a['href'].split('?')[1])['uid'][0])
        authorurl = a.a['href']
        if authorurl.startswith('//'):
            authorurl = self.parsedUrl.scheme+':'+authorurl
        self.story.setMetadata('authorUrl', authorurl)
        self.story.setMetadata('author', a.text)

        # get the author page
        soupAuth = self._fetchSoupOr404(authorurl)

        ## Find link to url in author's page
        ## site has started using //domain.name/asdf urls remove https?: from front
        storyLink = soupAuth.find('a', href=self.url[self.url.index(':')+1:])

        if storyLink is None:
            # (the original referenced an undefined local 'url' here.)
            raise exceptions.FailedToDownload("Couldn't find story <%s> on author's page <%s>" % (self.url, authorurl))

        urlTr = storyLink.parent.parent
        isSingleStory = urlTr['class'] != "sl"

        if isSingleStory:
            cells = urlTr.findAll("td")
            self.story.setMetadata('title', storyLink.text)
            self.story.setMetadata('description', cells[1].text)
            self.story.addToList('eroticatags', cells[2].text)
            date = cells[-1].text
            self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
            self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat))
            self.chapterUrls = [(storyLink.text, self.url)]
            self.story.setMetadata('numChapters', len(self.chapterUrls))
        else:
            # Walk up to the series heading row.
            seriesTr = urlTr.previousSibling
            while seriesTr is not None and seriesTr['class'] != 'ser-ttl':
                seriesTr = seriesTr.previousSibling
            if seriesTr is None:
                raise exceptions.FailedToDownload("Couldn't find series heading for <%s> on author's page <%s>" % (self.url, authorurl))

            heading = seriesTr.find("strong").text
            m = re.match(r"^(?P<title>.*?):\s(?P<numChapters>\d+)\sPart\sSeries$", heading)
            if m is None:
                raise exceptions.FailedToDownload("Couldn't parse series heading '%s' on author's page <%s>" % (heading, authorurl))
            self.story.setMetadata('title', m.group('title'))
            self.story.setMetadata('numChapters', int(m.group('numChapters')))

            ## Walk the chapters
            chapterTr = seriesTr.nextSibling
            self.chapterUrls = []
            dates = []
            descriptions = []
            while chapterTr is not None and chapterTr['class'] == 'sl':
                cells = chapterTr.findAll("td")
                descriptions.append(cells[1].text)
                chapterLink = chapterTr.find("td", "fc").find("a")
                # hrefs on the author page are scheme-relative (//host/...).
                self.chapterUrls.append((chapterLink.text, "http:" + chapterLink["href"]))
                self.story.addToList('eroticatags', cells[2].text)
                dates.append(makeDate(cells[-1].text, self.dateformat))
                chapterTr = chapterTr.nextSibling

            if not self.chapterUrls:
                raise exceptions.FailedToDownload("Couldn't find any chapters for <%s> on author's page <%s>" % (self.url, authorurl))

            ## Set description to joint chapter descriptions
            self.story.setMetadata('description', " / ".join(descriptions))

            ## Set the oldest date as publication date, the newest as update date
            dates.sort()
            self.story.setMetadata('datePublished', dates[0])
            self.story.setMetadata('dateUpdated', dates[-1])

            # normalize on first chapter URL.
            self._setURL(self.chapterUrls[0][1])

        # set storyId to 'title-author' to avoid duplicates
        # self.story.setMetadata('storyId',
        #     re.sub("[^a-z0-9]", "", self.story.getMetadata('title').lower())
        #     + "-"
        #     + re.sub("[^a-z0-9]", "", self.story.getMetadata('author').lower()))

        return

    def _storyBodyText(self, url, soup):
        """Return the text of the story body found in one parsed page."""
        body = soup.find('div', 'b-story-body-x').p
        body.name='div'
        body.append('<br />')
        return self.utf8FromSoup(url,body)

    def getChapterText(self, url):
        """Return the full text of a chapter, concatenating all its pages.

        The page count is read from the pager caption on the first page.
        Pages 2..N are fetched with a 'page' parameter, pausing half a
        second before each request.  A 404 on a later page raises
        exceptions.StoryDoesNotExist.
        """
        logger.debug('Getting chapter text from <%s>' % url)

        soup1 = self._makeSoup(self._fetchUrl(url))

        # get story text
        storytext = self._storyBodyText(url, soup1)

        # find num pages
        pgs = int(soup1.find("span", "b-pager-caption-t r-d45").string.split(' ')[0])
        logger.debug("pages: "+str(pgs))

        # get all the pages
        for i in xrange(2, pgs+1):
            logger.debug("fetching page "+str(i))
            time.sleep(0.5)
            soup2 = self._fetchSoupOr404(url, {'page': i})
            storytext += self._storyBodyText(url, soup2)

        return storytext
def getClass():
    """Return the adapter class implemented by this module."""
    adapter_class = LiteroticaSiteAdapter
    return adapter_class

View file

@ -0,0 +1,237 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented by this module."""
    adapter_class = LumosSycophantHexComAdapter
    return adapter_class
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class LumosSycophantHexComAdapter(BaseSiteAdapter):
    """Site adapter for stories on lumos.sycophanthex.com.

    Story metadata and the chapter list are read from the story index
    page; chapter text is taken from the ``div`` with id ``story``.
    """

    def __init__(self, config, url):
        """Initialise site defaults and normalise the story URL.

        config and url are handed unchanged to BaseSiteAdapter.__init__.
        """
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','lsph')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain."""
        return 'lumos.sycophanthex.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that a valid story URL must match."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Populate self.story metadata and self.chapterUrls from the site.

        Raises:
            exceptions.StoryDoesNotExist: the story page returned HTTP 404.
            exceptions.AdultCheckRequired: the page asks for age consent.
            exceptions.FailedToDownload: the site says the story has not
                been validated.
        """
        if self.is_adult or self.getConfig("is_adult"):
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&ageconsent=ok&warning=19"
        else:
            addurl=""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            raise

        # The actual text that is used to announce you need to be an
        # adult varies from site to site.  Again, print data before
        # the title search to troubleshoot.
        if "Age Consent Required" in data:
            raise exceptions.AdultCheckRequired(self.url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.
        storyId = self.story.getMetadata('storyId')

        ## Title
        pt = soup.find('div', {'id' : 'pagetitle'})
        a = pt.find('a', href=re.compile(r'viewstory.php\?sid='+storyId+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = pt.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # The rating is the first parenthesised text in the page title.
        rating=pt.text.split('(')[1].split(')')[0]
        self.story.setMetadata('rating', rating)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+storyId+r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formatting, so it's a little ugly.

        def defaultGetattr(d,k):
            """Return d[k], or "" when d has no such item or is not indexable by k."""
            try:
                return d[k]
            except Exception:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})

        # The description is everything in the labels' container that
        # precedes the first label.  Rewind to the first sibling, then
        # collect forward until a label span is reached.  Starting the
        # rewind at the label itself keeps this safe when the label has
        # no previous sibling.
        svalue = ""
        if labels:
            node = labels[0]
            while node.previousSibling is not None:
                node = node.previousSibling
            while node is not None and defaultGetattr(node,'class') != 'label':
                svalue += str(node)
                node = node.nextSibling
        self.setDescription(url,svalue)

        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Word count' in label:
                self.story.setMetadata('numWords', value.split(' -')[0])

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Complete' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value.split(' -')[0]), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        self._extractSeries(soup)

    def _extractSeries(self, soup):
        """Best-effort: record series name, position and URL if the story has one.

        Series data is optional, so any failure here is logged and ignored
        rather than aborting the download.
        """
        serieslink = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        if serieslink is None:
            # Story is not part of a series.
            return
        try:
            series_name = serieslink.string
            series_url = 'http://'+self.host+'/'+serieslink['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            wanted = 'viewstory.php?sid='+self.story.getMetadata('storyId')
            i=1
            for storya in storyas:
                if storya['href'] == wanted:
                    self.setSeries(series_name, i)
                    self.story.setMetadata('seriesUrl',series_url)
                    break
                i+=1
        except Exception as e:
            # I find it hard to care if the series parsing fails
            logger.debug("Series parsing failed for %s: %s" % (self.url, e))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return the text of its div#story.

        Raises exceptions.FailedToDownload when that element is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'story'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,237 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','mm')
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
# get storyId from url--url validation guarantees query correct
m = re.match(self.getSiteURLPattern(),url)
if m:
self.story.setMetadata('storyId',m.group('id'))
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/fanfic/view_st.php/'+self.story.getMetadata('storyId'))
else:
raise exceptions.InvalidStoryURL(url,
self.getSiteDomain(),
self.getSiteExampleURLs())
@staticmethod
def getSiteDomain():
return 'www.mediaminer.org'
@classmethod
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/fanfic/view_st.php/123456 http://"+self.getSiteDomain()+"/fanfic/view_ch.php/1234123/123444#fic_c"
def getSiteURLPattern(self):
## http://www.mediaminer.org/fanfic/view_st.php/76882
## http://www.mediaminer.org/fanfic/view_ch.php/167618/594087#fic_c
return re.escape("http://"+self.getSiteDomain())+\
"/fanfic/view_(st|ch)\.php/"+r"(?P<id>\d+)(/\d+(#fic_c)?)?$"
def extractChapterUrlsAndMetadata(self):
url = self.url
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# [ A - All Readers ], strip '[' ']'
## Above title because we remove the smtxt font to get title.
smtxt = soup.find("font",{"class":"smtxt"})
if not smtxt:
raise exceptions.StoryDoesNotExist(self.url)
rating = smtxt.string[1:-1]
self.story.setMetadata('rating',rating)
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"/fanfic/src.php/u/\d+"))
self.story.setMetadata('authorId',a['href'].split('/')[-1])
self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
self.story.setMetadata('author',a.string)
## Title - Good grief. Title varies by chaptered, 1chapter and 'type=one shot'--and even 'one-shot's can have titled chapter.
## But, if colspan=2, there's no chapter title.
## <td class="ffh">Atmosphere: Chapter 1</b> <font class="smtxt">[ P - Pre-Teen ]</font></td>
## <td colspan=2 class="ffh">Hearts of Ice <font class="smtxt">[ P - Pre-Teen ]</font></td>
## <td colspan=2 class="ffh">Suzaku no Princess <font class="smtxt">[ P - Pre-Teen ]</font></td>
## <td class="ffh">The Kraut, The Bartender, and The Drunkard: Chapter 1</b> <font class="smtxt">[ P - Pre-Teen ]</font></td>
## <td class="ffh">Betrayal and Justice: A Cold Heart</b> <font size="-1">( Chapter 1 )</font> <font class="smtxt">[ A - All Readers ]</font></td>
## <td class="ffh">Question and Answer: Question and Answer</b> <font size="-1">( One-Shot )</font> <font class="smtxt">[ A - All Readers ]</font></td>
title = soup.find('td',{'class':'ffh'})
for font in title.findAll('font'):
font.extract() # removes 'font' tags from inside the td.
if title.has_key('colspan'):
titlet = stripHTML(title)
else:
## No colspan, it's part chapter title--even if it's a one-shot.
titlet = ':'.join(stripHTML(title).split(':')[:-1]) # strip trailing 'Chapter X' or chapter title
self.story.setMetadata('title',titlet)
## The story title is difficult to reliably parse from the
## story pages. Getting it from the author page is, but costs
## another fetch.
# authsoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
# titlea = authsoup.find('a',{'href':'/fanfic/view_st.php/'+self.story.getMetadata('storyId')})
# self.story.setMetadata('title',titlea.text)
# save date from first for later.
firstdate=None
# Find the chapters
select = soup.find('select',{'name':'cid'})
if not select:
self.chapterUrls.append(( self.story.getMetadata('title'),self.url))
else:
for option in select.findAll("option"):
chapter = stripHTML(option.string)
## chapter can be: Chapter 7 [Jan 23, 2011]
## or: Vigilant Moonlight ( Chapter 1 ) [Jan 30, 2004]
## or even: Prologue ( Prologue ) [Jul 31, 2010]
m = re.match(r'^(.*?) (\( .*? \) )?\[(.*?)\]$',chapter)
chapter = m.group(1)
# save date from first for later.
if not firstdate:
firstdate = m.group(3)
self.chapterUrls.append((chapter,'http://'+self.host+'/fanfic/view_ch.php/'+self.story.getMetadata('storyId')+'/'+option['value']))
self.story.setMetadata('numChapters',len(self.chapterUrls))
# category
# <a href="/fanfic/src.php/a/567">Ranma 1/2</a>
for a in soup.findAll('a',href=re.compile(r"^/fanfic/src.php/a/")):
self.story.addToList('category',a.string)
# genre
# <a href="/fanfic/src.php/a/567">Ranma 1/2</a>
for a in soup.findAll('a',href=re.compile(r"^/fanfic/src.php/g/")):
self.story.addToList('genre',a.string)
# if firstdate, then the block below will only have last updated.
if firstdate:
self.story.setMetadata('datePublished', makeDate(firstdate, "%b %d, %Y"))
# Everything else is in <tr bgcolor="#EEEED4">
metastr = stripHTML(soup.find("tr",{"bgcolor":"#EEEED4"})).replace('\n',' ').replace('\r',' ').replace('\t',' ')
# Latest Revision: August 03, 2010
m = re.match(r".*?(?:Latest Revision|Uploaded On): ([a-zA-Z]+ \d\d, \d\d\d\d)",metastr)
if m:
self.story.setMetadata('dateUpdated', makeDate(m.group(1), "%B %d, %Y"))
if not firstdate:
self.story.setMetadata('datePublished',
self.story.getMetadataRaw('dateUpdated'))
else:
self.story.setMetadata('dateUpdated',
self.story.getMetadataRaw('datePublished'))
# Words: 123456
m = re.match(r".*?\| Words: (\d+) \|",metastr)
if m:
self.story.setMetadata('numWords', m.group(1))
# Summary: ....
m = re.match(r".*?Summary: (.*)$",metastr)
if m:
self.setDescription(url, m.group(1))
#self.story.setMetadata('description', m.group(1))
# completed
m = re.match(r".*?Status: Completed.*?",metastr)
if m:
self.story.setMetadata('status','Completed')
else:
self.story.setMetadata('status','In-Progress')
return
def getChapterText(self, url):
    """Download one chapter page and return its story text.

    The page is fetched with self._fetchUrl and parsed with
    BeautifulStoneSoup.  Two layouts are handled:

    * newer stories: every <div align="left"> after the ``fic_c`` anchor
      is a paragraph; they are renamed to <p> and collected in the anchor.
    * older stories: the raw page is cut at the ``<a name="fic_c">``
      marker, re-parsed, and the known navigation/footer elements removed.

    Returns whatever self.utf8FromSoup(url, <soup or tag>) returns.
    Raises exceptions.FailedToDownload when the ``fic_c`` anchor is
    missing from the parsed page or from the raw markup.
    """
    logger.debug('Getting chapter text from: %s' % url)

    data = self._fetchUrl(url)
    # selfClosingTags: otherwise soup eats the br/hr tags.
    soup = bs.BeautifulStoneSoup(data, selfClosingTags=('br','hr'))

    anchor = soup.find('a',{'name':'fic_c'})
    if anchor is None:
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    ## find divs with align=left, those are paragraphs in newer stories.
    divlist = anchor.findAllNext('div',{'align':'left'})
    if divlist:
        for div in divlist:
            # mediaminer uses div with a margin for paragraphs.
            div.name='p'
            # cheat! stuff all the content divs into anchor just as a
            # holder.
            anchor.append(div)
            del div['style']
            del div['align']
        anchor.name='div'
        return self.utf8FromSoup(url,anchor)

    logger.debug('Using kludgey text find for older mediaminer story.')
    ## Some older mediaminer stories are unparsable with BeautifulSoup.
    ## Really nasty formatting.  Sooo... Cheat!  Parse it ourselves a
    ## bit first.  Story stuff falls between the fic_c anchor and the
    ## end of the page.
    start = data.find('<a name="fic_c">')
    if start == -1:
        # The parser found the anchor but the raw markup is spelled
        # differently.  Slicing from -1 would keep only the last
        # character of the page and return an empty chapter silently.
        raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
    data = "<div id='HERE'>" + data[start:] +"</div>"
    soup = bs.BeautifulStoneSoup(data, selfClosingTags=('br','hr'))

    # Strip the page furniture that follows the story text.
    unwanted = soup.findAll('td',{'class':'ffh'}) + \
        soup.findAll('div',{'class':'acl'}) + \
        soup.findAll('div',{'class':'footer smtxt'}) + \
        soup.findAll('table',{'class':'tbbrdr'})
    for tag in unwanted:
        tag.extract() # remove tag from soup.

    return self.utf8FromSoup(url,soup)
def getClass():
    """Return the adapter class this module provides."""
    return MediaMinerOrgSiteAdapter

View file

@ -0,0 +1,293 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class this module provides."""
    return MerlinFicDtwinsCoUk
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class MerlinFicDtwinsCoUk(BaseSiteAdapter):
    """Adapter for the eFiction archive at merlinfic.dtwins.co.uk.

    Normalizes story URLs to ``/viewstory.php?sid=<id>``, logs in when the
    site asks for it, passes the age/warning interstitial when ``is_adult``
    is set, and scrapes metadata, the chapter list and chapter text.
    """

    def __init__(self, config, url):
        """Record the story id, the normalized URL and site settings.

        config and url are handed unchanged to BaseSiteAdapter.__init__
        (which presumably provides self.story and self.parsedUrl).
        """
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        # if left empty, site doesn't return any message at all.
        self.username = "NoneGiven"
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','mrfd')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%b %d, %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain (including www, if the site uses it)."""
        return 'merlinfic.dtwins.co.uk'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example of a story URL this adapter accepts."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that a valid story URL must match."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True when the page text contains a login-required or
        login-failure message."""
        return ('Registered Users Only' in data
                or 'There is no such account on our website' in data
                or "That password doesn't match the one in our database" in data)

    def performLogin(self, url):
        """Post credentials to the site's login form.

        Uses self.username/self.password when a password has been set on
        the adapter, otherwise the configured "username"/"password".

        Returns True on success.
        Raises exceptions.FailedToLogin(url, penname) when the response
        does not contain "Member Account".
        """
        params = {}
        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['cookiecheck'] = '1'
        params['submit'] = 'Submit'

        loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              params['penname']))

        d = self._fetchUrl(loginUrl, params)

        if "Member Account" not in d:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                              params['penname']))
            raise exceptions.FailedToLogin(url,params['penname'])
        return True

    def _fetchStoryData(self, url):
        """Fetch url, turning an HTTP 404 into StoryDoesNotExist."""
        try:
            return self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

    def _extractSeries(self, soup):
        """Best effort: record series name, position and URL.

        Series information is optional, so any failure here is logged at
        debug level and otherwise ignored.
        """
        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            if a is None:
                return # story is not part of a series.
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            wanted = 'viewstory.php?sid='+self.story.getMetadata('storyId')
            i=1
            for a in storyas:
                # skip 'report this' and 'TOC' links
                if 'contact.php' not in a['href'] and 'index' not in a['href']:
                    if a['href'] == wanted:
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl',series_url)
                        break
                    i+=1
        except Exception as e:
            # I find it hard to care if the series parsing fails, but
            # don't swallow KeyboardInterrupt/SystemExit along with it.
            logger.debug("Series parsing failed: %s" % e)

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and fill in metadata and chapters.

        Populates self.story metadata (title, author, rating, dates, ...)
        and self.chapterUrls with (title, url) tuples.

        Raises:
          exceptions.StoryDoesNotExist -- the index page returned HTTP 404.
          exceptions.FailedToLogin -- login was required and failed.
          exceptions.AdultCheckRequired -- the site served a warning page
              and is_adult is not set.
          exceptions.FailedToDownload -- the story is not validated yet.
        """
        adult = self.is_adult or self.getConfig("is_adult")
        if adult:
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&ageconsent=ok&warning=4"
        else:
            addurl=""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)
        data = self._fetchStoryData(url)

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
        if m is not None:
            if not adult:
                raise exceptions.AdultCheckRequired(self.url)
            # We tried the default and still got a warning, so
            # let's pull the warning number from the 'continue'
            # link and reload data.  The link carries &amp; where a
            # plain & belongs.
            addurl = m.group(1).replace("&amp;","&")
            url = self.url+'&index=1'+addurl
            logger.debug("URL 2nd try: "+url)
            data = self._fetchStoryData(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.
        storyId = self.story.getMetadata('storyId')

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+storyId+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+storyId+r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formatting, so it's a little ugly.

        def defaultGetattr(d,k):
            """Return d[k], or "" when d has no such attribute/key."""
            try:
                return d[k]
            except (KeyError, TypeError):
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string
            if label is None:
                # the span does not wrap a single text node; nothing to
                # match against.
                continue

            if 'Summary' in label:
                ## Everything until the next span class='label', or the
                ## end of the siblings when Summary is the last label.
                pieces = []
                while value is not None and defaultGetattr(value,'class') != 'label':
                    pieces.append(str(value))
                    value = value.nextSibling
                self.setDescription(url,''.join(pieces))

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Pairing' in label:
                ships = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
                for ship in ships:
                    self.story.addToList('ships',ship.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=3'))
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        self._extractSeries(soup)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch a chapter page and return the content of <div id="story">.

        Raises exceptions.FailedToDownload when that div is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        # selfClosingTags: otherwise soup eats the br/hr tags.
        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr'))

        div = soup.find('div', {'id' : 'story'})
        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,289 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
# Search for XXX comments--that's where things are most likely to need changing.
# The downloader calls this function in every adapter_*.py file in this
# dir to register the adapter class, so it must return the class defined
# below it. That, plus getSiteDomain(), takes care of 'Registering'.
def getClass():
    """Return the adapter class this module provides."""
    return MidnightwhispersCaAdapter # XXX
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class MidnightwhispersCaAdapter(BaseSiteAdapter): # XXX
    """Adapter for the eFiction archive at www.midnightwhispers.ca.

    Normalizes story URLs to ``/viewstory.php?sid=<id>``, passes the
    age/warning interstitial when ``is_adult`` is set, and scrapes
    metadata, the chapter list and chapter text.
    """

    def __init__(self, config, url):
        """Record the story id, the normalized URL and site settings.

        config and url are handed unchanged to BaseSiteAdapter.__init__
        (which presumably provides self.story and self.parsedUrl).
        """
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        # XXX Some sites serve stories under a /fanfic prefix; this one
        # does not.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','mw') # XXX

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%B %d, %Y" # XXX

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain (including www, since the site uses it)."""
        return 'www.midnightwhispers.ca' # XXX

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example of a story URL this adapter accepts."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that a valid story URL must match."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    def _fetchStoryData(self, url):
        """Fetch url, turning an HTTP 404 into StoryDoesNotExist."""
        try:
            return self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

    @staticmethod
    def _linkTexts(anchors):
        """Return the .string of each anchor, skipping anchors whose
        .string is None (no single text node to report)."""
        return [a.string for a in anchors if a.string is not None]

    def _extractSeries(self, soup):
        """Best effort: record series name, position and URL.

        Series information is optional, so any failure here is logged at
        debug level and otherwise ignored.
        """
        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            if a is None:
                return # story is not part of a series.
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
            wanted = 'viewstory.php?sid='+self.story.getMetadata('storyId')
            i=1
            for a in storyas:
                # skip 'report this' and 'TOC' links
                if 'contact.php' not in a['href'] and 'index' not in a['href']:
                    if a['href'] == wanted:
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl',series_url)
                        break
                    i+=1
        except Exception as e:
            # I find it hard to care if the series parsing fails, but
            # don't swallow KeyboardInterrupt/SystemExit along with it.
            logger.debug("Series parsing failed: %s" % e)

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and fill in metadata and chapters.

        Populates self.story metadata (title, author, rating, dates, ...)
        and self.chapterUrls with (title, url) tuples.  Also sets
        self.genre / self.warning to comma-joined strings when those
        labels are present.

        Raises:
          exceptions.StoryDoesNotExist -- the index page returned HTTP 404.
          exceptions.AdultCheckRequired -- the site served a warning page
              and is_adult is not set.
          exceptions.FailedToDownload -- the story is not validated yet.
        """
        adult = self.is_adult or self.getConfig("is_adult")
        if adult:
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            # Furthermore, there's a couple sites now with more than
            # one warning level for different ratings.  And they're
            # fussy about it.  midnightwhispers has three: 10, 3 & 5.
            # we'll try 5 first.
            addurl = "&ageconsent=ok&warning=5" # XXX
        else:
            addurl=""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)
        data = self._fetchStoryData(url)

        # The actual text that is used to announce you need to be an
        # adult varies from site to site, and can change by warning
        # level, so look for the warning pass url instead, e.g.:
        # viewstory.php?sid=1882&amp;warning=4
        # viewstory.php?sid=1654&amp;ageconsent=ok&amp;warning=5
        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
        if m is not None:
            if not adult:
                raise exceptions.AdultCheckRequired(self.url)
            # We tried the default and still got a warning, so
            # let's pull the warning number from the 'continue'
            # link and reload data.  The link carries &amp; where a
            # plain & belongs.
            addurl = m.group(1).replace("&amp;","&")
            url = self.url+'&index=1'+addurl
            logger.debug("URL 2nd try: "+url)
            data = self._fetchStoryData(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.
        storyId = self.story.getMetadata('storyId')

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+storyId+"$"))
        self.story.setMetadata('title',stripHTML(a)) # title's inside a <b> tag.

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+storyId+r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formatting, so it's a little ugly.

        def defaultGetattr(d,k):
            """Return d[k], or "" when d has no such attribute/key."""
            try:
                return d[k]
            except (KeyError, TypeError):
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string
            if label is None:
                # the span does not wrap a single text node; nothing to
                # match against.
                continue

            if 'Summary' in label:
                ## Everything until the next span class='label', or the
                ## end of the siblings when Summary is the last label.
                pieces = []
                while value is not None and defaultGetattr(value,'class') != 'label':
                    pieces.append(str(value))
                    value = value.nextSibling
                self.setDescription(url,''.join(pieces))

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in self._linkTexts(cats):
                    self.story.addToList('category',cat)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in self._linkTexts(chars):
                    self.story.addToList('characters',char)

            ## Not all sites use Genre, but there's no harm to
            ## leaving it in.  Check to make sure the type_id number
            ## is correct, though--it's site specific.
            # NOTE(review): Genre and Warnings both filter on type_id=2
            # here; one of them is probably wrong for this site -- confirm
            # against the site's browse links.
            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
                genrestext = self._linkTexts(genres)
                self.genre = ', '.join(genrestext)
                for genre in genrestext:
                    self.story.addToList('genre',genre)

            ## Not all sites use Warnings, but there's no harm to
            ## leaving it in.  Check to make sure the type_id number
            ## is correct, though--it's site specific.
            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
                warningstext = self._linkTexts(warnings)
                self.warning = ', '.join(warningstext)
                for warning in warningstext:
                    self.story.addToList('warnings',warning)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        self._extractSeries(soup)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch a chapter page and return the content of <div id="story">.

        Raises exceptions.FailedToDownload when that div is missing; the
        message distinguishes a site-reported MySQL error from any other
        missing-element case.
        """
        logger.debug('Getting chapter text from: %s' % url)

        data = self._fetchUrl(url)
        # selfClosingTags: otherwise soup eats the br/hr tags.
        soup = bs.BeautifulStoneSoup(data,
                                     selfClosingTags=('br','hr'))

        div = soup.find('div', {'id' : 'story'})
        if div is None:
            if "A fatal MySQL error was encountered" in data:
                raise exceptions.FailedToDownload("Error downloading Chapter: %s! Database error on the site reported!" % url)
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,335 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
# The downloader calls this function in every adapter_*.py file in this
# dir to register the adapter class, so it must return the class defined
# below it. That, plus getSiteDomain(), takes care of 'Registering'.
def getClass():
    """Return the adapter class this module provides."""
    return MuggleNetComAdapter # XXX
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class MuggleNetComAdapter(BaseSiteAdapter): # XXX
    """Adapter for the eFiction archive at fanfiction.mugglenet.com.

    Accepts both fanfiction.mugglenet.com and fanfic.mugglenet.com URLs,
    normalizes them to ``/viewstory.php?sid=<id>`` on the former, logs in
    when the site asks for it, passes the age/warning interstitial when
    ``is_adult`` is set, and scrapes metadata, chapters and chapter text.
    """

    def __init__(self, config, url):
        """Record the story id, the normalized URL and site settings.

        config and url are handed unchanged to BaseSiteAdapter.__init__
        (which presumably provides self.story and self.parsedUrl).
        """
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        # if left empty, site doesn't return any message at all.
        self.username = "NoneGiven"
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','mgln') # XXX

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%y" # XXX

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the canonical site domain."""
        return 'fanfiction.mugglenet.com'

    @classmethod
    def getAcceptDomains(cls):
        """Return every domain whose story URLs this adapter handles."""
        return ['fanfiction.mugglenet.com','fanfic.mugglenet.com']

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example of a story URL this adapter accepts."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that a valid story URL must match
        (either accepted domain)."""
        return re.escape("http://")+r"fanfic(tion)?\.mugglenet\.com"+re.escape("/viewstory.php?sid=")+r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True when the page text contains a login-required or
        login-failure message."""
        return ("class='errortext'>Registered Users Only" in data
                or 'There is no such account on our website' in data
                or "That password doesn't match the one in our database" in data)

    def performLogin(self, url):
        """Post credentials to the site's login form.

        Uses self.username/self.password when a password has been set on
        the adapter, otherwise the configured "username"/"password".

        Returns True on success.
        Raises exceptions.FailedToLogin(url, penname) when the response
        does not contain "Member Account".
        """
        params = {}
        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['cookiecheck'] = '1'
        params['submit'] = 'Submit'

        loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login&sid='+self.story.getMetadata('storyId')
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              params['penname']))

        d = self._fetchUrl(loginUrl, params)

        if "Member Account" not in d:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                              params['penname']))
            raise exceptions.FailedToLogin(url,params['penname'])
        return True

    def _fetchStoryData(self, url):
        """Fetch url, turning an HTTP 404 into StoryDoesNotExist."""
        try:
            return self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

    @staticmethod
    def _linkTexts(anchors):
        """Return the .string of each anchor, skipping anchors whose
        .string is None (no single text node to report)."""
        return [a.string for a in anchors if a.string is not None]

    def _extractSummary(self, url, data):
        """Set the description from the raw page text between the
        Summary and Rated labels.

        Works on the raw markup because a quote in the meta description
        truncates it and poorly formatted summary HTML breaks sibling
        walking.  When either marker is missing the description is left
        unset rather than aborting the whole metadata pass.
        """
        start='<span class="label">Summary: </span>'
        end='<span class="label">Rated:</span>'
        begin = data.find(start)
        if begin == -1:
            logger.debug("Summary label not found; description left unset.")
            return
        begin += len(start)
        # look for the end marker after the summary, not from the top.
        stop = data.find(end, begin)
        if stop == -1:
            logger.debug("Rated label not found after summary; description left unset.")
            return
        self.setDescription(url,bs.BeautifulSoup(data[begin:stop]))

    def _extractSeries(self, soup):
        """Best effort: record series name, position and URL.

        Series information is optional, so any failure here is logged at
        debug level and otherwise ignored.
        """
        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            if a is None:
                return # story is not part of a series.
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            wanted = 'viewstory.php?sid='+self.story.getMetadata('storyId')
            i=1
            for a in storyas:
                # skip 'report this' and 'TOC' links
                if 'contact.php' not in a['href'] and 'index' not in a['href']:
                    if a['href'] == wanted:
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl',series_url)
                        break
                    i+=1
        except Exception as e:
            # I find it hard to care if the series parsing fails, but
            # don't swallow KeyboardInterrupt/SystemExit along with it.
            logger.debug("Series parsing failed: %s" % e)

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and fill in metadata and chapters.

        Populates self.story metadata (title, author, rating, dates, ...)
        and self.chapterUrls with (title, url) tuples.  Also sets
        self.genre / self.warning to comma-joined strings when those
        labels are present.

        Raises:
          exceptions.StoryDoesNotExist -- the index page returned HTTP 404.
          exceptions.FailedToLogin -- login was required and failed.
          exceptions.AdultCheckRequired -- the site served a warning page
              and is_adult is not set.
          exceptions.FailedToDownload -- the story is not validated yet.
        """
        adult = self.is_adult or self.getConfig("is_adult")
        if adult:
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            # http://fanfiction.mugglenet.com/viewstory.php?sid=91079&ageconsent=ok&warning=3
            addurl = "&ageconsent=ok&warning=3" # XXX &warning=5
        else:
            addurl=""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)
        data = self._fetchStoryData(url)

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        storyId = self.story.getMetadata('storyId')

        # The actual text that is used to announce you need to be an
        # adult varies from site to site, and can change by warning
        # level, so look for the warning pass url instead, e.g.:
        # viewstory.php?sid=1882&amp;warning=4
        # viewstory.php?sid=1654&amp;ageconsent=ok&amp;warning=5
        m = re.search(r"'viewstory.php\?sid=%s((?:&amp;ageconsent=ok)?&amp;warning=\d+)'"%storyId,data)
        if m is not None:
            if not adult:
                raise exceptions.AdultCheckRequired(self.url)
            # We tried the default and still got a warning, so
            # let's pull the warning number from the 'continue'
            # link and reload data.  The link carries &amp; where a
            # plain & belongs.
            addurl = m.group(1).replace("&amp;","&")
            url = self.url+'&index=1'+addurl
            logger.debug("URL 2nd try: "+url)
            data = self._fetchStoryData(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+storyId+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+storyId+r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        self._extractSummary(url, data)

        # eFiction sites don't help us out a lot with their meta data
        # formatting, so it's a little ugly.

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string
            if label is None:
                # the span does not wrap a single text node; nothing to
                # match against.
                continue

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in self._linkTexts(cats):
                    self.story.addToList('category',cat)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in self._linkTexts(chars):
                    self.story.addToList('characters',char)

            ## Not all sites use Genre, but there's no harm to
            ## leaving it in.  Check to make sure the type_id number
            ## is correct, though--it's site specific.
            # NOTE(review): Genre and Warnings both filter on type_id=2
            # here; one of them is probably wrong for this site -- confirm
            # against the site's browse links.
            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
                genrestext = self._linkTexts(genres)
                self.genre = ', '.join(genrestext)
                for genre in genrestext:
                    self.story.addToList('genre',genre)

            ## Not all sites use Warnings, but there's no harm to
            ## leaving it in.  Check to make sure the type_id number
            ## is correct, though--it's site specific.
            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
                warningstext = self._linkTexts(warnings)
                self.warning = ', '.join(warningstext)
                for warning in warningstext:
                    self.story.addToList('warnings',warning)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        self._extractSeries(soup)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch a chapter page and return the content of <div id="story">.

        Raises exceptions.FailedToDownload when that div is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        # selfClosingTags: otherwise soup eats the br/hr tags.
        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr'))

        div = soup.find('div', {'id' : 'story'})
        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,212 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented by this module."""
    return NationalLibraryNetAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class NationalLibraryNetAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
self.is_adult=False
# get storyId from url--url validation guarantees query is only storyid=1234
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?storyid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','ntlb')
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%m-%d-%y"
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
return 'national-library.net'
@classmethod
def getAcceptDomains(cls):
return ['www.national-library.net','national-library.net']
@classmethod
def getSiteExampleURLs(self):
# ONLY the stories archived on or after June 17, 2006 and that are hosted on the website:
return "http://"+self.getSiteDomain()+"/viewstory.php?storyid=1234"
def getSiteURLPattern(self):
return re.escape("http://")+"(www\.)?"+re.escape(self.getSiteDomain()+"/viewstory.php?storyid=")+r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
# index=1 makes sure we see the story chapter index. Some
# sites skip that for one-chapter stories.
url = self.url
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# print data
# Now go hunting for all the meta data and the chapter list.
## Title
a = soup.find('h1')
self.story.setMetadata('title',stripHTML(a))
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"authorresults.php\?author=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for p in soup.findAll('p'):
chapters = p.findAll('a', href=re.compile(r'viewstory.php\?storyid='+self.story.getMetadata('storyId')+"&chapnum=\d+$"))
if len(chapters) > 0:
for chapter in chapters:
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']))
break
self.story.setMetadata('numChapters',len(self.chapterUrls))
self.story.setMetadata('status', 'Completed')
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.findAll('b')
for x in range(2,len(labels)):
value = labels[x].nextSibling
label = labels[x].string
if 'Summary' in label:
self.setDescription(url,value)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rating' in label:
self.story.setMetadata('rating', stripHTML(value.nextSibling))
if 'Word Count' in label:
self.story.setMetadata('numWords', value.string)
if 'Category' in label:
for cat in value.string.split(', '):
self.story.addToList('category',cat)
if 'Crossover Shows' in label:
for cat in value.string.split(', '):
if "No Show" not in cat:
self.story.addToList('category',cat)
if 'Character' in label:
for char in value.string.split(', '):
self.story.addToList('characters',char)
if 'Pairing' in label:
for char in value.string.split(', '):
self.story.addToList('ships',char)
if 'Warnings' in label:
for warning in value.string.split(', '):
self.story.addToList('warnings',warning)
if 'Published' in label:
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
if 'Series' in label:
self.setSeries(stripHTML(value.nextSibling), value.nextSibling.nextSibling.string[2:])
self.story.setMetadata('seriesUrl','http://'+self.host+'/'+value.nextSibling['href'])
asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
story=asoup.find('a', href=re.compile(r'viewstory.php\?storyid='+self.story.getMetadata('storyId')))
a=story.findNext(text=re.compile('Genre')).parent.nextSibling.string.split(', ')
for genre in a:
self.story.setMetadata('genre', genre)
a=story.findNext(text=re.compile('Archived'))
self.story.setMetadata('datePublished', makeDate(stripHTML(a.parent.nextSibling), self.dateformat))
self.story.setMetadata('dateUpdated', makeDate(stripHTML(a.parent.nextSibling), self.dateformat))
# grab the text for an individual chapter.
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
div = soup.find('div')
# bit messy since higly inconsistent
for p in soup.findAll('p', {'align' : 'center'}):
p.extract()
p = soup.findAll('p')
for x in range(0,3):
p[x].extract()
if "Chapters: " in stripHTML(p[3]):
p[3].extract()
for x in range(len(p)-2,len(p)-1):
p[x].extract()
for p in soup.findAll('h1'):
p.extract()
for p in soup.findAll('h3'):
p.extract()
for p in soup.findAll('a'):
p.extract()
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,218 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented by this module."""
    return NCISFicComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class NCISFicComAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
self.is_adult=False
# get storyId from url--url validation guarantees query is only storyid=1234
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?storyid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','ncisf')
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%m-%d-%y"
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
return 'ncisfic.com'
@classmethod
def getAcceptDomains(cls):
return ['www.ncisfic.com','ncisfic.com']
@classmethod
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/viewstory.php?storyid=1234"
def getSiteURLPattern(self):
return re.escape("http://")+"(www\.)?"+re.escape(self.getSiteDomain()+"/viewstory.php?storyid=")+r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
# index=1 makes sure we see the story chapter index. Some
# sites skip that for one-chapter stories.
url = self.url
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# print data
# Now go hunting for all the meta data and the chapter list.
## Title
a = soup.find('h1')
self.story.setMetadata('title',stripHTML(a))
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"authorresults.php\?author=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for p in soup.findAll('p'):
chapters = p.findAll('a', href=re.compile(r'viewstory.php\?storyid='+self.story.getMetadata('storyId')+"&chapnum=\d+$"))
if len(chapters) > 0:
for chapter in chapters:
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']))
break
self.story.setMetadata('numChapters',len(self.chapterUrls))
self.story.setMetadata('status', 'Completed')
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.findAll('b')
for x in range(2,len(labels)):
value = labels[x].nextSibling
label = labels[x].string
if 'Summary' in label:
self.setDescription(url,value)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rating' in label:
self.story.setMetadata('rating', stripHTML(value.nextSibling))
if 'Word Count' in label:
self.story.setMetadata('numWords', value.string)
if 'Category' in label:
for cat in value.string.split(', '):
self.story.addToList('category',cat)
if 'Crossover Shows' in label:
for cat in value.string.split(', '):
if "No Show" not in cat:
self.story.addToList('category',cat)
if 'Character' in label:
for char in value.string.split(', '):
self.story.addToList('characters',char)
if 'Pairing' in label:
for char in value.string.split(', '):
self.story.addToList('ships',char)
if 'Warnings' in label:
for warning in value.string.split(', '):
self.story.addToList('warnings',warning)
if 'Published' in label:
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
if 'Series' in label:
if "No Series" not in value.nextSibling.string:
self.setSeries(stripHTML(value.nextSibling), value.nextSibling.nextSibling.string[2:])
self.story.setMetadata('seriesUrl','http://'+self.host+'/'+value.nextSibling['href'])
asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
story=asoup.find('a', href=re.compile(r'viewstory.php\?storyid='+self.story.getMetadata('storyId')))
a=story.findNext('font')
if 'Complete' in a.string:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
a=story.findNext(text=re.compile('Genre')).parent.nextSibling.string.split(', ')
for genre in a:
self.story.setMetadata('genre', genre)
a=story.findNext(text=re.compile('Archived'))
self.story.setMetadata('datePublished', makeDate(stripHTML(a.parent.nextSibling), self.dateformat))
self.story.setMetadata('dateUpdated', makeDate(stripHTML(a.parent.nextSibling), self.dateformat))
# grab the text for an individual chapter.
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
div = soup.find('div')
# bit messy since higly inconsistent
for p in soup.findAll('p', {'align' : 'center'}):
p.extract()
p = soup.findAll('p')
for x in range(0,3):
p[x].extract()
if "Chapters: " in stripHTML(p[3]):
p[3].extract()
for x in range(len(p)-2,len(p)-1):
p[x].extract()
for p in soup.findAll('h1'):
p.extract()
for p in soup.findAll('h3'):
p.extract()
for p in soup.findAll('a'):
p.extract()
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,210 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented by this module."""
    return NCISFictionNetAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class NCISFictionNetAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.decode = ["iso-8859-1",
"Windows-1252"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
self.is_adult=False
# get storyId from url--url validation guarantees query is only sid=1234
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
# normalized story URL.
self._setURL("http://"+self.getSiteDomain()\
+"/chapters.php?stid="+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','ncisfn')
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%d/%m/%Y"
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'www.ncisfiction.net'
## Changed from www.ncisfiction.com to www.ncisfiction.net Oct
## 2012 due to the ncisfiction.com domain expiring. Still accept
## .com domains for existing updates, etc.
@classmethod
def getAcceptDomains(cls):
return ['www.ncisfiction.net','www.ncisfiction.com']
@classmethod
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/story.php?stid=01234 http://"+self.getSiteDomain()+"/chapters.php?stid=1234"
def getSiteURLPattern(self):
return r'http://www\.ncisfiction\.(net|com)/(chapters|story)?.php\?stid=\d+'
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
# index=1 makes sure we see the story chapter index. Some
# sites skip that for one-chapter stories.
url = self.url
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulStoneSoup(data)
# print data
# Now go hunting for all the meta data and the chapter list.
## Title and author
a = soup.find('div', {'class' : 'main_title'})
aut = a.find('a')
self.story.setMetadata('authorId',aut['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+aut['href'])
self.story.setMetadata('author',aut.string)
aut.extract()
self.story.setMetadata('title',stripHTML(a)[:len(stripHTML(a))-2])
# Find the chapters:
i=0
chapters=soup.findAll('table', {'class' : 'story_table'})
for chapter in chapters:
ch=chapter.find('a')
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(ch),'http://'+self.host+'/'+ch['href']))
if i == 0:
self.story.setMetadata('datePublished', makeDate(stripHTML(chapter.find('td')).split('Added: ')[1], self.dateformat))
if i == len(chapters)-1:
self.story.setMetadata('dateUpdated', makeDate(stripHTML(chapter.find('td')).split('Added: ')[1], self.dateformat))
i=i+1
self.story.setMetadata('numChapters',len(self.chapterUrls))
# eFiction sites don't help us out a lot with their meta data
# formating, so it's a little ugly.
info = soup.find('table', {'class' : 'story_info'})
# no convenient way to calculate word count as it is logged differently for stories with and without series
labels = info.findAll('tr')
for tr in labels:
value = tr.find('td')
label = tr.find('th').string
if 'Summary' in label:
self.setDescription(url,value)
if 'Rating' in label:
self.story.setMetadata('rating', value.string)
if 'Category' in label:
cats = value.findAll('a')
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = value.findAll('a')
for char in chars:
self.story.addToList('characters',char.string)
if 'Pairing' in label:
ships = value.findAll('a')
for ship in ships:
self.story.addToList('ships',ship.string)
if 'Genre' in label:
genres = value.findAll('a')
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = value.findAll('a')
for warning in warnings:
self.story.addToList('warnings',warning.string)
if 'Status' in label:
if 'not completed' in value.text:
self.story.setMetadata('status', 'In-Progress')
else:
self.story.setMetadata('status', 'Completed')
try:
# Find Series name from series URL.
a = soup.find('div',{'class' : 'sub_header'})
series_name = a.find('a').string
i = a.text.split('#')[1]
self.setSeries(series_name, i)
self.story.setMetadata('seriesUrl','http://'+self.host+'/'+a.find('a')['href'])
except:
# I find it hard to care if the series parsing fails
pass
# grab the text for an individual chapter.
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
div = soup.find('div', {'class' : 'story_text'})
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,212 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented by this module."""
    return NetRaptorOrgAdapter
class NetRaptorOrgAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
self.is_adult=False
# get storyId from url--url validation guarantees query is only sid=1234
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/fanfiction/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','netrap')
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%d/%m/%y"
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'netraptor.org'
@classmethod
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/fanfiction/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/fanfiction/viewstory.php?sid=")+r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
url = self.url+'&index=1'
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# print data
# Now go hunting for all the meta data and the chapter list.
## Title
pagetitle = soup.find('div',{'id':'pagetitle'})
a = pagetitle.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
self.story.setMetadata('title',stripHTML(a))
# Find authorid and URL from... author url.
a = pagetitle.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/fanfiction/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/fanfiction/'+chapter['href']))
self.story.setMetadata('numChapters',len(self.chapterUrls))
# eFiction sites don't help us out a lot with their meta data
# formating, so it's a little ugly.
# utility method
def defaultGetattr(d,k):
try:
return d[k]
except:
return ""
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
if 'Summary' in label:
## Everything until the next span class='label'
svalue = ""
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
if 'Word count' in label:
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
catstext = [cat.string for cat in cats]
for cat in catstext:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
charstext = [char.string for char in chars]
for char in charstext:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
for warning in warnings:
self.story.addToList('warnings',warning.string)
if 'Completed' in label:
if 'Yes' in value:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if 'Published' in label:
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
if 'Updated' in label:
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
try:
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/fanfiction/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
self.story.setMetadata('seriesUrl',series_url)
break
i+=1
except:
# I find it hard to care if the series parsing fails
pass
# grab the text for an individual chapter.
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulSoup(self._fetchUrl(url))
div = soup.find('div', {'id' : 'story'})
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,289 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
# Search for XXX comments--that's where things are most likely to need changing.
# This function is called by the downloader in all adapter_*.py files
# in this dir to register the adapter class. So it needs to be
# updated to reflect the class below it. That, plus getSiteDomain()
# take care of 'Registering'.
def getClass():
    """Return the adapter class the downloader should register for this module."""
    return NfaCommunityComAdapter # XXX
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class NfaCommunityComAdapter(BaseSiteAdapter): # XXX
def __init__(self, config, url):
    """Record the story id, the normalized URL and site settings.

    ``config`` and ``url`` are passed straight to ``BaseSiteAdapter``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really
    # windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.is_adult = False

    # get storyId from url--url validation guarantees query is only sid=1234
    story_id = self.parsedUrl.query.split('=')[1]
    self.story.setMetadata('storyId', story_id)

    # normalized story URL.
    # XXX Most sites don't have the /fanfic part.  Replace all to remove it usually.
    self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid=' + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'nfa') # XXX

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%m/%d/%Y" # XXX
@classmethod
def getAcceptDomains(cls):
    """Return every domain this adapter accepts story URLs from."""
    accepted = ['www.nfacommunity.com', 'nfacommunity.com']
    return accepted
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'nfacommunity.com' # XXX
@classmethod
def getSiteExampleURLs(cls):
    """Return an example story URL for this site."""
    domain = cls.getSiteDomain()
    return "http://" + domain + "/viewstory.php?sid=1234"
def getSiteURLPattern(self):
    """Return the regex a story URL must match.

    The ``www.`` prefix is optional.  Its dot is escaped so that only a
    literal ``www.`` is accepted rather than ``www`` plus any character.
    """
    return r"http://(www\.)?" + re.escape(self.getSiteDomain() + "/viewstory.php?sid=") + r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):
# Weirdly, different sites use different warning numbers.
# If the title search below fails, there's a good chance
# you need a different number. print data at that point
# and see what the 'click here to continue' url says.
# Furthermore, there's a couple sites now with more than
# one warning level for different ratings. And they're
# fussy about it. nfacommunity has two: 4 & 5.
# we'll try 5 first.
addurl = "&ageconsent=ok&warning=5" # XXX
else:
addurl=""
# index=1 makes sure we see the story chapter index. Some
# sites skip that for one-chapter stories.
url = self.url+'&index=1'+addurl
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
# The actual text that is used to announce you need to be an
# adult varies from site to site. Again, print data before
# the title search to troubleshoot.
# Since the warning text can change by warning level, let's
# look for the warning pass url. nfacommunity uses
# &amp;warning= -- actually, so do other sites. Must be an
# eFiction book.
# viewstory.php?sid=1882&amp;warning=4
# viewstory.php?sid=1654&amp;ageconsent=ok&amp;warning=5
#print data
#m = re.search(r"'viewstory.php\?sid=1882(&amp;warning=4)'",data)
m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
if m != None:
if self.is_adult or self.getConfig("is_adult"):
# We tried the default and still got a warning, so
# let's pull the warning number from the 'continue'
# link and reload data.
addurl = m.group(1)
# correct stupid &amp; error in url.
addurl = addurl.replace("&amp;","&")
url = self.url+'&index=1'+addurl
logger.debug("URL 2nd try: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
else:
raise exceptions.AdultCheckRequired(self.url)
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# Now go hunting for all the meta data and the chapter list.
## Title
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
self.story.setMetadata('title',stripHTML(a))
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
# eFiction sites don't help us out a lot with their meta data
# formating, so it's a little ugly.
# utility method
def defaultGetattr(d,k):
try:
return d[k]
except:
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
if 'Summary' in label:
## Everything until the next span class='label'
svalue = ""
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
if 'Word count' in label:
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
catstext = [cat.string for cat in cats]
for cat in catstext:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
charstext = [char.string for char in chars]
for char in charstext:
self.story.addToList('characters',char.string)
## Not all sites use Genre, but there's no harm to
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Genre' in label:
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
genrestext = [genre.string for genre in genres]
self.genre = ', '.join(genrestext)
for genre in genrestext:
self.story.addToList('genre',genre.string)
## Not all sites use Warnings, but there's no harm to
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Warnings' in label:
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warningstext = [warning.string for warning in warnings]
self.warning = ', '.join(warningstext)
for warning in warningstext:
self.story.addToList('warnings',warning.string)
if 'Completed' in label:
if 'Yes' in value:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if 'Published' in label:
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
if 'Updated' in label:
# there's a stray [ at the end.
#value = value[0:-1]
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
try:
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
# skip 'report this' and 'TOC' links
if 'contact.php' not in a['href'] and 'index' not in a['href']:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
self.story.setMetadata('seriesUrl',series_url)
break
i+=1
except:
# I find it hard to care if the series parsing fails
pass
# grab the text for an individual chapter.
def getChapterText(self, url):
    """Fetch one chapter page and return the contents of its story div.

    The page is parsed with 'br' and 'hr' declared self-closing, since
    the parser otherwise eats those tags.  Raises
    exceptions.FailedToDownload when the page has no <div id="story">.
    """
    logger.debug('Getting chapter text from: %s', url)

    page = self._fetchUrl(url)
    parsed = bs.BeautifulStoneSoup(page, selfClosingTags=('br', 'hr'))

    story_div = parsed.find('div', {'id': 'story'})
    if story_div is not None:
        return self.utf8FromSoup(url, story_div)

    raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

View file

@ -0,0 +1,214 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented by this module."""
    adapter_class = NHAMagicalWorldsUsAdapter
    return adapter_class
# Class name has to be unique. Our convention is to camel-case the
# site name and append Adapter; a leading www is skipped.
class NHAMagicalWorldsUsAdapter(BaseSiteAdapter):
    """Site adapter for stories hosted on nha.magical-worlds.us.

    Metadata is scraped from two pages: the story page (author link and
    chapter links) and the author's page (title, categories, summary and
    the <i>-labelled fields such as Rating or Published).
    """

    def __init__(self, config, url):
        """Normalize the story URL and seed the site-specific metadata."""
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        # If left empty, site doesn't return any message at all.
        self.username = "NoneGiven"
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'nha')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        # The leading space is part of the format string handed to makeDate.
        self.dateformat = " %m/%d/%y"

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain (this site does not use a www prefix)."""
        return 'nha.magical-worlds.us'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source a story URL for this site must match."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    def extractChapterUrlsAndMetadata(self):
        """Collect the chapter list and the story metadata.

        Raises:
            exceptions.StoryDoesNotExist: the story URL answered HTTP 404.
            exceptions.FailedToDownload: the site denied access, or the
                story could not be found on its author's page.
        """
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # Bare raise keeps the original traceback.
            raise

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl', 'http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author', a.string)
        asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))

        try:
            # in case link points somewhere other than the first chapter
            a = soup.findAll('option')[1]['value']
            self.story.setMetadata('storyId', a.split('=',)[1])
            url = 'http://'+self.host+'/'+a
            soup = bs.BeautifulSoup(self._fetchUrl(url))
        except Exception:
            # Deliberately best-effort: without a usable second <option>
            # we simply stay on the page that was already fetched.
            logger.debug("No chapter <option> to follow; keeping %s", url)

        # Locate this story's table on the author page.  It supplies the
        # title and is the source of all remaining metadata below.
        story_href = 'viewstory.php?sid='+self.story.getMetadata('storyId')
        for info in asoup.findAll('table', {'width' : '100%', 'bordercolor' : re.compile(r'#')}):
            a = info.find('a')
            if a is None:
                continue
            if story_href == a['href'] or (story_href+'&') in a['href']:
                self.story.setMetadata('title', stripHTML(a))
                break
        else:
            # Without this, `info` would be unbound (no tables at all) or
            # would point at some other story's table.
            raise exceptions.FailedToDownload("Error collecting metadata: %s! Story not found on author page." % self.url)

        # Find the chapters:
        chapters = soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r'&chapter=\d+$'))
        if len(chapters) == 0:
            # No chapter links: treat the page itself as the only chapter.
            self.chapterUrls.append((self.story.getMetadata('title'), url))
        else:
            for chapter in chapters:
                # just in case there's tags, like <i> in chapter titles.
                self.chapterUrls.append((stripHTML(chapter), 'http://'+self.host+'/'+chapter['href']))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        def tag_name(node):
            # Tag name of *node*, or "" for anything without a .name.
            return getattr(node, 'name', "")

        def add_items(list_name, raw):
            # Split a comma separated field, drop 'None' placeholders and
            # anything after ' -' in each item.
            for item in raw.string.split(', '):
                if 'None' not in item:
                    self.story.addToList(list_name, item.split(' -')[0])

        for cat in info.findAll('a', href=re.compile('categories.php')):
            self.story.addToList('category', cat.string)

        # The summary is whatever sits between the first and the second
        # <br> following the author link.
        a = info.find('a', href=re.compile(r'viewuser.php'))
        val = a.nextSibling if a is not None else None
        while val is not None and tag_name(val) != 'br':
            val = val.nextSibling
        summary = []
        if val is not None:
            val = val.nextSibling  # step past the first <br>
            while val is not None and tag_name(val) != 'br':
                summary.append(unicode(val))
                val = val.nextSibling
        self.setDescription(url, ''.join(summary))

        # does not provide convenient way to get word count

        # Each field is an <i>Label:</i> followed by its value.
        for labelspan in info.findAll('i'):
            value = labelspan.nextSibling
            label = stripHTML(labelspan)

            if 'Rating' in label:
                self.story.setMetadata('rating', value.split(' -')[0])

            if 'Genres' in label:
                add_items('genre', value)

            if 'Characters' in label:
                add_items('characters', value)

            if 'Warnings' in label:
                add_items('warnings', value)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(value.split(' -')[0], self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(value.split(' -')[0], self.dateformat))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return the contents of its story div.

        Raises exceptions.FailedToDownload when <div id="story"> is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        data = self._fetchUrl(url)
        # some chapters seem to be hanging up on those tags, so it is
        # safer to close them
        soup = bs.BeautifulSoup(data, selfClosingTags=('br','hr','span','center'))

        story = soup.find('div', {"id" : "story"})
        if story is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url, story)

View file

@ -0,0 +1,176 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented by this module."""
    adapter_class = NickAndGregNetAdapter
    return adapter_class
# Class name has to be unique. Our convention is to camel-case the
# site name and append Adapter; a leading www is skipped.
class NickAndGregNetAdapter(BaseSiteAdapter):
    """Site adapter for stories under www.nickandgreg.net/desert_archive/.

    Title, author and chapter list come from the story page; the
    remaining metadata is parsed out of the text of the story's cell on
    the author's page.
    """

    def __init__(self, config, url):
        """Normalize the story URL and seed the site-specific metadata."""
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        # If left empty, site doesn't return any message at all.
        self.username = "NoneGiven"
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        # XXX This site serves the archive from /desert_archive; most
        # eFiction sites have no such path prefix.
        self._setURL('http://' + self.getSiteDomain() + '/desert_archive/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'nag')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y/%m/%d"

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain, including the www prefix it uses."""
        return 'www.nickandgreg.net'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://"+cls.getSiteDomain()+"/desert_archive/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source a story URL for this site must match."""
        return re.escape("http://"+self.getSiteDomain()+"/desert_archive/viewstory.php?sid=")+r"\d+$"

    def extractChapterUrlsAndMetadata(self):
        """Collect the chapter list and the story metadata.

        Raises:
            exceptions.StoryDoesNotExist: the story URL answered HTTP 404.
            exceptions.FailedToDownload: the site denied access, or the
                story could not be found on its author's page.
        """
        # NOTE(review): '&i=1' presumably asks for the story index page
        # (other eFiction adapters use '&index=1') -- confirm.
        url = self.url+'&i=1'
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # Bare raise keeps the original traceback.
            raise

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        story_link = re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$")
        a = soup.find('a', href=story_link)
        self.story.setMetadata('title', stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl', 'http://'+self.host+'/desert_archive/'+a['href'])
        self.story.setMetadata('author', a.string)

        # Find the chapters: every <option> of the first <select>,
        # except the two entries that are not chapters.
        chapters = soup.find('select')
        for chapter in chapters.findAll('option'):
            if chapter.text not in ('Story Index', 'Chapters'):
                self.chapterUrls.append((stripHTML(chapter), 'http://'+self.host+'/desert_archive/'+chapter['value']))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # The rest of the metadata lives in this story's cell on the
        # author page.
        asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))
        for cell in asoup.findAll('td', {'class' : 'tblborder6'}):
            if cell.find('a', href=story_link) is not None:
                break
        else:
            # Without this, `cell` would be unbound (no cells at all) or
            # would point at some other story's cell.
            raise exceptions.FailedToDownload("Error collecting metadata: %s! Story not found on author page." % self.url)

        # The summary is the node right after the first <br> in the cell.
        br = cell.find('br')
        if br is not None:
            self.setDescription(url, br.nextSibling)

        text = cell.text

        def remainder(label):
            # Everything after *label*, or None unless the label occurs
            # exactly once in the cell text.
            pieces = text.split(label)
            if len(pieces) == 2:
                return pieces[1]
            return None

        def field(label):
            # The value of *label*: the remainder up to the next ' -'.
            rest = remainder(label)
            if rest is None:
                return None
            return rest.split(' -')[0]

        rating = field('Rating:')
        if rating is not None:
            self.story.setMetadata('rating', rating)

        characters = field('Characters:')
        if characters is not None:
            for char in characters.split(', '):
                self.story.addToList('characters', char)

        genres = field('Genres:')
        if genres is not None:
            for genre in genres.split(', '):
                self.story.addToList('genre', genre)

        warnings = field('Warnings:')
        if warnings is not None:
            for warn in warnings.split(', '):
                if 'none' not in warn:
                    self.story.addToList('warnings', warn)

        # Searches the whole remainder for 'Yes', not only the text up
        # to the next ' -'.
        completed = remainder('Completed:')
        if completed is not None:
            if 'Yes' in completed:
                self.story.setMetadata('status', 'Completed')
            else:
                self.story.setMetadata('status', 'In-Progress')

        published = field('Published:')
        if published is not None:
            self.story.setMetadata('datePublished', makeDate(stripHTML(published), self.dateformat))

        updated = field('Updated:')
        if updated is not None:
            self.story.setMetadata('dateUpdated', makeDate(stripHTML(updated), self.dateformat))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return its story table wrapped in a div.

        Raises exceptions.FailedToDownload when the page has no
        <table class="tblborder6">.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        table = soup.find('table', {'class' : 'tblborder6'})
        if table is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        # wrap a div around it.
        divsoup = bs.BeautifulStoneSoup('<div class="story"></div>',
                                        selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
        div = divsoup.find('div')
        div.append(table)
        return self.utf8FromSoup(url, div)

View file

@ -0,0 +1,177 @@
import re
import urllib2
import urlparse
from .. import BeautifulSoup
from base_adapter import BaseSiteAdapter, makeDate
from .. import exceptions
def getClass():
    """Return the adapter class implemented by this module."""
    adapter_class = NocturnalLightNetAdapter
    return adapter_class
# yields Tag _and_ NavigableString siblings from the given tag. The
# BeautifulSoup findNextSiblings() method for some reasons only returns either
# NavigableStrings _or_ Tag objects, not both.
def _yield_next_siblings(tag):
sibling = tag.nextSibling
while sibling:
yield sibling
sibling = sibling.nextSibling
class NocturnalLightNetAdapter(BaseSiteAdapter):
    """Site adapter for stories under nocturnal-light.net/fanfiction/.

    Author and chapter links come from the story page; title and the
    remaining metadata come from the story's 'listbox' div on the
    author's page.
    """

    SITE_ABBREVIATION = 'nln'
    SITE_DOMAIN = 'nocturnal-light.net'

    BASE_URL = 'http://' + SITE_DOMAIN + '/fanfiction/'
    STORY_URL_TEMPLATE = BASE_URL + 'story/%s'
    AUTHORS_URL_TEMPLATE = BASE_URL + 'authors/%s'

    DATETIME_FORMAT = '%m-%d-%y'

    def __init__(self, config, url):
        """Take the story id from the URL path and normalize the URL."""
        BaseSiteAdapter.__init__(self, config, url)

        # The story id is the path segment that follows 'story'.
        url_tokens = self.parsedUrl.path.split('/')
        story_id = url_tokens[url_tokens.index('story') + 1]

        self.story.setMetadata('storyId', story_id)
        self._setURL(self.STORY_URL_TEMPLATE % story_id)
        self.story.setMetadata('siteabbrev', self.SITE_ABBREVIATION)

    def _customized_fetch_url(self, url, exception=None, parameters=None):
        """Fetch *url* and return the page parsed with BeautifulSoup.

        If *exception* is given, a urllib2.HTTPError from the fetch is
        replaced by ``exception(self.url)``.  Otherwise whatever
        self._fetchUrl raises propagates unchanged.
        """
        try:
            data = self._fetchUrl(url, parameters)
        except urllib2.HTTPError:
            if exception:
                raise exception(self.url)
            raise

        return BeautifulSoup.BeautifulSoup(data)

    @staticmethod
    def getSiteDomain():
        """Return the site domain."""
        return NocturnalLightNetAdapter.SITE_DOMAIN

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return cls.STORY_URL_TEMPLATE % 1234

    def getSiteURLPattern(self):
        """Return the regex source a story URL for this site must match."""
        # [:-2] drops the trailing '%s' placeholder of the template.
        return re.escape(self.STORY_URL_TEMPLATE[:-2]) + r'\d+.*$'

    def extractChapterUrlsAndMetadata(self):
        """Collect the chapter list and the story metadata.

        Raises:
            exceptions.StoryDoesNotExist: the story page title is just 'by'.
            exceptions.FailedToDownload: the story is not listed on its
                author's page.
            exceptions.AdultCheckRequired: the story is rated NC-17 and
                neither self.is_adult nor the 'is_adult' config is set.
        """
        soup = self._customized_fetch_url(self.url)

        # Since no 404 error code we have to raise the exception ourselves.
        # A title that is just 'by' indicates that there is no author name
        # and no story title available.
        if soup.title.string.strip() == 'by':
            raise exceptions.StoryDoesNotExist(self.url)

        # "storycontent" is found in a single-chapter story
        author_anchor = soup.find('div', id=lambda div_id: div_id in ('main', 'storycontent')).h1.a
        self.story.setMetadata('author', author_anchor.string)

        url_tokens = author_anchor['href'].split('/')
        author_id = url_tokens[url_tokens.index('authors')+1]
        self.story.setMetadata('authorId', author_id)
        self.story.setMetadata('authorUrl', self.AUTHORS_URL_TEMPLATE % author_id)

        chapter_anchors = soup('a', href=lambda href: href and href.startswith('/fanfiction/story/'))
        for chapter_anchor in chapter_anchors:
            url = urlparse.urljoin(self.BASE_URL, chapter_anchor['href'])
            self.chapterUrls.append((chapter_anchor.string, url))

        author_url = urlparse.urljoin(self.BASE_URL, author_anchor['href'])
        soup = self._customized_fetch_url(author_url)

        story_id = self.story.getMetadata('storyId')
        for listbox in soup('div', {'class': 'listbox'}):
            anchor = listbox.a
            if anchor is None:
                continue
            url_tokens = anchor['href'].split('/')
            # Skip boxes whose first link is not a story link instead of
            # failing on them.
            if 'story' not in url_tokens:
                continue
            position = url_tokens.index('story') + 1
            # Found the div containing the story's metadata; break the loop
            # and parse the element
            if url_tokens[position:position + 1] == [story_id]:
                break
        else:
            raise exceptions.FailedToDownload(self.url)

        title = listbox.a.string
        self.story.setMetadata('title', title)

        # No chapter anchors found in the original story URL, so the story
        # has only a single chapter.
        if not chapter_anchors:
            self.chapterUrls.append((title, self.url))

        for b_tag in listbox('b'):
            # A <b> with nested markup (e.g. bold text inside the summary)
            # has no .string and cannot be a field label.
            if b_tag.string is None:
                continue
            key = b_tag.string.strip(':')
            try:
                value = b_tag.nextSibling.string.replace('&bull;', '').strip(': ')
            # This can happen with some fancy markup in the summary. Just
            # ignore this error and set value to None, the summary parsing
            # takes care of this
            except AttributeError:
                value = None

            if key == 'Summary':
                contents = []
                keep_summary_html = self.getConfig('keep_summary_html')

                for sibling in _yield_next_siblings(b_tag):
                    if isinstance(sibling, BeautifulSoup.Tag):
                        # A <b> directly preceded by a <br> starts the
                        # next field and ends the summary.
                        if sibling.name == 'b':
                            previous = sibling.findPreviousSibling()
                            if previous is not None and previous.name == 'br':
                                break

                        if keep_summary_html:
                            contents.append(self.utf8FromSoup(author_url, sibling))
                        else:
                            contents.append(''.join(sibling(text=True)))
                    else:
                        contents.append(sibling)

                # Pop last break line tag
                if contents:
                    contents.pop()
                self.story.setMetadata('description', ''.join(contents))

            elif key == 'Category':
                for sibling in b_tag.findNextSiblings(['a', 'b']):
                    if sibling.name == 'b':
                        break
                    self.story.addToList('category', sibling.string)

            elif key == 'Rating':
                self.story.setMetadata('rating', value)

            elif key == 'Chapters':
                self.story.setMetadata('numChapters', int(value))

                # Also parse reviews number which lies right after the
                # chapters section
                reviews_anchor = b_tag.findNextSibling('a')
                reviews = reviews_anchor.string.split(' ')[1].strip('()')
                self.story.setMetadata('reviews', reviews)

            elif key == 'Completed':
                self.story.setMetadata('status', 'Completed' if value == 'Yes' else 'In-Progress')

            elif key == 'Date Added':
                self.story.setMetadata('datePublished', makeDate(value, self.DATETIME_FORMAT))

            elif key == 'Last Updated':
                self.story.setMetadata('dateUpdated', makeDate(value, self.DATETIME_FORMAT))

            elif key == 'Read':
                self.story.setMetadata('readings', value.split()[0])

        if self.story.getMetadata('rating') == 'NC-17' and not (self.is_adult or self.getConfig('is_adult')):
            raise exceptions.AdultCheckRequired(self.url)

    def getChapterText(self, url):
        """Fetch one chapter page and return the contents of its text div.

        Raises exceptions.FailedToDownload when <div id="storytext"> is
        missing.
        """
        soup = self._customized_fetch_url(url)

        storytext = soup.find('div', id='storytext')
        if storytext is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url, storytext)

View file

@ -0,0 +1,262 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented by this module."""
    adapter_class = OcclumencySycophantHexComAdapter
    return adapter_class
# Class name has to be unique. Our convention is to camel-case the
# site name and append Adapter; a leading www is skipped.
class OcclumencySycophantHexComAdapter(BaseSiteAdapter):
    """Site adapter for stories hosted on occlumency.sycophanthex.com.

    Logs in when the story page demands it.  The author link and chapter
    links come from the story page; title, categories, summary and the
    <b>-labelled fields come from the story's table on the author's page.
    """

    def __init__(self, config, url):
        """Normalize the story URL and seed the site-specific metadata."""
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        # If left empty, site doesn't return any message at all.
        self.username = "NoneGiven"
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'osph')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%Y"

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain (this site does not use a www prefix)."""
        return 'occlumency.sycophanthex.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source a story URL for this site must match."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True when page text *data* shows that a login is needed.

        That is the case when the page carries the adult-content notice
        or the wrong-password message.
        """
        return ('This story contains adult content and/or themes.' in data
                or "That password doesn't match the one in our database" in data)

    def performLogin(self, url):
        """Post the login form and return True on success.

        Uses self.username/self.password when a password is set on the
        adapter, otherwise the configured "username"/"password".

        Raises:
            exceptions.FailedToLogin: the response does not contain
                "Logout".
        """
        params = {}

        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['rememberme'] = '1'
        params['sid'] = ''
        params['intent'] = ''
        params['submit'] = 'Submit'

        loginUrl = 'http://' + self.getSiteDomain() + '/user.php'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              params['penname']))

        d = self._fetchUrl(loginUrl, params)

        if "Logout" not in d:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['penname']))
            raise exceptions.FailedToLogin(url, params['penname'])
        return True

    def extractChapterUrlsAndMetadata(self):
        """Collect the chapter list and the story metadata.

        Raises:
            exceptions.StoryDoesNotExist: the story URL answered HTTP 404.
            exceptions.FailedToLogin: a login was needed and failed.
            exceptions.FailedToDownload: the site denied access, or the
                story could not be found on its author's page.
        """
        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # Bare raise keeps the original traceback.
            raise

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl', 'http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author', a.string)
        asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))

        try:
            # in case link points somewhere other than the first chapter
            a = soup.findAll('option')[1]['value']
            self.story.setMetadata('storyId', a.split('=',)[1])
            url = 'http://'+self.host+'/'+a
            soup = bs.BeautifulSoup(self._fetchUrl(url))
        except Exception:
            # Deliberately best-effort: without a usable second <option>
            # we simply stay on the page that was already fetched.
            logger.debug("No chapter <option> to follow; keeping %s", url)

        # Locate this story's table on the author page.  It supplies the
        # title and is the source of all remaining metadata below.
        story_link = re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$")
        for info in asoup.findAll('table', {'class' : 'border'}):
            a = info.find('a', href=story_link)
            if a is not None:
                self.story.setMetadata('title', stripHTML(a))
                break
        else:
            # Without this, `info` would be unbound (no tables at all) or
            # would point at some other story's table.
            raise exceptions.FailedToDownload("Error collecting metadata: %s! Story not found on author page." % self.url)

        # Find the chapters:
        chapters = soup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+&i=1$'))
        if len(chapters) == 0:
            # No chapter links: treat the page itself as the only chapter.
            self.chapterUrls.append((self.story.getMetadata('title'), url))
        else:
            for chapter in chapters:
                # just in case there's tags, like <i> in chapter titles.
                self.chapterUrls.append((stripHTML(chapter), 'http://'+self.host+'/'+chapter['href']))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        def tag_name(node):
            # Tag name of *node*, or "" for anything without a .name.
            return getattr(node, 'name', "")

        for cat in info.findAll('a', href=re.compile('categories.php')):
            self.story.addToList('category', cat.string)

        # The summary runs from just after the first <br> that follows
        # the reviews link up to the next <table>.
        a = info.find('a', href=re.compile(r'reviews.php\?sid='+self.story.getMetadata('storyId')))
        val = a.nextSibling if a is not None else None
        while val is not None and tag_name(val) != 'br':
            val = val.nextSibling
        summary = []
        if val is not None:
            val = val.nextSibling  # step past the <br>
            while val is not None and tag_name(val) != 'table':
                summary.append(str(val))
                val = val.nextSibling
        self.setDescription(url, ''.join(summary))

        # Each field is a <b>Label:</b> followed by its value.
        for labelspan in info.findAll('b'):
            value = labelspan.nextSibling
            label = stripHTML(labelspan)

            if 'Rating' in label:
                self.story.setMetadata('rating', value)

            if 'Word Count' in label:
                self.story.setMetadata('numWords', value)

            if 'Genres' in label:
                for genre in value.string.split(', '):
                    if genre != 'none':
                        self.story.addToList('genre', genre)

            if 'Characters' in label:
                for char in value.string.split(', '):
                    if char != 'none':
                        self.story.addToList('characters', char)

            if 'Warnings' in label:
                for warning in value.string.split(', '):
                    if warning != ' none':
                        self.story.addToList('warnings', warning)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return its left-aligned story div.

        Raises exceptions.FailedToDownload when no <div align="left"> is
        found.
        """
        logger.debug('Getting chapter text from: %s' % url)

        data = self._fetchUrl(url)
        # NOTE(review): this inserts a '>' right after every
        # '<div align="left"' -- presumably a workaround for div tags the
        # site leaves unterminated; confirm against a live page.
        data = data.replace('<div align="left"', '<div align="left">')

        soup = bs.BeautifulSoup(data, selfClosingTags=('br','hr'))

        story = soup.find('div', {"align" : "left"})
        if story is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url, story)

View file

@ -0,0 +1,269 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class implemented by this module."""
    adapter_class = OneDirectionFanfictionComAdapter
    return adapter_class
# Class name has to be unique. Our convention is to camel-case the
# site name and append Adapter; a leading www is skipped.
class OneDirectionFanfictionComAdapter(BaseSiteAdapter):
def __init__(self, config, url):
    """Normalize the story URL and seed the site-specific metadata."""
    BaseSiteAdapter.__init__(self, config, url)

    # Encodings to try, in order.  1252 is a superset of iso-8859-1;
    # most sites that claim to be iso-8859-1 (and some that claim to be
    # utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # If the user name is left empty, the site doesn't return any
    # message at all.
    self.username = "NoneGiven"
    self.password = ""
    self.is_adult = False

    # URL validation guarantees the query is only sid=1234, so the
    # story id is whatever follows the '='.
    query = self.parsedUrl.query
    self.story.setMetadata('storyId', query.split('=',)[1])

    # Normalized story URL.
    normalized = 'http://' + self.getSiteDomain() + '/viewstory.php?sid=' + self.story.getMetadata('storyId')
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'odf')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%m/%d/%y"
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'onedirectionfanfiction.com'
@classmethod
def getAcceptDomains(cls):
return ['www.onedirectionfanfiction.com','onedirectionfanfiction.com']
@classmethod
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://")+"(www\.)?"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
    """Return True when page text *data* shows that a login is needed.

    That is the case when the page contains any of the site's
    registered-users-only, unknown-account or wrong-password messages.
    """
    login_markers = (
        'Registered Users Only',
        'There is no such account on our website',
        "That password doesn't match the one in our database",
    )
    for marker in login_markers:
        if marker in data:
            return True
    return False
def performLogin(self, url):
params = {}
if self.password:
params['penname'] = self.username
params['password'] = self.password
else:
params['penname'] = self.getConfig("username")
params['password'] = self.getConfig("password")
params['cookiecheck'] = '1'
params['submit'] = 'Submit'
loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['penname']))
d = self._fetchUrl(loginUrl, params)
if "Member Account" not in d : #Member Account
logger.info("Failed to login to URL %s as %s" % (loginUrl,
params['penname']))
raise exceptions.FailedToLogin(url,params['penname'])
return False
else:
return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):
# Weirdly, different sites use different warning numbers.
# If the title search below fails, there's a good chance
# you need a different number. print data at that point
# and see what the 'click here to continue' url says.
addurl = "&ageconsent=ok&warning=4"
else:
addurl=""
# index=1 makes sure we see the story chapter index. Some
# sites skip that for one-chapter stories.
url = self.url+'&index=1'+addurl
logger.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
if self.needToLoginCheck(data):
# need to log in for this one.
self.performLogin(url)
data = self._fetchUrl(url)
# The actual text that is used to announce you need to be an
# adult varies from site to site. Again, print data before
# the title search to troubleshoot.
if "Age Consent Required" in data:
raise exceptions.AdultCheckRequired(self.url)
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# Now go hunting for all the meta data and the chapter list.
## Title
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
self.story.setMetadata('title',stripHTML(a))
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
# eFiction sites don't help us out a lot with their meta data
# formating, so it's a little ugly.
# utility method
def defaultGetattr(d,k):
try:
return d[k]
except:
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
if 'Summary' in label:
## Everything until the next span class='label'
svalue = ""
while value and not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.setDescription(url,svalue)
if 'Rated' in label:
self.story.setMetadata('rating', value)
if 'Word count' in label:
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Warnings' in label:
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=6'))
for warning in warnings:
self.story.addToList('warnings',warning.string)
if 'Completed' in label:
if 'Yes' in value:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if 'Published' in label:
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
if 'Updated' in label:
# there's a stray [ at the end.
#value = value[0:-1]
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
try:
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
self.setSeries(series_name, i)
self.story.setMetadata('seriesUrl',series_url)
break
i+=1
except:
# I find it hard to care if the series parsing fails
pass
# grab the text for an individual chapter.
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
div = soup.find('div', {'id' : 'story'})
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,240 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2, urllib, cookielib
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class defined in this module."""
    return PhoenixSongNetAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class PhoenixSongNetAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on www.phoenixsong.net.

    Stories are addressed as /fanfiction/story/<id>/.  Chapter links come
    from the page's <select> element; rating, word count, setting, status
    and summary are read from the author's story listing page.
    """

    def __init__(self, config, url):
        """Initialise site settings and derive the story id from ``url``.

        ``config`` and ``url`` are handed straight to BaseSiteAdapter.
        """
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url--the path is /fanfiction/story/1234/, so
        # the id is the fourth '/'-separated piece.
        self.story.setMetadata('storyId', self.parsedUrl.path.split('/',)[3])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/fanfiction/story/' +self.story.getMetadata('storyId')+'/')

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'phs')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%B %d %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain (this site uses www)."""
        return 'www.phoenixsong.net'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return "http://"+cls.getSiteDomain()+"/fanfiction/story/1234/"

    def getSiteURLPattern(self):
        """Return the regex source that a valid story URL must match."""
        return re.escape("http://"+self.getSiteDomain()+"/fanfiction/story/")+r"\d+/?$"

    def needToLoginCheck(self, data):
        """Return True when the page text ``data`` asks the user to log in."""
        return 'Please login to continue.' in data

    def performLogin(self, url):
        """Log in to the site.

        Credentials set on the adapter are used when a password is
        present; otherwise the configured username/password are used.
        ``url`` is only used to build the FailedToLogin exception.

        Returns True on success.
        Raises exceptions.FailedToLogin when the response still asks
        for a login.
        """
        params = {}

        if self.password:
            params['txtusername'] = self.username
            params['txtpassword'] = self.password
        else:
            params['txtusername'] = self.getConfig("username")
            params['txtpassword'] = self.getConfig("password")
        #params['remember'] = '1'
        params['login'] = 'Login'

        loginUrl = 'http://' + self.getSiteDomain() + '/users/processlogin.php'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              params['txtusername']))

        d = self._fetchUrl(loginUrl, params)

        if 'Please login to continue.' in d:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                              params['txtusername']))
            raise exceptions.FailedToLogin(url, params['txtusername'])
        return True

    def _findUpdatedDate(self, soup):
        """Return the date parsed from the page's "Updated" <b> label.

        When several labels match, the last one wins.  Returns None when
        the page has no such label.
        """
        found = None
        for b in soup.findAll('b'):
            if b.text == "Updated":
                # Text after the label looks like ": Month DD, YYYY";
                # drop the comma so it matches self.dateformat.
                date = b.nextSibling.string.split(': ')[1].split(',')
                found = makeDate(date[0]+date[1], self.dateformat)
        return found

    ## Getting the chapter list and the meta data.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and fill in chapter URLs and metadata.

        Raises:
            exceptions.StoryDoesNotExist: the story page returned HTTP 404.
            exceptions.FailedToLogin: propagated from performLogin.
        """
        url = self.url
        logger.debug("URL: "+url)

        try:
            if self.getConfig('force_login'):
                self.performLogin(url)
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        nav = soup.find('div', {'id' : 'nav25'})
        a = nav.find('a', href=re.compile(r'fanfiction/story/'+self.story.getMetadata('storyId')+"/$"))
        self.story.setMetadata('title', stripHTML(a))

        # Find authorid and URL from... author url. /fanfiction/stories.php?psid=125
        a = nav.find('a', href=re.compile(r"/fanfiction/stories.php\?psid=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl', 'http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author', a.string)

        # Find the chapters:
        chapters = soup.find('select')
        if chapters is None:
            # No chapter selector: the story page itself is the only chapter,
            # and its "Updated" date serves as both published and updated.
            self.chapterUrls.append((self.story.getMetadata('title'), url))
            when = self._findUpdatedDate(soup)
            if when is not None:
                self.story.setMetadata('datePublished', when)
                self.story.setMetadata('dateUpdated', when)
        else:
            options = chapters.findAll('option')
            for chapter in options:
                self.chapterUrls.append((stripHTML(chapter), 'http://'+self.host+chapter['value']))

            if options:
                first = options[0]
                # The id used from here on is taken from the first chapter link.
                self.story.setMetadata('storyId', first['value'].split('/')[3])

                # Published date comes from the first chapter's page...
                firstsoup = bs.BeautifulSoup(self._fetchUrl('http://'+self.host+first['value']))
                when = self._findUpdatedDate(firstsoup)
                if when is not None:
                    self.story.setMetadata('datePublished', when)

                # ...and updated date from the last chapter's page.  With a
                # single chapter that is the same page, so don't refetch it.
                if len(options) > 1:
                    lastsoup = bs.BeautifulSoup(self._fetchUrl('http://'+self.host+options[-1]['value']))
                    when = self._findUpdatedDate(lastsoup)
                if when is not None:
                    self.story.setMetadata('dateUpdated', when)

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # The rest of the metadata lives on the author's page, in the
        # <div>s following the link to this story.
        asoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))

        info = asoup.find('a', href=re.compile(r'fanfiction/story/'+self.story.getMetadata('storyId')+"/$"))
        while info is not None:
            info = info.findNext('div')
            if info is None:
                # Ran off the end of the page without finding a summary.
                break
            b = info.find('b')
            if b is None or b.string is None:
                # Not a labelled metadata div; keep looking.
                continue
            val = b.nextSibling
            if 'Rating' in b.string:
                self.story.setMetadata('rating', val.string.split(': ')[1])

            if 'Words' in b.string:
                self.story.setMetadata('numWords', val.string.split(': ')[1])

            if 'Setting' in b.string:
                self.story.addToList('category', val.string.split(': ')[1])

            if 'Status' in b.string:
                if 'Completed' in val:
                    status = 'Completed'
                else:
                    status = 'In-Progress'
                self.story.setMetadata('status', status)

            if 'Summary' in b.string:
                # Strip the label and the line break after it; what is
                # left of the div is the summary.  It is the last field.
                b.extract()
                br = info.find('br')
                if br is not None:
                    br.extract()
                self.setDescription(url, info)
                break

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch ``url`` and return the chapter text built from its <p> tags.

        Paragraphs are collected up to the one holding the site's
        "problems with the formatting" notice; div, table, script, form
        and textarea elements are then removed.

        Raises exceptions.FailedToDownload when no paragraph was found.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulSoup(self._fetchUrl(url),
                                selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        chapter = bs.BeautifulSoup('<div class="story"></div>')
        collected = 0
        for p in soup.findAll('p'):
            if "This is for problems with the formatting or the layout of the chapter." in stripHTML(p):
                break
            chapter.append(p)
            collected += 1

        # The original "None == chapter" test could never fire because
        # chapter is always a soup object; check for real content instead.
        if collected == 0:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        for name in ('div', 'table', 'script', 'form', 'textarea'):
            for tag in chapter.findAll(name):
                tag.extract()

        return self.utf8FromSoup(url, chapter)

View file

@ -0,0 +1,300 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class defined in this module."""
    return PommeDeSangComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class PommeDeSangComAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on pommedesang.com.

    The site has two sections, 'efiction' and 'sds', visible as the first
    path component of the story URL.  The section decides the date format
    and is part of every URL built for the story.
    """

    def __init__(self, config, url):
        """Initialise site settings and derive story id and section from ``url``.

        ``config`` and ``url`` are handed straight to BaseSiteAdapter.
        """
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=',)[1])

        # pommedesang.com has two 'sections', shown in URL as
        # 'efiction' and 'sds' that change how things should be
        # handled.
        # http://pommedesang.com/efiction/viewstory.php?sid=1234
        # http://pommedesang.com/sds/viewstory.php?sid=1234
        self.section = self.parsedUrl.path.split('/',)[1]

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/'+self.section+'/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'pmds')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        if 'efiction' in self.section:
            self.dateformat = "%b %d, %Y"
        else:
            self.dateformat = "%m/%d/%y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the canonical site domain (without www)."""
        return 'pommedesang.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return space-separated example story URLs, one per section."""
        return "http://"+cls.getSiteDomain()+"/efiction/viewstory.php?sid=1234 http://"+cls.getSiteDomain()+"/sds/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that a valid story URL must match."""
        # The domain is escaped so its '.' only matches a literal dot.
        return re.escape("http://"+self.getSiteDomain())+r"/(efiction|sds)?/viewstory.php\?sid=\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True when the page text ``data`` shows a login is needed."""
        return ('Registered Users Only' in data
                or 'There is no such account on our website' in data
                or "That password doesn't match the one in our database" in data)

    def performLogin(self, url):
        """Log in to the story's section of the site.

        Credentials set on the adapter are used when a password is
        present; otherwise the configured username/password are used.
        ``url`` is only used to build the FailedToLogin exception.

        Returns True on success.
        Raises exceptions.FailedToLogin when the response does not
        contain the "Member Account" marker.
        """
        params = {}

        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['cookiecheck'] = '1'
        params['submit'] = 'Submit'

        loginUrl = 'http://' + self.getSiteDomain() + '/'+self.section+'/user.php?action=login'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              params['penname']))

        d = self._fetchUrl(loginUrl, params)

        if "Member Account" not in d:
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                              params['penname']))
            raise exceptions.FailedToLogin(url, params['penname'])
        return True

    def _fetchStoryPage(self, url):
        """Fetch ``url``, mapping HTTP 404 to StoryDoesNotExist."""
        try:
            return self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and fill in chapter URLs and metadata.

        Raises:
            exceptions.StoryDoesNotExist: the index page returned HTTP 404.
            exceptions.FailedToLogin: propagated from performLogin.
            exceptions.AdultCheckRequired: the page shows a warning link
                and the user has not declared being an adult.
            exceptions.FailedToDownload: the site says the story has not
                been validated.
        """
        is_adult = self.is_adult or self.getConfig("is_adult")
        if is_adult:
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&ageconsent=ok&warning=5"
        else:
            addurl = ""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)

        data = self._fetchStoryPage(url)

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'", data)
        if m is not None:
            if not is_adult:
                raise exceptions.AdultCheckRequired(self.url)
            # We tried the default and still got a warning, so
            # let's pull the warning number from the 'continue'
            # link and reload data.
            addurl = m.group(1)
            # correct stupid &amp; error in url.
            addurl = addurl.replace("&amp;", "&")
            url = self.url+'&index=1'+addurl
            logger.debug("URL 2nd try: "+url)
            data = self._fetchStoryPage(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid=\d+'))
        self.story.setMetadata('title', stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        # Links on the page are relative to the section directory, so the
        # section has to be part of the author URL just as it is for the
        # chapter and series URLs below.
        self.story.setMetadata('authorUrl', 'http://'+self.host+'/'+self.section+'/'+a['href'])
        self.story.setMetadata('author', a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter), 'http://'+self.host+'/'+self.section+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method
        def defaultGetattr(d, k):
            # Strings, None and tags without the attribute all yield "".
            try:
                return d[k]
            except Exception:
                return ""

        # summary, rated, word count, categories, characters, genre, warnings, completed, published, updated, series
        # <span class="label">Rated:</span> NC-17<br /> etc
        for labelspan in soup.findAll('span', {'class':'label'}):
            value = labelspan.nextSibling
            label = labelspan.string
            if label is None:
                # Label span contains nested markup; nothing to match on.
                continue

            if 'Summary' in label:
                ## Everything until the next span class='label'.  Also
                ## stop at the end of the siblings, otherwise a summary
                ## that comes last would walk off into None.
                svalue = ""
                while value and not defaultGetattr(value, 'class') == 'label':
                    svalue += str(value)
                    value = value.nextSibling
                self.setDescription(url, svalue)

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category', cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters', char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
                for genre in genres:
                    self.story.addToList('genre', genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
                for warning in warnings:
                    self.story.addToList('warnings', warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Series handling is best effort: a failure here must not stop
        # the download, but it is logged instead of vanishing silently.
        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            if a is not None:
                series_name = a.string
                series_url = 'http://'+self.host+'/'+self.section+'/'+a['href']

                # use BeautifulSoup HTML parser to make everything easier to find.
                seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
                # can't use ^viewstory...$ in case of higher rated stories with javascript href.
                storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
                wanted = 'viewstory.php?sid='+self.story.getMetadata('storyId')
                i = 1
                for storya in storyas:
                    # skip 'report this' and 'TOC' links
                    if 'contact.php' not in storya['href'] and 'index' not in storya['href']:
                        if wanted in storya['href']:
                            self.setSeries(series_name, i)
                            self.story.setMetadata('seriesUrl', series_url)
                            break
                        i += 1
        except Exception as e:
            logger.debug("Series parsing failed for %s: %s" % (self.url, e))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch ``url`` and return the contents of its <div id="story">.

        Raises exceptions.FailedToDownload when that div is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'story'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url, div)

View file

@ -0,0 +1,248 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class defined in this module."""
    return PonyFictionArchiveNetAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class PonyFictionArchiveNetAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on ponyfictionarchive.net.

    Also handles the explicit.ponyfictionarchive.net subdomain, which
    uses a different date format from the main site.
    """

    def __init__(self, config, url):
        """Initialise site settings and derive the story id from ``url``.

        ``config`` and ``url`` are handed straight to BaseSiteAdapter.
        """
        BaseSiteAdapter.__init__(self, config, url)

        # 1252 is a superset of iso-8859-1.  Most sites that claim to be
        # iso-8859-1 (and some that claim to be utf8) are really
        # windows-1252.
        self.decode = ["Windows-1252",
                       "utf8"]
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId', self.parsedUrl.query.split('=',)[1])

        # normalized story URL; the date format depends on the subdomain.
        if "explicit" in self.parsedUrl.netloc:
            self._setURL('http://explicit.' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
            self.dateformat = "%d/%b/%y"
        else:
            self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
            self.dateformat = "%d %b %Y"

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'pffa')

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the canonical site domain (without www)."""
        return 'ponyfictionarchive.net'

    @classmethod
    def getAcceptDomains(cls):
        """Return every domain name this adapter accepts."""
        return ['www.ponyfictionarchive.net', 'ponyfictionarchive.net', 'explicit.ponyfictionarchive.net']

    @classmethod
    def getSiteExampleURLs(cls):
        """Return space-separated example story URLs for both subdomains."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234 http://explicit."+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that a valid story URL must match."""
        return re.escape("http://")+r"(www\.|explicit\.)?"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    def _fetchStoryPage(self, url):
        """Fetch ``url``, mapping HTTP 404 to StoryDoesNotExist."""
        try:
            return self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and fill in chapter URLs and metadata.

        Raises:
            exceptions.StoryDoesNotExist: the index page returned HTTP 404.
            exceptions.AdultCheckRequired: the page shows a warning link
                and the user has not declared being an adult.
            exceptions.FailedToDownload: the site says the story has not
                been validated.
        """
        is_adult = self.is_adult or self.getConfig("is_adult")
        if is_adult:
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&warning=9"
        else:
            addurl = ""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)

        data = self._fetchStoryPage(url)

        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'", data)
        if m is not None:
            if not is_adult:
                raise exceptions.AdultCheckRequired(self.url)
            # We tried the default and still got a warning, so
            # let's pull the warning number from the 'continue'
            # link and reload data.
            addurl = m.group(1)
            # correct stupid &amp; error in url.
            addurl = addurl.replace("&amp;", "&")
            url = self.url+'&index=1'+addurl
            logger.debug("URL 2nd try: "+url)
            data = self._fetchStoryPage(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title', stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId', a['href'].split('=')[1])
        self.story.setMetadata('authorUrl', 'http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author', a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter), 'http://'+self.host+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method
        def defaultGetattr(d, k):
            # Strings, None and tags without the attribute all yield "".
            try:
                return d[k]
            except Exception:
                return ""

        genres = soup.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=1'))
        for genre in genres:
            self.story.addToList('genre', genre.string)

        warnings = soup.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=3'))
        for warning in warnings:
            self.story.addToList('warnings', warning.string)

        # Not every story page necessarily carries a status link; a
        # 'Completed' label below may still set the status.
        status = soup.find('a', href=re.compile(r'browse.php\?type=class&type_id=2'))
        if status is not None:
            self.story.setMetadata('status', status.string)

        # The second span of class 'General' sits between the rating
        # (two siblings before it) and the summary (everything after it
        # up to the first label span).
        section = soup.findAll('span', {'class' : 'General'})[1]
        self.story.setMetadata('rating', section.previousSibling.previousSibling.string)

        value = section.nextSibling
        svalue = ""
        # Also stop at the end of the siblings, otherwise a summary with
        # no label span after it would walk off into None.
        while value and not defaultGetattr(value, 'class') == 'label':
            svalue += str(value)
            value = value.nextSibling
        self.setDescription(url, svalue)

        # <span class="label">Rated:</span> NC-17<br /> etc
        for labelspan in soup.findAll('span', {'class':'label'}):
            value = labelspan.nextSibling
            label = labelspan.string
            if label is None:
                # Label span contains nested markup; nothing to match on.
                continue

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a', href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters', char.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Series handling is best effort: a failure here must not stop
        # the download, but it is logged instead of vanishing silently.
        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            if a is not None:
                series_name = a.string
                series_url = 'http://'+self.host+'/'+a['href']

                # use BeautifulSoup HTML parser to make everything easier to find.
                seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
                # can't use ^viewstory...$ in case of higher rated stories with javascript href.
                storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
                wanted = 'viewstory.php?sid='+self.story.getMetadata('storyId')
                i = 1
                for storya in storyas:
                    # skip 'report this' and 'TOC' links
                    if 'contact.php' not in storya['href'] and 'index' not in storya['href']:
                        if storya['href'] == wanted:
                            self.setSeries(series_name, i)
                            self.story.setMetadata('seriesUrl', series_url)
                            break
                        i += 1
        except Exception as e:
            logger.debug("Series parsing failed for %s: %s" % (self.url, e))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch ``url`` and return the contents of its <div id="story">.

        Raises exceptions.FailedToDownload when that div is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulSoup(self._fetchUrl(url))

        div = soup.find('div', {'id' : 'story'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url, div)

View file

@ -0,0 +1,277 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
import cookielib as cl
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
# Search for XXX comments--that's where things are most likely to need changing.
# This function is called by the downloader in all adapter_*.py files
# in this dir to register the adapter class. So it needs to be
# updated to reflect the class below it. That, plus getSiteDomain()
# take care of 'Registering'.
def getClass():
    """Return the adapter class this module registers with the downloader."""
    return PortkeyOrgAdapter # XXX
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class PortkeyOrgAdapter(BaseSiteAdapter): # XXX
    """Site adapter for stories hosted on fanfiction.portkey.org.

    Story URLs look like ``http://fanfiction.portkey.org/story/1234``,
    optionally followed by ``/<chapter number>``.  The story page
    supplies the author link and the chapter list; the title and the
    remaining metadata are scraped from the author's profile page.
    """

    def __init__(self, config, url):
        """Record the story id and the normalized story URL for *url*."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # The story id is the path component after /story/, i.e. index 2
        # of ['', 'story', '<id>', ...].  URL validation against
        # getSiteURLPattern() is expected to guarantee it is present.
        self.story.setMetadata('storyId', self.parsedUrl.path.split('/')[2])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/story/' + self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','prtky') # XXX

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d/%m/%y" # XXX

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site's host name."""
        # The site domain. Does have www here, if it uses it.
        return 'fanfiction.portkey.org' # XXX

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example of a story URL this adapter accepts."""
        return "http://"+cls.getSiteDomain()+"/story/1234"

    def getSiteURLPattern(self):
        """Return the regex source that story URLs must match."""
        return re.escape("http://"+self.getSiteDomain()+"/story/")+r"\d+(/\d+)?$"

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story and author pages and fill in story metadata.

        Populates ``self.chapterUrls`` and the metadata on ``self.story``.
        Categories, characters, warnings and series are not parsed for
        this site.

        Raises exceptions.StoryDoesNotExist on HTTP 404 and
        exceptions.AdultCheckRequired when the site shows its age gate.
        """
        url = self.url
        logger.debug("URL: "+url)

        # portkey screws around with using a different URL to set the
        # cookie and it's a pain. So... cheat!
        if self.is_adult or self.getConfig("is_adult"):
            cookieproc = urllib2.HTTPCookieProcessor()
            cookie = cl.Cookie(version=0, name='verify17', value='1',
                               port=None, port_specified=False,
                               domain=self.getSiteDomain(), domain_specified=False, domain_initial_dot=False,
                               path='/', path_specified=True,
                               secure=False,
                               expires=time.time()+10000,
                               discard=False,
                               comment=None,
                               comment_url=None,
                               rest={'HttpOnly': None},
                               rfc2109=False)
            cookieproc.cookiejar.set_cookie(cookie)
            self.opener = urllib2.build_opener(cookieproc)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

        if "You must be over 18 years of age to view it" in data: # XXX
            raise exceptions.AdultCheckRequired(self.url)

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        storyId = self.story.getMetadata('storyId')

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"/profile/\d+"))
        self.story.setMetadata('authorId',a['href'].split('/')[-1])
        self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
        self.story.setMetadata('author',a.string)

        ## Going to get the rest from the author page.
        authsoup = bs.BeautifulSoup(self._fetchUrl(self.story.getMetadata('authorUrl')))

        ## Title
        titlea = authsoup.find('a', href=re.compile(r'/story/'+storyId+"$"))
        self.story.setMetadata('title',stripHTML(titlea))
        metablock = titlea.parent

        # Find the chapters.  A page without the chapter <select> is
        # treated like a page with no chapter options.
        chapterselect = soup.find('select',{'name':'select5'})
        if chapterselect is None:
            options = []
        else:
            options = chapterselect.findAll('option', {'value':re.compile(r'/story/'+storyId+r"/\d+$")})
        for chapter in options:
            # just in case there's tags, like <i> in chapter titles.
            chtitle = stripHTML(chapter)
            if not chtitle:
                chtitle = "(Untitled Chapter)"
            self.chapterUrls.append((chtitle,'http://'+self.host+chapter['value']))

        if not self.chapterUrls:
            # no chapter list: the story URL itself is the only chapter.
            self.chapterUrls.append((stripHTML(self.story.getMetadata('title')),url))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        def defaultGetattr(d,k):
            # d[k] raises KeyError for a tag lacking the attribute and
            # TypeError for text nodes / None.
            try:
                return d[k]
            except (KeyError, TypeError):
                return ""

        # The metadata is a run of label/value span pairs:
        # <SPAN class="dark-small-bold">Contents:</SPAN> <SPAN class="small-grey">NC17 </SPAN>
        # <SPAN class="dark-small-bold">Published: </SPAN><SPAN class="small-grey">12/11/07</SPAN>
        # <SPAN class="dark-small-bold"><BR>
        # Description:</SPAN> <SPAN class="small-black">...</SPAN>
        for labelspan in metablock.findAll('span',{'class':'dark-small-bold'}):
            value = labelspan.findNext('span').string
            label = stripHTML(labelspan)

            if 'Description' in label:
                self.setDescription(url,value)

            if 'Contents' in label:
                self.story.setMetadata('rating', value)

            if 'Words' in label:
                self.story.setMetadata('numWords', value)

            if 'Genre' in label:
                # genre is typo'ed on the site--it falls between the
                # dark-small-bold label and dark-small-bold content
                # spans.  Collect siblings up to the next label span,
                # stopping cleanly if the siblings run out first.
                pieces = []
                node = labelspan.nextSibling
                while node is not None and defaultGetattr(node,'class') != 'dark-small-bold':
                    pieces.append(str(node))
                    node = node.nextSibling
                for genre in "".join(pieces).split("/"):
                    genre = genre.strip()
                    if genre != 'None':
                        self.story.addToList('genre',genre)

            if 'Status' in label:
                if 'Completed' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch the chapter at *url* and return its cleaned story text.

        When the site reports "Chapter does not exist!" a small
        placeholder document linking to *url* is returned instead.

        Raises exceptions.FailedToDownload when the story cell is
        missing for any other reason.
        """
        logger.debug('Getting chapter text from: %s' % url)

        data = self._fetchUrl(url)
        data = data.replace("HTML>","div>")
        soup = bs.BeautifulSoup(data)

        tag = soup.find('td', {'class' : 'story'})

        # Check for a missing story cell before touching it; the
        # attribute accesses below would fail on None.
        if tag is None:
            if "<center><b>Chapter does not exist!</b></center>" in data:
                logger.warning("Chapter is missing at: %s"%url)
                return self.utf8FromSoup(url,bs.BeautifulStoneSoup("<div><p><center><b>Chapter does not exist!</b></center></p><p>Chapter is missing at: <a href='%s'>%s</a></p></div>"%(url,url)))
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        tag.name='div' # force to be a div to avoid problems with nook.

        centers = tag.findAll('center')
        # first two and last two center tags are some script, 'report
        # story', 'report story' and an ad.
        centers[0].extract()
        centers[1].extract()
        centers[-1].extract()
        centers[-2].extract()

        return self.utf8FromSoup(url,tag)

View file

@ -0,0 +1,210 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
    """Site adapter for www.potionsandsnitches.net.

    Story URLs look like
    ``http://www.potionsandsnitches.net/fanfiction/viewstory.php?sid=1234``;
    the host is accepted with or without the ``www.`` prefix.
    """

    def __init__(self, config, url):
        """Record the story id and the normalized story URL for *url*."""
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','pns')
        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/fanfiction/viewstory.php?sid='+self.story.getMetadata('storyId'))

    @staticmethod
    def getSiteDomain():
        """Return the site's canonical host name."""
        return 'www.potionsandsnitches.net'

    @classmethod
    def getAcceptDomains(cls):
        """Return every host name this adapter handles."""
        return ['www.potionsandsnitches.net','potionsandsnitches.net']

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example of a story URL this adapter accepts."""
        return "http://www.potionsandsnitches.net/fanfiction/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that story URLs must match."""
        return re.escape("http://")+r"(www\.)?"+re.escape("potionsandsnitches.net/fanfiction/viewstory.php?sid=")+r"\d+$"

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and fill in story metadata.

        Populates ``self.chapterUrls`` and the metadata on ``self.story``.
        Series information is looked up on a best-effort basis.

        Raises exceptions.StoryDoesNotExist on HTTP 404 and
        exceptions.FailedToDownload when the site denies access to an
        unvalidated story.
        """
        url = self.url+'&index=1'
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        storyId = self.story.getMetadata('storyId')

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+storyId+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/fanfiction/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+storyId+r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/fanfiction/'+chapter['href']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        ## The summary is also in the content attr of a
        ## <meta name='description'> tag as escaped HTML, but it can't be
        ## used because (') chars in it are not escaped, breaking the tag.

        def defaultGetattr(d,k):
            # d[k] raises KeyError for a tag lacking the attribute and
            # TypeError for text nodes / None.
            try:
                return d[k]
            except (KeyError, TypeError):
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        for labelspan in soup.findAll('span',{'class':'label'}):
            value = labelspan.nextSibling
            label = labelspan.string
            if not label:
                # a label span with nested markup has no .string;
                # nothing below could match it.
                continue

            if 'Summary' in label:
                ## Everything until the next div class='listbox', or
                ## until the siblings run out.
                pieces = []
                node = value
                while node is not None and defaultGetattr(node,'class') != 'listbox':
                    pieces.append(str(node))
                    node = node.nextSibling
                self.setDescription(url,"".join(pieces))

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    if cat.string:
                        self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    name = char.string
                    if not name:
                        continue
                    if "Snape and Harry (required)" in name:
                        # the site's combined entry becomes two characters.
                        self.story.addToList('characters',"Snape")
                        self.story.addToList('characters',"Harry")
                    else:
                        self.story.addToList('characters',name)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
                genrestext = [genre.string for genre in genres if genre.string]
                self.genre = ', '.join(genrestext)
                for genre in genrestext:
                    self.story.addToList('genre',genre)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                # limit date values, there's some extra chars.
                self.story.setMetadata('datePublished', makeDate(stripHTML(value[:12]), "%d %b %Y"))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value[:12]), "%d %b %Y"))

        # Series information is optional: a story without a series link
        # is skipped, and any failure while reading the series page is
        # logged and ignored rather than aborting the download.
        seriesa = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        if seriesa is not None:
            try:
                series_name = seriesa.string
                series_url = 'http://'+self.host+'/fanfiction/'+seriesa['href']

                # use BeautifulSoup HTML parser to make everything easier to find.
                seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
                storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
                wanted = 'viewstory.php?sid='+storyId
                for i, a in enumerate(storyas, 1):
                    if a['href'] == wanted:
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl',series_url)
                        break
            except Exception:
                logger.debug("Series parsing failed for %s" % url, exc_info=True)

    def getChapterText(self, url):
        """Fetch the chapter at *url* and return its story text.

        Raises exceptions.FailedToDownload when the page has no
        ``<div id="story">`` element.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'story'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)
def getClass():
    """Return the adapter class this module registers with the downloader."""
    return PotionsAndSnitchesNetSiteAdapter

View file

@ -0,0 +1,243 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter
# This function is called by the downloader in all adapter_*.py files
# in this dir to register the adapter class. So it needs to be
# updated to reflect the class below it. That, plus getSiteDomain()
# take care of 'Registering'.
def getClass():
    """Return the adapter class this module registers with the downloader."""
    return PotterFicsComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class PotterFicsComAdapter(BaseSiteAdapter):
    """Site adapter for the Spanish-language site www.potterfics.com.

    Accepted URLs are ``http://www.potterfics.com/historias/<id>`` with
    an optional ``/capitulo-<n>`` suffix; both normalize to the story's
    chapter index URL.
    """

    # Month number for each (lower-cased) Spanish month name used in the
    # site's date strings.
    SPANISH_MONTHS = {
        'enero' : 1,
        'febrero' : 2,
        'marzo' : 3,
        'abril' : 4,
        'mayo' : 5,
        'junio' : 6,
        'julio' : 7,
        'agosto' : 8,
        'septiembre' : 9,
        'octubre' : 10,
        'noviembre' : 11,
        'diciembre' : 12,
        }

    def __init__(self, config, url):
        """Record the story id and normalized URL taken from *url*.

        Raises exceptions.InvalidStoryURL when *url* does not match
        getSiteURLPattern().
        """
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url
        m = re.match(self.getSiteURLPattern(),url)
        if not m:
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())

        self.story.setMetadata('storyId',m.group('id'))

        # normalized story URL. gets rid of chapter if there, left with chapter index URL
        nurl = "http://"+self.getSiteDomain()+"/historias/"+self.story.getMetadata('storyId')
        self._setURL(nurl)

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','potficscom')

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site's host name."""
        # The site domain. Does have www here, if it uses it.
        return 'www.potterfics.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return space-separated examples of accepted story URLs."""
        return "http://www.potterfics.com/historias/12345 http://www.potterfics.com/historias/12345/capitulo-1 "

    def getSiteURLPattern(self):
        """Return the regex source for story URLs.

        The pattern has the named groups ``id`` (story id) and ``ch``
        (optional chapter number).
        """
        #http://www.potterfics.com/historias/127583
        #http://www.potterfics.com/historias/127583/capitulo-1
        #http://www.potterfics.com/historias/127583/capitulo-4
        #http://www.potterfics.com/historias/92810 -> Complete story
        #http://www.potterfics.com/historias/111194 -> Complete, single chap
        p = re.escape("http://"+self.getSiteDomain()+"/historias/")+\
            r"(?P<id>\d+)(/capitulo-(?P<ch>\d+))?/?$"
        return p

    @classmethod
    def _parseSpanishDate(cls, text):
        """Turn one of the site's date strings into a datetime.

        After whitespace splitting, the day is word 3, the Spanish
        month name word 5, the year (with a trailing comma) word 7 and
        the ``HH:MM`` time word 8.
        """
        words = text.strip().split()
        clock = words[8].split(':')
        return datetime.datetime(int(words[7][:-1]), # drop the trailing comma
                                 cls.SPANISH_MONTHS[words[5].lower()],
                                 int(words[3]),
                                 int(clock[0]),
                                 int(clock[1]))

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and fill in story metadata.

        Populates ``self.chapterUrls`` and the metadata on ``self.story``
        (including a review total summed over all chapters).

        Raises exceptions.StoryDoesNotExist on HTTP 404.
        """
        #this converts '/historias/12345' to 'http://www.potterfics.com/historias/12345'
        def makeAbsoluteURL(url):
            if url[0] == '/':
                url = 'http://'+self.getSiteDomain()+url
            return url

        url = self.url
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

        #deal with adult content warnings - doesn't seem to apply to this site

        #set constant meta for this site:
        #Set Language = Spanish
        self.story.setMetadata('language', 'Spanish')

        #Category = Harry Potter is better done in plugin-defaults.ini
        #and defaults.ini by adding a section for this site with the line:
        #    extracategories:Harry Potter

        # use BeautifulSoup HTML parser to make everything easier to find.
        #self closing br and img present!
        soup = bs.BeautifulSoup(data,selfClosingTags=('br','img'))

        #we want the second table directly under the body, contains all the metadata
        table = soup.html.body.findAll('table', recursive=False)[1]
        #within that, we want the second row, first cell
        cell = table.tr.findNextSibling('tr').td

        #first metadata block: title, rating, completion marker
        titleblock = cell.div.findNextSibling('div')
        self.story.setMetadata('title', stripHTML(titleblock.b))
        #strip out brackets on rating
        self.story.setMetadata('rating', titleblock.span.string[1:-1])
        #Completion status is denoted by the presence of this image:
        if titleblock.find('img',title="Historia terminada"):
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        #next metadata block: author details
        authorblock = titleblock.findNextSibling('div')
        self.story.setMetadata('author', authorblock.b.a.string.strip())
        self.story.setMetadata('authorUrl', makeAbsoluteURL(authorblock.b.a['href']))
        self.story.setMetadata('authorId', self.story.getMetadata('authorUrl').split('/')[4])

        #dates and times
        datespan = authorblock.find('span')
        #posted/published = Escrita
        self.story.setMetadata('datePublished',
                               self._parseSpanishDate(datespan.find(text=re.compile('Escrita el '))))
        #updated = Actualizada
        self.story.setMetadata('dateUpdated',
                               self._parseSpanishDate(datespan.find(text=re.compile('Actualizada el '))))

        statsspan = datespan.span.findNextSibling('span').findNextSibling('span')
        wc = statsspan.find(text=re.compile(' palabras en total')).strip()
        self.story.setMetadata('numWords', wc.split()[0])

        #then we come to categories and genres. Oh dear. On this site, categories hold everything from genre, to ships, to crossovers.
        #To make things worse, there is also another genre field, which often holds similar/duplicate info. Links to genre pages do not work
        #though, so perhaps those will be phased out?
        #for now, put them all into the genre list
        links = statsspan.findAll('a',href=re.compile(r'/(categorias|generos)/\d+'))
        self.story.extendList('genre',[link.string.strip() for link in links])

        #get the chapter urls
        #we can go back to the table cell we found before
        #get its last element and work backwards to find the last ordered list on the page
        chapterlist = cell.contents[-1].findPrevious('ol')

        chapters = []
        revs = 0
        # Only the <li> children are chapters; iterating the <ol> itself
        # would also yield any whitespace text nodes between them.
        for chnum, li in enumerate(chapterlist.findAll('li', recursive=False), 1):
            chTitle = str(chnum) + '. ' + li.a.b.string.strip()
            chURL = makeAbsoluteURL(li.a['href'])
            chapters.append((chTitle,chURL))
            #Get reviews, add to total
            revs += int(li.div.a.string.split()[0])

        self.chapterUrls.extend(chapters)
        self.story.setMetadata('numChapters', len(chapters))
        self.story.setMetadata('reviews', revs)

        #Now for the description... this may be tricky...
        #if it is there (doesn't have to be), it will be before the chapter list,
        #separated by a horizontal rule, and after the google ad bar
        #the div before the list's parent div is either the description or the google ad bar
        descdiv = chapterlist.parent.findPreviousSibling('div')
        if descdiv is not None and 'google_ad_client' not in str(descdiv):
            self.setDescription(url,descdiv)
        #otherwise: couldn't find description, leaving it blank

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch the chapter at *url* and return its story text.

        Raises exceptions.FailedToDownload when the page has no
        ``<div id="cuerpoHistoria">`` element.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulSoup(self._fetchUrl(url),
                                selfClosingTags=('br','hr','img'))
        div = soup.find('div', {'id' : 'cuerpoHistoria'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,298 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Return the adapter class this module registers with the downloader."""
    return PotterHeadsAnonymousComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class PotterHeadsAnonymousComAdapter(BaseSiteAdapter):
    """Site adapter for fanfic.potterheadsanonymous.com (an eFiction site).

    Story URLs look like
    ``http://fanfic.potterheadsanonymous.com/viewstory.php?sid=1234``.
    Handles the site's login prompt and its age/warning interstitial.
    """

    def __init__(self, config, url):
        """Record the story id and the normalized story URL for *url*."""
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8"] # 1252 is a superset of iso-8859-1.
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','pha')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d %b %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site's host name."""
        # The site domain. Does have www here, if it uses it.
        return 'fanfic.potterheadsanonymous.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example of a story URL this adapter accepts."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Return the regex source that story URLs must match."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True when page text *data* contains a login prompt or login error."""
        return ('Registered Users Only' in data
                or 'There is no such account on our website' in data
                or "That password doesn't match the one in our database" in data)

    def performLogin(self, url):
        """Log in to the site and return True on success.

        Uses ``self.username``/``self.password`` when a password has been
        set on the adapter, otherwise the configured "username" and
        "password" values.

        Raises exceptions.FailedToLogin when the response does not
        contain "Member Account".
        """
        params = {}

        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['cookiecheck'] = '1'
        params['submit'] = 'Submit'

        loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              params['penname']))

        d = self._fetchUrl(loginUrl, params)

        if "Member Account" not in d : #Member Account
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                              params['penname']))
            raise exceptions.FailedToLogin(url,params['penname'])

        return True

    def _fetchStoryPage(self, url):
        """Fetch *url*, translating an HTTP 404 into StoryDoesNotExist."""
        try:
            return self._fetchUrl(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            # bare raise keeps the original traceback.
            raise

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and fill in story metadata.

        Populates ``self.chapterUrls`` and the metadata on ``self.story``.
        Series information is looked up on a best-effort basis.

        Raises exceptions.StoryDoesNotExist on HTTP 404,
        exceptions.AdultCheckRequired when a warning page is shown and
        is_adult is not set, exceptions.FailedToLogin when a required
        login fails, and exceptions.FailedToDownload when the site
        denies access to an unvalidated story.
        """
        is_adult = self.is_adult or self.getConfig("is_adult")

        if is_adult:
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&ageconsent=ok&warning=4"
        else:
            addurl = ""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)
        data = self._fetchStoryPage(url)

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        # Since the warning text can change by warning level, let's
        # look for the warning pass url instead, e.g.
        #   viewstory.php?sid=1882&amp;warning=4
        #   viewstory.php?sid=1654&amp;ageconsent=ok&amp;warning=5
        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
        if m is not None:
            if not is_adult:
                raise exceptions.AdultCheckRequired(self.url)

            # We tried the default and still got a warning, so
            # let's pull the warning number from the 'continue'
            # link and reload data.  The link has &amp; where a real
            # URL needs &.
            addurl = m.group(1).replace("&amp;","&")
            url = self.url+'&index=1'+addurl
            logger.debug("URL 2nd try: "+url)
            data = self._fetchStoryPage(url)

        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)
        storyId = self.story.getMetadata('storyId')

        # Now go hunting for all the meta data and the chapter list.
        pagetitle = soup.find('div',{'id':'pagetitle'})

        ## Title
        a = pagetitle.find('a', href=re.compile(r'viewstory.php\?sid='+storyId+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = pagetitle.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+storyId+r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formatting, so it's a little ugly.

        def defaultGetattr(d,k):
            # d[k] raises KeyError for a tag lacking the attribute and
            # TypeError for text nodes / None.
            try:
                return d[k]
            except (KeyError, TypeError):
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        for labelspan in soup.findAll('span',{'class':'label'}):
            value = labelspan.nextSibling
            label = labelspan.string
            if not label:
                # a label span with nested markup has no .string;
                # nothing below could match it.
                continue

            if 'Summary' in label:
                ## Everything until the next span class='label', or
                ## until the siblings run out.
                pieces = []
                node = value
                while node is not None and defaultGetattr(node,'class') != 'label':
                    pieces.append(str(node))
                    node = node.nextSibling
                self.setDescription(url,"".join(pieces))

            if 'Rated' in label:
                self.story.setMetadata('rating', value)

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=3'))
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        # Series information is optional: a story without a series link
        # is skipped, and any failure while reading the series page is
        # logged and ignored rather than aborting the download.
        seriesa = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
        if seriesa is not None:
            try:
                series_name = seriesa.string
                series_url = 'http://'+self.host+'/'+seriesa['href']

                # use BeautifulSoup HTML parser to make everything easier to find.
                seriessoup = bs.BeautifulSoup(self._fetchUrl(series_url))
                storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
                wanted = 'viewstory.php?sid='+storyId
                for i, a in enumerate(storyas, 1):
                    if a['href'] == wanted:
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl',series_url)
                        break
            except Exception:
                logger.debug("Series parsing failed for %s" % url, exc_info=True)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch the chapter at *url* and return its story text.

        Raises exceptions.FailedToDownload when the page has no
        ``<div id="story">`` element.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        div = soup.find('div', {'id' : 'story'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return self.utf8FromSoup(url,div)

Some files were not shown because too many files have changed in this diff Show more