Compare commits

..

No commits in common. "main" and "v4.29.0" have entirely different histories.

193 changed files with 18395 additions and 31568 deletions

3
.gitignore vendored
View file

@ -20,9 +20,6 @@
# pycharm project specific settings files
.idea
# vscode project specific settings file
.vscode
cleanup.sh
FanFictionDownLoader.zip
*.epub

View file

@ -52,9 +52,9 @@ Test versions are available at:
- The [test plugin] is posted at MobileRead.
- The test version of CLI for pip install is uploaded to the testpypi repository and can be installed with:
```
pip install --extra-index-url https://test.pypi.org/simple/ --upgrade FanFicFare
```
> `pip install --extra-index-url https://testpypi.python.org/pypi --upgrade FanFicFare`
### Other Releases

View file

@ -33,7 +33,7 @@ except NameError:
from calibre.customize import InterfaceActionBase
# pulled out from FanFicFareBase for saving in prefs.py
__version__ = (4, 57, 7)
__version__ = (4, 29, 0)
## Apparently the name for this class doesn't matter--it was still
## 'demo' for the first few versions.

View file

@ -1,20 +0,0 @@
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2024, Jim Miller'
__docformat__ = 'restructuredtext en'
## References:
## https://www.mobileread.com/forums/showthread.php?p=4435205&postcount=65
## https://www.mobileread.com/forums/showthread.php?p=4102834&postcount=389
from calibre_plugins.action_chains.events import ChainEvent
class FanFicFareDownloadFinished(ChainEvent):
# replace with the name of your event
name = 'FanFicFare Download Finished'
def get_event_signal(self):
return self.gui.iactions['FanFicFare'].download_finished_signal

View file

@ -2,6 +2,7 @@
from __future__ import (unicode_literals, division, absolute_import,
print_function)
import six
__license__ = 'GPL v3'
__copyright__ = '2011, Grant Drake <grant.drake@gmail.com>, 2018, Jim Miller'
@ -21,9 +22,7 @@ from calibre.gui2.actions import menu_action_unique_name
from calibre.gui2.keyboard import ShortcutConfig
from calibre.utils.config import config_dir
from calibre.utils.date import now, format_date, qt_to_dt, UNDEFINED_DATE
import fanficfare.six as six
from six import text_type as unicode
from fanficfare.six import text_type as unicode
# Global definition of our plugin name. Used for common functions that require this.
plugin_name = None

View file

@ -2,6 +2,7 @@
from __future__ import (unicode_literals, division, absolute_import,
print_function)
import six
__license__ = 'GPL v3'
__copyright__ = '2021, Jim Miller'
@ -23,8 +24,7 @@ from PyQt5.Qt import (QWidget, QVBoxLayout, QHBoxLayout, QGridLayout, QLabel,
from calibre.gui2 import dynamic, info_dialog
from calibre.gui2.complete2 import EditWithComplete
from calibre.gui2.dialogs.confirm_delete import confirm
import fanficfare.six as six
from six import text_type as unicode
from fanficfare.six import text_type as unicode
try:
from calibre.ebooks.covers import generate_cover as cal_generate_cover
@ -371,7 +371,6 @@ class ConfigWidget(QWidget):
prefs['suppresstitlesort'] = self.std_columns_tab.suppresstitlesort.isChecked()
prefs['authorcase'] = self.std_columns_tab.authorcase.isChecked()
prefs['titlecase'] = self.std_columns_tab.titlecase.isChecked()
prefs['seriescase'] = self.std_columns_tab.seriescase.isChecked()
prefs['setanthologyseries'] = self.std_columns_tab.setanthologyseries.isChecked()
prefs['set_author_url'] =self.std_columns_tab.set_author_url.isChecked()
@ -417,10 +416,6 @@ class ConfigWidget(QWidget):
prefs['auto_reject_from_email'] = self.imap_tab.auto_reject_from_email.isChecked()
prefs['update_existing_only_from_email'] = self.imap_tab.update_existing_only_from_email.isChecked()
prefs['download_from_email_immediately'] = self.imap_tab.download_from_email_immediately.isChecked()
prefs['site_split_jobs'] = self.other_tab.site_split_jobs.isChecked()
prefs['reconsolidate_jobs'] = self.other_tab.reconsolidate_jobs.isChecked()
prefs.save_to_db()
self.plugin_action.set_popup_mode()
@ -761,7 +756,6 @@ class BasicTab(QWidget):
tooltip=_("One URL per line:\n<b>http://...,note</b>\n<b>http://...,title by author - note</b>"),
rejectreasons=rejecturllist.get_reject_reasons(),
reasonslabel=_('Add this reason to all URLs added:'),
accept_storyurls=True,
save_size_name='fff:Add Reject List')
d.exec_()
if d.result() == d.Accepted:
@ -1100,7 +1094,7 @@ class CalibreCoverTab(QWidget):
self.plugin_gen_cover = QRadioButton(_('Plugin %(gc)s')%no_trans,self)
self.plugin_gen_cover.setToolTip(_("Use the %(gc)s plugin to create covers.<br>"
"Requires that you have the the %(gc)s plugin installed.<br>"
"Additional settings are below.")%no_trans)
"Additional settings are below."%no_trans))
self.gencov_rdgrp.addButton(self.plugin_gen_cover)
# always, new only, when no cover from site, inject yes/no...
self.plugin_gen_cover.setChecked(prefs['plugin_gen_cover'])
@ -1280,31 +1274,6 @@ class OtherTab(QWidget):
self.l = QVBoxLayout()
self.setLayout(self.l)
groupbox = QGroupBox()
self.l.addWidget(groupbox)
groupl = QVBoxLayout()
groupbox.setLayout(groupl)
label = QLabel("<h3>"+
_("Background Job Settings")+
"</h3>"
)
label.setWordWrap(True)
groupl.addWidget(label)
self.site_split_jobs = QCheckBox(_('Split downloads into separate background jobs by site'),self)
self.site_split_jobs.setToolTip(_("Launches a separate background Job for each site in the list of stories to download/update. Otherwise, there will be only one background job."))
self.site_split_jobs.setChecked(prefs['site_split_jobs'])
groupl.addWidget(self.site_split_jobs)
self.reconsolidate_jobs = QCheckBox(_('Reconsolidate split downloads before updating library'),self)
self.reconsolidate_jobs.setToolTip(_("Hold all downloads/updates launched together until they all finish. Otherwise, there will be a 'Proceed to update' dialog for each site."))
self.reconsolidate_jobs.setChecked(prefs['reconsolidate_jobs'])
groupl.addWidget(self.reconsolidate_jobs)
self.l.addSpacing(5)
label = QLabel(_("These controls aren't plugin settings as such, but convenience buttons for setting Keyboard shortcuts and getting all the FanFicFare confirmation dialogs back again."))
label.setWordWrap(True)
self.l.addWidget(label)
@ -1638,11 +1607,6 @@ class StandardColumnsTab(QWidget):
self.setanthologyseries.setChecked(prefs['setanthologyseries'])
row.append(self.setanthologyseries)
self.seriescase = QCheckBox(_('Fix Series Case?'),self)
self.seriescase.setToolTip(_("If checked, Calibre's routine for correcting the capitalization of title will be applied.")
+"\n"+_("This effects Calibre metadata only, not FanFicFare metadata in title page."))
self.seriescase.setChecked(prefs['seriescase'])
row.append(self.seriescase)
grid = QGridLayout()
for rownum, row in enumerate(rows):
for colnum, col in enumerate(row):

View file

@ -38,7 +38,6 @@ from calibre.gui2 import gprefs
show_download_options = 'fff:add new/update dialogs:show_download_options'
from calibre.gui2.dialogs.confirm_delete import confirm
from calibre.gui2.complete2 import EditWithComplete
from fanficfare.exceptions import NotGoingToDownload
from fanficfare.six import text_type as unicode, ensure_text
# pulls in translation files for _() strings
@ -156,6 +155,15 @@ class RejectUrlEntry:
return retval
class NotGoingToDownload(Exception):
def __init__(self,error,icon='dialog_error.png',showerror=True):
self.error=error
self.icon=icon
self.showerror=showerror
def __str__(self):
return self.error
class DroppableQTextEdit(QTextEdit):
def __init__(self,parent):
QTextEdit.__init__(self,parent)
@ -181,32 +189,12 @@ class DroppableQTextEdit(QTextEdit):
else:
return QTextEdit.insertFromMimeData(self, mime_data)
class HotKeyedSizePersistedDialog(SizePersistedDialog):
def __init__(self, gui, save_size_name):
super(HotKeyedSizePersistedDialog,self).__init__(gui, save_size_name)
self.keys=dict()
def addCtrlKeyPress(self,key,func):
# print("addKeyPress: key(0x%x)"%key)
# print("control: 0x%x"%QtCore.Qt.ControlModifier)
self.keys[key]=func
def keyPressEvent(self, event):
# print("event: key(0x%x) modifiers(0x%x)"%(event.key(),event.modifiers()))
if (event.modifiers() & QtCore.Qt.ControlModifier) and event.key() in self.keys:
func = self.keys[event.key()]
return func()
else:
return super(HotKeyedSizePersistedDialog,self).keyPressEvent(event)
class AddNewDialog(HotKeyedSizePersistedDialog):
class AddNewDialog(SizePersistedDialog):
go_signal = pyqtSignal(object, object, object, object)
def __init__(self, gui, prefs, icon):
super(AddNewDialog,self).__init__(gui, 'fff:add new dialog')
SizePersistedDialog.__init__(self, gui, 'fff:add new dialog')
self.prefs = prefs
self.setMinimumWidth(300)
@ -345,9 +333,6 @@ class AddNewDialog(HotKeyedSizePersistedDialog):
self.button_box.rejected.connect(self.reject)
self.l.addWidget(self.button_box)
self.addCtrlKeyPress(QtCore.Qt.Key_Return,self.ok_clicked)
self.addCtrlKeyPress(QtCore.Qt.Key_Enter,self.ok_clicked) # num pad
def click_show_download_options(self,x):
self.gbf.setVisible(x)
gprefs[show_download_options] = x
@ -490,15 +475,14 @@ class AddNewDialog(HotKeyedSizePersistedDialog):
self.collision.setCurrentIndex(i)
def get_fff_options(self):
retval = dict(self.extraoptions)
retval.update( {
'fileform': unicode(self.fileform.currentText()),
'collision': unicode(self.collision.currentText()),
'updatemeta': self.updatemeta.isChecked(),
'bgmeta': False, # self.bgmeta.isChecked(),
'smarten_punctuation':self.prefs['smarten_punctuation'],
'do_wordcount':self.prefs['do_wordcount'],
} )
retval = {
'fileform': unicode(self.fileform.currentText()),
'collision': unicode(self.collision.currentText()),
'updatemeta': self.updatemeta.isChecked(),
'bgmeta': False, # self.bgmeta.isChecked(),
'smarten_punctuation':self.prefs['smarten_punctuation'],
'do_wordcount':self.prefs['do_wordcount'],
}
if self.merge:
retval['fileform']=='epub'
@ -513,6 +497,7 @@ class AddNewDialog(HotKeyedSizePersistedDialog):
def get_urlstext(self):
return unicode(self.url.toPlainText())
class FakeLineEdit():
def __init__(self):
pass
@ -634,48 +619,6 @@ class UserPassDialog(QDialog):
self.status=False
self.hide()
class TOTPDialog(QDialog):
'''
Need to collect Timebased One Time Password(TOTP) for some sites.
'''
def __init__(self, gui, site, exception=None):
QDialog.__init__(self, gui)
self.status=False
self.l = QVBoxLayout()
self.setLayout(self.l)
grid = QGridLayout()
self.l.addLayout(grid)
self.setWindowTitle(_('Time-based One Time Password(TOTP)'))
grid.addWidget(QLabel(_("Site requires a Time-based One Time Password(TOTP) for this url:\n%s")%exception.url),0,0,1,2)
grid.addWidget(QLabel(_("TOTP:")),2,0)
self.totp = QLineEdit(self)
grid.addWidget(self.totp,2,1)
horz = QHBoxLayout()
self.l.addLayout(horz)
self.ok_button = QPushButton(_('OK'), self)
self.ok_button.clicked.connect(self.ok)
horz.addWidget(self.ok_button)
self.cancel_button = QPushButton(_('Cancel'), self)
self.cancel_button.clicked.connect(self.cancel)
horz.addWidget(self.cancel_button)
self.resize(self.sizeHint())
def ok(self):
self.status=True
self.hide()
def cancel(self):
self.status=False
self.hide()
def LoopProgressDialog(gui,
book_list,
foreach_function,
@ -713,7 +656,6 @@ class _LoopProgressDialog(QProgressDialog):
QProgressDialog.__init__(self,
init_label,
_('Cancel'), 0, len(book_list), gui)
self.gui = gui
self.setWindowTitle(win_title)
self.setMinimumWidth(500)
self.book_list = book_list
@ -1320,7 +1262,6 @@ class EditTextDialog(SizePersistedDialog):
icon=None, title=None, label=None, tooltip=None,
read_only=False,
rejectreasons=[],reasonslabel=None,
accept_storyurls=False,
save_size_name='fff:edit text dialog',
):
SizePersistedDialog.__init__(self, parent, save_size_name)
@ -1334,10 +1275,7 @@ class EditTextDialog(SizePersistedDialog):
self.setWindowIcon(icon)
self.l.addWidget(self.label)
if accept_storyurls:
self.textedit = DroppableQTextEdit(self)
else:
self.textedit = QTextEdit(self)
self.textedit = QTextEdit(self)
self.textedit.setLineWrapMode(QTextEditNoWrap)
self.textedit.setReadOnly(read_only)
self.textedit.setText(text)
@ -1381,18 +1319,7 @@ class EditTextDialog(SizePersistedDialog):
def get_reason_text(self):
return unicode(self.reason_edit.currentText()).strip()
class QTextEditPlainPaste(QTextEdit):
def insertFromMimeData(self, mimeData):
# logger.debug("insertFromMimeData called")
#Ensure it is text.
if (mimeData.hasText()):
text = mimeData.text()
self.insertPlainText(text)
#In case not text.
else:
QTextEdit.insertFromMimeData(self, mimeData)
class IniTextDialog(HotKeyedSizePersistedDialog):
class IniTextDialog(SizePersistedDialog):
def __init__(self, parent, text,
icon=None, title=None, label=None,
@ -1400,7 +1327,9 @@ class IniTextDialog(HotKeyedSizePersistedDialog):
read_only=False,
save_size_name='fff:ini text dialog',
):
super(IniTextDialog,self).__init__(parent, save_size_name)
SizePersistedDialog.__init__(self, parent, save_size_name)
self.keys=dict()
self.l = QVBoxLayout()
self.setLayout(self.l)
@ -1411,7 +1340,7 @@ class IniTextDialog(HotKeyedSizePersistedDialog):
self.setWindowIcon(icon)
self.l.addWidget(self.label)
self.textedit = QTextEditPlainPaste(self)
self.textedit = QTextEdit(self)
highlighter = IniHighlighter(self.textedit,
sections=get_valid_sections(),
@ -1501,6 +1430,19 @@ class IniTextDialog(HotKeyedSizePersistedDialog):
# print("call parent accept")
return SizePersistedDialog.accept(self)
def addCtrlKeyPress(self,key,func):
# print("addKeyPress: key(0x%x)"%key)
# print("control: 0x%x"%QtCore.Qt.ControlModifier)
self.keys[key]=func
def keyPressEvent(self, event):
# print("event: key(0x%x) modifiers(0x%x)"%(event.key(),event.modifiers()))
if (event.modifiers() & QtCore.Qt.ControlModifier) and event.key() in self.keys:
func = self.keys[event.key()]
return func()
else:
return SizePersistedDialog.keyPressEvent(self, event)
def get_plain_text(self):
return unicode(self.textedit.toPlainText())
@ -1569,6 +1511,7 @@ class IniTextDialog(HotKeyedSizePersistedDialog):
# And finally we set this new cursor as the parent's
self.textedit.setTextCursor(cursor)
class ViewLog(SizePersistedDialog):
def label_clicked(self, event, lineno=None):

File diff suppressed because it is too large Load diff

View file

@ -2,6 +2,7 @@
from __future__ import (unicode_literals, division, absolute_import,
print_function)
import six
__license__ = 'GPL v3'
__copyright__ = '2020, Jim Miller, 2011, Grant Drake <grant.drake@gmail.com>'
@ -16,6 +17,9 @@ from io import StringIO
from collections import defaultdict
import sys
from calibre.utils.ipc.server import Empty, Server
from calibre.utils.ipc.job import ParallelJob
from calibre.constants import numeric_version as calibre_version
from calibre.utils.date import local_tz
# pulls in translation files for _() strings
@ -30,11 +34,21 @@ except NameError:
#
# ------------------------------------------------------------------------------
def do_download_worker_single(site,
book_list,
options,
merge,
notification=lambda x,y:x):
def do_download_worker(book_list,
options,
cpus,
merge=False,
notification=lambda x,y:x):
'''
Coordinator job, to launch child jobs to do downloads.
This is run as a worker job in the background to keep the UI more
responsive and get around any memory leak issues as it will launch
a child job for each book as a worker process
'''
## Now running one BG proc per site, which downloads for the same
## site in serial.
logger.info("CPUs:%s"%cpus)
server = Server(pool_size=cpus)
logger.info(options['version'])
@ -43,87 +57,142 @@ def do_download_worker_single(site,
from calibre.debug import print_basic_debug_info
print_basic_debug_info(sys.stderr)
sites_lists = defaultdict(list)
[ sites_lists[x['site']].append(x) for x in book_list if x['good'] ]
totals = {}
# can't do direct assignment in list comprehension? I'm sure it
# makes sense to some pythonista.
# [ totals[x['url']]=0.0 for x in book_list if x['good'] ]
[ totals.update({x['url']:0.0}) for x in book_list if x['good'] ]
# logger.debug(sites_lists.keys())
# Queue all the jobs
jobs_running = 0
for site in sites_lists.keys():
site_list = sites_lists[site]
logger.info(_("Launch background process for site %s:")%site + "\n" +
"\n".join([ x['url'] for x in site_list ]))
# logger.debug([ x['url'] for x in site_list])
args = ['calibre_plugins.fanficfare_plugin.jobs',
'do_download_site',
(site,site_list,options,merge)]
job = ParallelJob('arbitrary_n',
"site:(%s)"%site,
done=None,
args=args)
job._site_list = site_list
job._processed = False
server.add_job(job)
jobs_running += 1
# This server is an arbitrary_n job, so there is a notifier available.
# Set the % complete to a small number to avoid the 'unavailable' indicator
notification(0.01, _('Downloading FanFiction Stories'))
from calibre_plugins.fanficfare_plugin import FanFicFareBase
fffbase = FanFicFareBase(options['plugin_path'])
with fffbase: # so the sys.path was modified while loading the
# plug impl.
from fanficfare.fff_profile import do_cprofile
## extra function just so I can easily use the same
## @do_cprofile decorator
@do_cprofile
def profiled_func():
count = 0
totals = {}
# can't do direct assignment in list comprehension? I'm sure it
# makes sense to some pythonista.
# [ totals[x['url']]=0.0 for x in book_list if x['good'] ]
[ totals.update({x['url']:0.0}) for x in book_list if x['good'] ]
# logger.debug(sites_lists.keys())
def do_indiv_notif(percent,msg):
# dequeue the job results as they arrive, saving the results
count = 0
while True:
job = server.changed_jobs_queue.get()
# logger.debug("job get job._processed:%s"%job._processed)
# A job can 'change' when it is not finished, for example if it
# produces a notification.
msg = None
try:
## msg = book['url']
(percent,msg) = job.notifications.get_nowait()
# logger.debug("%s<-%s"%(percent,msg))
if percent == 10.0: # Only when signaling d/l done.
count += 1
totals[msg] = 1.0/len(totals)
# logger.info("Finished: %s"%msg)
else:
totals[msg] = percent/len(totals)
notification(max(0.01,sum(totals.values())), _('%(count)d of %(total)d stories finished downloading')%{'count':count,'total':len(totals)})
notification(max(0.01,sum(totals.values())), _('%(count)d of %(total)d stories finished downloading')%{'count':count,'total':len(totals)})
except Empty:
pass
# without update, is_finished will never be set. however, we
# do want to get all the notifications for status so we don't
# miss the 'done' ones.
job.update(consume_notifications=False)
do_list = []
done_list = []
logger.info("\n\n"+_("Downloading FanFiction Stories")+"\n%s\n"%("\n".join([ "%(status)s %(url)s %(comment)s" % book for book in book_list])))
## pass failures from metadata through bg job so all results are
## together.
# if not job._processed:
# sleep(0.5)
## Can have a race condition where job.is_finished before
## notifications for all downloads have been processed.
## Or even after the job has been finished.
# logger.debug("job.is_finished(%s) or job._processed(%s)"%(job.is_finished, job._processed))
if not job.is_finished:
continue
## only process each job once. We can get more than one loop
## after job.is_finished.
if not job._processed:
# sleep(1)
# A job really finished. Get the information.
## This is where bg proc details end up in GUI log.
## job.details is the whole debug log for each proc.
logger.info("\n\n" + ("="*80) + " " + job.details.replace('\r',''))
# logger.debug("Finished background process for site %s:\n%s"%(job._site_list[0]['site'],"\n".join([ x['url'] for x in job._site_list ])))
for b in job._site_list:
book_list.remove(b)
book_list.extend(job.result)
job._processed = True
jobs_running -= 1
## Can't use individual count--I've seen stories all reported
## finished before results of all jobs processed.
if jobs_running == 0:
book_list = sorted(book_list,key=lambda x : x['listorder'])
logger.info("\n"+_("Download Results:")+"\n%s\n"%("\n".join([ "%(status)s %(url)s %(comment)s" % book for book in book_list])))
good_lists = defaultdict(list)
bad_lists = defaultdict(list)
for book in book_list:
if book['good']:
do_list.append(book)
good_lists[book['status']].append(book)
else:
done_list.append(book)
for book in do_list:
# logger.info("%s"%book['url'])
done_list.append(do_download_for_worker(book,options,merge,do_indiv_notif))
count += 1
return finish_download(done_list)
return profiled_func()
bad_lists[book['status']].append(book)
def finish_download(donelist):
book_list = sorted(donelist,key=lambda x : x['listorder'])
logger.info("\n"+_("Download Results:")+"\n%s\n"%("\n".join([ "%(status)s %(url)s %(comment)s" % book for book in book_list])))
order = [_('Add'),
_('Update'),
_('Meta'),
_('Different URL'),
_('Rejected'),
_('Skipped'),
_('Bad'),
_('Error'),
]
j = 0
for d in [ good_lists, bad_lists ]:
for status in order:
if d[status]:
l = d[status]
logger.info("\n"+status+"\n%s\n"%("\n".join([book['url'] for book in l])))
for book in l:
book['reportorder'] = j
j += 1
del d[status]
# just in case a status is added but doesn't appear in order.
for status in d.keys():
logger.info("\n"+status+"\n%s\n"%("\n".join([book['url'] for book in d[status]])))
break
good_lists = defaultdict(list)
bad_lists = defaultdict(list)
for book in book_list:
if book['good']:
good_lists[book['status']].append(book)
else:
bad_lists[book['status']].append(book)
order = [_('Add'),
_('Update'),
_('Meta'),
_('Different URL'),
_('Rejected'),
_('Skipped'),
_('Bad'),
_('Error'),
]
stnum = 0
for d in [ good_lists, bad_lists ]:
for status in order:
stnum += 1
if d[status]:
l = d[status]
logger.info("\n"+status+"\n%s\n"%("\n".join([book['url'] for book in l])))
for book in l:
# Add prior listorder to 10000 * status num for
# ordering of accumulated results with multiple bg
# jobs
book['reportorder'] = stnum*10000 + book['listorder']
del d[status]
# just in case a status is added but doesn't appear in order.
for status in d.keys():
logger.info("\n"+status+"\n%s\n"%("\n".join([book['url'] for book in d[status]])))
server.close()
# return the book list as the job result
return book_list
def do_download_site(site,book_list,options,merge,notification=lambda x,y:x):
# logger.info(_("Started job for %s")%site)
retval = []
for book in book_list:
# logger.info("%s"%book['url'])
retval.append(do_download_for_worker(book,options,merge,notification))
notification(10.0,book['url'])
return retval
def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
'''
Child job, to download story when run as a worker job
@ -133,13 +202,13 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
fffbase = FanFicFareBase(options['plugin_path'])
with fffbase: # so the sys.path was modified while loading the
# plug impl.
from calibre_plugins.fanficfare_plugin.dialogs import NotGoingToDownload
from calibre_plugins.fanficfare_plugin.prefs import (
SAVE_YES, SAVE_YES_UNLESS_SITE, OVERWRITE, OVERWRITEALWAYS, UPDATE,
UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY, CALIBREONLYSAVECOL)
from calibre_plugins.fanficfare_plugin.wordcount import get_word_count
from fanficfare import adapters, writers
from fanficfare.epubutils import get_update_data
from fanficfare.exceptions import NotGoingToDownload
from fanficfare.six import text_type as unicode
from calibre_plugins.fanficfare_plugin.fff_util import get_fff_config
@ -168,7 +237,6 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
adapter.is_adult = book['is_adult']
adapter.username = book['username']
adapter.password = book['password']
adapter.totp = book['totp']
adapter.setChaptersRange(book['begin'],book['end'])
## each site download job starts with a new copy of the
@ -208,6 +276,7 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
book['author_sort'] = book['author'] = story.getList("author", removeallentities=True)
book['publisher'] = story.getMetadata("publisher")
book['url'] = story.getMetadata("storyUrl", removeallentities=True)
book['tags'] = story.getSubjectTags(removeallentities=True)
book['comments'] = story.get_sanitized_description()
book['series'] = story.getMetadata("series", removeallentities=True)
@ -349,7 +418,8 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
except:
logger.error("WordCount failed")
if options['smarten_punctuation'] and options['fileform'] == "epub":
if options['smarten_punctuation'] and options['fileform'] == "epub" \
and calibre_version >= (0, 9, 39):
# for smarten punc
from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS
from calibre.utils.logging import Log
@ -359,14 +429,12 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
data = {'smarten_punctuation':True}
opts = ALL_OPTS.copy()
opts.update(data)
O = namedtuple('Options', ' '.join(ALL_OPTS.keys()))
O = namedtuple('Options', ' '.join(six.iterkeys(ALL_OPTS)))
opts = O(**opts)
log = Log(level=Log.DEBUG)
polish({outfile:outfile}, opts, log, logger.info)
## here to catch tags set in chapters in literotica for
## both overwrites and updates.
book['tags'] = story.getSubjectTags(removeallentities=True)
except NotGoingToDownload as d:
book['good']=False
book['status']=_('Bad')
@ -392,8 +460,7 @@ def inject_cal_cols(book,story,configuration):
if 'calibre_columns' in book:
injectini = ['[injected]']
extra_valid = []
for k in book['calibre_columns'].keys():
v = book['calibre_columns'][k]
for k, v in six.iteritems(book['calibre_columns']):
story.setMetadata(k,v['val'])
injectini.append('%s_label:%s'%(k,v['label']))
extra_valid.append(k)

File diff suppressed because it is too large Load diff

View file

@ -126,7 +126,6 @@ default_prefs['suppressauthorsort'] = False
default_prefs['suppresstitlesort'] = False
default_prefs['authorcase'] = False
default_prefs['titlecase'] = False
default_prefs['seriescase'] = False
default_prefs['setanthologyseries'] = False
default_prefs['mark'] = False
default_prefs['mark_success'] = True
@ -198,11 +197,6 @@ default_prefs['auto_reject_from_email'] = False
default_prefs['update_existing_only_from_email'] = False
default_prefs['download_from_email_immediately'] = False
#default_prefs['single_proc_jobs'] = True # setting and code removed
default_prefs['site_split_jobs'] = True
default_prefs['reconsolidate_jobs'] = True
def set_library_config(library_config,db,setting=PREFS_KEY_SETTINGS):
db.prefs.set_namespaced(PREFS_NAMESPACE,
setting,

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -30,12 +30,8 @@ from .. import configurable as configurable
## must import each adapter here.
from . import base_adapter
from . import base_efiction_adapter
from . import adapter_test1
from . import adapter_test2
from . import adapter_test3
from . import adapter_test4
from . import adapter_fanfictionnet
from . import adapter_fictionalleyarchiveorg
from . import adapter_fictionpresscom
@ -65,10 +61,11 @@ from . import adapter_dokugacom
from . import adapter_storiesofardacom
from . import adapter_ncisfictioncom
from . import adapter_fanfiktionde
from . import adapter_ponyfictionarchivenet
from . import adapter_themasquenet
from . import adapter_pretendercentrecom
from . import adapter_darksolaceorg
from . import adapter_storyroomcom
from . import adapter_finestoriescom
from . import adapter_dracoandginnycom
from . import adapter_wolverineandroguecom
from . import adapter_thehookupzonenet
@ -106,10 +103,12 @@ from . import adapter_fireflyfansnet
from . import adapter_trekfanfictionnet
from . import adapter_wwwutopiastoriescom
from . import adapter_sinfuldreamscomunicornfic
from . import adapter_sinfuldreamscomwhisperedmuse
from . import adapter_sinfuldreamscomwickedtemptation
from . import adapter_asianfanficscom
from . import adapter_mttjustoncenet
from . import adapter_narutoficorg
from . import adapter_starskyhutcharchivenet
from . import adapter_thedelphicexpansecom
from . import adapter_wwwaneroticstorycom
from . import adapter_lcfanficcom
@ -118,8 +117,11 @@ from . import adapter_alternatehistorycom
from . import adapter_wattpadcom
from . import adapter_novelonlinefullcom
from . import adapter_wwwnovelallcom
from . import adapter_wuxiaworldxyz
from . import adapter_novelupdatescc
from . import adapter_hentaifoundrycom
from . import adapter_mugglenetfanfictioncom
from . import adapter_swiorgru
from . import adapter_fanficsme
from . import adapter_fanfictalkcom
from . import adapter_scifistoriescom
@ -127,20 +129,13 @@ from . import adapter_chireadscom
from . import adapter_scribblehubcom
from . import adapter_fictionlive
from . import adapter_thesietchcom
from . import adapter_fastnovelsnet
from . import adapter_squidgeworldorg
from . import adapter_novelfull
from . import adapter_psychficcom
from . import adapter_deviantartcom
from . import adapter_readonlymindcom
from . import adapter_wwwsunnydaleafterdarkcom
from . import adapter_syosetucom
from . import adapter_kakuyomujp
from . import adapter_fanfictionsfr
from . import adapter_touchfluffytail
from . import adapter_spiritfanfictioncom
from . import adapter_superlove
from . import adapter_cfaa
from . import adapter_althistorycom
## This bit of complexity allows adapters to be added by just adding
## importing. It eliminates the long if/else clauses we used to need
@ -224,21 +219,6 @@ def get_section_url(url):
## return unchanged in that case.
return url
def get_url_search(url):
'''
For adapters that have story URLs that can change. This is
used for searching the Calibre library by identifiers:url for
sites (generally) that contain author or title that can
change, but also have a unique identifier that doesn't.
returns a string containing a regexp, not a compiled re object.
'''
cls = _get_class_for(url)[0]
if not cls:
## still apply common processing.
cls = base_adapter.BaseSiteAdapter
return cls.get_url_search(url)
def getAdapter(config,url,anyurl=False):
#logger.debug("trying url:"+url)

View file

@ -22,9 +22,9 @@ logger = logging.getLogger(__name__)
from .base_otw_adapter import BaseOTWAdapter
def getClass():
return AdastrafanficComAdapter
return SquidgeWorldOrgAdapter
class AdastrafanficComAdapter(BaseOTWAdapter):
class SquidgeWorldOrgAdapter(BaseOTWAdapter):
def __init__(self, config, url):
BaseOTWAdapter.__init__(self, config, url)

View file

@ -68,7 +68,9 @@ class AdultFanFictionOrgAdapter(BaseSiteAdapter):
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%B %d, %Y"
self.dateformat = "%Y-%m-%d"
## Added because adult-fanfiction.org does send you to
## www.adult-fanfiction.org when you go to it and it also moves
@ -137,45 +139,91 @@ class AdultFanFictionOrgAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r'https?://(anime|anime2|bleach|books|buffy|cartoon|celeb|comics|ff|games|hp|inu|lotr|manga|movies|naruto|ne|original|tv|xmen|ygo|yuyu)\.adult-fanfiction\.org/story\.php\?no=\d+$'
##This is not working right now, so I'm commenting it out, but leaving it for future testing
## Login seems to be reasonably standard across eFiction sites.
#def needToLoginCheck(self, data):
##This adapter will always require a login
# return True
# <form name="login" method="post" action="">
# <div class="top">E-mail: <span id="sprytextfield1">
# <input name="email" type="text" id="email" size="20" maxlength="255" />
# <span class="textfieldRequiredMsg">Email is required.</span><span class="textfieldInvalidFormatMsg">Invalid E-mail.</span></span></div>
# <div class="top">Password: <span id="sprytextfield2">
# <input name="pass1" type="password" id="pass1" size="20" maxlength="32" />
# <span class="textfieldRequiredMsg">password is required.</span><span class="textfieldMinCharsMsg">Minimum 8 characters8.</span><span class="textfieldMaxCharsMsg">Exceeded 32 characters.</span></span></div>
# <div class="top"><br /> <input name="loginsubmittop" type="hidden" id="loginsubmit" value="TRUE" />
# <input type="submit" value="Login" />
# </div>
# </form>
##This is not working right now, so I'm commenting it out, but leaving it for future testing
#def performLogin(self, url, soup):
# params = {}
# if self.password:
# params['email'] = self.username
# params['pass1'] = self.password
# else:
# params['email'] = self.getConfig("username")
# params['pass1'] = self.getConfig("password")
# params['submit'] = 'Login'
# # copy all hidden input tags to pick up appropriate tokens.
# for tag in soup.findAll('input',{'type':'hidden'}):
# params[tag['name']] = tag['value']
# logger.debug("Will now login to URL {0} as {1} with password: {2}".format(url, params['email'],params['pass1']))
# d = self.post_request(url, params, usecache=False)
# d = self.post_request(url, params, usecache=False)
# soup = self.make_soup(d)
#if not (soup.find('form', {'name' : 'login'}) == None):
# logger.info("Failed to login to URL %s as %s" % (url, params['email']))
# raise exceptions.FailedToLogin(url,params['email'])
# return False
#else:
# return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def doExtractChapterUrlsAndMetadata(self, get_cover=True):
## You need to have your is_adult set to true to get this story
if not (self.is_adult or self.getConfig("is_adult")):
raise exceptions.AdultCheckRequired(self.url)
else:
d = self.post_request('https://www.adult-fanfiction.org/globals/ajax/age-verify.php', {"verify":"1"})
if "Age verified successfully" not in d:
raise exceptions.FailedToDownload("Failed to Verify Age: {0}".format(d))
url = self.url
logger.debug("URL: "+url)
data = self.get_request(url)
# logger.debug(data)
if "The dragons running the back end of the site can not seem to find the story you are looking for." in data:
raise exceptions.StoryDoesNotExist("{0}.{1} says: The dragons running the back end of the site can not seem to find the story you are looking for.".format(self.zone, self.getBaseDomain()))
soup = self.make_soup(data)
##This is not working right now, so I'm commenting it out, but leaving it for future testing
#self.performLogin(url, soup)
## Title
## Some of the titles have a backslash on the story page, but not on the Author's page
## So I am removing it from the title, so it can be found on the Author's page further in the code.
## Also, some titles may have extra spaces ' ', and the search on the Author's page removes them,
## so I have to here as well. I used multiple replaces to make sure, since I did the same below.
h1 = soup.find('h1')
# logger.debug("Title:%s"%h1)
self.story.setMetadata('title',stripHTML(h1).replace('\\','').replace(' ',' ').replace(' ',' ').replace(' ',' ').strip())
a = soup.find('a', href=re.compile(r'story.php\?no='+self.story.getMetadata('storyId')+"$"))
self.story.setMetadata('title',stripHTML(a).replace('\\','').replace(' ',' ').replace(' ',' ').replace(' ',' ').strip())
# Find the chapters from first list only
chapters = soup.select_one('select.chapter-select').select('option')
for chapter in chapters:
self.add_chapter(chapter,self.url+'&chapter='+chapter['value'])
# Find the chapters:
chapters = soup.find('ul',{'class':'dropdown-content'})
for i, chapter in enumerate(chapters.findAll('a')):
self.add_chapter(chapter,self.url+'&chapter='+unicode(i+1))
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"profile.php\?id=\d+"))
a = soup.find('a', href=re.compile(r"profile.php\?no=\d+"))
if a == None:
# I know that the original author of fanficfare wants to always have metadata,
# but I posit that if the story is there, even if we can't get the metadata from the
@ -184,56 +232,140 @@ class AdultFanFictionOrgAdapter(BaseSiteAdapter):
self.story.setMetadata('authorUrl','https://www.adult-fanfiction.org')
self.story.setMetadata('author','Unknown')
logger.warning('There was no author found for the story... Metadata will not be retreived.')
self.setDescription(url,'>>>>>>>>>> No Summary Given, Unknown Author <<<<<<<<<<')
self.setDescription(url,'>>>>>>>>>> No Summary Given <<<<<<<<<<')
else:
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl',a['href'])
self.story.setMetadata('author',stripHTML(a))
## The story page does not give much Metadata, so we go to
## the Author's page. Except it's actually a sub-req for
## list of author's stories for that subdomain
author_Url = 'https://members.{0}/load-user-stories.php?subdomain={1}&uid={2}'.format(
self.getBaseDomain(),
self.zone,
self.story.getMetadata('authorId'))
##The story page does not give much Metadata, so we go to the Author's page
logger.debug('Getting the load-user-stories page: {0}'.format(author_Url))
##Get the first Author page to see if there are multiple pages.
##AFF doesn't care if the page number is larger than the actual pages,
##it will continue to show the last page even if the variable is larger than the actual page
author_Url = '{0}&view=story&zone={1}&page=1'.format(self.story.getMetadata('authorUrl'), self.zone)
#author_Url = self.story.getMetadata('authorUrl')+'&view=story&zone='+self.zone+'&page=1'
##I'm resetting the author page to the zone for this story
self.story.setMetadata('authorUrl',author_Url)
logger.debug('Getting the author page: {0}'.format(author_Url))
adata = self.get_request(author_Url)
none_found = "No stories found in this category."
if none_found in adata:
raise exceptions.StoryDoesNotExist("{0}.{1} says: {2}".format(self.zone, self.getBaseDomain(), none_found))
if "The member you are looking for does not exist." in adata:
raise exceptions.StoryDoesNotExist("{0}.{1} says: The member you are looking for does not exist.".format(self.zone, self.getBaseDomain()))
#raise exceptions.StoryDoesNotExist(self.zone+'.'+self.getBaseDomain() +" says: The member you are looking for does not exist.")
asoup = self.make_soup(adata)
# logger.debug(asoup)
story_card = asoup.select_one('div.story-card:has(a[href="{0}"])'.format(url))
# logger.debug(story_card)
##Getting the number of author pages
pages = 0
pagination=asoup.find('ul',{'class' : 'pagination'})
if pagination:
pages = pagination.findAll('li')[-1].find('a')
if not pages == None:
pages = pages['href'].split('=')[-1]
else:
pages = 0
## Category
## I've only seen one category per story so far, but just in case:
for cat in story_card.select('div.story-card-category'):
# remove Category:, old code suggests Located: is also
# possible, so removing by <strong>
cat.find("strong").decompose()
self.story.addToList('category',stripHTML(cat))
storya = None
##If there is only 1 page of stories, check it to get the Metadata,
if pages == 0:
a = asoup.findAll('li')
for lc2 in a:
if lc2.find('a', href=re.compile(r'story.php\?no='+self.story.getMetadata('storyId')+"$")):
storya = lc2
break
## otherwise go through the pages
else:
page=1
i=0
while i == 0:
##We already have the first page, so if this is the first time through, skip getting the page
if page != 1:
author_Url = '{0}&view=story&zone={1}&page={2}'.format(self.story.getMetadata('authorUrl'), self.zone, unicode(page))
logger.debug('Getting the author page: {0}'.format(author_Url))
adata = self.get_request(author_Url)
##This will probably never be needed, since AFF doesn't seem to care what number you put as
## the page number, it will default to the last page, even if you use 1000, for an author
## that only hase 5 pages of stories, but I'm keeping it in to appease Saint Justin Case (just in case).
if "The member you are looking for does not exist." in adata:
raise exceptions.StoryDoesNotExist("{0}.{1} says: The member you are looking for does not exist.".format(self.zone, self.getBaseDomain()))
# we look for the li element that has the story here
asoup = self.make_soup(adata)
self.setDescription(url,story_card.select_one('div.story-card-description'))
a = asoup.findAll('li')
for lc2 in a:
if lc2.find('a', href=re.compile(r'story.php\?no='+self.story.getMetadata('storyId')+"$")):
i=1
storya = lc2
break
page = page + 1
if page > int(pages):
break
for tag in story_card.select('span.story-tag'):
self.story.addToList('eroticatags',stripHTML(tag))
## created/updates share formatting
for meta in story_card.select('div.story-card-meta-item span:last-child'):
meta = stripHTML(meta)
if 'Created: ' in meta:
meta = meta.replace('Created: ','')
self.story.setMetadata('datePublished', makeDate(meta, self.dateformat))
if 'Updated: ' in meta:
meta = meta.replace('Updated: ','')
self.story.setMetadata('dateUpdated', makeDate(meta, self.dateformat))
##Split the Metadata up into a list
##We have to change the soup type to a string, then remove the newlines, and double spaces,
##then changes the <br/> to '-:-', which seperates the different elemeents.
##Then we strip the HTML elements from the string.
##There is also a double <br/>, so we have to fix that, then remove the leading and trailing '-:-'.
##They are always in the same order.
## EDIT 09/26/2016: Had some trouble with unicode errors... so I had to put in the decode/encode parts to fix it
liMetadata = unicode(storya).replace('\n','').replace('\r','').replace('\t',' ').replace(' ',' ').replace(' ',' ').replace(' ',' ')
liMetadata = stripHTML(liMetadata.replace(r'<br/>','-:-').replace('<!-- <br /-->','-:-'))
liMetadata = liMetadata.strip('-:-').strip('-:-').encode('utf-8')
for i, value in enumerate(liMetadata.decode('utf-8').split('-:-')):
if i == 0:
# The value for the title has been manipulated, so may not be the same as gotten at the start.
# I'm going to use the href from the storya retrieved from the author's page to determine if it is correct.
if storya.find('a', href=re.compile(r'story.php\?no='+self.story.getMetadata('storyId')+"$"))['href'] != url:
raise exceptions.StoryDoesNotExist('Did not find story in author story list: {0}'.format(author_Url))
elif i == 1:
##Get the description
self.setDescription(url,stripHTML(value.strip()))
else:
# the rest of the values can be missing, so instead of hardcoding the numbers, we search for them.
if 'Located :' in value:
self.story.setMetadata('category',value.replace(r'&gt;',r'>').replace(r'Located :',r'').strip())
elif 'Category :' in value:
# Get the Category
self.story.setMetadata('category',value.replace(r'&gt;',r'>').replace(r'Located :',r'').strip())
elif 'Content Tags :' in value:
# Get the Erotic Tags
value = stripHTML(value.replace(r'Content Tags :',r'')).strip()
for code in re.split(r'\s',value):
self.story.addToList('eroticatags',code)
elif 'Posted :' in value:
# Get the Posted Date
value = value.replace(r'Posted :',r'').strip()
if value.startswith('008'):
# It is unknown how the 200 became 008, but I'm going to change it back here
value = value.replace('008','200')
elif value.startswith('0000'):
# Since the date is showing as 0000,
# I'm going to put the memberdate here
value = asoup.find('div',{'id':'contentdata'}).find('p').get_text(strip=True).replace('Member Since','').strip()
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
elif 'Edited :' in value:
# Get the 'Updated' Edited date
# AFF has the time for the Updated date, and we only want the date,
# so we take the first 10 characters only
value = value.replace(r'Edited :',r'').strip()[0:10]
if value.startswith('008'):
# It is unknown how the 200 became 008, but I'm going to change it back here
value = value.replace('008','200')
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
elif value.startswith('0000') or '-00-' in value:
# Since the date is showing as 0000,
# or there is -00- in the date,
# I'm going to put the Published date here
self.story.setMetadata('dateUpdated', self.story.getMetadata('datPublished'))
else:
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
else:
# This catches the blank elements, and the Review and Dragon Prints.
# I am not interested in these, so do nothing
zzzzzzz=0
# grab the text for an individual chapter.
def getChapterText(self, url):
@ -241,11 +373,10 @@ class AdultFanFictionOrgAdapter(BaseSiteAdapter):
logger.debug('Getting chapter text from: %s' % url)
soup = self.make_soup(self.get_request(url))
chaptertag = soup.select_one('div.chapter-body')
chaptertag = soup.find('ul',{'class':'pagination'}).parent.parent.parent.findNextSibling('li')
if None == chaptertag:
raise exceptions.FailedToDownload("Error downloading Chapter: {0}! Missing required element!".format(url))
## chapter text includes a copy of story title, author,
## chapter title, & eroticatags specific to the chapter. Did
## before, too.
# Change td to a div.
chaptertag.name='div'
return self.utf8FromSoup(url,chaptertag)

View file

@ -1,40 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2026 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import re
from .base_xenforo2forum_adapter import BaseXenForo2ForumAdapter
def getClass():
return AltHistoryComAdapter
## NOTE: This is a different site than www.alternatehistory.com.
class AltHistoryComAdapter(BaseXenForo2ForumAdapter):
def __init__(self, config, url):
BaseXenForo2ForumAdapter.__init__(self, config, url)
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','ahc')
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'althistory.com'

View file

@ -49,21 +49,8 @@ class ArchiveOfOurOwnOrgAdapter(BaseOTWAdapter):
return ['archiveofourown.org',
'archiveofourown.com',
'archiveofourown.net',
'archiveofourown.gay',
'download.archiveofourown.org',
'download.archiveofourown.com',
'download.archiveofourown.net',
'ao3.org',
]
def mod_url_request(self, url):
return url
def mod_url_request(self, url):
## add / to *not* replace media.archiveofourown.org
if self.getConfig("use_archive_transformativeworks_org",False):
return url.replace("/archiveofourown.org","/archive.transformativeworks.org")
elif self.getConfig("use_archiveofourown_gay",False):
return url.replace("/archiveofourown.org","/archiveofourown.gay")
else:
return url

View file

@ -92,7 +92,7 @@ class ASexStoriesComAdapter(BaseSiteAdapter):
self.story.setMetadata('title', title.string)
# Author
author = soup1.find('div',{'class':'story-info'}).find_all('div',{'class':'story-info-bl'})[1].find('a')
author = soup1.find('div',{'class':'story-info'}).findAll('div',{'class':'story-info-bl'})[1].find('a')
authorurl = author['href']
self.story.setMetadata('author', author.string)
self.story.setMetadata('authorUrl', authorurl)
@ -112,7 +112,7 @@ class ASexStoriesComAdapter(BaseSiteAdapter):
### add it before the rest of the pages, if any
self.add_chapter('1', self.url)
chapterTable = soup1.find('div',{'class':'pages'}).find_all('a')
chapterTable = soup1.find('div',{'class':'pages'}).findAll('a')
if chapterTable is not None:
# Multi-chapter story
@ -124,7 +124,7 @@ class ASexStoriesComAdapter(BaseSiteAdapter):
self.add_chapter(chapterTitle, chapterUrl)
rated = soup1.find('div',{'class':'story-info'}).find_all('div',{'class':'story-info-bl5'})[0].find('img')['title'].replace('- Rate','').strip()
rated = soup1.find('div',{'class':'story-info'}).findAll('div',{'class':'story-info-bl5'})[0].find('img')['title'].replace('- Rate','').strip()
self.story.setMetadata('rating',rated)
self.story.setMetadata('dateUpdated', makeDate('01/01/2001', '%m/%d/%Y'))

View file

@ -48,7 +48,7 @@ class AshwinderSycophantHexComAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('https://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','asph')
@ -64,10 +64,10 @@ class AshwinderSycophantHexComAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "https://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return r"https?://"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
@ -92,7 +92,7 @@ class AshwinderSycophantHexComAdapter(BaseSiteAdapter):
params['intent'] = ''
params['submit'] = 'Submit'
loginUrl = 'https://' + self.getSiteDomain() + '/user.php'
loginUrl = 'http://' + self.getSiteDomain() + '/user.php'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['penname']))
@ -130,20 +130,20 @@ class AshwinderSycophantHexComAdapter(BaseSiteAdapter):
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','https://'+self.host+'/'+a['href'])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
asoup = self.make_soup(self.get_request(self.story.getMetadata('authorUrl')))
try:
# in case link points somewhere other than the first chapter
a = soup.find_all('option')[1]['value']
a = soup.findAll('option')[1]['value']
self.story.setMetadata('storyId',a.split('=',)[1])
url = 'https://'+self.host+'/'+a
url = 'http://'+self.host+'/'+a
soup = self.make_soup(self.get_request(url))
except:
pass
for info in asoup.find_all('table', {'width' : '100%', 'bordercolor' : re.compile(r'#')}):
for info in asoup.findAll('table', {'width' : '100%', 'bordercolor' : re.compile(r'#')}):
a = info.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
if a != None:
self.story.setMetadata('title',stripHTML(a))
@ -151,13 +151,13 @@ class AshwinderSycophantHexComAdapter(BaseSiteAdapter):
# Find the chapters:
chapters=soup.find_all('a', href=re.compile(r'viewstory.php\?sid=\d+&i=1$'))
chapters=soup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+&i=1$'))
if len(chapters) == 0:
self.add_chapter(self.story.getMetadata('title'),url)
else:
for chapter in chapters:
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'https://'+self.host+'/'+chapter['href'])
self.add_chapter(chapter,'http://'+self.host+'/'+chapter['href'])
# eFiction sites don't help us out a lot with their meta data
@ -170,7 +170,7 @@ class AshwinderSycophantHexComAdapter(BaseSiteAdapter):
except:
return ""
cats = info.find_all('a',href=re.compile('categories.php'))
cats = info.findAll('a',href=re.compile('categories.php'))
for cat in cats:
self.story.addToList('category',cat.string)
@ -188,7 +188,7 @@ class AshwinderSycophantHexComAdapter(BaseSiteAdapter):
## <td><span class="sb"><b>Published:</b> 04/08/2007</td>
## one story had <b>Updated...</b> in the description. Restrict to sub-table
labels = info.find('table').find_all('b')
labels = info.find('table').findAll('b')
for labelspan in labels:
value = labelspan.nextSibling
label = stripHTML(labelspan)

View file

@ -147,7 +147,7 @@ class AsianFanFicsComAdapter(BaseSiteAdapter):
# Find authorid and URL from... author url.
mainmeta = soup.find('footer', {'class': 'main-meta'})
alist = mainmeta.find('span', string='Author(s)')
alist = alist.parent.find_all('a', href=re.compile(r"/profile/u/[^/]+"))
alist = alist.parent.findAll('a', href=re.compile(r"/profile/u/[^/]+"))
for a in alist:
self.story.addToList('authorId',a['href'].split('/')[-1])
self.story.addToList('authorUrl','https://'+self.host+a['href'])
@ -159,10 +159,10 @@ class AsianFanFicsComAdapter(BaseSiteAdapter):
chapters=soup.find('select',{'name':'chapter-nav'})
hrefattr=None
if chapters:
chapters=chapters.find_all('option')
chapters=chapters.findAll('option')
hrefattr='value'
else: # didn't find <select name='chapter-nav', look for alternative
chapters=soup.find('div',{'class':'widget--chapters'}).find_all('a')
chapters=soup.find('div',{'class':'widget--chapters'}).findAll('a')
hrefattr='href'
for index, chapter in enumerate(chapters):
if chapter.text != 'Foreword' and 'Collapse chapters' not in chapter.text:
@ -202,7 +202,7 @@ class AsianFanFicsComAdapter(BaseSiteAdapter):
# story tags
a = mainmeta.find('span',string='Tags')
if a:
tags = a.parent.find_all('a')
tags = a.parent.findAll('a')
for tag in tags:
self.story.addToList('tags', tag.text)
@ -230,7 +230,7 @@ class AsianFanFicsComAdapter(BaseSiteAdapter):
# upvote, subs, and views
a = soup.find('div',{'class':'title-meta'})
spans = a.find_all('span', recursive=False)
spans = a.findAll('span', recursive=False)
self.story.setMetadata('upvotes', re.search(r'\(([^)]+)', spans[0].find('span').text).group(1))
self.story.setMetadata('subscribers', re.search(r'\(([^)]+)', spans[1].find('span').text).group(1))
if len(spans) > 2: # views can be private
@ -252,39 +252,13 @@ class AsianFanFicsComAdapter(BaseSiteAdapter):
data = self.get_request(url)
soup = self.make_soup(data)
# logger.debug(data)
ageform = soup.select_one('form[action="/account/toggle_age"]')
# logger.debug(ageform)
if ageform and (self.is_adult or self.getConfig("is_adult")):
params = {}
params['is_of_age']=ageform.select_one('input#is_of_age')['value']
params['current_url']=ageform.select_one('input#current_url')['value']
params['csrf_aff_token']=ageform.select_one('input[name="csrf_aff_token"]')['value']
loginUrl = 'https://' + self.getSiteDomain() + '/account/mark_over_18'
logger.info("Will now toggle age to URL (%s)" % (loginUrl))
# logger.debug(params)
data = self.post_request(loginUrl, params)
soup = self.make_soup(data)
# logger.debug(data)
content = soup.find('div', {'id': 'user-submitted-body'})
if self.getConfig('inject_chapter_image'):
logger.debug("Injecting chapter image")
imgdiv = soup.select_one('div#bodyText div.bot-spacer')
if imgdiv:
content.insert(0, "\n")
content.insert(0, imgdiv)
content.insert(0, "\n")
if self.getConfig('inject_chapter_title'):
logger.debug("Injecting full-length chapter title")
title = soup.find('h1', {'id' : 'chapter-title'}).text
newTitle = soup.new_tag('h3')
newTitle.string = title
content.insert(0, "\n")
content.insert(0, newTitle)
content.insert(0, "\n")
return self.utf8FromSoup(url,content)

View file

@ -126,7 +126,7 @@ class BDSMLibraryComSiteAdapter(BaseSiteAdapter):
# Find the chapters:
# The update date is with the chapter links... so we will update it here as well
for chapter in soup.find_all('a', href=re.compile(r'/stories/chapter.php\?storyid='+self.story.getMetadata('storyId')+r"&chapterid=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'/stories/chapter.php\?storyid='+self.story.getMetadata('storyId')+r"&chapterid=\d+$")):
value = chapter.findNext('td').findNext('td').string.replace('(added on','').replace(')','').strip()
self.story.setMetadata('dateUpdated', makeDate(value, self.dateformat))
self.add_chapter(chapter,'https://'+self.getSiteDomain()+chapter['href'])
@ -134,11 +134,11 @@ class BDSMLibraryComSiteAdapter(BaseSiteAdapter):
# Get the MetaData
# Erotia Tags
tags = soup.find_all('a',href=re.compile(r'/stories/search.php\?selectedcode'))
tags = soup.findAll('a',href=re.compile(r'/stories/search.php\?selectedcode'))
for tag in tags:
self.story.addToList('eroticatags',tag.text)
for td in soup.find_all('td'):
for td in soup.findAll('td'):
if len(td.text)>0:
if 'Added on:' in td.text and '<table' not in unicode(td):
value = td.text.replace('Added on:','').strip()
@ -169,20 +169,20 @@ class BDSMLibraryComSiteAdapter(BaseSiteAdapter):
raise exceptions.FailedToDownload("Error downloading Chapter: {0}! Missing required element!".format(url))
#strip comments from soup
[comment.extract() for comment in chaptertag.find_all(string=lambda text:isinstance(text, Comment))]
[comment.extract() for comment in chaptertag.findAll(string=lambda text:isinstance(text, Comment))]
# BDSM Library basically wraps it's own html around the document,
# so we will be removing the script, title and meta content from the
# storyblock
for tag in chaptertag.find_all('head') + chaptertag.find_all('style') + chaptertag.find_all('title') + chaptertag.find_all('meta') + chaptertag.find_all('o:p') + chaptertag.find_all('link'):
for tag in chaptertag.findAll('head') + chaptertag.findAll('style') + chaptertag.findAll('title') + chaptertag.findAll('meta') + chaptertag.findAll('o:p') + chaptertag.findAll('link'):
tag.extract()
for tag in chaptertag.find_all('o:smarttagtype'):
for tag in chaptertag.findAll('o:smarttagtype'):
tag.name = 'span'
## I'm going to take the attributes off all of the tags
## because they usually refer to the style that we removed above.
for tag in chaptertag.find_all(True):
for tag in chaptertag.findAll(True):
tag.attrs = None
return self.utf8FromSoup(url,chaptertag)

View file

@ -157,6 +157,9 @@ class BloodshedverseComAdapter(BaseSiteAdapter):
self.story.addToList('warnings', warning)
elif key == 'Chapters':
self.story.setMetadata('numChapters', int(value))
elif key == 'Words':
# Apparently only numChapters need to be an integer for
# some strange reason. Remove possible ',' characters as to
@ -171,7 +174,7 @@ class BloodshedverseComAdapter(BaseSiteAdapter):
# ugly %p(am/pm) hack moved into makeDate so other sites can use it.
self.story.setMetadata('dateUpdated', date)
if self.story.getMetadataRaw('rating') == 'NC-17' and not (self.is_adult or self.getConfig('is_adult')):
if self.story.getMetadata('rating') == 'NC-17' and not (self.is_adult or self.getConfig('is_adult')):
raise exceptions.AdultCheckRequired(self.url)
def getChapterText(self, url):

View file

@ -116,7 +116,7 @@ class ChaosSycophantHexComAdapter(BaseSiteAdapter):
self.story.setMetadata('rating', rating)
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'http://'+self.host+'/'+chapter['href']+addurl)
@ -134,7 +134,7 @@ class ChaosSycophantHexComAdapter(BaseSiteAdapter):
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
value = labels[0].previousSibling
svalue = ""
@ -154,22 +154,22 @@ class ChaosSycophantHexComAdapter(BaseSiteAdapter):
self.story.setMetadata('numWords', value.split(' -')[0])
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
for warning in warnings:
self.story.addToList('warnings',warning.string)
@ -194,7 +194,7 @@ class ChaosSycophantHexComAdapter(BaseSiteAdapter):
series_url = 'http://'+self.host+'/'+a['href']
seriessoup = self.make_soup(self.get_request(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):

View file

@ -88,8 +88,8 @@ class ChireadsComSiteAdapter(BaseSiteAdapter):
intro = stripHTML(info.select_one('.inform-inform-txt').span)
self.setDescription(self.url, intro)
for content in soup.find_all('div', {'id': 'content'}):
for a in content.find_all('a'):
for content in soup.findAll('div', {'id': 'content'}):
for a in content.findAll('a'):
self.add_chapter(a.get_text(), a['href'])

View file

@ -98,7 +98,7 @@ class ChosenTwoFanFicArchiveAdapter(BaseSiteAdapter):
## Title
## Some stories have a banner that has it's own a tag before the actual text title...
## so I'm checking the pagetitle div for all a tags that match the criteria, then taking the last.
a = soup.find('div',{'id':'pagetitle'}).find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))[-1]
a = soup.find('div',{'id':'pagetitle'}).findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))[-1]
self.story.setMetadata('title',stripHTML(a))
# Find authorid and URL from... author url.
@ -110,7 +110,7 @@ class ChosenTwoFanFicArchiveAdapter(BaseSiteAdapter):
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
#self.add_chapter(chapter,'http://'+self.host+'/'+chapter['href'])
self.add_chapter(chapter,'https://{0}/{1}{2}'.format(self.host, chapter['href'],addURL))
@ -127,7 +127,7 @@ class ChosenTwoFanFicArchiveAdapter(BaseSiteAdapter):
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
val = labelspan.nextSibling
value = unicode('')
@ -149,27 +149,27 @@ class ChosenTwoFanFicArchiveAdapter(BaseSiteAdapter):
self.story.setMetadata('numWords', stripHTML(value))
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Pairing' in label:
ships = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=4'))
ships = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=4'))
for ship in ships:
self.story.addToList('ships',ship.string)
if 'Warnings' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
for warning in warnings:
self.story.addToList('warnings',warning.string)
@ -196,7 +196,7 @@ class ChosenTwoFanFicArchiveAdapter(BaseSiteAdapter):
seriessoup = self.make_soup(self.get_request(series_url))
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
storyas = seriessoup.find_all('a', href=re.compile(r'viewstory.php\?sid=\d+'))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
# this site has several links to each story.

View file

@ -74,74 +74,38 @@ class DeviantArtComSiteAdapter(BaseSiteAdapter):
return r'https?://www\.deviantart\.com/(?P<author>[^/]+)/art/(?P<id>[^/]+)/?'
def performLogin(self, url):
if self.username and self.username != 'NoneGiven':
username = self.username
else:
username = self.getConfig('username')
# logger.debug("\n\nusername:(%s)\n\n"%username)
if not username:
logger.info("Login Required for URL %s" % url)
raise exceptions.FailedToLogin(url,username)
data = self.get_request_raw('https://www.deviantart.com/users/login', referer=url, usecache=False)
data = self.get_request_raw('https://www.deviantart.com/users/login', referer=url)
data = self.decode_data(data)
soup = self.make_soup(data)
params = {
'referer': 'https://www.deviantart.com/_sisu/do/signin', # soup.find('input', {'name': 'referer'})['value'],
'referer_type': soup.find('input', {'name': 'referer_type'})['value'],
'referer': url,
'csrf_token': soup.find('input', {'name': 'csrf_token'})['value'],
'challenge': soup.find('input', {'name': 'challenge'})['value'],
'lu_token': soup.find('input', {'name': 'lu_token'})['value'],
'remember': 'on',
'username': username
}
loginUrl = 'https://' + self.getSiteDomain() + '/_sisu/do/step2'
logger.debug('Will now login to deviantARt as (%s)' % username)
result = self.post_request(loginUrl, params, usecache=False)
soup = self.make_soup(result)
if not soup.find('input', {'name': 'lu_token2'}):
logger.info("Login Failed for URL %s (no lu_token2 found)" % url)
raise exceptions.FailedToLogin(url,username)
params = {
'referer': 'https://www.deviantart.com/_sisu/do/signin', # soup.find('input', {'name': 'referer'})['value'],
'referer_type': soup.find('input', {'name': 'referer_type'})['value'],
'csrf_token': soup.find('input', {'name': 'csrf_token'})['value'],
'challenge': soup.find('input', {'name': 'challenge'})['value'],
'lu_token': soup.find('input', {'name': 'lu_token'})['value'],
'lu_token2': soup.find('input', {'name': 'lu_token2'})['value'],
'remember': 'on',
'username': ''
}
if self.password:
params['username'] = self.username
params['password'] = self.password
else:
params['username'] = self.getConfig('username')
params['password'] = self.getConfig('password')
# logger.debug("\n\nparams['password']:(%s)\n\n"%params['password'])
loginUrl = 'https://' + self.getSiteDomain() + '/_sisu/do/signin'
logger.debug('Will now send password to deviantARt')
logger.debug('Will now login to deviantARt as (%s)' % params['username'])
result = self.post_request(loginUrl, params, usecache=False)
if 'Log In | DeviantArt' in result:
logger.error('Failed to login to deviantArt as %s' % username)
raise exceptions.FailedToLogin('https://www.deviantart.com', username)
logger.error('Failed to login to deviantArt as %s' % params['username'])
raise exceptions.FailedToLogin('https://www.deviantart.com', params['username'])
else:
return True
def requiresLogin(self, data):
return '</a> has limited the viewing of this artwork to members of the DeviantArt community only' in data
def isLoggedIn(self, data):
return '<form id="logout-form" action="https://www.deviantart.com/users/logout" method="POST">' in data
def isWatchersOnly(self, data):
return '>Watchers-Only Deviation<' in data
return '<span>Watchers-Only Deviation</span>' in data
def requiresMatureContentEnabled(self, data):
return (
@ -150,50 +114,44 @@ class DeviantArtComSiteAdapter(BaseSiteAdapter):
or '>This filter hides content that may be inappropriate for some viewers<' in data
or '>May contain sensitive content<' in data
or '>Log in to view<' in data
or '>This deviation has been labeled as containing themes not suitable for all deviants.<' in data
)
def extractChapterUrlsAndMetadata(self):
isLoggedIn = False
logger.debug('URL: %s', self.url)
data = self.get_request(self.url)
soup = self.make_soup(data)
## story can require login outright, or it can show up as
## watchers-only or mature-enabled without the same 'requires
## login' strings.
if self.requiresLogin(data) or ( not self.isLoggedIn(data) and
(self.isWatchersOnly(data) or
self.requiresMatureContentEnabled(data)) ):
if self.requiresLogin(data):
if self.performLogin(self.url):
isLoggedIn = True
data = self.get_request(self.url, usecache=False)
soup = self.make_soup(data)
## Check watchers only and mature enabled again, separately,
## after login because they can still apply after login.
if self.isWatchersOnly(data):
raise exceptions.FailedToDownload(
'Deviation is only available for watchers.' +
'You must watch this author before you can download it.'
)
if self.requiresMatureContentEnabled(data):
raise exceptions.FailedToDownload(
'Deviation is set as mature, you must go into your account ' +
'and enable showing of mature content.'
)
)
appurl = soup.select_one('meta[property="og:url"]')['content']
if appurl:
story_id = urlparse(appurl).path.lstrip('/')
else:
logger.debug("Looking for JS story id")
## after login, this is only found in a JS block. Dunno why.
## F875A309-B0DB-860E-5079-790D0FBE5668
match = re.match(r'\\"deviationUuid\\":\\"(?P<id>[A-Z0-9-]+)\\",',data)
if match:
story_id = match.group('id')
else:
raise exceptions.FailedToDownload('Failed to find Story ID.')
if self.requiresMatureContentEnabled(data):
# as far as I can tell deviantArt has no way to show mature
# content that doesn't involve logging in or using JavaScript
if not isLoggedIn:
self.performLogin(self.url)
isLoggedIn = True
data = self.get_request(self.url, usecache=False)
soup = self.make_soup(data)
if self.requiresMatureContentEnabled(data):
raise exceptions.FailedToDownload(
'Deviation is set as mature, you must go into your account ' +
'and enable showing of mature content.'
)
appurl = soup.select_one('meta[property="da:appurl"]')['content']
story_id = urlparse(appurl).path.lstrip('/')
self.story.setMetadata('storyId', story_id)
title = soup.select_one('h1').get_text()
@ -222,35 +180,19 @@ class DeviantArtComSiteAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s', url)
data = self.get_request(url)
# logger.debug(data)
soup = self.make_soup(data)
# remove comments section to avoid false matches
comments = soup.select_one('[data-hook=comments_thread]')
if comments:
comments.decompose()
# previous search not always found in some stories.
# <div id="comments"></div> inside the real containing
# div seems more common
commentsdiv = soup.select_one('div#comments')
if commentsdiv:
commentsdiv.parent.decompose()
# three different 'content' tags to look for.
# This is the current in Oct 2024
content = soup.select_one('[data-editor-viewer="1"]')
comments.decompose()
content = soup.select_one('[data-id=rich-content-viewer]')
if content is None:
# older story? I can't find any of this style in Oct2024
content = soup.select_one('[data-id="rich-content-viewer"]')
if content is None:
# olderer story, but used by some older (2018) posts
# older story
content = soup.select_one('.legacy-journal')
if content is None:
raise exceptions.FailedToDownload(
'Could not find story text. Please open a bug with the URL %s' % self.url
if content is None:
raise exceptions.FailedToDownload(
'Could not find story text. Please open a bug with the URL %s' % self.url
)
return self.utf8FromSoup(url, content)

View file

@ -95,7 +95,7 @@ class DokugaComAdapter(BaseSiteAdapter):
params['Submit'] = 'Submit'
# copy all hidden input tags to pick up appropriate tokens.
for tag in soup.find_all('input',{'type':'hidden'}):
for tag in soup.findAll('input',{'type':'hidden'}):
params[tag['name']] = tag['value']
loginUrl = 'http://' + self.getSiteDomain() + '/fanfiction'
@ -153,7 +153,7 @@ class DokugaComAdapter(BaseSiteAdapter):
self.story.setMetadata('title',stripHTML(a))
# Find the chapters:
chapters = soup.find('select').find_all('option')
chapters = soup.find('select').findAll('option')
if len(chapters)==1:
self.add_chapter(self.story.getMetadata('title'),'http://'+self.host+'/'+self.section+'/story/'+self.story.getMetadata('storyId')+'/1')
else:
@ -168,7 +168,7 @@ class DokugaComAdapter(BaseSiteAdapter):
asoup=asoup.find('div', {'id' : 'cb_tabid_52'}).find('div')
#grab the rest of the metadata from the author's page
for div in asoup.find_all('div'):
for div in asoup.findAll('div'):
nav=div.find('a', href=re.compile(r'/fanfiction/story/'+self.story.getMetadata('storyId')+"/1$"))
if nav != None:
break
@ -208,7 +208,7 @@ class DokugaComAdapter(BaseSiteAdapter):
else:
asoup=asoup.find('div', {'id' : 'maincol'}).find('div', {'class' : 'padding'})
for div in asoup.find_all('div'):
for div in asoup.findAll('div'):
nav=div.find('a', href=re.compile(r'/spark/story/'+self.story.getMetadata('storyId')+"/1$"))
if nav != None:
break

View file

@ -161,7 +161,7 @@ class DracoAndGinnyComAdapter(BaseSiteAdapter):
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'http://'+self.host+'/'+chapter['href']+addurl)
@ -181,13 +181,13 @@ class DracoAndGinnyComAdapter(BaseSiteAdapter):
self.setDescription(url,content.find('blockquote'))
for genre in content.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=1')):
for genre in content.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')):
self.story.addToList('genre',genre.string)
for warning in content.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2')):
for warning in content.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')):
self.story.addToList('warnings',warning.string)
labels = content.find_all('b')
labels = content.findAll('b')
for labelspan in labels:
value = labelspan.nextSibling
@ -208,22 +208,22 @@ class DracoAndGinnyComAdapter(BaseSiteAdapter):
self.story.setMetadata('rating', value)
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
for warning in warnings:
self.story.addToList('warnings',warning.string)
@ -247,7 +247,7 @@ class DracoAndGinnyComAdapter(BaseSiteAdapter):
seriessoup = self.make_soup(self.get_request(series_url))
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
storyas = seriessoup.find_all('a', href=re.compile(r'viewstory.php\?sid=\d+'))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
# skip 'report this' and 'TOC' links

View file

@ -138,7 +138,7 @@ class EFPFanFicNet(BaseSiteAdapter):
# no selector found, so it's a one-chapter story.
self.add_chapter(self.story.getMetadata('title'),url)
else:
allOptions = select.find_all('option', {'value' : re.compile(r'viewstory')})
allOptions = select.findAll('option', {'value' : re.compile(r'viewstory')})
for o in allOptions:
url = u'https://%s/%s' % ( self.getSiteDomain(),
o['value'])
@ -170,14 +170,14 @@ class EFPFanFicNet(BaseSiteAdapter):
if authsoup != None:
# last author link with offset should be the 'next' link.
authurl = u'https://%s/%s' % ( self.getSiteDomain(),
authsoup.find_all('a',href=re.compile(r'viewuser\.php\?uid=\d+&catid=&offset='))[-1]['href'] )
authsoup.findAll('a',href=re.compile(r'viewuser\.php\?uid=\d+&catid=&offset='))[-1]['href'] )
# Need author page for most of the metadata.
logger.debug("fetching author page: (%s)"%authurl)
authsoup = self.make_soup(self.get_request(authurl))
#print("authsoup:%s"%authsoup)
storyas = authsoup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r'&i=1$'))
storyas = authsoup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r'&i=1$'))
for storya in storyas:
#print("======storya:%s"%storya)
storyblock = storya.findParent('div',{'class':'storybloc'})
@ -194,7 +194,7 @@ class EFPFanFicNet(BaseSiteAdapter):
# Tipo di coppia: Het | Personaggi: Akasuna no Sasori , Akatsuki, Nuovo Personaggio | Note: OOC | Avvertimenti: Tematiche delicate<br />
# Categoria: <a href="categories.php?catid=1&amp;parentcatid=1">Anime & Manga</a> > <a href="categories.php?catid=108&amp;parentcatid=108">Naruto</a> | Contesto: Naruto Shippuuden | Leggi le <a href="reviews.php?sid=1331275&amp;a=">3</a> recensioni</div>
cats = noteblock.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = noteblock.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
@ -262,7 +262,7 @@ class EFPFanFicNet(BaseSiteAdapter):
seriessoup = self.make_soup(self.get_request(series_url))
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
storyas = seriessoup.find_all('a', href=re.compile(r'viewstory.php\?sid=\d+&i=1'))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+&i=1'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId'))+'&i=1':
@ -288,11 +288,11 @@ class EFPFanFicNet(BaseSiteAdapter):
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
# remove any header and 'o:p' tags.
for tag in div.find_all("head") + div.find_all("o:p"):
for tag in div.findAll("head") + div.findAll("o:p"):
tag.extract()
# change any html and body tags to div.
for tag in div.find_all("html") + div.find_all("body"):
for tag in div.findAll("html") + div.findAll("body"):
tag.name='div'
# remove extra bogus doctype.

View file

@ -126,7 +126,7 @@ class ErosnSapphoSycophantHexComAdapter(BaseSiteAdapter):
self.story.setMetadata('rating', rating)
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'http://'+self.host+'/'+chapter['href']+addurl)
@ -144,7 +144,7 @@ class ErosnSapphoSycophantHexComAdapter(BaseSiteAdapter):
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
value = labels[0].previousSibling
svalue = ""
@ -164,22 +164,22 @@ class ErosnSapphoSycophantHexComAdapter(BaseSiteAdapter):
self.story.setMetadata('numWords', value.split(' -')[0])
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
for warning in warnings:
self.story.addToList('warnings',warning.string)
@ -204,7 +204,7 @@ class ErosnSapphoSycophantHexComAdapter(BaseSiteAdapter):
series_url = 'http://'+self.host+'/'+a['href']
seriessoup = self.make_soup(self.get_request(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'viewstory.php\?sid=\d+'))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
# skip 'report this' and 'TOC' links

View file

@ -53,9 +53,6 @@ class FanficAuthorsNetAdapter(BaseSiteAdapter):
#Setting the 'Zone' for each "Site"
self.zone = self.parsedUrl.netloc.replace('.fanficauthors.net','')
# site change .nsns to -nsns
self.zone = self.zone.replace('.nsns','-nsns')
# normalized story URL.
self._setURL('https://{0}.{1}/{2}/'.format(
self.zone, self.getBaseDomain(), self.story.getMetadata('storyId')))
@ -82,10 +79,7 @@ class FanficAuthorsNetAdapter(BaseSiteAdapter):
@classmethod
def getAcceptDomains(cls):
# need both .nsns(old) and -nsns(new) because it's a domain
# change, not just URL change.
return ['aaran-st-vines.nsns.fanficauthors.net',
'aaran-st-vines-nsns.fanficauthors.net',
'abraxan.fanficauthors.net',
'bobmin.fanficauthors.net',
'canoncansodoff.fanficauthors.net',
@ -101,12 +95,9 @@ class FanficAuthorsNetAdapter(BaseSiteAdapter):
'jeconais.fanficauthors.net',
'kinsfire.fanficauthors.net',
'kokopelli.nsns.fanficauthors.net',
'kokopelli-nsns.fanficauthors.net',
'ladya.nsns.fanficauthors.net',
'ladya-nsns.fanficauthors.net',
'lorddwar.fanficauthors.net',
'mrintel.nsns.fanficauthors.net',
'mrintel-nsns.fanficauthors.net',
'musings-of-apathy.fanficauthors.net',
'ruskbyte.fanficauthors.net',
'seelvor.fanficauthors.net',
@ -117,7 +108,7 @@ class FanficAuthorsNetAdapter(BaseSiteAdapter):
################################################################################################
@classmethod
def getSiteExampleURLs(self):
return ("https://aaran-st-vines-nsns.fanficauthors.net/A_Story_Name/ "
return ("https://aaran-st-vines.nsns.fanficauthors.net/A_Story_Name/ "
+ "https://abraxan.fanficauthors.net/A_Story_Name/ "
+ "https://bobmin.fanficauthors.net/A_Story_Name/ "
+ "https://canoncansodoff.fanficauthors.net/A_Story_Name/ "
@ -132,10 +123,10 @@ class FanficAuthorsNetAdapter(BaseSiteAdapter):
+ "https://jbern.fanficauthors.net/A_Story_Name/ "
+ "https://jeconais.fanficauthors.net/A_Story_Name/ "
+ "https://kinsfire.fanficauthors.net/A_Story_Name/ "
+ "https://kokopelli-nsns.fanficauthors.net/A_Story_Name/ "
+ "https://ladya-nsns.fanficauthors.net/A_Story_Name/ "
+ "https://kokopelli.nsns.fanficauthors.net/A_Story_Name/ "
+ "https://ladya.nsns.fanficauthors.net/A_Story_Name/ "
+ "https://lorddwar.fanficauthors.net/A_Story_Name/ "
+ "https://mrintel-nsns.fanficauthors.net/A_Story_Name/ "
+ "https://mrintel.nsns.fanficauthors.net/A_Story_Name/ "
+ "https://musings-of-apathy.fanficauthors.net/A_Story_Name/ "
+ "https://ruskbyte.fanficauthors.net/A_Story_Name/ "
+ "https://seelvor.fanficauthors.net/A_Story_Name/ "
@ -145,16 +136,8 @@ class FanficAuthorsNetAdapter(BaseSiteAdapter):
################################################################################################
def getSiteURLPattern(self):
## .nsns kept here to match both . and -
return r'https?://(aaran-st-vines.nsns|abraxan|bobmin|canoncansodoff|chemprof|copperbadge|crys|deluded-musings|draco664|fp|frenchsession|ishtar|jbern|jeconais|kinsfire|kokopelli.nsns|ladya.nsns|lorddwar|mrintel.nsns|musings-of-apathy|ruskbyte|seelvor|tenhawk|viridian|whydoyouneedtoknow)\.fanficauthors\.net/([a-zA-Z0-9_]+)/'
@classmethod
def get_section_url(cls,url):
## only changing .nsns to -nsns and only when part of the
## domain.
url = url.replace('.nsns.fanficauthors.net','-nsns.fanficauthors.net')
return url
################################################################################################
def doExtractChapterUrlsAndMetadata(self, get_cover=True):
@ -180,7 +163,7 @@ class FanficAuthorsNetAdapter(BaseSiteAdapter):
# Find the chapters:
# The published and update dates are with the chapter links...
# so we have to get them from there.
chapters = soup.find_all('a', href=re.compile('/'+self.story.getMetadata(
chapters = soup.findAll('a', href=re.compile('/'+self.story.getMetadata(
'storyId')+'/([a-zA-Z0-9_]+)/'))
# Here we are getting the published date. It is the date the first chapter was "updated"
@ -219,7 +202,7 @@ class FanficAuthorsNetAdapter(BaseSiteAdapter):
## Raising AdultCheckRequired after collecting chapters gives
## a double chapter list. So does genre, but it de-dups
## automatically.
if( self.story.getMetadataRaw('rating') in ['Mature','Adult Only']
if( self.story.getMetadata('rating') == 'Mature'
and not (self.is_adult or self.getConfig("is_adult")) ):
raise exceptions.AdultCheckRequired(self.url)
@ -243,7 +226,7 @@ class FanficAuthorsNetAdapter(BaseSiteAdapter):
# grab the text for an individual chapter.
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
if( self.story.getMetadataRaw('rating') in ['Mature','Adult Only'] and
if( self.story.getMetadata('rating') == 'Mature' and
(self.is_adult or self.getConfig("is_adult")) ):
addurl = "?bypass=1"
else:
@ -258,8 +241,8 @@ class FanficAuthorsNetAdapter(BaseSiteAdapter):
"Error downloading Chapter: '{0}'! Missing required element!".format(url))
#Now, there are a lot of extranious tags within the story division.. so we will remove them.
for tag in story.find_all('ul',{'class':'pager'}) + story.find_all(
'div',{'class':'alert'}) + story.find_all('div', {'class':'btn-group'}):
for tag in story.findAll('ul',{'class':'pager'}) + story.findAll(
'div',{'class':'alert'}) + story.findAll('div', {'class':'btn-group'}):
tag.extract()
return self.utf8FromSoup(url,story)

View file

@ -150,7 +150,7 @@ class FanFicsMeAdapter(BaseSiteAdapter):
self.story.setMetadata('rating',stripHTML(get_meta_content(u'Рейтинг')))
## Need to login for any rating higher than General.
if self.story.getMetadataRaw('rating') != 'General' and self.needToLoginCheck(data):
if self.story.getMetadata('rating') != 'General' and self.needToLoginCheck(data):
self.performLogin(url)
# reload after login.
data = self.get_request(url,usecache=False)

View file

@ -44,8 +44,9 @@ class FanfictalkComAdapter(BaseSiteAdapter):
# get storyId from url--url validation guarantees query is only sid=1234
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
# normalized story URL.
self._setURL('https://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/archive/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','ahpfftc')
@ -56,24 +57,24 @@ class FanfictalkComAdapter(BaseSiteAdapter):
@classmethod
def getAcceptDomains(cls):
return [cls.getSiteDomain(),'archive.hpfanfictalk.com','fanfictalk.com']
return [cls.getSiteDomain(),'archive.hpfanfictalk.com']
@classmethod
def getConfigSections(cls):
"Only needs to be overriden if has additional ini sections."
return [cls.getConfigSection(),'archive.hpfanfictalk.com','fanfictalk.com']
return [cls.getConfigSection(),'archive.hpfanfictalk.com']
@staticmethod # must be @stgetAcceptDomainsaticmethod, don't remove it.
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'archive.fanfictalk.com'
return 'fanfictalk.com'
@classmethod
def getSiteExampleURLs(cls):
return "https://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/archive/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return r"https?://("+r"|".join([x.replace('.',r'\.') for x in self.getAcceptDomains()])+r")(/archive)?/viewstory\.php\?sid=\d+$"
return r"https?://(archive\.hp)?"+re.escape(self.getSiteDomain())+r"(/archive)?/viewstory\.php\?sid=\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
@ -117,7 +118,7 @@ class FanfictalkComAdapter(BaseSiteAdapter):
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'https://'+self.host+'/'+chapter['href'])
self.add_chapter(chapter,'https://'+self.host+'/archive/'+chapter['href'])
# categories
for a in soup.select("div#sort a"):
@ -170,14 +171,14 @@ class FanfictalkComAdapter(BaseSiteAdapter):
# Site allows stories to be in several series at once. FFF
# isn't thrilled with that, we have series00, series01, etc.
# Example:
# https://archive.fanfictalk.com/viewstory.php?sid=483
# https://fanfictalk.com/archive/viewstory.php?sid=483
if self.getConfig("collect_series"):
seriesspan = soup.find('span',label='Series')
for i, seriesa in enumerate(seriesspan.find_all('a', href=re.compile(r"viewseries\.php\?seriesid=\d+"))):
# logger.debug(seriesa)
series_name = stripHTML(seriesa)
series_url = 'https://'+self.host+'/'+seriesa['href']
series_url = 'https://'+self.host+'/archive/'+seriesa['href']
seriessoup = self.make_soup(self.get_request(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'viewstory.php\?sid=\d+'))
@ -204,17 +205,9 @@ class FanfictalkComAdapter(BaseSiteAdapter):
# grab the text for an individual chapter.
def getChapterText(self, url):
if self.is_adult or self.getConfig("is_adult"):
# Weirdly, different sites use different warning numbers.
# If the title search below fails, there's a good chance
# you need a different number. print data at that point
# and see what the 'click here to continue' url says.
addurl = "&ageconsent=ok&warning=3"
else:
addurl=""
logger.debug('Getting chapter text from: %s' % url)
logger.debug('Getting chapter text from: %s' % (url+addurl))
soup = self.make_soup(self.get_request(url+addurl))
soup = self.make_soup(self.get_request(url))
div = soup.find('div', {'id' : 'story'})

View file

@ -93,14 +93,6 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
# logger.debug("post-url:%s"%url)
return url
@classmethod
def get_url_search(cls,url):
regexp = super(getClass(), cls).get_url_search(url)
regexp = re.sub(r"^(?P<keep>.*net/s/\d+/\d+/)(?P<urltitle>[^\$]*)?",
r"\g<keep>(.*)",regexp)
logger.debug(regexp)
return regexp
def getSiteURLPattern(self):
return self._get_site_url_pattern()
@ -110,31 +102,6 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
return re.sub(r"https?://(www|m)\.(?P<keep>fanfiction\.net/s/\d+/\d+/).*",
r"https://www.\g<keep>",url)+self.urltitle
def get_request(self,url,usecache=True):
## use super version if not set or isn't a chapter URL with a
## title.
if( not self.getConfig("try_shortened_title_urls") or
not re.match(r"https?://www\.fanfiction\.net/s/\d+/\d+/(?P<title>[^/]+)$", url) ):
return super(getClass(), self).get_request(url,usecache)
## kludgey way to attempt more than one URL variant by
## removing title one letter at a time. Note that network and
## open_pages_in_browser retries still happen first.
titlelen = len(url.split('/')[-1])
maxcut = min([4,titlelen])
j = 0
while j < maxcut: # should actually leave loop either by
# return or exception raise.
try:
useurl = url
if j: # j==0, full URL, then remove letters.
useurl = url[:-j]
return super(getClass(), self).get_request(useurl,usecache)
except exceptions.HTTPErrorFFF as fffe:
if j >= maxcut or 'Page not found or expired' not in unicode(fffe):
raise
j = j+1
def doExtractChapterUrlsAndMetadata(self,get_cover=True):
# fetch the chapter. From that we will get almost all the
@ -167,7 +134,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
## the first chapter. It generates another server request and
## doesn't seem to be needed lately, so now default it to off.
try:
chapcount = len(soup.find('select', { 'name' : 'chapter' } ).find_all('option'))
chapcount = len(soup.find('select', { 'name' : 'chapter' } ).findAll('option'))
# get chapter part of url.
except:
chapcount = 1
@ -212,7 +179,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
## For 1, use the second link.
## For 2, fetch the crossover page and pull the two categories from there.
pre_links = soup.find('div',{'id':'pre_story_links'})
categories = pre_links.find_all('a',{'class':'xcontrast_txt'})
categories = pre_links.findAll('a',{'class':'xcontrast_txt'})
#print("xcontrast_txt a:%s"%categories)
if len(categories) > 1:
# Strangely, the ones with *two* links are the
@ -251,7 +218,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
grayspan = gui_table1i.find('span', {'class':'xgray xcontrast_txt'})
# for b in grayspan.find_all('button'):
# for b in grayspan.findAll('button'):
# b.extract()
metatext = stripHTML(grayspan).replace('Hurt/Comfort','Hurt-Comfort')
#logger.debug("metatext:(%s)"%metatext)
@ -290,7 +257,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
# Updated: <span data-xutime='1368059198'>5/8</span> - Published: <span data-xutime='1278984264'>7/12/2010</span>
# Published: <span data-xutime='1384358726'>8m ago</span>
dates = soup.find_all('span',{'data-xutime':re.compile(r'^\d+$')})
dates = soup.findAll('span',{'data-xutime':re.compile(r'^\d+$')})
if len(dates) > 1 :
# updated get set to the same as published upstream if not found.
self.story.setMetadata('dateUpdated',datetime.fromtimestamp(float(dates[0]['data-xutime'])))
@ -341,10 +308,11 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
img = soup.select_one('img.lazy.cimage')
cover_url=img['data-original']
except:
## Nov 2023 - src is always "/static/images/d_60_90.jpg" now
## Only take cover if there's data-original
## Primary motivator is to prevent unneeded author page hits.
pass
img = soup.select_one('img.cimage:not(.lazy)')
if img:
cover_url=img['src']
## Nov 19, 2020, ffnet lazy cover images returning 0 byte
## files.
logger.debug("cover_url:%s"%cover_url)
authimg_url = ""
@ -395,7 +363,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
# no selector found, so it's a one-chapter story.
self.add_chapter(self.story.getMetadata('title'),url)
else:
allOptions = select.find_all('option')
allOptions = select.findAll('option')
for o in allOptions:
## title URL will be put back on chapter URL during
## normalize_chapterurl() anyway, but also here for

View file

@ -1,157 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2024 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import io
import logging
import re
import zipfile
from bs4 import BeautifulSoup
# py2 vs py3 transition
from .base_adapter import BaseSiteAdapter, makeDate
from fanficfare.htmlcleanup import stripHTML
from .. import exceptions as exceptions
logger = logging.getLogger(__name__)
def getClass():
return FanfictionsFrSiteAdapter
class FanfictionsFrSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev', 'fanfictionsfr')
self.story.setMetadata('langcode','fr')
self.story.setMetadata('language','Français')
# get storyId from url--url validation guarantees query correct
match = re.match(self.getSiteURLPattern(), url)
if not match:
raise exceptions.InvalidStoryURL(url, self.getSiteDomain(), self.getSiteExampleURLs())
story_id = match.group('id')
self.story.setMetadata('storyId', story_id)
fandom_name = match.group('fandom')
self._setURL('https://%s/fanfictions/%s/%s/chapters.html' % (self.getSiteDomain(), fandom_name, story_id))
@staticmethod
def getSiteDomain():
return 'www.fanfictions.fr'
@classmethod
def getSiteExampleURLs(cls):
return 'https://%s/fanfictions/fandom/fanfiction-id/chapters.html' % cls.getSiteDomain()
def getSiteURLPattern(self):
return r'https?://(?:www\.)?fanfictions\.fr/fanfictions/(?P<fandom>[^/]+)/(?P<id>[^/]+)(/chapters.html)?'
def extractChapterUrlsAndMetadata(self):
logger.debug('URL: %s', self.url)
data = self.get_request(self.url)
soup = self.make_soup(data)
# detect if the fanfiction is 'suspended' (chapters unavailable)
alert_div = soup.find('div', id='alertInactiveFic')
if alert_div:
raise exceptions.FailedToDownload("Failed to download the fanfiction, most likely because it is suspended.")
title_element = soup.find('h1', itemprop='name')
self.story.setMetadata('title', stripHTML(title_element))
author_div = soup.find('div', itemprop='author')
author_name = stripHTML(author_div.a)
author_id = author_div.a['href'].split('/')[-1].replace('.html', '')
self.story.setMetadata('author', author_name)
self.story.setMetadata('authorId', author_id)
published_date_element = soup.find('span', class_='date-distance')
published_date_text = published_date_element['data-date']
published_date = makeDate(published_date_text, '%Y-%m-%d %H:%M:%S')
if published_date:
self.story.setMetadata('datePublished', published_date)
status_element = soup.find('p', title="Statut de la fanfiction").find('span', class_='badge')
french_status = stripHTML(status_element)
status_translation = {
"En cours": "In-Progress",
"Terminée": "Completed",
"One-shot": "Completed",
}
self.story.setMetadata('status', status_translation.get(french_status, french_status))
genre_elements = soup.find('div', title="Format et genres").find_all('span', class_="highlightable")
self.story.extendList('genre', [ stripHTML(genre) for genre in genre_elements[1:] ])
category_elements = soup.find_all('li', class_="breadcrumb-item")
self.story.extendList('category', [ stripHTML(category) for category in category_elements[-2].find_all('a') ])
first_description = soup.find('p', itemprop='abstract')
self.setDescription(self.url, first_description)
chapter_cards = soup.find_all(class_=['card', 'chapter'])
for chapter_card in chapter_cards:
chapter_title_tag = chapter_card.find('h2')
if chapter_title_tag:
chapter_title = stripHTML(chapter_title_tag)
chapter_link = 'https://'+self.getSiteDomain()+chapter_title_tag.find('a')['href']
# Clean up the chapter title by replacing multiple spaces and newline characters with a single space
chapter_title = re.sub(r'\s+', ' ', chapter_title)
self.add_chapter(chapter_title, chapter_link)
last_chapter_div = chapter_cards[-1]
updated_date_element = last_chapter_div.find('span', class_='date-distance')
last_chapter_update_date = updated_date_element['data-date']
date = makeDate(last_chapter_update_date, '%Y-%m-%d %H:%M:%S')
if date:
self.story.setMetadata('dateUpdated', date)
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
response, redirection_url = self.get_request_redirected(url)
if "telecharger_pdf.html" in redirection_url:
with zipfile.ZipFile(io.BytesIO(response.encode('latin1'))) as z:
# Assuming there's only one text file inside the zip
file_list = z.namelist()
if len(file_list) != 1:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Zip file should contain exactly one text file!" % url)
text_filename = file_list[0]
with z.open(text_filename) as text_file:
# Decode the text file with windows-1252 encoding
text = text_file.read().decode('windows-1252')
return text.replace("\r\n", "<br>\r\n")
else:
soup = self.make_soup(response)
div_content = soup.find('div', id='readarea')
if div_content is None:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return self.utf8FromSoup(url, div_content)

View file

@ -134,7 +134,7 @@ class FanFiktionDeAdapter(BaseSiteAdapter):
self.story.setMetadata('author',stripHTML(a))
# Find the chapters:
for chapter in soup.find('select').find_all('option'):
for chapter in soup.find('select').findAll('option'):
self.add_chapter(chapter,'https://'+self.host+'/s/'+self.story.getMetadata('storyId')+'/'+chapter['value'])
## title="Wörter" failed with max_zalgo:1
@ -181,13 +181,13 @@ class FanFiktionDeAdapter(BaseSiteAdapter):
# #find metadata on the author's page
# asoup = self.make_soup(self.get_request("https://"+self.getSiteDomain()+"?a=q&a1=v&t=nickdetailsstories&lbi=stories&ar=0&nick="+self.story.getMetadata('authorId')))
# tr=asoup.find_all('tr')
# tr=asoup.findAll('tr')
# for i in range(1,len(tr)):
# a = tr[i].find('a')
# if '/s/'+self.story.getMetadata('storyId')+'/1/' in a['href']:
# break
# td = tr[i].find_all('td')
# td = tr[i].findAll('td')
# self.story.addToList('category',stripHTML(td[2]))
# self.story.setMetadata('rating', stripHTML(td[5]))
# self.story.setMetadata('numWords', stripHTML(td[6]))
@ -204,7 +204,7 @@ class FanFiktionDeAdapter(BaseSiteAdapter):
soup = self.make_soup(self.get_request(url))
div = soup.find('div', {'id' : 'storytext'})
for a in div.find_all('script'):
for a in div.findAll('script'):
a.extract()
if None == div:

View file

@ -0,0 +1,168 @@
# -*- coding: utf-8 -*-
# Copyright 2018 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
####################################################################################################
### Adapted by Rikkit on November 7. 2017
###=================================================================================================
### Tested with Calibre
####################################################################################################
from __future__ import absolute_import
import logging
import re
# py2 vs py3 transition
from .base_adapter import BaseSiteAdapter, makeDate
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
logger = logging.getLogger(__name__)
def getClass():
''' Initializing the class '''
return FastNovelNetAdapter
class FastNovelNetAdapter(BaseSiteAdapter):
''' Adapter for FASTNOVEL.net '''
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev', 'fstnvl')
self.dateformat = '%d/%m/%Y'
# get storyId from url--url validation guarantees query correct
match = re.match(self.getSiteURLPattern(), url)
if not match:
raise exceptions.InvalidStoryURL(url, self.getSiteDomain(), self.getSiteExampleURLs())
story_id = match.group('id')
self.story.setMetadata('storyId', story_id)
self._setURL('https://%s/%s/' % (self.getSiteDomain(), story_id))
@staticmethod
def getSiteDomain():
return 'fastnovels.net'
@classmethod
def getAcceptDomains(cls):
return [cls.getSiteDomain(),'fastnovel.net']
@classmethod
def getConfigSections(cls):
"Only needs to be overriden if has additional ini sections."
return [cls.getConfigSection(),'fastnovel.net']
@classmethod
def getSiteExampleURLs(cls):
return "https://fastnovels.net/a-story-name-id"
def getSiteURLPattern(self):
# https://fastnovels.net/ultimate-scheming-system-158/
# also accept fastnovel.net
return r"https?://fastnovels?\.net/(?P<id>[^/]+)"
## Normalized chapter URLs by changing old titlenum part to be
## same as storyId.
def normalize_chapterurl(self,url):
# https://fastnovels.net/cultivation-chat-group8-29/chapter-25206.html
return re.sub(r"\.net/.*(?P<keep>/chapter-\d+.html)",
r".net/"+self.story.getMetadata('storyId')+r"\g<keep>",url)
def extractChapterUrlsAndMetadata(self):
logger.debug('URL: %s', self.url)
(data,rurl) = self.get_request_redirected(self.url)
if rurl != self.url:
match = re.match(self.getSiteURLPattern(), rurl)
if not match:
## shouldn't happen, but in case it does...
raise exceptions.InvalidStoryURL(url, self.getSiteDomain(), self.getSiteExampleURLs())
story_id = match.group('id')
self.story.setMetadata('storyId', story_id)
self._setURL('https://%s/%s/' % (self.getSiteDomain(), story_id))
logger.debug("set to redirected url:%s"%self.url)
soup = self.make_soup(data)
self.story.setMetadata('title', soup.find('h1').string)
for li in soup.select('.meta-data li'):
label = li.select_one('label')
if not label:
continue
if label.string == "Author:":
for a in li.select('a'):
self.story.setMetadata('authorId', a["href"].split('/')[2])
self.story.setMetadata('authorUrl','https://'+self.host+a["href"])
self.story.setMetadata('author', a["title"])
if label.string == "Genre:":
for a in li.select('a'):
self.story.addToList('genre',a["title"])
if label.string == "Status:":
if li.select_one('strong').string.strip() == "Completed":
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if label.string == "Last updated:":
dateUpd = label.next_sibling.strip()
self.story.setMetadata('dateUpdated', makeDate(stripHTML(dateUpd), self.dateformat))
coverurl = soup.select_one('div.book-cover')["data-original"]
if coverurl != "https://fastnovels.net/images/novel/default.jpg":
self.setCoverImage(self.url, coverurl)
tags = soup.select_one('.tags')
if tags:
for a in tags.select("li.tag-item a"):
self.story.addToList('tags', a["title"])
# extract tags, because it inside description
tags.extract()
self.setDescription(self.url, soup.select_one('div.content p'))
## number from end of storyId, taken this way in case it changes.
# <input id="film_id" type="hidden" value="10667">
film_id = soup.select_one('input#post_id')['value']
ch_data = self.post_request('https://'+self.host+'/',
parameters={'id': film_id,
'list_postdata': '1'})
# logger.debug(ch_data)
ch_soup = self.make_soup(ch_data)
# logger.debug(ch_soup)
# for book in soup.select("#list-chapters .book"):
# volume = book.select_one('.title a').string
for a in ch_soup.select(".list-chapters a.chapter"):
# title = volume + " " + stripHTML(a)
title = stripHTML(a)
self.add_chapter(title, 'https://' + self.host + a["href"])
def getChapterText(self, url):
data = self.get_request(url)
soup = self.make_soup(data)
story = soup.select_one('#chapter-body')
if not story:
raise exceptions.FailedToDownload(
"Error downloading Chapter: %s! Missing required element!" % url)
return self.utf8FromSoup(url, story)

View file

@ -15,16 +15,16 @@
# limitations under the License.
#
from __future__ import absolute_import,unicode_literals
# import datetime
from __future__ import absolute_import
import datetime
import logging
import json
logger = logging.getLogger(__name__)
import re
# from .. import translit
from .. import translit
from ..htmlcleanup import stripHTML
from .. import exceptions# as exceptions
from .. import exceptions as exceptions
# py2 vs py3 transition
@ -58,7 +58,7 @@ class FicBookNetAdapter(BaseSiteAdapter):
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = u"%d %m %Y г., %H:%M"
self.dateformat = "%d %m %Y"
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
@ -67,33 +67,17 @@ class FicBookNetAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "https://"+cls.getSiteDomain()+"/readfic/12345 https://"+cls.getSiteDomain()+"/readfic/93626/246417#part_content https://"+cls.getSiteDomain()+"/readfic/578de1cd-a8b4-7ff1-aa49-750426508b82 https://"+cls.getSiteDomain()+"/readfic/578de1cd-a8b4-7ff1-aa49-750426508b82/94793742#part_content"
return "https://"+cls.getSiteDomain()+"/readfic/12345 https://"+cls.getSiteDomain()+"/readfic/93626/246417#part_content"
def getSiteURLPattern(self):
return r"https?://"+re.escape(self.getSiteDomain()+"/readfic/")+r"[\d\-a-zA-Z]+"
def performLogin(self,url,data):
params = {}
if self.password:
params['login'] = self.username
params['password'] = self.password
else:
params['login'] = self.getConfig("username")
params['password'] = self.getConfig("password")
logger.debug("Try to login in as (%s)" % params['login'])
d = self.post_request('https://' + self.getSiteDomain() + '/login_check_static',params,usecache=False)
if 'Войти используя аккаунт на сайте' in d:
raise exceptions.FailedToLogin(url,params['login'])
return True
return r"https?://"+re.escape(self.getSiteDomain()+"/readfic/")+r"\d+"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self,get_cover=True):
def extractChapterUrlsAndMetadata(self):
url=self.url
logger.debug("URL: "+url)
data = self.get_request(url)
soup = self.make_soup(data)
adult_div = soup.find('div',id='adultCoverWarning')
@ -103,11 +87,9 @@ class FicBookNetAdapter(BaseSiteAdapter):
else:
raise exceptions.AdultCheckRequired(self.url)
## Title
try:
a = soup.find('section',{'class':'chapter-info'}).find('h1')
except AttributeError:
raise exceptions.FailedToDownload("Error collecting meta: %s! Missing required element!" % url)
a = soup.find('section',{'class':'chapter-info'}).find('h1')
# kill '+' marks if present.
sup = a.find('sup')
if sup:
@ -123,6 +105,34 @@ class FicBookNetAdapter(BaseSiteAdapter):
self.story.setMetadata('author',a.text)
logger.debug("Author: (%s)"%self.story.getMetadata('author'))
# Find the chapters:
pubdate = None
chapters = soup.find('ul', {'class' : 'list-of-fanfic-parts'})
if chapters != None:
for chapdiv in chapters.findAll('li', {'class':'part'}):
chapter=chapdiv.find('a',href=re.compile(r'/readfic/'+self.story.getMetadata('storyId')+r"/\d+#part_content$"))
churl='https://'+self.host+chapter['href']
self.add_chapter(chapter,churl)
datespan = chapdiv.find('span')
if pubdate == None and datespan:
pubdate = translit.translit(stripHTML(datespan))
update = translit.translit(stripHTML(datespan))
else:
self.add_chapter(self.story.getMetadata('title'),url)
self.story.setMetadata('numChapters',1)
pubdate=translit.translit(stripHTML(soup.find('div',{'class':'title-area'}).find('span')))
update=pubdate
logger.debug("numChapters: (%s)"%self.story.getMetadata('numChapters'))
if not ',' in pubdate:
pubdate=datetime.date.today().strftime(self.dateformat)
if not ',' in update:
update=datetime.date.today().strftime(self.dateformat)
pubdate=pubdate.split(',')[0]
update=update.split(',')[0]
fullmon = {"yanvarya":"01", u"января":"01",
"fievralya":"02", u"февраля":"02",
"marta":"03", u"марта":"03",
@ -136,50 +146,34 @@ class FicBookNetAdapter(BaseSiteAdapter):
"noyabrya":"11", u"ноября":"11",
"diekabrya":"12", u"декабря":"12" }
# Find the chapters:
pubdate = None
chapters = soup.find('ul', {'class' : 'list-of-fanfic-parts'})
if chapters is not None:
for chapdiv in chapters.find_all('li', {'class':'part'}):
chapter=chapdiv.find('a',href=re.compile(r'/readfic/'+self.story.getMetadata('storyId')+r"/\d+#part_content$"))
churl='https://'+self.host+chapter['href']
for (name,num) in fullmon.items():
if name in pubdate:
pubdate = pubdate.replace(name,num)
if name in update:
update = update.replace(name,num)
# Find the chapter dates.
date_str = chapdiv.find('span', {'title': True})['title'].replace(u"\u202fг. в", "")
for month_name, month_num in fullmon.items():
date_str = date_str.replace(month_name, month_num)
chapterdate = makeDate(date_str,self.dateformat)
self.add_chapter(chapter,churl,
{'date':chapterdate.strftime(self.getConfig("datechapter_format",self.getConfig("datePublished_format",self.dateformat)))})
if pubdate is None and chapterdate:
pubdate = chapterdate
update = chapterdate
else:
self.add_chapter(self.story.getMetadata('title'),url)
date_str = soup.find('div', {'class' : 'part-date'}).find('span', {'title': True})['title'].replace(u"\u202fг. в", "")
for month_name, month_num in fullmon.items():
date_str = date_str.replace(month_name, month_num)
pubdate = update = makeDate(date_str,self.dateformat)
logger.debug("numChapters: (%s)"%self.story.getMetadata('numChapters'))
self.story.setMetadata('dateUpdated', update)
self.story.setMetadata('datePublished', pubdate)
## remove extra ' г.' on date.
update = update.replace(' г.','')
pubdate = pubdate.replace(' г.','')
self.story.setMetadata('dateUpdated', makeDate(update, self.dateformat))
self.story.setMetadata('datePublished', makeDate(pubdate, self.dateformat))
self.story.setMetadata('language','Russian')
dlinfo = soup.select_one('header.d-flex.flex-column.gap-12.word-break')
## after site change, I don't see word count anywhere.
# pr=soup.find('a', href=re.compile(r'/printfic/\w+'))
# pr='https://'+self.host+pr['href']
# pr = self.make_soup(self.get_request(pr))
# pr=pr.findAll('div', {'class' : 'part_text'})
# i=0
# for part in pr:
# i=i+len(stripHTML(part).split(' '))
# self.story.setMetadata('numWords', unicode(i))
series_label = dlinfo.select_one('div.description.word-break').find('strong', string='Серия:')
logger.debug('Series: %s'%str(series_label))
if series_label:
series_div = series_label.find_next_sibling("div")
# No accurate series number as for that, additional request needs to be made
self.setSeries(stripHTML(series_div.a), 1)
self.story.setMetadata('seriesUrl','https://' + self.getSiteDomain() + series_div.a.get('href'))
dlinfo = soup.find('div',{'class':'fanfic-main-info'})
i=0
fandoms = dlinfo.select_one('div:not([class])').find_all('a', href=re.compile(r'/fanfiction/\w+'))
fandoms = dlinfo.find('div').findAll('a', href=re.compile(r'/fanfiction/\w+'))
for fandom in fandoms:
self.story.addToList('category',fandom.string)
i=i+1
@ -188,16 +182,13 @@ class FicBookNetAdapter(BaseSiteAdapter):
tags = soup.find('div',{'class':'tags'})
if tags:
for genre in tags.find_all('a',href=re.compile(r'/tags/')):
for genre in tags.findAll('a',href=re.compile(r'/tags/')):
self.story.addToList('genre',stripHTML(genre))
logger.debug("category: (%s)"%self.story.getMetadata('category'))
logger.debug("genre: (%s)"%self.story.getMetadata('genre'))
ratingdt = dlinfo.find('div',{'class':re.compile(r'badge-rating-.*')})
self.story.setMetadata('rating', stripHTML(ratingdt.find('span')))
# meta=table.find_all('a', href=re.compile(r'/ratings/'))
# meta=table.findAll('a', href=re.compile(r'/ratings/'))
# i=0
# for m in meta:
# if i == 0:
@ -215,11 +206,6 @@ class FicBookNetAdapter(BaseSiteAdapter):
else:
self.story.setMetadata('status', 'In-Progress')
try:
self.story.setMetadata('universe', stripHTML(dlinfo.find('a', href=re.compile('/fandom_universe/'))))
except AttributeError:
pass
paircharsdt = soup.find('strong',string='Пэйринг и персонажи:')
# site keeps both ships and indiv chars in /pairings/ links.
if paircharsdt:
@ -233,98 +219,8 @@ class FicBookNetAdapter(BaseSiteAdapter):
self.story.addToList('characters',stripHTML(paira))
summary=soup.find('div', itemprop='description')
if summary:
# Fix for the text not displaying properly
summary['class'].append('part_text')
self.setDescription(url,summary)
#self.story.setMetadata('description', summary.text)
stats = soup.find('div', {'class':'hat-actions-container'})
targetdata = stats.find_all('span', {'class' : 'main-info'})
for data in targetdata:
svg_class = data.find('svg')['class'][1] if data.find('svg') else None
value = int(stripHTML(data)) if stripHTML(data).isdigit() else 0
if svg_class == 'ic_thumbs-up' and value > 0:
self.story.setMetadata('likes', value)
#logger.debug("likes: (%s)"%self.story.getMetadata('likes'))
elif svg_class == 'ic_bubble-dark' and value > 0:
self.story.setMetadata('reviews', value)
#logger.debug("reviews: (%s)"%self.story.getMetadata('reviews'))
elif svg_class == 'ic_bookmark' and value > 0:
self.story.setMetadata('numCollections', value)
logger.debug("numCollections: (%s)"%self.story.getMetadata('numCollections'))
# Grab the amount of pages and words
targetpages = soup.find('strong',string='Размер:').find_next('div')
if targetpages:
targetpages_text = re.sub(r"(?<!\,)\s| ", "", targetpages.text, flags=re.UNICODE | re.MULTILINE)
pages_raw = re.search(r'(\d+)(?:страницы|страниц)', targetpages_text, re.UNICODE)
pages = int(pages_raw.group(1))
if pages > 0:
self.story.setMetadata('pages', pages)
logger.debug("pages: (%s)"%self.story.getMetadata('pages'))
numWords_raw = re.search(r"(\d+)(?:слова|слов)", targetpages_text, re.UNICODE)
numWords = int(numWords_raw.group(1))
if numWords > 0:
self.story.setMetadata('numWords', numWords)
logger.debug("numWords: (%s)"%self.story.getMetadata('numWords'))
# Grab FBN Category
class_tag = soup.select_one('div[class^="badge-with-icon direction"]').find('span', {'class' : 'badge-text'}).text
if class_tag:
self.story.setMetadata('classification',class_tag)
#logger.debug("classification: (%s)"%self.story.getMetadata('classification'))
# Find dedication.
ded = soup.find('div', {'class' : 'js-public-beta-dedication'})
if ded:
ded['class'].append('part_text')
self.story.setMetadata('dedication',ded)
# Find author comment
comm = soup.find('div', {'class' : 'js-public-beta-author-comment'})
if comm:
comm['class'].append('part_text')
self.story.setMetadata('authorcomment',comm)
follows = stats.find('fanfic-follow-button')[':follow-count']
if int(follows) > 0:
self.story.setMetadata('follows', int(follows))
logger.debug("follows: (%s)"%self.story.getMetadata('follows'))
# Grab the amount of awards
numAwards = 0
try:
awards = soup.find('fanfic-reward-list')[':initial-fic-rewards-list']
award_list = json.loads(awards)
numAwards = int(len(award_list))
# Grab the awards, but if multiple awards have the same name, only one will be kept; only an issue with hundreds of them.
self.story.extendList('awards', [str(award['user_text']) for award in award_list])
#logger.debug("awards (%s)"%self.story.getMetadata('awards'))
except (TypeError, KeyError):
logger.debug("Could not grab the awards")
if numAwards > 0:
self.story.setMetadata('numAwards', numAwards)
logger.debug("Num Awards (%s)"%self.story.getMetadata('numAwards'))
if get_cover:
cover = soup.find('fanfic-cover', {'class':"jsVueComponent"})
if cover is not None:
self.setCoverImage(url,cover['src-original'])
def replace_formatting(self,tag):
tname = tag.name
## operating on plain text because BS4 is hard to work on
## text with.
## stripHTML() discards whitespace around other tags, like <i>
txt = tag.get_text()
txt = txt.replace("\n","<br/>")
soup = self.make_soup("<"+tname+">"+txt+"</"+tname+">")
return soup.find(tname)
self.setDescription(url,summary)
#self.story.setMetadata('description', summary.text)
# grab the text for an individual chapter.
def getChapterText(self, url):
@ -334,60 +230,10 @@ class FicBookNetAdapter(BaseSiteAdapter):
soup = self.make_soup(self.get_request(url))
chapter = soup.find('div', {'id' : 'content'})
if chapter is None: ## still needed?
if chapter == None: ## still needed?
chapter = soup.find('div', {'class' : 'public_beta_disabled'})
if chapter is None:
if None == chapter:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
## ficbook uses weird CSS white-space: pre-wrap; for
## paragraphing. Doesn't work with txt output
if 'part_text' in chapter['class'] and self.getConfig('replace_text_formatting'):
## copy classes, except part_text
divclasses = chapter['class']
divclasses.remove('part_text')
chapter = self.replace_formatting(chapter)
chapter['class'] = divclasses
exclude_notes=self.getConfigList('exclude_notes')
if 'headnotes' not in exclude_notes:
# Find the headnote
head_note = soup.select_one("div.part-comment-top div.js-public-beta-comment-before")
if head_note:
# Create the structure for the headnote
head_notes_div_tag = soup.new_tag('div', attrs={'class': 'fff_chapter_notes fff_head_notes'})
head_b_tag = soup.new_tag('b')
head_b_tag.string = 'Примечания:'
if 'text-preline' in head_note['class'] and self.getConfig('replace_text_formatting'):
head_blockquote_tag = self.replace_formatting(head_note)
head_blockquote_tag.name = 'blockquote'
else:
head_blockquote_tag = soup.new_tag('blockquote')
head_blockquote_tag.string = stripHTML(head_note)
head_notes_div_tag.append(head_b_tag)
head_notes_div_tag.append(head_blockquote_tag)
# Prepend the headnotes to the chapter, <hr> to mimic the site
chapter.insert(0, head_notes_div_tag)
chapter.insert(1, soup.new_tag('hr'))
if 'footnotes' not in exclude_notes:
# Find the endnote
end_note = soup.select_one("div.part-comment-bottom div.js-public-beta-comment-after")
if end_note:
# Create the structure for the footnote
end_notes_div_tag = soup.new_tag('div', attrs={'class': 'fff_chapter_notes fff_foot_notes'})
end_b_tag = soup.new_tag('b')
end_b_tag.string = 'Примечания:'
if 'text-preline' in end_note['class'] and self.getConfig('replace_text_formatting'):
end_blockquote_tag = self.replace_formatting(end_note)
end_blockquote_tag.name = 'blockquote'
else:
end_blockquote_tag = soup.new_tag('blockquote')
end_blockquote_tag.string = stripHTML(end_note)
end_notes_div_tag.append(end_b_tag)
end_notes_div_tag.append(end_blockquote_tag)
# Append the endnotes to the chapter, <hr> to mimic the site
chapter.append(soup.new_tag('hr'))
chapter.append(end_notes_div_tag)
return self.utf8FromSoup(url,chapter)

View file

@ -201,10 +201,10 @@ class FictionAlleyArchiveOrgSiteAdapter(BaseSiteAdapter):
# epubutils.py
# Yes, this still applies to fictionalley-archive.
for tag in chaptext.find_all('head') + chaptext.find_all('meta') + chaptext.find_all('script'):
for tag in chaptext.findAll('head') + chaptext.findAll('meta') + chaptext.findAll('script'):
tag.extract()
for tag in chaptext.find_all('body') + chaptext.find_all('html'):
for tag in chaptext.findAll('body') + chaptext.findAll('html'):
tag.name = 'div'
if self.getConfig('include_author_notes'):

View file

@ -55,8 +55,6 @@ class FictionLiveAdapter(BaseSiteAdapter):
self.story_id = self.parsedUrl.path.split('/')[3]
self.story.setMetadata('storyId', self.story_id)
self.chapter_id_to_api = {}
# normalize URL. omits title in the url
self._setURL("https://fiction.live/stories//{s_id}".format(s_id = self.story_id));
@ -173,7 +171,7 @@ class FictionLiveAdapter(BaseSiteAdapter):
tags = data['ta'] if 'ta' in data else []
if (self.story.getMetadataRaw('rating') in {"nsfw", "adult"} or 'smut' in tags) and \
if (self.story.getMetadata('rating') in {"nsfw", "adult"} or 'smut' in tags) and \
not (self.is_adult or self.getConfig("is_adult")):
raise exceptions.AdultCheckRequired(self.url)
@ -241,17 +239,6 @@ class FictionLiveAdapter(BaseSiteAdapter):
a, b = itertools.tee(iterable, 2)
next(b, None)
return list(zip(a, b))
def map_chap_ids_to_api(chapter_ids, route_ids, times):
for index, bounds in enumerate(times):
start, end = bounds
end -= 1
chapter_url = chunkrange_url.format(s_id = data['_id'], start = start, end = end)
self.chapter_id_to_api[chapter_ids[index]] = chapter_url
for route_id in route_ids:
chapter_url = route_chunkrange_url.format(c_id = route_id)
self.chapter_id_to_api[route_id] = chapter_url
## first thing to do is seperate out the appendices
appendices, maintext, routes = [], [], []
@ -273,25 +260,22 @@ class FictionLiveAdapter(BaseSiteAdapter):
## main-text chapter extraction processing. *should* now handle all the edge cases.
## relies on fanficfare ignoring empty chapters!
titles = ["Home"] + [c['title'] for c in maintext]
chapter_ids = ['home'] + [c['id'] for c in maintext]
times = [data['ct']] + [c['ct'] for c in maintext] + [self.most_recent_chunk + 2] # need to be 1 over, and add_url etc does -1
times = pair(times)
titles = [c['title'] for c in maintext]
titles = ["Home"] + titles
if self.getConfig('include_appendices', True): # Add appendices after main text if desired
titles = titles + ["Appendix: " + a['title'][9:] for a in appendices]
chapter_ids = chapter_ids + [a['id'] for a in appendices]
times = times + [(a['ct'], a['ct'] + 2) for a in appendices]
route_ids = [r['id'] for r in routes]
map_chap_ids_to_api(chapter_ids, route_ids, times) # Map chapter ids to API URLs for use when comparing the two
times = [c['ct'] for c in maintext]
times = [data['ct']] + times + [self.most_recent_chunk + 2] # need to be 1 over, and add_url etc does -1
# doesn't actually run without the call to list.
list(map(add_chapter_url, titles, times))
list(map(add_chapter_url, titles, pair(times)))
for a in appendices: # add appendices afterwards
chapter_start = a['ct']
chapter_title = "Appendix: " + a['title'][9:] # 'Appendix: ' rather than '#special' at beginning of name
add_chapter_url(chapter_title, (chapter_start, chapter_start + 2)) # 1 msec range = this one chunk only
for r in routes: # add route at the end, after appendices
route_id = r['id'] # to get route chapter content, the route id is needed, not the timestamp
route_id = r['id'] # to get route chapter content, the route id is needed, not the timestamp
chapter_title = "Route: " + r['title'] # 'Route: ' at beginning of name, since it's a multiroute chapter
add_route_chapter_url(chapter_title, route_id)
@ -434,7 +418,7 @@ class FictionLiveAdapter(BaseSiteAdapter):
# so let's just ignore non-int values here
if not isinstance(v, int):
continue
if 0 <= v < len(choices):
if 0 <= v <= len(choices):
output[v] += 1
return output
@ -518,10 +502,8 @@ class FictionLiveAdapter(BaseSiteAdapter):
# now matches the site and does *not* include dicerolls as posts!
num_votes = str(len(posts)) + " posts" if len(posts) != 0 else "be the first to post."
posts_title = chunk['b'] if 'b' in chunk else "Reader Posts"
output = ""
output += u"<h4><span>" + posts_title + " — <small> Posting " + closed
output += u"<h4><span>Reader Posts — <small> Posting " + closed
output += u"" + num_votes + "</small></span></h4>\n"
## so. a voter can roll with their post. these rolls are in a seperate dict, but have the **same uid**.
@ -547,35 +529,6 @@ class FictionLiveAdapter(BaseSiteAdapter):
return output
def normalize_chapterurl(self, url):
if url.startswith(r'https://fiction.live/api/anonkun/chapters'):
return url
pattern = None
if url.startswith(r'https://fiction.live/api/anonkun/route'):
pattern = r"https?://(?:beta\.)?fiction\.live/[^/]*/[^/]*/[a-zA-Z0-9]+/routes/([a-zA-Z0-9]+)"
elif url.startswith(r'https://fiction.live/'):
pattern = r"https?://(?:beta\.)?fiction\.live/[^/]*/[^/]*/[a-zA-Z0-9]+/[^/]*(/[a-zA-Z0-9]+|home)"
# regex101 rocks
if not pattern:
return url
match = re.match(pattern, url)
if not match:
return url
chapter_id = match.group(1)
if chapter_id.startswith('/'):
chapter_id = chapter_id[1:]
if chapter_id and chapter_id in self.chapter_id_to_api:
return self.chapter_id_to_api[chapter_id]
return url
def format_unknown(self, chunk):
raise NotImplementedError("Unknown chunk type ({}) in fiction.live story.".format(chunk))

View file

@ -23,7 +23,7 @@ class FictionManiaTVAdapter(BaseSiteAdapter):
SITE_ABBREVIATION = 'fmt'
SITE_DOMAIN = 'fictionmania.tv'
BASE_URL = 'https://' + SITE_DOMAIN + '/stories/'
BASE_URL = 'http://' + SITE_DOMAIN + '/stories/'
READ_TEXT_STORY_URL_TEMPLATE = BASE_URL + 'readtextstory.html?storyID=%s'
DETAILS_URL_TEMPLATE = BASE_URL + 'details.html?storyID=%s'
@ -40,6 +40,10 @@ class FictionManiaTVAdapter(BaseSiteAdapter):
self._setURL(self.READ_TEXT_STORY_URL_TEMPLATE % story_id)
self.story.setMetadata('siteabbrev', self.SITE_ABBREVIATION)
# Always single chapters, probably should use the Anthology feature to
# merge chapters of a story
self.story.setMetadata('numChapters', 1)
@staticmethod
def getSiteDomain():
return FictionManiaTVAdapter.SITE_DOMAIN
@ -49,7 +53,7 @@ class FictionManiaTVAdapter(BaseSiteAdapter):
return cls.READ_TEXT_STORY_URL_TEMPLATE % 1234
def getSiteURLPattern(self):
return r'https?' + re.escape(self.BASE_URL[len('https'):]) + r'(readtextstory|readhtmlstory|readxstory|details)\.html\?storyID=\d+$'
return r'https?' + re.escape(self.BASE_URL[len('http'):]) + r'(readtextstory|readhtmlstory|readxstory|details)\.html\?storyID=\d+$'
def extractChapterUrlsAndMetadata(self):
url = self.DETAILS_URL_TEMPLATE % self.story.getMetadata('storyId')
@ -163,30 +167,14 @@ class FictionManiaTVAdapter(BaseSiteAdapter):
# <div style="margin-left:10ex;margin-right:10ex">
## fetching SWI version now instead of text.
htmlurl = url.replace('readtextstory','readhtmlstory')
## Used to find by style, but it's inconsistent now. we've seen:
## margin-left:10ex;margin-right:10ex
## margin-right: 5%; margin-left: 5%
## margin-left:5%; margin-right:5%
## margin-left:5%; margin-right:5%; background: white
## And there's some without a <div> tag (or an unclosed div)
## Only the comments appear to be consistent.
beginmarker='<!--Read or display the file-->'
endmarker='''<hr size=1 noshade>
<!--review add read, top and bottom-->
'''
data = self.get_request(htmlurl)
try:
## if both markers are found, assume whatever is in between
## is the chapter text.
soup = self.make_soup(data[data.index(beginmarker):data.index(endmarker)])
return self.utf8FromSoup(htmlurl,soup)
except Exception as e:
# logger.debug(e)
# logger.debug(soup)
soup = self.make_soup(self.get_request(htmlurl))
div = soup.find('div',style="margin-left:10ex;margin-right:10ex")
if div:
return self.utf8FromSoup(htmlurl,div)
else:
logger.debug("Story With Images(SWI) not found, falling back to HTML.")
## fetching html version now instead of text.
## Note that html and SWI pages are *not* formatted the same.
soup = self.make_soup(self.get_request(url.replace('readtextstory','readxstory')))
# logger.debug(soup)

View file

@ -66,8 +66,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
params['username']))
d = self.post_request(loginUrl,params,usecache=False)
if "Login attempt failed..." in d or \
'<div id="error">Please enter your username and password.</div>' in d:
if "Login attempt failed..." in d:
logger.info("Failed to login to URL %s as %s" % (loginUrl,
params['username']))
raise exceptions.FailedToLogin(url,params['username'])
@ -115,7 +114,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
titleh4 = soup.find('div',{'class':'storylist'}).find('h4')
self.story.setMetadata('title', stripHTML(titleh4.a))
if 'Deleted story' in self.story.getMetadataRaw('title'):
if 'Deleted story' in self.story.getMetadata('title'):
raise exceptions.StoryDoesNotExist("This story was deleted. %s"%self.url)
# Find authorid and URL from... author url.
@ -130,14 +129,14 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
#self.story.setMetadata('description', storydiv.find("blockquote",{'class':'summary'}).p.string)
# most of the meta data is here:
metap = storydiv.find("div",{"class":"meta"})
metap = storydiv.find("p",{"class":"meta"})
self.story.addToList('category',metap.find("a",href=re.compile(r"^/category/\d+")).string)
# warnings
# <span class="req"><a href="/help/38" title="Medium Spoilers">[!!] </a> <a href="/help/38" title="Rape/Sexual Violence">[R] </a> <a href="/help/38" title="Violence">[V] </a> <a href="/help/38" title="Child/Underage Sex">[Y] </a></span>
spanreq = metap.find("span",{"class":"story-warnings"})
if spanreq: # can be no warnings.
for a in spanreq.find_all("a"):
for a in spanreq.findAll("a"):
self.story.addToList('warnings',a['title'])
## perhaps not the most efficient way to parse this, using
@ -187,7 +186,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
# no list found, so it's a one-chapter story.
self.add_chapter(self.story.getMetadata('title'),url)
else:
chapterlistlis = storylistul.find_all('li')
chapterlistlis = storylistul.findAll('li')
for chapterli in chapterlistlis:
if "blocked" in chapterli['class']:
# paranoia check. We should already be logged in by now.

View file

@ -99,17 +99,6 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
params['username']))
raise exceptions.FailedToLogin(url,params['username'])
def make_soup(self,data):
soup = super(FimFictionNetSiteAdapter, self).make_soup(data)
for img in soup.select('img.lazy-img, img.user_image'):
## FimF has started a 'camo' mechanism for images that
## gets block by CF. attr data-source is original source.
if img.has_attr('data-source'):
img['src'] = img['data-source']
elif img.has_attr('data-src'):
img['src'] = img['data-src']
return soup
def doExtractChapterUrlsAndMetadata(self,get_cover=True):
if self.is_adult or self.getConfig("is_adult"):
@ -117,8 +106,7 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
## Only needed with password protected stories, which you have
## to have logged into in the website using this account.
if self.getConfig("always_login"):
self.performLogin(self.url)
self.performLogin(self.url)
##---------------------------------------------------------------------------------------------------
## Get the story's title page. Check if it exists.
@ -151,8 +139,7 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
self.story.setMetadata("authorId", author['href'].split('/')[2])
self.story.setMetadata("authorUrl", "https://%s/user/%s/%s" % (self.getSiteDomain(),
self.story.getMetadata('authorId'),
# meta entry author can be changed by the user.
stripHTML(author)))
self.story.getMetadata('author')))
#Rating text is replaced with full words for historical compatibility after the site changed
#on 2014-10-27
@ -180,13 +167,12 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
# Cover image
if get_cover:
storyImage = soup.select_one('div.story_container__story_image img')
storyImage = storyContentBox.find('img', {'class':'lazy-img'})
if storyImage:
coverurl = storyImage['data-fullsize']
# try setting from data-fullsize, if fails, try using data-src
cover_set = self.setCoverImage(self.url,coverurl)[0]
if not cover_set or cover_set.startswith("failedtoload"):
coverurl = storyImage['src']
if self.setCoverImage(self.url,coverurl)[0] == "failedtoload":
coverurl = storyImage['data-src']
self.setCoverImage(self.url,coverurl)
coverSource = storyImage.parent.find('a', {'class':'source'})
@ -298,26 +284,16 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
descriptionMeta = soup.find('meta', {'property':'og:description'})
self.story.setMetadata("short_description", stripHTML(descriptionMeta['content']))
# groups.
# If there are more than X groups, there's a 'Show all' button
# that calls for a JSON containing HTML with the full list.
# But it doesn't work reliably with FlareSolverr.
groupList = None
#groups
groupButton = soup.find('button', {'data-click':'showAll'})
if groupButton != None and groupButton.find('i', {'class':'fa-search-plus'}):
try:
groupResponse = self.get_request("https://www.fimfiction.net/ajax/stories/%s/groups" % (self.story.getMetadata("storyId")))
groupData = json.loads(groupResponse)
groupList = self.make_soup(groupData["content"])
except Exception as e:
logger.warning("Collecting 'groups' (AKA 'Featured In') from JSON failed:%s"%e)
logger.warning("Only 'groups' initially shown on the page will be collected.")
logger.warning("This is a known issue with JSON and FlareSolverr. See #1122")
if not groupList:
groupResponse = self.get_request("https://www.fimfiction.net/ajax/stories/%s/groups" % (self.story.getMetadata("storyId")))
groupData = json.loads(groupResponse)
groupList = self.make_soup(groupData["content"])
else:
groupList = soup.find('ul', {'id':'story-groups-list'})
if groupList:
if not (groupList == None):
for groupContent in groupList.find_all('a'):
self.story.addToList("groupsUrl", 'https://'+self.host+groupContent["href"])
groupName = groupContent.find('span', {"class":"group-name"})
@ -408,33 +384,3 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
# data = self.get_request(url)
if self.getConfig("is_adult"):
self.set_adult_cookie()
def get_urls_from_page(self,url,normalize):
iterate = self.getConfig('scrape_bookshelf', default=False)
if not re.search(r'fimfiction\.net/bookshelf/(?P<listid>.+?)/',url) or iterate == 'legacy':
return super().get_urls_from_page(url,normalize)
self.before_get_urls_from_page(url,normalize)
final_urls = list()
while True:
data = self.get_request(url,usecache=True)
soup = self.make_soup(data)
paginator = soup.select_one('div.paginator-container > div.page_list > ul').find_all('li')
logger.debug("Paginator: " + str(len(paginator)))
stories_container = soup.select_one('div.content > div.two-columns > div.left').find_all('article', recursive=False)
x = 0
logger.debug("Container "+str(len(stories_container)))
for story_raw in stories_container:
x += 1
story_url = story_raw.select_one('div.story_content_box > header.title > div > a.story_name').get('href')
url_story = ('https://' + self.getSiteDomain() + story_url)
#logger.debug(url_story)
final_urls.append(url_story)
logger.debug("Discovered %s new stories."%str(x))
next_button = paginator[-1].select_one('a')
logger.debug("Next button: " + next_button.get_text())
if next_button.get_text() or not iterate:
return {'urllist': final_urls}
url = ('https://' + self.getSiteDomain() + next_button.get('href'))

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# Copyright 2024 FanFicFare team
# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -19,20 +19,22 @@ from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
from .base_otw_adapter import BaseOTWAdapter
# py2 vs py3 transition
from .adapter_storiesonlinenet import StoriesOnlineNetAdapter
def getClass():
return SuperloveAdapter
return FineStoriesComAdapter
class SuperloveAdapter(BaseOTWAdapter):
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class FineStoriesComAdapter(StoriesOnlineNetAdapter):
def __init__(self, config, url):
BaseOTWAdapter.__init__(self, config, url)
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','sluv')
@classmethod
def getSiteAbbrev(cls):
return 'fnst'
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'superlove.sayitditto.net'
return 'finestories.com'

View file

@ -93,9 +93,6 @@ class FireFlyFansNetSiteAdapter(BaseSiteAdapter):
a = soup.find('a', href=re.compile(r"profileshow.aspx\?u="))
self.story.setMetadata('authorId', a['href'].split('=')[1])
if not self.story.getMetadata('authorId'):
logger.warning("Site authorUrl missing authorId, using SiteMissingAuthorId")
self.story.setMetadata('authorId', 'SiteMissingAuthorId')
self.story.setMetadata('authorUrl', 'http://' +
self.host + '/' + a['href'])
self.story.setMetadata('author', a.string)
@ -105,6 +102,7 @@ class FireFlyFansNetSiteAdapter(BaseSiteAdapter):
# to download them one at a time yourself. I'm also setting the status to
# complete
self.add_chapter(self.story.getMetadata('title'), self.url)
self.story.setMetadata('numChapters', 1)
self.story.setMetadata('status', 'Completed')
## some stories do not have a summary listed, so I'm setting it here.

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2024 FanFicFare team
# Copyright 2018 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -18,15 +18,15 @@
from __future__ import absolute_import
import re
from .base_xenforo2forum_adapter import BaseXenForo2ForumAdapter
from .base_xenforoforum_adapter import BaseXenForoForumAdapter
def getClass():
return QuestionablequestingComAdapter
class QuestionablequestingComAdapter(BaseXenForo2ForumAdapter):
class QuestionablequestingComAdapter(BaseXenForoForumAdapter):
def __init__(self, config, url):
BaseXenForo2ForumAdapter.__init__(self, config, url)
BaseXenForoForumAdapter.__init__(self, config, url)
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','qq')

View file

@ -161,7 +161,7 @@ class ImagineEFicComAdapter(BaseSiteAdapter):
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'https://'+self.host+'/'+chapter['href']+addurl)
@ -178,7 +178,7 @@ class ImagineEFicComAdapter(BaseSiteAdapter):
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
@ -199,22 +199,22 @@ class ImagineEFicComAdapter(BaseSiteAdapter):
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
for warning in warnings:
self.story.addToList('warnings',warning.string)
@ -238,7 +238,7 @@ class ImagineEFicComAdapter(BaseSiteAdapter):
seriessoup = self.make_soup(self.get_request(series_url))
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
storyas = seriessoup.find_all('a', href=re.compile(r'viewstory.php\?sid=\d+'))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
# skip 'report this' and 'TOC' links

28
fanficfare/adapters/adapter_inkbunnynet.py Executable file → Normal file
View file

@ -125,7 +125,7 @@ class InkBunnyNetSiteAdapter(BaseSiteAdapter):
soup = self.make_soup(self.get_request(url,usecache=False))
# removing all of the scripts
for tag in soup.find_all('script'):
for tag in soup.findAll('script'):
tag.extract()
@ -134,7 +134,7 @@ class InkBunnyNetSiteAdapter(BaseSiteAdapter):
self.story.setMetadata('title', stripHTML(title))
# Get Author
authortag = soup.find('table',{'class':'pooltable'}).find('a',href=re.compile(r'/gallery/|/scraps/'))
authortag = soup.find('table',{'class':'pooltable'}).find('a',href=re.compile(r'/gallery/'))
author = authortag['href'].split('/')[-1] # no separate ID
self.story.setMetadata('author', author)
self.story.setMetadata('authorId', author)
@ -149,7 +149,7 @@ class InkBunnyNetSiteAdapter(BaseSiteAdapter):
if not self.getConfig('keep_summary_html'):
synopsis = stripHTML(synopsis)
self.setDescription(url, synopsis)
self.setDescription(url, stripHTML(synopsis))
#Getting Keywords/Genres
keywords = bookdetails.find('div', {'id':'kw_scroll'}).find_next_siblings('div')[0].div.div.find_all('a')
@ -157,11 +157,10 @@ class InkBunnyNetSiteAdapter(BaseSiteAdapter):
self.story.addToList('genre', stripHTML(kword))
# Getting the Category
category = bookdetails.findChildren('div', recursive=False)[2].find('span', string='Type:').parent
category.find('span').decompose()
self.story.setMetadata('category', stripHTML(category))
for div in bookdetails.find_all('div'):
if 'Rating:' == stripHTML(div)[:7]:
if 'Details' == stripHTML(div).strip():
self.story.setMetadata('category', div.find_next_siblings('div')[0].span.next_sibling.strip())
elif 'Rating:' == stripHTML(div).strip()[:7]:
rating = div.span.next_sibling.strip()
self.story.setMetadata('rating', rating)
break
@ -179,14 +178,7 @@ class InkBunnyNetSiteAdapter(BaseSiteAdapter):
if get_cover:
cover_img = soup.find('img', {'id':'magicbox'})
if cover_img:
# image content is treated like a normal image submission
self.setCoverImage(url, cover_img['src'])
else:
# image content is present, but secondary to text file
cover_div = soup.find('div', {'class': 'content magicboxParent'})
cover_img = cover_div.find('img', {'class':'shadowedimage'}) if cover_div else None
if cover_img:
self.setCoverImage(url, cover_img['src'])
## Save for use below
self.soup = soup
@ -200,11 +192,3 @@ class InkBunnyNetSiteAdapter(BaseSiteAdapter):
raise exceptions.FailedToDownload("Error downloading Chapter: %s No text block found -- non-story URL?" % url)
return self.utf8FromSoup(url, story)
def before_get_urls_from_page(self,url,normalize):
# To display the links to stories that are not available to guests.
if self.getConfig("username") and self.getConfig("always_login"):
# performLogin extracts token from the soup
soup = self.make_soup(self.get_request(url))
self.performLogin(url, soup)

View file

@ -1,213 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging, time
logger = logging.getLogger(__name__)
import re, json
from .. import exceptions as exceptions
# py2 vs py3 transition
from ..six.moves import http_cookiejar as cl
from .base_adapter import BaseSiteAdapter, makeDate
def getClass():
    # Module entry point: FanFicFare's adapter loader calls getClass()
    # on each adapter module to obtain the adapter class for this site.
    return KakuyomuJpAdapter
# Maps the site's internal genre identifiers (as they appear in the JSON
# state embedded in the page) to the Japanese genre names displayed on
# kakuyomu.jp.  Used by KakuyomuJpAdapter to set the 'genre' metadata.
genres = {
    'FANTASY': '異世界ファンタジー',
    'ACTION': '現代ファンタジー',
    'SF': 'SF',
    'LOVE_STORY': '恋愛',
    'ROMANCE': 'ラブコメ',
    'DRAMA': '現代ドラマ',
    'HORROR': 'ホラー',
    'MYSTERY': 'ミステリー',
    'NONFICTION': 'エッセイ・ノンフィクション',
    'HISTORY': '歴史・時代・伝奇',
    'CRITICISM': '創作論・評論',
    'OTHERS': '詩・童話・その他',
    'FAN_FICTION': '二次創作',
}
class KakuyomuJpAdapter(BaseSiteAdapter):
    """Adapter for kakuyomu.jp, a Japanese web-novel host.

    Story metadata and the table of contents are read from the JSON
    state embedded in the page's Next.js '__NEXT_DATA__' script tag
    rather than scraped from the rendered HTML.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev', 'kakuyomu')
        # Site content is Japanese-only.
        self.story.setMetadata('language', 'Japanese')
        # Numeric story id is the last path segment of /works/<id>.
        self.storyId = self.path.split('/')[-1]
        self.story.setMetadata('storyId', self.storyId)

    @staticmethod
    def getSiteDomain():
        return 'kakuyomu.jp'

    @classmethod
    def getSiteExampleURLs(cls):
        return ("https://kakuyomu.jp/works/12341234123412341234")

    def getSiteURLPattern(self):
        # Only the canonical /works/<numeric id> story URL is accepted.
        return r"^https?://kakuyomu\.jp/works/[0-9]+$"

    def extractChapterUrlsAndMetadata(self):
        """Populate story metadata and the chapter list from the work page.

        Raises StoryDoesNotExist when the site serves its "page not
        found" message.  All data comes from the Apollo/GraphQL cache
        embedded in the page JSON; entities there are keyed like
        'Work:<id>' and cross-referenced via {'__ref': <key>} entries.
        """
        data = self.get_request(self.url)

        # Page could not be found
        # (Japanese string: "The page you are looking for was not found.")
        if 'お探しのページは見つかりませんでした' in data:
            raise exceptions.StoryDoesNotExist(self.url)

        soup = self.make_soup(data)
        info = json.loads(soup.find(id='__NEXT_DATA__').contents[0])['props']['pageProps']['__APOLLO_STATE__']
        workKey = 'Work:%s' % self.storyId

        # Title
        self.story.setMetadata('title', info[workKey]['title'])

        # Author -- authorKey looks like 'UserAccount:<id>'.
        authorKey = info[workKey]['author']['__ref']
        self.story.setMetadata('authorId', authorKey.split(':')[1])
        # Author URL uses the account *name*, not the numeric id.
        self.story.setMetadata('authorUrl', 'https://kakuyomu.jp/users/%s' % info[authorKey]['name'])
        self.story.setMetadata('author', info[authorKey]['activityName'])

        # Description
        self.setDescription(self.url, info[workKey]['introduction'])
        self.story.setMetadata('catchphrase', info[workKey]['catchphrase'])

        # Date Published and Updated
        # Site timestamps look like: 2024-01-01T03:00:12Z
        self.story.setMetadata('datePublished',
                               makeDate(info[workKey]['publishedAt'], '%Y-%m-%dT%H:%M:%SZ'))
        self.story.setMetadata('dateUpdated',
                               makeDate(info[workKey]['editedAt'], '%Y-%m-%dT%H:%M:%SZ'))

        # Character count (Japanese sites count characters, not words).
        self.story.setMetadata('numWords', info[workKey]['totalCharacterCount'])

        # Status
        completed = info[workKey]['serialStatus'] == 'COMPLETED'
        self.story.setMetadata('status', 'Completed' if completed else 'In-Progress')

        # Warnings: any content flag bumps the rating from G to R15.
        # Warning strings are the site's own Japanese labels
        # (cruelty / violence / sexual content).
        rating = 'G'
        if info[workKey]['isCruel']:
            rating = 'R15'
            self.story.addToList('warnings', '残酷描写有り')
        if info[workKey]['isViolent']:
            rating = 'R15'
            self.story.addToList('warnings', '暴力描写有り')
        if info[workKey]['isSexual']:
            rating = 'R15'
            self.story.addToList('warnings', '性描写有り')

        # Tags: user tags that merely spell out an R15 marker (R15, r-15,
        # etc.) raise the rating instead of becoming freeform tags.
        for tag in info[workKey]['tagLabels']:
            if re.match(r'[Rr].?[1][5]', tag) is None:
                self.story.addToList('freeformtags', tag)
            else:
                rating = 'R15'

        # Rating
        self.story.setMetadata('rating', rating)

        # Genre -- translate the site's internal id via the module-level map.
        self.story.setMetadata('genre', genres[info[workKey]['genre']])
        if info[workKey]['genre'] == 'FAN_FICTION':
            fandomKey = info[workKey]['fanFictionSource']['__ref']
            self.story.addToList('fandoms', info[fandomKey]['title'])

        # Ratings, Comments, Etc.
        self.story.setMetadata('reviews', info[workKey]['reviewCount'])
        self.story.setMetadata('points', info[workKey]['totalReviewPoint'])
        self.story.setMetadata('comments', info[workKey]['totalPublicEpisodeCommentCount'])
        self.story.setMetadata('views', info[workKey]['totalReadCount'])
        self.story.setMetadata('follows', info[workKey]['totalFollowers'])
        self.story.setMetadata('collections', len(info[workKey]['publicWorkCollections']))
        self.story.setMetadata('events', info[workKey]['totalWorkContestCount'] + info[workKey]['totalUserEventCount'])
        self.story.setMetadata('published', info[workKey]['hasPublication'])
        # Other fields present in the JSON but unused here:
        # visitorWorkFollowing
        # workReviewByVisitor

        # Chapters, Episodes
        # TOC nodes are in a list
        # each have a list of named episodes
        # each can have a named chapter
        # named chapters can be at depth 1 or 2
        # episodes might be empty (premium subscription)
        prependSectionTitles = self.getConfig('prepend_section_titles', 'firstepisode')
        numEpisodes = 0
        # `titles` is the stack of enclosing section titles at the
        # current `nestingLevel`; `newSection` marks the first episode
        # after a section heading.
        titles = []
        nestingLevel = 0
        newSection = False
        for tocNodeRef in info[workKey]['tableOfContentsV2']:
            tocNode = info[tocNodeRef['__ref']]
            if tocNode['chapter'] is not None:
                chapter = info[tocNode['chapter']['__ref']]
                # Pop section titles until this chapter nests deeper
                # than the stack top, then push it.
                while chapter['level'] <= nestingLevel:
                    titles.pop()
                    nestingLevel -= 1
                titles.append(chapter['title'])
                nestingLevel = chapter['level']
                newSection = True
            else:
                titles = []
                nestingLevel = 0
                newSection = False
            for episodeRef in tocNode['episodeUnions']:
                # 'EmptyEpisode' refs are premium/locked placeholders; skip.
                if not episodeRef['__ref'].startswith('EmptyEpisode'):
                    numEpisodes += 1
                    episode = info[episodeRef['__ref']]
                    epUrl = 'https://kakuyomu.jp/works/' + self.storyId + '/episodes/' + episode['id']
                    epTitle = episode['title']
                    # Prepend the section-title path either on every
                    # episode ('true') or only the first episode of a
                    # new section ('firstepisode').
                    if ((len(titles) > 0) and
                        ((newSection and prependSectionTitles == 'firstepisode') or
                         prependSectionTitles == 'true')):
                        titles.append(epTitle)
                        # bracket with ZWSP to mark presence of section titles
                        epTitle = u'\u200b' + u'\u3000\u200b'.join(titles)
                        titles.pop()
                    self.add_chapter(epTitle, epUrl)
                    newSection = False
        logger.debug("Story: <%s>", self.story)
        return

    def getChapterText(self, url):
        """Fetch one episode page and return its cleaned body HTML.

        Raises FailedToDownload when the expected episode-body div is
        missing from the page.
        """
        logger.debug('Getting chapter text from <%s>' % url)
        soup = self.make_soup(self.get_request(url))
        soup = soup.find('div', {'class':'widget-episodeBody js-episode-body'})
        if soup is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
        # Replace the site-specific class attributes with a neutral one.
        soup.attrs = {'class':'episode-body'}
        return self.utf8FromSoup(url, soup)

View file

@ -144,13 +144,13 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
# Find authorid and URL from... author urls.
pagetitle = soup.find('div',id='pagetitle')
for a in pagetitle.find_all('a', href=re.compile(r"viewuser.php\?uid=\d+")):
for a in pagetitle.findAll('a', href=re.compile(r"viewuser.php\?uid=\d+")):
self.story.addToList('authorId',a['href'].split('=')[1])
self.story.addToList('authorUrl','https://'+self.host+'/'+a['href'])
self.story.addToList('author',stripHTML(a))
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'https://'+self.host+'/'+chapter['href']+addurl)
@ -166,7 +166,7 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = stripHTML(labelspan)
@ -193,7 +193,7 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
catstext = [stripHTML(cat) for cat in cats]
for cat in catstext:
# ran across one story with an empty <a href="browse.php?type=categories&amp;catid=1"></a>
@ -204,7 +204,7 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
if 'Characters' in label:
self.story.addToList('characters','Kirk')
self.story.addToList('characters','Spock')
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
charstext = [stripHTML(char) for char in chars]
for char in charstext:
self.story.addToList('characters',stripHTML(char))
@ -213,7 +213,7 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
genrestext = [stripHTML(genre) for genre in genres]
self.genre = ', '.join(genrestext)
for genre in genrestext:
@ -223,7 +223,7 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
## has 'Story Type', which is much more what most sites
## call genre.
if 'Story Type' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=5')) # XXX
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=5')) # XXX
genrestext = [stripHTML(genre) for genre in genres]
self.genre = ', '.join(genrestext)
for genre in genrestext:
@ -233,21 +233,21 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Warnings' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warningstext = [stripHTML(warning) for warning in warnings]
self.warning = ', '.join(warningstext)
for warning in warningstext:
self.story.addToList('warnings',stripHTML(warning))
if 'Universe' in label:
universes = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=3')) # XXX
universes = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=3')) # XXX
universestext = [stripHTML(universe) for universe in universes]
self.universe = ', '.join(universestext)
for universe in universestext:
self.story.addToList('universe',stripHTML(universe))
if 'Crossover Fandom' in label:
crossoverfandoms = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=4')) # XXX
crossoverfandoms = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=4')) # XXX
crossoverfandomstext = [stripHTML(crossoverfandom) for crossoverfandom in crossoverfandoms]
self.crossoverfandom = ', '.join(crossoverfandomstext)
for crossoverfandom in crossoverfandomstext:
@ -274,7 +274,7 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
series_url = 'https://'+self.host+'/'+a['href']
seriessoup = self.make_soup(self.get_request(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'viewstory.php\?sid=\d+'))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
# skip 'report this' and 'TOC' links

View file

@ -19,7 +19,6 @@ from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import re
import json
from bs4.element import Comment
from ..htmlcleanup import stripHTML
@ -38,7 +37,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
#logger.debug("LiteroticaComAdapter:__init__ - url='%s'" % url)
logger.debug("LiteroticaComAdapter:__init__ - url='%s'" % url)
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','litero')
@ -48,15 +47,16 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
# where first chapter doesn't have '-ch-'.
# Now just rely on extractChapterUrlsAndMetadata to reset
# storyId to first chapter link.
storyId = self.parsedUrl.path.split('/',)[2]
## DON'T normalize to www.literotica.com--keep for language,
## which will be set in _setURL(url). Also, multi-chapter
## have been keeping the language when 'normalizing' to first
## chapter.
url = re.sub(r"^(https?://)"+LANG_RE+r"(\.i)?",
r"https://\2",
r"\1\2",
url)
url = url.replace('/beta/','/') # to allow beta site URLs.
url = url.replace('/beta/s/','/s/') # to allow beta site URLs.
## strip ?page=...
url = re.sub(r"\?page=.*$", "", url)
@ -66,7 +66,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%m/%d/%Y"
self.dateformat = "%m/%d/%y"
@staticmethod
def getSiteDomain():
@ -78,12 +78,10 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "https://www.literotica.com/s/story-title https://www.literotica.com/series/se/9999999 https://www.literotica.com/s/story-title https://www.literotica.com/i/image-or-comic-title https://www.literotica.com/p/poem-title https://portuguese.literotica.com/s/story-title https://german.literotica.com/s/story-title"
return "http://www.literotica.com/s/story-title https://www.literotica.com/s/story-title http://portuguese.literotica.com/s/story-title http://german.literotica.com/s/story-title"
def getSiteURLPattern(self):
# also https://www.literotica.com/series/se/80075773
# /s/ for story, /i/ for image/comic, /p/ for poem
return r"https?://"+LANG_RE+r"(\.i)?\.literotica\.com/((beta/)?[sip]/([a-zA-Z0-9_-]+)|series/se/(?P<storyseriesid>[a-zA-Z0-9_-]+))"
return r"https?://"+LANG_RE+r"(\.i)?\.literotica\.com/(beta/)?s/([a-zA-Z0-9_-]+)"
def _setURL(self,url):
# logger.debug("set URL:%s"%url)
@ -92,337 +90,263 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
lang = m.group('lang')
if lang not in ('www','other'):
self.story.setMetadata('language',lang.capitalize())
# reset storyId
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[-1])
# logger.debug("language:%s"%self.story.getMetadata('language'))
## apply clean_chapter_titles
def add_chapter(self,chapter_title,url,othermeta={}):
if self.getConfig("clean_chapter_titles"):
storytitle = self.story.getMetadataRaw('title').lower()
chapter_name_type = None
# strip trailing ch or pt before doing the chapter clean.
# doesn't remove from story title metadata
storytitle = re.sub(r'^(.*?)( (ch|pt))?$',r'\1',storytitle)
if chapter_title.lower().startswith(storytitle):
chapter = chapter_title[len(storytitle):].strip()
# logger.debug('\tChapter: "%s"' % chapter)
if chapter == '':
chapter_title = 'Chapter %d' % (self.num_chapters() + 1)
# Sometimes the first chapter does not have type of chapter
if self.num_chapters() == 0:
# logger.debug('\tChapter: first chapter without chapter type')
chapter_name_type = None
else:
separater_char = chapter[0]
# logger.debug('\tseparater_char: "%s"' % separater_char)
chapter = chapter[1:].strip() if separater_char in [":", "-"] else chapter
# logger.debug('\tChapter: "%s"' % chapter)
if chapter.lower().startswith('ch.'):
chapter = chapter[len('ch.'):].strip()
try:
chapter_title = 'Chapter %d' % int(chapter)
except:
chapter_title = 'Chapter %s' % chapter
chapter_name_type = 'Chapter' if chapter_name_type is None else chapter_name_type
# logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
elif chapter.lower().startswith('pt.'):
chapter = chapter[len('pt.'):].strip()
try:
chapter_title = 'Part %d' % int(chapter)
except:
chapter_title = 'Part %s' % chapter
chapter_name_type = 'Part' if chapter_name_type is None else chapter_name_type
# logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
elif separater_char in [":", "-"]:
chapter_title = chapter
# logger.debug('\tChapter: taking chapter text as whole')
super(LiteroticaSiteAdapter, self).add_chapter(chapter_title,url,othermeta)
def getCategories(self, soup):
    """Harvest the page's meta-keywords as erotica tags.

    Does nothing unless the use_meta_keywords option is enabled.
    Keywords containing the story title, and the author's name,
    are filtered out before the rest are added (title-cased).
    """
    if not self.getConfig("use_meta_keywords"):
        return
    title = self.story.getMetadata('title')
    author = self.story.getMetadata('author')
    keywords = soup.find("meta", {"name":"keywords"})['content'].split(',')
    # Drop any keyword that embeds the story title.
    tags = [kw for kw in keywords if title not in kw]
    # The author's name also shows up in the keyword list; drop it.
    if author in tags:
        tags.remove(author)
    # logger.debug("Meta = %s" % tags)
    for tag in tags:
        # self.story.addToList('category', tag.title())
        self.story.addToList('eroticatags', tag.title())
def extractChapterUrlsAndMetadata(self):
"""
In April 2024, site introduced significant changes, including
adding a 'Story Series' page and link to it in each chapter.
But not all stories, one-shots don't have 'Story Series'.
literotica has 'Story Series' & 'Story'. FFF calls them 'Story' & 'Chapters'
See https://github.com/JimmXinu/FanFicFare/issues/1058#issuecomment-2078490037
So /series/se/ will be the story URL for multi chapters but
keep individual 'chapter' URL for one-shots.
NOTE: Some stories can have versions,
e.g. /my-story-ch-05-version-10
NOTE: If two stories share the same title, a running index is added,
e.g.: /my-story-ch-02-1
Strategy:
* Go to author's page, search for the current story link,
* If it's in a tr.root-story => One-part story
* , get metadata and be done
* If it's in a tr.sl => Chapter in series
* Search up from there until we find a tr.ser-ttl (this is the
story)
* Gather metadata
* Search down from there for all tr.sl until the next
tr.ser-ttl, foreach
* Chapter link is there
"""
logger.debug("Chapter/Story URL: <%s> " % self.url)
if not (self.is_adult or self.getConfig("is_adult")):
raise exceptions.AdultCheckRequired(self.url)
(data,rurl) = self.get_request_redirected(self.url)
# logger.debug(data)
# logger.debug("Chapter/Story URL: <%s> " % self.url)
(data1,rurl) = self.get_request_redirected(self.url)
## for language domains
self._setURL(rurl)
logger.debug("set opened url:%s"%self.url)
soup = self.make_soup(data)
soup1 = self.make_soup(data1)
#strip comments from soup
[comment.extract() for comment in soup1.findAll(string=lambda text:isinstance(text, Comment))]
if "This submission is awaiting moderator's approval" in data:
if "This submission is awaiting moderator's approval" in data1:
raise exceptions.StoryDoesNotExist("This submission is awaiting moderator's approval. %s"%self.url)
## 2025Feb - domains other than www now use different HTML.
## Need to look for two different versions of basically
## everything.
## not series URL, assumed to be a chapter. Look for Story
## Info block of post-beta page. I don't think it should happen?
if '/series/se' not in self.url:
#logger.debug(data)
## looking for /series/se URL to indicate this is a
## chapter.
if not soup.select_one('div.page__aside') and not soup.select_one('div.sidebar') and not soup.select_one('div[class^="_sidebar_"]'):
raise exceptions.FailedToDownload("Missing Story Info block, Beta turned off?")
storyseriestag = soup.select_one('a.bn_av')
if not storyseriestag:
storyseriestag = soup.select_one('a[class^="_files__link_"]')
# logger.debug("Story Series Tag:%s"%storyseriestag)
if storyseriestag:
self._setURL(storyseriestag['href'])
data = self.get_request(storyseriestag['href'])
# logger.debug(data)
soup = self.make_soup(data)
# logger.debug(soup)
else:
logger.debug("One-shot")
isSingleStory = '/series/se' not in self.url
if not isSingleStory:
# Normilize the url?
state = re.findall(r"prefix\=\"/series/\",state='(.+?)'</script>", data)
json_state = json.loads(state[0].replace("\\'","'").replace("\\\\","\\"))
url_series_id = unicode(re.match(self.getSiteURLPattern(),self.url).group('storyseriesid'))
json_series_id = unicode(json_state['series']['data']['id'])
if json_series_id != url_series_id:
res = re.sub(url_series_id, json_series_id, unicode(self.url))
logger.debug("Normalized url: %s"%res)
self._setURL(res)
## common between one-shots and multi-chapters
# title
self.story.setMetadata('title', stripHTML(soup.select_one('h1')))
# logger.debug(self.story.getMetadata('title'))
# author
## XXX This is still the author URL like:
## https://www.literotica.com/stories/memberpage.php?uid=999999&page=submissions
## because that's what's on the page. It redirects to the /authors/ page.
## Only way I know right now to get the /authors/ is to make
## the req and look at the redirect.
## Should change to /authors/ if/when it starts appearing.
## Assuming it's in the same place.
authora = soup.find("a", class_="y_eU")
if not authora:
authora = soup.select_one('a[class^="_author__title"]')
authora = soup1.find("a", class_="y_eU")
authorurl = authora['href']
if authorurl.startswith('//'):
authorurl = self.parsedUrl.scheme+':'+authorurl
# logger.debug(authora)
# logger.debug(authorurl)
self.story.setMetadata('author', stripHTML(authora))
self.story.setMetadata('authorId', urlparse.parse_qs(authorurl.split('?')[1])['uid'][0])
if authorurl.startswith('//'):
authorurl = self.parsedUrl.scheme+':'+authorurl
self.story.setMetadata('authorUrl', authorurl)
if '?' in authorurl:
self.story.setMetadata('authorId', urlparse.parse_qs(authorurl.split('?')[1])['uid'][0])
elif '/authors/' in authorurl:
self.story.setMetadata('authorId', authorurl.split('/')[-1])
else: # if all else fails
self.story.setMetadata('authorId', stripHTML(authora))
self.story.setMetadata('author', authora.text)
if soup.select('div#tabpanel-tags'):
# logger.debug("tags1")
self.story.extendList('eroticatags', [ stripHTML(t).title() for t in soup.select('div#tabpanel-tags a.av_as') ])
if soup.select('div[class^="_widget__tags_"]'):
# logger.debug("tags2")
self.story.extendList('eroticatags', [ stripHTML(t).title() for t in soup.select('div[class^="_widget__tags_"] a[class^="_tag_item_"]') ])
# logger.debug(self.story.getList('eroticatags'))
# get the author page
dataAuth = self.get_request(authorurl)
soupAuth = self.make_soup(dataAuth)
#strip comments from soup
[comment.extract() for comment in soupAuth.findAll(string=lambda text:isinstance(text, Comment))]
# logger.debug(soupAuth)
## look first for 'Series Introduction', then Info panel short desc
## series can have either, so put in common code.
desc = []
introtag = soup.select_one('div.bp_rh')
descdiv = soup.select_one('div#tabpanel-info div.bn_B') or \
soup.select_one('div[class^="_tab__pane_"] div[class^="_widget__info_"]')
if introtag and stripHTML(introtag):
# make sure there's something in the tag.
# logger.debug("intro %s"%introtag)
desc.append(unicode(introtag))
elif descdiv and stripHTML(descdiv):
# make sure there's something in the tag.
# logger.debug("desc %s"%descdiv)
desc.append(unicode(descdiv))
if not desc or self.getConfig("include_chapter_descriptions_in_summary"):
## Only for backward compatibility with 'stories' that
## don't have an intro or short desc.
descriptions = []
for i, chapterdesctag in enumerate(soup.select('p.br_rk')):
# remove category link, but only temporarily
a = chapterdesctag.a.extract()
descriptions.append("%d. %s" % (i + 1, stripHTML(chapterdesctag)))
# now put it back--it's used below
chapterdesctag.append(a)
desc.append(unicode("<p>"+"</p>\n<p>".join(descriptions)+"</p>"))
## Find link to url in author's page
## site has started using //domain.name/asdf urls remove https?: from front
## site has started putting https back on again.
## site is now using language specific german.lit... etc on author pages.
## site is now back to using www.lit... etc on author pages.
search_url_re = r"https?://"+LANG_RE+r"(\.i)?\." + re.escape(self.getSiteDomain()) + self.url[self.url.index('/s/'):]+r"$"
# logger.debug(search_url_re)
storyLink = soupAuth.find('a', href=re.compile(search_url_re))
# storyLink = soupAuth.find('a', href=re.compile(r'.*literotica.com/s/'+re.escape(self.story.getMetadata('storyId')) ))
# storyLink = soupAuth.find('a', href=re.compile(r'(https?:)?'+re.escape(self.url[self.url.index(':')+1:]).replace(r'www',r'[^\.]+') ))
# storyLink = soupAuth.find('a', href=self.url)#[self.url.index(':')+1:])
self.setDescription(self.url,u''.join(desc))
if storyLink is not None:
# pull the published date from the author page
# default values from single link. Updated below if multiple chapter.
# logger.debug("Found story on the author page.")
date = storyLink.parent.parent.findAll('td')[-1].text
self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat))
if storyLink is not None:
urlTr = storyLink.parent.parent
if "sl" in urlTr['class']:
isSingleStory = False
else:
isSingleStory = True
else:
raise exceptions.FailedToDownload("Couldn't find story <%s> on author's page <%s>" % (self.url, authorurl))
if isSingleStory:
## one-shots don't *display* date info, but they have it
## hidden in <script>
## shows _date_approve "date_approve":"01/31/2024"
## multichap also have "date_approve", but they have
## several and they're more than just the story chapters.
date = re.search(r'"date_approve":"(\d\d/\d\d/\d\d\d\d)"',data)
if not date:
date = re.search(r'date_approve:"(\d\d/\d\d/\d\d\d\d)"',data)
if date:
dateval = makeDate(date.group(1), self.dateformat)
self.story.setMetadata('datePublished', dateval)
self.story.setMetadata('dateUpdated', dateval)
## one-shots don't have same json data to get aver_rating
## from below. This kludge matches the data_approve
rateall = re.search(r'rate_all:([\d\.]+)',data)
if rateall:
self.story.setMetadata('averrating', '%4.2f' % float(rateall.group(1)))
## one-shots assumed completed.
self.story.setMetadata('status','Completed')
# Add the category from the breadcumb.
breadcrumbs = soup.find('div', id='BreadCrumbComponent')
if not breadcrumbs:
breadcrumbs = soup.select_one('ul[class^="_breadcrumbs_list_"]')
if not breadcrumbs:
# _breadcrumbs_18u7l_1
breadcrumbs = soup.select_one('nav[class^="_breadcrumbs_"]')
self.story.addToList('category', breadcrumbs.find_all('a')[1].string)
## one-shot chapter
self.add_chapter(self.story.getMetadata('title'), self.url)
self.story.setMetadata('title', storyLink.text.strip('/'))
# logger.debug('Title: "%s"' % storyLink.text.strip('/'))
self.setDescription(authorurl, urlTr.findAll("td")[1].text)
self.story.addToList('category', urlTr.findAll("td")[2].text)
# self.story.addToList('eroticatags', urlTr.findAll("td")[2].text)
date = urlTr.findAll('td')[-1].text
self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat))
self.add_chapter(storyLink.text, self.url)
averrating = stripHTML(storyLink.parent)
## title (0.00)
averrating = averrating[averrating.rfind('(')+1:averrating.rfind(')')]
try:
self.story.setMetadata('averrating', float(averrating))
except:
pass
# self.story.setMetadata('averrating',averrating)
# parse out the list of chapters
else:
## Multi-chapter stories. AKA multi-part 'Story Series'.
bn_antags = soup.select('div#tabpanel-info p.bn_an')
# logger.debug(bn_antags)
if bn_antags and not self.getConfig("dates_from_chapters"):
## Use dates from series metadata unless dates_from_chapters is enabled
dates = []
for datetag in bn_antags[:2]:
datetxt = stripHTML(datetag)
# remove 'Started:' 'Updated:'
# Assume can't use 'Started:' 'Updated:' (vs [0] or [1]) because of lang localization
datetxt = datetxt[datetxt.index(':')+1:]
dates.append(datetxt)
# logger.debug(dates)
self.story.setMetadata('datePublished', makeDate(dates[0], self.dateformat))
self.story.setMetadata('dateUpdated', makeDate(dates[1], self.dateformat))
seriesTr = urlTr.previousSibling
while 'ser-ttl' not in seriesTr['class']:
seriesTr = seriesTr.previousSibling
m = re.match(r"^(?P<title>.*?):\s(?P<numChapters>\d+)\sPart\sSeries$", seriesTr.find("strong").text)
self.story.setMetadata('title', m.group('title'))
seriesTitle = m.group('title')
## bn_antags[2] contains "The author has completed this series." or "The author is still actively writing this series."
## I won't be surprised if this breaks later because of lang localization
if "completed" in stripHTML(bn_antags[-1]):
self.story.setMetadata('status','Completed')
else:
self.story.setMetadata('status','In-Progress')
## Walk the chapters
chapterTr = seriesTr.nextSibling
dates = []
descriptions = []
ratings = []
chapters = []
chapter_name_type = None
while chapterTr is not None and 'sl' in chapterTr['class']:
description = "%d. %s" % (len(descriptions)+1,stripHTML(chapterTr.findAll("td")[1]))
description = stripHTML(chapterTr.findAll("td")[1])
chapterLink = chapterTr.find("td", "fc").find("a")
if self.getConfig('chapter_categories_use_all'):
self.story.addToList('category', chapterTr.findAll("td")[2].text)
# self.story.addToList('eroticatags', chapterTr.findAll("td")[2].text)
pub_date = makeDate(chapterTr.findAll('td')[-1].text, self.dateformat)
dates.append(pub_date)
chapterTr = chapterTr.nextSibling
## category from chapter list
self.story.extendList('category',[ stripHTML(t) for t in soup.select('a.br_rl') ])
chapter_title = chapterLink.text
if self.getConfig("clean_chapter_titles"):
# logger.debug('\tChapter Name: "%s"' % chapterLink.text)
seriesTitle = seriesTitle.lower()
# strip trailing ch or pt before doing the chapter clean.
# doesn't remove from story title metadata
seriesTitle = re.sub(r'^(.*?)( (ch|pt))?$',r'\1',seriesTitle)
if chapterLink.text.lower().startswith(seriesTitle):
chapter = chapterLink.text[len(seriesTitle):].strip()
# logger.debug('\tChapter: "%s"' % chapter)
if chapter == '':
chapter_title = 'Chapter %d' % (self.num_chapters() + 1)
# Sometimes the first chapter does not have type of chapter
if self.num_chapters() == 0:
# logger.debug('\tChapter: first chapter without chapter type')
chapter_name_type = None
else:
separater_char = chapter[0]
# logger.debug('\tseparater_char: "%s"' % separater_char)
chapter = chapter[1:].strip() if separater_char in [":", "-"] else chapter
# logger.debug('\tChapter: "%s"' % chapter)
if chapter.lower().startswith('ch.'):
chapter = chapter[len('ch.'):].strip()
try:
chapter_title = 'Chapter %d' % int(chapter)
except:
chapter_title = 'Chapter %s' % chapter
chapter_name_type = 'Chapter' if chapter_name_type is None else chapter_name_type
# logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
elif chapter.lower().startswith('pt.'):
chapter = chapter[len('pt.'):].strip()
try:
chapter_title = 'Part %d' % int(chapter)
except:
chapter_title = 'Part %s' % chapter
chapter_name_type = 'Part' if chapter_name_type is None else chapter_name_type
# logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
elif separater_char in [":", "-"]:
chapter_title = chapter
# logger.debug('\tChapter: taking chapter text as whole')
for chapteratag in soup.select('a.br_rj'):
chapter_title = stripHTML(chapteratag)
# logger.debug('\tChapter: "%s"' % chapteratag)
# /series/se does include full URLs current.
chapurl = chapteratag['href']
# pages include full URLs.
chapurl = chapterLink['href']
if chapurl.startswith('//'):
chapurl = self.parsedUrl.scheme + ':' + chapurl
# logger.debug("Chapter URL: " + chapurl)
self.add_chapter(chapter_title, chapurl)
# logger.debug("Chapter Title: " + chapter_title)
# logger.debug("Chapter description: " + description)
chapters.append((chapter_title, chapurl, description, pub_date))
# self.add_chapter(chapter_title, chapurl)
numrating = stripHTML(chapterLink.parent)
## title (0.00)
numrating = numrating[numrating.rfind('(')+1:numrating.rfind(')')]
try:
ratings.append(float(numrating))
except:
pass
# <img src="https://uploads.literotica.com/series/cover/813-1695143444-desktop-x1.jpg" alt="Series cover">
coverimg = soup.select_one('img[alt="Series cover"]')
if coverimg:
self.setCoverImage(self.url,coverimg['src'])
if self.getConfig("clean_chapter_titles") \
and chapter_name_type is not None \
and not chapters[0][0].startswith(chapter_name_type):
# logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
# logger.debug('\tChapter: first chapter="%s"' % chapters[0][0])
# logger.debug('\tChapter: first chapter number="%s"' % chapters[0][0][len('Chapter'):])
chapters[0] = ("%s %s" % (chapter_name_type, chapters[0][0][len('Chapter'):].strip()),
chapters[0][1],
chapters[0][2],
chapters[0][3]
)
#### Attempting averrating from JS metadata.
#### also alternate chapters from json
try:
state_start="state='"
state_end="'</script>"
i = data.index(state_start)
if i:
state = data[i+len(state_start):data.index(state_end,i)].replace("\\'","'").replace("\\\\","\\")
if state:
# logger.debug(state)
json_state = json.loads(state)
# logger.debug(json.dumps(json_state, sort_keys=True,indent=2, separators=(',', ':')))
all_rates = []
if 'series' in json_state:
all_rates = [ float(x['rate_all']) for x in json_state['series']['works'] ]
if self.getConfig("order_chapters_by_date"):
chapters = sorted(chapters, key=lambda chapter: chapter[3])
for i, chapter in enumerate(chapters):
self.add_chapter(chapter[0], chapter[1])
descriptions.append("%d. %s" % (i + 1, chapter[2]))
## Set the oldest date as publication date, the newest as update date
dates.sort()
self.story.setMetadata('datePublished', dates[0])
self.story.setMetadata('dateUpdated', dates[-1])
## Set description to joint chapter descriptions
self.setDescription(authorurl,"<p>"+"</p>\n<p>".join(descriptions)+"</p>")
## Extract dates from chapter approval dates if dates_from_chapters is enabled
if self.getConfig("dates_from_chapters"):
date_approvals = []
for work in json_state['series']['works']:
if 'date_approve' in work:
try:
date_approvals.append(makeDate(work['date_approve'], self.dateformat))
except:
pass
if date_approvals:
# Oldest date is published, newest is updated
date_approvals.sort()
self.story.setMetadata('datePublished', date_approvals[0])
self.story.setMetadata('dateUpdated', date_approvals[-1])
if all_rates:
self.story.setMetadata('averrating', '%4.2f' % (sum(all_rates) / float(len(all_rates))))
if len(ratings) > 0:
self.story.setMetadata('averrating','%4.2f' % (sum(ratings) / float(len(ratings))))
## alternate chapters from JSON
if self.num_chapters() < 1:
logger.debug("Getting Chapters from series JSON")
seriesid = json_state.get('series',{}).get('data',{}).get('id',None)
if seriesid:
logger.info("Fetching chapter data from JSON")
logger.debug(seriesid)
series_json = json.loads(self.get_request('https://literotica.com/api/3/series/%s/works'%seriesid))
# logger.debug(json.dumps(series_json, sort_keys=True,indent=2, separators=(',', ':')))
for chap in series_json:
self.add_chapter(chap['title'], 'https://www.literotica.com/s/'+chap['url'])
# normalize on first chapter URL.
self._setURL(self.get_chapter(0,'url'))
## Collect tags from series/story page if tags_from_chapters is enabled
if self.getConfig("tags_from_chapters"):
self.story.extendList('eroticatags', [ unicode(t['tag']).title() for t in chap['tags'] ])
# reset storyId to first chapter.
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
except Exception as e:
logger.warning("Processing JSON failed. (%s)"%e)
# Add the category from the breadcumb. This might duplicate a category already added.
self.story.addToList('category', soup1.find('div', id='BreadCrumbComponent').findAll('a')[1].string)
self.getCategories(soup1)
## Features removed because not supportable by new site form:
## averrating metadata entry
## order_chapters_by_date option
## use_meta_keywords option
return
def getPageText(self, raw_page, url):
    """Extract the story text HTML from one raw chapter page.

    Returns the concatenated inner HTML of every text container div
    ('div.aa_ht' on older pages, 'div[class^="_article__content_"]'
    on newer ones), with the wrapper <div> and a trailing empty <p>
    stripped from each piece.

    Fix: this block previously contained a second, superseded
    implementation appended after the loop; it overwrote the
    accumulated result with only the first 'aa_ht' div's content and
    raised AttributeError on pages that have none.  Removed.
    """
    logger.debug('Getting page text')
    # Work around pages where the opening <p> is fused to the body div.
    raw_page = raw_page.replace('<div class="b-story-body-x x-r15"><div><p>','<div class="b-story-body-x x-r15"><div>')
    # logger.debug("\tChapter text: %s" % raw_page)
    page_soup = self.make_soup(raw_page)
    # Strip HTML comments so they don't end up in the output.
    [comment.extract() for comment in page_soup.find_all(string=lambda text:isinstance(text, Comment))]
    fullhtml = ""
    # Older pages use div.aa_ht; newer (2025) pages use hashed
    # _article__content_ class names.  Collect text from both.
    for aa_ht_div in page_soup.find_all('div', 'aa_ht') + page_soup.select('div[class^="_article__content_"]'):
        if aa_ht_div.div:
            html = unicode(aa_ht_div.div)
            # Strip some starting and ending tags,
            html = re.sub(r'^<div.*?>', r'', html)
            html = re.sub(r'</div>$', r'', html)
            html = re.sub(r'<p></p>$', r'', html)
            fullhtml = fullhtml + html
    # logger.debug('getPageText - fullhtml: %s' % fullhtml)
    return fullhtml
def getChapterText(self, url):
@ -432,15 +356,9 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
raw_page = self.get_request(url)
page_soup = self.make_soup(raw_page)
pages = page_soup.find('div',class_='l_bH')
if not pages:
pages = page_soup.select_one('div._pagination_h0sum_1')
if not pages:
pages = page_soup.select_one('div.clearfix.panel._pagination_1400x_1')
if not pages:
pages = page_soup.select_one('div[class^="panel clearfix _pagination_"]')
# logger.debug(pages)
fullhtml = ""
self.getCategories(page_soup)
chapter_description = ''
if self.getConfig("description_in_chapter"):
chapter_description = page_soup.find("meta", {"name" : "description"})['content']
@ -451,10 +369,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
## look for highest numbered page, they're not all listed
## when there are many.
last_page_links = pages.find_all('a', class_='l_bJ')
if not last_page_links:
last_page_links = pages.select('a[class^="_pagination__item_"]')
last_page_link = last_page_links[-1]
last_page_link = pages.find_all('a', class_='l_bJ')[-1]
last_page_no = int(urlparse.parse_qs(last_page_link['href'].split('?')[1])['page'][0])
# logger.debug(last_page_no)
for page_no in range(2, last_page_no+1):
@ -463,7 +378,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
raw_page = self.get_request(page_url)
fullhtml += self.getPageText(raw_page, url)
#logger.debug(fullhtml)
# logger.debug(fullhtml)
page_soup = self.make_soup(fullhtml)
fullhtml = self.utf8FromSoup(url, self.make_soup(fullhtml))
fullhtml = chapter_description + fullhtml
@ -471,123 +386,6 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
return fullhtml
def get_urls_from_page(self,url,normalize):
    """Collect story URLs from an author/list page.

    Returns a dict {'urllist': [...]}.  Always collects the plain
    links found in the page HTML; when the fetch_stories_from_api
    option is on (default), additionally scrapes the page's embedded
    JS state and pages through the site's JSON API to pick up works
    not present in the HTML (author works, author favorites, and
    user-created story lists).

    NOTE(review): the $R[...]/_$HY regexes below are tied to the
    site's current JS serialization format and are expected to break
    when that format changes; each failure path falls back to the
    plain HTML link list.
    """
    from ..geturls import get_urls_from_html
    ## hook for logins, etc.
    self.before_get_urls_from_page(url,normalize)
    # this way it uses User-Agent or other special settings.
    data = self.get_request(url,usecache=False)
    soup = self.make_soup(data)
    # Baseline: whatever story links are visible in the HTML itself.
    page_urls = get_urls_from_html(soup, url, configuration=self.configuration, normalize=normalize)
    if not self.getConfig("fetch_stories_from_api",True):
        logger.debug('fetch_stories_from_api Not enabled')
        return {'urllist': page_urls}
    # Classify the page: user-made story list, favorites, or own works.
    user_story_list = re.search(r'literotica\.com/authors/.+?/lists\?listid=(?P<list_id>\d+)', url)
    fav_authors = re.search(r'literotica\.com/authors/.+?/favorites', url)
    written = re.search(r'literotica.com/authors/.+?/works/', url)
    logger.debug((bool(user_story_list), bool(fav_authors), bool(written)))
    # If the url is not supported
    if not user_story_list and not fav_authors and not written:
        logger.debug('No supported link. %s', url)
        return {'urllist':page_urls}
    # Grabbing the main list where chapters are contained.
    if user_story_list:
        js_story_list = re.search(r';\$R\[\d+?\]\(\$R\[\d+?\],\$R\[\d+?\]\);\$R\[\d+?\]\(\$R\[\d+?\],\$R\[\d+?\]=\{success:!\d,current_page:(?P<current_page>\d+?),last_page:(?P<last_page>\d+?),total:\d+?,per_page:\d+,(has_series:!\d)?data:\$R\[\d+?\]=\[\$R\[\d+?\]=(?P<data>.+)\}\]\}\);', data) # }] } } }); \$R\[\d+?\]\(\$R\[\d+?\],\$R\[\d+?\]\);\$R\[\d+?]\(\$R\[\d+?\],\$R\[\d+?\]=\{sliders:
        logger.debug('user_story_list ID [%s]'%user_story_list.group('list_id'))
    else:
        js_story_list = re.search(r'\$R\[\d+?\]\(\$R\[\d+?\],\$R\[\d+?\]={current_page:(?P<current_page>\d+?),last_page:(?P<last_page>\d+?),total:\d+?,per_page:\d+,(has_series:!\d,)?data:\$R\[\d+\]=\[\$R\[\d+\]=\{(?!aim)(?P<data>.+)\}\);_\$HY\.r\[', data)
    # In case the regex becomes outdated
    if not js_story_list:
        logger.debug('Failed to grab data from the js.')
        return {'urllist':page_urls}
    user = None
    script_tags = soup.find_all('script')
    for script in script_tags:
        if not script.string:
            continue
        # Getting author from the js.
        user = re.search(r'_\$HY\.r\[\"AuthorQuery\[\\\"(?P<author>.+?)\\\"\]\"\]', script.string)
        if user != None:
            logger.debug("User: [%s]"%user.group('author'))
            break
    else:
        # for/else: no script contained the author name.
        logger.debug('Failed to get a username')
        return {'urllist': page_urls}
    # Extract the current (should be 1) and last page numbers from the js.
    logger.debug("Pages %s/%s"%(js_story_list.group('current_page'), js_story_list.group('last_page')))
    urls = []
    # Necessary to format a proper link as there were no visible data specifying what kind of link that should be.
    # Maps category pageUrl -> URL path type (/i/ image, /p/ poem;
    # everything else defaults to /s/ story).
    cat_to_link = {'adult-comics': 'i', 'erotic-art': 'i', 'illustrated-poetry': 'p', 'erotic-audio-poetry': 'p', 'erotic-poetry': 'p', 'non-erotic-poetry': 'p'}
    stories_found = re.findall(r"category_info:\$R\[.*?type:\".+?\",pageUrl:\"(.+?)\"}.+?,type:\"(.+?)\",url:\"(.+?)\",", js_story_list.group('data'))
    for story in stories_found:
        story_category, story_type, story_url = story
        urls.append('https://www.literotica.com/%s/%s'%(cat_to_link.get(story_category, 's'), story_url))
    # Removes the duplicates
    # (order-preserving dedupe via the seen.add side effect)
    seen = set()
    urls = [x for x in (page_urls + urls) if not (x in seen or seen.add(x))]
    logger.debug("Found [%s] stories so far."%len(urls))
    # Sometimes the rest of the stories are burried in the js so no fetching in necessery.
    if js_story_list.group('last_page') == js_story_list.group('current_page'):
        return {'urllist': urls}
    # URL-escape the username for use in the API endpoints below.
    user = urlparse.quote(user.group(1))
    logger.debug("Escaped user: [%s]"%user)
    # Work out the works 'type' the API expects for this page kind.
    if written:
        category = re.search(r"_\$HY\.r\[\"AuthorSeriesAndWorksQuery\[\\\".+?\\\",\\\"\D+?\\\",\\\"(?P<type>\D+?)\\\"\]\"\]=\$R\[\d+?\]=\$R\[\d+?\]\(\$R\[\d+?\]=\{", data)
    elif fav_authors:
        category = re.search(r"_\$HY\.r\[\"AuthorFavoriteWorksQuery\[\\\".+?\\\",\\\"(?P<type>\D+?)\\\",\d\]\"\]=\$R\[\d+?\]=\$R\[\d+?\]\(\$R\[\d+?\]={", data)
    # (short-circuit: 'category' is only evaluated when not a user list)
    if not user_story_list and not category:
        logger.debug("Type of works not found")
        return {'urllist': urls}
    last_page = int(js_story_list.group('last_page'))
    current_page = int(js_story_list.group('current_page')) + 1
    # Fetching the remaining urls from api. Can't trust the number given about the pages left from a website. Sometimes even the api returns outdated number of pages.
    while current_page <= last_page:
        # remember count so we can log how many this page added
        i = len(urls)
        logger.debug("Pages %s/%s"%(current_page, int(last_page)))
        if fav_authors:
            jsn = self.get_request('https://literotica.com/api/3/users/{}/favorite/works?params=%7B%22page%22%3A{}%2C%22pageSize%22%3A50%2C%22type%22%3A%22{}%22%2C%22withSeriesDetails%22%3Atrue%7D'.format(user, current_page, category.group('type')))
        elif user_story_list:
            jsn = self.get_request('https://literotica.com/api/3/users/{}/list/{}?params=%7B%22page%22%3A{}%2C%22pageSize%22%3A50%2C%22withSeriesDetails%22%3Atrue%7D'.format(user, user_story_list.group('list_id'), current_page))
        else:
            jsn = self.get_request('https://literotica.com/api/3/users/{}/series_and_works?params=%7B%22page%22%3A{}%2C%22pageSize%22%3A50%2C%22sort%22%3A%22date%22%2C%22type%22%3A%22{}%22%2C%22listType%22%3A%22expanded%22%7D'.format(user, current_page, category.group('type')))
        urls_data = json.loads(jsn)
        # Trust the API's own paging fields over the initial JS values.
        last_page = urls_data["last_page"]
        current_page = int(urls_data["current_page"]) + 1
        for story in urls_data['data']:
            #logger.debug('parts' in story)
            # Plain work (no work_count field): emit its direct URL.
            if story['url'] and story.get('work_count') == None:
                urls.append('https://www.literotica.com/%s/%s'%(cat_to_link.get(story["category_info"]["pageUrl"], 's'), str(story['url'])))
                continue
            # Most of the time series has no url specified and contains all of the story links belonging to the series
            urls.append('https://www.literotica.com/series/se/%s'%str(story['id']))
            for series_story in story['parts']:
                urls.append('https://www.literotica.com/%s/%s'%(cat_to_link.get(series_story["category_info"]["pageUrl"], 's'), str(series_story['url'])))
        logger.debug("Found [%s] stories."%(len(urls) - i))
    # Again removing duplicates.
    seen = set()
    urls = [x for x in urls if not (x in seen or seen.add(x))]
    logger.debug("Found total of [%s] stories"%len(urls))
    return {'urllist':urls}
def getClass():
    """Module hook: expose this adapter's class to FanFicFare's loader."""
    adapter_class = LiteroticaSiteAdapter
    return adapter_class

View file

@ -116,7 +116,7 @@ class LumosSycophantHexComAdapter(BaseSiteAdapter):
self.story.setMetadata('rating', rating)
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'http://'+self.host+'/'+chapter['href']+addurl)
@ -134,7 +134,7 @@ class LumosSycophantHexComAdapter(BaseSiteAdapter):
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
value = labels[0].previousSibling
svalue = ""
@ -154,22 +154,22 @@ class LumosSycophantHexComAdapter(BaseSiteAdapter):
self.story.setMetadata('numWords', value.split(' -')[0])
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
for warning in warnings:
self.story.addToList('warnings',warning.string)
@ -194,7 +194,7 @@ class LumosSycophantHexComAdapter(BaseSiteAdapter):
series_url = 'http://'+self.host+'/'+a['href']
seriessoup = self.make_soup(self.get_request(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):

View file

@ -162,7 +162,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
self.story.extendList('authorId', [authorId])
self.story.extendList('authorUrl', [authorUrl])
if not self.story.getMetadataRaw('rating'):
if not self.story.getMetadata('rating'):
ratingTitle = chapter.getRatingTitle()
if ratingTitle:
self.story.setMetadata('rating', ratingTitle)
@ -204,6 +204,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
self.story.setMetadata('datePublished', datePublished)
self.story.setMetadata('dateUpdated', dateUpdated)
self.story.setMetadata('numWords', unicode(wordCount))
self.story.setMetadata('numChapters', len(chapters))
# Site-specific metadata.
self.story.setMetadata('language', self.SITE_LANGUAGE)
@ -677,7 +678,7 @@ class Chapter(object):
def _excludeEditorSignature(self, root):
"""Exclude editor signature from within `root' element."""
for stringNode in root.find_all(string=True):
for stringNode in root.findAll(string=True):
if re.match(self.SIGNED_PATTERN, textNode.string):
editorLink = textNode.findNext('a')
if editorLink:

View file

@ -64,9 +64,7 @@ class MCStoriesComSiteAdapter(BaseSiteAdapter):
return "https://mcstories.com/StoryTitle/ https://mcstories.com/StoryTitle/index.html https://mcstories.com/StoryTitle/StoryTitle1.html"
def getSiteURLPattern(self):
## Note that this uses a regular expression *negative*
## lookahead--story URLs *can't* have /Titles/ /Authors/ etc.
return r"https?://(www\.)?mcstories\.com(?!/(Titles|Authors|Tags|ReadersPicks)/)/[a-zA-Z0-9_-]+/"
return r"https?://(www\.)?mcstories\.com/([a-zA-Z0-9_-]+)/"
def extractChapterUrlsAndMetadata(self):
"""

View file

@ -148,12 +148,12 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# category
# <a href="/fanfic/src.php/a/567">Ranma 1/2</a>
for a in soup.find_all('a',href=re.compile(r"^/fanfic/a/")):
for a in soup.findAll('a',href=re.compile(r"^/fanfic/a/")):
self.story.addToList('category',a.string)
# genre
# <a href="/fanfic/src.php/g/567">Ranma 1/2</a>
for a in soup.find_all('a',href=re.compile(r"^/fanfic/src.php/g/")):
for a in soup.findAll('a',href=re.compile(r"^/fanfic/src.php/g/")):
self.story.addToList('genre',a.string)
metasoup = soup.find("div",{"class":"post-meta"})

View file

@ -154,7 +154,7 @@ class MidnightwhispersAdapter(BaseSiteAdapter): # XXX
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'https://'+self.host+'/'+chapter['href']+addurl)
@ -170,7 +170,7 @@ class MidnightwhispersAdapter(BaseSiteAdapter): # XXX
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
@ -191,13 +191,13 @@ class MidnightwhispersAdapter(BaseSiteAdapter): # XXX
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
catstext = [cat.string for cat in cats]
for cat in catstext:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
charstext = [char.string for char in chars]
for char in charstext:
self.story.addToList('characters',char.string)
@ -206,7 +206,7 @@ class MidnightwhispersAdapter(BaseSiteAdapter): # XXX
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
genrestext = [genre.string for genre in genres]
self.genre = ', '.join(genrestext)
for genre in genrestext:
@ -216,7 +216,7 @@ class MidnightwhispersAdapter(BaseSiteAdapter): # XXX
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Warnings' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warningstext = [warning.string for warning in warnings]
self.warning = ', '.join(warningstext)
for warning in warningstext:
@ -243,7 +243,7 @@ class MidnightwhispersAdapter(BaseSiteAdapter): # XXX
series_url = 'https://'+self.host+'/'+a['href']
seriessoup = self.make_soup(self.get_request(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
# skip 'report this' and 'TOC' links

View file

@ -195,7 +195,7 @@ class LightNovelGateSiteAdapter(BaseSiteAdapter):
[a.extract() for a in story.find_all('a')]
# Some tags have non-standard tag name.
for tag in story.find_all(recursive=True):
for tag in story.findAll(recursive=True):
if tag.name not in HTML_TAGS:
tag.name = 'span'

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2024 FanFicFare team
# Copyright 2020 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -15,24 +15,34 @@
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
from .base_otw_adapter import BaseOTWAdapter
from .adapter_wuxiaworldxyz import WuxiaWorldXyzSiteAdapter
def getClass():
return CFAAAdapter
return NovelUpdatesCcSiteAdapter
class CFAAAdapter(BaseOTWAdapter):
class NovelUpdatesCcSiteAdapter(WuxiaWorldXyzSiteAdapter):
DATE_FORMAT = '%Y-%m-%d %H:%M'
def __init__(self, config, url):
BaseOTWAdapter.__init__(self, config, url)
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','cfaa')
WuxiaWorldXyzSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev', 'nucc')
@staticmethod # must be @staticmethod, don't remove it.
@staticmethod
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'www.cfaarchive.org'
return 'www.novelupdates.cc'
@classmethod
def getAcceptDomains(cls):
return ['www.novelupdates.cc','m.novelupdates.cc']
@classmethod
def getSiteExampleURLs(cls):
return 'https://%s/story-name' % cls.getSiteDomain()
def getSiteURLPattern(self):
return r'https?://(www|m)\.novelupdates\.cc/(?P<id>[^/]+)(/)?'

View file

@ -137,14 +137,14 @@ class OcclumencySycophantHexComAdapter(BaseSiteAdapter):
try:
# in case link points somewhere other than the first chapter
a = soup.find_all('option')[1]['value']
a = soup.findAll('option')[1]['value']
self.story.setMetadata('storyId',a.split('=',)[1])
url = 'http://'+self.host+'/'+a
soup = self.make_soup(self.get_request(url))
except:
pass
for info in asoup.find_all('table', {'class' : 'border'}):
for info in asoup.findAll('table', {'class' : 'border'}):
a = info.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
if a != None:
self.story.setMetadata('title',stripHTML(a))
@ -152,7 +152,7 @@ class OcclumencySycophantHexComAdapter(BaseSiteAdapter):
# Find the chapters:
chapters=soup.find_all('a', href=re.compile(r'viewstory.php\?sid=\d+&i=1$'))
chapters=soup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+&i=1$'))
if len(chapters) == 0:
self.add_chapter(self.story.getMetadata('title'),url)
else:
@ -171,7 +171,7 @@ class OcclumencySycophantHexComAdapter(BaseSiteAdapter):
except:
return ""
cats = info.find_all('a',href=re.compile('categories.php'))
cats = info.findAll('a',href=re.compile('categories.php'))
for cat in cats:
self.story.addToList('category',cat.string)
@ -188,7 +188,7 @@ class OcclumencySycophantHexComAdapter(BaseSiteAdapter):
self.setDescription(url,svalue)
# <span class="label">Rated:</span> NC-17<br /> etc
labels = info.find_all('b')
labels = info.findAll('b')
for labelspan in labels:
value = labelspan.nextSibling
label = stripHTML(labelspan)

View file

@ -93,26 +93,26 @@ class PhoenixSongNetAdapter(BaseSiteAdapter):
chapters = soup.find('select')
if chapters == None:
self.add_chapter(self.story.getMetadata('title'),url)
for b in soup.find_all('b'):
for b in soup.findAll('b'):
if b.text == "Updated":
date = b.nextSibling.string.split(': ')[1].split(',')
self.story.setMetadata('datePublished', makeDate(date[0]+date[1], self.dateformat))
self.story.setMetadata('dateUpdated', makeDate(date[0]+date[1], self.dateformat))
else:
i = 0
chapters = chapters.find_all('option')
chapters = chapters.findAll('option')
for chapter in chapters:
self.add_chapter(chapter,'https://'+self.host+chapter['value'])
if i == 0:
self.story.setMetadata('storyId',chapter['value'].split('/')[3])
head = self.make_soup(self.get_request('https://'+self.host+chapter['value'])).find_all('b')
head = self.make_soup(self.get_request('https://'+self.host+chapter['value'])).findAll('b')
for b in head:
if b.text == "Updated":
date = b.nextSibling.string.split(': ')[1].split(',')
self.story.setMetadata('datePublished', makeDate(date[0]+date[1], self.dateformat))
if i == (len(chapters)-1):
head = self.make_soup(self.get_request('https://'+self.host+chapter['value'])).find_all('b')
head = self.make_soup(self.get_request('https://'+self.host+chapter['value'])).findAll('b')
for b in head:
if b.text == "Updated":
date = b.nextSibling.string.split(': ')[1].split(',')
@ -160,20 +160,20 @@ class PhoenixSongNetAdapter(BaseSiteAdapter):
soup = self.make_soup(self.get_request(url))
chapter=self.make_soup('<div class="story"></div>')
for p in soup.find_all(['p','blockquote']):
for p in soup.findAll(['p','blockquote']):
if "This is for problems with the formatting or the layout of the chapter." in stripHTML(p):
break
chapter.append(p)
for a in chapter.find_all('div'):
for a in chapter.findAll('div'):
a.extract()
for a in chapter.find_all('table'):
for a in chapter.findAll('table'):
a.extract()
for a in chapter.find_all('script'):
for a in chapter.findAll('script'):
a.extract()
for a in chapter.find_all('form'):
for a in chapter.findAll('form'):
a.extract()
for a in chapter.find_all('textarea'):
for a in chapter.findAll('textarea'):
a.extract()

View file

@ -0,0 +1,241 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Software: eFiction
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import re
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
# py2 vs py3 transition
from ..six import text_type as unicode
from .base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Adapter-discovery entry point: return this module's adapter class."""
    return PonyFictionArchiveNetAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class PonyFictionArchiveNetAdapter(BaseSiteAdapter):
    """FanFicFare adapter for ponyfictionarchive.net (an eFiction-based site).

    Handles both the main site and the explicit.ponyfictionarchive.net
    mirror; the two differ in URL scheme prefix and date format.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        # set True once the user passes the adult-content gate.
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        # NOTE: the explicit. mirror uses a different date format than the
        # main site, so dateformat is chosen here alongside the URL.
        if "explicit" in self.parsedUrl.netloc:
            self._setURL('https://explicit.' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
            self.dateformat = "%d/%b/%y"
        else:
            self._setURL('https://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
            self.dateformat = "%d %b %Y"

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','pffa')

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'ponyfictionarchive.net'

    @classmethod
    def getAcceptDomains(cls):
        # All host variants this adapter claims, including the explicit mirror.
        return ['www.ponyfictionarchive.net','ponyfictionarchive.net','explicit.ponyfictionarchive.net']

    @classmethod
    def getSiteExampleURLs(cls):
        # Space-separated example URLs shown to users.
        return "https://"+cls.getSiteDomain()+"/viewstory.php?sid=1234 https://explicit."+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        # Accept http/https, optional www. or explicit. prefix, numeric sid only.
        return r"https?://(www\.|explicit\.)?"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page, populate story metadata and the
        chapter list.

        Raises AdultCheckRequired when the site shows an adult-content
        warning and is_adult isn't set; raises AccessDenied for
        unvalidated stories.
        """

        if self.is_adult or self.getConfig("is_adult"):
            # Weirdly, different sites use different warning numbers.
            # If the title search below fails, there's a good chance
            # you need a different number.  print data at that point
            # and see what the 'click here to continue' url says.
            addurl = "&warning=9"
        else:
            addurl=""

        # index=1 makes sure we see the story chapter index.  Some
        # sites skip that for one-chapter stories.
        url = self.url+'&index=1'+addurl
        logger.debug("URL: "+url)

        data = self.get_request(url)

        # Detect the site's 'click to continue' adult-warning link.
        m = re.search(r"'viewstory.php\?sid=\d+((?:&amp;ageconsent=ok)?&amp;warning=\d+)'",data)
        if m != None:
            if self.is_adult or self.getConfig("is_adult"):
                # We tried the default and still got a warning, so
                # let's pull the warning number from the 'continue'
                # link and reload data.
                addurl = m.group(1)
                # correct stupid &amp; error in url.
                addurl = addurl.replace("&amp;","&")
                url = self.url+'&index=1'+addurl
                logger.debug("URL 2nd try: "+url)
                data = self.get_request(url)
            else:
                raise exceptions.AdultCheckRequired(self.url)

        # (sic) 'adminstrators' matches the site's own misspelling -- do not fix.
        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.AccessDenied(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")

        soup = self.make_soup(data)
        # print data

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','https://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.add_chapter(chapter,'https://'+self.host+'/'+chapter['href']+addurl)

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method: like d[k] but returns "" instead of raising,
        # used below for tags that may lack a 'class' attribute.
        def defaultGetattr(d,k):
            try:
                return d[k]
            except:
                return ""

        genres = soup.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
        for genre in genres:
            self.story.addToList('genre',genre.string)

        warnings = soup.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=3'))
        for warning in warnings:
            self.story.addToList('warnings',warning.string)

        status = soup.find('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
        if status: # apparently this site can have stories with neither In-Progress or Complete.
            self.story.setMetadata('status',status.string)

        try:
            # explicit.site and .site have some differences now...
            # Primary path: rating sits before the second 'General' span;
            # description is the sibling run up to the next 'label' span.
            section = soup.findAll('span', {'class' : 'General'})[1]
            self.story.setMetadata('rating', section.previousSibling.previousSibling.string)
            value = section.nextSibling
            svalue = ""
            while 'label' not in defaultGetattr(value,'class'):
                svalue += unicode(value)
                value = value.nextSibling
            self.setDescription(url,svalue)
        except:
            # Fallback path: scrape rating/description from the raw HTML.
            # find rating in data
            # <br /> &bull; Mature &bull; <br />
            lead = "<br /> &bull; "
            trail = " &bull; <br />"
            rating = data[data.index(lead)+len(lead):data.index(trail)]
            if len(rating)<20: # minor sanity check.
                self.story.setMetadata('rating',rating)
            descstr = data[data.index(trail)+len(trail):] # from desc on
            descstr = descstr[:descstr.index('<span class="label">')] # remove after desc.
            self.setDescription(url,descstr)

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Word count' in label:
                self.story.setMetadata('numWords', value)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'https://'+self.host+'/'+a['href']

            seriessoup = self.make_soup(self.get_request(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
            for a in storyas:
                # skip 'report this' and 'TOC' links
                if 'contact.php' not in a['href'] and 'index' not in a['href']:
                    if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl',series_url)
                        break
                    i+=1
        except:
            # I find it hard to care if the series parsing fails
            pass

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return its story div as cleaned HTML."""
        logger.debug('Getting chapter text from: %s' % url)
        soup = self.make_soup(self.get_request(url))

        div = soup.find('div', {'id' : 'story'})

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -80,7 +80,7 @@ class PotionsAndSnitchesOrgSiteAdapter(BaseSiteAdapter):
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'http://'+self.host+'/fanfiction/'+chapter['href'])
@ -92,7 +92,7 @@ class PotionsAndSnitchesOrgSiteAdapter(BaseSiteAdapter):
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
@ -116,13 +116,13 @@ class PotionsAndSnitchesOrgSiteAdapter(BaseSiteAdapter):
self.story.setMetadata('reads', value)
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
catstext = [cat.string for cat in cats]
for cat in catstext:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
charstext = [char.string for char in chars]
for char in charstext:
if "Snape and Harry (required)" in char:
@ -132,27 +132,27 @@ class PotionsAndSnitchesOrgSiteAdapter(BaseSiteAdapter):
self.story.addToList('characters',char.string)
if 'Warning' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class'))
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
for warning in warnings:
self.story.addToList('warnings',stripHTML(warning))
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class'))
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
for genre in genres:
self.story.addToList('genre',stripHTML(genre))
if 'Takes Place' in label:
takesplaces = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class'))
takesplaces = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
for takesplace in takesplaces:
self.story.addToList('takesplaces',stripHTML(takesplace))
if 'Snape flavour' in label:
snapeflavours = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class'))
snapeflavours = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
for snapeflavour in snapeflavours:
self.story.addToList('snapeflavours',stripHTML(snapeflavour))
if 'Tags' in label:
sitetags = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class'))
sitetags = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
for sitetag in sitetags:
self.story.addToList('sitetags',stripHTML(sitetag))
@ -176,7 +176,7 @@ class PotionsAndSnitchesOrgSiteAdapter(BaseSiteAdapter):
series_url = 'http://'+self.host+'/fanfiction/'+a['href']
seriessoup = self.make_soup(self.get_request(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):

View file

@ -121,7 +121,7 @@ class PretenderCenterComAdapter(BaseSiteAdapter):
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'https://'+self.host+'/missingpieces/'+chapter['href']+addurl)
@ -138,7 +138,7 @@ class PretenderCenterComAdapter(BaseSiteAdapter):
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
@ -159,22 +159,22 @@ class PretenderCenterComAdapter(BaseSiteAdapter):
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
for warning in warnings:
self.story.addToList('warnings',warning.string)
@ -198,7 +198,7 @@ class PretenderCenterComAdapter(BaseSiteAdapter):
seriessoup = self.make_soup(self.get_request(series_url))
# can't use ^viewstory...$ in case of higher rated stories with javascript href.
storyas = seriessoup.find_all('a', href=re.compile(r'viewstory.php\?sid=\d+'))
storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
i=1
for a in storyas:
# skip 'report this' and 'TOC' links

View file

@ -111,7 +111,7 @@ class PsychFicComAdapter(BaseSiteAdapter):
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'http://'+self.host+'/'+chapter['href']+addurl)
@ -126,7 +126,7 @@ class PsychFicComAdapter(BaseSiteAdapter):
except:
return ""
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
@ -147,22 +147,22 @@ class PsychFicComAdapter(BaseSiteAdapter):
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
for cat in cats:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
for char in chars:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1'))
for genre in genres:
self.story.addToList('genre',genre.string)
if 'Warnings' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2'))
for warning in warnings:
self.story.addToList('warnings',warning.string)
@ -186,7 +186,7 @@ class PsychFicComAdapter(BaseSiteAdapter):
series_url = 'http://'+self.host+'/'+a['href']
seriessoup = self.make_soup(self.get_request(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):

View file

@ -104,42 +104,6 @@ class RoyalRoadAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return "https?"+re.escape("://")+r"(www\.|)royalroadl?\.com/fiction/\d+(/.*)?$"
# rr won't send you future updates if you aren't 'caught up'
# on the story. Login isn't required but logging in will
# mark stories you've downloaded as 'read' on rr.
def performLogin(self):
params = {}
if self.password:
params['Email'] = self.username
params['password'] = self.password
else:
params['Email'] = self.getConfig("username")
params['password'] = self.getConfig("password")
if not params['password']:
return
loginUrl = 'https://' + self.getSiteDomain() + '/account/login'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['Email']))
## need to pull empty login page first to get request token
soup = self.make_soup(self.get_request(loginUrl))
## FYI, this will fail if cookiejar is shared, but
## use_basic_cache is false.
params['__RequestVerificationToken']=soup.find('input', {'name':'__RequestVerificationToken'})['value']
d = self.post_request(loginUrl, params)
if "Sign in" in d : #Member Account
logger.info("Failed to login to URL %s as %s (requires Email not name)" % (loginUrl,
params['Email']))
raise exceptions.FailedToLogin(self.url,"Failed to login as %s (RoyalRoad requires Email not name)" % params['Email'])
return False
else:
return True
## RR chapter URL only requires the chapter ID number field to be correct, story ID and title values are ignored
## URL format after the domain /fiction/ is long form, storyID/storyTitle/chapter/chapterID/chapterTitle
## short form has /fiction/chapter/chapterID both forms have optional final /
@ -155,18 +119,8 @@ class RoyalRoadAdapter(BaseSiteAdapter):
return self.chapterUrls[chapter_url_index]['url']
return url
def make_soup(self, data):
def make_soup(self,data):
soup = super(RoyalRoadAdapter, self).make_soup(data)
# Parse and store styles in a set
self.styles_to_ignore = set()
style_elements = soup.find_all('style')
for style_element in style_elements:
class_matches = re.findall(r'\.(\S+)\s*\{[^\}]*display\s*:\s*none\s*;[^\}]*\}', style_element.string, flags=re.IGNORECASE)
if class_matches:
self.styles_to_ignore.update(class_matches)
del class_matches
self.handle_spoilers(soup)
return soup
@ -196,9 +150,6 @@ class RoyalRoadAdapter(BaseSiteAdapter):
url = self.url
logger.debug("URL: "+url)
# Log in so site will mark the chapers as read
self.performLogin()
data = self.get_request(url)
soup = self.make_soup(data)
@ -226,7 +177,7 @@ class RoyalRoadAdapter(BaseSiteAdapter):
chapters = soup.find('table',{'id':'chapters'}).find('tbody')
tds = [tr.find_all('td') for tr in chapters.find_all('tr')]
tds = [tr.findAll('td') for tr in chapters.findAll('tr')]
if not tds:
raise exceptions.FailedToDownload(
@ -266,8 +217,6 @@ class RoyalRoadAdapter(BaseSiteAdapter):
self.story.setMetadata('status', 'Stub')
elif 'DROPPED' == label:
self.story.setMetadata('status', 'Dropped')
elif 'INACTIVE' == label:
self.story.setMetadata('status', 'Inactive')
elif 'Fan Fiction' == label:
self.story.addToList('category', 'FanFiction')
elif 'Original' == label:
@ -289,8 +238,7 @@ class RoyalRoadAdapter(BaseSiteAdapter):
if img:
cover_url = img['src']
# usually URL is for thumbnail. Try expected URL for larger image, if fails fall back to the original URL
cover_set = self.setCoverImage(url,cover_url.replace('/covers-full/', '/covers-large/'))[0]
if not cover_set or cover_set.startswith("failedtoload"):
if self.setCoverImage(url,cover_url.replace('/covers-full/', '/covers-large/'))[0] == "failedtoload":
self.setCoverImage(url,cover_url)
# some content is show as tables, this will preserve them
@ -331,10 +279,5 @@ class RoyalRoadAdapter(BaseSiteAdapter):
if endnote:
# move endnote into chapter text div.
div.append(endnote.extract())
def has_display_none_style(tag):
tag_class = tag.get('class', '')
return any(style in tag_class for style in self.styles_to_ignore)
for element in div.find_all(has_display_none_style):
element.extract()
return self.utf8FromSoup(url,div)

View file

@ -193,7 +193,7 @@ class SamAndJackNetAdapter(BaseSiteAdapter): # XXX
# Find authorid and URL from... author url.
# (fetch multiple authors)
alist = soup.find_all('a', href=re.compile(r"viewuser.php\?uid=\d+"))
alist = soup.findAll('a', href=re.compile(r"viewuser.php\?uid=\d+"))
for a in alist:
self.story.addToList('authorId',a['href'].split('=')[1])
self.story.addToList('authorUrl','http://'+self.host+'/fanfics/'+a['href'])
@ -201,11 +201,11 @@ class SamAndJackNetAdapter(BaseSiteAdapter): # XXX
# Reviews
reviewdata = soup.find('div', {'id' : 'sort'})
a = reviewdata.find_all('a', href=re.compile(r'reviews.php\?type=ST&(amp;)?item='+self.story.getMetadata('storyId')+"$"))[1] # second one.
a = reviewdata.findAll('a', href=re.compile(r'reviews.php\?type=ST&(amp;)?item='+self.story.getMetadata('storyId')+"$"))[1] # second one.
self.story.setMetadata('reviews',stripHTML(a))
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'http://'+self.host+'/fanfics/'+chapter['href']+addurl)
@ -222,7 +222,7 @@ class SamAndJackNetAdapter(BaseSiteAdapter): # XXX
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
@ -237,13 +237,13 @@ class SamAndJackNetAdapter(BaseSiteAdapter): # XXX
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
catstext = [cat.string for cat in cats]
for cat in catstext:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
charstext = [char.string for char in chars]
for char in charstext:
self.story.addToList('characters',char.string)
@ -252,7 +252,7 @@ class SamAndJackNetAdapter(BaseSiteAdapter): # XXX
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
genrestext = [genre.string for genre in genres]
self.genre = ', '.join(genrestext)
for genre in genrestext:
@ -262,7 +262,7 @@ class SamAndJackNetAdapter(BaseSiteAdapter): # XXX
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Warnings' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warningstext = [warning.string for warning in warnings]
self.warning = ', '.join(warningstext)
for warning in warningstext:
@ -291,7 +291,7 @@ class SamAndJackNetAdapter(BaseSiteAdapter): # XXX
series_url = 'http://'+self.host+'/fanfics/'+a['href']
seriessoup = self.make_soup(self.get_request(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):

View file

@ -57,9 +57,16 @@ class ScribbleHubComAdapter(BaseSiteAdapter): # XXX
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
self.is_adult=False
self.urltitle = "some-title"
self.set_story_idurl(url)
m = re.match(self.getSiteURLPattern(),url)
# logger.debug("id:%s"%m.group('id'))
# logger.debug("title:%s"%m.group('title'))
# get storyId from url
self.story.setMetadata('storyId', m.group('id'))
# normalized story URL.
self._setURL('https://' + self.getSiteDomain() + '/series/' + self.story.getMetadata('storyId') + '/' + m.group('title') + '/')
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','scrhub') # XXX
@ -68,19 +75,6 @@ class ScribbleHubComAdapter(BaseSiteAdapter): # XXX
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%b %d, %Y" # XXX
def set_story_idurl(self,url):
m = re.match(self.getSiteURLPattern(),url)
# logger.debug("id:%s"%m.group('id'))
# logger.debug("urltitle:%s"%m.group('urltitle'))
# get storyId from url
self.story.setMetadata('storyId', m.group('id'))
if m.group('urltitle'):
self.urltitle = m.group('urltitle')
# logger.debug("urltitle:%s"%self.urltitle)
# normalized story URL.
self._setURL('https://' + self.getSiteDomain() + '/series/' + self.story.getMetadata('storyId') + '/' + self.urltitle + '/')
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
@ -94,44 +88,8 @@ class ScribbleHubComAdapter(BaseSiteAdapter): # XXX
return "https://"+cls.getSiteDomain()+"/series/1234/storyname/"
def getSiteURLPattern(self):
return self._get_site_url_pattern()
## here so getSiteURLPattern and get_section_url(class method) can
## both use it. Note adapter_fictionpresscom has one too.
@classmethod
def _get_site_url_pattern(cls):
return re.escape("https://"+cls.getSiteDomain())+r"/(series|read)/(?P<id>\d+)([/-](?P<urltitle>[^/]+))?"
@classmethod
def get_section_url(cls,url):
## minimal URL used for section names in INI and reject list
## for comparison
# logger.debug("pre section--url:%s"%url)
m = re.match(cls._get_site_url_pattern(),url)
if m:
url = "https://"+cls.getSiteDomain()\
+"/series/"+m.group('id')+"/a-title/"
# logger.debug("post-section url:%s"%url)
return url
@classmethod
def get_url_search(cls,url):
regexp = super(getClass(), cls).get_url_search(url)
regexp = re.sub(r"^(?P<keep>.*com/series/\d+/)(?P<urltitle>[^$]*)?",
r"\g<keep>(.*)",regexp)
logger.debug(regexp)
return regexp
## normalized chapter URLs DO contain the story title now, but
## normalized to current urltitle in case of title changes.
def normalize_chapterurl(self,url):
# https://www.scribblehub.com/read/862913-hp-the-arcane-thief-litrpg/chapter/1175961/
# logger.debug("pre normal chapter--url:%s"%url)
url = re.sub(r"https?://(?P<keep>www\.scribblehub\.com/read/\d+-).*(?P<chapter>/chapter/\d+/)",
(r"https://\g<keep>"+self.urltitle+r"\g<chapter>"),url)
# logger.debug("post normal chapter-url:%s"%url)
return url
return re.escape("https://"+self.getSiteDomain())+r"/(series|read)/(?P<id>\d+)[/-](?P<title>[^/]+)"
def post_request(self, url,
parameters=None,
usecache=True):
@ -139,8 +97,8 @@ class ScribbleHubComAdapter(BaseSiteAdapter): # XXX
return super(getClass(), self).post_request(url, parameters, usecache)
except exceptions.HTTPErrorFFF as e:
## this is a fix for the scribblehub ajax request sometimes returning
# a 400 but only with flaresolverr. Have not been able to reproduce
# in curl/firefox. See: https://github.com/JimmXinu/FanFicFare/pull/900
# a 400 but only with flaresolverr. Have not been able to reproduce
# in curl/firefox. See: https://github.com/JimmXinu/FanFicFare/pull/900
logger.debug("HTTPErrorFFF/Scribblehub: " + str(e.status_code))
if e.status_code == 400 and self.getConfig('use_flaresolverr_proxy'):
return self.decode_data(e.data)
@ -178,15 +136,11 @@ class ScribbleHubComAdapter(BaseSiteAdapter): # XXX
soup = self.make_soup(data)
## Title
pagetitle = soup.find('div',{'class':'fic_title'})
self.story.setMetadata('title',stripHTML(pagetitle))
## <link rel="canonical" href="https://www.scribblehub.com/series/862913/hp-the-arcane-thief-litrpg/" />
canonicalurl = soup.select_one('link[rel=canonical]')['href']
self.set_story_idurl(canonicalurl)
url = canonicalurl
# Find authorid and URL from main story page
self.story.setMetadata('authorId',stripHTML(soup.find('span',{'class':'auth_name_fic'})))
self.story.setMetadata('authorUrl',soup.find('div',{'class':'author'}).find('div',{'property':'author'}).find('span',{'property':'name'}).find('a').get('href'))
@ -197,20 +151,33 @@ class ScribbleHubComAdapter(BaseSiteAdapter): # XXX
# Get the contents list from scribblehub, iterate through and add to chapters
# Can be fairly certain this will not 404 - we know the story id is valid
contents_payload = {"action": "wi_getreleases_pagination",
"pagenum": -1,
"mypostid": self.story.getMetadata('storyId')}
contents_payload = {"action": "wi_gettocchp",
"strSID": self.story.getMetadata('storyId'),
"strmypostid": 0,
"strFic": "yes"}
# 14/12/22 - Looks like it should follow this format now (below), but still returns a 400
# but not a 403. tested in browser getting rid of all other cookies to try and get a 400 and nopes.
# contents_payload = {"action": "wi_getreleases_pagination",
# "pagenum": 1,
# "mypostid": 421879}
# contents_payload = "action=wi_getreleases_pagination&pagenum=1&mypostid=421879"
contents_data = self.post_request("https://www.scribblehub.com/wp-admin/admin-ajax.php", contents_payload)
# logger.debug(contents_data)
contents_soup = self.make_soup(contents_data)
for toca in contents_soup.select('a.toc_a'):
chapter_url = toca['href']
chapter_name = stripHTML(toca)
# logger.debug("Found Chapter: " + chapter_name + ", url: " + chapter_url)
for i in range(1, int(contents_soup.find('ol',{'id':'ol_toc'}).get('count')) + 1):
chapter_url = contents_soup.find('li',{'cnt':str(i)}).find('a').get('href')
chapter_name = contents_soup.find('li',{'cnt':str(i)}).find('a').get('title')
# logger.debug("Found Chapter " + str(i) + ", name: " + chapter_name + ", url: " + chapter_url)
self.add_chapter(chapter_name, chapter_url)
# eFiction sites don't help us out a lot with their meta data
# formating, so it's a little ugly.
# utility method
def defaultGetattr(d,k):
try:
@ -227,13 +194,13 @@ class ScribbleHubComAdapter(BaseSiteAdapter): # XXX
# Categories
if soup.find('span',{'class': 'wi_fic_showtags_inner'}):
categories = soup.find('span',{'class': 'wi_fic_showtags_inner'}).find_all('a')
categories = soup.find('span',{'class': 'wi_fic_showtags_inner'}).findAll('a')
for category in categories:
self.story.addToList('category', stripHTML(category))
# Genres
if soup.find('a',{'class': 'fic_genre'}):
genres = soup.find_all('a',{'class': 'fic_genre'})
genres = soup.findAll('a',{'class': 'fic_genre'})
for genre in genres:
self.story.addToList('genre', stripHTML(genre))
@ -245,7 +212,7 @@ class ScribbleHubComAdapter(BaseSiteAdapter): # XXX
# Content Warnings
if soup.find('ul',{'class': 'ul_rate_expand'}):
warnings = soup.find('ul',{'class': 'ul_rate_expand'}).find_all('a')
warnings = soup.find('ul',{'class': 'ul_rate_expand'}).findAll('a')
for warn in warnings:
self.story.addToList('warnings', stripHTML(warn))
@ -299,7 +266,7 @@ class ScribbleHubComAdapter(BaseSiteAdapter): # XXX
self.story.setMetadata(metadata, stripHTML(row.find('td')))
if soup.find('table',{'class': 'table_pro_overview'}):
stats_table = soup.find('table',{'class': 'table_pro_overview'}).find_all('tr')
stats_table = soup.find('table',{'class': 'table_pro_overview'}).findAll('tr')
for row in stats_table:
find_stats_data("Total Views (All)", row, "views")
find_stats_data("Word Count", row, "numWords")

View file

@ -171,7 +171,7 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
# Find authorid and URL from... author url.
# (fetch multiple authors)
alist = soup.find_all('a', href=re.compile(r"viewuser.php\?uid=\d+"))
alist = soup.findAll('a', href=re.compile(r"viewuser.php\?uid=\d+"))
for a in alist:
self.story.addToList('authorId',a['href'].split('=')[1])
self.story.addToList('authorUrl','https://'+self.host+'/fanfics/'+a['href'])
@ -180,12 +180,12 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
# Reviews
reviewdata = soup.find('div', {'id' : 'sort'})
a = reviewdata.find_all('a', href=re.compile(r'reviews.php\?type=ST&(amp;)?item='+self.story.getMetadata('storyId')+"$"))[1] # second one.
a = reviewdata.findAll('a', href=re.compile(r'reviews.php\?type=ST&(amp;)?item='+self.story.getMetadata('storyId')+"$"))[1] # second one.
self.story.setMetadata('reviews',stripHTML(a))
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'https://'+self.host+'/fanfics/'+chapter['href']+addurl)
@ -208,7 +208,7 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
self.setDescription(url,self.make_soup(summarydata))
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
@ -220,13 +220,13 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
catstext = [cat.string for cat in cats]
for cat in catstext:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
charstext = [char.string for char in chars]
for char in charstext:
self.story.addToList('characters',char.string)
@ -235,7 +235,7 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
genrestext = [genre.string for genre in genres]
self.genre = ', '.join(genrestext)
for genre in genrestext:
@ -245,7 +245,7 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Warnings' in label:
warnings = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warningstext = [warning.string for warning in warnings]
self.warning = ', '.join(warningstext)
for warning in warningstext:
@ -273,7 +273,7 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
series_url = 'https://'+self.host+'/fanfics/'+a['href']
seriessoup = self.make_soup(self.get_request(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):

View file

@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Software: eFiction
from __future__ import absolute_import
from .base_efiction_adapter import BaseEfictionAdapter
class SinfulDreamsComWhisperedMuse(BaseEfictionAdapter):
    """eFiction adapter for the 'Whispered Muse' archive hosted under
    sinful-dreams.com/whispered/muse.

    All behavior comes from BaseEfictionAdapter; this subclass only
    supplies the site-specific constants below.
    """

    @staticmethod
    def getSiteDomain():
        """Return the bare domain this adapter handles."""
        return 'sinful-dreams.com'

    # NOTE: parameter renamed self -> cls for @classmethod correctness;
    # callers are unaffected (the class is still passed implicitly).
    @classmethod
    def getPathToArchive(cls):
        """Return the URL path of this archive on the shared domain."""
        return '/whispered/muse'

    @classmethod
    def getConfigSection(cls):
        "Overridden because [domain/path] section for multiple-adapter domain."
        return cls.getSiteDomain()+cls.getPathToArchive()

    @classmethod
    def getSiteAbbrev(cls):
        """Return the unique site abbreviation used in story metadata."""
        return 'snfldrms-wm'

    @classmethod
    def getDateFormat(cls):
        """Return the strptime format this archive uses for dates."""
        return "%m/%d/%Y"
def getClass():
    """Entry point used by the adapter loader: return this module's adapter."""
    adapter_class = SinfulDreamsComWhisperedMuse
    return adapter_class

View file

@ -109,7 +109,7 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
self.story.setMetadata('title',stripHTML(titlea))
# Find the chapters (from soup, not authsoup):
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'https://'+self.host+'/siye/'+chapter['href'])
@ -121,7 +121,7 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
metatable = soup.find('table',{'width':'95%'})
# Categories
cat_as = metatable.find_all('a', href=re.compile(r'categories.php'))
cat_as = metatable.findAll('a', href=re.compile(r'categories.php'))
for cat_a in cat_as:
self.story.addToList('category',stripHTML(cat_a))
@ -209,7 +209,7 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
series_url = 'https://'+self.host+'/'+a['href']
seriessoup = self.make_soup(self.get_request(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):

View file

@ -1,393 +0,0 @@
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import re
from bs4 import BeautifulSoup
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
# py2 vs py3 transition
from ..six import PY3, text_type as unicode
from .base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Entry point used by the adapter loader: return this module's adapter."""
    adapter_class = SpiritFanfictionComAdapter
    return adapter_class
class SpiritFanfictionComAdapter(BaseSiteAdapter):
    """Adapter for www.spiritfanfiction.com (also accepts
    www.socialspirit.com.br).

    Scrapes story metadata and chapter text from the site's HTML;
    supports optional login for restricted content and decodes
    Cloudflare-style obfuscated email addresses embedded in pages.
    """

    def __init__(self, config, url):
        """Normalize the story URL, record the story id and site
        abbreviation, and pick a date format appropriate to the
        Python version."""
        BaseSiteAdapter.__init__(self, config, url)

        # get storyId from url--url validation guarantees query is only sid=1234
        self.storyId = unicode(self.getStoryId(url))
        self.story.setMetadata('storyId', self.storyId)

        # normalized story URL -- always https://<domain>/historia/<id>
        self._setURL('https://' + self.getSiteDomain() + '/historia/'+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev',self.getSiteAbbrev())

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        if PY3:
            self.dateformat = "%Y-%m-%dT%H:%M:%S%z"
            self.datelength = len("2015-04-15T22:16:15-03:00")
        else:
            ## python 2 had really poor timezone support and doesn't
            ## recognize %z. This is a somewhat cheesy way to ignore
            ## the -/+dddd timezone when under py2.
            self.dateformat = "%Y-%m-%dT%H:%M:%S"
            self.datelength = len("2015-04-15T22:16:15")

        # cache of per-chapter image URLs (populated elsewhere, if at all)
        self.chapter_photoUrl = {}

    @staticmethod
    def getSiteDomain():
        """Return the primary domain for this adapter."""
        return 'www.spiritfanfiction.com'

    @classmethod
    def getAcceptDomains(cls):
        """Return all domains whose URLs this adapter accepts."""
        return ['www.spiritfanfiction.com',
                'www.socialspirit.com.br',
                ]

    @classmethod
    def getSiteExampleURLs(cls):
        #Accepted formats
        #https://www.spiritfanfiction.com/historia/1234
        #https://www.spiritfanfiction.com/historia/story-name-1234
        return "https://"+cls.getSiteDomain()+"/historia/story-name-1234 https://"+cls.getSiteDomain()+"/historia/1234"

    # NOTE(review): declared @classmethod but the parameter is named
    # 'self' -- it actually receives the class; confirm before renaming.
    @classmethod
    def getSiteURLPattern(self):
        """Return a regex matching story URLs on any accepted domain;
        the trailing numeric segment is captured as 'storyId'."""
        #logger.debug(r"https?://(" + r"|".join([x.replace('.','\.') for x in self.getAcceptDomains()]) + r")/historia/(?:[a-zA-Z0-9-]+-)?(?P<storyId>\d+)")
        return r"https?://(" + r"|".join([x.replace('.',r'\.') for x in self.getAcceptDomains()]) + r")/historia/(?:[a-zA-Z0-9-]+-)?(?P<storyId>\d+)"

    @classmethod
    def getSiteAbbrev(cls):
        """Return the unique site abbreviation used in story metadata."""
        return 'spirit'

    # Login
    def needToLoginCheck(self, data):
        """Return True when the fetched page text contains the site's
        not-logged-in markers ('nao-logado' / 'Acessar sua Conta')."""
        if 'nao-logado' in data or 'Acessar sua Conta' in data:
            return True
        return False

    def performLogin(self, url, data):
        """Log in with the configured username/password.

        Fetches the login form to pick up the hidden SessionHash and
        ReturnUrl fields, then posts the credentials.  Returns True on
        success; raises exceptions.FailedToLogin when the response
        still shows the not-logged-in markers.
        """
        params = {}
        params['Usuario'] = self.getConfig("username")
        params['Senha'] = self.getConfig("password")
        params['Login'] = 'Fazer Login'

        login_url = 'https://' + self.getSiteDomain() + '/login'
        logger.info("Will now login to URL (%s) as (%s)" % (login_url,
                                                            params['Usuario']))

        # the form carries hidden anti-CSRF/session fields that must be
        # echoed back with the credentials
        login_page_html = self.get_request(login_url, usecache=False)
        login_page_soup = self.make_soup(login_page_html)
        session_input = login_page_soup.find('input', {'name': "SessionHash"})
        params['SessionHash'] = session_input['value'] if session_input else ""
        return_url_input = login_page_soup.find('input', {'name': 'ReturnUrl'})
        params['ReturnUrl'] = return_url_input['value'] if return_url_input else ""

        response_html = self.post_request(login_url, params)

        if 'nao-logado' in response_html or "Acessar sua Conta" in response_html:
            logger.info("Failed to login to URL %s as %s" % (login_url,
                                                             params['Usuario']))
            raise exceptions.FailedToLogin(login_url,params['Usuario'])
        else:
            return True

    def getStoryId(self, url):
        """Extract the numeric story id from *url*; raise
        InvalidStoryURL when the URL does not match the site pattern."""
        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(), url)
        if m:
            return m.group('storyId')
        else:
            raise exceptions.InvalidStoryURL(url, self.getSiteDomain(), self.getSiteExampleURLs())

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and populate title, authors, cover,
        chapter list, summary and the remaining metadata fields.

        Logs in first when the page shows the not-logged-in markers.
        """
        data = self.get_request(self.url)
        if self.needToLoginCheck(data):
            self.performLogin(self.url, data)
            data = self.get_request(self.url,usecache=False)
        soup = self.make_soup(data)

        # Title
        title = soup.find('h1', {'class':'tituloPrincipal'})
        self.story.setMetadata('title', stripHTML(title.find('strong')))

        # Authors
        # Find authorid and URL
        authors = (title.find_next('div', {'class':'left'})).find_all('span', {'class':'usuario'})
        for author in authors:
            # author id is the last path segment of the profile link
            self.story.addToList('authorId', author.find('a')['href'].split('/')[-1])
            self.story.addToList('authorUrl', author.find('a')['href'])
            self.story.addToList('author', stripHTML(author.find('strong')))

        # Cover image
        cover_img = soup.find('img', {'class':'imagemResponsiva'})
        if cover_img:
            self.setCoverImage(self.url, cover_img['src'])

        newestChapter = None
        self.newestChapterNum = None # save for comparing during update.
        # Find the chapters:
        chapters = soup.find_all('table', {'class':'listagemCapitulos espacamentoTop'})
        for chapter in chapters:
            for row in chapter.find_all('tr', {'class': 'listagem-textoBg1'}): # Find each row with chapter info
                a = row.find('a') # Chapter link
                # Datetime
                date = a.find_next('time')['datetime']
                # truncate the ISO timestamp to the length this Python
                # version's dateformat can parse (see __init__)
                chapterDate = makeDate(date[:self.datelength], self.dateformat).date()
                chapter_title = stripHTML(a.find('strong'))
                self.add_chapter(chapter_title, a.get('href'), {'date': chapterDate})
                if newestChapter == None or chapterDate > newestChapter:
                    newestChapter = chapterDate
                    self.newestChapterNum = self.story.getMetadata('numChapters')
        logger.debug('numChapters: (%s)', self.story.getMetadata('numChapters'))

        # Summary
        div_element = soup.find('div', {'class':'clearfix'})
        summary = div_element.find('div', class_='texto')
        # drop the literal "Sinopse:" (synopsis) label, keep the text
        strong_tag = summary.find('strong', text='Sinopse:')
        if strong_tag:
            strong_tag.decompose()
        self.decode_emails(summary)
        # replace obfuscated-email anchors with their plain-text address
        for a_tag in summary.find_all('a', {'data-cfemail': True}):
            email_text = a_tag.string
            a_tag.replace_with(email_text)
        full_text = unicode(summary)
        self.story.setMetadata('description', full_text)

        def parse_until_br(attribute, start_index, element_list):
            """Consume elements of *element_list* from *start_index*
            until a <br>, storing what is found into the story metadata
            field named *attribute*; returns the index just past the
            consumed span so the caller's scan can resume there."""
            # Initialize counter
            next_index = start_index
            for element in element_list[start_index:]:
                next_index += 1
                if element.name == 'br':
                    break
                elif element.name == 'strong':
                    if attribute == 'status':
                        # site shows "Sim"/"Não" (yes/no) for completed
                        if element.contents[0].text == 'Sim':
                            self.story.setMetadata(attribute, 'Completed')
                        elif element.contents[0].text == 'Não':
                            self.story.setMetadata(attribute, 'In-Progress')
                    elif attribute == 'characters':
                        # characters are separated by '&'
                        terms = re.findall(r"[^&]+", stripHTML(element))
                        for term in terms:
                            self.story.addToList(attribute, term)
                    elif attribute == 'numWords':
                        # strip '.' thousands separators
                        self.story.setMetadata(attribute, stripHTML(element).replace('.',''))
                    else:
                        self.story.setMetadata(attribute, stripHTML(element))
                elif element.name == 'a':
                    if element.contents[0].name == 'strong':
                        self.story.addToList(attribute, stripHTML(element.contents[0]))
                elif element.name == 'time':
                    self.story.setMetadata(attribute, makeDate(element['datetime'][:self.datelength], self.dateformat))
            return next_index

        # "Informações Gerais" (General Information) section:
        # map the site's Portuguese labels to FFF metadata field names
        content_metadata = [
            ('Iniciado', 'datePublished'),
            ('Atualizada', 'dateUpdated'),
            ('Idioma', 'language'),
            ('Visualizações', 'hits'),
            ('Favoritos', 'kudos'),
            ('Comentários', 'comments'),
            ('Listas de leitura', 'bookmarks'),
            ('Palavras', 'numWords'),
            ('Concluído', 'status'),
            ('Categorias', 'category'),
            ('Personagens', 'characters'),
            ('Tags', 'freeformtags'),
            ('Gêneros:', 'genre'),
            ('Avisos:', 'warnings')
        ]
        tag_mapping = dict(content_metadata)

        # pick the info div while excluding the 'destaque' (featured) div
        information = div_element.find(lambda tag: tag.name == 'div' and
                                       tag.get('class') == ['texto', 'espacamentoTop'] and
                                       tag.get('id') != 'cphConteudo_cphConteudo_divDestaque')
        logger.debug('information: (%s)', information)
        info_contents = information.contents
        # scan the div's children; each recognized label hands off to
        # parse_until_br, which returns the index to resume from
        i = 0
        while i < len(info_contents):
            content = info_contents[i]
            stripped_tag = stripHTML(content)
            if stripped_tag in tag_mapping:
                i = parse_until_br(tag_mapping[stripped_tag], i+1, info_contents)
            else:
                i += 1

        # "Classificação, Gêneros e Avisos" (rating, genres, warnings)
        # Finding div element with class "clearfix baixo"
        div_element = soup.find('div', {'class': 'clearfix baixo'})
        # Finding div element with class "classificacao"
        classificacao_element = div_element.find('div', class_='classificacao')
        # Extracting last word from class name (rating is encoded in the
        # final CSS class, after the last '-')
        if classificacao_element and 'class' in classificacao_element.attrs:
            class_value = classificacao_element.attrs['class']
            self.story.setMetadata('rating',class_value[-1].split('-')[-1])

        # Extracting text content "Gêneros" and "Avisos" with the same
        # label-driven scan used above
        contents = classificacao_element.find_next('div').contents
        i = 0
        while i < len(contents):
            content = contents[i]
            stripped_tag = stripHTML(content)
            if stripped_tag in tag_mapping:
                i = parse_until_br(tag_mapping[stripped_tag], i+1, contents)
            else:
                i += 1

    ## Normalize chapter URLs in case of title change
    def normalize_chapterurl(self,url):
        """Force https on chapter URLs of the form
        <domain>/historia/<digits>/capitulo<digits>."""
        #https://www.spiritfanfiction.com/historia/story-name-1234/capitulo56
        # NOTE(review): the pattern only matches an all-digit story
        # segment (\d+); the 'story-name-1234' form in the comment above
        # would pass through unchanged -- confirm intended.
        url = re.sub(r"https?://("+self.getSiteDomain()+r"/historia/\d+/capitulo\d+)$",
                     r"https://\1",url)
        return url

    def getChapterText(self, url):
        """Download one chapter and return its cleaned HTML.

        Assembles: optional head notes ("Notas do Autor"), the chapter
        body (with any chapter image prepended), and optional foot
        notes ("Notas Finais"), honoring the exclude_notes config.
        Raises FailedToDownload when the page fails to parse.
        """
        logger.debug('Getting chapter text from: %s' % url)

        # output skeleton: a fresh soup we append the pieces into
        save_chapter_soup = self.make_soup("<div></div>")
        save_chapter = save_chapter_soup.find('div')

        chapter_dl_soup = self.make_soup(self.get_request(url))
        if None == chapter_dl_soup:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        div_chapter = chapter_dl_soup.find('div', {'class':'clearfix'})
        chapter_text = div_chapter.find('div', class_='texto-capitulo')
        exclude_notes=self.getConfigList('exclude_notes')

        def append_tag(elem, tag, string=None, classes=None):
            '''bs4 requires tags be added separately.'''
            new_tag = save_chapter_soup.new_tag(tag)
            if string:
                new_tag.string=string
            if classes:
                new_tag['class']=[classes]
            elem.append(new_tag)
            return new_tag

        chapimg = chaphead = chapfoot = None

        # Chapter Image -- wrap in a centered <p> for insertion later
        img_url = chapter_text.find('img', {'class':'imagemResponsiva'})
        if img_url:
            chapimg = chapter_dl_soup.new_tag('p', style="text-align: center")
            chapimg.insert(0, chapter_dl_soup.new_tag('img', src=img_url['src']))

        # each <h2> heading labels the div that follows it
        for tag in chapter_text.find_all('h2'):
            if tag.string.startswith('Notas do Autor'):
                chaphead = self.make_soup(unicode(tag.find_next_sibling('div', {'class': 'texto texto-capitulo-notas'})))
            elif tag.string.startswith('Notas Finais'):
                chapfoot = self.make_soup(unicode(tag.find_next_sibling('div', {'class': 'texto'})))
            else:
                # Apparently, not all chapters have the "Capítulo" text anymore, but it's the only other h2 in there
                chaptext = self.make_soup(unicode(tag.find_next_sibling('div', {'class': 'texto'})))

        # Decode emails
        self.decode_emails(chaptext)

        if chapimg != None:
            if chaptext.div == None:
                append_tag(chaptext, 'div')
            chaptext.div.insert(0, chapimg)

        head_notes_div = append_tag(save_chapter,'div',classes="fff_chapter_notes fff_head_notes")
        if 'chapterheadnotes' not in exclude_notes:
            if chaphead != None:
                append_tag(head_notes_div,'b',"Notas do Autor:")
                self.decode_emails(chaphead)
                head_notes_div.append(chaphead)
                append_tag(head_notes_div,'hr')

        save_chapter.append(chaptext)

        foot_notes_div = append_tag(save_chapter,'div',classes="fff_chapter_notes fff_foot_notes")
        ## Can appear on every chapter
        if 'chapterfootnotes' not in exclude_notes:
            if chapfoot != None:
                append_tag(foot_notes_div,'hr')
                append_tag(foot_notes_div,'b',"Notas Finais:")
                self.decode_emails(chapfoot)
                foot_notes_div.append(chapfoot)

        ## remove empty head/foot notes div(s)
        if not head_notes_div.find(True):
            head_notes_div.extract()
        if not foot_notes_div.find(True):
            foot_notes_div.extract()

        return self.utf8FromSoup(url,save_chapter)

    def decode_emails(self, html_text):
        """Replace Cloudflare-style obfuscated emails in *html_text*
        (elements with class '__cf_email__' and a data-cfemail hex
        attribute) with the decoded plain-text address; returns the
        soup rendered as a string."""
        def decode_email(encoded_email):
            # first hex byte is the XOR key for the remaining bytes
            email = ""
            r = int(encoded_email[:2], 16)
            for i in range(2, len(encoded_email), 2):
                char_code = int(encoded_email[i:i + 2], 16) ^ r
                email += chr(char_code)
            return email

        # Find all elements with class '__cf_email__'
        email_elements = html_text.find_all(class_='__cf_email__')
        for element in email_elements:
            # Get the data-cfemail attribute value
            encoded_email = element.get('data-cfemail')
            if encoded_email:
                # Decode the email address
                decoded_email = decode_email(encoded_email)
                # Replace the obfuscated email with the decoded email
                element.string = decoded_email
        return unicode(html_text)

    def before_get_urls_from_page(self,url,normalize):
        """Hook run before scanning an author/listing page: log in
        first when credentials are configured and the page shows the
        not-logged-in markers."""
        if self.getConfig("username"):
            data = self.get_request(url)
            if self.needToLoginCheck(data):
                self.performLogin(url, data)

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2019 FanFicFare team
# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -16,20 +16,23 @@
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
from .adapter_test1 import TestSiteAdapter
# Software: eFiction
from .base_efiction_adapter import BaseEfictionAdapter
class Test4SiteAdapter(TestSiteAdapter):
def __init__(self, config, url):
TestSiteAdapter.__init__(self, config, url)
class StarskyHutchArchiveNetSiteAdapter(BaseEfictionAdapter):
@staticmethod
def getSiteDomain():
return 'test4.com'
return 'www.starskyhutcharchive.net'
@classmethod
def getSiteAbbrev(self):
return 'shan'
@classmethod
def getDateFormat(self):
return "%m/%d/%Y"
def getClass():
return Test4SiteAdapter
return StarskyHutchArchiveNetSiteAdapter

View file

@ -93,7 +93,7 @@ class StoriesOfArdaComAdapter(BaseSiteAdapter):
self.story.setMetadata('title',stripHTML(a))
# Find the chapters: chapterview.asp?sid=7000&cid=30919
chapters=soup.find_all('a', href=re.compile(r'chapterview.asp\?sid='+self.story.getMetadata('storyId')+r"&cid=\d+$"))
chapters=soup.findAll('a', href=re.compile(r'chapterview.asp\?sid='+self.story.getMetadata('storyId')+r"&cid=\d+$"))
if len(chapters)==1:
self.add_chapter(self.story.getMetadata('title'),'http://'+self.host+'/'+chapters[0]['href'])
else:
@ -109,14 +109,14 @@ class StoriesOfArdaComAdapter(BaseSiteAdapter):
# no convenient way to get word count
for td in asoup.find_all('td', {'colspan' : '3'}):
for td in asoup.findAll('td', {'colspan' : '3'}):
if td.find('a', href=re.compile(r'chapterlistview.asp\?SID='+self.story.getMetadata('storyId'))) != None:
break
td=td.nextSibling.nextSibling
self.story.setMetadata('dateUpdated', makeDate(stripHTML(td).split(': ')[1], self.dateformat))
try:
tr=td.parent.nextSibling.nextSibling.nextSibling.nextSibling
td=tr.find_all('td')
td=tr.findAll('td')
self.story.setMetadata('rating', td[0].string.split(': ')[1])
self.story.setMetadata('status', td[2].string.split(': ')[1])
self.story.setMetadata('datePublished', makeDate(stripHTML(td[4]).split(': ')[1], self.dateformat))

View file

@ -59,12 +59,8 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
title = ""
if not m.group('chapter') and m.group('title'):
title = m.group('title')
path = m.group('path')
## library allowed for storyInfo.php but doesn't work in normal story url
if path == "library":
path = "s"
# normalized story URL.
self._setURL('https://' + self.getSiteDomain() + '/'+path+'/'+self.story.getMetadata('storyId')+title)
self._setURL('https://' + self.getSiteDomain() + '/s/'+self.story.getMetadata('storyId')+title)
else:
raise exceptions.InvalidStoryURL(url,
self.getSiteDomain(),
@ -88,10 +84,10 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "https://"+cls.getSiteDomain()+"/s/1234/story-title https://"+cls.getSiteDomain()+"/n/1234/story-title"
return "http://"+cls.getSiteDomain()+"/s/1234 http://"+cls.getSiteDomain()+"/s/1234:4010 https://"+cls.getSiteDomain()+"/s/1234 https://"+cls.getSiteDomain()+"/s/1234:4010"
def getSiteURLPattern(self):
return r"https?://"+re.escape(self.getSiteDomain())+r"/(?P<path>s|n|library)/(storyInfo.php\?id=)?(?P<id>\d+)(?P<chapter>:\d+)?(?P<title>/.+)?((;\d+)?$|(:i)?$)?"
return r"https?://"+re.escape(self.getSiteDomain())+r"/(s|library)/(storyInfo.php\?id=)?(?P<id>\d+)(?P<chapter>:\d+)?(?P<title>/.+)?((;\d+)?$|(:i)?$)?"
@classmethod
def getTheme(cls):
@ -147,21 +143,6 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
postAction,
'','',''))
data = self.post_request(postUrl,params,usecache=False)
# logger.debug(data)
while '<h2>Enter TOTP Code:</h2>' in data:
if self.totp:
logger.debug("Trying to TOTP with %s code."%self.totp)
params = {}
params['cmd'] = 'finishTotpVerification'
# google auth app at least shows "123 123", but site expects
# "123123". Remove space if user enters it.
params['totp_code'] = self.totp.replace(' ','')
params['action'] = "continue"
data = self.post_request(postUrl,params,usecache=False)
# logger.debug(data)
self.totp = None
else:
raise exceptions.NeedTimedOneTimePassword(url)
if self.needToLoginCheck(data):
logger.info("Failed to login to URL %s as %s" % (loginUrl,
@ -173,10 +154,6 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
url = self.url
logger.debug("URL: "+url)
## Some stories give 404 if not logged in now. See #1185
if self.getConfig("always_login"):
self.performLogin(self.url)
## Hit story URL to check for changed title part -- if the
## title has changed or (more likely?) the ID number has
## been reassigned to a different title, this will 404
@ -188,7 +165,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
if e.status_code in (401, 403, 410):
data = 'Log In' # to trip needToLoginCheck
elif e.status_code == 404:
raise exceptions.FailedToDownload("Page Not Found - always_login needed? (%s)" % url)
raise exceptions.FailedToDownload("Page Not Found - Story ID Reused? (%s)" % url)
else:
raise e
if self.needToLoginCheck(data):
@ -196,24 +173,13 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
self.performLogin(url)
data = self.get_request(url,usecache=False)
## SOL adds intermediate page to remind users to renew at 3-30 days before expiration - this breaks the soup 'a' search below
if "Your premier membership is going to expire" in data:
soup = self.make_soup(data)
expire = soup.find(string=re.compile("Your premier membership is going to expire"))
remindurl=(soup.find(href=re.compile("later.php"))).get('href')
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: "+expire+"\n"+"Renew or reduce expiration warning time in account setting\n"+remindurl)
## Premium account might redirect to a chapter, while regular
## account doesn't redirect to the URL with embedded /story-title
## So pull url from <a href="/s/000/story-title" rel="bookmark">
## regardless.
soup = self.make_soup(data)
a = soup.find('a',rel="bookmark")
if a:
url = 'https://'+self.host+a['href']
else:
# Contest entries do not have bookmark HREF
logger.info("No Bookmark HREF, using URL="+url)
url = 'https://'+self.host+a['href']
## Premium has "?ind=1" to force index.
## May not be needed w/o premium
@ -232,12 +198,6 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Error! The story you're trying to access is being filtered by your choice of contents filtering.")
elif "Error! Daily Limit Reached" in data or "Sorry! You have reached your daily limit of" in data:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Error! Daily Limit Reached")
elif "by (Hidden)" in data:
#Contest entries have author set to "(Hidden)" which breaks author lookups below
logger.info("Contest entry, setting authorId=(Hidden)")
self.story.addToList('authorId',"(Hidden)")
logger.info("Contest entry, setting author=(Hidden)")
self.story.addToList('author',"(Hidden)")
soup = self.make_soup(data)
# logger.debug(data)
@ -246,40 +206,32 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
a = soup.find('h1')
self.story.setMetadata('title',stripHTML(a))
authfrom = soup.find('footer')
alist = authfrom.find_all('a', {'rel' : 'author'})
if alist:
for a in alist:
self.story.addToList('authorId',a['href'].split('/')[2])
self.story.addToList('authorUrl','https://'+self.host+a['href'])
## both 's Page and s Page
self.story.addToList('author',re.sub(r".s Page$","",stripHTML(a)))
else:
logger.info("AuthorList empty. Contest entry?")
# Find authorid and URL from... author url. Sometimes in top,
# other times in footer.
authfrom = soup.find('div', {'id':'top-header'})
if authfrom is None or 'author' not in str(authfrom):
authfrom = soup.find('footer')
alist = authfrom.findAll('a', {'rel' : 'author'})
for a in alist:
self.story.addToList('authorId',a['href'].split('/')[2])
self.story.addToList('authorUrl','https://'+self.host+a['href'])
self.story.addToList('author',stripHTML(a).replace("'s Page",""))
# The rest of the metadata is within the article tag.
soup = soup.find('article')
# Find the chapters:
# If multiple chapters, they are in "index-list" div.
# <a href="/s/00001/This-is-a-test/1">Chapter 1</a>
# <a href="/n/00001/This-is-a-test/1">Chapter 1</a>
chapters = soup.select('div#index-list a[href*="/s/"],div#index-list a[href*="/n/"]')
# logger.debug(chapters)
chapters = soup.findAll('a', href=re.compile(r'^/s/'+self.story.getMetadata('storyId')+r":\d+(/.*)?$"))
if len(chapters) != 0:
logger.debug("Number of chapters: {0}".format(len(chapters)))
for chapter in chapters:
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'https://'+self.host+chapter['href'])
else:
self.add_chapter(self.story.getMetadata('title'),self.story.getMetadata('storyUrl'))
self.add_chapter(self.story.getMetadata('title'),'https://'+self.host+'/s/'+self.story.getMetadata('storyId'))
# The rest of the metadata is within the article tag.
soup = soup.find('article')
if self.story.getList('authorUrl'):
self.getStoryMetadataFromAuthorPage()
else:
logger.info("No authorurl found, setting to homepage. Could be contest story...")
self.story.setMetadata('authorUrl','https://' + self.getSiteDomain() + '/')
self.getStoryMetadataFromAuthorPage()
# Some books have a cover in the index page.
# Samples are:
@ -304,7 +256,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
soup = soup.find('header')
# Remove some tags based on their class or id
elements_to_remove = ['#det-link', '#s-details', '#index-list', '#s-title', '#s-auth', '.copy']
if self.getConfig('include_images') != 'true': # false or coveronly
if not self.getConfig('include_images'):
elements_to_remove.append('img')
for element_name in elements_to_remove:
elements = soup.select(element_name)
@ -323,7 +275,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
self.has_universes = False
title_cell = story_row.find('td', {'class' : 'lc2'})
for cat in title_cell.find_all('div', {'class' : 'typediv'}):
for cat in title_cell.findAll('div', {'class' : 'typediv'}):
self.story.addToList('genre',cat.text)
# in lieu of word count.
@ -400,16 +352,6 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
series_name = stripHTML(series_soup.find('h1', {'id' : 'ptitle'}))
series_name = re.sub(r' . a (series by|collection from).*$','',series_name)
# logger.debug("Series name: '%s'" % series_name)
if i == 0:
# find number in series from series page--not
# included in story page anymore.
# ... <a id="t20130r"></a>2 ...
seriesi = series_soup.select_one("a[id='t"+self.story.getMetadata('storyId')+"r']").parent
# logger.debug(seriesi)
try:
i = int(stripHTML(seriesi))
except:
logger.debug("Failed to convert series number(%s)"%seriesi)
self.setSeries(series_name, i)
# Check if series is in a universe
if self.has_universes:
@ -417,7 +359,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
universes_soup = self.make_soup(self.get_request(universe_url) )
# logger.debug("Universe url='{0}'".format(universe_url))
if universes_soup:
universes = universes_soup.find_all('div', {'class' : 'ser-box'})
universes = universes_soup.findAll('div', {'class' : 'ser-box'})
# logger.debug("Number of Universes: %d" % len(universes))
for universe in universes:
# logger.debug("universe.find('a')={0}".format(universe.find('a')))
@ -512,7 +454,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
return value
def parseOtherAttributes(self, other_attribute_element):
for b in other_attribute_element.find_all('b'):
for b in other_attribute_element.findAll('b'):
#logger.debug('Getting metadata: "%s"' % b)
label = b.text
if label in ['Posted:', 'Concluded:', 'Updated:']:
@ -596,7 +538,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
srtag = soup.find('div', id='sr')
if srtag != None:
# logger.debug('Getting more chapter text for: %s' % url)
logger.debug('Getting more chapter text for: %s' % url)
moretext = self.getMoreText(html)
if moretext != None:
moresoup = self.make_soup(moretext)
@ -611,7 +553,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
if pager != None:
urls=pager.find_all('a')
urls=pager.findAll('a')
urls=urls[:len(urls)-1]
# logger.debug("pager urls:%s"%urls)
pager.extract()
@ -638,13 +580,11 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
chapter_title = None
if self.getConfig('inject_chapter_title'):
if self.num_chapters() > 1:
cttag = pagetag.find('h2')
else:
## single chapter stories formatted a little differently.
cttag = pagetag.find('h1')
if cttag:
chapter_title = cttag.extract()
h2tag = pagetag.find('h2')
if h2tag:
# I'm seeing an h1 now, but it's not logged in?
# Something's broken...
chapter_title = h2tag.extract()
# Strip te header section
tag = pagetag.find('header')
@ -667,7 +607,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
# putting a 'conTag' at the *top* now, too. So this
# was nuking every page but the first and last. Now
# only if 'Continues'
for contag in pagetag.find_all('span', {'class' : 'conTag'}):
for contag in pagetag.findAll('span', {'class' : 'conTag'}):
# remove everything after continues...
if 'Continuation' in contag.text:
tag = contag
@ -696,7 +636,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
# If it is a chapter, there are dates at the start for when it was posted or modified. These plus
# everything before them can be discarded.
postedDates = pagetag.find_all('div', {'class' : 'date'})
postedDates = pagetag.findAll('div', {'class' : 'date'})
# logger.debug(postedDates)
if postedDates:
a = postedDates[0].previousSibling
@ -705,7 +645,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
b = a.previousSibling
a.extract()
a = b
for a in pagetag.find_all('div', {'class' : 'date'}):
for a in pagetag.findAll('div', {'class' : 'date'}):
a.extract()
# Kill the vote form and everything after it.
@ -726,5 +666,4 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
# inject_chapter_title
if chapter_title:
chapter_title.name='h3'
chapter_title['class']='inject_chapter_title'
pagetag.insert(0,chapter_title)

View file

@ -1,53 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
# py2 vs py3 transition
from .adapter_storiesonlinenet import StoriesOnlineNetAdapter
def getClass():
    """Module entry point used by the adapter loader: returns this module's adapter class."""
    return StoryRoomComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class StoryRoomComAdapter(StoriesOnlineNetAdapter):
    """Adapter for storyroom.com.

    Thin configuration-only subclass of StoriesOnlineNetAdapter: all
    scraping logic is inherited; only the site identity (abbreviation,
    domain, accepted domains, ini sections, URL pattern) is overridden.
    """
    @classmethod
    def getSiteAbbrev(cls):
        # Unique site abbreviation used in metadata ('siteabbrev').
        return 'stryrm'
    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain. Does have www here, if it uses it.
        return 'storyroom.com'
    @classmethod
    def getAcceptDomains(cls):
        # Also accepts finestories.com URLs (shared site engine/back end).
        return ['finestories.com',cls.getSiteDomain()]
    @classmethod
    def getConfigSections(cls):
        "Only needs to be overriden if has additional ini sections."
        return ['finestories.com',cls.getSiteDomain()]
    @classmethod
    def getSiteURLPattern(self):
        # NOTE(review): decorated @classmethod but first parameter is named
        # 'self' -- mirrors the parent class's convention; 'self' is the class.
        # Pattern accepts /s/, /n/ and /library/ paths plus optional
        # :chapter suffix and /title slug, for every accepted domain.
        return r"https?://("+r"|".join([x.replace('.',r'\.') for x in self.getAcceptDomains()])+r")/(?P<path>s|n|library)/(storyInfo.php\?id=)?(?P<id>\d+)(?P<chapter>:\d+)?(?P<title>/.+)?((;\d+)?$|(:i)?$)?"

View file

@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import re
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
# py2 vs py3 transition
from .base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Module entry point used by the adapter loader: returns this module's adapter class."""
    return SwiOrgRuAdapter
logger = logging.getLogger(__name__)
class SwiOrgRuAdapter(BaseSiteAdapter):
    """Adapter for www.swi.org.ru (Russian MLP:FiM fanfiction archive).

    Story URLs look like http://www.swi.org.ru/mlp-fim/story/<id>/ with
    chapters at .../chapter<n>.html.  All metadata is scraped from the
    story index page; labels on the page are Russian.
    """
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # Story id is the 4th path component: /mlp-fim/story/<id>/...
        storyId = self.parsedUrl.path.split('/',)[3]
        self.story.setMetadata('storyId', storyId)

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/mlp-fim/story/'+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','swiorgru')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y.%m.%d"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        return 'www.swi.org.ru'

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://" + cls.getSiteDomain() + "/mlp-fim/story/11341/ http://" + cls.getSiteDomain() + "/mlp-fim/story/11341/chapter1.html"

    def getSiteURLPattern(self):
        return r"http://" + re.escape(self.getSiteDomain() + "/mlp-fim/story/")+r"\d+"

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and populate metadata + chapter list.

        Raises FailedToDownload when required page elements (author label,
        chapter list header) are missing, and AdultCheckRequired for NC-18
        stories when the adult flag is not set.
        """
        url=self.url
        logger.debug("URL: "+url)

        data = self.get_request(url)
        soup = self.make_soup(data)

        # Title is in <h1>; drop any <sup> footnote markers first.
        title = soup.find('h1')
        for tag in title.findAll('sup'):
            tag.extract()
        self.story.setMetadata('title', stripHTML(title.text))
        logger.debug("Title: (%s)"%self.story.getMetadata('title'))

        # Author link follows the '<strong>Автор: </strong>' label.
        author_title = soup.find('strong', string = re.compile(u"Автор: "))
        if author_title == None:
            raise exceptions.FailedToDownload("Error downloading page: %s! Missing required author_title element!" % url)
        author = author_title.next_sibling
        self.story.setMetadata('authorId', author.text) # Author's name is unique
        self.story.setMetadata('authorUrl','http://'+self.host + author['href'])
        self.story.setMetadata('author', author.text)
        logger.debug("Author: (%s)"%self.story.getMetadata('author'))

        # Publication date appears as YYYY.MM.DD inside an <em>.
        date_pub = soup.find('em', string = re.compile(r'\d{4}.\d{2}.\d{2}'))
        if not date_pub == None:
            self.story.setMetadata('datePublished', makeDate(date_pub.text, self.dateformat))

        rating_label = soup.find('strong', string = re.compile(u"рейтинг:"))
        if not rating_label == None:
            rating = rating_label.next_sibling.next_sibling
            self.story.setMetadata('rating', stripHTML(rating))
            # NOTE(review): condition reconstructed from a diff rendering with
            # indentation stripped; the operator precedence looks suspicious
            # (reads as "(not is_adult) or config") -- confirm against the
            # original file before relying on it.
            if not self.is_adult or self.getConfig("is_adult"):
                if "NC-18" in rating:
                    raise exceptions.AdultCheckRequired(self.url)

        # Character tags are rendered as small character images.
        characters = soup.findAll('img', src=re.compile(r"/mlp-fim/img/chars/\d+.png"))
        logger.debug("numCharacters: (%s)"%str(len(characters)))
        for x in range(0,len(characters)):
            character=characters[x]
            self.story.addToList('characters', character['title'])

        # Green 'завершен' ("finished") marker => Completed.
        if soup.find('font', color = r"green", string = u"завершен"):
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        # Categories are quoted names after the 'категории:' label.
        categories_label = soup.find('strong', string = u"категории:")
        if not categories_label == None:
            categories_element = categories_label.next_sibling.next_sibling
            categories = re.findall(r'"(.+?)"', categories_element.text)
            for x in range(0, len(categories)):
                category=categories[x]
                self.story.addToList('category', category)

        # Chapter list lives in the table after the 'Главы:' ("Chapters") header.
        chapters_header = soup.find('h2', string = re.compile(u"Главы:"))
        if chapters_header==None:
            raise exceptions.FailedToDownload("Error downloading page: %s! Missing required chapters_header element!" % url)
        chapters_table = chapters_header.next_sibling.next_sibling

        self.story.setMetadata('language','Russian')

        chapters=chapters_table.findAll('a', href=re.compile(r'/mlp-fim/story/'+self.story.getMetadata('storyId')+r"/chapter\d+"))
        self.story.setMetadata('numChapters', len(chapters))
        logger.debug("numChapters: (%s)"%str(self.story.getMetadata('numChapters')))
        for x in range(0,len(chapters)):
            chapter=chapters[x]
            churl='http://'+self.host+chapter['href']
            self.add_chapter(chapter,churl)

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Download one chapter and return its cleaned-up HTML body."""
        logger.debug('Getting chapter text from: %s' % url)
        soup = self.make_soup(self.get_request(url))
        chapter = soup.find('div', {'id' : 'content'})
        # BUGFIX: the None guard must run BEFORE dereferencing 'chapter';
        # previously chapter.find() on a missing div#content raised
        # AttributeError instead of the intended FailedToDownload.
        if chapter == None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
        # Drop the duplicated chapter heading from the body.
        chapter_header = chapter.find('h1', id = re.compile("chapter"))
        if not chapter_header == None:
            chapter_header.decompose()
        return self.utf8FromSoup(url,chapter)

View file

@ -1,459 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team, 2018 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging, time
logger = logging.getLogger(__name__)
import re, math
from hashlib import sha256
from base64 import urlsafe_b64encode as b64encode
from .. import exceptions as exceptions
# py2 vs py3 transition
from ..six.moves import http_cookiejar as cl
from ..six.moves.urllib.parse import urlparse
from ..six import text_type as unicode
from .base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Module entry point used by the adapter loader: returns this module's adapter class."""
    return SyosetuComAdapter
def getEntry(soup, *args):
    """Look up a definition-list value by its <dt> label.

    Tries each label in *args in order; for the first <dt> whose string
    matches, returns the <dd> element that follows it in the document.
    Returns None when no label matches.
    """
    candidates = (soup.find('dt', string=label) for label in args)
    for dt in candidates:
        if dt is not None:
            return dt.findNext('dd')
    return None
class SyosetuComAdapter(BaseSiteAdapter):
    """Adapter for syosetu.com (ncode.syosetu.com / novel18.syosetu.com).

    Japanese web-novel site.  Metadata comes from the infotop page,
    chapter lists from the paginated story index (100 episodes per page).
    NOTE(review): recovered from a diff rendering with indentation and some
    characters stripped; nesting and a few string literals (flagged inline)
    should be confirmed against the original file.
    """
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev', 'syosetu')
        self.story.setMetadata('language', 'Japanese')
        # Story id ("ncode", e.g. n1234ab) is the last non-empty path part.
        splitPath = self.path.split('/')
        self.storyId = splitPath[-2] if (splitPath[-1] == '') else splitPath[-1]
        self.story.setMetadata('storyId', self.storyId)
        # Normalized story URL with trailing slash.
        self._setURL('https://' + self.host + '/' + self.storyId + '/')
        self.is_adult = False

    @staticmethod
    def getSiteDomain():
        return 'syosetu.com'

    @classmethod
    def getAcceptDomains(cls):
        # ncode = general, novel18 = adult; mypage/xmypage are author pages.
        return [
            'ncode.syosetu.com',
            'novel18.syosetu.com',
            'mypage.syosetu.com',
            'xmypage.syosetu.com',
        ]

    @classmethod
    def getSiteExampleURLs(cls):
        return ("https://ncode.syosetu.com/n1234ab/ "
                +"https://novel18.syosetu.com/n1234a "
                +"https://ncode.syosetu.com/novelview/infotop/ncode/n1234ab "
                +"https://novel18.syosetu.com/novelview/infotop/ncode/n1234a/")

    def getSiteURLPattern(self):
        # Accepts both story pages and infotop metadata pages.
        return r"^https?://(ncode|novel18)\.syosetu\.com/(novelview/infotop/ncode/)?n[0-9]+[a-z]+/?$"

    def set_adult_cookie(self):
        """Set the site's 'over18=yes' cookie so adult pages are served."""
        cookie = cl.Cookie(version=0, name='over18', value='yes',
                           port=None, port_specified=False,
                           domain=self.getSiteDomain(), domain_specified=False, domain_initial_dot=False,
                           path='/', path_specified=True,
                           secure=False,
                           expires=time.time()+10000,
                           discard=False,
                           comment=None,
                           comment_url=None,
                           rest={'HttpOnly': None},
                           rfc2109=False)
        self.get_configuration().get_cookiejar().set_cookie(cookie)

    def performLogin(self, url):
        """Log in to syosetu.com; raises FailedToLogin when the logout link
        is absent from the response (i.e. login did not succeed)."""
        params = {}
        # Runtime-supplied credentials win over ini-file config.
        if self.password:
            params['narouid'] = self.username
            params['pass'] = self.password
        else:
            params['narouid'] = self.getConfig('username')
            params['pass'] = self.getConfig('password')
        if params['narouid'] and params['pass']:
            loginUrl = 'https://syosetu.com/login/login/'
            logger.info("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                                params['narouid']))
            d = self.post_request(loginUrl, params)
            if 'href="https://syosetu.com/login/logout/"' not in d:
                logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                                 params['narouid']))
                raise exceptions.FailedToLogin(url,params['narouid'])

    def extractChapterUrlsAndMetadata(self):
        """
        Oneshots are located at /n1234ab/
        Serials are located at /n1234ab/#/ (# is a non-padded number,
        like 1, 2, ..., 394). Serials can have a single chapter.
        Most metadata is located at /novelview/infotop/ncode/n1234ab/
        Chapter publish and update times are located at /n1234ab/?p=#
        paginated in groups of 100
        """
        if self.is_adult or self.getConfig('is_adult'):
            self.set_adult_cookie()
        # self.performLogin(self.url)
        infoUrl = 'https://' + self.host + '/novelview/infotop/ncode/' + self.storyId + '/'
        # don't use cache if manual is_adult--should only happen
        # if it's an adult story and they don't have is_adult in ini.
        (infoData,infoRurl) = self.get_request_redirected(infoUrl,
                                                          usecache=(not self.is_adult))
        # IDs for general (adult) stories redirect to ncode (novel18)
        # despite IDs being shared, stories can't be age-restricted automatically
        if infoUrl != infoRurl:
            infoUrl = infoRurl
            self.host = urlparse(infoRurl).netloc
            self._setURL('https://' + self.host + '/' + self.storyId + '/')
        if (self.host.split('.')[0] == 'novel18'):
            if not (self.is_adult or self.getConfig("is_adult")):
                raise exceptions.AdultCheckRequired(self.url)
        # Did not find story. (Invalid ID)
        if '投稿済作品が見つかりません。' in infoData:
            raise exceptions.StoryDoesNotExist(self.url)
        # Story has been deleted.
        if 'この作品は作者によって削除されました。' in infoData:
            raise exceptions.StoryDoesNotExist(self.url)
        # Login-required: page shows a login link; re-fetch after logging in.
        if self.getConfig('always_login') and 'href="https://syosetu.com/login/input/"' in infoData:
            self.performLogin(self.url)
            infoData = self.get_request(infoUrl, usecache=False)
        infoSoup = self.make_soup(infoData)

        # Title
        title = infoSoup.find('a', href=self.url).text.strip()
        self.story.setMetadata('title', title)

        # Author
        # the author URL can always be found at the bottom of the page
        # differs between ncode and novel18
        authorUrl = (infoSoup.find('a', string='作者マイページ')
                     or infoSoup.find('a', string='作者Xマイページ'))['href']
        self.story.setMetadata('authorUrl', authorUrl)
        authorId = urlparse(authorUrl).path.split('/')[1]
        self.story.setMetadata('authorId', authorId)
        authorElement = getEntry(infoSoup, '作者名')
        author = authorElement.text.strip()
        try:
            if authorElement.find('a') is None:
                # when the author isn't linked in the table, a pseudonym has been used
                realAuthor = self.make_soup(self.get_request(authorUrl)).find('title').text.strip()
                if realAuthor != author:
                    author = author + ' (' + realAuthor + ')'
        except:
            # Best-effort: fall back to the pseudonym on any parse failure.
            logger.info('Author parsing failed, using pseudonym.')
        self.story.setMetadata('author', author)

        # Description
        description = getEntry(infoSoup, 'あらすじ')
        description.name = 'div'
        description['class'] = 'description'
        self.setDescription(self.url, description)

        # Date Published and Updated
        # 2017年 05月16日 17時30分
        # NOTE(review): format string appears to have lost characters in the
        # diff rendering (no '日' between %d and %H) -- verify against original.
        published = makeDate(getEntry(infoSoup, '掲載日').text.strip(),
                             '%Y年 %m月%d%H時%M分')
        self.story.setMetadata('datePublished', published)
        updated = published
        updateElement = getEntry(infoSoup,
                                 '最終部分掲載日', # last part published (complete)
                                 '最新部分掲載日', # latest part published
                                 '最終更新日', # last update (complete)
                                 '最新掲載日' # last update
                                 )
        if updateElement is not None:
            updated = makeDate(updateElement.text.strip(),
                               '%Y年 %m月%d%H時%M分')
        self.story.setMetadata('dateUpdated', updated)

        # Series
        # differs between ncode and novel18
        series = getEntry(infoSoup, 'シリーズ', 'Xシリーズ')
        try:
            if series is not None:
                seriesName = series.text.strip()
                seriesUrl = series.find('a')['href']
                seriesSoup = self.make_soup(self.get_request(seriesUrl))
                alist = seriesSoup.select('.p-series-novellist .p-series-novellist__title a')
                # Position in the series = index of this ncode in the list.
                i = 1
                for a in alist:
                    if self.storyId in a['href']:
                        self.setSeries(seriesName, i)
                        self.story.setMetadata('seriesUrl', seriesUrl)
                        break
                    i += 1
        except:
            # Best-effort: series is optional metadata.
            logger.info('Series parsing failed.')

        # Character count
        # 123,789文字
        numMoji = int(re.sub(r'[^\d]', '', getEntry(infoSoup, '文字数').text.strip()))
        self.story.setMetadata('numWords', numMoji)

        # Status and Chapter count
        noveltype = infoSoup.find('span', {'class':'p-infotop-type__type'})
        if noveltype.text.strip() == '短編':
            # 短編 = oneshot/short story.
            numChapters = 1
            oneshot = True
            completed = True
        else:
            # '全1,292エピソード\n'
            numChapters = int(re.sub(r'[^\d]', '', infoSoup.find('span', {'class':'p-infotop-type__allep'}).text.strip()))
            oneshot = False
            # NOTE(review): 'noveltype' is a soup element, not a string, so this
            # comparison looks like it is always False -- confirm; likely meant
            # noveltype.text.strip() == '完結済'.
            completed = True if noveltype == '完結済' else False
        self.story.setMetadata('status', 'Completed' if completed else 'In-Progress')

        # Keywords
        # NOTE(review): 'flags' is never populated here, so the 'R15' check
        # below always falls through to 'G' -- confirm against original.
        flags = []
        # not sure what it looks like if a work has no tags
        tagsElement = getEntry(infoSoup, 'キーワード')
        for tag in tagsElement.text.split():
            self.story.addToList('freeformtags', tag)

        # Rating, Genre, and Imprint
        if self.host.split('.')[0] == 'novel18':
            rating = 'R18'
            # ミッドナイトノベルズ(大人向け)
            imprint = getEntry(infoSoup, '掲載サイト').text.strip().split('(')[0]
            self.story.setMetadata('imprint', imprint)
        else:
            rating = 'R15' if 'R15' in flags else 'G'
            # ハイファンタジー〔ファンタジー〕
            fullgenre = getEntry(infoSoup, 'ジャンル').text.strip()
            self.story.setMetadata('fullgenre', fullgenre)
            # NOTE(review): split('') is a ValueError in Python -- the separator
            # character (likely '〔'/'〕') was lost by the diff rendering;
            # restore it from the original file.
            smallgenre = fullgenre.split('')[0]
            self.story.setMetadata('smallgenre', smallgenre)
            biggenre = fullgenre.split('')[1][:-1]
            self.story.setMetadata('biggenre', biggenre)
        self.story.setMetadata('rating', rating)

        # Comments, Reviews, Bookmarks, Points
        commentsElement = getEntry(infoSoup, '感想')
        reviewsElement = getEntry(infoSoup, 'レビュー')
        bookmarksElement = getEntry(infoSoup, 'ブックマーク登録')
        ratingPointsElement = getEntry(infoSoup, '総合評価')
        overallPointsElement = getEntry(infoSoup, '評価ポイント')
        # if the story is unlinked from author page, stats will be hidden
        # '\n116件\n\n'
        if commentsElement is not None:
            self.story.setMetadata('comments',
                                   int(re.sub(r'[^\d]', '', commentsElement.next_element.strip())))
        # 171件
        if reviewsElement is not None:
            self.story.setMetadata('reviews',
                                   int(re.sub(r'[^\d]', '', reviewsElement.next_element.strip())))
        # 108,610件
        if bookmarksElement is not None:
            self.story.setMetadata('bookmarks',
                                   int(re.sub(r'[^\d]', '', bookmarksElement.next_element.strip())))
        # 166,944pt or ※非公開
        if (ratingPointsElement is not None and
                ratingPointsElement.text.strip() != '※非公開'):
            self.story.setMetadata('ratingpoints',
                                   int(re.sub(r'[^\d]', '', ratingPointsElement.next_element.strip())))
        # 384,164pt or ※非公開
        if (overallPointsElement is not None and
                overallPointsElement.text.strip() != '※非公開'):
            self.story.setMetadata('overallpoints',
                                   int(re.sub(r'[^\d]', '', overallPointsElement.next_element.strip())))

        # Bookmark metadata (only meaningful when logged in).
        if self.getConfig("always_login"):
            if infoSoup.find('div', {'data-remodal-id':'setting_bookmark'}) is None:
                self.story.setMetadata('bookmarked', False)
                self.story.setMetadata('subscribed', False)
            else:
                self.story.setMetadata('bookmarked', True)
                modal = infoSoup.find('div', {'data-remodal-id':'setting_bookmark'})
                # bookmark category name
                bookmarkCategory = modal.find('option', {
                    'class':'js-category_select',
                    'selected':'selected'}).text.strip()
                self.story.setMetadata('bookmarkcategory', bookmarkCategory)
                #bookmarkmemo
                if modal.find('input', {'class':'js-bookmark_memo'}).has_attr('value'):
                    self.story.setMetadata('bookmarkmemo',
                                           modal.find('input', {'class':'js-bookmark_memo'})['value'].strip())
                #bookmarkprivate
                self.story.setMetadata('bookmarkprivate',
                                       modal.find('input', {
                                           'class':'bookmark_jyokyo',
                                           'value':'1'}).has_attr('checked'))
                #subscribed
                self.story.setMetadata('subscribed',
                                       modal.find('input', {'name':'isnotice'}).has_attr('checked'))

        if oneshot:
            # Oneshot: the story URL itself is the single chapter.
            self.add_chapter(title, self.url)
            logger.debug("Story: <%s>", self.story)
            return

        # serialized story
        prependSectionTitles = self.getConfig('prepend_section_titles', 'firstepisode')
        # Index pages come 100 episodes at a time: /<ncode>/?p=N
        tocSoups = []
        for n in range(1, int(math.ceil(numChapters/100.0))+1):
            tocPage = self.make_soup(self.get_request(self.url + '?p=%s' % n))
            tocSoups.append(tocPage.find('div',{'class':'p-eplist'}))
        sectionTitle = None
        newSection = False
        for tocSoup in tocSoups:
            for child in tocSoup.findChildren(recursive=False):
                if 'p-eplist__chapter-title' in child['class']:
                    # A section (arc) heading; remember it for episode titles.
                    sectionTitle = child.text.strip()
                    newSection = True
                elif 'p-eplist__sublist' in child['class']:
                    epTitle = child.find('a').text.strip()
                    updateElement = child.find('div', {'class':'p-eplist__update'})
                    if updateElement.find('span',{'class':'p-eplist__favep'}) is not None:
                        # a bookmarked story has some extra text added
                        updateElement.next_element.extract()
                        updateElement.next_element.extract()
                    epPublished = updateElement.next_element.strip()
                    epUpdated = ''
                    if updateElement.find('span') is not None:
                        epUpdated = updateElement.find('span')['title'].strip()
                    # Stable per-episode fragment so re-downloads can detect
                    # changed/updated episodes from the URL alone.
                    uniqueKey = b64encode(sha256(('title ' + epTitle +
                                                  ' published ' + epPublished +
                                                  ' updated ' + epUpdated).encode()).digest()).decode()
                    epUrl = 'https://' + self.host + child.find('a')['href'] + '#' + uniqueKey
                    if ((sectionTitle is not None) and
                            ((newSection and prependSectionTitles == 'firstepisode') or
                             prependSectionTitles == 'true')):
                        # bracket with ZWSP to mark presence of the section title
                        epTitle = u'\u200b' + sectionTitle + u'\u3000\u200b' + epTitle
                    self.add_chapter(epTitle, epUrl)
                    newSection = False
        logger.debug("Story: <%s>", self.story)
        return

    def getChapterText(self, url):
        """Download one chapter; keeps preface/afterword divs only when
        include_author_notes is set, and renames the site's CSS classes."""
        logger.debug('Getting chapter text from <%s>' % url)
        soup = self.make_soup(self.get_request(url))
        divs = soup.find_all('div',{'class':'p-novel__text'})
        text_divs = []
        for div in divs:
            if 'p-novel__text--preface' in div['class']:
                div['class'] = 'novel_p'
            elif 'p-novel__text--afterword' in div['class']:
                div['class'] = 'novel_a'
            else:
                div['class'] = 'novel_honbun'
            if self.getConfig('include_author_notes', True) or div['class'] == 'novel_honbun':
                text_divs.append(unicode(div))
        if not text_divs:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
        soup = self.make_soup(' '.join(text_divs))
        return self.utf8FromSoup(url, soup)

    def before_get_urls_from_page(self,url,normalize):
        # syosetu doesn't show adult series or author pages without the cookie
        if self.getConfig("is_adult"):
            self.set_adult_cookie()

    def get_urls_from_page(self,url,normalize):
        """Collect story URLs from a listing page, normalizing to one
        canonical https://<host>/<ncode>/ URL per distinct ncode."""
        from ..geturls import get_urls_from_html
        # Supporting story page and info page URLs means both links get picked up
        # and return duplicate story IDs without a custom handler.
        # hook for logins, etc.
        self.before_get_urls_from_page(url,normalize)
        # this way it uses User-Agent or other special settings.
        data = self.get_request(url,usecache=False)
        parsedUrlList = get_urls_from_html(self.make_soup(data),
                                           url,
                                           configuration=self.configuration,
                                           normalize=normalize)
        urlList = []
        ncodes = []   # ncodes already emitted, for de-duplication
        for storyUrl in parsedUrlList:
            parsedUrl = urlparse(storyUrl)
            host = parsedUrl.netloc
            if host in ['ncode.syosetu.com', 'novel18.syosetu.com']:
                splitPath = parsedUrl.path.split('/')
                storyId = splitPath[-2] if (splitPath[-1] == '') else splitPath[-1]
                if storyId not in ncodes:
                    ncodes.append(storyId)
                    urlList.append('https://' + host + '/' + storyId + '/')
            else:
                # Non-story links (e.g. author pages) pass through unchanged.
                urlList.append(storyUrl)
        return {'urllist':urlList}

View file

@ -131,7 +131,7 @@ class TenhawkPresentsSiteAdapter(BaseSiteAdapter):
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'http://'+self.host+'/'+chapter['href']+addurl)
@ -143,7 +143,7 @@ class TenhawkPresentsSiteAdapter(BaseSiteAdapter):
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.find_all('span',{'class':'label'})
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
@ -164,19 +164,19 @@ class TenhawkPresentsSiteAdapter(BaseSiteAdapter):
self.story.setMetadata('numWords', value)
if 'Categories' in label:
cats = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=categories'))
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
catstext = [cat.string for cat in cats]
for cat in catstext:
self.story.addToList('category',cat.string)
if 'Characters' in label:
chars = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=characters'))
chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
charstext = [char.string for char in chars]
for char in charstext:
self.story.addToList('characters',char.string)
if 'Genre' in label:
genres = labelspan.parent.find_all('a',href=re.compile(r'browse.php\?type=class'))
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
genrestext = [genre.string for genre in genres]
self.genre = ', '.join(genrestext)
for genre in genrestext:
@ -203,7 +203,7 @@ class TenhawkPresentsSiteAdapter(BaseSiteAdapter):
series_url = 'http://'+self.host+'/'+a['href']
seriessoup = self.make_soup(self.get_request(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
i=1
for a in storyas:
if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):

View file

@ -29,13 +29,6 @@ from ..six import ensure_text
from .base_adapter import BaseSiteAdapter, makeDate
try: # just a way to switch between CLI and PI
## webbrowser.open doesn't work on some linux flavors.
## piggyback Calibre's version.
from calibre.gui2 import safe_open_url as open_url
except :
from webbrowser import open as open_url
class TestSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
@ -128,7 +121,7 @@ class TestSiteAdapter(BaseSiteAdapter):
self.story.setMetadata('author',prefix+'Test Author aa')
self.setDescription(self.url,u'<div>Description '+self.crazystring+u''' Done
<p>
Some more longer description. "I suck at summaries!" "Better than it sounds!" <span>A span!</span> "My first fic"
Some more longer description. "I suck at summaries!" "Better than it sounds!" "My first fic"
</div>''')
self.story.setMetadata('datePublished',makeDate("1975-03-15","%Y-%m-%d"))
if idstr == '669':
@ -136,9 +129,6 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
else:
self.story.setMetadata('dateUpdated',makeDate("1975-04-15","%Y-%m-%d"))
if idstr == '675' and self.totp != "123321" :
raise exceptions.NeedTimedOneTimePassword(self.url)
if idstr != '674':
self.story.setMetadata('numWords','123456')
@ -149,12 +139,6 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
# greater than 10, no language or series.
if idnum < 10:
self.setSeries('The Great Test',idnum)
self.story.setMetadata('seriesUrl','http://'+self.getSiteDomain()+'/seriesid=1')
elif idnum < 20:
self.setSeries('魔法少女まどか★マギカ',idnum)
self.story.setMetadata('seriesUrl','http://'+self.getSiteDomain()+'/seriesid=1')
elif idnum < 30:
langs = {
0:"English",
1:"Russian",
@ -162,7 +146,11 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
3:"German",
}
self.story.setMetadata('language',langs[idnum%len(langs)])
self.setSeries('The Great Test',idnum)
self.story.setMetadata('seriesUrl','http://'+self.getSiteDomain()+'/seriesid=1')
elif idnum < 20:
self.setSeries('魔法少女まどか★マギカ',idnum)
self.story.setMetadata('seriesUrl','http://'+self.getSiteDomain()+'/seriesid=1')
if idnum == 0:
self.setSeries("A Nook Hyphen Test "+self.story.getMetadata('dateCreated'),idnum)
self.story.setMetadata('seriesUrl','http://'+self.getSiteDomain()+'/seriesid=0')
@ -328,18 +316,13 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
rt = random.uniform(t*0.5, t*1.5)
logger.debug("random sleep(%0.2f-%0.2f):%0.2f"%(t*0.5, t*1.5,rt))
time.sleep(rt)
# open_url("https://echo.free.beeceptor.com/%s.%s"%(self.story.getMetadata('siteabbrev'),
# self.story.getMetadata('storyId')))
if "chapter=1" in url :
text=u'''
<div>
<h3>Prologue</h3>
<div class='leadpara'>
<p>This is a fake adapter for testing purposes. Different sid's will give different errors:</p>
<p>sid&gt;=1000 will use custom test story data from your configuration(personal.ini)</p>
</div>
<div class='failids'>
<p>Hard coded ids:</p>
<p>http://test1.com?sid=664 - Crazy string title</p>
<p>http://test1.com?sid=665, 711-720 - raises AdultCheckRequired</p>
@ -356,7 +339,6 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
<p>http://test1.com?sid=0 - Succeeds, generates some text specifically for testing hyphenation problems with Nook STR/STRwG</p>
<p>Odd sid's will be In-Progress, evens complete. sid&lt;10 will be assigned one of four languages and included in a series.</p>
</div>
</div>
'''
elif self.story.getMetadata('storyId') == '0':
text=u'''<div>
@ -370,7 +352,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
<br />
</div>
'''
elif self.story.getMetadata('storyId') == '667' and ("chapter=2" in url or "chapter=3" in url or "chapter=4" in url):
elif self.story.getMetadata('storyId') == '667' and "chapter=2" in url:
raise exceptions.FailedToDownload("Error downloading Chapter: %s!" % url)
elif self.getSiteDomain() not in url:
## for chapter_urls setting.
@ -415,13 +397,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
else:
if self.story.getMetadata('storyId') == '92':
imgtext='''
<a href="http://code.google.com/p/fanficdownloader/wiki/FanFictionDownLoaderPluginWithReadingList" title="Tilt-a-Whirl"><img src="http://i.imgur.com/bo8eD.png"></a>
<style>
.loremipsum { background-image: url("https://picsum.photos/2000/1500") }
</style>
<p style="background-image: url('https://picsum.photos/20/10')">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
'''
imgtext='<a href="http://code.google.com/p/fanficdownloader/wiki/FanFictionDownLoaderPluginWithReadingList" title="Tilt-a-Whirl by Jim &amp; Sarah, on Flickr"><img src="http://i.imgur.com/bo8eD.png"></a>'
else:
imgtext='img goes here when sid=92'
text=u'''
@ -442,9 +418,7 @@ Don't&#8212e;ver&#8212d;o&#8212;that&#8212a;gain, &#27861; &#xE9;
<hr>
horizontal rules
<hr size=1 noshade>
<div class="loremipsum">
<p>"Lorem ipsum dolor sit amet", consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore--et dolore magna aliqua. 'Ut enim ad minim veniam', quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
</div>
<br>
<br>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.<br/>
@ -456,6 +430,7 @@ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
<br/> <br/>
<br/>
"Lorem ipsum dolor sit amet", consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore--et dolore magna aliqua. 'Ut enim ad minim veniam', quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.<br>
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
</div>
'''%imgtext
soup = self.make_soup(text)
@ -491,7 +466,6 @@ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
desc = '<div><p>The Great Test Series of '+self.getSiteDomain()+'!</p><p>Now with two lines!</p></div>'
return {'name':'The Great Test',
'desc':desc,
'status':'AStatus',
'urllist':['http://'+self.getSiteDomain()+'?sid=1',
'http://'+self.getSiteDomain()+'?sid=2',
'http://'+self.getSiteDomain()+'?sid=3',

View file

@ -1,35 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2019 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
from .adapter_test1 import TestSiteAdapter
class Test2SiteAdapter(TestSiteAdapter):
    """Second fake test site: behaves exactly like test1, only the
    domain differs, so multi-site code paths can be exercised."""

    def __init__(self, config, url):
        # delegate everything to the shared test adapter
        TestSiteAdapter.__init__(self, config, url)

    @staticmethod
    def getSiteDomain():
        """Return the domain this adapter claims to handle."""
        return 'test2.com'
def getClass():
    """Module entry point: return the adapter class for registration."""
    return Test2SiteAdapter

View file

@ -1,35 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2019 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
from .adapter_test1 import TestSiteAdapter
class Test3SiteAdapter(TestSiteAdapter):
    """Third fake test site: behaves exactly like test1, only the
    domain differs, so multi-site code paths can be exercised."""

    def __init__(self, config, url):
        # delegate everything to the shared test adapter
        TestSiteAdapter.__init__(self, config, url)

    @staticmethod
    def getSiteDomain():
        """Return the domain this adapter claims to handle."""
        return 'test3.com'
def getClass():
    """Module entry point: return the adapter class for registration."""
    return Test3SiteAdapter

Some files were not shown because too many files have changed in this diff Show more