Merge branch 'master' into fictionlive

This commit is contained in:
Hazel Shanks 2020-10-21 13:28:57 +13:00
commit 6ab50b6eaa
46 changed files with 6446 additions and 3378 deletions

View file

@ -33,7 +33,7 @@ except NameError:
from calibre.customize import InterfaceActionBase
# pulled out from FanFicFareBase for saving in prefs.py
__version__ = (3, 23, 4)
__version__ = (3, 24, 2)
## Apparently the name for this class doesn't matter--it was still
## 'demo' for the first few versions.

View file

@ -1662,7 +1662,7 @@ class ImapTab(QWidget):
label = QLabel(_('Add these Tag(s) Automatically'))
tooltip = ( _("Tags entered here will be automatically added to stories downloaded from email story URLs.") +"\n"+
_("Any additional stories if you then manually add to the Story URL dialog will also have these tags added.") )
_("Any additional stories you then manually add to the Story URL dialog will also have these tags added.") )
label.setToolTip(tooltip)
self.l.addWidget(label,row,0)
self.imaptags = EditWithComplete(self) # QLineEdit(self)

View file

@ -4,7 +4,7 @@ from __future__ import (absolute_import, unicode_literals, division,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2018, Jim Miller'
__copyright__ = '2020, Jim Miller'
__docformat__ = 'restructuredtext en'
import traceback, re
@ -55,7 +55,7 @@ from calibre.gui2 import gprefs
show_download_options = 'fff:add new/update dialogs:show_download_options'
from calibre.gui2.dialogs.confirm_delete import confirm
from calibre.gui2.complete2 import EditWithComplete
from .fanficfare.six import text_type as unicode
from .fanficfare.six import text_type as unicode, ensure_text
# pulls in translation files for _() strings
try:
@ -252,21 +252,51 @@ class AddNewDialog(SizePersistedDialog):
self.setWindowTitle('FanFicFare')
self.setWindowIcon(icon)
self.toplabel=QLabel("Toplabel")
self.l.addWidget(self.toplabel)
self.url = DroppableQTextEdit(self)
self.url.setToolTip("UrlTooltip")
self.url.setLineWrapMode(QTextEdit.NoWrap)
self.l.addWidget(self.url)
self.merge = self.newmerge = False
self.extraoptions = {}
# elements to hide when doing merge.
self.mergehide = []
self.mergeshow = []
# elements to show again when doing *update* merge
self.mergeupdateshow = []
self.toplabel=QLabel("Toplabel")
self.l.addWidget(self.toplabel)
## XXX add labels for series name and desc? Desc in tooltip?
row = 0
grid = QGridLayout()
label = QLabel('<b>'+_('Series')+':</b>')
grid.addWidget(label,row,0)
self.mergedname=QLabel("mergedname")
tt = _('This name will be used with the %s setting to set the title of the new book.')%'<i>anthology_title_pattern</i>'
label.setToolTip(tt)
self.mergeshow.append(label)
self.mergedname.setToolTip(tt)
grid.addWidget(self.mergedname,row,1,1,-1)
self.l.addLayout(grid)
self.mergeshow.append(self.mergedname)
row+=1
label = QLabel('<b>'+_('Comments')+':</b>')
grid.addWidget(label,row,0)
self.mergeddesc=QLabel("mergeddesc")
tt = _('These comments about the series will be included in the Comments of the new book.')+'<i></i>' # for html for auto-wrap
label.setToolTip(tt)
self.mergeshow.append(label)
self.mergeddesc.setToolTip(tt)
self.mergeddesc.setWordWrap(True)
grid.addWidget(self.mergeddesc,row,1,1,-1)
self.l.addLayout(grid)
self.mergeshow.append(self.mergeddesc)
grid.setColumnStretch(1,1)
self.url = DroppableQTextEdit(self)
self.url.setToolTip("UrlTooltip")
self.url.setLineWrapMode(QTextEdit.NoWrap)
self.l.addWidget(self.url)
self.groupbox = QGroupBox(_("Show Download Options"))
self.groupbox.setCheckable(True)
self.groupbox.setFlat(True)
@ -399,18 +429,32 @@ class AddNewDialog(SizePersistedDialog):
count=""
if url_list_text:
count = " " + _("(%s URLs found)")%len(url_list_text.split()) # count lines
self.toplabel.setText(_('Story URLs for anthology, one per line:') + count)
self.toplabel.setText('<b>'+_('Story URLs for anthology, one per line:') + count + '</b>')
self.url.setToolTip(_('URLs for stories to include in the anthology, one per line.\nWill take URLs from clipboard, but only valid URLs.'))
self.collisionlabel.setText(_('If Story Already Exists in Anthology?'))
self.collision.setToolTip(_("What to do if there's already an existing story with the same URL in the anthology."))
for widget in self.mergehide:
widget.setVisible(False)
for widget in self.mergeshow:
widget.setVisible(True)
if not self.newmerge:
for widget in self.mergeupdateshow:
widget.setVisible(True)
n = extraoptions.get('frompage',{}).get('name',None)
if n:
self.mergedname.setText(n)
else:
self.mergedname.setVisible(False)
d = extraoptions.get('frompage',{}).get('desc',None)
if d:
self.mergeddesc.setText(unicode(d))
else:
self.mergeddesc.setVisible(False)
else:
for widget in self.mergehide:
widget.setVisible(True)
for widget in self.mergeshow:
widget.setVisible(False)
self.toplabel.setText(_('Story URLs, one per line:'))
self.url.setToolTip(_('URLs for stories, one per line.\nWill take URLs from clipboard, but only valid URLs.\nAdd [1,5] after the URL to limit the download to chapters 1-5.'))
self.collisionlabel.setText(_('If Story Already Exists?'))
@ -1328,7 +1372,7 @@ class IniTextDialog(SizePersistedDialog):
self.textedit.setReadOnly(read_only)
self.textedit.setText(text)
self.textedit.setText(ensure_text(text))
self.l.addWidget(self.textedit)
self.lastStart = 0

View file

@ -309,58 +309,85 @@ class FanFicFarePlugin(InterfaceAction):
unique_name='&Update Existing FanFiction Books',
triggered=self.update_dialog)
if prefs['imapserver'] and prefs['imapuser'] and prefs['imapfolder']:
self.get_list_imap_action = self.create_menu_item_ex(self.menu, _('Get Story URLs from &Email'), image='view.png',
unique_name='Get Story URLs from IMAP',
triggered=self.get_urls_from_imap_menu)
self.get_list_imap_action = self.create_menu_item_ex(self.menu, _('Get Story URLs from &Email'), image='view.png',
unique_name='Get Story URLs from IMAP',
triggered=self.get_urls_from_imap_menu)
self.get_list_imap_action.setVisible( bool(prefs['imapserver'] and prefs['imapuser'] and prefs['imapfolder']) )
self.get_list_url_action = self.create_menu_item_ex(self.menu, _('Get Story URLs from Web Page'), image='view.png',
unique_name='Get Story URLs from Web Page',
triggered=self.get_urls_from_page_menu)
self.get_list_action = self.create_menu_item_ex(self.menu, _('Get Story URLs from Selected Books'),
unique_name='Get URLs from Selected Books',
image='bookmarks.png',
triggered=self.list_story_urls)
if self.get_epubmerge_plugin():
self.menu.addSeparator()
self.makeanth_action = self.create_menu_item_ex(self.menu, _('&Make Anthology Epub from URLs'), image='plusplus.png',
unique_name='Make FanFiction Anthology Epub from URLs',
shortcut_name=_('Make FanFiction Anthology Epub from URLs'),
triggered=partial(self.add_dialog,merge=True) )
self.get_anthlist_url_action = self.create_menu_item_ex(self.menu, _('Make Anthology Epub from Web Page'), image='view.png',
unique_name='Make FanFiction Anthology Epub from Web Page',
shortcut_name=_('Make FanFiction Anthology Epub from Web Page'),
triggered=partial(self.get_urls_from_page_menu,anthology=True))
self.menu.addSeparator()
anth_on = bool(self.get_epubmerge_plugin())
self.anth_sub_menu = self.menu.addMenu(_('Anthology Options'))
self.get_anthlist_url_action = self.create_menu_item_ex(self.anth_sub_menu, _('Make Anthology Epub from Web Page'),
image='view.png',
unique_name='Make FanFiction Anthology Epub from Web Page',
shortcut_name=_('Make FanFiction Anthology Epub from Web Page'),
triggered=partial(self.get_urls_from_page_menu,anthology=True))
self.updateanth_action = self.create_menu_item_ex(self.menu, _('Update Anthology Epub'), image='plusplus.png',
unique_name='Update FanFiction Anthology Epub',
shortcut_name=_('Update FanFiction Anthology Epub'),
triggered=self.update_anthology)
self.makeanth_action = self.create_menu_item_ex(self.anth_sub_menu, _('&Make Anthology Epub from URLs'),
image='plusplus.png',
unique_name='Make FanFiction Anthology Epub from URLs',
shortcut_name=_('Make FanFiction Anthology Epub from URLs'),
triggered=partial(self.add_dialog,merge=True) )
if 'Reading List' in self.gui.iactions and (prefs['addtolists'] or prefs['addtoreadlists']) :
self.menu.addSeparator()
addmenutxt, rmmenutxt = None, None
if prefs['addtolists'] and prefs['addtoreadlists'] :
addmenutxt = _('Mark Unread: Add to "To Read" and "Send to Device" Lists')
if prefs['addtolistsonread']:
rmmenutxt = _('Mark Read: Remove from "To Read" and add to "Send to Device" Lists')
else:
rmmenutxt = _('Mark Read: Remove from "To Read" Lists')
elif prefs['addtolists'] :
addmenutxt = _('Add to "Send to Device" Lists')
elif prefs['addtoreadlists']:
addmenutxt = _('Mark Unread: Add to "To Read" Lists')
self.updateanth_action = self.create_menu_item_ex(self.anth_sub_menu, _('Update Anthology Epub'),
image='plusplus.png',
unique_name='Update FanFiction Anthology Epub',
shortcut_name=_('Update FanFiction Anthology Epub'),
triggered=self.update_anthology)
# Make, but set invisible--that way they still appear in
# keyboard shortcuts (and can be set/reset) even when not
# available. Set actions, not just sub invisible because
# that also serves to disable them.
for ac in (self.anth_sub_menu.menuAction(),
self.get_anthlist_url_action,
self.makeanth_action,
self.updateanth_action):
ac.setVisible(anth_on)
rl_on = bool('Reading List' in self.gui.iactions and (prefs['addtolists'] or prefs['addtoreadlists']))
self.rl_sub_menu = self.menu.addMenu(_('Reading List Options'))
addmenutxt, rmmenutxt = None, None
if prefs['addtolists'] and prefs['addtoreadlists'] :
addmenutxt = _('Mark Unread: Add to "To Read" and "Send to Device" Lists')
if prefs['addtolistsonread']:
rmmenutxt = _('Mark Read: Remove from "To Read" and add to "Send to Device" Lists')
else:
rmmenutxt = _('Mark Read: Remove from "To Read" Lists')
elif prefs['addtolists'] :
addmenutxt = _('Add to "Send to Device" Lists')
elif prefs['addtoreadlists']:
addmenutxt = _('Mark Unread: Add to "To Read" Lists')
rmmenutxt = _('Mark Read: Remove from "To Read" Lists')
if addmenutxt:
self.add_send_action = self.create_menu_item_ex(self.menu, addmenutxt,
unique_name='Add to "To Read" and "Send to Device" Lists',
image='plusplus.png',
triggered=partial(self.update_lists,add=True))
add_off = not addmenutxt
if add_off:
addmenutxt = _('Add to Lists Not Configured')
if rmmenutxt:
self.add_remove_action = self.create_menu_item_ex(self.menu, rmmenutxt,
unique_name='Remove from "To Read" and add to "Send to Device" Lists',
image='minusminus.png',
triggered=partial(self.update_lists,add=False))
self.add_send_action = self.create_menu_item_ex(self.rl_sub_menu, addmenutxt,
unique_name='Add to "To Read" and "Send to Device" Lists',
image='plusplus.png',
triggered=partial(self.update_lists,add=True))
self.add_send_action.setVisible(rl_on and not add_off)
rm_off = not rmmenutxt
if rm_off:
rmmenutxt = _('Remove from Lists Not Configured')
self.add_remove_action = self.create_menu_item_ex(self.rl_sub_menu, rmmenutxt,
unique_name='Remove from "To Read" and add to "Send to Device" Lists',
image='minusminus.png',
triggered=partial(self.update_lists,add=False))
self.add_remove_action.setVisible(rl_on and not rm_off)
self.rl_sub_menu.menuAction().setVisible(rl_on)
self.menu.addSeparator()
self.get_list_action = self.create_menu_item_ex(self.menu, _('Remove "New" Chapter Marks from Selected books'),
@ -368,20 +395,15 @@ class FanFicFarePlugin(InterfaceAction):
image='edit-undo.png',
triggered=self.unnew_books)
self.menu.addSeparator()
self.get_list_action = self.create_menu_item_ex(self.menu, _('Get Story URLs from Selected Books'),
unique_name='Get URLs from Selected Books',
image='bookmarks.png',
triggered=self.list_story_urls)
self.reject_list_action = self.create_menu_item_ex(self.menu, _('Reject Selected Books'),
unique_name='Reject Selected Books', image='rotate-right.png',
triggered=self.reject_list_urls)
# self.menu.addSeparator()
# print("platform.system():%s"%platform.system())
# print("platform.mac_ver()[0]:%s"%platform.mac_ver()[0])
if not self.check_macmenuhack(): # not platform.mac_ver()[0]: # Some macs crash on these menu items for unknown reasons.
self.menu.addSeparator()
# self.menu.addSeparator()
self.config_action = self.create_menu_item_ex(self.menu, _('&Configure FanFicFare'),
image= 'config.png',
unique_name='Configure FanFicFare',
@ -464,8 +486,7 @@ class FanFicFarePlugin(InterfaceAction):
self.unnew_books()
def get_urls_from_imap_menu(self):
if not prefs['imapserver'] or not prefs['imapuser'] or not prefs['imapfolder']:
if not (prefs['imapserver'] and prefs['imapuser'] and prefs['imapfolder']):
s=_('FanFicFare Email Settings are not configured.')
info_dialog(self.gui, s, s, show=True, show_copy_button=False)
return
@ -533,7 +554,9 @@ class FanFicFarePlugin(InterfaceAction):
if prefs['imaptags']:
message="<p>"+_("Tag(s) <b><i>%s</i></b> will be added to all stories downloaded in the next dialog, including any story URLs you add manually.")%prefs['imaptags']+"</p>"
confirm(message,'fff_add_imaptags', self.gui, show_cancel_button=False)
self.add_dialog("\n".join(url_list),merge=False,add_tag=prefs['imaptags'])
self.add_dialog("\n".join(url_list),
merge=False,
extraoptions={'add_tag':prefs['imaptags']})
else:
msg = _('No Valid Story URLs Found in Unread Emails.')
if reject_list:
@ -565,12 +588,16 @@ class FanFicFarePlugin(InterfaceAction):
with busy_cursor():
self.gui.status_bar.show_message(_('Fetching Story URLs from Page...'))
url_list = self.get_urls_from_page(url)
frompage = self.get_urls_from_page(url)
url_list = frompage.get('urllist',[])
self.gui.status_bar.show_message(_('Finished Fetching Story URLs from Page.'),3000)
if url_list:
self.add_dialog("\n".join(url_list),merge=d.anthology,anthology_url=url)
self.add_dialog("\n".join(url_list),
merge=d.anthology,
extraoptions={'anthology_url':url,
'frompage':frompage})
else:
info_dialog(self.gui, _('List of Story URLs'),
_('No Valid Story URLs found on given page.'),
@ -578,12 +605,9 @@ class FanFicFarePlugin(InterfaceAction):
show_copy_button=False)
def get_urls_from_page(self,url):
## now returns a {} with at least 'urllist'
logger.debug("get_urls_from_page URL:%s"%url)
## some sites hide mature links unless logged in.
if 'archiveofourown.org' in url or 'fimfiction.net' in url:
configuration = get_fff_config(url)
else:
configuration = None
configuration = get_fff_config(url)
return get_urls_from_page(url,configuration)
def list_story_urls(self):
@ -765,9 +789,14 @@ class FanFicFarePlugin(InterfaceAction):
if confirm(message,'fff_reject_non_fanfiction', self.gui):
self.gui.iactions['Remove Books'].delete_books()
def add_dialog(self,url_list_text=None,merge=False,anthology_url=None,add_tag=None):
'Both new individual stories and new anthologies are created here.'
def add_dialog(self,
url_list_text=None,
merge=False,
extraoptions={}):
'''
Both new individual stories and new anthologies are created here.
Expected extraoptions entries: anthology_url, add_tag, frompage
'''
if not url_list_text:
url_list = self.get_urls_clip()
url_list_text = "\n".join(url_list)
@ -779,7 +808,7 @@ class FanFicFarePlugin(InterfaceAction):
self.prep_downloads,
merge=merge,
newmerge=True,
extraoptions={'anthology_url':anthology_url,'add_tag':add_tag})
extraoptions=extraoptions)
def update_anthology(self):
if not self.get_epubmerge_plugin():
@ -830,8 +859,11 @@ class FanFicFarePlugin(InterfaceAction):
# get list from identifiers:url/uri if present, but only if
# it's *not* a valid story URL.
mergeurl = self.get_story_url(db,book_id)
frompage = {}
if mergeurl and not self.is_good_downloader_url(mergeurl):
url_list = [ adapters.getNormalStoryURL(url) for url in self.get_urls_from_page(mergeurl) ]
frompage = self.get_urls_from_page(mergeurl)
url_list = [ adapters.getNormalStoryURL(url) for url in frompage.get('urllist',[]) ]
frompage['urllist']=url_list
url_list_text = "\n".join(url_list)
@ -848,7 +880,8 @@ class FanFicFarePlugin(InterfaceAction):
merge=True,
newmerge=False,
extrapayload=urlmapfile,
extraoptions={'tdir':tdir,
extraoptions={'frompage':frompage,
'tdir':tdir,
'mergebook':mergebook})
# Need to use AddNewDialog modal here because it's an update
# of an existing book. Don't want the user deleting it or
@ -965,9 +998,6 @@ class FanFicFarePlugin(InterfaceAction):
def prep_downloads(self, options, books, merge=False, extrapayload=None):
'''Fetch metadata for stories from servers, launch BG job when done.'''
logger.debug("add_tag:%s"%options.get('add_tag',None))
if isinstance(books, string_types):
url_list = split_text_to_urls(books)
books = self.convert_urls_to_books(url_list)
@ -1322,7 +1352,7 @@ class FanFicFarePlugin(InterfaceAction):
<p>%s</p>
<p>%s</p>'''%(
_('Change Story URL?'),
_('<b>%s</b> by <b>%s</b> is already in your library with a different source URL:')%(mi.title,', '.join(mi.author)),
_('<b>%(title)s</b> by <b>%(author)s</b> is already in your library with a different source URL:')%{'title':mi.title,'author':', '.join(mi.author)},
_('In library: <a href="%(liburl)s">%(liburl)s</a>')%{'liburl':liburl},
_('New URL: <a href="%(newurl)s">%(newurl)s</a>')%{'newurl':book['url']},
_("Click '<b>Yes</b>' to update/overwrite book with new URL."),
@ -1336,7 +1366,7 @@ class FanFicFarePlugin(InterfaceAction):
<p>%s</p>
<p>%s</p>'''%(
_('Download as New Book?'),
_('<b>%s</b> by <b>%s</b> is already in your library with a different source URL.')%(mi.title,', '.join(mi.author)),
_('<b>%(title)s</b> by <b>%(author)s</b> is already in your library with a different source URL.')%{'title':mi.title,'author':', '.join(mi.author)},
_('You chose not to update the existing book. Do you want to add a new book for this URL?'),
_('New URL: <a href="%(newurl)s">%(newurl)s</a>')%{'newurl':book['url']},
_("Click '<b>Yes</b>' to a new book with new URL."),
@ -1585,7 +1615,6 @@ class FanFicFarePlugin(InterfaceAction):
errorcol_label=None,
lastcheckedcol_label=None):
logger.debug("add_tag:%s"%options.get('add_tag',None))
if options.get('add_tag',False):
book['tags'].extend(options.get('add_tag').split(','))
@ -1806,13 +1835,7 @@ class FanFicFarePlugin(InterfaceAction):
if 'mergebook' in options:
existingbook = options['mergebook']
#print("existingbook:\n%s"%existingbook)
mergebook = self.merge_meta_books(existingbook,good_list,options['fileform'])
if 'mergebook' in options:
mergebook['calibre_id'] = options['mergebook']['calibre_id']
if 'anthology_url' in options:
mergebook['url'] = options['anthology_url']
mergebook = self.merge_meta_books(existingbook,good_list,options)
#print("mergebook:\n%s"%mergebook)
@ -2571,7 +2594,7 @@ class FanFicFarePlugin(InterfaceAction):
def is_good_downloader_url(self,url):
return adapters.getNormalStoryURL(url)
def merge_meta_books(self,existingbook,book_list,fileform):
def merge_meta_books(self,existingbook,book_list,options):
book = self.make_book()
book['author'] = []
book['tags'] = []
@ -2672,10 +2695,12 @@ class FanFicFarePlugin(InterfaceAction):
logger.debug("book['url']:%s"%book['url'])
book['comments'] = '<div><p>' +_("Anthology containing:")+"</p>\n\n"
## if series explicitly collected, include desc, if it's there.
d = options.get('frompage',{}).get('desc','')
book['comments'] = '<div>'+d+'<p>' +_("Anthology containing:")+"</p>\n\n"
wraptitle = lambda x : '<p><b>'+x+'</b></p>\n'
if len(book['author']) > 1:
mkbooktitle = lambda x : wraptitle(_("%s by %s") % (x['title'],' & '.join(x['author'])))
mkbooktitle = lambda x : wraptitle(_("%(title)s by %(author)s") % {'title':x['title'],'author':' & '.join(x['author'])})
else:
mkbooktitle = lambda x : wraptitle(x['title'])
@ -2694,7 +2719,7 @@ class FanFicFarePlugin(InterfaceAction):
book['comments'] += '</div>'
logger.debug(book['comments'])
configuration = get_fff_config(book['url'],fileform)
configuration = get_fff_config(book['url'],options['fileform'])
if existingbook:
book['title'] = deftitle = existingbook['title']
if prefs['anth_comments_newonly']:
@ -2704,25 +2729,30 @@ class FanFicFarePlugin(InterfaceAction):
# book['all_metadata']['description']
series = None
logger.debug("serieslists:%s"%serieslists)
# if all same series, use series for name. But only if all and not previous named
if len(serieslist) == len(book_list):
series = serieslist[0]
book['title'] = series
for sr in serieslist:
if series != sr:
book['title'] = deftitle
series = None
break
if not series and serieslists:
# for multiple series sites: if all stories are
# members of the same series, use it. Or the first
# one, rather.
common_series = get_common_elements(serieslists)
logger.debug("common_series:%s"%common_series)
if common_series:
series = common_series[0]
n = options.get('frompage',{}).get('name',None)
if n:
# series explicitly parsed, use name.
book['title'] = series = n
else:
logger.debug("serieslists:%s"%serieslists)
# if all same series, use series for name. But only if all and not previous named
if len(serieslist) == len(book_list):
series = serieslist[0]
book['title'] = series
for sr in serieslist:
if series != sr:
book['title'] = deftitle
series = None
break
if not series and serieslists:
# for multiple series sites: if all stories are
# members of the same series, use it. Or the first
# one, rather.
common_series = get_common_elements(serieslists)
logger.debug("common_series:%s"%common_series)
if common_series:
series = common_series[0]
book['title'] = series
if prefs['setanthologyseries'] and book['title'] == series:
book['series'] = series+' [0]'
@ -2742,9 +2772,20 @@ class FanFicFarePlugin(InterfaceAction):
for v in ['Completed','In-Progress']:
if v in book['tags']:
book['tags'].remove(v)
## some adapters, like AO3, may have series status.
s = options.get('frompage',{}).get('status','')
if s:
book['all_metadata']['status'] = s
book['tags'].append(s)
book['tags'].extend(configuration.getConfigList('anthology_tags'))
book['all_metadata']['anthology'] = "true"
if 'mergebook' in options:
book['calibre_id'] = options['mergebook']['calibre_id']
if 'anthology_url' in options:
book['url'] = options['anthology_url']
return book
def split_text_to_urls(urls):

View file

@ -5,7 +5,7 @@ from __future__ import (unicode_literals, division, absolute_import,
import six
__license__ = 'GPL v3'
__copyright__ = '2018, Jim Miller, 2011, Grant Drake <grant.drake@gmail.com>'
__copyright__ = '2020, Jim Miller, 2011, Grant Drake <grant.drake@gmail.com>'
__docformat__ = 'restructuredtext en'
import logging
@ -90,7 +90,7 @@ def do_download_worker(book_list,
book_list.append(job.result)
book_id = job._book['calibre_id']
count = count + 1
notification(float(count)/total, _('%d of %d stories finished downloading')%(count,total))
notification(float(count)/total, _('%(count)d of %(total)d stories finished downloading')%{'count':count,'total':total})
# Add this job's output to the current log
logger.info('Logfile for book ID %s (%s)'%(book_id, job._book['title']))
logger.info(job.details)
@ -297,8 +297,8 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
inject_cal_cols(book,story,configuration)
writer.writeStory(outfilename=outfile, forceOverwrite=True)
book['comment'] = _('Update %s completed, added %s chapters for %s total.')%\
(options['fileform'],(urlchaptercount-chaptercount),urlchaptercount)
book['comment'] = _('Update %(fileform)s completed, added %(added)s chapters for %(total)s total.')%\
{'fileform':options['fileform'],'added':(urlchaptercount-chaptercount),'total':urlchaptercount}
book['all_metadata'] = story.getAllMetadata(removeallentities=True)
if options['savemetacol'] != '':
book['savemetacol'] = story.dump_html_metadata()

View file

@ -561,6 +561,14 @@ storynotes_label:Story Notes
add_to_extra_titlepage_entries:,storynotes
[base_xenforoforum]
## Some sites require login for some stories
#username:YourName
#password:yourpassword
## XenForo sites require login for some stories, but don't report that
## to FFF. To download those, set your username, password and set
## always_login:false
#always_login:false
## We've been requested by the site(s) admin to rein in hits. If you
## download fewer stories less often you can likely get by with
@ -1213,65 +1221,6 @@ extra_titlepage_entries:eroticatags,disclaimer
#username:YourName
#password:yourpassword
[archive.hpfanfictalk.com]
## Some sites also require the user to confirm they are adult for
## adult content. In commandline version, this should go in your
## personal.ini, not defaults.ini.
#is_adult:true
add_to_extra_valid_entries:,themes,inclusivity,house,
series00,series00Url,series00HTML,
series01,series01Url,series01HTML,
series02,series02Url,series02HTML,
series03,series03Url,series03HTML,
series04,series04Url,series04HTML,
series05,series05Url,series05HTML,
series06,series06Url,series06HTML,
series07,series07Url,series07HTML,
series08,series08Url,series08HTML,
series09,series09Url,series09HTML,
## Assume entryUrl, apply to "<a class='%slink' href='%s'>%s</a>" to
## make entryHTML.
make_linkhtml_entries:series00,series01,series02,series03,series04,
series05,series06,series07,series08,series09
themes_label:Themes
inclusivity_label:Inclusivity
house_label:HPFT Forum House
## series00 will be the same as common metadata series.
series00HTML_label:Series
series01HTML_label:Additional Series
series02HTML_label:Additional Series
series03HTML_label:Additional Series
series04HTML_label:Additional Series
series05HTML_label:Additional Series
series06HTML_label:Additional Series
series07HTML_label:Additional Series
series08HTML_label:Additional Series
series09HTML_label:Additional Series
## Try to collect series names and numbers of this story in those
## series. This lets us turn it on and off by site without keeping a
## lengthy titlepage_entries per site and prevents it updating in the
## plugin.
collect_series: true
add_to_extra_titlepage_entries:,series01HTML,series02HTML,series03HTML,
series04HTML,series05HTML,series06HTML,series07HTML,series08HTML,series09HTML
## archive.hpfanfictalk.com takes margins away, even from p tags, by
## default. So authors have to either include extra br/p tags or
## their own styles. These allow for both, but leave you at the mercy
## of author CSS.
add_to_output_css:
* {
margin: 0;
padding: 0;
}
add_to_keep_html_attrs:,style
[archive.shriftweb.org]
website_encodings:Windows-1252,utf8,iso-8859-1
@ -1678,6 +1627,72 @@ make_linkhtml_entries:translators,betas
## can change it.
include_in_category:fandoms
[fanfictalk.com]
## Some sites also require the user to confirm they are adult for
## adult content. In commandline version, this should go in your
## personal.ini, not defaults.ini.
#is_adult:true
add_to_extra_valid_entries:,tropes,themes,representation,inclusivity,
house,storytype,contentwarnings,
series00,series00Url,series00HTML,
series01,series01Url,series01HTML,
series02,series02Url,series02HTML,
series03,series03Url,series03HTML,
series04,series04Url,series04HTML,
series05,series05Url,series05HTML,
series06,series06Url,series06HTML,
series07,series07Url,series07HTML,
series08,series08Url,series08HTML,
series09,series09Url,series09HTML,
# fields changed name with domain name change.
include_in_inclusivity:representation
include_in_themes:tropes
## Assume entryUrl, apply to "<a class='%slink' href='%s'>%s</a>" to
## make entryHTML.
make_linkhtml_entries:series00,series01,series02,series03,series04,
series05,series06,series07,series08,series09
tropes_label:Tropes
representation_label:Representation
house_label:HPFT Forum House
storytype_label:Story Type
contentwarnings_label:Content Warnings
## series00 will be the same as common metadata series.
series00HTML_label:Series
series01HTML_label:Additional Series
series02HTML_label:Additional Series
series03HTML_label:Additional Series
series04HTML_label:Additional Series
series05HTML_label:Additional Series
series06HTML_label:Additional Series
series07HTML_label:Additional Series
series08HTML_label:Additional Series
series09HTML_label:Additional Series
## Try to collect series names and numbers of this story in those
## series. This lets us turn it on and off by site without keeping a
## lengthy titlepage_entries per site and prevents it updating in the
## plugin.
collect_series: true
#add_to_extra_titlepage_entries:,tropes,themes,representation,inclusivity,house,storytype,contentwarnings,series01HTML,series02HTML,series03HTML,
# series04HTML,series05HTML,series06HTML,series07HTML,series08HTML,series09HTML
## fanfictalk.com takes margins away, even from p tags, by default.
## So authors have to either include extra br/p tags or their own
## styles. These allow for both, but leave you at the mercy of author
## CSS.
add_to_output_css:
* {
margin: 0;
padding: 0;
}
add_to_keep_html_attrs:,style
[fanfiction-junkies.de]
website_encodings:Windows-1252,utf8
@ -3013,6 +3028,17 @@ sitetags_label:Site tags
## Attempt to fix p and br excess from HTML in great many stories
fix_excess_space:false
[www.novelupdates.cc]
## Note that novelupdates.cc != novelupdates.com
## There is reason to believe that novelupdates.cc may be a
## replacement for wuxiaworld.co, but currently both exist with
## different data.
## When dedup_order_chapter_list:true, use a heuristic algorithm
## specific to novelupdates.cc order and dedup chapters.
dedup_order_chapter_list:false
[www.phoenixsong.net]
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
@ -3043,7 +3069,7 @@ sitetags_label:Site Tags
## Site dedicated to these categories/characters/ships
extracategories:Harry Potter
add_to_include_subject_tags:,takesplaces,snapeflavours,sitetags
#add_to_include_subject_tags:,takesplaces,snapeflavours,sitetags
#add_to_extra_titlepage_entries:,stars,reviews,reads,takesplaces,snapeflavours,sitetags
website_encodings:Windows-1252,utf8
@ -3090,6 +3116,11 @@ extratags:
## add_to_output_css example for [base_xenforoforum:epub].
#legend_spoilers:true
## royalroad.com chapters can have author notes attached to them.
## Setting include_author_notes:true will include them with the
## chapter text.
#include_author_notes:true
[www.scarvesandcoffee.net]
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,
@ -3108,6 +3139,12 @@ views_label:Views
averageWords_label:Average Words (Chapter)
add_to_titlepage_entries:,views, averageWords
## Scribble Hub chapters can include author's notes and news blocks. We've
## traditionally included them all in the chapter text, but this allows
## you to customize which you include. Copy this parameter to your
## personal.ini and list the ones you don't want.
#exclude_notes:authornotes,newsboxes
[www.siye.co.uk]
## Site dedicated to these categories/characters/ships
extracategories:Harry Potter
@ -3148,6 +3185,12 @@ extracategories:Lord of the Rings
website_encodings:Windows-1252,utf8
[www.the-sietch.com]
## see [base_xenforoforum]
## the-sietch.com shows more posts per reader page than other XF sites.
reader_posts_per_page:15
[www.thedelphicexpanse.com]
## Site dedicated to these categories/characters/ships
extracategories:Star Trek: Enterprise
@ -3330,12 +3373,6 @@ website_encodings:Windows-1252,utf8
## specific to wuxiaworld.co order and dedup chapters.
dedup_order_chapter_list:false
[www.novelupdates.cc]
## Note that novelupdates.cc != novelupdates.com
## When dedup_order_chapter_list:true, use a heuristic algorithm
## specific to novelupdates.cc order and dedup chapters.
dedup_order_chapter_list:false
[www.wuxiaworld.com]
user_agent:Mozilla/5.0
## Authors on wuxiaworld.com create their own index pages, so it's not

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -166,13 +166,14 @@ from . import adapter_hentaifoundrycom
from . import adapter_mugglenetfanfictioncom
from . import adapter_swiorgru
from . import adapter_fanficsme
from . import adapter_archivehpfanfictalkcom
from . import adapter_fanfictalkcom
from . import adapter_scifistoriescom
from . import adapter_silmarillionwritersguildorg
from . import adapter_chireadscom
from . import adapter_scribblehubcom
from . import adapter_fictionlive
from . import adapter_wuxiaworldsite
from . import adapter_thesietchcom
## This bit of complexity allows adapters to be added by just adding
## importing. It eliminates the long if/else clauses we used to need

View file

@ -38,9 +38,9 @@ class WWWAlternatehistoryComAdapter(BaseXenForo2ForumAdapter):
return 'www.alternatehistory.com'
@classmethod
def getURLPrefix(cls):
def getPathPrefix(cls):
# in case it needs more than just site/
return 'https://' + cls.getSiteDomain() + '/forum'
return '/forum/'
def get_threadmarks_top(self,souptag):
return souptag.find('div',{'class':'block-outer-opposite--threadmarks'})

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2014 Fanficdownloader team, 2018 FanFicFare team
# Copyright 2014 Fanficdownloader team, 2020 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -396,7 +396,7 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
self.story.setMetadata('seriesUrl',series_url)
def hookForUpdates(self,chaptercount):
if self.oldchapters and len(self.oldchapters) > self.newestChapterNum:
if self.newestChapterNum and self.oldchapters and len(self.oldchapters) > self.newestChapterNum:
logger.info("Existing epub has %s chapters\nNewest chapter is %s. Discarding old chapters from there on."%(len(self.oldchapters), self.newestChapterNum+1))
self.oldchapters = self.oldchapters[:self.newestChapterNum]
return len(self.oldchapters)
@ -572,3 +572,51 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
# logger.debug(skip_tag)
return self.utf8FromSoup(url,save_chapter)
def before_get_urls_from_page(self,url,normalize):
# special stuff to log into archiveofourown.org, if possible.
# Unlike most that show the links to 'adult' stories, but protect
# them, AO3 doesn't even show them if not logged in. Only works
# with saved user/pass--not going to prompt for list.
if self.getConfig("username"):
if self.getConfig("is_adult"):
if '?' in url:
addurl = "&view_adult=true"
else:
addurl = "?view_adult=true"
else:
addurl=""
# just to get an authenticity_token.
data = self._fetchUrl(url+addurl)
# login the session.
self.performLogin(url,data)
# get the list page with logged in session.
def get_series_from_page(self,url,data,normalize=False):
'''
This method is to make it easier for adapters to detect a
series URL, pick out the series metadata and list of storyUrls
to return without needing to override get_urls_from_page
entirely.
'''
## easiest way to get all the weird URL possibilities and stay
## up to date with future changes.
m = re.match(self.getSiteURLPattern().replace('/works/','/series/'),url)
if m:
soup = self.make_soup(data)
retval = {}
retval['urllist']=[ 'https://'+self.host+a['href'] for a in soup.select('h4.heading a:first-child') ]
retval['name']=stripHTML(soup.select_one("h2.heading"))
desc=soup.select_one("div.wrapper dd blockquote.userstuff")
if desc:
desc.name='div' # change blockquote to div to match stories.
retval['desc']=desc
stats=stripHTML(soup.select_one("dl.series dl.stats"))
if 'Complete:Yes' in stats:
retval['status'] = "Completed"
elif 'Complete:No' in stats:
retval['status'] = "In-Progress"
return retval
## return dict with at least {'urllist':['storyUrl','storyUrl',...]}
## optionally 'name' and 'desc'?
return {}

View file

@ -30,11 +30,11 @@ from ..six.moves.urllib.error import HTTPError
from .base_adapter import BaseSiteAdapter, makeDate
def getClass():
return ArchiveHPfanfictalkComAdapter
return FanfictalkComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class ArchiveHPfanfictalkComAdapter(BaseSiteAdapter):
class FanfictalkComAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
@ -48,7 +48,7 @@ class ArchiveHPfanfictalkComAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/archive/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','ahpfftc')
@ -57,17 +57,26 @@ class ArchiveHPfanfictalkComAdapter(BaseSiteAdapter):
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%d %b %Y"
@classmethod
def getAcceptDomains(cls):
return [cls.getSiteDomain(),'archive.hpfanfictalk.com']
@classmethod
def getConfigSections(cls):
"Only needs to be overriden if has additional ini sections."
return [cls.getConfigSection(),'archive.hpfanfictalk.com']
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'archive.hpfanfictalk.com'
return 'fanfictalk.com'
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/archive/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
return r"https?://(archive\.hp)?"+re.escape(self.getSiteDomain())+r"(/archive)?/viewstory\.php\?sid=\d+$"
def use_pagecache(self):
'''
@ -111,24 +120,27 @@ class ArchiveHPfanfictalkComAdapter(BaseSiteAdapter):
# Now go hunting for all the meta data and the chapter list.
pagetitle = soup.find('h3')
pagetitle = soup.select_one('div#pagetitle')
# logger.debug(pagetitle)
## Title
a = pagetitle.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
self.story.setMetadata('title',stripHTML(a))
# Find authorid and URL from... author url.
a = pagetitle.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',stripHTML(a))
for a in pagetitle.find_all('a', href=re.compile(r"viewuser.php\?uid=\d+")):
self.story.addToList('authorId',a['href'].split('=')[1])
self.story.addToList('authorUrl','https://'+self.host+'/'+a['href'])
self.story.addToList('author',stripHTML(a))
# Find the chapters:
for chapter in soup.find_all('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'http://'+self.host+'/'+chapter['href'])
self.add_chapter(chapter,'https://'+self.host+'/archive/'+chapter['href'])
# categories
for a in soup.select("div#sort a"):
self.story.addToList('category',stripHTML(a))
listbox = soup.find('div', {'class':'listbox'})
# this site has two divs with class=gb-50 and no immediate container.
gb50s = soup.find_all('div', {'class':'gb-50'})
@ -137,14 +149,15 @@ class ArchiveHPfanfictalkComAdapter(BaseSiteAdapter):
for url in urls:
self.story.addToList(metadata,stripHTML(url))
list_from_urls(listbox,r'browse.php\?type=categories','category')
list_from_urls(gb50s[0],r'browse.php\?type=characters','characters')
list_from_urls(gb50s[0],r'browse.php\?type=class&type_id=11','ships')
list_from_urls(gb50s[0],r'browse.php\?type=class&type_id=10','representation')
list_from_urls(gb50s[0],r'browse.php\?type=class&type_id=7','storytype')
list_from_urls(gb50s[0],r'browse.php\?type=class&type_id=14','house')
list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=4','genre')
list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=13','themes')
list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=8','warnings')
list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=10','inclusivity')
list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=15','contentwarnings')
list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=4','genre')
list_from_urls(gb50s[1],r'browse.php\?type=class&type_id=13','tropes')
bq = soup.find('blockquote2')
if bq:
@ -162,40 +175,27 @@ class ArchiveHPfanfictalkComAdapter(BaseSiteAdapter):
# logger.debug(value)
# logger.debug(label)
if 'Rating' in label:
# Mature Audiences · Incomplete
(rating,status) = value.split('·')
self.story.setMetadata('rating', rating)
if 'Complete' in status:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if 'Story Length' in label:
if 'Words:' in label:
stripHTML(value)
# 10 chapters (45462 words)
v = stripHTML(value)
v = v.split('(')[1]
v = v.split(' words')[0]
self.story.setMetadata('numWords', v)
self.story.setMetadata('numWords', stripHTML(value).replace('·',''))
if 'Published' in label:
if 'Published:' in label:
self.story.setMetadata('datePublished', makeDate(stripHTML(value).replace('·',''), self.dateformat))
if 'Updated' in label:
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
if 'Updated:' in label:
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value).replace('·',''), self.dateformat))
# Site allows stories to be in several series at once. FFF
# isn't thrilled with that, we have series00, series01, etc.
# Example:
# http://archive.hpfanfictalk.com/viewstory.php?sid=483
# https://fanfictalk.com/archive/viewstory.php?sid=483
if self.getConfig("collect_series"):
seriesspan = soup.find('span',label='Series')
for i, seriesa in enumerate(seriesspan.find_all('a', href=re.compile(r"viewseries\.php\?seriesid=\d+"))):
# logger.debug(seriesa)
series_name = stripHTML(seriesa)
series_url = 'https://'+self.host+'/'+seriesa['href']
series_url = 'https://'+self.host+'/archive/'+seriesa['href']
seriessoup = self.make_soup(self._fetchUrl(series_url))
storyas = seriessoup.find_all('a', href=re.compile(r'viewstory.php\?sid=\d+'))

View file

@ -401,6 +401,10 @@ class FictionLiveAdapter(BaseSiteAdapter):
if 'multiple' in chunk and chunk['multiple'] == False:
vote = [vote] # normalize to list
for v in vote:
# v should only be int, but there is at least one story where some unrelated string was returned,
# so let's just ignore non-int values here
if not isinstance(v, int):
continue
if 0 <= v <= len(choices):
output[v] += 1
return output

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team
# Copyright 2011 Fanficdownloader team, 2020 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -391,3 +391,10 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return self.utf8FromSoup(url,soup)
def before_get_urls_from_page(self,url,normalize):
## Unlike most that show the links to 'adult' stories, but protect
## them, FimF doesn't even show them if not logged in.
# data = self._fetchUrl(url)
if self.getConfig("is_adult"):
self.set_adult_cookie()

View file

@ -103,7 +103,7 @@ class NovelUpdatesCcSiteAdapter(BaseSiteAdapter):
if self.getConfig("dedup_order_chapter_list",False):
# Sort and deduplicate chapters (some stories in incorrect order and/or duplicates)
chapters_data = []
numbers_regex = re.compile('[^0-9\.]') # Everything except decimal and numbers
numbers_regex = re.compile(r'[^0-9\.]') # Everything except decimal and numbers
for ch in chapters:
chapter_title = ch.p.get_text()
chapter_url = ch['href']

View file

@ -48,10 +48,10 @@ class PonyFictionArchiveNetAdapter(BaseSiteAdapter):
# normalized story URL.
if "explicit" in self.parsedUrl.netloc:
self._setURL('http://explicit.' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://explicit.' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self.dateformat = "%d/%b/%y"
else:
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self.dateformat = "%d %b %Y"
# Each adapter needs to have a unique site abbreviation.
@ -68,10 +68,10 @@ class PonyFictionArchiveNetAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234 http://explicit."+cls.getSiteDomain()+"/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/viewstory.php?sid=1234 https://explicit."+cls.getSiteDomain()+"/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://")+r"(www\.|explicit\.)?"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
return r"https?://(www\.|explicit\.)?"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
@ -138,13 +138,13 @@ class PonyFictionArchiveNetAdapter(BaseSiteAdapter):
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'http://'+self.host+'/'+chapter['href']+addurl)
self.add_chapter(chapter,'https://'+self.host+'/'+chapter['href']+addurl)
# eFiction sites don't help us out a lot with their meta data
@ -223,7 +223,7 @@ class PonyFictionArchiveNetAdapter(BaseSiteAdapter):
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
series_url = 'https://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -173,7 +173,7 @@ class RoyalRoadAdapter(BaseSiteAdapter):
self.story.setMetadata('title',title)
# Find authorid and URL from... author url.
mt_card_social = soup.find('',{'class':'mt-card-social'})
mt_card_social = soup.find(None,{'class':'mt-card-social'})
author_link = mt_card_social('a')[-1]
if author_link:
authorId = author_link['href'].rsplit('/', 1)[1]
@ -228,7 +228,7 @@ class RoyalRoadAdapter(BaseSiteAdapter):
self.story.addToList('warnings',stripHTML(li))
# get cover
img = soup.find('',{'class':'row fic-header'}).find('img')
img = soup.find(None,{'class':'row fic-header'}).find('img')
if img:
cover_url = img['src']
self.setCoverImage(url,cover_url)
@ -252,4 +252,16 @@ class RoyalRoadAdapter(BaseSiteAdapter):
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
if self.getConfig("include_author_notes",True):
# collect both first, changing div for frontnote first
# causes confusion in the tree.
frontnote = div.find_previous('div', {'class':'author-note-portlet'})
endnote = div.find_next('div', {'class':'author-note-portlet'})
if frontnote:
# move frontnote into chapter text div.
div.insert(0,frontnote.extract())
if endnote:
# move endnote into chapter text div.
div.append(endnote.extract())
return self.utf8FromSoup(url,div)

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team
# Copyright 2012 Fanficdownloader team, 2020 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -240,3 +240,19 @@ class ScarvesAndCoffeeNetAdapter(BaseSiteAdapter):
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return self.utf8FromSoup(url,div)
def get_urls_from_page(self,url,normalize):
from ..geturls import get_urls_from_html
# this way it uses User-Agent or other special settings.
data = self._fetchUrl(url,usecache=False)
## I can't find when or why exactly this was added, but it was
## in the old code, so here it remains.
soup = self.make_soup(data)
series = self.get_series_from_page(url,data)
if series:
return series
else:
return {'urllist':get_urls_from_html(soup.find('div',{'id':'mainpage'}),
url,
configuration=self.configuration,
normalize=normalize)}

View file

@ -270,8 +270,58 @@ class ScribbleHubComAdapter(BaseSiteAdapter): # XXX
soup = self.make_soup(self._fetchUrl(url))
div = soup.find('div', {'id' : 'chp_raw'})
if div.find('div', {'class' : 'wi_authornotes'}):
div.find('div', {'class' : 'wi_authornotes'}).decompose()
exclude_notes = self.getConfigList('exclude_notes')
if 'authornotes' in exclude_notes:
# Remove author's notes
for author_notes in div.find_all('div', {'class' : 'wi_authornotes'}):
author_notes.decompose()
else:
# Reformat the author's notes
for author_notes in div.find_all('div', {'class' : 'wi_authornotes'}):
author_notes['class'] = ['fff_chapter_notes']
notes_div = soup.new_tag('div')
new_tag = soup.new_tag('b')
new_tag.string = "Author's note:"
notes_div.append(new_tag)
author_notes_body = author_notes.find('div', {'class' : 'wi_authornotes_body'})
if author_notes_body:
new_tag = soup.new_tag('blockquote')
new_tag.append(author_notes_body)
notes_div.append(new_tag)
# Clear old children from the note, then add this
author_notes.clear()
author_notes.append(notes_div)
if 'newsboxes' in exclude_notes:
# Remove author's notes
for news in div.find('div', {'class' : 'wi_news'}):
news.decompose()
else:
# Reformat the news boxes
for news in div.find_all('div', {'class' : 'wi_news'}):
news['class'] = ['fff_chapter_notes']
notes_div = soup.new_tag('div')
news_title = news.find('div', {'class' : 'wi_news_title'})
if news_title:
new_tag = soup.new_tag('b')
new_tag.string = news_title.get_text()
notes_div.append(new_tag)
news_body = news.find('div', {'class' : 'wi_news_body'})
if news_body:
new_tag = soup.new_tag('blockquote')
new_tag.append(news_body)
notes_div.append(new_tag)
# Clear old children from the news box, then add this
news.clear()
news.append(notes_div)
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2019 FanFicFare team
# Copyright 2011 Fanficdownloader team, 2020 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -48,7 +48,7 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/siye/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/siye/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','siye') # XXX
@ -68,10 +68,10 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/siye/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/siye/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://")+r"(www\.)?siye\.co\.uk/(siye/)?"+re.escape("viewstory.php?sid=")+r"\d+$"
return r"https?://(www\.)?siye\.co\.uk/(siye/)?"+re.escape("viewstory.php?sid=")+r"\d+$"
def use_pagecache(self):
'''
@ -108,7 +108,7 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
if a is None:
raise exceptions.StoryDoesNotExist(self.url)
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/siye/'+a['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/siye/'+a['href'])
self.story.setMetadata('author',a.string)
# need(or easier) to pull other metadata from the author's list page.
@ -126,7 +126,7 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
# Find the chapters (from soup, not authsoup):
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.add_chapter(chapter,'http://'+self.host+'/siye/'+chapter['href'])
self.add_chapter(chapter,'https://'+self.host+'/siye/'+chapter['href'])
if self.num_chapters() < 1:
self.add_chapter(self.story.getMetadata('title'),url)
@ -177,7 +177,7 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
nxt = label.next_sibling
while nxt and "Hitcount:" not in stripHTML(nxt):
summary += "%s"%nxt
logger.debug(summary)
# logger.debug(summary)
nxt = nxt.next_sibling
if summary.strip().endswith("<br/>"):
summary = summary.strip()[0:-len("<br/>")]
@ -221,7 +221,7 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
# Find Series name from series URL.
a = titlea.findPrevious('a', href=re.compile(r"series.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
series_url = 'https://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -25,6 +25,7 @@ from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
# py2 vs py3 transition
from ..six.moves.urllib.parse import urlparse, urlunparse
from ..six import text_type as unicode
from ..six.moves.urllib.error import HTTPError
@ -81,57 +82,98 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
## only one theme is supported.
return "Classic"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
if self.needToLogin \
or 'Free Registration' in data \
return 'Free Registration' in data \
or "Invalid Password!" in data \
or "Invalid User Name!" in data \
or "Log In" in data \
or "Access to unlinked chapters requires" in data \
or "Log in to Storiesonline" in data :
self.needToLogin = True
return self.needToLogin
or "Log in to Storiesonline" in data \
or "WLPC log in System" in data
def performLogin(self, url):
params = {}
if self.password:
params['theusername'] = self.username
params['thepassword'] = self.password
username = self.username
password = self.password
else:
params['theusername'] = self.getConfig("username")
params['thepassword'] = self.getConfig("password")
params['rememberMe'] = '1'
params['submit'] = 'Login'
username = self.getConfig("username")
password = self.getConfig("password")
loginUrl = 'https://' + self.getSiteDomain() + '/sol-secure/login.php'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['theusername']))
username))
if not params['theusername'] or not params['thepassword']:
if not username or not password:
logger.info("Login Required for URL %s" % loginUrl)
raise exceptions.FailedToLogin(url,params['theusername'])
raise exceptions.FailedToLogin(url,username)
## fetch 'v' code from login page.
soup = self.make_soup(self._fetchUrl(loginUrl,usecache=False))
## Site now uses a two POST login system on a different
## domain. At least it appears shared between storiesonline
## and finestories.
## fetch 'v' code, post action and redirected domain from login page.
(data,opened) = self._fetchUrlOpened(loginUrl,
usecache=False)
# logger.debug(data)
if not self.needToLoginCheck(data):
## hitting login URL reminds system we're logged in?
logger.debug("don't need to login")
return
useurl = opened.geturl()
soup = self.make_soup(data)
params = {}
params['v']=soup.find('input', {'name':'v'})['value']
params['email'] = username
params['cmd'] = 'SubmitEmail'
postAction = soup.find('form')['action']
parsedUrl = urlparse(useurl)
postUrl = urlunparse((parsedUrl.scheme,
parsedUrl.netloc,
postAction,
'','',''))
# try:
data = self._postUrl(postUrl,params,usecache=False)
# logger.debug(data)
# except HTTPError as e:
# if e.code == 307:
# logger.debug("HTTP Error 307: Temporary Redirect -- assumed to be valid login for this site")
# return
soup = self.make_soup(data)
params['v']=soup.find('input', {'name':'v'})['value']
params['password'] = password
params['cmd'] = 'Log In'
# postAction = soup.find('form')['action']
# parsedUrl = urlparse(useurl)
# postUrl = urlunparse(urlunparse(
# (parsedUrl.scheme,
# parsedUrl.netloc,
# postAction,
# '','',''))
try:
d = self._postUrl(loginUrl,params,usecache=False)
self.needToLogin = False
data = self._postUrl(postUrl,params,usecache=False)
# logger.debug(data)
except HTTPError as e:
if e.code == 307:
logger.debug("HTTP Error 307: Temporary Redirect -- assumed to be valid login for this site")
return True
logger.debug("e Location:%s"%e.headers['Location'])
try:
## need to hit redirect URL so cookies get set for
## the story site domain. I think.
data = self._postUrl(e.headers['Location'],params,usecache=False)
except HTTPError as e:
if e.code == 307:
# logger.debug(e)
return
if self.needToLoginCheck(d):
if self.needToLoginCheck(data):
logger.info("Failed to login to URL %s as %s" % (loginUrl,
params['theusername']))
raise exceptions.FailedToLogin(url,params['theusername'])
return False
else:
return True
username))
raise exceptions.FailedToLogin(url,username)
def use_pagecache(self):
'''
@ -148,15 +190,14 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
url = self.url
logger.debug("URL: "+url)
self.needToLogin = False
try:
data = self._fetchUrl(url+":i")
# logger.debug(data)
except HTTPError as e:
if e.code in (404, 410):
raise exceptions.StoryDoesNotExist("Code: %s: %s"%(e.code,self.url))
elif e.code in (401, 403):
self.needToLogin = True
data = ''
data = 'Log In' # to trip needToLoginCheck
else:
raise e
@ -169,7 +210,6 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
if e.code in (404, 410):
raise exceptions.StoryDoesNotExist("Code: %s: %s"%(e.code,self.url))
elif e.code == 401:
self.needToLogin = True
data = ''
else:
raise e
@ -342,8 +382,8 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
self.story.setMetadata("universe", universe_name)
self.story.setMetadata('universeUrl','https://'+self.host+ '/library/universe.php?id=' + universe_id)
break
else:
logger.debug("No universe page")
# else:
# logger.debug("No universe page")
except:
raise
pass
@ -357,9 +397,9 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
universeUrl = 'https://'+self.host+a['href']
# logger.debug("Retrieving Universe - about to get page - universeUrl='{0}".format(universeUrl))
universe_soup = self.make_soup(self._fetchUrl(universeUrl))
logger.debug("Retrieving Universe - have page")
# logger.debug("Retrieving Universe - have page")
if universe_soup:
logger.debug("Retrieving Universe - looking for name")
# logger.debug("Retrieving Universe - looking for name")
universe_name = stripHTML(universe_soup.find('h1', {'id' : 'ptitle'}))
universe_name = re.sub(r' .\s+A Universe from the Mind.*$','',universe_name)
# logger.debug("Universes name: '{0}'".format(universe_name))
@ -368,15 +408,15 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
# logger.debug("Setting universe name: '{0}'".format(universe_name))
self.story.setMetadata('universe',universe_name)
if self.getConfig("universe_as_series") and not self.story.getMetadata('seriesUrl'):
logger.debug("universe_as_series")
# logger.debug("universe_as_series")
# take position in universe page as number in series.
for i, storya in enumerate(universe_soup.find_all('a',href=re.compile(r'^/s/\d+/'))):
if storya['href'].split('/')[2] == self.story.getMetadata('storyId'):
self.setSeries(universe_name, i+1)
self.story.setMetadata('seriesUrl',universeUrl)
break
else:
logger.debug("Do not have a universe")
# else:
# logger.debug("Do not have a universe")
except:
raise
pass
@ -499,7 +539,11 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
chapter_title = None
if self.getConfig('inject_chapter_title'):
chapter_title = pagetag.find('h2').extract()
h2tag = pagetag.find('h2')
if h2tag:
# I'm seeing an h1 now, but it's not logged in?
# Something's broken...
chapter_title = h2tag.extract()
# Strip te header section
tag = pagetag.find('header')

View file

@ -442,6 +442,30 @@ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
soup = self.make_soup(text)
return self.utf8FromSoup(url,soup)
def get_urls_from_page(self,url,normalize):
logger.debug("Fake series test1.com")
'''
This method is to make it easier for adapters to detect a
series URL, pick out the series metadata and list of storyUrls
to return without needing to override get_urls_from_page
entirely.
'''
## easiest way to get all the weird URL possibilities and stay
## up to date with future changes.
return {'name':'The Great Test',
'desc':'<div>The Great Test Series of test1.com!</div>',
'urllist':['http://test1.com?sid=1',
'http://test1.com?sid=2',
'http://test1.com?sid=3',
'http://test1.com?sid=4',
'http://test1.com?sid=5',
'http://test1.com?sid=6',
'http://test1.com?sid=7',
'http://test1.com?sid=8',
'http://test1.com?sid=9',]
}
def getClass():
return TestSiteAdapter

View file

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
# Copyright 2020 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import re
# py2 vs py3 transition
from ..six import text_type as unicode
from .base_xenforo2forum_adapter import BaseXenForo2ForumAdapter
def getClass():
return TheSietchComAdapter
class TheSietchComAdapter(BaseXenForo2ForumAdapter):
def __init__(self, config, url):
BaseXenForo2ForumAdapter.__init__(self, config, url)
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','sietch')
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'www.the-sietch.com'
@classmethod
def getPathPrefix(cls):
# in case it needs more than just site/
return '/index.php?'
def make_reader_url(self,tmcat_num,reader_page_num):
# https://www.the-sietch.com/index.php?threads/shattered-sphere-the-arcadian-free-march.3243/reader/page-2
# discard tmcat_num -- the-sietch.com doesn't have multiple
# threadmark categories yet.
return self.story.getMetadata('storyUrl')+'reader/page-'+unicode(reader_page_num)
# XXX different threadmarks categories

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team
# Copyright 2011 Fanficdownloader team, 2020 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -350,5 +350,34 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
r"https://\1",url)
return url
def before_get_urls_from_page(self,url,normalize):
## Not needed for series pages, but does effect author pages,
## top lists, etc.
if self.getConfig("is_adult"):
self.setSiteMaxRating(url)
    def get_series_from_page(self,url,data,normalize=False):
        '''
        This method is to make it easier for adapters to detect a
        series URL, pick out the series metadata and list of storyUrls
        to return without needing to override get_urls_from_page
        entirely.
        '''
        ## Series pages look like:
        ## https://www.tthfanfic.org/Series-2329
        m = re.match(r"https?://www.tthfanfic.org/Series-(?P<id>\d+)$",url)
        if m:
            soup = self.make_soup(data)
            retval = {}
            # Each story in the series is listed as a storylink anchor
            # inside a storylistitem div; hrefs are site-relative, so
            # prepend scheme+host to make absolute story URLs.
            retval['urllist']=[ 'https://'+self.host+a['href'] for a in soup.select('div.storylistitem a.storylink') ]
            # Series name comes from the page <title>, minus the site's
            # fixed "TtH • Series • " prefix.
            retval['name']=stripHTML(soup.select_one("title"))
            retval['name'] = retval['name'].replace('TtH • Series • ','')
            desc=soup.select_one("div.storybody")
            desc.name='div' # change blockquote to div to match stories.
            retval['desc']=desc
            return retval
        ## return dict with at least {'urllist':['storyUrl','storyUrl',...]}
        ## optionally 'name' and 'desc'?
        return {}
def getClass():
return TwistingTheHellmouthSiteAdapter

View file

@ -103,7 +103,7 @@ class WuxiaWorldCoSiteAdapter(BaseSiteAdapter):
if self.getConfig("dedup_order_chapter_list",False):
# Sort and deduplicate chapters (some stories in incorrect order and/or duplicates)
chapters_data = []
numbers_regex = re.compile('[^0-9\.]') # Everything except decimal and numbers
numbers_regex = re.compile(r'[^0-9\.]') # Everything except decimal and numbers
for ch in chapters:
chapter_title = ch.p.get_text()
chapter_url = ch['href']

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2019 FanFicFare team
# Copyright 2011 Fanficdownloader team, 2020 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -30,7 +30,7 @@ from functools import partial
import traceback
import copy
from bs4 import BeautifulSoup, __version__
from bs4 import BeautifulSoup, Tag
from ..htmlcleanup import stripHTML
@ -392,6 +392,112 @@ class BaseSiteAdapter(Configurable):
"Needs to be overriden in each adapter class."
pass
    def before_get_urls_from_page(self,url,normalize):
        ## some sites need a login or other prep for 'from page' to
        ## work best.  Separate function to keep adapter code minimal.
        ## Default is a no-op; adapters override it when the listing
        ## page needs login/cookies before it can be fetched usefully.
        pass
def get_urls_from_page(self,url,normalize):
from ..geturls import get_urls_from_html
'''
This is a method in adapter now rather than the generic code
that was in geturls.py to allow individual adapters to
recognize and provide special handling if needed for series.
Prompted largely by AO3 authors leaving links to other stories
in story desc that were getting picked up.
'''
## hook for logins, etc.
self.before_get_urls_from_page(url,normalize)
# this way it uses User-Agent or other special settings.
data = self._fetchUrl(url,usecache=True)
series = self.get_series_from_page(url,data,normalize)
if series:
# just to make it easier for adapters.
if isinstance(series.get('desc',None),(BeautifulSoup,Tag)):
series['desc'] = self.utf8FromSoup(url,series['desc'])
# NOTE: series desc imgs are *not* included in ebook.
# Should they be removed?
return series
else:
return {'urllist':get_urls_from_html(self.make_soup(data),
url,
configuration=self.configuration,
normalize=normalize)}
def get_series_from_page(self,url,data,normalize=False):
from ..geturls import get_urls_from_html
'''
This method is to make it easier for adapters to detect a
series URL, pick out the series metadata and list of storyUrls
to return without needing to override get_urls_from_page
entirely.
'''
# return {}
retval = {}
## return dict with at least {'urllist':['storyUrl','storyUrl',...]}
## 'name' and 'desc' are also used if given.
## for eFiction sites:
## http://www.dracoandginny.com/viewseries.php?seriesid=45
# logger.debug("base get_series_from_page:%s"%url)
try:
if re.match(r".*(view)?series\.php\?s(erie)?sid=\d+.*",url): # seriesid or ssid
# logger.debug("Attempting eFiction get_series_from_page")
soup = self.make_soup(data)
retval = {}
nametag = soup.select_one('div#pagetitle') or soup.select_one('div#storytitle')
# logger.debug(nametag)
if nametag:
nametag.find('a').decompose()
retval['name'] = stripHTML(nametag)
# some have [ - ], some have ' by', some have both.
# order matters.
trailing_strip_list=['[ - ]',' by']
for s in trailing_strip_list:
# logger.debug(retval['name'])
if retval['name'].endswith(s):
# remove trailing s
retval['name'] = retval['name'][:-len(s)].strip()
summaryspan = soup.select_one("div#titleblock span.label") or soup.select_one("div#titleblock span.classification")
# logger.debug(summaryspan)
if summaryspan and stripHTML(summaryspan) == "Summary:":
desc = ""
c = summaryspan.nextSibling
# logger.debug(c)
# strings and tags that aren't <span class='label'>
while c and not (isinstance(c,Tag) and c.name == 'span' and ('label' in c['class'] or 'classification' in c['class'])):
# logger.debug(c)
desc += unicode(c)
c = c.nextSibling
# logger.debug(c)
if desc:
# logger.debug(desc)
# strip spaces and trailing <br> tags.
desc = re.sub(r'( *<br/?>)+$','',desc.strip())
# logger.debug(desc)
retval['desc']=desc.strip()
else:
# some(1?) sites
summarydiv = soup.select_one("div.summarytext") or soup.select_one("blockquote2") # fanfictalk.com
summarydiv.name='div' # force name to div.
if summarydiv:
retval['desc']=summarydiv
# trying to get story urls for series from different
# eFictions is a nightmare that the pre-existing
# get_urls_from_html() handles well enough.
# logger.debug(soup)
retval['urllist']=get_urls_from_html(soup,
url,
configuration=self.configuration,
normalize=normalize)
except Exception as e:
logger.debug("get_series_from_page for eFiction failed:%s"%e)
retval = {}
return retval
# Just for series, in case we choose to change how it's stored or represented later.
def setSeries(self,name,num):
if self.getConfig('collect_series'):

View file

@ -63,7 +63,7 @@ class BaseXenForo2ForumAdapter(BaseXenForoForumAdapter):
raise exceptions.FailedToLogin(self.url,"No username given. Set in personal.ini or enter when prompted.")
## need a login token.
data = self._fetchUrl(self.getURLPrefix() + '/login',usecache=False)
data = self._fetchUrl(self.getURLPrefix() + 'login',usecache=False)
# logger.debug(data)
# <input type="hidden" name="_xfToken" value="1556822458,710e5bf6fc87c67ea04ab56a910ac3ff" />
find_token='<input type="hidden" name="_xfToken" value="'
@ -71,10 +71,10 @@ class BaseXenForo2ForumAdapter(BaseXenForoForumAdapter):
xftoken = xftoken[:xftoken.index('"')]
params['remember'] = '1'
params['_xfToken'] = xftoken
params['_xfRedirect'] = self.getURLPrefix() + '/'
params['_xfRedirect'] = self.getURLPrefix()
## https://forum.questionablequesting.com/login/login
loginUrl = self.getURLPrefix() + '/login/login'
loginUrl = self.getURLPrefix() + 'login/login'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['login']))
@ -225,7 +225,7 @@ class BaseXenForo2ForumAdapter(BaseXenForoForumAdapter):
def get_threadmark_range_url(self,tm_item,tmcat_num):
fetcher = tm_item.find('div',{'data-xf-click':'threadmark-fetcher'})
# logger.debug(fetcher)
# logger.debug('data-fetchurl:%s'%fetcher)
return self.getURLPrefix() + fetcher['data-fetchurl']
def get_threadmark_date(self,tm_item):

View file

@ -43,21 +43,18 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
#logger.info("init url: "+url)
BaseSiteAdapter.__init__(self, config, url)
# get storyId from url--url validation guarantees query is only sid=1234
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
# get storyId from url--url validation guarantees query correct
m = re.match(self.getSiteURLPattern(),url)
if m:
#logger.debug("groupdict:%s"%m.groupdict())
if m.group('anchorpost'):
self.story.setMetadata('storyId',m.group('anchorpost'))
self._setURL(self.getURLPrefix() + '/posts/'+m.group('anchorpost')+'/')
self._setURL(self.getURLPrefix() + 'posts/'+m.group('anchorpost')+'/')
else:
self.story.setMetadata('storyId',m.group('id'))
# normalized story URL.
title = m.group('title') or ""
self._setURL(self.getURLPrefix() + '/'+m.group('tp')+'/'+title+self.story.getMetadata('storyId')+'/')
self._setURL(self.getURLPrefix() + m.group('tp')+'/'+title+self.story.getMetadata('storyId')+'/')
else:
raise exceptions.InvalidStoryURL(url,
self.getSiteDomain(),
@ -75,18 +72,23 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
"Only needs to be overriden if has additional ini sections."
return ['base_xenforoforum',cls.getConfigSection()]
    @classmethod
    def getPathPrefix(cls):
        # The site's fixed path prefix appended after the domain; '/' for
        # most XenForo sites.  Overridden (e.g. to '/index.php?') where
        # the forum isn't served from the site root.
        return '/'
@classmethod
def getURLPrefix(cls):
# The site domain. Does have www here, if it uses it.
return 'https://' + cls.getSiteDomain()
return 'https://' + cls.getSiteDomain() + cls.getPathPrefix()
@classmethod
def getSiteExampleURLs(cls):
return cls.getURLPrefix()+"/threads/some-story-name.123456/ "+cls.getURLPrefix()+"/posts/123456/"
return cls.getURLPrefix()+"threads/some-story-name.123456/ "+cls.getURLPrefix()+"posts/123456/"
def getSiteURLPattern(self):
## need to accept http and https still.
return re.escape(self.getURLPrefix()).replace("https","https?")+r"/(?P<tp>threads|posts)/(?P<title>.+\.)?(?P<id>\d+)/?[^#]*?(#?post-(?P<anchorpost>\d+))?$"
return re.escape(self.getURLPrefix()).replace("https","https?")+r"(?P<tp>threads|posts)/(?P<title>.+\.)?(?P<id>\d+)/?[^#]*?(#?post-(?P<anchorpost>\d+))?$"
def _fetchUrlOpened(self, url,
parameters=None,
@ -119,12 +121,12 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
## moved from extract metadata to share with normalize_chapterurl.
if not url.startswith('http'):
url = self.getURLPrefix()+'/'+url
url = self.getURLPrefix()+url
if ( url.startswith(self.getURLPrefix()) or
url.startswith('http://'+self.getSiteDomain()) or
url.startswith('https://'+self.getSiteDomain()) ) and \
( '/posts/' in url or '/threads/' in url or 'showpost.php' in url or 'goto/post' in url):
( self.getPathPrefix()+'posts/' in url or self.getPathPrefix()+'threads/' in url or 'showpost.php' in url or 'goto/post' in url):
## brute force way to deal with SB's http->https change
## when hardcoded http urls. Now assumes all
## base_xenforoforum sites use https--true as of
@ -132,10 +134,10 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
url = url.replace('http://','https://')
# http://forums.spacebattles.com/showpost.php?p=4755532&postcount=9
url = re.sub(r'showpost\.php\?p=([0-9]+)(&postcount=[0-9]+)?',r'/posts/\1/',url)
url = re.sub(r'showpost\.php\?p=([0-9]+)(&postcount=[0-9]+)?',self.getPathPrefix()+r'posts/\1/',url)
# http://forums.spacebattles.com/goto/post?id=15222406#post-15222406
url = re.sub(r'/goto/post\?id=([0-9]+)(#post-[0-9]+)?',r'/posts/\1/',url)
url = re.sub(r'goto/post\?id=([0-9]+)(#post-[0-9]+)?',self.getPathPrefix()+r'posts/\1/',url)
url = re.sub(r'(^[\'"]+|[\'"]+$)','',url) # strip leading or trailing '" from incorrect quoting.
url = re.sub(r'like$','',url) # strip 'like' if incorrect 'like' link instead of proper post URL.
@ -147,24 +149,24 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
## *correct* ones.
# https://forums.sufficientvelocity.com/posts/39915/
if '#post-' in url:
url = self.getURLPrefix()+'/posts/'+url.split('#post-')[1]+'/'
url = self.getURLPrefix()+'posts/'+url.split('#post-')[1]+'/'
## Same as above except for the case where author mistakenly
## used the reply link instead of normal link to post.
# "http://forums.spacebattles.com/threads/manager-worm-story-thread-iv.301602/reply?quote=15962513"
# https://forums.spacebattles.com/posts/
if 'reply?quote=' in url:
url = self.getURLPrefix()+'/posts/'+url.split('reply?quote=')[1]+'/'
url = self.getURLPrefix()+'posts/'+url.split('reply?quote=')[1]+'/'
## normalize named thread urls, too.
# http://forums.sufficientvelocity.com/threads/harry-potter-and-the-not-fatal-at-all-cultural-exchange-program.330/
url = re.sub(r'/threads/.*\.([0-9]+)/',r'/threads/\1/',url)
url = re.sub(re.escape(self.getPathPrefix())+r'threads/.*\.([0-9]+)/',self.getPathPrefix()+r'threads/\1/',url)
is_chapter_url = True
## One person once put a threadmarks URL directly in an
## index post and now we have to exclude it.
if re.match(r".*/threads/[0-9]+/threadmarks",url):
if re.match(r'.*'+re.escape(self.getPathPrefix())+'threads/[0-9]+/threadmarks',url):
is_chapter_url = False
return (is_chapter_url,url)
@ -174,7 +176,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
## storyId, because this is called before story url has been
## parsed.
# logger.debug("pre--url:%s"%url)
url = re.sub(r'/threads/.*\.(?P<id>[0-9]+)/',r'/threads/\g<id>/',url)
url = re.sub(re.escape(self.getPathPrefix())+r'threads/.*\.(?P<id>[0-9]+)/',self.getPathPrefix()+r'threads/\g<id>/',url)
# logger.debug("post-url:%s"%url)
return url
@ -206,10 +208,10 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
params['register'] = '0'
params['cookie_check'] = '1'
params['_xfToken'] = ''
params['redirect'] = self.getURLPrefix() + '/'
params['redirect'] = self.getURLPrefix()
## https://forum.questionablequesting.com/login/login
loginUrl = self.getURLPrefix() + '/login/login'
loginUrl = self.getURLPrefix() + 'login/login'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['login']))
@ -294,7 +296,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
elif threadmarksa['href'].startswith('/'):
href = 'https://'+self.getSiteDomain()+threadmarksa['href']
else:
href = self.getURLPrefix()+'/'+threadmarksa['href']
href = self.getURLPrefix()+threadmarksa['href']
threadmarkgroups[tmcat_name]=self.fetch_threadmarks(href,
tmcat_name,
tmcat_num)
@ -385,7 +387,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
date = self.get_threadmark_date(tm_item)
words,kwords = self.get_threadmark_words(tm_item)
if 'http' not in url:
url = self.getURLPrefix()+"/"+url
url = self.getURLPrefix()+url
# logger.debug("%s. %s"%(tmcat_index,name))
threadmarks.append({"tmcat_name":tmcat_name,
"tmcat_num":tmcat_num,
@ -452,7 +454,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
# use BeautifulSoup HTML parser to make everything easier to find.
topsoup = souptag = self.make_soup(data)
if '#' not in useurl and '/posts/' not in useurl:
if '#' not in useurl and self.getPathPrefix()+'posts/' not in useurl:
self._setURL(useurl) ## for when threadmarked thread name changes.
self.parse_title(topsoup)
@ -602,7 +604,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
def parse_author(self,souptag):
a = souptag.find('h3',{'class':'userText'}).find('a')
self.story.addToList('authorId',a['href'].split('/')[1])
authorUrl = self.getURLPrefix()+'/'+a['href']
authorUrl = self.getURLPrefix()+a['href']
self.story.addToList('authorUrl',authorUrl)
self.story.addToList('author',a.text)
@ -654,7 +656,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
def get_cache_post(self,postid):
## saved using original 'post-99999' id for key.
postid=unicode(postid) # thank you, Py3.
if '/posts/' in postid:
if self.getPathPrefix()+'posts/' in postid:
## allows chapter urls to be passed in directly.
# assumed normalized to /posts/1234/
postid = "post-"+postid.split('/')[-2]
@ -676,7 +678,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
# first post when always_include_first_post.
if ( self.reader and
self.getConfig("use_reader_mode",True) and
'/threads/' not in url and
self.getPathPrefix()+'threads/' not in url and
(index > 0 or not self.getConfig('always_include_first_post')) ):
logger.debug("Using reader mode")
# in case it changes:
@ -718,7 +720,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
# page. looking for it in cache reuses code in
# cache_posts that finds post tags.
souptag = self.get_cache_post(url)
if not souptag and '/threads/' in url: # first post uses /thread/ URL.
if not souptag and self.getPathPrefix()+'threads/' in url: # first post uses /thread/ URL.
souptag = self.get_first_post(topsoup)
# remove <div class="baseHtml noticeContent"> because it can
@ -729,10 +731,10 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
postbody = self.get_post_body(souptag)
# XenForo uses <base href="https://forums.spacebattles.com/" />
return self.utf8FromSoup(self.getURLPrefix()+'/',postbody)
return self.utf8FromSoup(self.getURLPrefix(),postbody)
def make_reader_url(self,tmcat_num,reader_page_num):
return self.getURLPrefix()+'/threads/'+self.story.getMetadata('storyId')+'/'+tmcat_num+'/reader?page='+unicode(reader_page_num)
return self.getURLPrefix()+'threads/'+self.story.getMetadata('storyId')+'/'+tmcat_num+'/reader?page='+unicode(reader_page_num)
def get_quote_expand_tag(self,soup):
return soup.find_all('div',{'class':'quoteExpand'})

View file

@ -40,7 +40,7 @@ else: # > 3.0
def pickle_load(f):
return pickle.load(f,encoding="bytes")
version="3.23.4"
version="3.24.2"
os.environ['CURRENT_VERSION_ID']=version
global_cache = 'global_cache'
@ -125,7 +125,7 @@ def main(argv=None,
help='Exclude list of chapters("zchapters") from metadata stdout output. No effect without --meta-only or --json-meta flags', )
parser.add_option('-j', '--json-meta',
action='store_true', dest='jsonmeta',
help='Output metadata as JSON with download, or with --meta-only flag. (Only JSON will be output with --meta-only flag.)', )
help='Output metadata as JSON with download, or with --meta-only flag. (Only JSON will be output with --meta-only flag.) Also now series name and desc if available with --list', )
parser.add_option('--no-output',
action='store_true', dest='nooutput',
help='Do not download chapters and do not write output file. Intended for testing and with --meta-only.', )
@ -249,25 +249,42 @@ def main(argv=None,
parser.print_help();
return
if options.save_cache:
try:
with open(global_cache,'rb') as jin:
options.pagecache = pickle_load(jin)
options.cookiejar = cl.LWPCookieJar()
options.cookiejar.load(global_cookies)
except Exception as e:
## This is not uncommon, will happen when starting a new
## cache, for example.
print("Didn't load --save-cache %s"%e)
if options.list:
configuration = get_configuration(options.list,
passed_defaultsini,
passed_personalini,options)
retlist = get_urls_from_page(options.list, configuration)
print('\n'.join(retlist))
frompage = get_urls_from_page(options.list, configuration)
if options.jsonmeta:
import json
print(json.dumps(frompage, sort_keys=True,
indent=2, separators=(',', ':')))
else:
retlist = frompage.get('urllist',[])
print('\n'.join(retlist))
if options.normalize:
configuration = get_configuration(options.normalize,
passed_defaultsini,
passed_personalini,options)
retlist = get_urls_from_page(options.normalize, configuration,normalize=True)
retlist = get_urls_from_page(options.normalize, configuration,normalize=True).get('urllist',[])
print('\n'.join(retlist))
if options.downloadlist:
configuration = get_configuration(options.downloadlist,
passed_defaultsini,
passed_personalini,options)
retlist = get_urls_from_page(options.downloadlist, configuration)
retlist = get_urls_from_page(options.downloadlist, configuration).get('urllist',[])
urls.extend(retlist)
if options.imaplist or options.downloadimap:
@ -298,15 +315,6 @@ def main(argv=None,
#print("url: (%s)"%url)
urls.append(url)
if options.save_cache:
try:
with open(global_cache,'rb') as jin:
options.pagecache = pickle_load(jin)
options.cookiejar = cl.LWPCookieJar()
options.cookiejar.load(global_cookies)
except Exception as e:
print("Didn't load --save-cache %s"%e)
if not list_only:
if len(urls) < 1:
print("No valid story URLs found")
@ -323,12 +331,6 @@ def main(argv=None,
raise
print("URL(%s) Failed: Exception (%s). Run URL individually for more detail."%(url,e))
# Saved in configurable.py now.
# if options.save_cache:
# with open('global_cache','wb') as jout:
# pickle.dump(options.pagecache,jout,protocol=2)
# options.cookiejar.save('global_cookies')
# make rest a function and loop on it.
def do_download(arg,
options,
@ -374,19 +376,6 @@ def do_download(arg,
adapter = adapters.getAdapter(configuration, url)
## Share pagecache and cookiejar between multiple downloads.
if not hasattr(options,'pagecache'):
options.pagecache = configuration.get_empty_pagecache()
if not hasattr(options,'cookiejar'):
options.cookiejar = configuration.get_empty_cookiejar()
if options.save_cache:
save_cache = global_cache
save_cookies = global_cookies
else:
save_cache = save_cookies = None
configuration.set_pagecache(options.pagecache,save_cache)
configuration.set_cookiejar(options.cookiejar,save_cookies)
# url[begin-end] overrides CLI option if present.
if ch_begin or ch_end:
adapter.setChaptersRange(ch_begin, ch_end)
@ -621,6 +610,19 @@ def get_configuration(url,
if options.progressbar:
configuration.set('overrides','progressbar','true')
## Share pagecache and cookiejar between multiple downloads.
if not hasattr(options,'pagecache'):
options.pagecache = configuration.get_empty_pagecache()
if not hasattr(options,'cookiejar'):
options.cookiejar = configuration.get_empty_cookiejar()
if options.save_cache:
save_cache = global_cache
save_cookies = global_cookies
else:
save_cache = save_cookies = None
configuration.set_pagecache(options.pagecache,save_cache)
configuration.set_cookiejar(options.cookiejar,save_cookies)
return configuration
if __name__ == '__main__':

View file

@ -215,7 +215,7 @@ def get_valid_set_options():
'fix_fimf_blockquotes':(['fimfiction.net'],None,boollist),
'fail_on_password':(['fimfiction.net'],None,boollist),
'keep_prequel_in_description':(['fimfiction.net'],None,boollist),
'include_author_notes':(['fimfiction.net'],None,boollist),
'include_author_notes':(['fimfiction.net','royalroad.com'],None,boollist),
'do_update_hook':(['fimfiction.net',
'archiveofourown.org'],None,boollist),
'always_login':(['archiveofourown.org']+base_xenforo_list,None,boollist),

View file

@ -588,6 +588,14 @@ storynotes_label:Story Notes
add_to_extra_titlepage_entries:,storynotes
[base_xenforoforum]
## Some sites require login for some stories
#username:YourName
#password:yourpassword
## XenForo sites require login for some stories, but don't report that
## to FFF. To download those, set your username and password above and
## set always_login:true
#always_login:false
## We've been requested by the site(s) admin to rein in hits. If you
## download fewer stories less often you can likely get by with
@ -1244,65 +1252,6 @@ extra_titlepage_entries:eroticatags,disclaimer
#username:YourName
#password:yourpassword
[archive.hpfanfictalk.com]
## Some sites also require the user to confirm they are adult for
## adult content. In commandline version, this should go in your
## personal.ini, not defaults.ini.
#is_adult:true
add_to_extra_valid_entries:,themes,inclusivity,house,
series00,series00Url,series00HTML,
series01,series01Url,series01HTML,
series02,series02Url,series02HTML,
series03,series03Url,series03HTML,
series04,series04Url,series04HTML,
series05,series05Url,series05HTML,
series06,series06Url,series06HTML,
series07,series07Url,series07HTML,
series08,series08Url,series08HTML,
series09,series09Url,series09HTML,
## Assume entryUrl, apply to "<a class='%slink' href='%s'>%s</a>" to
## make entryHTML.
make_linkhtml_entries:series00,series01,series02,series03,series04,
series05,series06,series07,series08,series09
themes_label:Themes
inclusivity_label:Inclusivity
house_label:HPFT Forum House
## series00 will be the same as common metadata series.
series00HTML_label:Series
series01HTML_label:Additional Series
series02HTML_label:Additional Series
series03HTML_label:Additional Series
series04HTML_label:Additional Series
series05HTML_label:Additional Series
series06HTML_label:Additional Series
series07HTML_label:Additional Series
series08HTML_label:Additional Series
series09HTML_label:Additional Series
## Try to collect series names and numbers of this story in those
## series. This lets us turn it on and off by site without keeping a
## lengthy titlepage_entries per site and prevents it updating in the
## plugin.
collect_series: true
add_to_extra_titlepage_entries:,series01HTML,series02HTML,series03HTML,
series04HTML,series05HTML,series06HTML,series07HTML,series08HTML,series09HTML
## archive.hpfanfictalk.com takes margins away, even from p tags, by
## default. So authors have to either include extra br/p tags or
## their own styles. These allow for both, but leave you at the mercy
## of author CSS.
add_to_output_css:
* {
margin: 0;
padding: 0;
}
add_to_keep_html_attrs:,style
[archive.shriftweb.org]
website_encodings:Windows-1252,utf8,iso-8859-1
@ -1709,6 +1658,72 @@ make_linkhtml_entries:translators,betas
## can change it.
include_in_category:fandoms
[fanfictalk.com]
## Some sites also require the user to confirm they are adult for
## adult content. In commandline version, this should go in your
## personal.ini, not defaults.ini.
#is_adult:true
add_to_extra_valid_entries:,tropes,themes,representation,inclusivity,
house,storytype,contentwarnings,
series00,series00Url,series00HTML,
series01,series01Url,series01HTML,
series02,series02Url,series02HTML,
series03,series03Url,series03HTML,
series04,series04Url,series04HTML,
series05,series05Url,series05HTML,
series06,series06Url,series06HTML,
series07,series07Url,series07HTML,
series08,series08Url,series08HTML,
series09,series09Url,series09HTML,
# fields changed name with domain name change.
include_in_inclusivity:representation
include_in_themes:tropes
## Assume entryUrl, apply to "<a class='%slink' href='%s'>%s</a>" to
## make entryHTML.
make_linkhtml_entries:series00,series01,series02,series03,series04,
series05,series06,series07,series08,series09
tropes_label:Tropes
representation_label:Representation
house_label:HPFT Forum House
storytype_label:Story Type
contentwarnings_label:Content Warnings
## series00 will be the same as common metadata series.
series00HTML_label:Series
series01HTML_label:Additional Series
series02HTML_label:Additional Series
series03HTML_label:Additional Series
series04HTML_label:Additional Series
series05HTML_label:Additional Series
series06HTML_label:Additional Series
series07HTML_label:Additional Series
series08HTML_label:Additional Series
series09HTML_label:Additional Series
## Try to collect series names and numbers of this story in those
## series. This lets us turn it on and off by site without keeping a
## lengthy titlepage_entries per site and prevents it updating in the
## plugin.
collect_series: true
#add_to_extra_titlepage_entries:,tropes,themes,representation,inclusivity,house,storytype,contentwarnings,series01HTML,series02HTML,series03HTML,
# series04HTML,series05HTML,series06HTML,series07HTML,series08HTML,series09HTML
## fanfictalk.com takes margins away, even from p tags, by default.
## So authors have to either include extra br/p tags or their own
## styles. These allow for both, but leave you at the mercy of author
## CSS.
add_to_output_css:
* {
margin: 0;
padding: 0;
}
add_to_keep_html_attrs:,style
[fanfiction-junkies.de]
website_encodings:Windows-1252,utf8
@ -3035,6 +3050,17 @@ sitetags_label:Site tags
## Attempt to fix p and br excess from HTML in great many stories
fix_excess_space:false
[www.novelupdates.cc]
## Note that novelupdates.cc != novelupdates.com
## There is reason to believe that novelupdates.cc may be a
## replacement for wuxiaworld.co, but currently both exist with
## different data.
## When dedup_order_chapter_list:true, use a heuristic algorithm
## specific to novelupdates.cc order and dedup chapters.
dedup_order_chapter_list:false
[www.phoenixsong.net]
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
@ -3065,7 +3091,7 @@ sitetags_label:Site Tags
## Site dedicated to these categories/characters/ships
extracategories:Harry Potter
add_to_include_subject_tags:,takesplaces,snapeflavours,sitetags
#add_to_include_subject_tags:,takesplaces,snapeflavours,sitetags
#add_to_extra_titlepage_entries:,stars,reviews,reads,takesplaces,snapeflavours,sitetags
website_encodings:Windows-1252,utf8
@ -3112,6 +3138,11 @@ extratags:
## add_to_output_css example for [base_xenforoforum:epub].
#legend_spoilers:true
## royalroad.com chapters can have author notes attached to them.
## Setting include_author_notes:true will include them with the
## chapter text.
#include_author_notes:true
[www.scarvesandcoffee.net]
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,
@ -3130,6 +3161,12 @@ views_label:Views
averageWords_label:Average Words (Chapter)
add_to_titlepage_entries:,views, averageWords
## Scribble Hub chapters can include author's notes and news blocks. We've
## traditionally included them all in the chapter text, but this allows
## you to customize which you include. Copy this parameter to your
## personal.ini and list the ones you don't want.
#exclude_notes:authornotes,newsboxes
[www.siye.co.uk]
## Site dedicated to these categories/characters/ships
extracategories:Harry Potter
@ -3170,6 +3207,12 @@ extracategories:Lord of the Rings
website_encodings:Windows-1252,utf8
[www.the-sietch.com]
## see [base_xenforoforum]
## the-sietch.com shows more posts per reader page than other XF sites.
reader_posts_per_page:15
[www.thedelphicexpanse.com]
## Site dedicated to these categories/characters/ships
extracategories:Star Trek: Enterprise
@ -3352,12 +3395,6 @@ website_encodings:Windows-1252,utf8
## specific to wuxiaworld.co order and dedup chapters.
dedup_order_chapter_list:false
[www.novelupdates.cc]
## Note that novelupdates.cc != novelupdates.com
## When dedup_order_chapter_list:true, use a heuristic algorithm
## specific to novelupdates.cc order and dedup chapters.
dedup_order_chapter_list:false
[www.wuxiaworld.com]
user_agent:Mozilla/5.0
## Authors on wuxiaworld.com create their own index pages, so it's not

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015 Fanficdownloader team, 2018 FanFicFare team
# Copyright 2015 Fanficdownloader team, 2020 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -31,7 +31,7 @@ from .six import ensure_str
import logging
logger = logging.getLogger(__name__)
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from .gziphttp import GZipProcessor
from . import adapters
@ -39,87 +39,51 @@ from .configurable import Configuration
from .exceptions import UnknownSite, FetchEmailFailed
def get_urls_from_page(url,configuration=None,normalize=False):
if not configuration:
configuration = Configuration(["test1.com"],"EPUB",lightweight=True)
data = None
adapter = None
try:
adapter = adapters.getAdapter(configuration,url,anyurl=True)
# special stuff to log into archiveofourown.org, if possible.
# Unlike most that show the links to 'adult' stories, but protect
# them, AO3 doesn't even show them if not logged in. Only works
# with saved user/pass--not going to prompt for list.
if 'archiveofourown.org' in url:
if adapter.getConfig("username"):
if adapter.getConfig("is_adult"):
if '?' in url:
addurl = "&view_adult=true"
else:
addurl = "?view_adult=true"
else:
addurl=""
# just to get an authenticity_token.
data = adapter._fetchUrl(url+addurl)
# login the session.
adapter.performLogin(url,data)
# get the list page with logged in session.
if 'fimfiction.net' in url and adapter.getConfig("is_adult"):
data = adapter._fetchUrl(url)
adapter.set_adult_cookie()
if 'tthfanfic.org' in url and adapter.getConfig("is_adult"):
## Simple fetch works in testing, but actual pages use a
## POST and has a 'ctkn' value, so we do too.
# adapter._fetchUrl("https://www.tthfanfic.org/setmaxrating.php?sitemaxrating=5")
adapter.setSiteMaxRating(url)
# this way it uses User-Agent or other special settings.
data = adapter._fetchUrl(url,usecache=False)
return adapter.get_urls_from_page(url,normalize)
except UnknownSite:
# no adapter with anyurl=True, must be a random site.
opener = build_opener(HTTPCookieProcessor(),GZipProcessor())
data = opener.open(url).read()
return {'urllist':get_urls_from_html(data,url,configuration,normalize)}
return {}
# kludge because I don't see it on enough sites to be worth generalizing yet.
restrictsearch=None
if 'scarvesandcoffee.net' in url:
restrictsearch=('div',{'id':'mainpage'})
return get_urls_from_html(data,url,configuration,normalize,restrictsearch)
def get_urls_from_html(data,url=None,configuration=None,normalize=False,restrictsearch=None,email=False):
def get_urls_from_html(data,url=None,configuration=None,normalize=False,email=False):
logger.debug("get_urls_from_html")
urls = collections.OrderedDict()
if not configuration:
configuration = Configuration(["test1.com"],"EPUB",lightweight=True)
## soup and re-soup because BS4/html5lib is more forgiving of
## incorrectly nested tags that way.
soup = BeautifulSoup(unicode(BeautifulSoup(data,"html5lib")),"html5lib")
if restrictsearch:
soup = soup.find(*restrictsearch)
#logger.debug("restrict search:%s"%soup)
if isinstance(data,(BeautifulSoup,Tag)):
logger.debug("Using pre-made soup")
soup = data
else:
## soup and re-soup because BS4/html5lib is more forgiving of
## incorrectly nested tags that way.
logger.debug("dbl souping")
soup = BeautifulSoup(unicode(BeautifulSoup(data,"html5lib")),"html5lib")
for a in soup.findAll('a'):
if a.has_attr('href'):
#logger.debug("a['href']:%s"%a['href'])
# logger.debug("a['href']:%s"%a['href'])
href = form_url(url,a['href'])
#logger.debug("1 urlhref:%s"%href)
# logger.debug("1 urlhref:%s"%href)
href = cleanup_url(href,email)
try:
#logger.debug("2 urlhref:%s"%href)
# logger.debug("2 urlhref:%s"%href)
adapter = adapters.getAdapter(configuration,href)
#logger.debug("found adapter")
# logger.debug("found adapter")
if adapter.story.getMetadata('storyUrl') not in urls:
urls[adapter.story.getMetadata('storyUrl')] = [href]
else:
urls[adapter.story.getMetadata('storyUrl')].append(href)
# logger.debug("adapter storyUrl:%s"%adapter.story.getMetadata('storyUrl'))
except Exception as e:
#logger.debug e
# logger.debug(e)
pass
# Simply return the longest URL with the assumption that it contains the

View file

@ -1284,7 +1284,7 @@ class Story(Configurable):
if cover and cover_big_enough:
if len(self.imgtuples) > 0 and 'cover' in self.imgtuples[0]['newsrc']:
# remove existing cover, if there is one.
self.imgsizes[len(self.imgtuples[0]['data'])].remove(0)
# could have only come from first image and is assumed index 0.
del self.imgurls[0]
del self.imgtuples[0]
self.imgurls.insert(0,imgurl)
@ -1292,7 +1292,10 @@ class Story(Configurable):
self.cover=newsrc
self.setMetadata('cover_image','specific')
self.imgtuples.insert(0,{'newsrc':newsrc,'mime':mime,'data':data})
self.imgsizes[len(data)].append(0)
## *Don't* include cover in imgsizes because it can be
## replaced by Calibre etc. So don't re-use it.
## Also saves removing it above.
# self.imgsizes[len(data)].append(0)
else:
if self.getConfig('dedup_img_files',False):
same_sz_imgs = self.imgsizes[len(data)]
@ -1319,6 +1322,9 @@ class Story(Configurable):
self.setMetadata('cover_image','first')
self.imgtuples.append({'newsrc':newsrc,'mime':mime,'data':data})
self.imgurls.append(imgurl)
## *Don't* include cover in imgsizes because it can be
## replaced by Calibre etc. So don't re-use it.
# self.imgsizes[len(data)].append(len(self.imgtuples)-1)
newsrc = "images/%s-%s.%s"%(
prefix,

View file

@ -27,7 +27,7 @@ setup(
name=package_name,
# Versions should comply with PEP440.
version="3.23.4",
version="3.24.2",
description='A tool for downloading fanfiction to eBook formats',
long_description=long_description,