Add 'Get Story URLs from Web Page' feature to plugin and CLI(-l).

Jim Miller 2012-06-14 18:41:26 -05:00
parent 37a9446162
commit 321daf7de9
4 changed files with 156 additions and 7 deletions


@@ -177,6 +177,43 @@ class FakeLineEdit():
     def text(self):
         pass
 
+class CollectURLDialog(QDialog):
+    '''
+    Collect single url for get urls.
+    '''
+    def __init__(self, gui, title):
+        QDialog.__init__(self, gui)
+        self.gui = gui
+        self.status=False
+
+        self.l = QGridLayout()
+        self.setLayout(self.l)
+
+        self.setWindowTitle(title)
+
+        self.l.addWidget(QLabel(title),0,0,1,2)
+
+        self.l.addWidget(QLabel("URL:"),1,0)
+        self.url = QLineEdit(self)
+        self.l.addWidget(self.url,1,1)
+
+        self.ok_button = QPushButton('OK', self)
+        self.ok_button.clicked.connect(self.ok)
+        self.l.addWidget(self.ok_button,2,0)
+
+        self.cancel_button = QPushButton('Cancel', self)
+        self.cancel_button.clicked.connect(self.cancel)
+        self.l.addWidget(self.cancel_button,2,1)
+
+        self.resize(self.sizeHint())
+
+    def ok(self):
+        self.status=True
+        self.hide()
+
+    def cancel(self):
+        self.status=False
+        self.hide()
+
 class UserPassDialog(QDialog):
     '''
     Need to collect User/Pass for some sites.

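For orientation, a minimal sketch of how this dialog is driven from calling code, using the same exec_()/status pattern the plugin file below uses; the gui variable stands in for the calibre GUI object, which this sketch assumes:

    d = CollectURLDialog(gui, "Get Story URLs from Web Page")
    d.exec_()              # modal; returns once ok() or cancel() hides the dialog
    if d.status:           # True only if the OK button was clicked
        page_url = "%s" % d.url.text()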

@@ -37,11 +37,12 @@ from calibre_plugins.fanfictiondownloader_plugin.common_utils import (set_plugin
 from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, writers, exceptions
 from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.htmlcleanup import stripHTML
 from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.epubutils import get_dcsource, get_dcsource_chaptercount, get_story_url_from_html
+from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.geturls import get_urls_from_page
 
 from calibre_plugins.fanfictiondownloader_plugin.config import (prefs, permitted_values)
 from calibre_plugins.fanfictiondownloader_plugin.dialogs import (
     AddNewDialog, UpdateExistingDialog, display_story_list, DisplayStoryListDialog,
-    LoopProgressDialog, UserPassDialog, AboutDialog,
+    LoopProgressDialog, UserPassDialog, AboutDialog, CollectURLDialog,
     OVERWRITE, OVERWRITEALWAYS, UPDATE, UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY,
     NotGoingToDownload )
@@ -192,6 +193,11 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
                                                         shortcut_name='Get URLs from Selected Books',
                                                         triggered=self.get_list_urls)
 
+        self.get_list_action = self.create_menu_item_ex(self.menu, 'Get Story URLs from Web Page', image='view.png',
+                                                        unique_name='Get Story URLs from Web Page',
+                                                        shortcut_name='Get Story URLs from Web Page',
+                                                        triggered=self.get_urls_from_page)
+
         self.menu.addSeparator()
         self.config_action = create_menu_action_unique(self, self.menu, '&Configure Plugin', shortcut=False,
                                                        image= 'config.png',
@@ -246,6 +252,26 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
                (prefs['addtolists'] or prefs['addtoreadlists']) :
             self._update_reading_lists(self.gui.library_view.get_selected_ids(),add)
 
+    def get_urls_from_page(self):
+        d = CollectURLDialog(self.gui,"Get Story URLs from Web Page")
+        d.exec_()
+        if not d.status:
+            return
+
+        print("URL:%s"%d.url.text())
+
+        url_list = get_urls_from_page("%s"%d.url.text())
+
+        if url_list:
+            d = ViewLog(_("List of URLs"),"\n".join(url_list),parent=self.gui)
+            d.setWindowIcon(get_icon('bookmarks.png'))
+            d.exec_()
+        else:
+            info_dialog(self.gui, _('List of URLs'),
+                        _('No Valid URLs found on given page.'),
+                        show=True,
+                        show_copy_button=False)
+
     def get_list_urls(self):
         if len(self.gui.library_view.get_selected_ids()) > 0:
             book_list = map( partial(self._convert_id_to_book, good=False), self.gui.library_view.get_selected_ids() )
@@ -498,7 +524,7 @@ make_firstimage_cover:true
 
         # find dups
         mi = MetaInformation(story.getMetadata("title", removeallentities=True),
-                             (story.getMetadata("author", removeallentities=True),)) # author is a list.
+                             [story.getMetadata("author", removeallentities=True)]) # author is a list.
         identicalbooks = db.find_identical_books(mi)
         ## removed for being overkill.
         # for ib in identicalbooks:
@@ -784,7 +810,7 @@ make_firstimage_cover:true
             if epubmi.cover_data[1] is not None:
                 db.set_cover(book_id, epubmi.cover_data[1])
 
-        # set author link if found. All current adapters have authorUrl.
+        # set author link if found. All current adapters have authorUrl, except anonymous on AO3.
         if 'authorUrl' in book['all_metadata']:
             autid=db.get_author_id(book['author'])
             db.set_link_field_for_author(autid, unicode(book['all_metadata']['authorUrl']),
@@ -926,7 +952,7 @@ make_firstimage_cover:true
             confirm(message,'fanfictiondownloader_no_reading_list_%s'%l, self.gui)
 
     def _find_existing_book_id(self,db,book,matchurl=True):
-        mi = MetaInformation(book["title"],(book["author"],)) # author is a list.
+        mi = MetaInformation(book["title"],[book["author"]]) # author is a list.
         identicalbooks = db.find_identical_books(mi)
         if matchurl: # only *really* identical if URL matches, too.
             for ib in identicalbooks:
@@ -937,7 +963,7 @@ make_firstimage_cover:true
         return None
 
     def _make_mi_from_book(self,book):
-        mi = MetaInformation(book['title'],(book['author'],)) # author is a list.
+        mi = MetaInformation(book['title'],[book['author']]) # author is a list.
         mi.set_identifiers({'url':book['url']})
         mi.publisher = book['publisher']
         mi.tags = book['tags']

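A note on the (author,) → [author] changes in the hunks above: calibre's MetaInformation takes its authors argument as a list of name strings, and db.find_identical_books works against that list, so passing a real list rather than a one-element tuple keeps the metadata object consistent with what the rest of calibre expects. A minimal illustration; the import path is the standard calibre one, but treat the snippet as a sketch:

    from calibre.ebooks.metadata import MetaInformation
    mi = MetaInformation("Some Title", ["Some Author"])  # authors passed as a list, matching the fix above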

@@ -17,7 +17,7 @@
 
 import logging
 ## XXX cli option for logging level.
-logging.basicConfig(level=logging.DEBUG,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
+logging.basicConfig(level=logging.INFO,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
 
 import sys, os
 from os.path import normpath, expanduser, isfile, join
@@ -30,6 +30,7 @@ from subprocess import call
 
 from fanficdownloader import adapters,writers,exceptions
 from fanficdownloader.epubutils import get_dcsource_chaptercount, get_update_data
+from fanficdownloader.geturls import get_urls_from_page
 
 if sys.version_info < (2, 5):
     print "This program requires Python 2.5 or newer."
@@ -70,6 +71,9 @@ def main():
     parser.add_option("--force",
                       action="store_true", dest="force",
                       help="Force overwrite or update of an existing epub, download and overwrite all chapters.",)
+    parser.add_option("-l", "--list",
+                      action="store_true", dest="list",
+                      help="Get list of valid story URLs from page given.",)
 
     (options, args) = parser.parse_args()
@@ -116,6 +120,12 @@ def main():
             (var,val) = opt.split('=')
             config.set("overrides",var,val)
 
+    if options.list:
+        retlist = get_urls_from_page(args[0])
+        print "\n".join(retlist)
+        return
+
     try:
         ## Attempt to update an existing epub.
         if options.update:
@@ -202,7 +212,7 @@ def main():
         print dne
     except exceptions.UnknownSite, us:
         print us
 
 if __name__ == "__main__":
     #import time
     #start = time.time()

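Taken together with the -l/--list option above, the CLI can now be pointed at a listing page instead of a single story URL. A hypothetical invocation, assuming the CLI entry script is downloader.py and using a made-up author-page URL:

    python downloader.py -l http://www.fanfiction.net/u/1234567/SomeAuthor

This prints one valid story URL per line, suitable for feeding back into the downloader or another script.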

@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2012 Fanficdownloader team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import urlparse
+import urllib2 as u2
+import ConfigParser
+
+from BeautifulSoup import BeautifulSoup
+
+from gziphttp import GZipProcessor
+
+import adapters
+
+def get_urls_from_page(url):
+    opener = u2.build_opener(u2.HTTPCookieProcessor(),GZipProcessor())
+    soup = BeautifulSoup(opener.open(url).read())
+
+    normalized = set() # normalized url
+    retlist = []       # orig urls.
+    config = ConfigParser.SafeConfigParser()
+
+    for a in soup.findAll('a'):
+        if a.has_key('href'):
+            href = form_url(url,a['href'])
+            try:
+                adapter = adapters.getAdapter(config,href,"EPUB")
+                if adapter.story.getMetadata('storyUrl') not in normalized:
+                    normalized.add(adapter.story.getMetadata('storyUrl'))
+                    retlist.append(href)
+            except:
+                pass
+
+    return retlist
+
+def form_url(parenturl,url):
+    url = url.strip() # ran across an image with a space in the
+                      # src. Browser handled it, so we'd better, too.
+
+    if "//" in url or parenturl == None:
+        returl = url
+    else:
+        parsedUrl = urlparse.urlparse(parenturl)
+        if url.startswith("/") :
+            returl = urlparse.urlunparse(
+                (parsedUrl.scheme,
+                 parsedUrl.netloc,
+                 url,
+                 '','',''))
+        else:
+            toppath=""
+            if parsedUrl.path.endswith("/"):
+                toppath = parsedUrl.path
+            else:
+                toppath = parsedUrl.path[:parsedUrl.path.rindex('/')]
+            returl = urlparse.urlunparse(
+                (parsedUrl.scheme,
+                 parsedUrl.netloc,
+                 toppath + '/' + url,
+                 '','',''))
+    return returl
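Since geturls.py is a new module, a few illustrative calls to form_url may help; the expected results are inferred from the code above rather than from test output:

    # Absolute href (contains "//"): returned unchanged.
    form_url("http://site.com/dir/page.html", "http://other.com/x")
    #   -> "http://other.com/x"

    # Root-relative href: keeps the parent URL's scheme and host.
    form_url("http://site.com/dir/page.html", "/story.php?id=1")
    #   -> "http://site.com/story.php?id=1"

    # Relative href: resolved against the parent URL's directory.
    form_url("http://site.com/dir/page.html", "story.php?id=1")
    #   -> "http://site.com/dir/story.php?id=1"

get_urls_from_page() then runs every <a href> on the page through adapters.getAdapter(), keeping only hrefs that some adapter accepts and de-duplicating on the adapter's normalized storyUrl.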