class CollectURLDialog(QDialog):
    '''
    Small modal dialog asking the user for one URL.

    After exec_() returns, self.status is True when OK was clicked and
    False otherwise; the text entered is available from self.url.text().
    '''
    def __init__(self, gui, title):
        QDialog.__init__(self, gui)
        self.gui = gui
        # Stays False unless the OK button is clicked.
        self.status = False

        grid = QGridLayout()
        self.l = grid
        self.setLayout(grid)

        # The title is shown both in the window frame and as a heading
        # spanning both columns of the first row.
        self.setWindowTitle(title)
        grid.addWidget(QLabel(title), 0, 0, 1, 2)

        # Second row: label plus the line edit callers read the URL from.
        grid.addWidget(QLabel("URL:"), 1, 0)
        self.url = QLineEdit(self)
        grid.addWidget(self.url, 1, 1)

        # Third row: OK on the left, Cancel on the right.
        self.ok_button = QPushButton('OK', self)
        self.ok_button.clicked.connect(self.ok)
        grid.addWidget(self.ok_button, 2, 0)

        self.cancel_button = QPushButton('Cancel', self)
        self.cancel_button.clicked.connect(self.cancel)
        grid.addWidget(self.cancel_button, 2, 1)

        self.resize(self.sizeHint())

    def _finish(self, accepted):
        '''Record the outcome in self.status and hide the dialog.'''
        self.status = accepted
        self.hide()

    def ok(self):
        '''Slot for the OK button.'''
        self._finish(True)

    def cancel(self):
        '''Slot for the Cancel button.'''
        self._finish(False)
def get_urls_from_page(self):
    '''
    Menu action: ask for a web page URL and show the story URLs on it.

    Opens a CollectURLDialog; if the user does not confirm it, nothing
    else happens.  Otherwise the module-level get_urls_from_page() is
    called with the entered text and the resulting list is shown in a
    ViewLog window, or an info dialog is shown when the list is empty.
    '''
    url_dialog = CollectURLDialog(self.gui,"Get Story URLs from Web Page")
    url_dialog.exec_()
    if not url_dialog.status:
        return

    page_url = url_dialog.url.text()
    print("URL:%s"%page_url)

    # Inside the method body this bare name is the imported module-level
    # function, not this method.  "%s"% converts the widget's text value
    # to a plain string before it is handed over.
    url_list = get_urls_from_page("%s"%page_url)

    if not url_list:
        info_dialog(self.gui, _('List of URLs'),
                    _('No Valid URLs found on given page.'),
                    show=True,
                    show_copy_button=False)
        return

    log_dialog = ViewLog(_("List of URLs"),"\n".join(url_list),parent=self.gui)
    log_dialog.setWindowIcon(get_icon('bookmarks.png'))
    log_dialog.exec_()
for ib in identicalbooks: @@ -937,7 +963,7 @@ make_firstimage_cover:true return None def _make_mi_from_book(self,book): - mi = MetaInformation(book['title'],(book['author'],)) # author is a list. + mi = MetaInformation(book['title'],[book['author']]) # author is a list. mi.set_identifiers({'url':book['url']}) mi.publisher = book['publisher'] mi.tags = book['tags'] diff --git a/downloader.py b/downloader.py index 8f405e88..abc3781d 100644 --- a/downloader.py +++ b/downloader.py @@ -17,7 +17,7 @@ import logging ## XXX cli option for logging level. -logging.basicConfig(level=logging.DEBUG,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s") +logging.basicConfig(level=logging.INFO,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s") import sys, os from os.path import normpath, expanduser, isfile, join @@ -30,6 +30,7 @@ from subprocess import call from fanficdownloader import adapters,writers,exceptions from fanficdownloader.epubutils import get_dcsource_chaptercount, get_update_data +from fanficdownloader.geturls import get_urls_from_page if sys.version_info < (2, 5): print "This program requires Python 2.5 or newer." @@ -70,6 +71,9 @@ def main(): parser.add_option("--force", action="store_true", dest="force", help="Force overwrite or update of an existing epub, download and overwrite all chapters.",) + parser.add_option("-l", "--list", + action="store_true", dest="list", + help="Get list of valid story URLs from page given.",) (options, args) = parser.parse_args() @@ -116,6 +120,12 @@ def main(): (var,val) = opt.split('=') config.set("overrides",var,val) + if options.list: + retlist = get_urls_from_page(args[0]) + print "\n".join(retlist) + + return + try: ## Attempt to update an existing epub. 
def get_urls_from_page(url):
    '''
    Download the page at url and return the story URLs it links to.

    Every <a href=...> on the page is resolved against url and offered to
    adapters.getAdapter(); links for which an adapter is obtained are
    kept.  Links are de-duplicated on the adapter's 'storyUrl' metadata,
    so several links that normalize to the same story yield one entry.

    Returns a list of the resolved hrefs (first occurrence of each
    story), in page order.  Errors opening or reading url itself
    propagate to the caller.
    '''
    opener = u2.build_opener(u2.HTTPCookieProcessor(),GZipProcessor())
    response = opener.open(url)
    try:
        soup = BeautifulSoup(response.read())
    finally:
        # Release the connection now instead of waiting for garbage
        # collection.
        response.close()

    normalized = set() # normalized story urls already seen.
    retlist = [] # resolved urls as found on the page.
    config = ConfigParser.SafeConfigParser()

    for a in soup.findAll('a'):
        if not a.has_key('href'):
            continue
        try:
            href = form_url(url,a['href'])
            adapter = adapters.getAdapter(config,href,"EPUB")
            storyurl = adapter.story.getMetadata('storyUrl')
        except Exception:
            # Best effort: presumably getAdapter raises for links that
            # no adapter recognizes -- skip those.  Exception (not a
            # bare except) so Ctrl-C / SystemExit still get through.
            continue
        if storyurl not in normalized:
            normalized.add(storyurl)
            retlist.append(href)

    return retlist

def form_url(parenturl,url):
    '''
    Resolve url, as found on the page parenturl, to an absolute URL.

    Surrounding whitespace is stripped first (browsers tolerate it, so
    we do too).  If parenturl is None the stripped url is returned
    unchanged.  Otherwise the standard urljoin rules apply, which covers
    absolute URLs, protocol-relative '//host/path', site-absolute
    '/path', and relative paths including '../', and also works when
    parenturl has no path component at all.
    '''
    url = url.strip() # ran across an image with a space in the
                      # src. Browser handled it, so we'd better, too.

    if parenturl is None:
        return url
    return urlparse.urljoin(parenturl,url)