Add 'Get Story URLs from Web Page' feature to plugin and CLI(-l).

Jim Miller 2012-06-14 18:41:26 -05:00
parent 37a9446162
commit 321daf7de9
4 changed files with 156 additions and 7 deletions


@@ -177,6 +177,43 @@ class FakeLineEdit():
     def text(self):
         pass
 
+class CollectURLDialog(QDialog):
+    '''
+    Collect single url for get urls.
+    '''
+    def __init__(self, gui, title):
+        QDialog.__init__(self, gui)
+        self.gui = gui
+        self.status=False
+
+        self.l = QGridLayout()
+        self.setLayout(self.l)
+
+        self.setWindowTitle(title)
+
+        self.l.addWidget(QLabel(title),0,0,1,2)
+
+        self.l.addWidget(QLabel("URL:"),1,0)
+        self.url = QLineEdit(self)
+        self.l.addWidget(self.url,1,1)
+
+        self.ok_button = QPushButton('OK', self)
+        self.ok_button.clicked.connect(self.ok)
+        self.l.addWidget(self.ok_button,2,0)
+
+        self.cancel_button = QPushButton('Cancel', self)
+        self.cancel_button.clicked.connect(self.cancel)
+        self.l.addWidget(self.cancel_button,2,1)
+
+        self.resize(self.sizeHint())
+
+    def ok(self):
+        self.status=True
+        self.hide()
+
+    def cancel(self):
+        self.status=False
+        self.hide()
+
 class UserPassDialog(QDialog):
     '''
     Need to collect User/Pass for some sites.

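For orientation, a minimal sketch of how this dialog is driven from calling code, using the same exec_()/status pattern the plugin file below uses; the gui variable stands in for the calibre GUI object, which this sketch assumes:

    d = CollectURLDialog(gui, "Get Story URLs from Web Page")
    d.exec_()              # modal; returns once ok() or cancel() hides the dialog
    if d.status:           # True only if the OK button was clicked
        page_url = "%s" % d.url.text()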

@@ -37,11 +37,12 @@ from calibre_plugins.fanfictiondownloader_plugin.common_utils import (set_plugin
 from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, writers, exceptions
 from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.htmlcleanup import stripHTML
 from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.epubutils import get_dcsource, get_dcsource_chaptercount, get_story_url_from_html
+from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.geturls import get_urls_from_page
 
 from calibre_plugins.fanfictiondownloader_plugin.config import (prefs, permitted_values)
 from calibre_plugins.fanfictiondownloader_plugin.dialogs import (
     AddNewDialog, UpdateExistingDialog, display_story_list, DisplayStoryListDialog,
-    LoopProgressDialog, UserPassDialog, AboutDialog,
+    LoopProgressDialog, UserPassDialog, AboutDialog, CollectURLDialog,
     OVERWRITE, OVERWRITEALWAYS, UPDATE, UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY,
     NotGoingToDownload )
@@ -192,6 +193,11 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
                                                         shortcut_name='Get URLs from Selected Books',
                                                         triggered=self.get_list_urls)
 
+        self.get_list_action = self.create_menu_item_ex(self.menu, 'Get Story URLs from Web Page', image='view.png',
+                                                        unique_name='Get Story URLs from Web Page',
+                                                        shortcut_name='Get Story URLs from Web Page',
+                                                        triggered=self.get_urls_from_page)
+
         self.menu.addSeparator()
         self.config_action = create_menu_action_unique(self, self.menu, '&Configure Plugin', shortcut=False,
                                                        image= 'config.png',
@@ -246,6 +252,26 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
                (prefs['addtolists'] or prefs['addtoreadlists']) :
             self._update_reading_lists(self.gui.library_view.get_selected_ids(),add)
 
+    def get_urls_from_page(self):
+        d = CollectURLDialog(self.gui,"Get Story URLs from Web Page")
+        d.exec_()
+        if not d.status:
+            return
+
+        print("URL:%s"%d.url.text())
+
+        url_list = get_urls_from_page("%s"%d.url.text())
+
+        if url_list:
+            d = ViewLog(_("List of URLs"),"\n".join(url_list),parent=self.gui)
+            d.setWindowIcon(get_icon('bookmarks.png'))
+            d.exec_()
+        else:
+            info_dialog(self.gui, _('List of URLs'),
+                        _('No Valid URLs found on given page.'),
+                        show=True,
+                        show_copy_button=False)
+
     def get_list_urls(self):
         if len(self.gui.library_view.get_selected_ids()) > 0:
             book_list = map( partial(self._convert_id_to_book, good=False), self.gui.library_view.get_selected_ids() )
@@ -498,7 +524,7 @@ make_firstimage_cover:true
 
         # find dups
         mi = MetaInformation(story.getMetadata("title", removeallentities=True),
-                             (story.getMetadata("author", removeallentities=True),)) # author is a list.
+                             [story.getMetadata("author", removeallentities=True)]) # author is a list.
         identicalbooks = db.find_identical_books(mi)
         ## removed for being overkill.
         # for ib in identicalbooks:
@@ -784,7 +810,7 @@ make_firstimage_cover:true
             if epubmi.cover_data[1] is not None:
                 db.set_cover(book_id, epubmi.cover_data[1])
 
-        # set author link if found. All current adapters have authorUrl.
+        # set author link if found. All current adapters have authorUrl, except anonymous on AO3.
         if 'authorUrl' in book['all_metadata']:
             autid=db.get_author_id(book['author'])
             db.set_link_field_for_author(autid, unicode(book['all_metadata']['authorUrl']),
@@ -926,7 +952,7 @@ make_firstimage_cover:true
             confirm(message,'fanfictiondownloader_no_reading_list_%s'%l, self.gui)
 
     def _find_existing_book_id(self,db,book,matchurl=True):
-        mi = MetaInformation(book["title"],(book["author"],)) # author is a list.
+        mi = MetaInformation(book["title"],[book["author"]]) # author is a list.
         identicalbooks = db.find_identical_books(mi)
         if matchurl: # only *really* identical if URL matches, too.
             for ib in identicalbooks:
@@ -937,7 +963,7 @@ make_firstimage_cover:true
         return None
 
     def _make_mi_from_book(self,book):
-        mi = MetaInformation(book['title'],(book['author'],)) # author is a list.
+        mi = MetaInformation(book['title'],[book['author']]) # author is a list.
         mi.set_identifiers({'url':book['url']})
         mi.publisher = book['publisher']
         mi.tags = book['tags']

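A note on the (author,) → [author] changes in the hunks above: calibre's MetaInformation takes its authors argument as a list of name strings, and db.find_identical_books works against that list, so passing a real list rather than a one-element tuple keeps the metadata object consistent with what the rest of calibre expects. A minimal illustration; the import path is the standard calibre one, but treat the snippet as a sketch:

    from calibre.ebooks.metadata import MetaInformation
    mi = MetaInformation("Some Title", ["Some Author"])  # authors passed as a list, matching the fix above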

@@ -17,7 +17,7 @@
 
 import logging
 ## XXX cli option for logging level.
-logging.basicConfig(level=logging.DEBUG,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
+logging.basicConfig(level=logging.INFO,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
 
 import sys, os
 from os.path import normpath, expanduser, isfile, join
@@ -30,6 +30,7 @@ from subprocess import call
 
 from fanficdownloader import adapters,writers,exceptions
 from fanficdownloader.epubutils import get_dcsource_chaptercount, get_update_data
+from fanficdownloader.geturls import get_urls_from_page
 
 if sys.version_info < (2, 5):
     print "This program requires Python 2.5 or newer."
@@ -70,6 +71,9 @@ def main():
     parser.add_option("--force",
                       action="store_true", dest="force",
                       help="Force overwrite or update of an existing epub, download and overwrite all chapters.",)
+    parser.add_option("-l", "--list",
+                      action="store_true", dest="list",
+                      help="Get list of valid story URLs from page given.",)
 
     (options, args) = parser.parse_args()
@@ -116,6 +120,12 @@ def main():
             (var,val) = opt.split('=')
             config.set("overrides",var,val)
 
+    if options.list:
+        retlist = get_urls_from_page(args[0])
+        print "\n".join(retlist)
+        return
+
     try:
         ## Attempt to update an existing epub.
         if options.update:
@@ -202,7 +212,7 @@ def main():
         print dne
     except exceptions.UnknownSite, us:
         print us
 
 if __name__ == "__main__":
     #import time
     #start = time.time()

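Taken together with the -l/--list option above, the CLI can now be pointed at a listing page instead of a single story URL. A hypothetical invocation, assuming the CLI entry script is downloader.py and using a made-up author-page URL:

    python downloader.py -l http://www.fanfiction.net/u/1234567/SomeAuthor

This prints one valid story URL per line, suitable for feeding back into the downloader or another script.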

@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2012 Fanficdownloader team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import urlparse
+import urllib2 as u2
+import ConfigParser
+
+from BeautifulSoup import BeautifulSoup
+
+from gziphttp import GZipProcessor
+
+import adapters
+
+def get_urls_from_page(url):
+    opener = u2.build_opener(u2.HTTPCookieProcessor(),GZipProcessor())
+    soup = BeautifulSoup(opener.open(url).read())
+
+    normalized = set() # normalized url
+    retlist = []       # orig urls.
+    config = ConfigParser.SafeConfigParser()
+
+    for a in soup.findAll('a'):
+        if a.has_key('href'):
+            href = form_url(url,a['href'])
+            try:
+                adapter = adapters.getAdapter(config,href,"EPUB")
+                if adapter.story.getMetadata('storyUrl') not in normalized:
+                    normalized.add(adapter.story.getMetadata('storyUrl'))
+                    retlist.append(href)
+            except:
+                pass
+
+    return retlist
+
+def form_url(parenturl,url):
+    url = url.strip() # ran across an image with a space in the
+                      # src. Browser handled it, so we'd better, too.
+
+    if "//" in url or parenturl == None:
+        returl = url
+    else:
+        parsedUrl = urlparse.urlparse(parenturl)
+        if url.startswith("/") :
+            returl = urlparse.urlunparse(
+                (parsedUrl.scheme,
+                 parsedUrl.netloc,
+                 url,
+                 '','',''))
+        else:
+            toppath=""
+            if parsedUrl.path.endswith("/"):
+                toppath = parsedUrl.path
+            else:
+                toppath = parsedUrl.path[:parsedUrl.path.rindex('/')]
+            returl = urlparse.urlunparse(
+                (parsedUrl.scheme,
+                 parsedUrl.netloc,
+                 toppath + '/' + url,
+                 '','',''))
+    return returl
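Since geturls.py is a new module, a few illustrative calls to form_url may help; the expected results are inferred from the code above rather than from test output:

    # Absolute href (contains "//"): returned unchanged.
    form_url("http://site.com/dir/page.html", "http://other.com/x")
    #   -> "http://other.com/x"

    # Root-relative href: keeps the parent URL's scheme and host.
    form_url("http://site.com/dir/page.html", "/story.php?id=1")
    #   -> "http://site.com/story.php?id=1"

    # Relative href: resolved against the parent URL's directory.
    form_url("http://site.com/dir/page.html", "story.php?id=1")
    #   -> "http://site.com/dir/story.php?id=1"

get_urls_from_page() then runs every <a href> on the page through adapters.getAdapter(), keeping only hrefs that some adapter accepts and de-duplicating on the adapter's normalized storyUrl.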