From 321daf7de9c4e6d0fe84ad229e6c46e1521bb2f9 Mon Sep 17 00:00:00 2001
From: Jim Miller <retiefjimm@gmail.com>
Date: Thu, 14 Jun 2012 18:41:26 -0500
Subject: [PATCH] Add 'Get Story URLs from Web Page' feature to plugin and
 CLI(-l).

---
 calibre-plugin/dialogs.py     | 37 +++++++++++++++++
 calibre-plugin/ffdl_plugin.py | 36 ++++++++++++++---
 downloader.py                 | 14 ++++++-
 fanficdownloader/geturls.py   | 76 +++++++++++++++++++++++++++++++++++
 4 files changed, 156 insertions(+), 7 deletions(-)
 create mode 100644 fanficdownloader/geturls.py

diff --git a/calibre-plugin/dialogs.py b/calibre-plugin/dialogs.py
index f2b1b9a9..977ba26c 100644
--- a/calibre-plugin/dialogs.py
+++ b/calibre-plugin/dialogs.py
@@ -177,6 +177,43 @@ class FakeLineEdit():
     def text(self):
         pass
     
+class CollectURLDialog(QDialog):
+    '''
+    Prompt for one URL; status is True only after OK has been clicked.
+    '''
+    def __init__(self, gui, title):
+        QDialog.__init__(self, gui)
+        self.gui = gui
+        self.status=False # outcome flag; set by ok() / cancel() below.
+
+        self.l = QGridLayout()
+        self.setLayout(self.l)
+
+        self.setWindowTitle(title)
+        self.l.addWidget(QLabel(title),0,0,1,2) # title text repeated in row 0.
+        # Row 1: "URL:" label beside the entry field the caller reads back.
+        self.l.addWidget(QLabel("URL:"),1,0)
+        self.url = QLineEdit(self)
+        self.l.addWidget(self.url,1,1)
+        # Row 2: OK and Cancel, each wired to the method that records status.
+        self.ok_button = QPushButton('OK', self)
+        self.ok_button.clicked.connect(self.ok)
+        self.l.addWidget(self.ok_button,2,0)
+
+        self.cancel_button = QPushButton('Cancel', self)
+        self.cancel_button.clicked.connect(self.cancel)
+        self.l.addWidget(self.cancel_button,2,1)
+
+        self.resize(self.sizeHint())
+
+    def ok(self):
+        self.status=True
+        self.hide() # NOTE(review): presumably this ends the caller's exec_() -- confirm.
+
+    def cancel(self):
+        self.status=False
+        self.hide() # NOTE(review): same assumption as in ok() -- confirm.
+
 class UserPassDialog(QDialog):
     '''
     Need to collect User/Pass for some sites.
diff --git a/calibre-plugin/ffdl_plugin.py b/calibre-plugin/ffdl_plugin.py
index e8e9b761..36c8e0b7 100644
--- a/calibre-plugin/ffdl_plugin.py
+++ b/calibre-plugin/ffdl_plugin.py
@@ -37,11 +37,12 @@ from calibre_plugins.fanfictiondownloader_plugin.common_utils import (set_plugin
 from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, writers, exceptions
 from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.htmlcleanup import stripHTML
 from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.epubutils import get_dcsource, get_dcsource_chaptercount, get_story_url_from_html
+from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.geturls import get_urls_from_page
 
 from calibre_plugins.fanfictiondownloader_plugin.config import (prefs, permitted_values)
 from calibre_plugins.fanfictiondownloader_plugin.dialogs import (
     AddNewDialog, UpdateExistingDialog, display_story_list, DisplayStoryListDialog,
-    LoopProgressDialog, UserPassDialog, AboutDialog,
+    LoopProgressDialog, UserPassDialog, AboutDialog, CollectURLDialog, 
     OVERWRITE, OVERWRITEALWAYS, UPDATE, UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY,
     NotGoingToDownload )
 
@@ -192,6 +193,11 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
                                                             shortcut_name='Get URLs from Selected Books',
                                                             triggered=self.get_list_urls)
 
+            self.get_list_action = self.create_menu_item_ex(self.menu, 'Get Story URLs from Web Page', image='view.png',
+                                                            unique_name='Get Story URLs from Web Page',
+                                                            shortcut_name='Get Story URLs from Web Page',
+                                                            triggered=self.get_urls_from_page)
+
             self.menu.addSeparator()
             self.config_action = create_menu_action_unique(self, self.menu, '&Configure Plugin', shortcut=False,
                                                            image= 'config.png',
@@ -246,6 +252,26 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
                 (prefs['addtolists'] or prefs['addtoreadlists']) :
             self._update_reading_lists(self.gui.library_view.get_selected_ids(),add)
 
+    def get_urls_from_page(self):
+        '''Ask for a web page URL and show the story URLs found on that page.'''
+        d = CollectURLDialog(self.gui,"Get Story URLs from Web Page")
+        d.exec_()
+        if not d.status:
+            return
+        # unicode(), not "%s"%: the latter goes through str() and can choke on non-ASCII.
+        url = unicode(d.url.text()).strip()
+        if not url:
+            return # OK pressed with nothing entered; there is nothing to fetch.
+        url_list = get_urls_from_page(url)
+        if url_list:
+            d = ViewLog(_("List of URLs"),"\n".join(url_list),parent=self.gui)
+            d.setWindowIcon(get_icon('bookmarks.png'))
+            d.exec_()
+        else:
+            info_dialog(self.gui, _('List of URLs'),
+                        _('No Valid URLs found on given page.'),
+                        show=True, show_copy_button=False)
+
     def get_list_urls(self):
         if len(self.gui.library_view.get_selected_ids()) > 0:
             book_list = map( partial(self._convert_id_to_book, good=False), self.gui.library_view.get_selected_ids() )
@@ -498,7 +524,7 @@ make_firstimage_cover:true
 
             # find dups
             mi = MetaInformation(story.getMetadata("title", removeallentities=True),
-                                 (story.getMetadata("author", removeallentities=True),)) # author is a list.
+                                 [story.getMetadata("author", removeallentities=True)]) # author is a list.
             identicalbooks = db.find_identical_books(mi)
             ## removed for being overkill.
             # for ib in identicalbooks:
@@ -784,7 +810,7 @@ make_firstimage_cover:true
             if epubmi.cover_data[1] is not None:
                 db.set_cover(book_id, epubmi.cover_data[1])
 
-        # set author link if found.  All current adapters have authorUrl.
+        # set author link if found.  All current adapters have authorUrl, except anonymous on AO3.
         if 'authorUrl' in book['all_metadata']:
             autid=db.get_author_id(book['author'])
             db.set_link_field_for_author(autid, unicode(book['all_metadata']['authorUrl']),
@@ -926,7 +952,7 @@ make_firstimage_cover:true
                         confirm(message,'fanfictiondownloader_no_reading_list_%s'%l, self.gui)
 
     def _find_existing_book_id(self,db,book,matchurl=True):
-        mi = MetaInformation(book["title"],(book["author"],)) # author is a list.
+        mi = MetaInformation(book["title"],[book["author"]]) # author is a list.
         identicalbooks = db.find_identical_books(mi)
         if matchurl: # only *really* identical if URL matches, too.
             for ib in identicalbooks:
@@ -937,7 +963,7 @@ make_firstimage_cover:true
         return None
     
     def _make_mi_from_book(self,book):
-        mi = MetaInformation(book['title'],(book['author'],)) # author is a list.
+        mi = MetaInformation(book['title'],[book['author']]) # author is a list.
         mi.set_identifiers({'url':book['url']})
         mi.publisher = book['publisher']
         mi.tags = book['tags']
diff --git a/downloader.py b/downloader.py
index 8f405e88..abc3781d 100644
--- a/downloader.py
+++ b/downloader.py
@@ -17,7 +17,7 @@
 
 import logging
 ## XXX cli option for logging level.
-logging.basicConfig(level=logging.DEBUG,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
+logging.basicConfig(level=logging.INFO,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
 
 import sys, os
 from os.path import normpath, expanduser, isfile, join
@@ -30,6 +30,7 @@ from subprocess import call
 
 from fanficdownloader import adapters,writers,exceptions
 from fanficdownloader.epubutils import get_dcsource_chaptercount, get_update_data
+from fanficdownloader.geturls import get_urls_from_page
 
 if sys.version_info < (2, 5):
     print "This program requires Python 2.5 or newer."
@@ -70,6 +71,9 @@ def main():
    parser.add_option("--force",
                      action="store_true", dest="force",
                      help="Force overwrite or update of an existing epub, download and overwrite all chapters.",)
+   parser.add_option("-l", "--list",
+                     action="store_true", dest="list",
+                     help="Get list of valid story URLs from page given.",)
    
    (options, args) = parser.parse_args()
 
@@ -116,6 +120,12 @@ def main():
            (var,val) = opt.split('=')
            config.set("overrides",var,val)
 
+   if options.list:
+       retlist = get_urls_from_page(args[0])
+       print "\n".join(retlist)
+               
+       return
+
    try:
        ## Attempt to update an existing epub.
        if options.update:
@@ -202,7 +212,7 @@ def main():
        print dne
    except exceptions.UnknownSite, us:
        print us
-   
+
 if __name__ == "__main__":
     #import time
     #start = time.time()
diff --git a/fanficdownloader/geturls.py b/fanficdownloader/geturls.py
new file mode 100644
index 00000000..65bf9398
--- /dev/null
+++ b/fanficdownloader/geturls.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2012 Fanficdownloader team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import urlparse
+import urllib2 as u2
+import ConfigParser
+
+from BeautifulSoup import BeautifulSoup 
+from gziphttp import GZipProcessor
+
+import adapters
+
+def get_urls_from_page(url):
+    '''Return links found on the page at url for which an adapter exists,
+    keeping only the first link seen for each distinct storyUrl.'''
+    opener = u2.build_opener(u2.HTTPCookieProcessor(),GZipProcessor())
+    soup = BeautifulSoup(opener.open(url).read()) # fetch errors propagate.
+    normalized = set() # storyUrl values already collected.
+    retlist = [] # hrefs as resolved against url, in page order.
+    config = ConfigParser.SafeConfigParser()
+    for a in soup.findAll('a'):
+        if not a.has_key('href'):
+            continue
+        try:
+            href = form_url(url,a['href'])
+            adapter = adapters.getAdapter(config,href,"EPUB")
+            storyurl = adapter.story.getMetadata('storyUrl')
+        except Exception: # no adapter or bad href: skip link, but let Ctrl-C through.
+            continue
+        if storyurl not in normalized:
+            normalized.add(storyurl)
+            retlist.append(href)
+    return retlist
+
+def form_url(parenturl,url):
+    '''
+    Resolve url, as found in an href or src attribute of the page at
+    parenturl, into an absolute URL.
+
+    parenturl -- URL of the containing page, or None to return url
+                 with only surrounding whitespace removed.
+    url       -- link target; absolute, scheme-relative (//host/x),
+                 site-relative (/x) or page-relative (x, ../x, ?q=1).
+
+    Returns the resolved URL string.
+
+    >>> form_url("http://host/dir/","story.php?sid=1")
+    'http://host/dir/story.php?sid=1'
+    '''
+    url = url.strip() # ran across an image with a space in the
+                      # src. Browser handled it, so we'd better, too.
+
+    if parenturl is None:
+        return url
+
+    # urljoin instead of gluing path pieces by hand: it copes with a
+    # parent path ending in '/' (no 'dir//page'), a parent with no
+    # path at all (http://host), '../' segments, and links that merely
+    # contain '//' inside a query string.
+    return urlparse.urljoin(parenturl,url)
+