Merge from trunk

2026-05-08 12:53:37 +02:00 · 2010-06-24 19:10:24 +01:00 · 2010-06-24 19:10:24 +01:00 · 90be73fe5b
commit 90be73fe5b
parent 058ddc1274 e7eb5b6965
5 changed files with 117 additions and 46 deletions
--- a/resources/recipes/national_post.recipe
+++ b/resources/recipes/national_post.recipe
@ -7,18 +7,18 @@ class NYTimes(BasicNewsRecipe):
    __author__  = 'Krittika Goyal'
    description = 'Canadian national newspaper'
    timefmt = ' [%d %b, %Y]'
-    needs_subscription = False
    language = 'en_CA'
+    needs_subscription = False

    no_stylesheets = True
    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
-    #remove_tags_after  = dict(name='td', attrs={'class':'newptool1'})
+    remove_tags_after  = dict(name='div', attrs={'class':'npStoryTools npWidth1-6 npRight npTxtStrong'})
    remove_tags = [
       dict(name='iframe'),
-       dict(name='div', attrs={'class':'story-tools'}),
+       dict(name='div', attrs={'class':['story-tools', 'npStoryTools npWidth1-6 npRight npTxtStrong']}),
       #dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
       #dict(name='form', attrs={'onsubmit':''}),
-       #dict(name='table', attrs={'cellspacing':'0'}),
+       dict(name='ul', attrs={'class':'npTxtAlt npGroup npTxtCentre npStoryShare npTxtStrong npTxtDim'}),
    ]

   # def preprocess_html(self, soup):
@ -37,7 +37,7 @@ def nejm_get_index(self):
    def parse_index(self):
            soup = self.nejm_get_index()

-            div = soup.find(id='LegoText4')
+            div = soup.find(id='npContentMain')

            current_section = None
            current_articles = []
@ -50,7 +50,7 @@ def parse_index(self):
                    current_section = self.tag_to_string(x)
                    current_articles = []
                    self.log('\tFound section:', current_section)
-                if current_section is not None and x.name == 'h3':
+                if current_section is not None and x.name == 'h5':
                    # Article found
                    title = self.tag_to_string(x)
                    a = x.find('a', href=lambda x: x and 'story' in x)
@ -59,8 +59,8 @@ def parse_index(self):
                    url = a.get('href', False)
                    if not url or not title:
                        continue
-                    if url.startswith('story'):
-                         url = 'http://www.nationalpost.com/todays-paper/'+url
+                    #if url.startswith('story'):
+                    url = 'http://www.nationalpost.com/todays-paper/'+url
                    self.log('\t\tFound article:', title)
                    self.log('\t\t\t', url)
                    current_articles.append({'title': title, 'url':url,
@ -70,28 +70,11 @@ def parse_index(self):
                feeds.append((current_section, current_articles))

            return feeds
-
    def preprocess_html(self, soup):
-        story = soup.find(name='div', attrs={'class':'triline'})
-        page2_link = soup.find('p','pagenav')
-        if page2_link:
-            atag = page2_link.find('a',href=True)
-            if atag:
-                page2_url = atag['href']
-                if page2_url.startswith('story'):
-                         page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url
-                elif page2_url.startswith( '/todays-paper/story.html'):
-                    page2_url = 'http://www.nationalpost.com/'+page2_url
-                page2_soup = self.index_to_soup(page2_url)
-                if page2_soup:
-                    page2_content = page2_soup.find('div','story-content')
-                    if page2_content:
-                        full_story = BeautifulSoup('<div></div>')
-                        full_story.insert(0,story)
-                        full_story.insert(1,page2_content)
-                        story = full_story
+        story = soup.find(name='div', attrs={'id':'npContentMain'})
+        ##td = heading.findParent(name='td')
+        ##td.extract()
        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        body = soup.find(name='body')
        body.insert(0, story)
        return soup
-
--- a/resources/recipes/new_scientist.recipe
+++ b/resources/recipes/new_scientist.recipe
@ -32,15 +32,16 @@ class NewScientist(BasicNewsRecipe):
                        }
    preprocess_regexps = [(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')]

-    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','nsblgposts','hldgalcols']})]
+    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]

    remove_tags = [
                     dict(name='div'  , attrs={'class':['hldBd','adline','pnl','infotext' ]})
-                    ,dict(name='div'  , attrs={'id'   :['compnl','artIssueInfo','artTools']})
+                    ,dict(name='div'  , attrs={'id'   :['compnl','artIssueInfo','artTools','comments','blgsocial']})
                    ,dict(name='p'    , attrs={'class':['marker','infotext'               ]})
                    ,dict(name='meta' , attrs={'name' :'description'                       })
+                    ,dict(name='a'    , attrs={'rel'  :'tag'                                })
                  ]
-    remove_tags_after = dict(attrs={'class':'nbpcopy'})
+    remove_tags_after = dict(attrs={'class':['nbpcopy','comments']})
    remove_attributes = ['height','width']

    feeds          = [
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@ -3,17 +3,18 @@
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import traceback, sys, textwrap, re
+import traceback, sys, textwrap, re, urllib2
 from threading import Thread

-from calibre import prints
+from calibre import prints, browser
 from calibre.utils.config import OptionParser
 from calibre.utils.logging import default_log
 from calibre.customize import Plugin
+from calibre.ebooks.metadata.library_thing import OPENLIBRARY

 metadata_config = None

-class MetadataSource(Plugin):
+class MetadataSource(Plugin): # {{{

    author = 'Kovid Goyal'

@ -130,7 +131,9 @@ def save_settings(self, w):
    def customization_help(self):
        return 'This plugin can only be customized using the GUI'

-class GoogleBooks(MetadataSource):
+    # }}}
+
+class GoogleBooks(MetadataSource): # {{{

    name = 'Google Books'
    description = _('Downloads metadata from Google Books')
@ -145,8 +148,9 @@ def fetch(self):
            self.exception = e
            self.tb = traceback.format_exc()

+    # }}}

-class ISBNDB(MetadataSource):
+class ISBNDB(MetadataSource): # {{{

    name = 'IsbnDB'
    description = _('Downloads metadata from isbndb.com')
@ -181,7 +185,9 @@ def string_customization_help(self):
                'and enter your access key below.')
        return '<p>'+ans%('<a href="http://www.isbndb.com">', '</a>')

-class Amazon(MetadataSource):
+    # }}}
+
+class Amazon(MetadataSource): # {{{

    name = 'Amazon'
    metadata_type = 'social'
@ -198,7 +204,9 @@ def fetch(self):
            self.exception = e
            self.tb = traceback.format_exc()

-class LibraryThing(MetadataSource):
+    # }}}
+
+class LibraryThing(MetadataSource): # {{{

    name = 'LibraryThing'
    metadata_type = 'social'
@ -207,7 +215,6 @@ class LibraryThing(MetadataSource):
    def fetch(self):
        if not self.isbn:
            return
-        from calibre import browser
        from calibre.ebooks.metadata import MetaInformation
        import json
        br = browser()
@ -228,6 +235,7 @@ def fetch(self):
        except Exception, e:
            self.exception = e
            self.tb = traceback.format_exc()
+    # }}}


 def result_index(source, result):
@ -268,6 +276,31 @@ def join(self):
        for s in self.sources:
            s.join()

+def filter_metadata_results(item):
+    keywords = ["audio", "tape", "cassette", "abridged", "playaway"]
+    for keyword in keywords:
+        if item.publisher and keyword in item.publisher.lower():
+            return False
+    return True
+
+class HeadRequest(urllib2.Request):
+    def get_method(self):
+        return "HEAD"
+
+def do_cover_check(item):
+    opener = browser()
+    item.has_cover = False
+    try:
+        opener.open(HeadRequest(OPENLIBRARY%item.isbn), timeout=5)
+        item.has_cover = True
+    except:
+        pass # Cover not found
+
+def check_for_covers(items):
+    threads = [Thread(target=do_cover_check, args=(item,)) for item in items]
+    for t in threads: t.start()
+    for t in threads: t.join()
+
 def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
           verbose=0):
    assert not(title is None and author is None and publisher is None and \
@ -285,10 +318,60 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
    for fetcher in fetchers[1:]:
        merge_results(results, fetcher.results)

-    results = sorted(results, cmp=lambda x, y : cmp(
-            (x.comments.strip() if x.comments else ''),
-            (y.comments.strip() if y.comments else '')
-                                                  ), reverse=True)
+    results = list(filter(filter_metadata_results, results))
+
+    check_for_covers(results)
+
+    words = ("the", "a", "an", "of", "and")
+    prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
+    trailing_paren_pat = re.compile(r'\(.*\)$')
+    whitespace_pat = re.compile(r'\s+')
+
+    def sort_func(x, y):
+
+        def cleanup_title(s):
+            s = s.strip().lower()
+            s = prefix_pat.sub(' ', s)
+            s = trailing_paren_pat.sub('', s)
+            s = whitespace_pat.sub(' ', s)
+            return s.strip()
+
+        t = cleanup_title(title)
+        x_title = cleanup_title(x.title)
+        y_title = cleanup_title(y.title)
+
+        # prefer titles that exactly match the search title (after cleanup)
+        tx = cmp(t, x_title)
+        ty = cmp(t, y_title)
+        result = 0 if abs(tx) == abs(ty) else abs(tx) - abs(ty)
+
+        # then prefer titles that have a cover image
+        if result == 0:
+            result = -cmp(x.has_cover, y.has_cover)
+
+        # then prefer titles with the longest comment (lengths within 10% count as a tie)
+        if result == 0:
+            cx = len(x.comments.strip() if x.comments else '')
+            cy = len(y.comments.strip() if y.comments else '')
+            t = (cx + cy) / 20
+            result = cy - cx
+            if abs(result) < t:
+                result = 0
+
+        return result
+
+    results = sorted(results, cmp=sort_func)
+
+    # if for some reason there is no comment in the top selection, go looking for one
+    if len(results) > 1:
+        if not results[0].comments or len(results[0].comments) == 0:
+            for r in results[1:]:
+                if title.lower() == r.title[:len(title)].lower() and r.comments and len(r.comments):
+                    results[0].comments = r.comments
+                    break
+
+ #   for r in results:
+ #       print "{0:14.14} {1:30.30} {2:20.20} {3:6} {4}".format(r.isbn, r.title, r.publisher, len(r.comments if r.comments else ''), r.has_cover)

    return results, [(x.name, x.exception, x.tb) for x in fetchers]

--- a/src/calibre/gui2/dialogs/config/add_save.ui
+++ b/src/calibre/gui2/dialogs/config/add_save.ui
@ -181,14 +181,14 @@ Title match ignores leading indefinite articles (&quot;the&quot;, &quot;a&quot;,
    <item>
     <widget class="QCheckBox" name="preserve_user_collections">
      <property name="text">
-       <string>Preserve user collections.</string>
+       <string>Preserve device collections.</string>
      </property>
     </widget>
    </item>
    <item>
     <widget class="QLabel" name="label_41">
      <property name="text">
-       <string>If checked, collections will not be deleted even if a book with changed metadata is resent and the collection is not in the book's metadata. In addition, editing collections on the device view will be enabled.</string>
+       <string>If checked, collections will not be deleted even if a book with changed metadata is resent and the collection is not in the book's metadata. In addition, editing collections in the device view will be enabled. If unchecked, collections will always reflect only the metadata in the calibre library.</string>
      </property>
      <property name="wordWrap">
       <bool>true</bool>
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -325,6 +325,10 @@ Post any output you see in a help message on the `Forum <http://www.mobileread.c
 |app| is not starting on OS X?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+One common cause of failures on OS X is the use of accessibility technologies that are incompatible with the graphics toolkit |app| uses.
+Try turning off VoiceOver if you have it on. Also go to System Preferences->System->Universal Access and turn off the setting for enabling
+access for assistive devices in all the tabs.
+
 You can obtain debug output about why |app| is not starting by running `Console.app`. Debug output will
 be printed to it. If the debug output contains a line that looks like::

@ -334,9 +338,9 @@ then the problem is probably a corrupted font cache. You can clear the cache by
 `instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't
 solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like.

-
 My antivirus program claims |app| is a virus/trojan?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 Your antivirus program is wrong. |app| is a completely open source product. You can actually browse the source code yourself (or hire someone to do it for you) to verify that it is not a virus. Please report the false identification to whatever company you buy your antivirus software from. If the antivirus program is preventing you from downloading/installing |app|, disable it temporarily, install |app| and then re-enable it.

 How do I use purchased EPUB books with |app|?