mirror of
git://github.com/kovidgoyal/calibre.git
synced 2026-05-08 22:53:38 +02:00
Amazon metadata download: Try to scrape series information from the amazon details page. Note that currently very few books have series info available. Often the page for hardcover will have series, but the Kindle edition will not. In such cases calibre may or may not find the series, depending on which page it uses.
This commit is contained in:
parent
60c97aec90
commit
45e7e3f507
1 changed files with 49 additions and 6 deletions
|
|
@ -156,6 +156,16 @@ def __init__(self, url, result_queue, browser, log, relevance, domain,
|
|||
for name in names:
|
||||
self.lang_map[name] = code
|
||||
|
||||
self.series_pat = re.compile(
|
||||
r'''
|
||||
\|\s* # Prefix
|
||||
(Series)\s*:\s* # Series declaration
|
||||
(?P<series>.+?)\s+ # The series name
|
||||
\((Book)\s* # Book declaration
|
||||
(?P<index>[0-9.]+) # Series index
|
||||
\s*\)
|
||||
''', re.X)
|
||||
|
||||
def delocalize_datestr(self, raw):
|
||||
if not self.months:
|
||||
return raw
|
||||
|
|
@ -265,6 +275,15 @@ def parse_details(self, raw, root):
|
|||
except:
|
||||
self.log.exception('Error parsing comments for url: %r'%self.url)
|
||||
|
||||
try:
|
||||
series, series_index = self.parse_series(root)
|
||||
if series:
|
||||
mi.series, mi.series_index = series, series_index
|
||||
elif self.testing:
|
||||
mi.series, mi.series_index = 'Dummy series for testing', 1
|
||||
except:
|
||||
self.log.exception('Error parsing series for url: %r'%self.url)
|
||||
|
||||
try:
|
||||
self.cover_url = self.parse_cover(root)
|
||||
except:
|
||||
|
|
@ -398,6 +417,20 @@ def parse_comments(self, root):
|
|||
ans += self._render_comments(desc[0])
|
||||
return ans
|
||||
|
||||
def parse_series(self, root):
    # Scrape series name and index from the "buying" block on the Amazon
    # details page. Returns (series, series_index) or (None, None) when no
    # series information is present. Very few Amazon pages carry this data,
    # and availability differs between editions (hardcover vs. Kindle).
    no_series = (None, None)
    buying = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
    if not buying:
        return no_series
    # Flatten the element to plain text and collapse runs of whitespace so
    # the verbose series_pat regex can match across original line breaks.
    text = self.tostring(buying[0], method='text', encoding=unicode)
    text = re.sub(r'\s+', ' ', text)
    m = self.series_pat.search(text)
    if m is None:
        return no_series
    name = m.group('series')
    if not name:
        return no_series
    # NOTE(review): float() may raise ValueError on a malformed index such
    # as "1.2.3"; the caller wraps this in a try/except, matching the
    # original behavior where the exception propagated the same way.
    return (name, float(m.group('index')))
|
||||
|
||||
|
||||
def parse_cover(self, root):
|
||||
imgs = root.xpath('//img[@id="prodImage" and @src]')
|
||||
if imgs:
|
||||
|
|
@ -457,7 +490,7 @@ class Amazon(Source):
|
|||
capabilities = frozenset(['identify', 'cover'])
|
||||
touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
|
||||
'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate',
|
||||
'languages'])
|
||||
'languages', 'series'])
|
||||
has_html_comments = True
|
||||
supports_gzip_transfer_encoding = True
|
||||
|
||||
|
|
@ -685,13 +718,15 @@ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
|
|||
from lxml.html import tostring
|
||||
import html5lib
|
||||
|
||||
testing = getattr(self, 'running_a_test', False)
|
||||
|
||||
query, domain = self.create_query(log, title=title, authors=authors,
|
||||
identifiers=identifiers)
|
||||
if query is None:
|
||||
log.error('Insufficient metadata to construct query')
|
||||
return
|
||||
br = self.browser
|
||||
if getattr(self, 'running_a_test', False):
|
||||
if testing:
|
||||
print ('Using user agent for amazon: %s'%self.user_agent)
|
||||
try:
|
||||
raw = br.open_novisit(query, timeout=timeout).read().strip()
|
||||
|
|
@ -714,7 +749,7 @@ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
|
|||
raw = clean_ascii_chars(xml_to_unicode(raw,
|
||||
strip_encoding_pats=True, resolve_entities=True)[0])
|
||||
|
||||
if getattr(self, 'running_a_test', False):
|
||||
if testing:
|
||||
import tempfile
|
||||
with tempfile.NamedTemporaryFile(prefix='amazon_results_',
|
||||
suffix='.html', delete=False) as f:
|
||||
|
|
@ -757,8 +792,7 @@ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
|
|||
return
|
||||
|
||||
workers = [Worker(url, result_queue, br, log, i, domain, self,
|
||||
testing=getattr(self, 'running_a_test', False)) for i, url in
|
||||
enumerate(matches)]
|
||||
testing=testing) for i, url in enumerate(matches)]
|
||||
|
||||
for w in workers:
|
||||
w.start()
|
||||
|
|
@ -820,9 +854,18 @@ def download_cover(self, log, result_queue, abort, # {{{
|
|||
# To run these test use: calibre-debug -e
|
||||
# src/calibre/ebooks/metadata/sources/amazon.py
|
||||
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
||||
isbn_test, title_test, authors_test, comments_test)
|
||||
isbn_test, title_test, authors_test, comments_test, series_test)
|
||||
com_tests = [ # {{{
|
||||
|
||||
( # Series
|
||||
{'identifiers':{'amazon':'0756407117'}},
|
||||
[title_test(
|
||||
"Throne of the Crescent Moon"
|
||||
, exact=True), series_test('Crescent Moon Kingdoms', 1),
|
||||
comments_test('Makhslood'),
|
||||
]
|
||||
),
|
||||
|
||||
( # Different comments markup, using Book Description section
|
||||
{'identifiers':{'amazon':'0982514506'}},
|
||||
[title_test(
|
||||
|
|
|
|||
Loading…
Reference in a new issue