From d1caf858838a9d6bb2f110c59050e94e458db361 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sat, 1 May 2021 16:35:49 -0500 Subject: [PATCH] Extract tags when present Supported currently on Xenforo and AO3 --- sites/__init__.py | 1 + sites/ao3.py | 3 ++- sites/xenforo.py | 4 +++- sites/xenforo2.py | 4 +++- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/sites/__init__.py b/sites/__init__.py index ef468bc..67c577b 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -36,6 +36,7 @@ class Section: id = attr.ib(default=attr.Factory(_default_uuid_string, takes_self=True), converter=str) contents = attr.ib(default=attr.Factory(list)) footnotes = attr.ib(default=attr.Factory(list)) + tags = attr.ib(default=attr.Factory(list)) summary = attr.ib(default='') def __iter__(self): diff --git a/sites/ao3.py b/sites/ao3.py index 376b4d8..dd182c7 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -59,7 +59,8 @@ class ArchiveOfOurOwn(Site): title=soup.select('#workskin > .preface .title')[0].text.strip(), author=soup.select('#workskin .preface .byline a')[0].text.strip(), summary=soup.select('#workskin .preface .summary blockquote')[0].prettify(), - url=f'http://archiveofourown.org/works/{workid}' + url=f'http://archiveofourown.org/works/{workid}', + tags=[tag.get_text().strip() for tag in soup.select('.work.meta .tags a.tag')] ) # Fetch the chapter list as well because it contains info that's not in the full work diff --git a/sites/xenforo.py b/sites/xenforo.py index 9facffc..4c8b50b 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -133,10 +133,12 @@ class XenForo(Site): # clean out informational bits from the title for tag in title.find_all(class_='prefix'): tag.decompose() + tags = [tag.get_text().strip() for tag in soup.select('div.tagBlock a.tag')] return Section( title=title.get_text().strip(), author=soup.find('p', id='pageDescription').find('a', class_='username').get_text(), - url=url + url=url, + tags=tags ) def _posts_from_page(self, soup, postid=False): diff --git a/sites/xenforo2.py b/sites/xenforo2.py index ec08f12..bc9398c 100644 --- a/sites/xenforo2.py +++ b/sites/xenforo2.py @@ -16,10 +16,12 @@ class XenForo2(XenForo): # clean out informational bits from the title for tag in title.select('.labelLink,.label-append'): tag.decompose() + tags = [tag.get_text().strip() for tag in soup.select('.tagList a.tagItem')] return Section( title=title.get_text().strip(), author=soup.find('div', class_='p-description').find('a', class_='username').get_text(), - url=url + url=url, + tags=tags ) def _posts_from_page(self, soup, postid=False):