Extract tags when present

Currently supported on XenForo and AO3 (Archive of Our Own).
2025-12-13 11:53:43 +01:00 · 2021-05-01 16:35:49 -05:00 · 2021-05-01 16:35:49 -05:00 · d1caf85883
commit d1caf85883
parent 37cb0332b7
4 changed files with 9 additions and 3 deletions
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -36,6 +36,7 @@ class Section:
    id = attr.ib(default=attr.Factory(_default_uuid_string, takes_self=True), converter=str)
    contents = attr.ib(default=attr.Factory(list))
    footnotes = attr.ib(default=attr.Factory(list))
+    tags = attr.ib(default=attr.Factory(list))
    summary = attr.ib(default='')

    def __iter__(self):
--- a/sites/ao3.py
+++ b/sites/ao3.py
@@ -59,7 +59,8 @@ class ArchiveOfOurOwn(Site):
            title=soup.select('#workskin > .preface .title')[0].text.strip(),
            author=soup.select('#workskin .preface .byline a')[0].text.strip(),
            summary=soup.select('#workskin .preface .summary blockquote')[0].prettify(),
-            url=f'http://archiveofourown.org/works/{workid}'
+            url=f'http://archiveofourown.org/works/{workid}',
+            tags=[tag.get_text().strip() for tag in soup.select('.work.meta .tags a.tag')]
        )

        # Fetch the chapter list as well because it contains info that's not in the full work
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -133,10 +133,12 @@ class XenForo(Site):
        # clean out informational bits from the title
        for tag in title.find_all(class_='prefix'):
            tag.decompose()
+        tags = [tag.get_text().strip() for tag in soup.select('div.tagBlock a.tag')]
        return Section(
            title=title.get_text().strip(),
            author=soup.find('p', id='pageDescription').find('a', class_='username').get_text(),
-            url=url
+            url=url,
+            tags=tags
        )

    def _posts_from_page(self, soup, postid=False):
--- a/sites/xenforo2.py
+++ b/sites/xenforo2.py
@@ -16,10 +16,12 @@ class XenForo2(XenForo):
        # clean out informational bits from the title
        for tag in title.select('.labelLink,.label-append'):
            tag.decompose()
+        tags = [tag.get_text().strip() for tag in soup.select('.tagList a.tagItem')]
        return Section(
            title=title.get_text().strip(),
            author=soup.find('div', class_='p-description').find('a', class_='username').get_text(),
-            url=url
+            url=url,
+            tags=tags
        )

    def _posts_from_page(self, soup, postid=False):