1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00

Arbitrary: strip all namespaced elements

This strips elements such as `fb:like` and similar namespaced tags, which break some epub readers.

Refs: #41, #43
This commit is contained in:
David Lynch 2020-09-08 23:04:47 -05:00
parent 9c9877ed26
commit c208e33752

View file

@@ -4,6 +4,7 @@ import logging
import attr
import datetime
import json
import re
import os.path
from . import register, Site, Section, Chapter
@@ -103,6 +104,12 @@ class Arbitrary(Site):
if not soup.select(definition.content_selector):
return chapters
# clean up a few things which will definitely break epubs:
# TODO: expand this greatly, or make it configurable
for namespaced in soup.find_all(re.compile(r'[a-z]+:[a-z]+')):
# Namespaced elements are going to cause validation errors
namespaced.decompose()
for content in soup.select(definition.content_selector):
if definition.filter_selector:
for filtered in content.select(definition.filter_selector):