mirror of
https://github.com/kemayo/leech
synced 2025-12-06 08:22:56 +01:00
Arbitrary: strip all namespaced elements
This covers `fb:like` and similar namespaced elements, which break some epub readers. Refs: #41, #43
This commit is contained in:
parent
9c9877ed26
commit
c208e33752
1 changed file with 7 additions and 0 deletions
|
|
@@ -4,6 +4,7 @@ import logging
|
|||
import attr
|
||||
import datetime
|
||||
import json
|
||||
import re
|
||||
import os.path
|
||||
from . import register, Site, Section, Chapter
|
||||
|
||||
|
|
@@ -103,6 +104,12 @@ class Arbitrary(Site):
|
|||
if not soup.select(definition.content_selector):
|
||||
return chapters
|
||||
|
||||
# clean up a few things which will definitely break epubs:
|
||||
# TODO: expand this greatly, or make it configurable
|
||||
for namespaced in soup.find_all(re.compile(r'[a-z]+:[a-z]+')):
|
||||
# Namespaced elements are going to cause validation errors
|
||||
namespaced.decompose()
|
||||
|
||||
for content in soup.select(definition.content_selector):
|
||||
if definition.filter_selector:
|
||||
for filtered in content.select(definition.filter_selector):
|
||||
|
|
|
|||
Loading…
Reference in a new issue