mirror of
https://github.com/kemayo/leech
synced 2025-12-09 09:53:30 +01:00
Arbitrary: strip all namespaced elements
Namespaced elements are tags such as `fb:like`, which break some epub readers. Refs: #41, #43
This commit is contained in:
parent
9c9877ed26
commit
c208e33752
1 changed file with 7 additions and 0 deletions
|
|
@@ -4,6 +4,7 @@ import logging
|
||||||
import attr
|
import attr
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
import os.path
|
import os.path
|
||||||
from . import register, Site, Section, Chapter
|
from . import register, Site, Section, Chapter
|
||||||
|
|
||||||
|
|
@@ -103,6 +104,12 @@ class Arbitrary(Site):
|
||||||
if not soup.select(definition.content_selector):
|
if not soup.select(definition.content_selector):
|
||||||
return chapters
|
return chapters
|
||||||
|
|
||||||
|
# clean up a few things which will definitely break epubs:
|
||||||
|
# TODO: expand this greatly, or make it configurable
|
||||||
|
for namespaced in soup.find_all(re.compile(r'[a-z]+:[a-z]+')):
|
||||||
|
# Namespaced elements are going to cause validation errors
|
||||||
|
namespaced.decompose()
|
||||||
|
|
||||||
for content in soup.select(definition.content_selector):
|
for content in soup.select(definition.content_selector):
|
||||||
if definition.filter_selector:
|
if definition.filter_selector:
|
||||||
for filtered in content.select(definition.filter_selector):
|
for filtered in content.select(definition.filter_selector):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue