Reject HTML sites in no_convert_image

2026-04-28 09:54:14 +02:00 · 2025-04-03 09:46:27 +00:00 · 2025-04-03 09:46:27 +00:00 · 7123f7dd6f
commit 7123f7dd6f
parent 08a0f9b5fc
1 changed file with 8 additions and 0 deletions
--- a/fanficfare/story.py
+++ b/fanficfare/story.py
@@ -31,6 +31,7 @@ from . import six
 from .six.moves.urllib.parse import (urlparse, urlunparse)
 from .six import text_type as unicode
 from .six import string_types as basestring
+from .six import ensure_binary

 import bs4

@@ -189,6 +190,13 @@ def no_convert_image(url,data):

    ext=parsedUrl.path[parsedUrl.path.rfind('.')+1:].lower()

+    try:
+        sample_data = ensure_binary(data[:50])
+        if b'<!doctype html>' in sample_data or b'<!DOCTYPE html>' in sample_data:
+            raise exceptions.RejectImage("no_convert_image url:%s - html site"%url)
+    except (UnicodeEncodeError, TypeError) as e:
+        logger.debug("no_convert_image url:%s - Exception: %s"%(url,str(e)))
+
    if ext not in imagetypes:
        # not found at end of path, try end of whole URL in case of
        # parameter.