More direct way to handle /../ in Get Story URLs from web page; the previous approach broke other sites. #1160

2025-12-06 08:52:55 +01:00 · 2025-02-23 20:37:16 -06:00 · 2025-02-23 20:37:16 -06:00 · 8e55d1e6f4
commit 8e55d1e6f4
parent 9b8eb547fc
1 changed files with 9 additions and 3 deletions
--- a/fanficfare/geturls.py
+++ b/fanficfare/geturls.py
@ -23,7 +23,7 @@ import re

 # unicode in py2, str in py3
 from .six.moves.urllib.request import urlopen
-from .six.moves.urllib.parse import (urlparse, urlunparse, urljoin)
+from .six.moves.urllib.parse import (urlparse, urlunparse)
 from .six import text_type as unicode
 from .six import ensure_str

@ -137,7 +137,7 @@ def form_url(parenturl,url):
             returl = urlunparse(
                 (parsedUrl.scheme,
                  parsedUrl.netloc,
-                  urljoin(url,'.'),
+                  url,
                  '','',''))
         else:
             toppath=""
@ -148,7 +148,7 @@ def form_url(parenturl,url):
             returl = urlunparse(
                 (parsedUrl.scheme,
                  parsedUrl.netloc,
-                  urljoin(toppath + '/' + url,'.'),
+                  toppath + '/' + url,
                  '','',''))
     return returl

@ -181,6 +181,12 @@ def cleanup_url(href,configuration,foremail=False):
            href = href.replace('&index=1','')
        except Exception as e:
            logger.warning("Skipping royalroad email URL %s, got HTTP error %s"%(href,e))
+    if '/../' in href:
+        ## For mcstories.com, see #1160.  All my attempts to use
+        ## urljoin() got uncomfortably complex in the face of
+        ## javascript links and parameter URLs.  And normpath() will
+        ## give \ on Windows.
+        href = re.sub(r'([^/]+/../)',r'',href)
    return href

 def get_urls_from_imap(srv,user,passwd,folder,markread=True,normalize_urls=False):