From b9224f17c4c2052d2fcab913afdd7bb02224f272 Mon Sep 17 00:00:00 2001
From: Eli Schwartz
Date: Tue, 23 Apr 2019 03:28:59 -0400
Subject: [PATCH] py3: basic news fetching works

Just some more juggling around of bytes types.

Specifically, note that URLs are encoded and quoted, then decoded before
being given to mechanize, as mechanize expects to see unicode.

Furthermore, ascii_filename is already there to sanitize filenames.
---
 src/calibre/web/fetch/simple.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index 971f52a4e5..ca81b139e4 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -103,7 +103,7 @@ def save_soup(soup, target):
         f.write(html.encode('utf-8'))
 
 
-class response(str):
+class response(bytes):
 
     def __new__(cls, *args):
         obj = super(response, cls).__new__(cls, *args)
@@ -261,16 +261,13 @@ def fetch_url(self, url):
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(self.delay - delta)
-        if isinstance(url, unicode_type):
-            url = url.encode('utf-8')
-        # Not sure is this is really needed as I think mechanize
-        # handles quoting automatically, but leaving it
-        # in case it breaks something
+        # mechanize does not handle quoting automatically
         if re.search(r'\s+', url) is not None:
+            url = url.encode('utf-8')
             purl = list(urlparse(url))
             for i in range(2, 6):
                 purl[i] = quote(purl[i])
-            url = urlunparse(purl)
+            url = urlunparse(purl).decode('utf-8')
         open_func = getattr(self.browser, 'open_novisit', self.browser.open)
         try:
             with closing(open_func(url, timeout=self.timeout)) as f:
@@ -414,8 +411,6 @@ def process_images(self, soup, baseurl):
                 continue
             c += 1
             fname = ascii_filename('img'+str(c))
-            if isinstance(fname, unicode_type):
-                fname = fname.encode('ascii', 'replace')
             data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
             if data is None:
                 continue
@@ -520,7 +515,7 @@ def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
                     dsrc = self.fetch_url(iurl)
                     newbaseurl = dsrc.newurl
                     if len(dsrc) == 0 or \
-                       len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
+                       len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
                         raise ValueError('No content at URL %r'%iurl)
                     if callable(self.encoding):
                         dsrc = self.encoding(dsrc)
@@ -544,7 +539,7 @@ def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
                     _fname = basename(iurl)
                     if not isinstance(_fname, unicode_type):
                         _fname.decode('latin1', 'replace')
-                    _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
+                    _fname = _fname.replace('%', '').replace(os.sep, '').encode('ascii', 'replace')
                     _fname = ascii_filename(_fname)
                     _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                     res = os.path.join(linkdiskpath, _fname)
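
Reviewer note (not part of the patch): a minimal standalone sketch of the
encode/quote/decode round-trip the new fetch_url code performs. It uses the
Python 3 stdlib directly; calibre's polyglot wrappers for urlparse, quote and
urlunparse are assumed to behave similarly on py3, except that stdlib quote()
always returns str, so this sketch re-encodes by hand to stay in the bytes
domain until the final decode that hands unicode back to mechanize.

import re
from urllib.parse import quote, urlparse, urlunparse

def quote_url(url):
    # Mirror the patch's guard: only rewrite URLs containing whitespace.
    if re.search(r'\s+', url) is not None:
        raw = url.encode('utf-8')    # work on bytes, as the patch does
        purl = list(urlparse(raw))   # scheme, netloc, path, params, query, fragment
        for i in range(2, 6):        # quote path, params, query and fragment
            # stdlib quote() returns str even for bytes input; re-encode so
            # every component stays bytes for urlunparse()
            purl[i] = quote(purl[i]).encode('ascii')
        url = urlunparse(purl).decode('utf-8')  # back to unicode for mechanize
    return url

print(quote_url('http://example.com/some page'))
# -> http://example.com/some%20page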
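
A second sketch of why the re.compile patterns in process_links switched to
bytes: response now subclasses bytes, and the py3 re module refuses to apply
a str pattern to bytes data. The newurl attribute shown here is an assumption
about the rest of the class body, which the hunk truncates.

import re

class response(bytes):

    def __new__(cls, *args):
        obj = super(response, cls).__new__(cls, *args)
        obj.newurl = None  # assumed: the attribute process_links reads
        return obj

dsrc = response(b'<!-- nothing but a comment -->')
# re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc)  # py3: TypeError
stripped = re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()
print(len(stripped) == 0)  # True -> treated as "No content at URL"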