Mirror of https://github.com/JimmXinu/FanFicFare.git, synced 2025-12-30 04:36:11 +01:00

Merge branch 'SV-xenforo2'

Commit a8bdc69cea
5 changed files with 466 additions and 115 deletions
@@ -110,6 +110,7 @@ from . import adapter_tgstorytimecom
 from . import adapter_itcouldhappennet
 from . import adapter_forumsspacebattlescom
 from . import adapter_forumssufficientvelocitycom
+from . import adapter_xf2testsufficientvelocitycom
 from . import adapter_forumquestionablequestingcom
 from . import adapter_ninelivesarchivecom
 from . import adapter_masseffect2in
fanficfare/adapters/adapter_xf2testsufficientvelocitycom.py (new file, 49 lines)

@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-

# Copyright 2019 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import re

from .base_xenforo2forum_adapter import BaseXenForo2ForumAdapter

def getClass():
    return XF2TestSufficientVelocityComAdapter

class XF2TestSufficientVelocityComAdapter(BaseXenForo2ForumAdapter):

    def __init__(self, config, url):
        BaseXenForo2ForumAdapter.__init__(self, config, url)

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','fsv2')

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain. Does have www here, if it uses it.
        return 'xf2test.sufficientvelocity.com'

    # @classmethod
    # def getAcceptDomains(cls):
    #     return [cls.getSiteDomain(),
    #             cls.getSiteDomain().replace('forums.','forum.'),
    #             cls.getSiteDomain().replace('forums.','')]

    def getSiteURLPattern(self):
        ## SV accepts forums.sufficientvelocity.com, forum.sufficientvelocity.com and sufficientvelocity.com,
        ## all of which redirect to forums.
        ## We will use forums. as canonical for all.
        return super(XF2TestSufficientVelocityComAdapter, self).getSiteURLPattern().replace(re.escape("forums."),r"(forums?\.)?")
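A minimal standalone sketch of what the relaxed pattern accepts; the base pattern here is a simplified stand-in, not the real one returned by BaseXenForoForumAdapter.getSiteURLPattern():

    import re

    # Simplified stand-in for the base class pattern (assumption for illustration).
    base = re.escape("https://forums.") + r"sufficientvelocity\.com/threads/.+\.\d+/?"
    relaxed = base.replace(re.escape("forums."), r"(forums?\.)?")

    for host in ("forums.", "forum.", ""):
        url = "https://%ssufficientvelocity.com/threads/some-story.12345/" % host
        print(url, bool(re.match(relaxed + "$", url)))  # all three hosts match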
fanficfare/adapters/base_xenforo2forum_adapter.py (new file, 195 lines)

@@ -0,0 +1,195 @@
# -*- coding: utf-8 -*-

# Copyright 2019 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import logging
from datetime import datetime
logger = logging.getLogger(__name__)
import re
from xml.dom.minidom import parseString

from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions

# py2 vs py3 transition
from ..six import text_type as unicode
from ..six.moves.urllib.error import HTTPError

from .base_adapter import makeDate
from .base_xenforoforum_adapter import BaseXenForoForumAdapter

logger = logging.getLogger(__name__)

class BaseXenForo2ForumAdapter(BaseXenForoForumAdapter):

    def __init__(self, config, url):
        logger.info("init url: "+url)
        BaseXenForoForumAdapter.__init__(self, config, url)

    @classmethod
    def getConfigSections(cls):
        "Only needs to be overridden if the adapter has additional ini sections."
        return ['base_xenforo2forum'] + super(BaseXenForo2ForumAdapter, cls).getConfigSections()

    def performLogin(self):
        params = {}

        if self.password:
            params['login'] = self.username
            params['password'] = self.password
        else:
            params['login'] = self.getConfig("username")
            params['password'] = self.getConfig("password")

        if not params['login']:
            raise exceptions.FailedToLogin(self.url,"No username given. Set in personal.ini or enter when prompted.")

        ## need a login token.
        data = self._fetchUrl(self.getURLPrefix() + '/login',usecache=False)
        # logger.debug(data)
        # <input type="hidden" name="_xfToken" value="1556822458,710e5bf6fc87c67ea04ab56a910ac3ff" />
        find_token='<input type="hidden" name="_xfToken" value="'
        xftoken = data[data.index(find_token)+len(find_token):]
        xftoken = xftoken[:xftoken.index('"')]
        params['remember'] = '1'
        params['_xfToken'] = xftoken
        params['_xfRedirect'] = self.getURLPrefix() + '/'

        ## https://forum.questionablequesting.com/login/login
        loginUrl = self.getURLPrefix() + '/login/login'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             params['login']))

        d = self._postUrl(loginUrl, params)# , headers={ 'referer':self.getURLPrefix() + '/login',
                                           #             'origin':self.getURLPrefix() })

        if "Log In" in d:
            # logger.debug(d)
            logger.info("Failed to login to URL %s as %s" % (self.url,
                                                             params['login']))
            raise exceptions.FailedToLogin(self.url,params['login'])
            return False
        else:
            return True

    def parse_title(self,souptag):
        h1 = souptag.find('h1',{'class':'p-title-value'})
        # logger.debug(h1)
        ## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1.
        for tag in h1.find_all('span',{'class':'label'}):
            ## stick them into genre.
            self.story.addToList('genre',stripHTML(tag))
            # logger.debug(stripHTML(tag))
            tag.extract()
        self.story.setMetadata('title',stripHTML(h1))
        # logger.debug(stripHTML(h1))

    def get_forumtags(self,topsoup):
        return topsoup.find('div',{'class':'p-description'}).findAll('a',{'class':'tagItem'})

    def parse_author(self,souptag):
        a = souptag.find('section',{'class':'message-user'}).find('a',{'class':'username'})
        # logger.debug(a)
        self.story.addToList('authorId',a['href'].split('/')[-2])
        authorUrl = a['href'] # self.getURLPrefix()+'/'+a['href']
        self.story.addToList('authorUrl',authorUrl)
        self.story.addToList('author',a.text)

    def cache_posts(self,topsoup):
        for post in topsoup.find_all('article',{'class':'message--post'}):
            logger.debug("Caching %s"%post['data-content'])
            self.post_cache[post['data-content']] = post

    def get_first_post(self,topsoup):
        return topsoup.find('article',{'class':'message--post'})

    def get_first_post_body(self,topsoup):
        return self.get_post_body(self.get_first_post(topsoup))

    def get_post_body(self,souptag):
        return souptag.find('article',{'class':'message-body'}).find('div',{'class':'bbWrapper'})

    def get_post_created_date(self,souptag):
        return self.make_date(souptag.find('div', {'class':'message-date'}))

    def get_post_updated_date(self,souptag):
        return self.make_date(souptag.find('div',{'class':'message-lastEdit'}))

    def get_threadmarks_top(self,souptag):
        return souptag.find('div',{'class':'block-outer-main--threadmarks'})

    def get_threadmarks(self,navdiv):
        return navdiv.find_all('a',{'class':'menuTrigger','href':re.compile('threadmarks.*(threadmark_category=)?')})

    def get_threadmark_catnumname(self,threadmarksa):
        if 'threadmark_category=' in threadmarksa['href']:
            tmcat_num = threadmarksa['href'].split('threadmark_category=')[1]
        else:
            tmcat_num = '1'
        tmcat_name = stripHTML(threadmarksa)
        return (tmcat_num,tmcat_name)

    def get_threadmarks_list(self,soupmarks):
        return soupmarks.find('div',{'class':'structItemContainer'})

    def get_threadmarks_from_list(self,tm_list):
        return tm_list.find_all('div',{'class':'structItem--threadmark'})

    def get_atag_from_threadmark(self,tm_item):
        return tm_item.find('a',{'data-tp-primary':'on'})

    def get_threadmark_range_url(self,tm_item,tmcat_num):
        fetcher = tm_item.find('div',{'data-xf-click':'threadmark-fetcher'})
        # logger.debug(fetcher)
        return fetcher['data-fetchurl']

    def get_threadmark_date(self,tm_item):
        return self.make_date(tm_item)

    ## XF2 doesn't appear to have words, just kwords.
    def get_threadmark_words(self,tm_item):
        words = kwords = ""
        worddd = tm_item.find('dd')
        if worddd:
            kwords = "("+stripHTML(worddd)+")" # to match XF1
        return words,kwords

    def make_date(self,parenttag):
        datestr=None
        try:
            datetag = parenttag.find('time')
            # not paying any attention to TZ issues.
            return datetime.fromtimestamp(float(datetag['data-time']))
        except:
            logger.warn('No date found in %s'%parenttag,exc_info=True)
            return None

    def make_reader_url(self,tmcat_num,reader_page_num):
        # https://xf2test.sufficientvelocity.com/threads/mauling-snarks-worm.41471/reader/page-4?threadmark_category=4
        return self.story.getMetadata('storyUrl')+'reader/page-'+unicode(reader_page_num)+'?threadmark_category='+tmcat_num

    def get_quote_expand_tag(self,soup):
        return soup.find_all('div',{'class':re.compile(r'bbCodeBlock-(expand|shrink)Link')})

    def get_spoiler_tags(self,topsoup):
        return topsoup.find_all('div',class_='bbCodeSpoiler')

    def convert_quotes(self,soup):
        ## make XF2 quote divs blockquotes so the spacing is the same
        ## as XF1.
        for tag in soup.find_all('div', class_="bbCodeBlock-expandContent"):
            tag.name='blockquote'
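The _xfToken scrape in performLogin() above is plain string slicing; the same steps on a made-up sample page (token value copied from the comment in the code):

    # sample HTML for illustration only
    data = '<form><input type="hidden" name="_xfToken" value="1556822458,710e5bf6fc87c67ea04ab56a910ac3ff" /></form>'
    find_token = '<input type="hidden" name="_xfToken" value="'
    xftoken = data[data.index(find_token)+len(find_token):]
    xftoken = xftoken[:xftoken.index('"')]
    print(xftoken)  # 1556822458,710e5bf6fc87c67ea04ab56a910ac3ff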

@@ -86,7 +86,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

     def getSiteURLPattern(self):
         ## need to accept http and https still.
-        return re.escape(self.getURLPrefix()).replace("https","https?")+r"/(?P<tp>threads|posts)/(?P<title>.+\.)?(?P<id>\d+)/?[^#]*?(#post-(?P<anchorpost>\d+))?$"
+        return re.escape(self.getURLPrefix()).replace("https","https?")+r"/(?P<tp>threads|posts)/(?P<title>.+\.)?(?P<id>\d+)/?[^#]*?(#?post-(?P<anchorpost>\d+))?$"

     def _fetchUrlOpened(self, url,
                         parameters=None,
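A sketch of what the one-character `#?` change allows, with an assumed prefix standing in for self.getURLPrefix(): URLs that end in post-NNN without the '#' anchor now also populate the anchorpost group:

    import re

    prefix = "https://forums.sufficientvelocity.com"  # stand-in for self.getURLPrefix()
    pat = re.escape(prefix).replace("https","https?") + \
        r"/(?P<tp>threads|posts)/(?P<title>.+\.)?(?P<id>\d+)/?[^#]*?(#?post-(?P<anchorpost>\d+))?$"

    for url in ("https://forums.sufficientvelocity.com/threads/some-story.12345/#post-678",
                "https://forums.sufficientvelocity.com/threads/some-story.12345/post-678"):
        print(re.match(pat, url).group('anchorpost'))  # 678 both times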

@@ -206,7 +206,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
         ## https://forum.questionablequesting.com/login/login
         loginUrl = self.getURLPrefix() + '/login/login'
         logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
-                                                              params['login']))
+                                                             params['login']))

         d = self._fetchUrl(loginUrl, params)

@@ -220,6 +220,11 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

     def make_soup(self,data):
         soup = super(BaseXenForoForumAdapter, self).make_soup(data)
+        ## img class="lazyload"
+        ## include lazy load images.
+        for img in soup.find_all('img',{'class':'lazyload'}):
+            img['src'] = img['data-src']
+
         ## after lazy load images, there are noscript blocks also
         ## containing <img> tags. The problem comes in when they hit
         ## book readers such as Kindle and Nook and then you see the
@@ -227,94 +232,142 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
         for noscript in soup.find_all('noscript'):
             noscript.extract()

+        for qdiv in self.get_quote_expand_tag(soup):
+            qdiv.extract() # Remove <div class="...">click to expand</div>
+
+        self.convert_quotes(soup)
+
         self.handle_spoilers(soup)

+        ## cache posts on page.
+        self.cache_posts(soup)
         return soup

     ## Moved over from adapter_forumquestionablequestingcom when SB/SV
     ## threadmark.rss became 'most recent 10 in reverse order'.
+    def get_threadmarks_top(self,souptag):
+        return souptag.find('div',{'class':'threadmarkMenus'})
+
+    def get_threadmarks(self,navdiv):
+        return navdiv.find_all('a',{'class':'OverlayTrigger','href':re.compile('threadmarks.*category_id=')})
+
+    def get_threadmark_catnumname(self,threadmarksa):
+        return (threadmarksa['href'].split('category_id=')[1],
+                stripHTML(threadmarksa.find_previous('a',{'class':'threadmarksTrigger'})))
+
     def extract_threadmarks(self,souptag):
         threadmarks=[]
         # try threadmarks if no '#' in url
-        navdiv = souptag.find('div',{'class':'threadmarkMenus'})
+        navdiv = self.get_threadmarks_top(souptag)
         if not navdiv:
             return threadmarks
-        # was class=threadmarksTrigger. thread cats are currently
-        # only OverlayTrigger <a>s in threadmarkMenus, but I wouldn't
-        # be surprised if that changed. Don't want to do use just
-        # href=re because there's more than one copy on the page; plus
-        # could be included in a post. Would be easier if <noscript>s
-        # weren't being stripped, but that's a different issue.
-        threadmarksas = navdiv.find_all('a',{'class':'OverlayTrigger','href':re.compile('threadmarks.*category_id=')})
-        ## Loop on threadmark categories.
-        tmcat_num=None
+        threadmarksas = self.get_threadmarks(navdiv)
+
+        threadmarkgroups = dict() # for ordering threadmarks
+        ## Loop on threadmark categories.
         for threadmarksa in threadmarksas:
-            tmcat_num = threadmarksa['href'].split('category_id=')[1]
-            # get from earlier <a> now.
-            tmcat_name = stripHTML(threadmarksa.find_previous('a',{'class':'threadmarksTrigger'}))
-            prepend = ""
+            (tmcat_num,tmcat_name) = self.get_threadmark_catnumname(threadmarksa)
             if tmcat_name in self.getConfigList('skip_threadmarks_categories'):
                 continue

             if tmcat_name == 'Apocrypha' and self.getConfig('apocrypha_to_omake'):
                 tmcat_name = 'Omake'

-            if tmcat_name != "Threadmarks":
-                prepend = tmcat_name+" - "
-
-            threadmarks.extend(self.fetch_threadmarks(self.getURLPrefix()+'/'+threadmarksa['href'],
-                                                      tmcat_name,
-                                                      tmcat_num))
+            if 'http' not in threadmarksa['href']:
+                href = self.getURLPrefix()+'/'+threadmarksa['href']
+            else:
+                href = threadmarksa['href']
+            threadmarkgroups[tmcat_name]=self.fetch_threadmarks(href,
+                                                                tmcat_name,
+                                                                tmcat_num)
+        ## Order of threadmark groups in new SV is changed and
+        ## possibly unpredictable. Normalize. Keep as configurable?
+        ## What about categories not in the list?
+        default_order = ['Threadmarks',
+                         'Sidestory',
+                         'Apocrypha',
+                         'Omake',
+                         'Media',
+                         'Informational',
+                         'Staff Post']
+        # default order also *after* config'ed
+        # threadmark_category_order so if they are not also in
+        # skip_threadmarks_categories they appear in the expected
+        # order.
+        for cat_name in self.getConfigList('threadmark_category_order',default_order)+default_order:
+            if cat_name in threadmarkgroups:
+                threadmarks.extend(threadmarkgroups[cat_name])
+                del threadmarkgroups[cat_name]
+        # more categories left? new or at least unknown
+        if threadmarkgroups:
+            # alphabetize for lack of a better idea to ensure consistent ordering
+            cats = sorted(threadmarkgroups.keys()) # sorted() works on both py2 lists and py3 key views
+            for cat_name in cats:
+                threadmarks.extend(threadmarkgroups[cat_name])
         return threadmarks

+    def get_threadmarks_list(self,soupmarks):
+        return soupmarks.find('div',{'class':'threadmarkList'})
+
+    def get_threadmarks_from_list(self,tm_list):
+        return tm_list.find_all('li',{'class':'threadmarkListItem'})
+
+    def get_atag_from_threadmark(self,tm_item):
+        return tm_item.find('a',{'class':'PreviewTooltip'})
+
+    def get_threadmark_range_url(self,tm_item,tmcat_num):
+        load_range = "threadmarks/load-range?min=%s&max=%s&category_id=%s"%(tm_item['data-range-min'],
+                                                                            tm_item['data-range-max'],
+                                                                            tmcat_num)
+        return self.url+load_range
+
+    def get_threadmark_date(self,tm_item):
+        atag = self.get_atag_from_threadmark(tm_item)
+        return self.make_date(atag.find_next_sibling('div',{'class':'extra'}))
+
+    def get_threadmark_words(self,tm_item):
+        words = kwords = ""
+        atag = self.get_atag_from_threadmark(tm_item)
+        if atag.parent.has_attr('data-words'):
+            words = int(atag.parent['data-words'])
+            if "(" in atag.next_sibling:
+                kwords = atag.next_sibling.strip()
+        return words,kwords
+
     def fetch_threadmarks(self,url,tmcat_name,tmcat_num, passed_tmcat_index=0):
         logger.debug("fetch_threadmarks(%s,tmcat_num=%s,passed_tmcat_index:%s,url=%s)"%(tmcat_name,tmcat_num, passed_tmcat_index, url))
         threadmarks=[]
         soupmarks = self.make_soup(self._fetchUrl(url))
-        tm_list = soupmarks.find('div',{'class':'threadmarkList'})
-        if not tm_list: # load-range don't have threadmarkList.
+        tm_list = self.get_threadmarks_list(soupmarks)
+        if not tm_list: # load-range don't match
             tm_list = soupmarks
         # logger.debug(tm_list)
         markas = []
         tmcat_index=passed_tmcat_index
         after = False
-        for tm_item in tm_list.find_all('li',{'class':'threadmarkListItem'}):
-            atag = tm_item.find('a',{'class':'PreviewTooltip'})
+        for tm_item in self.get_threadmarks_from_list(tm_list):
+            atag = self.get_atag_from_threadmark(tm_item)
             if not atag:
                 if tm_item['data-range-min'] and tm_item['data-range-max']:
                     # logger.debug(tm_item)
-                    load_range = "threadmarks/load-range?min=%s&max=%s&category_id=%s"%(tm_item['data-range-min'],
-                                                                                        tm_item['data-range-max'],
-                                                                                        tmcat_num)
-                    threadmarks.extend(self.fetch_threadmarks(self.url+load_range,
-                                                              tmcat_name,
-                                                              tmcat_num,
-                                                              tmcat_index))
-                    tmcat_index = len(threadmarks)
-                    after=True
+                    threadmarks.extend(self.fetch_threadmarks(self.get_threadmark_range_url(tm_item,tmcat_num),
+                                                              tmcat_name,
+                                                              tmcat_num,
+                                                              tmcat_index))
+                    tmcat_index = len(threadmarks)
+                    after=True
             else:
                 if after:
                     # logger.debug("AFTER "*10)
                     after=False
                 url,name = atag['href'],stripHTML(atag)
-                date = self.make_date(atag.find_next_sibling('div',{'class':'extra'}))
-                if atag.parent.has_attr('data-words'):
-                    words = int(atag.parent['data-words'])
-                    if "(" in atag.next_sibling:
-                        kwords = atag.next_sibling.strip()
-                        # logger.debug("%s"%kwords)
-                else:
-                    words = ""
-                    kwords = ""
+                date = self.get_threadmark_date(tm_item)
+                words,kwords = self.get_threadmark_words(tm_item)
+                if 'http' not in url:
+                    url = self.getURLPrefix()+"/"+url
                 # logger.debug("%s. %s"%(tmcat_index,name))
                 threadmarks.append({"tmcat_name":tmcat_name,
                                     "tmcat_num":tmcat_num,
                                     "tmcat_index":tmcat_index,
                                     "title":name,
-                                    "url":self.getURLPrefix()+"/"+url,
+                                    "url":url,
                                     "date":date,
                                     "words":words,
                                     "kwords":kwords})
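The category-ordering merge above, reduced to a standalone sketch (category contents and the config value are invented for illustration):

    default_order = ['Threadmarks','Sidestory','Apocrypha','Omake',
                     'Media','Informational','Staff Post']
    threadmarkgroups = {'Omake':['o1'],'Threadmarks':['t1','t2'],'New Cat':['n1']}
    config_order = ['Omake']  # stand-in for getConfigList('threadmark_category_order')

    threadmarks = []
    for cat_name in config_order + default_order:
        if cat_name in threadmarkgroups:
            threadmarks.extend(threadmarkgroups.pop(cat_name))
    # unknown categories go last, alphabetized for consistent ordering
    for cat_name in sorted(threadmarkgroups):
        threadmarks.extend(threadmarkgroups[cat_name])
    print(threadmarks)  # ['o1', 't1', 't2', 'n1']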

@@ -348,27 +401,26 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
         # use BeautifulSoup HTML parser to make everything easier to find.
         topsoup = souptag = self.make_soup(data)

-        h1 = souptag.find('div',{'class':'titleBar'}).h1
-        ## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1.
-        for tag in h1.find_all('span',{'class':'prefix'}):
-            ## stick them into genre.
-            self.story.addToList('genre',stripHTML(tag))
-            tag.extract()
-        self.story.setMetadata('title',stripHTML(h1))
+        self.parse_title(topsoup)

         first_post_title = self.getConfig('first_post_title','First Post')

         use_threadmark_chaps = False
         if '#' in useurl:
             anchorid = useurl.split('#')[1]
-            souptag = souptag.find('li',id=anchorid)
+            # souptag = souptag.find('li',id=anchorid)
+            # cache is now loaded with posts from that reader
+            # page. looking for it in cache reuses code in
+            # cache_posts that finds post tags.
+            souptag = self.get_cache_post(anchorid)

         else:
             ## Also sets datePublished / dateUpdated to oldest / newest post datetimes.
             threadmarks = self.extract_threadmarks(souptag)

             if len(threadmarks) >= int(self.getConfig('minimum_threadmarks',2)):
                 # remember if reader link found--only applicable if using threadmarks.
-                self.reader = topsoup.find('a',href=re.compile(r'\.'+self.story.getMetadata('storyId')+r"/reader$")) is not None
+                self.reader = topsoup.find('a',href=re.compile(r'\.'+self.story.getMetadata('storyId')+r"/reader/?$")) is not None

                 if self.getConfig('always_include_first_post'):
                     self.add_chapter(first_post_title,useurl)

@@ -408,22 +460,18 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

                 if words and self.getConfig('use_threadmark_wordcounts',True):
                     self.story.setMetadata('numWords',words)
-        souptag = souptag.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above)
+        souptag = self.get_first_post(topsoup)

         if use_threadmark_chaps or self.getConfig('always_use_forumtags'):
             ## only use tags if threadmarks for chapters or always_use_forumtags is on.
-            for tag in topsoup.findAll('a',{'class':'tag'}) + topsoup.findAll('span',{'class':'prefix'}):
+            for tag in self.get_forumtags(topsoup):
                 tstr = stripHTML(tag)
                 if self.getConfig('capitalize_forumtags'):
                     tstr = tstr.title()
                 self.story.addToList('forumtags',tstr)

         # author moved down here to take from post URLs.
-        a = souptag.find('h3',{'class':'userText'}).find('a')
-        self.story.addToList('authorId',a['href'].split('/')[1])
-        authorUrl = self.getURLPrefix()+'/'+a['href']
-        self.story.addToList('authorUrl',authorUrl)
-        self.story.addToList('author',a.text)
+        self.parse_author(souptag)

         if self.getConfig('author_avatar_cover'):
             authorcard = self.make_soup(self._fetchUrl(authorUrl+"?card=1"))

@@ -437,27 +485,28 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
         ##
         ## </div>

-        # Now go hunting for the 'chapter list'.
-        bq = souptag.find('blockquote') # assume first posting contains TOC urls.
+        # Now get first post for description and chapter list if not
+        # using threadmarks.
+        first_post = self.get_first_post_body(topsoup)

-        bq.name='div'
-
-        for iframe in bq.find_all('iframe'):
+        for iframe in first_post.find_all('iframe'):
             iframe.extract() # calibre book reader & editor don't like iframes to youtube.

-        for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
+        for qdiv in first_post.find_all('div',{'class':'quoteExpand'}):
             qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>

-        self.setDescription(useurl,bq)
+        self.setDescription(useurl,first_post)

         # otherwise, use first post links--include first post since
         # that's often also the first chapter.

         if self.num_chapters() < 1:
             self.add_chapter(first_post_title,useurl)
-            for (url,name) in [ (x['href'],stripHTML(x)) for x in bq.find_all('a') ]:
+            # logger.debug(first_post)
+            for (url,name,tag) in [ (x['href'],stripHTML(x),x) for x in first_post.find_all('a') ]:
                 (is_chapter_url,url) = self._is_normalize_chapterurl(url)
-                if is_chapter_url and name != u"\u2191": # skip quote links as indicated by up arrow character.
+                # skip quote links as indicated by up arrow character or data-xf-click=attribution
+                if is_chapter_url and name != u"\u2191" and tag.get("data-xf-click",None)!="attribution":
                     self.add_chapter(name,url)
                     if url == useurl and first_post_title == self.get_chapter(0,'url') \
                             and not self.getConfig('always_include_first_post',False):
@@ -466,14 +515,56 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

             # Didn't use threadmarks, so take created/updated dates
             # from the 'first' posting created and updated.
-            date = self.make_date(souptag.find('a',{'class':'datePermalink'}))
+            date = self.get_post_created_date(souptag)
             if date:
                 self.story.setMetadata('datePublished', date)
                 self.story.setMetadata('dateUpdated', date) # updated overwritten below if found.

-            date = self.make_date(souptag.find('div',{'class':'editDate'}))
+            date = self.get_post_updated_date(souptag)
             if date:
                 self.story.setMetadata('dateUpdated', date)
         # logger.debug(self.story.getMetadata('datePublished'))
         # logger.debug(self.story.getMetadata('dateUpdated'))

+    def parse_title(self,souptag):
+        h1 = souptag.find('div',{'class':'titleBar'}).h1
+        ## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1.
+        for tag in h1.find_all('span',{'class':'prefix'}):
+            ## stick them into genre.
+            self.story.addToList('genre',stripHTML(tag))
+            tag.extract()
+        self.story.setMetadata('title',stripHTML(h1))
+
+    def get_forumtags(self,topsoup):
+        return topsoup.findAll('a',{'class':'tag'}) + topsoup.findAll('span',{'class':'prefix'})
+
+    def parse_author(self,souptag):
+        a = souptag.find('h3',{'class':'userText'}).find('a')
+        self.story.addToList('authorId',a['href'].split('/')[1])
+        authorUrl = self.getURLPrefix()+'/'+a['href']
+        self.story.addToList('authorUrl',authorUrl)
+        self.story.addToList('author',a.text)
+
+    def get_first_post(self,topsoup):
+        return topsoup.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above)
+
+    def get_first_post_body(self,topsoup):
+        bq = self.get_first_post(topsoup).find('blockquote')
+        bq.name='div'
+        return bq
+
+    def get_post_body(self,souptag):
+        bq = souptag.find('blockquote')
+        if not bq:
+            bq = souptag.find('div',{'class':'messageText'}) # cached gets if it was already used before
+        bq.name='div'
+        return bq
+
+    def get_post_created_date(self,souptag):
+        return self.make_date(souptag.find('a',{'class':'datePermalink'}))
+
+    def get_post_updated_date(self,souptag):
+        return self.make_date(souptag.find('div',{'class':'editDate'}))
+
     def make_date(self,parenttag): # forums use a BS thing where dates
                                    # can appear different if recent.

@@ -496,14 +587,19 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

     def cache_posts(self,topsoup):
         for post in topsoup.find_all('li',id=re.compile('post-[0-9]+')):
             logger.debug("Caching %s"%post['id'])
             self.post_cache[post['id']] = post

     def get_cache_post(self,postid):
         ## saved using original 'post-99999' id for key.
+        postid=unicode(postid) # thank you, Py3.
+        if '/posts/' in postid:
+            ## allows chapter urls to be passed in directly.
+            # assumed normalized to /posts/1234/
+            postid = "post-"+postid.split('/')[-2]
-        if '#' in postid:
+        elif '#post-' in postid:
             postid = postid.split('#')[1]
         logger.debug("get cache %s %s"%(postid,postid in self.post_cache))
         return self.post_cache.get(postid,None)

     # grab the text for an individual chapter.
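The key normalization in get_cache_post() accepts three input shapes and reduces them all to the 'post-99999' cache key; a standalone sketch (URLs invented):

    def normalize_postid(postid):
        postid = str(postid)
        if '/posts/' in postid:
            # assumed normalized to /posts/1234/
            postid = "post-" + postid.split('/')[-2]
        elif '#post-' in postid:
            postid = postid.split('#')[1]
        return postid

    for pid in ("post-1234",
                "https://forums.sufficientvelocity.com/posts/1234/",
                "https://forums.sufficientvelocity.com/threads/x.99/#post-1234"):
        print(normalize_postid(pid))  # post-1234, all three times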

@@ -533,20 +629,13 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
                 (tmcat_num,tmcat_index)=self.threadmarks_for_reader[url]
                 reader_page_num = int((tmcat_index+posts_per_page)/posts_per_page) + offset
                 logger.debug('Reader page offset:%s tmcat_num:%s tmcat_index:%s'%(offset,tmcat_num,tmcat_index))
-                reader_url=self.getURLPrefix()+'/threads/'+self.story.getMetadata('storyId')+'/'+tmcat_num+'/reader?page='+unicode(reader_page_num)
+                reader_url=self.make_reader_url(tmcat_num,reader_page_num)
                 logger.debug("Fetch reader URL to: %s"%reader_url)
-                data = self._fetchUrl(reader_url)
-                topsoup = self.make_soup(data)
-
-                # if no posts at all, break out of loop, we're off the end.
-                # don't need to remember this, the page is cached.
-                if not topsoup.find_all('li',id=re.compile(r'post-[0-9]+')):
-                    break
-
-                # assumed normalized to /posts/1234/
-                anchorid = "post-"+url.split('/')[-2]
-                # logger.debug("anchorid: %s"%anchorid)
-                souptag = topsoup.find('li',id=anchorid)
+                topsoup = self.make_soup(self._fetchUrl(reader_url))
+                # make_soup() loads cache with posts from that reader
+                # page. looking for it in cache reuses code in
+                # cache_posts that finds post tags.
+                souptag = self.get_cache_post(url)
             else:
                 logger.debug("post found in cache")
             if souptag:
@ -558,52 +647,57 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
|
|||
souptag = self.get_cache_post(url)
|
||||
if not souptag:
|
||||
(data,opened) = self._fetchUrlOpened(url)
|
||||
url = opened.geturl()
|
||||
url = unicode(opened.geturl())
|
||||
if '#' in origurl and '#' not in url:
|
||||
url = url + origurl[origurl.index('#'):]
|
||||
logger.debug("chapter URL redirected to: %s"%url)
|
||||
|
||||
topsoup = souptag = self.make_soup(data)
|
||||
|
||||
if '#' in unicode(url):
|
||||
anchorid = url.split('#')[1]
|
||||
souptag = topsoup.find('li',id=anchorid)
|
||||
topsoup = self.make_soup(data)
|
||||
# make_soup() loads cache with posts from that reader
|
||||
# page. looking for it in cache reuses code in
|
||||
# cache_posts that finds post tags.
|
||||
souptag = self.get_cache_post(url)
|
||||
if not souptag and '/threads/' in url: # first post uses /thread/ URL.
|
||||
souptag = self.get_first_post(topsoup)
|
||||
|
||||
# remove <div class="baseHtml noticeContent"> because it can
|
||||
# get confused for post content on first posts.
|
||||
for notice in souptag.find_all('div',{'class':'noticeContent'}):
|
||||
notice.extract()
|
||||
|
||||
bq = souptag.find('blockquote')
|
||||
if not bq:
|
||||
bq = souptag.find('div',{'class':'messageText'}) # cached gets if it was already used before
|
||||
postbody = self.get_post_body(souptag)
|
||||
|
||||
bq.name='div'
|
||||
|
||||
for iframe in bq.find_all('iframe'):
|
||||
for iframe in postbody.find_all('iframe'):
|
||||
iframe.extract() # calibre book reader & editor don't like iframes to youtube.
|
||||
|
||||
for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
|
||||
qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
|
||||
|
||||
## img alt="[IMG]" class="bbCodeImage LbImage lazyload
|
||||
## include lazy load images.
|
||||
for img in bq.find_all('img',{'class':'lazyload'}):
|
||||
img['src'] = img['data-src']
|
||||
|
||||
# XenForo uses <base href="https://forums.spacebattles.com/" />
|
||||
return self.utf8FromSoup(self.getURLPrefix()+'/',bq)
|
||||
return self.utf8FromSoup(self.getURLPrefix()+'/',postbody)
|
||||
|
||||
def make_reader_url(self,tmcat_num,reader_page_num):
|
||||
return self.getURLPrefix()+'/threads/'+self.story.getMetadata('storyId')+'/'+tmcat_num+'/reader?page='+unicode(reader_page_num)
|
||||
|
||||
def get_quote_expand_tag(self,soup):
|
||||
return soup.find_all('div',{'class':'quoteExpand'})
|
||||
|
||||
def get_spoiler_tags(self,topsoup):
|
||||
return topsoup.find_all('div',class_='bbCodeSpoilerContainer')
|
||||
|
||||
def convert_quotes(self,soup):
|
||||
pass
|
||||
|
||||
def handle_spoilers(self,topsoup):
|
||||
'''
|
||||
Modifies tag given as required to do spoiler changes.
|
||||
'''
|
||||
if self.getConfig('remove_spoilers'):
|
||||
for div in topsoup.find_all('div',class_='bbCodeSpoilerContainer'):
|
||||
for div in self.get_spoiler_tags(topsoup):
|
||||
div.extract()
|
||||
elif self.getConfig('legend_spoilers'):
|
||||
for div in topsoup.find_all('div',class_='bbCodeSpoilerContainer'):
|
||||
for div in self.get_spoiler_tags(topsoup):
|
||||
div.name='fieldset'
|
||||
# add copy of XF1 class name for convenience of
|
||||
# existing output_css when XF2.
|
||||
div['class'].append('bbCodeSpoilerContainer')
|
||||
legend = topsoup.new_tag('legend')
|
||||
legend.string = stripHTML(div.button.span)
|
||||
div.insert(0,legend)
|
||||

@@ -611,10 +705,10 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

     def _do_utf8FromSoup(self,url,soup,fetch=None,allow_replace_br_with_p=True):
         if self.getConfig('replace_failed_smilies_with_alt_text'):
-            for img in soup.find_all('img',src=re.compile(r'(failedtoload|clear.png)$')):
-                #logger.debug("replace_failed_smilies_with_alt_text img: %s"%img)
+            for img in soup.find_all('img',src=re.compile(r'(^data:image|(failedtoload|clear.png)$)')):
+                # logger.debug("replace_failed_smilies_with_alt_text img: %s"%img)
                 clses = unicode(img['class']) # stringify list.
-                if img.has_attr('alt') and 'mceSmilie' in clses :
+                if img.has_attr('alt') and ('mceSmilie' in clses or 'smilie--sprite' in clses):
                     ## Change the img to a span containing the alt
                     ## text, remove attrs. This is a one-way change.
                     img.name='span'
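A quick check of what the widened smilie regex matches; BeautifulSoup applies regex attribute filters with search(), and the sample srcs here are invented:

    import re

    pat = re.compile(r'(^data:image|(failedtoload|clear.png)$)')
    for src in ("data:image/png;base64,iVBORw0KGgo=",
                "https://example.com/smilies/failedtoload",
                "https://example.com/styles/default/clear.png",
                "https://example.com/photos/real-image.jpg"):
        print(src, bool(pat.search(src)))  # True, True, True, False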

@@ -1054,6 +1054,12 @@ class Configuration(configparser.SafeConfigParser):
             headers['Content-type']='application/x-www-form-urlencoded'
         if 'Accept' not in headers:
             headers['Accept']="text/html,*/*"

+        if "xf2test" in url:
+            import base64
+            base64string = base64.encodestring(b"xf2demo2019:dBfbyHVvRCsYtLg846r3").replace(b'\n', b'')
+            headers['Authorization']=b"Basic %s" % base64string
+
         req = Request(url,
                       data=ensure_binary(urlencode(parameters)),
                       headers=headers)
@@ -1122,6 +1128,11 @@ class Configuration(configparser.SafeConfigParser):
                 ## not present at all
                 headers.append(('Referer',referer))

+        if "xf2test" in url:
+            import base64
+            base64string = base64.encodestring(b"xf2demo2019:dBfbyHVvRCsYtLg846r3").replace(b'\n', b'')
+            headers.append(('Authorization', b"Basic %s" % base64string))
+
         self.opener.addheaders = headers

         if parameters != None:
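Both xf2test hunks build an HTTP Basic auth header with base64.encodestring, which is the Python 2 spelling; it was deprecated in Python 3 and removed in 3.9. A sketch of the same header with the py3 name (placeholder credentials, not the sandbox ones above):

    import base64

    creds = b"user:password"  # placeholder
    base64string = base64.encodebytes(creds).replace(b'\n', b'')
    print(b"Basic %s" % base64string)  # b'Basic dXNlcjpwYXNzd29yZA=='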

@@ -1182,6 +1193,7 @@ class Configuration(configparser.SafeConfigParser):
             except Exception as e:
                 excpt=e
+                logger.debug("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))
                 raise

         logger.debug("Giving up on %s" %safe_url(url))
         logger.debug(excpt, exc_info=True)