Incomplete test version for xenforo2 on xf2test.sufficientvelocity.com.

This commit is contained in:
Jim Miller 2019-04-22 14:57:55 -05:00
parent ee48decec5
commit e53e2bfbe5
5 changed files with 289 additions and 25 deletions

View file

@ -110,6 +110,7 @@ from . import adapter_tgstorytimecom
from . import adapter_itcouldhappennet
from . import adapter_forumsspacebattlescom
from . import adapter_forumssufficientvelocitycom
from . import adapter_xf2testsufficientvelocitycom
from . import adapter_forumquestionablequestingcom
from . import adapter_ninelivesarchivecom
from . import adapter_masseffect2in

View file

@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
# Copyright 2019 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import re
from .base_xenforo2forum_adapter import BaseXenForo2ForumAdapter
def getClass():
    """Return the adapter class implemented by this module."""
    return XF2TestSufficientVelocityComAdapter
class XF2TestSufficientVelocityComAdapter(BaseXenForo2ForumAdapter):
    """Adapter for the XenForo 2 test site xf2test.sufficientvelocity.com.

    All scraping behaviour comes from ``BaseXenForo2ForumAdapter``; this class
    only supplies the site abbreviation, the site domain and the story URL
    pattern.
    """

    def __init__(self, config, url):
        """Initialise the base adapter, then record this site's abbreviation."""
        super(XF2TestSufficientVelocityComAdapter, self).__init__(config, url)

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'fsv2')

    @staticmethod  # must be @staticmethod, don't remove it.
    def getSiteDomain():
        """Return the site domain exactly as the site serves it."""
        return 'xf2test.sufficientvelocity.com'

    # getAcceptDomains() is deliberately left commented out; it is the
    # variant that accepted the 'forums.', 'forum.' and bare host names:
    # @classmethod
    # def getAcceptDomains(cls):
    #     return [cls.getSiteDomain(),
    #             cls.getSiteDomain().replace('forums.','forum.'),
    #             cls.getSiteDomain().replace('forums.','')]

    def getSiteURLPattern(self):
        """Return the inherited URL regex with any 'forums.' prefix made optional.

        An escaped ``forums.`` in the inherited pattern is rewritten so that
        ``forums.``, ``forum.`` or no prefix at all are accepted.
        """
        # NOTE(review): getSiteDomain() above contains no 'forums.', so this
        # substitution presumably matches nothing for this site -- confirm
        # against the pattern produced by the base class.
        inherited_pattern = super(XF2TestSufficientVelocityComAdapter,
                                  self).getSiteURLPattern()
        return inherited_pattern.replace(re.escape("forums."),
                                         r"(forums?\.)?")

View file

@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-
# Copyright 2019 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
from datetime import datetime
logger = logging.getLogger(__name__)
import re
from xml.dom.minidom import parseString
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
# py2 vs py3 transition
from ..six import text_type as unicode
from ..six.moves.urllib.error import HTTPError
from .base_adapter import makeDate
from .base_xenforoforum_adapter import BaseXenForoForumAdapter
logger = logging.getLogger(__name__)
class BaseXenForo2ForumAdapter(BaseXenForoForumAdapter):
    """XenForo 2 variant of ``BaseXenForoForumAdapter``.

    Overrides the page-scraping hooks (title, author, post cache, first
    post, threadmarks and dates) with the element names and CSS classes used
    in this class; everything else is inherited from the base adapter.
    """

    def __init__(self, config, url):
        """Log the story URL and defer to the base adapter's initialiser."""
        logger.info("init url: "+url)
        BaseXenForoForumAdapter.__init__(self, config, url)

    def parse_title(self, souptag):
        """Set the 'title' metadata from the page's ``h1.p-title-value``.

        ``span.label`` children of the heading are added to the 'genre' list
        and removed from the heading before the title text is taken.
        """
        h1 = souptag.find('h1', {'class': 'p-title-value'})
        logger.debug(h1)
        ## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1.
        for tag in h1.find_all('span', {'class': 'label'}):
            ## stick them into genre.
            label = stripHTML(tag)
            self.story.addToList('genre', label)
            logger.debug(label)
            tag.extract()
        title = stripHTML(h1)
        self.story.setMetadata('title', title)
        logger.debug(title)

    def parse_author(self, souptag):
        """Record authorId, authorUrl and author from the post's user section.

        The author link is the first ``<a>`` inside ``section.message-user``.
        The id is the second-to-last '/'-separated piece of its href.
        """
        a = souptag.find('section', {'class': 'message-user'}).find('a')
        logger.debug(a)
        self.story.addToList('authorId', a['href'].split('/')[-2])
        # NOTE(review): the href is stored as found; an earlier variant
        # prefixed it with self.getURLPrefix()+'/'.  Confirm whether the
        # site emits absolute hrefs here.
        authorUrl = a['href']
        self.story.addToList('authorUrl', authorUrl)
        self.story.addToList('author', a.text)

    def cache_posts(self, topsoup):
        """Store every ``article.message--post`` in ``self.post_cache``,
        keyed by the article's ``data-content`` attribute."""
        for post in topsoup.find_all('article', {'class': 'message--post'}):
            post_key = post['data-content']
            logger.debug("Caching %s", post_key)
            self.post_cache[post_key] = post

    def get_first_post(self, topsoup):
        """Return the first ``article.message--post`` on the page."""
        return topsoup.find('article', {'class': 'message--post'})

    def get_first_post_body(self, topsoup):
        """Return the ``div.bbWrapper`` inside the first post's
        ``article.message-body``."""
        first_post = self.get_first_post(topsoup)
        message_body = first_post.find('article', {'class': 'message-body'})
        return message_body.find('div', {'class': 'bbWrapper'})

    def extract_threadmarks(self, souptag):
        """Collect the threadmarks of every category linked from the page.

        Returns a list of threadmark dicts (built by ``fetch_threadmarks``),
        grouped by category and emitted in the order given by the
        'threadmark_category_order' setting.  Categories whose name is not in
        that list are not returned.  An empty list is returned when the page
        has no ``div.buttonGroup``.
        """
        threadmarks = []
        # try threadmarks if no '#' in url
        navdiv = souptag.find('div', {'class': 'buttonGroup'})
        if not navdiv:
            return threadmarks

        # was class=threadmarksTrigger. thread cats are currently
        # only OverlayTrigger <a>s in threadmarkMenus, but I wouldn't
        # be surprised if that changed.  Don't want to use just
        # href=re because there's more than one copy on the page; plus
        # could be included in a post.  Would be easier if <noscript>s
        # weren't being stripped, but that's a different issue.
        threadmarksas = navdiv.find_all(
            'a',
            {'class': 'menuTrigger',
             'href': re.compile('threadmarks.*(threadmark_category=)?')})

        ## Loop on threadmark categories.
        threadmarkgroups = dict()  # for ordering threadmarks
        for threadmarksa in threadmarksas:
            logger.debug("threadmarksa:%s", threadmarksa)
            href = threadmarksa['href']
            if 'threadmark_category=' in href:
                tmcat_num = href.split('threadmark_category=')[1]
            else:
                tmcat_num = '1'
            # get from earlier <a> now.
            tmcat_name = stripHTML(threadmarksa)
            if tmcat_name in self.getConfigList('skip_threadmarks_categories'):
                continue
            if tmcat_name == 'Apocrypha' and self.getConfig('apocrypha_to_omake'):
                tmcat_name = 'Omake'

            if 'http' not in href:
                href = self.getURLPrefix()+'/'+href
            # NOTE(review): a later category with the same (possibly
            # renamed) name replaces the earlier one here.
            threadmarkgroups[tmcat_name] = self.fetch_threadmarks(href,
                                                                  tmcat_name,
                                                                  tmcat_num)
            logger.debug(threadmarkgroups[tmcat_name])

        for cat_name in self.getConfigList('threadmark_category_order',
                                           ['Threadmarks',
                                            'Sidestory',
                                            'Apocrypha',
                                            'Omake',
                                            'Media',
                                            'Informational',
                                            'Staff Post']):
            if cat_name in threadmarkgroups:
                threadmarks.extend(threadmarkgroups[cat_name])
        return threadmarks

    def fetch_threadmarks(self, url, tmcat_name, tmcat_num, passed_tmcat_index=0):
        """Fetch one threadmark listing page and return its entries.

        ``url`` is the listing (or load-range) page to fetch, ``tmcat_name``
        and ``tmcat_num`` are copied into every entry, and
        ``passed_tmcat_index`` is the index given to the first entry found.

        Each entry is a dict with the keys tmcat_name, tmcat_num,
        tmcat_index, title, url, date, words and kwords.  Items that carry
        no primary link are treated as "load more" placeholders and are
        expanded by fetching their ``data-fetchurl`` recursively.
        """
        logger.debug("fetch_threadmarks(%s,tmcat_num=%s,passed_tmcat_index:%s,url=%s)",
                     tmcat_name, tmcat_num, passed_tmcat_index, url)
        threadmarks = []
        soupmarks = self.make_soup(self._fetchUrl(url))
        tm_list = soupmarks.find('div', {'class': 'structItemContainer'})
        if not tm_list:  # load-range don't have threadmarkList.
            tm_list = soupmarks

        tmcat_index = passed_tmcat_index
        for tm_item in tm_list.find_all('div', {'class': 'structItem--threadmark'}):
            atag = tm_item.find('a', {'data-tp-primary': 'on'})
            if not atag:
                fetcher = tm_item.find('div', {'data-xf-click': 'threadmark-fetcher'})
                logger.debug(fetcher)
                if fetcher is None:
                    # Neither a threadmark link nor a range fetcher: nothing
                    # usable in this item, so report it and move on.
                    logger.warning("Unrecognized threadmark item skipped: %s", tm_item)
                    continue
                threadmarks.extend(self.fetch_threadmarks(fetcher['data-fetchurl'],
                                                          tmcat_name,
                                                          tmcat_num,
                                                          tmcat_index))
                # Entries collected by this call started at
                # passed_tmcat_index, so the next free index is that offset
                # plus everything gathered so far.
                tmcat_index = passed_tmcat_index + len(threadmarks)
                continue

            tm_url, name = atag['href'], stripHTML(atag)
            date = self.make_date(tm_item)
            worddd = tm_item.find('dd')
            kwords = stripHTML(worddd) if worddd else ""

            if 'http' not in tm_url:
                tm_url = self.getURLPrefix()+"/"+tm_url
            logger.debug("%s. %s", tmcat_index, name)
            threadmarks.append({"tmcat_name": tmcat_name,
                                "tmcat_num": tmcat_num,
                                "tmcat_index": tmcat_index,
                                "title": name,
                                "url": tm_url,
                                "date": date,
                                "words": "",
                                "kwords": kwords})
            tmcat_index += 1
        return threadmarks

    def make_date(self, parenttag):
        """Return a datetime for the first ``<time>`` tag under ``parenttag``.

        The tag's ``data-time`` attribute is read as a numeric timestamp, so
        the result does not depend on how the forum words recent dates.
        Returns None (after a debug log) when no usable value is found.
        """
        # forums use a BS thing where dates can appear different if recent.
        try:
            datetag = parenttag.find('time')
            # NOTE(review): fromtimestamp() yields a naive, local-time
            # datetime -- confirm that is what callers expect.
            return datetime.fromtimestamp(float(datetag['data-time']))
        except Exception:
            # Best effort: a missing tag/attribute or an unparsable value
            # just means "no date".  Unlike a bare except, this lets
            # KeyboardInterrupt and SystemExit through.
            logger.debug('No date found in %s', parenttag, exc_info=True)
        return None

View file

@ -255,16 +255,12 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
tmcat_num = threadmarksa['href'].split('category_id=')[1]
# get from earlier <a> now.
tmcat_name = stripHTML(threadmarksa.find_previous('a',{'class':'threadmarksTrigger'}))
prepend = ""
if tmcat_name in self.getConfigList('skip_threadmarks_categories'):
continue
if tmcat_name == 'Apocrypha' and self.getConfig('apocrypha_to_omake'):
tmcat_name = 'Omake'
if tmcat_name != "Threadmarks":
prepend = tmcat_name+" - "
threadmarks.extend(self.fetch_threadmarks(self.getURLPrefix()+'/'+threadmarksa['href'],
tmcat_name,
tmcat_num))
@ -348,13 +344,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
# use BeautifulSoup HTML parser to make everything easier to find.
topsoup = souptag = self.make_soup(data)
h1 = souptag.find('div',{'class':'titleBar'}).h1
## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1.
for tag in h1.find_all('span',{'class':'prefix'}):
## stick them into genre.
self.story.addToList('genre',stripHTML(tag))
tag.extract()
self.story.setMetadata('title',stripHTML(h1))
self.parse_title(topsoup)
first_post_title = self.getConfig('first_post_title','First Post')
@ -408,7 +398,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
if words and self.getConfig('use_threadmark_wordcounts',True):
self.story.setMetadata('numWords',words)
souptag = souptag.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above)
souptag = self.get_first_post(topsoup)
if use_threadmark_chaps or self.getConfig('always_use_forumtags'):
## only use tags if threadmarks for chapters or always_use_forumtags is on.
@ -419,11 +409,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
self.story.addToList('forumtags',tstr)
# author moved down here to take from post URLs.
a = souptag.find('h3',{'class':'userText'}).find('a')
self.story.addToList('authorId',a['href'].split('/')[1])
authorUrl = self.getURLPrefix()+'/'+a['href']
self.story.addToList('authorUrl',authorUrl)
self.story.addToList('author',a.text)
self.parse_author(souptag)
if self.getConfig('author_avatar_cover'):
authorcard = self.make_soup(self._fetchUrl(authorUrl+"?card=1"))
@ -437,25 +423,24 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
##
## </div>
# Now go hunting for the 'chapter list'.
bq = souptag.find('blockquote') # assume first posting contains TOC urls.
# Now get first post for description and chapter list if not
# using threadmarks.
first_post = self.get_first_post_body(topsoup)
bq.name='div'
for iframe in bq.find_all('iframe'):
for iframe in first_post.find_all('iframe'):
iframe.extract() # calibre book reader & editor don't like iframes to youtube.
for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
for qdiv in first_post.find_all('div',{'class':'quoteExpand'}):
qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
self.setDescription(useurl,bq)
self.setDescription(useurl,first_post)
# otherwise, use first post links--include first post since
# that's often also the first chapter.
if self.num_chapters() < 1:
self.add_chapter(first_post_title,useurl)
for (url,name) in [ (x['href'],stripHTML(x)) for x in bq.find_all('a') ]:
for (url,name) in [ (x['href'],stripHTML(x)) for x in first_post.find_all('a') ]:
(is_chapter_url,url) = self._is_normalize_chapterurl(url)
if is_chapter_url and name != u"\u2191": # skip quote links as indicated by up arrow character.
self.add_chapter(name,url)
@ -475,6 +460,30 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
if date:
self.story.setMetadata('dateUpdated', date)
def parse_title(self,souptag):
    """Set the 'title' metadata from the ``<h1>`` in ``div.titleBar``.

    ``span.prefix`` children of the heading are added to the 'genre' list
    and removed from the heading before the title text is taken.
    """
    heading = souptag.find('div',{'class':'titleBar'}).h1
    ## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1.
    prefix_spans = heading.find_all('span',{'class':'prefix'})
    for prefix_span in prefix_spans:
        ## stick them into genre.
        self.story.addToList('genre',stripHTML(prefix_span))
        prefix_span.extract()
    self.story.setMetadata('title',stripHTML(heading))
def parse_author(self,souptag):
    """Record authorId, authorUrl and author from the post's ``h3.userText``.

    The author link is the first ``<a>`` in that heading; the id is the
    second '/'-separated piece of its href and the URL is the href prefixed
    with ``self.getURLPrefix()`` and a '/'.
    """
    author_link = souptag.find('h3',{'class':'userText'}).find('a')
    href = author_link['href']
    self.story.addToList('authorId',href.split('/')[1])
    self.story.addToList('authorUrl',self.getURLPrefix()+'/'+href)
    self.story.addToList('author',author_link.text)
def get_first_post(self,topsoup):
    """Return the first ``<li class="message">`` found in ``topsoup``."""
    # limit first post for date stuff below. ('#' posts above)
    first_message = topsoup.find('li',{'class':'message'})
    return first_message
def get_first_post_body(self,topsoup):
    """Return the first post's ``<blockquote>``, renamed in place to a ``div``."""
    first_post = self.get_first_post(topsoup)
    body = first_post.find('blockquote')
    # Only the tag name is changed; the returned object is the same tag.
    body.name='div'
    return body
def make_date(self,parenttag): # forums use a BS thing where dates
# can appear different if recent.
datestr=None

View file

@ -1122,6 +1122,11 @@ class Configuration(configparser.SafeConfigParser):
## not present at all
headers.append(('Referer',referer))
if "xf2test" in url:
import base64
base64string = base64.encodestring('%s:%s' % ("xf2demo2019", "dBfbyHVvRCsYtLg846r3")).replace('\n', '')
headers.append(('Authorization', "Basic %s" % base64string))
self.opener.addheaders = headers
if parameters != None: