mirror of https://github.com/JimmXinu/FanFicFare.git (synced 2025-12-28 19:56:40 +01:00)
Refactor XF1 XF2 to consolidate logic.

parent 1c42040885
commit cf99d82e30

2 changed files with 124 additions and 162 deletions
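The diff itself follows. As a rough orientation, the refactor moves the shared threadmark-walking flow into the XF1 base adapter and leaves only small, site-specific lookup hooks in the XF2 subclass. A minimal sketch of that shape, not part of the diff: the class and hook names and the CSS selectors are taken from the hunks below, while the bodies and the plain `object` base stand in for the real adapter hierarchy.

# Sketch only: illustrates the hook pattern this commit introduces.
import re

class BaseXenForoForumAdapter(object):
    # XF1 base adapter: owns the shared extract/fetch flow and the XF1 lookups.
    def get_threadmarks_top(self, souptag):
        return souptag.find('div', {'class': 'threadmarkMenus'})

    def get_threadmarks(self, navdiv):
        return navdiv.find_all('a', {'class': 'OverlayTrigger',
                                     'href': re.compile('threadmarks.*category_id=')})

    def extract_threadmarks(self, souptag):
        # Shared flow: every site-specific lookup goes through a hook,
        # so the loop/ordering logic lives in exactly one place.
        threadmarks = []
        navdiv = self.get_threadmarks_top(souptag)
        if not navdiv:
            return threadmarks
        for threadmarksa in self.get_threadmarks(navdiv):
            pass  # category handling, fetch_threadmarks(), ordering -- see the hunks below
        return threadmarks

class BaseXenForo2ForumAdapter(BaseXenForoForumAdapter):
    # XF2 subclass: only overrides the lookups with XenForo 2 markup.
    def get_threadmarks_top(self, souptag):
        return souptag.find('div', {'class': 'buttonGroup'})

    def get_threadmarks(self, navdiv):
        return navdiv.find_all('a', {'class': 'menuTrigger',
                                     'href': re.compile('threadmarks.*(threadmark_category=)?')})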
@@ -80,128 +80,44 @@ class BaseXenForo2ForumAdapter(BaseXenForoForumAdapter):
    def get_post_updated_date(self,souptag):
        return self.make_date(souptag.find('div',{'class':'message-lastEdit'}))

    def extract_threadmarks(self,souptag):
        threadmarks=[]
        # try threadmarks if no '#' in url
        navdiv = souptag.find('div',{'class':'buttonGroup'})
        if not navdiv:
            return threadmarks
        # was class=threadmarksTrigger. thread cats are currently
        # only OverlayTrigger <a>s in threadmarkMenus, but I wouldn't
        # be surprised if that changed. Don't want to do use just
        # href=re because there's more than one copy on the page; plus
        # could be included in a post. Would be easier if <noscript>s
        # weren't being stripped, but that's a different issue.
        threadmarksas = navdiv.find_all('a',{'class':'menuTrigger','href':re.compile('threadmarks.*(threadmark_category=)?')})
        ## Loop on threadmark categories.
        tmcat_num=None
    def get_threadmarks_top(self,souptag):
        return souptag.find('div',{'class':'buttonGroup'})

        threadmarkgroups = dict() # for ordering threadmarks
        for threadmarksa in threadmarksas:
            # logger.debug("threadmarksa:%s"%threadmarksa)
            if 'threadmark_category=' in threadmarksa['href']:
                tmcat_num = threadmarksa['href'].split('threadmark_category=')[1]
            else:
                tmcat_num = '1'
            # get from earlier <a> now.
            tmcat_name = stripHTML(threadmarksa)
            if tmcat_name in self.getConfigList('skip_threadmarks_categories'):
                continue
    def get_threadmarks(self,navdiv):
        return navdiv.find_all('a',{'class':'menuTrigger','href':re.compile('threadmarks.*(threadmark_category=)?')})

            if tmcat_name == 'Apocrypha' and self.getConfig('apocrypha_to_omake'):
                tmcat_name = 'Omake'
    def get_threadmark_catnumname(self,threadmarksa):
        if 'threadmark_category=' in threadmarksa['href']:
            tmcat_num = threadmarksa['href'].split('threadmark_category=')[1]
        else:
            tmcat_num = '1'
        tmcat_name = stripHTML(threadmarksa)
        return (tmcat_num,tmcat_name)

            if 'http' not in threadmarksa['href']:
                href = self.getURLPrefix()+'/'+threadmarksa['href']
            else:
                href = threadmarksa['href']
            threadmarkgroups[tmcat_name]=self.fetch_threadmarks(href,
                                                                tmcat_name,
                                                                tmcat_num)
            # logger.debug(threadmarkgroups[tmcat_name])
        ## Order of threadmark groups in new SV is changed and
        ## possibly unpredictable. Normalize. Keep as configurable?
        ## What about categories not in the list?
        default_order = ['Threadmarks',
                         'Sidestory',
                         'Apocrypha',
                         'Omake',
                         'Media',
                         'Informational',
                         'Staff Post']
        # default order also *after* config'ed
        # threadmark_category_order so if they are not also in
        # skip_threadmarks_categories they appear in the expected
        # order.
        for cat_name in self.getConfigList('threadmark_category_order',default_order)+default_order:
            if cat_name in threadmarkgroups:
                threadmarks.extend(threadmarkgroups[cat_name])
                del threadmarkgroups[cat_name]
        # more categories left? new or at least unknown
        if threadmarkgroups:
            cats = threadmarkgroups.keys()
            # alphabetize for lack of a better idea to insure consist ordering
            cats.sort()
            for cat_name in cats:
                threadmarks.extend(threadmarkgroups[cat_name])
        return threadmarks
    def get_threadmarks_list(self,soupmarks):
        return soupmarks.find('div',{'class':'structItemContainer'})

    def fetch_threadmarks(self,url,tmcat_name,tmcat_num, passed_tmcat_index=0):
        logger.debug("fetch_threadmarks(%s,tmcat_num=%s,passed_tmcat_index:%s,url=%s)"%(tmcat_name,tmcat_num, passed_tmcat_index, url))
        threadmarks=[]
        soupmarks = self.make_soup(self._fetchUrl(url))
        tm_list = soupmarks.find('div',{'class':'structItemContainer'})
        if not tm_list: # load-range don't have threadmarkList.
            tm_list = soupmarks
        # logger.debug(tm_list)
        markas = []
        tmcat_index=passed_tmcat_index
        after = False
        for tm_item in tm_list.find_all('div',{'class':'structItem--threadmark'}):
            atag = tm_item.find('a',{'data-tp-primary':'on'})
            if not atag:
                fetcher = tm_item.find('div',{'data-xf-click':'threadmark-fetcher'})
                # logger.debug(fetcher)
                range_url = fetcher['data-fetchurl']
                threadmarks.extend(self.fetch_threadmarks(range_url,
                                                          tmcat_name,
                                                          tmcat_num,
                                                          tmcat_index))
                tmcat_index = len(threadmarks)
                after=True
            else:
                if after:
                    # logger.debug("AFTER "*10)
                    after=False
                url,name = atag['href'],stripHTML(atag)
                date = self.make_date(tm_item)
                worddd = tm_item.find('dd')
                if worddd:
                    kwords = stripHTML(worddd)
                else:
                    kwords = ""
    def get_threadmarks_from_list(self,tm_list):
        return tm_list.find_all('div',{'class':'structItem--threadmark'})

                # if atag.parent.has_attr('data-words'):
                # words = int(atag.parent['data-words'])
                # if "(" in atag.next_sibling:
                # kwords = atag.next_sibling.strip()
                # logger.debug("%s"%kwords)
                # else:
                # words = ""
                # kwords = ""
                if 'http' not in url:
                    url = self.getURLPrefix()+"/"+url
                # logger.debug("%s. %s"%(tmcat_index,name))
                threadmarks.append({"tmcat_name":tmcat_name,
                                    "tmcat_num":tmcat_num,
                                    "tmcat_index":tmcat_index,
                                    "title":name,
                                    "url":url,
                                    "date":date,
                                    "words":"",
                                    "kwords":kwords})
                tmcat_index += 1
        return threadmarks
    def get_atag_from_threadmark(self,tm_item):
        return tm_item.find('a',{'data-tp-primary':'on'})

    def get_threadmark_range_url(self,tm_item,tmcat_num):
        fetcher = tm_item.find('div',{'data-xf-click':'threadmark-fetcher'})
        # logger.debug(fetcher)
        return fetcher['data-fetchurl']

    def get_threadmark_date(self,tm_item):
        return self.make_date(tm_item)

    ## XF2 doesn't appear to have words, just kwords.
    def get_threadmark_words(self,tm_item):
        words = kwords = ""
        worddd = tm_item.find('dd')
        if worddd:
            kwords = "("+stripHTML(worddd)+")" # to match XF1
        return words,kwords

    def make_date(self,parenttag):
        datestr=None
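The per-threadmark record built by fetch_threadmarks (the XF2 copy removed above and the shared base-class version in the next hunk) keeps the same shape. For reference, each entry is a plain dict along these lines; the keys come from the diff, while every value below is invented for illustration.

# Illustrative only: one threadmark entry as appended by fetch_threadmarks.
example_threadmark = {
    "tmcat_name": "Threadmarks",   # category label shown in the forum menu
    "tmcat_num": "1",              # category id pulled from the menu link
    "tmcat_index": 0,              # running position within the category
    "title": "Chapter 1",
    "url": "https://forums.example.com/threads/12345/post-1",   # invented URL
    "date": None,                  # placeholder for the datetime from make_date()
    "words": "",                   # XF2 exposes no raw word count, only kwords
    "kwords": "(2.1k)",            # parenthesized to match the XF1 format
}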
@@ -233,84 +233,131 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
        self.cache_posts(soup)
        return soup

    ## Moved over from adapter_forumquestionablequestingcom when SB/SV
    ## threadmark.rss became 'most recent 10 in reverse order'.
    def get_threadmarks_top(self,souptag):
        return souptag.find('div',{'class':'threadmarkMenus'})

    def get_threadmarks(self,navdiv):
        return navdiv.find_all('a',{'class':'OverlayTrigger','href':re.compile('threadmarks.*category_id=')})

    def get_threadmark_catnumname(self,threadmarksa):
        return (threadmarksa['href'].split('category_id=')[1],
                stripHTML(threadmarksa.find_previous('a',{'class':'threadmarksTrigger'})))

    def extract_threadmarks(self,souptag):
        threadmarks=[]
        # try threadmarks if no '#' in url
        navdiv = souptag.find('div',{'class':'threadmarkMenus'})
        navdiv = self.get_threadmarks_top(souptag)
        if not navdiv:
            return threadmarks
        # was class=threadmarksTrigger. thread cats are currently
        # only OverlayTrigger <a>s in threadmarkMenus, but I wouldn't
        # be surprised if that changed. Don't want to do use just
        # href=re because there's more than one copy on the page; plus
        # could be included in a post. Would be easier if <noscript>s
        # weren't being stripped, but that's a different issue.
        threadmarksas = navdiv.find_all('a',{'class':'OverlayTrigger','href':re.compile('threadmarks.*category_id=')})
        ## Loop on threadmark categories.
        tmcat_num=None
        threadmarksas = self.get_threadmarks(navdiv)

        threadmarkgroups = dict() # for ordering threadmarks
        ## Loop on threadmark categories.
        for threadmarksa in threadmarksas:
            tmcat_num = threadmarksa['href'].split('category_id=')[1]
            # get from earlier <a> now.
            tmcat_name = stripHTML(threadmarksa.find_previous('a',{'class':'threadmarksTrigger'}))
            (tmcat_num,tmcat_name) = self.get_threadmark_catnumname(threadmarksa)
            if tmcat_name in self.getConfigList('skip_threadmarks_categories'):
                continue

            if tmcat_name == 'Apocrypha' and self.getConfig('apocrypha_to_omake'):
                tmcat_name = 'Omake'

            threadmarks.extend(self.fetch_threadmarks(self.getURLPrefix()+'/'+threadmarksa['href'],
                                                      tmcat_name,
                                                      tmcat_num))
            if 'http' not in threadmarksa['href']:
                href = self.getURLPrefix()+'/'+threadmarksa['href']
            else:
                href = threadmarksa['href']
            threadmarkgroups[tmcat_name]=self.fetch_threadmarks(href,
                                                                tmcat_name,
                                                                tmcat_num)
        ## Order of threadmark groups in new SV is changed and
        ## possibly unpredictable. Normalize. Keep as configurable?
        ## What about categories not in the list?
        default_order = ['Threadmarks',
                         'Sidestory',
                         'Apocrypha',
                         'Omake',
                         'Media',
                         'Informational',
                         'Staff Post']
        # default order also *after* config'ed
        # threadmark_category_order so if they are not also in
        # skip_threadmarks_categories they appear in the expected
        # order.
        for cat_name in self.getConfigList('threadmark_category_order',default_order)+default_order:
            if cat_name in threadmarkgroups:
                threadmarks.extend(threadmarkgroups[cat_name])
                del threadmarkgroups[cat_name]
        # more categories left? new or at least unknown
        if threadmarkgroups:
            cats = threadmarkgroups.keys()
            # alphabetize for lack of a better idea to insure consist ordering
            cats.sort()
            for cat_name in cats:
                threadmarks.extend(threadmarkgroups[cat_name])
        return threadmarks

    def get_threadmarks_list(self,soupmarks):
        return soupmarks.find('div',{'class':'threadmarkList'})

    def get_threadmarks_from_list(self,tm_list):
        return tm_list.find_all('li',{'class':'threadmarkListItem'})

    def get_atag_from_threadmark(self,tm_item):
        return tm_item.find('a',{'class':'PreviewTooltip'})

    def get_threadmark_range_url(self,tm_item,tmcat_num):
        load_range = "threadmarks/load-range?min=%s&max=%s&category_id=%s"%(tm_item['data-range-min'],
                                                                            tm_item['data-range-max'],
                                                                            tmcat_num)
        return self.url+load_range

    def get_threadmark_date(self,tm_item):
        atag = self.get_atag_from_threadmark(tm_item)
        return self.make_date(atag.find_next_sibling('div',{'class':'extra'}))

    def get_threadmark_words(self,tm_item):
        words = kwords = ""
        atag = self.get_atag_from_threadmark(tm_item)
        if atag.parent.has_attr('data-words'):
            words = int(atag.parent['data-words'])
            if "(" in atag.next_sibling:
                kwords = atag.next_sibling.strip()
        return words,kwords

    def fetch_threadmarks(self,url,tmcat_name,tmcat_num, passed_tmcat_index=0):
        logger.debug("fetch_threadmarks(%s,tmcat_num=%s,passed_tmcat_index:%s,url=%s)"%(tmcat_name,tmcat_num, passed_tmcat_index, url))
        threadmarks=[]
        soupmarks = self.make_soup(self._fetchUrl(url))
        tm_list = soupmarks.find('div',{'class':'threadmarkList'})
        if not tm_list: # load-range don't have threadmarkList.
        tm_list = self.get_threadmarks_list(soupmarks)
        if not tm_list: # load-range don't match
            tm_list = soupmarks
        # logger.debug(tm_list)
        markas = []
        tmcat_index=passed_tmcat_index
        after = False
        for tm_item in tm_list.find_all('li',{'class':'threadmarkListItem'}):
            atag = tm_item.find('a',{'class':'PreviewTooltip'})
        for tm_item in self.get_threadmarks_from_list(tm_list):
            atag = self.get_atag_from_threadmark(tm_item)
            if not atag:
                if tm_item['data-range-min'] and tm_item['data-range-max']:
                    # logger.debug(tm_item)
                    load_range = "threadmarks/load-range?min=%s&max=%s&category_id=%s"%(tm_item['data-range-min'],
                                                                                        tm_item['data-range-max'],
                                                                                        tmcat_num)
                    threadmarks.extend(self.fetch_threadmarks(self.url+load_range,
                                                              tmcat_name,
                                                              tmcat_num,
                                                              tmcat_index))
                    tmcat_index = len(threadmarks)
                    after=True
                threadmarks.extend(self.fetch_threadmarks(self.get_threadmark_range_url(tm_item,tmcat_num),
                                                          tmcat_name,
                                                          tmcat_num,
                                                          tmcat_index))
                tmcat_index = len(threadmarks)
                after=True
            else:
                if after:
                    # logger.debug("AFTER "*10)
                    after=False
                url,name = atag['href'],stripHTML(atag)
                date = self.make_date(atag.find_next_sibling('div',{'class':'extra'}))
                if atag.parent.has_attr('data-words'):
                    words = int(atag.parent['data-words'])
                    if "(" in atag.next_sibling:
                        kwords = atag.next_sibling.strip()
                        # logger.debug("%s"%kwords)
                else:
                    words = ""
                    kwords = ""
                date = self.get_threadmark_date(tm_item)
                words,kwords = self.get_threadmark_words(tm_item)
                if 'http' not in url:
                    url = self.getURLPrefix()+"/"+url
                # logger.debug("%s. %s"%(tmcat_index,name))
                threadmarks.append({"tmcat_name":tmcat_name,
                                    "tmcat_num":tmcat_num,
                                    "tmcat_index":tmcat_index,
                                    "title":name,
                                    "url":self.getURLPrefix()+"/"+url,
                                    "url":url,
                                    "date":date,
                                    "words":words,
                                    "kwords":kwords})
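The ordering block in extract_threadmarks (configured threadmark_category_order first, then the default order, then any leftover categories alphabetized) is easiest to see in isolation. A standalone sketch of just that normalization, with toy data, the adapter's config machinery replaced by plain lists, and sorted() standing in for the Python 2 style cats.sort() used in the diff:

# Sketch only: reproduces the category-ordering rule from extract_threadmarks.
def order_threadmarks(threadmarkgroups, config_order, default_order):
    ordered = []
    # Configured order first, then the default order again, so categories
    # missing from the user's list still land in a predictable place.
    for cat_name in config_order + default_order:
        if cat_name in threadmarkgroups:
            ordered.extend(threadmarkgroups.pop(cat_name))
    # Anything still left is an unknown category: alphabetize for stability.
    for cat_name in sorted(threadmarkgroups):
        ordered.extend(threadmarkgroups.pop(cat_name))
    return ordered

groups = {"Omake": ["o1"], "Threadmarks": ["t1", "t2"], "Recipes": ["r1"]}
default_order = ["Threadmarks", "Sidestory", "Apocrypha", "Omake",
                 "Media", "Informational", "Staff Post"]
print(order_threadmarks(groups, ["Omake"], default_order))
# -> ['o1', 't1', 't2', 'r1']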
@@ -495,7 +542,6 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
        bq = souptag.find('blockquote')
        if not bq:
            bq = souptag.find('div',{'class':'messageText'}) # cached gets if it was already used before

        bq.name='div'
        return bq
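The bq.name='div' line above relies on BeautifulSoup allowing a tag's name to be reassigned in place, which changes how the element serializes while keeping its attributes and children. A tiny standalone check (bs4 assumed installed; the markup is invented):

# Minimal bs4 demonstration of the rename used above.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<blockquote class="q">text</blockquote>', 'html.parser')
bq = soup.find('blockquote')
bq.name = 'div'   # same attributes and children, new tag name
print(bq)         # <div class="q">text</div>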
@@ -623,7 +669,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

    def make_reader_url(self,tmcat_num,reader_page_num):
        return self.getURLPrefix()+'/threads/'+self.story.getMetadata('storyId')+'/'+tmcat_num+'/reader?page='+unicode(reader_page_num)


    def handle_spoilers(self,topsoup):
        '''
        Modifies tag given as required to do spoiler changes.
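make_reader_url is plain string concatenation; plugging placeholder values into the expression above (the prefix, story id and category number here are invented, and str() stands in for the Python 2 unicode() in the diff) gives a reader URL of this shape:

# Illustration only, with invented values.
url_prefix = 'https://forums.example.com'
story_id = '12345'
tmcat_num = '1'
reader_page_num = 3
print(url_prefix + '/threads/' + story_id + '/' + tmcat_num
      + '/reader?page=' + str(reader_page_num))
# -> https://forums.example.com/threads/12345/1/reader?page=3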