mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-01-02 06:04:56 +01:00
Fix Characters parsing in ffnet, add Language metadata (ffnet only right now).
This commit is contained in:
parent
b69ede76bb
commit
7cf76b7a47
9 changed files with 94 additions and 54 deletions
|
|
@ -445,6 +445,7 @@ permitted_values = {
|
|||
'series' : ['series'],
|
||||
'enumeration' : ['category',
|
||||
'genre',
|
||||
'language',
|
||||
'series',
|
||||
'characters',
|
||||
'status',
|
||||
|
|
@ -477,6 +478,7 @@ permitted_values['comments'] = permitted_values['enumeration']
|
|||
titleLabels = {
|
||||
'category':'Category',
|
||||
'genre':'Genre',
|
||||
'language':'Language',
|
||||
'status':'Status',
|
||||
'status-C':'Status:Completed',
|
||||
'status-I':'Status:In-Progress',
|
||||
|
|
|
|||
|
|
@ -52,9 +52,6 @@ formmapping = {
|
|||
|
||||
PLUGIN_ICONS = ['images/icon.png']
|
||||
|
||||
sendlists = ["Send to Nook", "Send to Kindle", "Send to Droid", "Add to Nook", "Add to Kindle", "Add to Droid"]
|
||||
readlists = ["000"]
|
||||
|
||||
class FanFictionDownLoaderPlugin(InterfaceAction):
|
||||
|
||||
name = 'FanFictionDownLoader'
|
||||
|
|
@ -742,11 +739,16 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
|
|||
if len(filter( lambda x : not x.startswith("Last Update"), mi.tags)) > 0:
|
||||
old_tags = filter( lambda x : not x.startswith("Last Update"), old_tags)
|
||||
# mi.tags needs to be list, but set kills dups.
|
||||
mi.tags = list(set(list(old_tags)+mi.tags))
|
||||
# Set language english, but only if not already set.
|
||||
oldmi = db.get_metadata(book_id,index_is_id=True)
|
||||
if not oldmi.languages:
|
||||
mi.languages=['eng']
|
||||
mi.tags = list(set(list(old_tags)+mi.tags))
|
||||
|
||||
if 'langcode' in book['all_metadata']:
|
||||
mi.languages=[book['all_metadata']['langcode']]
|
||||
else:
|
||||
# Set language to English, but only if not already set.
|
||||
oldmi = db.get_metadata(book_id,index_is_id=True)
|
||||
if not oldmi.languages:
|
||||
mi.languages=['eng']
|
||||
|
||||
db.set_metadata(book_id,mi)
|
||||
|
||||
# do configured column updates here.
|
||||
|
|
@ -971,40 +973,6 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
|
|||
except:
|
||||
return None;
|
||||
|
||||
|
||||
|
||||
def get_job_details(job):
    '''
    Convert the job result into a set of parameters including a detail message
    summarising the success of the extraction operation.
    This is used by both the threaded and worker approaches to extraction

    job.result must unpack into four values:
    (extracted_ids, same_isbn_ids, failed_ids, no_format_ids).
    failed_ids and same_isbn_ids are iterated as (id, title) pairs and
    extracted_ids as (id, title, last_modified, isbn) 4-tuples.

    Side effect: if job has no 'html_details' attribute, it is set to
    job.details.

    Returns (extracted_ids, same_isbn_ids, failed_ids, det_msg) where det_msg
    is one newline-joined string: failures first, then same-ISBN entries,
    then extracted entries, with a dashed separator line between the
    non-empty sections.
    '''
    extracted_ids, same_isbn_ids, failed_ids, no_format_ids = job.result
    if not hasattr(job, 'html_details'):
        job.html_details = job.details

    separator = '----------------------------------'
    det_msg = []

    # Failures: distinguish "no formats" from a plain lookup failure.
    for book_id, title in failed_ids:
        if book_id in no_format_ids:
            det_msg.append(title + ' (No formats)')
        else:
            det_msg.append(title + ' (ISBN not found)')

    if same_isbn_ids:
        # Only emit a separator when something precedes this section.
        if det_msg:
            det_msg.append(separator)
        for _book_id, title in same_isbn_ids:
            det_msg.append(title + ' (Same ISBN)')

    if extracted_ids:
        if det_msg:
            det_msg.append(separator)
        for _book_id, title, _last_modified, isbn in extracted_ids:
            det_msg.append('%s (Extracted %s)' % (title, isbn))

    det_msg = '\n'.join(det_msg)
    return extracted_ids, same_isbn_ids, failed_ids, det_msg
|
||||
|
||||
def get_url_list(urls):
|
||||
def f(x):
|
||||
if x.strip(): return True
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ formatext_label:File Extension
|
|||
## Sometimes there are multiple categories and/or genres.
|
||||
category_label:Category
|
||||
genre_label:Genre
|
||||
language_label:Language
|
||||
characters_label:Characters
|
||||
series_label:Series
|
||||
## Completed/In-Progress
|
||||
|
|
@ -67,7 +68,7 @@ version_label:FFDL Version
|
|||
## items to include in the title page
|
||||
## Empty entries will *not* appear, even if in the list.
|
||||
## All current formats already include title and author.
|
||||
titlepage_entries: series,category,genre,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description
|
||||
titlepage_entries: series,category,genre,language,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description
|
||||
|
||||
## Try to collect series name and number of this story in series.
|
||||
## Some sites (ab)use 'series' for reading lists and personal
|
||||
|
|
@ -176,7 +177,7 @@ output_css:
|
|||
|
||||
[txt]
|
||||
## Add URLs since there aren't links.
|
||||
titlepage_entries: series,category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
|
||||
titlepage_entries: series,category,genre,language,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
|
||||
|
||||
## use \r\n for line endings, the windows convention. text output only.
|
||||
windows_eol: true
|
||||
|
|
|
|||
|
|
@ -201,7 +201,6 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
metatext = a.findNext(text=re.compile(r' - Reviews:'))
|
||||
if metatext == None: # indicates there's no Reviews, look for id: instead.
|
||||
metatext = a.findNext(text=re.compile(r' - id:'))
|
||||
#print("========= metatext:\n%s"%metatext)
|
||||
|
||||
# after Rating, the same bit of text containing id:123456 contains
|
||||
# Complete--if completed.
|
||||
|
|
@ -215,7 +214,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
# <meta name="description" content="Chapter 1 of a Transformers/Beast Wars - Adventure/Friendship fanfiction with characters Bumblebee. TFA: What would you do if you was being abused all you life? Follow NightRunner as she goes through her spark breaking adventure of getting away from her father..">
|
||||
# (fp)<meta name="description" content="Chapter 1 of a Sci-Fi - Adventure/Humor fiction. Felix Max was just your regular hyperactive kid until he accidently caused his own fathers death. Now he has meta-humans trying to hunt him down with a corrupt goverment to back them up. Oh, and did I mention he has no Powers yet?.">
|
||||
# <meta name="description" content="Chapter 1 of a Bleach - Adventure/Angst fanfiction with characters Ichigo K. & Neliel T. O./Nel. Time travel with a twist. Time can be a real bi***. Ichigo finds that fact out when he accidentally goes back in time. Is this his second chance or is fate just screwing with him. Not a crack fic.IchixNelXHime.">
|
||||
m = re.match(r"^(?:Chapter \d+ of a|A) (?:.*?) (?:- (?P<genres>.*?) )?(?:crossover )?(?:fan)?fiction(?:[ ]+with characters (?P<char1>.*?\.?)(?: & (?P<char2>.*?\.?))?\. )?",
|
||||
# <meta name="description" content="Chapter 1 of a Harry Potter and Transformers - Humor/Adventure crossover fanfiction with characters: Harry P. & Ironhide. IT’s one thing to be tossed thru the Veil for something he didn’t do. It was quite another to wake in his animigus form in a world not his own. Harry just knew someone was laughing at him somewhere. Mech/Mech pairings inside..">
|
||||
m = re.match(r"^(?:Chapter \d+ of a|A) (?:.*?) (?:- (?P<genres>.*?) )?(?:crossover )?(?:fan)?fiction(?P<chars>[ ]+with characters)?",
|
||||
soup.find('meta',{'name':'description'})['content'])
|
||||
if m != None:
|
||||
genres=m.group('genres')
|
||||
|
|
@ -225,7 +225,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
for g in genres.split('/'):
|
||||
self.story.addToList('genre',g)
|
||||
|
||||
if m.group('char1') != None:
|
||||
if m.group('chars') != None:
|
||||
|
||||
# At this point we've proven that there's character(s)
|
||||
# We can't reliably parse characters out of meta name="description".
|
||||
# There's no way to tell that "with characters Ichigo K. & Neliel T. O./Nel. " ends at "Nel.", not "T."
|
||||
|
|
@ -233,12 +234,16 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
# reviewstext can take form of:
|
||||
# - English - Shinji H. - Updated: 01-13-12 - Published: 12-20-11 - id:7654123
|
||||
# - English - Adventure/Angst - Ichigo K. & Neliel T. O./Nel - Reviews:
|
||||
# - English - Humor/Adventure - Harry P. & Ironhide - Reviews:
|
||||
mc = re.match(r" - (?P<lang>[^ ]+ - )(?P<genres>[^ ]+ - )? (?P<chars>.+?) - (Reviews|Updated|Published)",
|
||||
metatext)
|
||||
chars = mc.group("chars")
|
||||
for c in chars.split(' & '):
|
||||
self.story.addToList('characters',c)
|
||||
|
||||
m = re.match(r" - (?P<lang>[^ ]+)",metatext)
|
||||
if m.group('lang') != None:
|
||||
self.story.setMetadata('language',m.group('lang'))
|
||||
|
||||
return
|
||||
|
||||
def getChapterText(self, url):
|
||||
|
|
|
|||
|
|
@ -90,6 +90,16 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
|
|||
self.story.setMetadata('status','In-Progress')
|
||||
else:
|
||||
self.story.setMetadata('status','Completed')
|
||||
|
||||
langs = {
|
||||
0:"English",
|
||||
1:"Russian",
|
||||
2:"French",
|
||||
3:"German",
|
||||
}
|
||||
if idnum < 10:
|
||||
self.story.setMetadata('language',langs[idnum%len(langs)])
|
||||
# 10 or greater, no language.
|
||||
|
||||
self.setSeries('The Great Test',idnum)
|
||||
|
||||
|
|
@ -117,7 +127,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
|
|||
('Chapter 4',self.url+"&chapter=5"),
|
||||
('Chapter 5',self.url+"&chapter=6"),
|
||||
('Chapter 6',self.url+"&chapter=6"),
|
||||
# ('Chapter 7',self.url+"&chapter=6"),
|
||||
('Chapter 7',self.url+"&chapter=6"),
|
||||
# ('Chapter 8',self.url+"&chapter=6"),
|
||||
# ('Chapter 9',self.url+"&chapter=6"),
|
||||
# ('Chapter 0',self.url+"&chapter=6"),
|
||||
|
|
|
|||
|
|
@ -19,6 +19,50 @@ import os, re
|
|||
|
||||
from htmlcleanup import conditionalRemoveEntities, removeAllEntities
|
||||
|
||||
# Maps a language name to the language code stored as 'langcode' metadata.
# The names come from ffnet, the only multi-language site supported at the
# time of writing.  The codes are taken largely from pycountry, with some
# corrections and guesses.
langs = {
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Chinese": "zh",
    "Japanese": "ja",
    "Dutch": "nl",
    "Portuguese": "pt",
    "Russian": "ru",
    "Italian": "it",
    "Bulgarian": "bg",
    "Polish": "pl",
    "Hungarian": "hu",
    "Hebrew": "he",
    "Arabic": "ar",
    "Swedish": "sv",
    "Norwegian": "no",
    "Danish": "da",
    "Finnish": "fi",
    "Filipino": "fil",
    "Esperanto": "eo",
    "Hindi": "hi",
    "Punjabi": "pa",
    "Farsi": "fa",
    "Greek": "el",
    "Romanian": "ro",
    "Albanian": "sq",
    "Serbian": "sr",
    "Turkish": "tr",
    "Czech": "cs",
    "Indonesian": "id",
    "Croatian": "hr",
    "Catalan": "ca",
    "Latin": "la",
    "Korean": "ko",
    "Vietnamese": "vi",
    "Thai": "th",
    "Devanagari": "hi",
}
|
||||
|
||||
class Story:
|
||||
|
||||
def __init__(self):
|
||||
|
|
@ -33,6 +77,11 @@ class Story:
|
|||
def setMetadata(self, key, value):
    """Store value under key in self.metadata after entity cleanup.

    The value is passed through conditionalRemoveEntities before being
    stored.  When key is "language", a companion 'langcode' entry is also
    set by looking the stored language name up in the module-level langs
    table; names that cannot be looked up fall back to 'en'.
    """
    ## NOTE(review): per the original comment, conditionalRemoveEntities
    ## still keeps the &lt; and &amp; entities -- confirm in htmlcleanup.
    self.metadata[key] = conditionalRemoveEntities(value)
    if key == "language":
        try:
            self.metadata['langcode'] = langs[self.metadata[key]]
        except (KeyError, TypeError):
            # KeyError: language name not in the table.
            # TypeError: stored value is unhashable.
            # A bare except here would also swallow KeyboardInterrupt
            # and SystemExit, so only the lookup failures are caught.
            self.metadata['langcode'] = 'en'
|
||||
|
||||
def getMetadataRaw(self,key):
|
||||
if self.metadata.has_key(key):
|
||||
|
|
@ -111,7 +160,6 @@ class Story:
|
|||
def setReplace(self, replace):
    """Parse replacement rules from a multi-line string.

    Every line of replace that contains "=>" is split on "=>", each piece
    is stripped of surrounding whitespace, and the resulting list is
    appended to self.replacements.  Lines without "=>" are ignored.
    """
    for line in replace.splitlines():
        if "=>" in line:
            # Build a concrete list (not a lazy map object) so the stored
            # rule can be indexed and re-read; no debug output is printed.
            self.replacements.append([part.strip() for part in line.split("=>")])
|
||||
|
||||
def commaGroups(s):
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ class BaseStoryWriter(Configurable):
|
|||
self.validEntries = [
|
||||
'category',
|
||||
'genre',
|
||||
'language',
|
||||
'characters',
|
||||
'series',
|
||||
'status',
|
||||
|
|
@ -80,6 +81,7 @@ class BaseStoryWriter(Configurable):
|
|||
self.titleLabels = {
|
||||
'category':'Category',
|
||||
'genre':'Genre',
|
||||
'language':'Language',
|
||||
'status':'Status',
|
||||
'series':'Series',
|
||||
'characters':'Characters',
|
||||
|
|
|
|||
|
|
@ -203,7 +203,10 @@ class EpubWriter(BaseStoryWriter):
|
|||
|
||||
metadata.appendChild(newTag(contentdom,"dc:contributor",text="fanficdownloader [http://fanficdownloader.googlecode.com]",attrs={"opf:role":"bkp"}))
|
||||
metadata.appendChild(newTag(contentdom,"dc:rights",text=""))
|
||||
metadata.appendChild(newTag(contentdom,"dc:language",text="en"))
|
||||
if self.story.getMetadata('langcode') != None:
|
||||
metadata.appendChild(newTag(contentdom,"dc:language",text=self.story.getMetadata('langcode')))
|
||||
else:
|
||||
metadata.appendChild(newTag(contentdom,"dc:language",text='en'))
|
||||
|
||||
# published, created, updated, calibre
|
||||
# Leave calling self.story.getMetadataRaw directly in case date format changes.
|
||||
|
|
@ -399,4 +402,4 @@ def newTag(dom,name,attrs=None,text=None):
|
|||
if( text is not None ):
|
||||
tag.appendChild(dom.createTextNode(text))
|
||||
return tag
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -41,6 +41,7 @@ formatext_label:File Extension
|
|||
## Sometimes there are multiple categories and/or genres.
|
||||
category_label:Category
|
||||
genre_label:Genre
|
||||
language_label:Language
|
||||
characters_label:Characters
|
||||
series_label:Series
|
||||
## Completed/In-Progress
|
||||
|
|
@ -72,7 +73,7 @@ version_label:FFDL Version
|
|||
## items to include in the title page
|
||||
## Empty entries will *not* appear, even if in the list.
|
||||
## All current formats already include title and author.
|
||||
titlepage_entries: series,category,genre,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description
|
||||
titlepage_entries: series,category,genre,language,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description
|
||||
|
||||
## Try to collect series name and number of this story in series.
|
||||
## Some sites (ab)use 'series' for reading lists and personal
|
||||
|
|
@ -154,7 +155,7 @@ output_css:
|
|||
|
||||
[txt]
|
||||
## Add URLs since there aren't links.
|
||||
titlepage_entries: series,category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
|
||||
titlepage_entries: series,category,genre,language,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
|
||||
|
||||
## use \r\n for line endings, the windows convention. text output only.
|
||||
windows_eol: true
|
||||
|
|
|
|||
Loading…
Reference in a new issue