Fix Characters parsing in ffnet, add Language metadata (ffnet only right now).

This commit is contained in:
Jim Miller 2012-02-14 22:31:13 -06:00
parent b69ede76bb
commit 7cf76b7a47
9 changed files with 94 additions and 54 deletions

View file

@ -445,6 +445,7 @@ permitted_values = {
'series' : ['series'],
'enumeration' : ['category',
'genre',
'language',
'series',
'characters',
'status',
@ -477,6 +478,7 @@ permitted_values['comments'] = permitted_values['enumeration']
titleLabels = {
'category':'Category',
'genre':'Genre',
'language':'Language',
'status':'Status',
'status-C':'Status:Completed',
'status-I':'Status:In-Progress',

View file

@ -52,9 +52,6 @@ formmapping = {
PLUGIN_ICONS = ['images/icon.png']
sendlists = ["Send to Nook", "Send to Kindle", "Send to Droid", "Add to Nook", "Add to Kindle", "Add to Droid"]
readlists = ["000"]
class FanFictionDownLoaderPlugin(InterfaceAction):
name = 'FanFictionDownLoader'
@ -742,11 +739,16 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
if len(filter( lambda x : not x.startswith("Last Update"), mi.tags)) > 0:
old_tags = filter( lambda x : not x.startswith("Last Update"), old_tags)
# mi.tags needs to be list, but set kills dups.
mi.tags = list(set(list(old_tags)+mi.tags))
# Set language english, but only if not already set.
oldmi = db.get_metadata(book_id,index_is_id=True)
if not oldmi.languages:
mi.languages=['eng']
mi.tags = list(set(list(old_tags)+mi.tags))
if 'langcode' in book['all_metadata']:
mi.languages=[book['all_metadata']['langcode']]
else:
# Set language english, but only if not already set.
oldmi = db.get_metadata(book_id,index_is_id=True)
if not oldmi.languages:
mi.languages=['eng']
db.set_metadata(book_id,mi)
# do configured column updates here.
@ -971,40 +973,6 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
except:
return None;
def get_job_details(job):
    '''
    Unpack job.result and build a plain-text summary of the extraction.

    job.result is unpacked into four collections: extracted_ids,
    same_isbn_ids, failed_ids and no_format_ids.  If the job has no
    html_details attribute yet, it is set to job.details.

    The summary lists failed titles first (tagged "No formats" when the
    id is in no_format_ids, otherwise "ISBN not found"), then titles with
    the same ISBN, then extracted titles with their ISBN.  Non-empty
    sections are separated by a line of dashes.

    Returns the tuple (extracted_ids, same_isbn_ids, failed_ids, det_msg).
    '''
    extracted_ids, same_isbn_ids, failed_ids, no_format_ids = job.result
    if not hasattr(job, 'html_details'):
        job.html_details = job.details

    separator = '----------------------------------'

    # Failed books come first; the reason depends on whether the id was
    # also reported as having no formats.
    report = [
        title + (' (No formats)' if book_id in no_format_ids else ' (ISBN not found)')
        for book_id, title in failed_ids
    ]

    if same_isbn_ids:
        if report:
            report.append(separator)
        report.extend(title + ' (Same ISBN)' for _book_id, title in same_isbn_ids)

    if len(extracted_ids) > 0:
        if report:
            report.append(separator)
        report.extend(
            '%s (Extracted %s)'%(title, isbn)
            for _book_id, title, _last_modified, isbn in extracted_ids
        )

    return extracted_ids, same_isbn_ids, failed_ids, '\n'.join(report)
def get_url_list(urls):
def f(x):
if x.strip(): return True

View file

@ -36,6 +36,7 @@ formatext_label:File Extension
## Sometimes there are multiple categories and/or genres.
category_label:Category
genre_label:Genre
language_label:Language
characters_label:Characters
series_label:Series
## Completed/In-Progress
@ -67,7 +68,7 @@ version_label:FFDL Version
## items to include in the title page
## Empty entries will *not* appear, even if in the list.
## All current formats already include title and author.
titlepage_entries: series,category,genre,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description
titlepage_entries: series,category,genre,language,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description
## Try to collect series name and number of this story in series.
## Some sites (ab)use 'series' for reading lists and personal
@ -176,7 +177,7 @@ output_css:
[txt]
## Add URLs since there aren't links.
titlepage_entries: series,category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
titlepage_entries: series,category,genre,language,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
## use \r\n for line endings, the windows convention. text output only.
windows_eol: true

View file

@ -201,7 +201,6 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
metatext = a.findNext(text=re.compile(r' - Reviews:'))
if metatext == None: # indicates there's no Reviews, look for id: instead.
metatext = a.findNext(text=re.compile(r' - id:'))
#print("========= metatext:\n%s"%metatext)
# after Rating, the same bit of text containing id:123456 contains
# Complete--if completed.
@ -215,7 +214,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
# <meta name="description" content="Chapter 1 of a Transformers/Beast Wars - Adventure/Friendship fanfiction with characters Bumblebee. TFA: What would you do if you was being abused all you life? Follow NightRunner as she goes through her spark breaking adventure of getting away from her father..">
# (fp)<meta name="description" content="Chapter 1 of a Sci-Fi - Adventure/Humor fiction. Felix Max was just your regular hyperactive kid until he accidently caused his own fathers death. Now he has meta-humans trying to hunt him down with a corrupt goverment to back them up. Oh, and did I mention he has no Powers yet?.">
# <meta name="description" content="Chapter 1 of a Bleach - Adventure/Angst fanfiction with characters Ichigo K. & Neliel T. O./Nel. Time travel with a twist. Time can be a real bi***. Ichigo finds that fact out when he accidentally goes back in time. Is this his second chance or is fate just screwing with him. Not a crack fic.IchixNelXHime.">
m = re.match(r"^(?:Chapter \d+ of a|A) (?:.*?) (?:- (?P<genres>.*?) )?(?:crossover )?(?:fan)?fiction(?:[ ]+with characters (?P<char1>.*?\.?)(?: & (?P<char2>.*?\.?))?\. )?",
# <meta name="description" content="Chapter 1 of a Harry Potter and Transformers - Humor/Adventure crossover fanfiction with characters: Harry P. & Ironhide. ITs one thing to be tossed thru the Veil for something he didnt do. It was quite another to wake in his animigus form in a world not his own. Harry just knew someone was laughing at him somewhere. Mech/Mech pairings inside..">
m = re.match(r"^(?:Chapter \d+ of a|A) (?:.*?) (?:- (?P<genres>.*?) )?(?:crossover )?(?:fan)?fiction(?P<chars>[ ]+with characters)?",
soup.find('meta',{'name':'description'})['content'])
if m != None:
genres=m.group('genres')
@ -225,7 +225,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
for g in genres.split('/'):
self.story.addToList('genre',g)
if m.group('char1') != None:
if m.group('chars') != None:
# At this point we've proven that there's character(s)
# We can't reliably parse characters out of meta name="description".
# There's no way to tell that "with characters Ichigo K. & Neliel T. O./Nel. " ends at "Nel.", not "T."
@ -233,12 +234,16 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
# metatext can take the form of:
# - English - Shinji H. - Updated: 01-13-12 - Published: 12-20-11 - id:7654123
# - English - Adventure/Angst - Ichigo K. & Neliel T. O./Nel - Reviews:
# - English - Humor/Adventure - Harry P. & Ironhide - Reviews:
mc = re.match(r" - (?P<lang>[^ ]+ - )(?P<genres>[^ ]+ - )? (?P<chars>.+?) - (Reviews|Updated|Published)",
metatext)
chars = mc.group("chars")
for c in chars.split(' & '):
self.story.addToList('characters',c)
m = re.match(r" - (?P<lang>[^ ]+)",metatext)
if m.group('lang') != None:
self.story.setMetadata('language',m.group('lang'))
return
def getChapterText(self, url):

View file

@ -90,6 +90,16 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
self.story.setMetadata('status','In-Progress')
else:
self.story.setMetadata('status','Completed')
langs = {
0:"English",
1:"Russian",
2:"French",
3:"German",
}
if idnum < 10:
self.story.setMetadata('language',langs[idnum%len(langs)])
# idnum of 10 or greater: no language is set.
self.setSeries('The Great Test',idnum)
@ -117,7 +127,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
('Chapter 4',self.url+"&chapter=5"),
('Chapter 5',self.url+"&chapter=6"),
('Chapter 6',self.url+"&chapter=6"),
# ('Chapter 7',self.url+"&chapter=6"),
('Chapter 7',self.url+"&chapter=6"),
# ('Chapter 8',self.url+"&chapter=6"),
# ('Chapter 9',self.url+"&chapter=6"),
# ('Chapter 0',self.url+"&chapter=6"),

View file

@ -19,6 +19,50 @@ import os, re
from htmlcleanup import conditionalRemoveEntities, removeAllEntities
# Language name -> language code lookup.
#
# The names are the ones listed by ffnet, which was the only
# multi-language site supported when this table was written.  The codes
# are taken largely from pycountry, with some corrections and guesses.
langs = {
    'English':    'en',
    'Spanish':    'es',
    'French':     'fr',
    'German':     'de',
    'Chinese':    'zh',
    'Japanese':   'ja',
    'Dutch':      'nl',
    'Portuguese': 'pt',
    'Russian':    'ru',
    'Italian':    'it',
    'Bulgarian':  'bg',
    'Polish':     'pl',
    'Hungarian':  'hu',
    'Hebrew':     'he',
    'Arabic':     'ar',
    'Swedish':    'sv',
    'Norwegian':  'no',
    'Danish':     'da',
    'Finnish':    'fi',
    'Filipino':   'fil',
    'Esperanto':  'eo',
    'Hindi':      'hi',
    'Punjabi':    'pa',
    'Farsi':      'fa',
    'Greek':      'el',
    'Romanian':   'ro',
    'Albanian':   'sq',
    'Serbian':    'sr',
    'Turkish':    'tr',
    'Czech':      'cs',
    'Indonesian': 'id',
    'Croatian':   'hr',
    'Catalan':    'ca',
    'Latin':      'la',
    'Korean':     'ko',
    'Vietnamese': 'vi',
    'Thai':       'th',
    # Shares its code with 'Hindi'.
    'Devanagari': 'hi',
}
class Story:
def __init__(self):
@ -33,6 +77,11 @@ class Story:
def setMetadata(self, key, value):
    """Store a metadata value; also derive 'langcode' for "language".

    The value is passed through conditionalRemoveEntities() and the
    result is stored in self.metadata[key].  When key is "language",
    the stored value is looked up in the module-level langs table and
    the result is stored in self.metadata['langcode'].  A language that
    is not in the table falls back to 'en'.
    """
    ## still keeps &lt; &lt; and &amp;
    self.metadata[key] = conditionalRemoveEntities(value)
    if key == "language":
        try:
            self.metadata['langcode'] = langs[self.metadata[key]]
        except (KeyError, TypeError):
            # KeyError: language name not in the table.
            # TypeError: stored value is unhashable and cannot be a key.
            # Only these lookup failures fall back to English; a bare
            # except here would also hide KeyboardInterrupt, SystemExit
            # and genuine programming errors.
            self.metadata['langcode'] = 'en'
def getMetadataRaw(self,key):
if self.metadata.has_key(key):
@ -111,7 +160,6 @@ class Story:
def setReplace(self,replace):
    """Parse replacement rules out of a multi-line string.

    Each line of replace that contains "=>" is split on "=>", every
    piece is stripped of surrounding whitespace, and the resulting list
    is appended to self.replacements.  Lines without "=>" are ignored.
    """
    for line in replace.splitlines():
        if "=>" in line:
            # Build a real list rather than using map(), so that the
            # stored rule is a list on Python 3 as well as Python 2.
            # No debug output is written to stdout while parsing.
            self.replacements.append([part.strip() for part in line.split("=>")])
def commaGroups(s):

View file

@ -52,6 +52,7 @@ class BaseStoryWriter(Configurable):
self.validEntries = [
'category',
'genre',
'language',
'characters',
'series',
'status',
@ -80,6 +81,7 @@ class BaseStoryWriter(Configurable):
self.titleLabels = {
'category':'Category',
'genre':'Genre',
'language':'Language',
'status':'Status',
'series':'Series',
'characters':'Characters',

View file

@ -203,7 +203,10 @@ class EpubWriter(BaseStoryWriter):
metadata.appendChild(newTag(contentdom,"dc:contributor",text="fanficdownloader [http://fanficdownloader.googlecode.com]",attrs={"opf:role":"bkp"}))
metadata.appendChild(newTag(contentdom,"dc:rights",text=""))
metadata.appendChild(newTag(contentdom,"dc:language",text="en"))
if self.story.getMetadata('langcode') != None:
metadata.appendChild(newTag(contentdom,"dc:language",text=self.story.getMetadata('langcode')))
else:
metadata.appendChild(newTag(contentdom,"dc:language",text='en'))
# published, created, updated, calibre
# Leave calling self.story.getMetadataRaw directly in case date format changes.
@ -399,4 +402,4 @@ def newTag(dom,name,attrs=None,text=None):
if( text is not None ):
tag.appendChild(dom.createTextNode(text))
return tag

View file

@ -41,6 +41,7 @@ formatext_label:File Extension
## Sometimes there are multiple categories and/or genres.
category_label:Category
genre_label:Genre
language_label:Language
characters_label:Characters
series_label:Series
## Completed/In-Progress
@ -72,7 +73,7 @@ version_label:FFDL Version
## items to include in the title page
## Empty entries will *not* appear, even if in the list.
## All current formats already include title and author.
titlepage_entries: series,category,genre,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description
titlepage_entries: series,category,genre,language,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description
## Try to collect series name and number of this story in series.
## Some sites (ab)use 'series' for reading lists and personal
@ -154,7 +155,7 @@ output_css:
[txt]
## Add URLs since there aren't links.
titlepage_entries: series,category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
titlepage_entries: series,category,genre,language,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
## use \r\n for line endings, the windows convention. text output only.
windows_eol: true