Fix character parsing in the ffnet adapter; add language metadata (currently ffnet only).

2026-01-02 06:04:56 +01:00 · 2012-02-14 22:31:13 -06:00 · 2012-02-14 22:31:13 -06:00 · 7cf76b7a47
commit 7cf76b7a47
parent b69ede76bb
9 changed files with 94 additions and 54 deletions
--- a/calibre-plugin/config.py
+++ b/calibre-plugin/config.py
@ -445,6 +445,7 @@ permitted_values = {
    'series' : ['series'],
    'enumeration' : ['category',
                     'genre',
+                     'language',
                     'series',
                     'characters',
                     'status',
@ -477,6 +478,7 @@ permitted_values['comments'] = permitted_values['enumeration']
 titleLabels = {
    'category':'Category',
    'genre':'Genre',
+    'language':'Language',
    'status':'Status',
    'status-C':'Status:Completed',
    'status-I':'Status:In-Progress',
--- a/calibre-plugin/ffdl_plugin.py
+++ b/calibre-plugin/ffdl_plugin.py
@ -52,9 +52,6 @@ formmapping = {

 PLUGIN_ICONS = ['images/icon.png']

-sendlists = ["Send to Nook", "Send to Kindle", "Send to Droid", "Add to Nook", "Add to Kindle", "Add to Droid"]
-readlists = ["000"]
-
 class FanFictionDownLoaderPlugin(InterfaceAction):

    name = 'FanFictionDownLoader'
@ -742,11 +739,16 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
            if len(filter( lambda x : not x.startswith("Last Update"), mi.tags)) > 0:
                old_tags = filter( lambda x : not x.startswith("Last Update"), old_tags)
            # mi.tags needs to be list, but set kills dups.
-            mi.tags = list(set(list(old_tags)+mi.tags)) 
-        # Set language english, but only if not already set.
-        oldmi = db.get_metadata(book_id,index_is_id=True)
-        if not oldmi.languages:
-            mi.languages=['eng']
+            mi.tags = list(set(list(old_tags)+mi.tags))
+
+        if 'langcode' in book['all_metadata']:
+            mi.languages=[book['all_metadata']['langcode']]
+        else:
+            # Default the language to English, but only if none is already set.
+            oldmi = db.get_metadata(book_id,index_is_id=True)
+            if not oldmi.languages:
+                mi.languages=['eng']
+                
        db.set_metadata(book_id,mi)

        # do configured column updates here.
@ -971,40 +973,6 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
        except:
            return None;

-
-
-def get_job_details(job):
-    '''
-    Convert the job result into a set of parameters including a detail message
-    summarising the success of the extraction operation.
-    This is used by both the threaded and worker approaches to extraction
-    '''
-    extracted_ids, same_isbn_ids, failed_ids, no_format_ids = job.result
-    if not hasattr(job, 'html_details'):
-        job.html_details = job.details
-    det_msg = []
-    for i, title in failed_ids:
-        if i in no_format_ids:
-            msg = title + ' (No formats)'
-        else:
-            msg = title + ' (ISBN not found)'
-        det_msg.append(msg)
-    if same_isbn_ids:
-        if det_msg:
-            det_msg.append('----------------------------------')
-        for i, title in same_isbn_ids:
-            msg = title + ' (Same ISBN)'
-            det_msg.append(msg)
-    if len(extracted_ids) > 0:
-        if det_msg:
-            det_msg.append('----------------------------------')
-        for i, title, last_modified, isbn in extracted_ids:
-            msg = '%s (Extracted %s)'%(title, isbn)
-            det_msg.append(msg)
-
-    det_msg = '\n'.join(det_msg)
-    return extracted_ids, same_isbn_ids, failed_ids, det_msg
-
 def get_url_list(urls):
    def f(x):
        if x.strip(): return True
--- a/defaults.ini
+++ b/defaults.ini
@ -36,6 +36,7 @@ formatext_label:File Extension
 ## Sometimes there are multiple categories and/or genres.
 category_label:Category
 genre_label:Genre
+language_label:Language
 characters_label:Characters
 series_label:Series
 ## Completed/In-Progress
@ -67,7 +68,7 @@ version_label:FFDL Version
 ## items to include in the title page
 ## Empty entries will *not* appear, even if in the list.
 ## All current formats already include title and author.
-titlepage_entries: series,category,genre,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description
+titlepage_entries: series,category,genre,language,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description

 ## Try to collect series name and number of this story in series.
 ## Some sites (ab)use 'series' for reading lists and personal
@ -176,7 +177,7 @@ output_css:

 [txt]
 ## Add URLs since there aren't links.
-titlepage_entries: series,category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
+titlepage_entries: series,category,genre,language,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description

 ## use \r\n for line endings, the windows convention.  text output only.
 windows_eol: true
--- a/fanficdownloader/adapters/adapter_fanfictionnet.py
+++ b/fanficdownloader/adapters/adapter_fanfictionnet.py
@ -201,7 +201,6 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
        metatext = a.findNext(text=re.compile(r' - Reviews:'))
        if metatext == None: # indicates there's no Reviews, look for id: instead.
            metatext = a.findNext(text=re.compile(r' - id:'))
-        #print("========= metatext:\n%s"%metatext)

        # after Rating, the same bit of text containing id:123456 contains
        # Complete--if completed.
@ -215,7 +214,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
        # <meta name="description" content="Chapter 1 of a Transformers/Beast Wars  - Adventure/Friendship fanfiction with characters Bumblebee. TFA: What would you do if you was being abused all you life? Follow NightRunner as she goes through her spark breaking adventure of getting away from her father..">
        # (fp)<meta name="description" content="Chapter 1 of a Sci-Fi  - Adventure/Humor fiction. Felix Max was just your regular hyperactive kid until he accidently caused his own fathers death. Now he has meta-humans trying to hunt him down with a corrupt goverment to back them up. Oh, and did I mention he has no Powers yet?.">
        # <meta name="description" content="Chapter 1 of a Bleach  - Adventure/Angst fanfiction with characters Ichigo K. & Neliel T. O./Nel. Time travel with a twist. Time can be a real bi***. Ichigo finds that fact out when he accidentally goes back in time. Is this his second chance or is fate just screwing with him. Not a crack fic.IchixNelXHime.">
-        m = re.match(r"^(?:Chapter \d+ of a|A) (?:.*?)  (?:- (?P<genres>.*?) )?(?:crossover )?(?:fan)?fiction(?:[ ]+with characters (?P<char1>.*?\.?)(?: & (?P<char2>.*?\.?))?\. )?",
+        # <meta name="description" content="Chapter 1 of a Harry Potter and Transformers  - Humor/Adventure crossover fanfiction  with characters: Harry P. & Ironhide. IT’s one thing to be tossed thru the Veil for something he didn’t do. It was quite another to wake in his animigus form in a world not his own. Harry just knew someone was laughing at him somewhere. Mech/Mech pairings inside..">
+        m = re.match(r"^(?:Chapter \d+ of a|A) (?:.*?)  (?:- (?P<genres>.*?) )?(?:crossover )?(?:fan)?fiction(?P<chars>[ ]+with characters)?",
                     soup.find('meta',{'name':'description'})['content'])
        if m != None:
            genres=m.group('genres')
@ -225,7 +225,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
                for g in genres.split('/'):
                    self.story.addToList('genre',g)

-            if m.group('char1') != None:
+            if m.group('chars') != None:
+
                # At this point we've proven that there's character(s)
                # We can't reliably parse characters out of meta name="description".
                # There's no way to tell that "with characters Ichigo K. & Neliel T. O./Nel. " ends at "Nel.", not "T."
@ -233,12 +234,16 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
                # reviewstext can take form of:
                # - English -  Shinji H. - Updated: 01-13-12 - Published: 12-20-11 - id:7654123
                # - English - Adventure/Angst -  Ichigo K. & Neliel T. O./Nel - Reviews:
+                # - English - Humor/Adventure -  Harry P. & Ironhide - Reviews:
                mc = re.match(r" - (?P<lang>[^ ]+ - )(?P<genres>[^ ]+ - )? (?P<chars>.+?) - (Reviews|Updated|Published)",
                              metatext)
                chars = mc.group("chars")
                for c in chars.split(' & '):
                    self.story.addToList('characters',c)
-        
+        m = re.match(r" - (?P<lang>[^ ]+)",metatext)
+        if m.group('lang') != None:
+            self.story.setMetadata('language',m.group('lang'))
+                    
        return

    def getChapterText(self, url):
--- a/fanficdownloader/adapters/adapter_test1.py
+++ b/fanficdownloader/adapters/adapter_test1.py
@ -90,6 +90,16 @@ Some more longer description.  "I suck at summaries!"  "Better than it sounds!"
            self.story.setMetadata('status','In-Progress')
        else:
            self.story.setMetadata('status','Completed')
+
+        langs = {
+            0:"English",
+            1:"Russian",
+            2:"French",
+            3:"German",
+            }
+        if idnum < 10:
+            self.story.setMetadata('language',langs[idnum%len(langs)])
+        # idnum of 10 or greater: no language is set.
            
        self.setSeries('The Great Test',idnum)
        
@ -117,7 +127,7 @@ Some more longer description.  "I suck at summaries!"  "Better than it sounds!"
                            ('Chapter 4',self.url+"&chapter=5"),
                            ('Chapter 5',self.url+"&chapter=6"),
                            ('Chapter 6',self.url+"&chapter=6"),
-                            # ('Chapter 7',self.url+"&chapter=6"),
+                            ('Chapter 7',self.url+"&chapter=6"),
                            # ('Chapter 8',self.url+"&chapter=6"),
                            # ('Chapter 9',self.url+"&chapter=6"),
                            # ('Chapter 0',self.url+"&chapter=6"),
--- a/fanficdownloader/story.py
+++ b/fanficdownloader/story.py
@ -19,6 +19,50 @@ import os, re

 from htmlcleanup import conditionalRemoveEntities, removeAllEntities

+# The list comes from ffnet, the only multi-language site we support
+# at the time of writing.  Values are taken largely from pycountry,
+# but with some corrections and guesses.
+langs = {
+    "English":"en",
+    "Spanish":"es",
+    "French":"fr",
+    "German":"de",
+    "Chinese":"zh",
+    "Japanese":"ja",
+    "Dutch":"nl",
+    "Portuguese":"pt",
+    "Russian":"ru",
+    "Italian":"it",
+    "Bulgarian":"bg",
+    "Polish":"pl",
+    "Hungarian":"hu",
+    "Hebrew":"he",
+    "Arabic":"ar",
+    "Swedish":"sv",
+    "Norwegian":"no",
+    "Danish":"da",
+    "Finnish":"fi",
+    "Filipino":"fil",
+    "Esperanto":"eo",
+    "Hindi":"hi",
+    "Punjabi":"pa",
+    "Farsi":"fa",
+    "Greek":"el",
+    "Romanian":"ro",
+    "Albanian":"sq",
+    "Serbian":"sr",
+    "Turkish":"tr",
+    "Czech":"cs",
+    "Indonesian":"id",
+    "Croatian":"hr",
+    "Catalan":"ca",
+    "Latin":"la",
+    "Korean":"ko",
+    "Vietnamese":"vi",
+    "Thai":"th",
+    "Devanagari":"hi",
+    }
+
 class Story:
    
    def __init__(self):
@ -33,6 +77,11 @@ class Story:
    def setMetadata(self, key, value):
        ## still keeps &lt; &lt; and &amp;
        self.metadata[key]=conditionalRemoveEntities(value)
+        if key == "language":
+            try:
+                self.metadata['langcode'] = langs[self.metadata[key]]
+            except:
+                self.metadata['langcode'] = 'en'

    def getMetadataRaw(self,key):
        if self.metadata.has_key(key):
@ -111,7 +160,6 @@ class Story:
    def setReplace(self,replace):
        for line in replace.splitlines():
            if "=>" in line:
-                print("line:%s"%line)
                self.replacements.append(map( lambda x: x.strip(), line.split("=>") ))
    
 def commaGroups(s):
--- a/fanficdownloader/writers/base_writer.py
+++ b/fanficdownloader/writers/base_writer.py
@ -52,6 +52,7 @@ class BaseStoryWriter(Configurable):
        self.validEntries = [
            'category',
            'genre',
+            'language',
            'characters',
            'series',
            'status',
@ -80,6 +81,7 @@ class BaseStoryWriter(Configurable):
        self.titleLabels = {
            'category':'Category',
            'genre':'Genre',
+            'language':'Language',
            'status':'Status',
            'series':'Series',
            'characters':'Characters',
--- a/fanficdownloader/writers/writer_epub.py
+++ b/fanficdownloader/writers/writer_epub.py
@ -203,7 +203,10 @@ class EpubWriter(BaseStoryWriter):

        metadata.appendChild(newTag(contentdom,"dc:contributor",text="fanficdownloader [http://fanficdownloader.googlecode.com]",attrs={"opf:role":"bkp"}))
        metadata.appendChild(newTag(contentdom,"dc:rights",text=""))
-        metadata.appendChild(newTag(contentdom,"dc:language",text="en"))
+        if self.story.getMetadata('langcode') != None:
+            metadata.appendChild(newTag(contentdom,"dc:language",text=self.story.getMetadata('langcode')))
+        else:
+            metadata.appendChild(newTag(contentdom,"dc:language",text='en'))

        #  published, created, updated, calibre
        #  Leave calling self.story.getMetadataRaw directly in case date format changes.
@ -399,4 +402,4 @@ def newTag(dom,name,attrs=None,text=None):
    if( text is not None ):
        tag.appendChild(dom.createTextNode(text))
    return tag
-    
+
--- a/plugin-defaults.ini
+++ b/plugin-defaults.ini
@ -41,6 +41,7 @@ formatext_label:File Extension
 ## Sometimes there are multiple categories and/or genres.
 category_label:Category
 genre_label:Genre
+language_label:Language
 characters_label:Characters
 series_label:Series
 ## Completed/In-Progress
@ -72,7 +73,7 @@ version_label:FFDL Version
 ## items to include in the title page
 ## Empty entries will *not* appear, even if in the list.
 ## All current formats already include title and author.
-titlepage_entries: series,category,genre,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description
+titlepage_entries: series,category,genre,language,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description

 ## Try to collect series name and number of this story in series.
 ## Some sites (ab)use 'series' for reading lists and personal
@ -154,7 +155,7 @@ output_css:

 [txt]
 ## Add URLs since there aren't links.
-titlepage_entries: series,category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
+titlepage_entries: series,category,genre,language,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description

 ## use \r\n for line endings, the windows convention.  text output only.
 windows_eol: true