Adding Word Count post-processing option, like Smarten Punct.

This commit is contained in:
Jim Miller 2016-01-29 22:34:07 -06:00
parent 18fd7d3653
commit 784375d15e
8 changed files with 677 additions and 528 deletions

View file

@ -4,7 +4,7 @@ from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2015, Jim Miller'
__copyright__ = '2016, Jim Miller'
__docformat__ = 'restructuredtext en'
import logging
@ -81,8 +81,8 @@ no_trans = { 'pini':'personal.ini',
STD_COLS_SKIP = ['size','cover','news','ondevice','path','series_sort','sort']
from calibre_plugins.fanficfare_plugin.prefs \
import (prefs, PREFS_NAMESPACE, updatecalcover_order, calcover_save_options,
gencalcover_order, SAVE_YES, SAVE_NO)
import (prefs, PREFS_NAMESPACE, prefs_save_options, updatecalcover_order,
gencalcover_order, do_wordcount_order, SAVE_YES, SAVE_NO)
from calibre_plugins.fanficfare_plugin.dialogs \
import (UPDATE, UPDATEALWAYS, collision_order, save_collisions, RejectListDialog,
@ -261,6 +261,7 @@ class ConfigWidget(QWidget):
prefs['checkforurlchange'] = self.basic_tab.checkforurlchange.isChecked()
prefs['injectseries'] = self.basic_tab.injectseries.isChecked()
prefs['matchtitleauth'] = self.basic_tab.matchtitleauth.isChecked()
prefs['do_wordcount'] = prefs_save_options[unicode(self.basic_tab.do_wordcount.currentText())]
prefs['smarten_punctuation'] = self.basic_tab.smarten_punctuation.isChecked()
prefs['reject_always'] = self.basic_tab.reject_always.isChecked()
@ -286,10 +287,10 @@ class ConfigWidget(QWidget):
prefs['cal_cols_pass_in'] = self.personalini_tab.cal_cols_pass_in.isChecked()
# Covers tab
prefs['updatecalcover'] = calcover_save_options[unicode(self.calibrecover_tab.updatecalcover.currentText())]
prefs['updatecalcover'] = prefs_save_options[unicode(self.calibrecover_tab.updatecalcover.currentText())]
# for backward compatibility:
prefs['updatecover'] = prefs['updatecalcover'] == SAVE_YES
prefs['gencalcover'] = calcover_save_options[unicode(self.calibrecover_tab.gencalcover.currentText())]
prefs['gencalcover'] = prefs_save_options[unicode(self.calibrecover_tab.gencalcover.currentText())]
prefs['calibre_gen_cover'] = self.calibrecover_tab.calibre_gen_cover.isChecked()
prefs['plugin_gen_cover'] = self.calibrecover_tab.plugin_gen_cover.isChecked()
prefs['gcnewonly'] = self.calibrecover_tab.gcnewonly.isChecked()
@ -478,6 +479,10 @@ class BasicTab(QWidget):
self.lookforurlinhtml.setChecked(prefs['lookforurlinhtml'])
self.l.addWidget(self.lookforurlinhtml)
proc_gb = groupbox = QGroupBox(_("Post Processing Options"))
self.l = QVBoxLayout()
groupbox.setLayout(self.l)
self.mark = QCheckBox(_("Mark added/updated books when finished?"),self)
self.mark.setToolTip(_("Mark added/updated books when finished. Use with option below.\nYou can also manually search for 'marked:fff_success'.\n'marked:fff_failed' is also available, or search 'marked:fff' for both."))
self.mark.setChecked(prefs['mark'])
@ -493,6 +498,24 @@ class BasicTab(QWidget):
self.smarten_punctuation.setChecked(prefs['smarten_punctuation'])
self.l.addWidget(self.smarten_punctuation)
tooltip = _("Calculate Word Counts using Calibre internal methods.\n"
"Many sites include Word Count, but many do not.\n"
"This will count the words in each book and include it as if it came from the site.")
horz = QHBoxLayout()
label = QLabel(_('Calculate Word Count:'))
label.setToolTip(tooltip)
horz.addWidget(label)
self.do_wordcount = QComboBox(self)
for i in do_wordcount_order:
self.do_wordcount.addItem(i)
self.do_wordcount.setCurrentIndex(self.do_wordcount.findText(prefs_save_options[prefs['do_wordcount']]))
self.do_wordcount.setToolTip(tooltip)
label.setBuddy(self.do_wordcount)
horz.addWidget(self.do_wordcount)
self.l.addLayout(horz)
self.autoconvert = QCheckBox(_("Automatically Convert new/update books?"),self)
self.autoconvert.setToolTip(_("Automatically call calibre's Convert for new/update books.\nConverts to the current output format as chosen in calibre's\nPreferences->Behavior settings."))
self.autoconvert.setChecked(prefs['autoconvert'])
@ -564,14 +587,17 @@ class BasicTab(QWidget):
horz = QHBoxLayout()
horz.addWidget(cali_gb)
vertleft = QVBoxLayout()
vertleft.addWidget(cali_gb)
vertleft.addWidget(proc_gb)
vert = QVBoxLayout()
vert.addWidget(gui_gb)
vert.addWidget(misc_gb)
vert.addWidget(rej_gb)
vertright = QVBoxLayout()
vertright.addWidget(gui_gb)
vertright.addWidget(misc_gb)
vertright.addWidget(rej_gb)
horz.addLayout(vert)
horz.addLayout(vertleft)
horz.addLayout(vertright)
topl.addLayout(horz)
topl.insertStretch(-1)
@ -840,11 +866,11 @@ class CalibreCoverTab(QWidget):
self.updatecalcover.addItem(i)
# back compat. If has own value, use.
if prefs['updatecalcover']:
self.updatecalcover.setCurrentIndex(self.updatecalcover.findText(calcover_save_options[prefs['updatecalcover']]))
self.updatecalcover.setCurrentIndex(self.updatecalcover.findText(prefs_save_options[prefs['updatecalcover']]))
elif prefs['updatecover']: # doesn't have own val, set YES if old value set.
self.updatecalcover.setCurrentIndex(self.updatecalcover.findText(calcover_save_options[SAVE_YES]))
self.updatecalcover.setCurrentIndex(self.updatecalcover.findText(prefs_save_options[SAVE_YES]))
else: # doesn't have own value, old value not set, NO.
self.updatecalcover.setCurrentIndex(self.updatecalcover.findText(calcover_save_options[SAVE_NO]))
self.updatecalcover.setCurrentIndex(self.updatecalcover.findText(prefs_save_options[SAVE_NO]))
self.updatecalcover.setToolTip(tooltip)
label.setBuddy(self.updatecalcover)
horz.addWidget(self.updatecalcover)
@ -862,11 +888,11 @@ class CalibreCoverTab(QWidget):
self.gencalcover.addItem(i)
# back compat. If has own value, use.
# if prefs['gencalcover']:
self.gencalcover.setCurrentIndex(self.gencalcover.findText(calcover_save_options[prefs['gencalcover']]))
self.gencalcover.setCurrentIndex(self.gencalcover.findText(prefs_save_options[prefs['gencalcover']]))
# elif prefs['gencover']: # doesn't have own val, set YES if old value set.
# self.gencalcover.setCurrentIndex(self.gencalcover.findText(calcover_save_options[SAVE_YES]))
# self.gencalcover.setCurrentIndex(self.gencalcover.findText(prefs_save_options[SAVE_YES]))
# else: # doesn't have own value, old value not set, NO.
# self.gencalcover.setCurrentIndex(self.gencalcover.findText(calcover_save_options[SAVE_NO]))
# self.gencalcover.setCurrentIndex(self.gencalcover.findText(prefs_save_options[SAVE_NO]))
self.gencalcover.setToolTip(tooltip)
label.setBuddy(self.gencalcover)
@ -990,7 +1016,7 @@ class CalibreCoverTab(QWidget):
## First, cover gen on/off
for e in self.gencov_elements:
e.setEnabled(calcover_save_options[unicode(self.gencalcover.currentText())] != SAVE_NO)
e.setEnabled(prefs_save_options[unicode(self.gencalcover.currentText())] != SAVE_NO)
# next, disable plugin settings when using calibre gen cov.
if not self.plugin_gen_cover.isChecked():

View file

@ -4,7 +4,7 @@ from __future__ import (unicode_literals, division,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2015, Jim Miller'
__copyright__ = '2016, Jim Miller'
__docformat__ = 'restructuredtext en'
import traceback, re
@ -73,55 +73,30 @@ from calibre_plugins.fanficfare_plugin.fanficfare.configurable \
from inihighlighter import IniHighlighter
SKIP=_('Skip')
ADDNEW=_('Add New Book')
UPDATE=_('Update EPUB if New Chapters')
UPDATEALWAYS=_('Update EPUB Always')
OVERWRITE=_('Overwrite if Newer')
OVERWRITEALWAYS=_('Overwrite Always')
CALIBREONLY=_('Update Calibre Metadata from Web Site')
CALIBREONLYSAVECOL=_('Update Calibre Metadata from Saved Metadata Column')
collision_order=[SKIP,
ADDNEW,
UPDATE,
UPDATEALWAYS,
OVERWRITE,
OVERWRITEALWAYS,
CALIBREONLY,
CALIBREONLYSAVECOL,]
# best idea I've had for how to deal with config/pref saving the
# collision name in english.
SAVE_SKIP='Skip'
SAVE_ADDNEW='Add New Book'
SAVE_UPDATE='Update EPUB if New Chapters'
SAVE_UPDATEALWAYS='Update EPUB Always'
SAVE_OVERWRITE='Overwrite if Newer'
SAVE_OVERWRITEALWAYS='Overwrite Always'
SAVE_CALIBREONLY='Update Calibre Metadata Only'
SAVE_CALIBREONLYSAVECOL='Update Calibre Metadata Only(Saved Column)'
save_collisions={
SKIP:SAVE_SKIP,
ADDNEW:SAVE_ADDNEW,
UPDATE:SAVE_UPDATE,
UPDATEALWAYS:SAVE_UPDATEALWAYS,
OVERWRITE:SAVE_OVERWRITE,
OVERWRITEALWAYS:SAVE_OVERWRITEALWAYS,
CALIBREONLY:SAVE_CALIBREONLY,
CALIBREONLYSAVECOL:SAVE_CALIBREONLYSAVECOL,
SAVE_SKIP:SKIP,
SAVE_ADDNEW:ADDNEW,
SAVE_UPDATE:UPDATE,
SAVE_UPDATEALWAYS:UPDATEALWAYS,
SAVE_OVERWRITE:OVERWRITE,
SAVE_OVERWRITEALWAYS:OVERWRITEALWAYS,
SAVE_CALIBREONLY:CALIBREONLY,
SAVE_CALIBREONLYSAVECOL:CALIBREONLYSAVECOL,
}
anthology_collision_order=[UPDATE,
UPDATEALWAYS,
OVERWRITEALWAYS]
## moved to prefs.py so they can be included in jobs.py.
from calibre_plugins.fanficfare_plugin.prefs import \
( SAVE_YES,
SAVE_YES_UNLESS_SITE,
SKIP,
ADDNEW,
UPDATE,
UPDATEALWAYS,
OVERWRITE,
OVERWRITEALWAYS,
CALIBREONLY,
CALIBREONLYSAVECOL,
collision_order,
SAVE_SKIP,
SAVE_ADDNEW,
SAVE_UPDATE,
SAVE_UPDATEALWAYS,
SAVE_OVERWRITE,
SAVE_OVERWRITEALWAYS,
SAVE_CALIBREONLY,
SAVE_CALIBREONLYSAVECOL,
save_collisions,
anthology_collision_order,
)
gpstyle='QGroupBox {border:0; padding-top:10px; padding-bottom:0px; margin-bottom:0px;}' # background-color:red;
@ -473,8 +448,9 @@ class AddNewDialog(SizePersistedDialog):
'updatemeta': self.updatemeta.isChecked(),
'bgmeta': False, # self.bgmeta.isChecked(),
'updateepubcover': self.updateepubcover.isChecked(),
'smarten_punctuation':self.prefs['smarten_punctuation']
}
'smarten_punctuation':self.prefs['smarten_punctuation'],
'do_wordcount':self.prefs['do_wordcount'],
}
if self.merge:
retval['fileform']=='epub'
@ -898,7 +874,8 @@ class UpdateExistingDialog(SizePersistedDialog):
'updatemeta': self.updatemeta.isChecked(),
'bgmeta': self.bgmeta.isChecked(),
'updateepubcover': self.updateepubcover.isChecked(),
'smarten_punctuation':self.prefs['smarten_punctuation']
'smarten_punctuation':self.prefs['smarten_punctuation'],
'do_wordcount':self.prefs['do_wordcount'],
}
class StoryListTableWidget(QTableWidget):

View file

@ -4,7 +4,7 @@ from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2015, Jim Miller'
__copyright__ = '2016, Jim Miller'
__docformat__ = 'restructuredtext en'
import logging
@ -476,7 +476,8 @@ class FanFicFarePlugin(InterfaceAction):
'updatemeta': prefs['updatemeta'],
'bgmeta': False,
'updateepubcover': prefs['updateepubcover'],
'smarten_punctuation':prefs['smarten_punctuation']
'smarten_punctuation':prefs['smarten_punctuation'],
'do_wordcount':prefs['do_wordcount'],
},"\n".join(url_list))
else:
self.gui.status_bar.show_message(_('Finished Fetching Story URLs from Email.'),3000)

View file

@ -4,7 +4,7 @@ from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2015, Jim Miller, 2011, Grant Drake <grant.drake@gmail.com>'
__copyright__ = '2016, Jim Miller, 2011, Grant Drake <grant.drake@gmail.com>'
__docformat__ = 'restructuredtext en'
import logging
@ -20,6 +20,9 @@ from calibre.constants import numeric_version as calibre_version
from calibre.utils.date import local_tz
from calibre.library.comments import sanitize_comments_html
from calibre_plugins.fanficfare_plugin.wordcount import get_word_count
from calibre_plugins.fanficfare_plugin.prefs import (SAVE_YES, SAVE_YES_UNLESS_SITE)
# ------------------------------------------------------------------------------
#
# Functions to perform downloads using worker jobs
@ -148,7 +151,7 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
adapter.setChaptersRange(book['begin'],book['end'])
adapter.load_cookiejar(options['cookiejarfile'])
logger.debug("cookiejar:%s"%adapter.cookiejar)
#logger.debug("cookiejar:%s"%adapter.cookiejar)
adapter.set_pagecache(options['pagecache'])
story = adapter.getStoryMetadataOnly()
@ -217,6 +220,7 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
logger.info("write to %s"%outfile)
inject_cal_cols(book,story,configuration)
writer.writeStory(outfilename=outfile, forceOverwrite=True)
book['comment'] = 'Download %s completed, %s chapters.'%(options['fileform'],story.getMetadata("numChapters"))
book['all_metadata'] = story.getAllMetadata(removeallentities=True)
if options['savemetacol'] != '':
@ -271,6 +275,16 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
if options['savemetacol'] != '':
book['savemetacol'] = story.dump_html_metadata()
if options['do_wordcount'] == SAVE_YES or (
options['do_wordcount'] == SAVE_YES_UNLESS_SITE and not story.getMetadataRaw('numWords') ):
wordcount = get_word_count(outfile)
logger.info("get_word_count:%s"%wordcount)
story.setMetadata('numWords',wordcount)
writer.writeStory(outfilename=outfile, forceOverwrite=True)
book['all_metadata'] = story.getAllMetadata(removeallentities=True)
if options['savemetacol'] != '':
book['savemetacol'] = story.dump_html_metadata()
if options['smarten_punctuation'] and options['fileform'] == "epub" \
and calibre_version >= (0, 9, 39):
# for smarten punc
@ -286,8 +300,7 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
opts = O(**opts)
log = Log(level=Log.DEBUG)
# report = []
polish({outfile:outfile}, opts, log, logger.info) # report.append
polish({outfile:outfile}, opts, log, logger.info)
except NotGoingToDownload as d:
book['good']=False

View file

@ -4,7 +4,7 @@ from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2015, Jim Miller'
__copyright__ = '2016, Jim Miller'
__docformat__ = 'restructuredtext en'
import logging
@ -15,9 +15,59 @@ import copy
from calibre.utils.config import JSONConfig
from calibre.gui2.ui import get_gui
from calibre_plugins.fanficfare_plugin.dialogs import SAVE_UPDATE
from calibre_plugins.fanficfare_plugin.common_utils import get_library_uuid
SKIP=_('Skip')
ADDNEW=_('Add New Book')
UPDATE=_('Update EPUB if New Chapters')
UPDATEALWAYS=_('Update EPUB Always')
OVERWRITE=_('Overwrite if Newer')
OVERWRITEALWAYS=_('Overwrite Always')
CALIBREONLY=_('Update Calibre Metadata from Web Site')
CALIBREONLYSAVECOL=_('Update Calibre Metadata from Saved Metadata Column')
collision_order=[SKIP,
ADDNEW,
UPDATE,
UPDATEALWAYS,
OVERWRITE,
OVERWRITEALWAYS,
CALIBREONLY,
CALIBREONLYSAVECOL,]
# best idea I've had for how to deal with config/pref saving the
# collision name in english.
SAVE_SKIP='Skip'
SAVE_ADDNEW='Add New Book'
SAVE_UPDATE='Update EPUB if New Chapters'
SAVE_UPDATEALWAYS='Update EPUB Always'
SAVE_OVERWRITE='Overwrite if Newer'
SAVE_OVERWRITEALWAYS='Overwrite Always'
SAVE_CALIBREONLY='Update Calibre Metadata Only'
SAVE_CALIBREONLYSAVECOL='Update Calibre Metadata Only(Saved Column)'
save_collisions={
SKIP:SAVE_SKIP,
ADDNEW:SAVE_ADDNEW,
UPDATE:SAVE_UPDATE,
UPDATEALWAYS:SAVE_UPDATEALWAYS,
OVERWRITE:SAVE_OVERWRITE,
OVERWRITEALWAYS:SAVE_OVERWRITEALWAYS,
CALIBREONLY:SAVE_CALIBREONLY,
CALIBREONLYSAVECOL:SAVE_CALIBREONLYSAVECOL,
SAVE_SKIP:SKIP,
SAVE_ADDNEW:ADDNEW,
SAVE_UPDATE:UPDATE,
SAVE_UPDATEALWAYS:UPDATEALWAYS,
SAVE_OVERWRITE:OVERWRITE,
SAVE_OVERWRITEALWAYS:OVERWRITEALWAYS,
SAVE_CALIBREONLY:CALIBREONLY,
SAVE_CALIBREONLYSAVECOL:CALIBREONLYSAVECOL,
}
anthology_collision_order=[UPDATE,
UPDATEALWAYS,
OVERWRITEALWAYS]
# Show translated strings, but save the same string in prefs so your
# prefs are the same in different languages.
YES=_('Yes, Always')
@ -26,9 +76,11 @@ YES_IF_IMG=_('Yes, if EPUB has a cover image')
SAVE_YES_IF_IMG='Yes, if img'
YES_UNLESS_IMG=_('Yes, unless FanFicFare found a cover image')
SAVE_YES_UNLESS_IMG='Yes, unless img'
YES_UNLESS_SITE=_('Yes, unless found on site')
SAVE_YES_UNLESS_SITE='Yes, unless site'
NO=_('No')
SAVE_NO='No'
calcover_save_options = {
prefs_save_options = {
YES:SAVE_YES,
SAVE_YES:YES,
YES_IF_IMG:SAVE_YES_IF_IMG,
@ -37,9 +89,12 @@ calcover_save_options = {
SAVE_YES_UNLESS_IMG:YES_UNLESS_IMG,
NO:SAVE_NO,
SAVE_NO:NO,
YES_UNLESS_SITE:SAVE_YES_UNLESS_SITE,
SAVE_YES_UNLESS_SITE:YES_UNLESS_SITE,
}
updatecalcover_order=[YES,YES_IF_IMG,NO]
gencalcover_order=[YES,YES_UNLESS_IMG,NO]
do_wordcount_order=[YES,YES_UNLESS_SITE,NO]
# if don't have any settings for FanFicFarePlugin, copy from
# predecessor FanFictionDownLoaderPlugin.
@ -78,6 +133,7 @@ default_prefs['checkforseriesurlid'] = True
default_prefs['checkforurlchange'] = True
default_prefs['injectseries'] = False
default_prefs['matchtitleauth'] = True
default_prefs['do_wordcount'] = SAVE_YES_UNLESS_SITE
default_prefs['smarten_punctuation'] = False
default_prefs['show_est_time'] = False

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,95 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2016, Jim Miller, 2011, Grant Drake <grant.drake@gmail.com>'
__docformat__ = 'restructuredtext en'
'''
A lot of this is lifted from Count Pages plugin by Grant Drake (with
some changes from davidfor.)
'''
import logging
logger = logging.getLogger(__name__)
import re
from calibre.ebooks.oeb.iterator import EbookIterator
# Captures everything between the opening <body ...> tag and the last
# </body> tag.  DOTALL lets the capture span line breaks and IGNORECASE
# accepts any capitalisation of the tag name.
RE_HTML_BODY = re.compile(u'<body[^>]*>(.*)</body>', re.UNICODE | re.DOTALL | re.IGNORECASE)
# Matches one markup tag: '<', one or more characters that are not '>', '>'.
RE_STRIP_MARKUP = re.compile(u'<[^>]+>', re.UNICODE)
def get_word_count(book_path):
    '''
    Estimate the word count of the EPUB file at book_path.

    The language handed to the counter is the one recorded in the book's
    OPF; when that is empty, the value of calibre's get_lang() is used
    instead.

    Returns the value produced by _get_epub_standard_word_count().
    '''
    from calibre.utils.localization import get_lang

    iterator = _open_epub_file(book_path)
    try:
        lang = iterator.opf.language
        if not lang:
            lang = get_lang()
        return _get_epub_standard_word_count(iterator, lang)
    finally:
        # _open_epub_file() calls iterator.__enter__() by hand, so nothing
        # pairs it with __exit__().  Do that here so the iterator can
        # release whatever it acquired, even when counting raises.
        iterator.__exit__(None, None, None)
def _open_epub_file(book_path, strip_html=False):
    '''
    Given a path to an EPUB file, create an EbookIterator for it, enter it
    and return the entered iterator.

    strip_html is accepted but is not used by this function.
    '''
    book_iterator = EbookIterator(book_path)
    book_iterator.__enter__(
        read_anchor_map=False,
        run_char_count=True,
        only_input_plugin=True,
    )
    return book_iterator
def _get_epub_standard_word_count(iterator, lang='en'):
    '''
    This algorithm counts individual words instead of pages.

    iterator -- entered book iterator, as returned by _open_epub_file()
    lang     -- language code passed to calibre's word-breaking functions

    Three counting methods are tried, newest first.  A method is only used
    when the one before it could not be imported or raised while running.
    An error from the last method is not caught and reaches the caller.
    '''
    book_text = _read_epub_contents(iterator, strip_html=True)

    try:
        from calibre.spell.break_iterator import count_words
        wordcount = count_words(book_text, lang)
        logger.debug('\tWord count - count_words method:%s'%wordcount)
    except Exception:
        # 'except Exception' rather than a bare 'except' so that
        # SystemExit / KeyboardInterrupt are not swallowed by the fallback.
        logger.debug('\tcount_words method not usable, trying older method', exc_info=True)
        try: # The above method is new and no-one will have it as of 08/01/2016. Use an older method for a beta.
            from calibre.spell.break_iterator import split_into_words_and_positions
            wordcount = len(split_into_words_and_positions(book_text, lang))
            logger.debug('\tWord count - split_into_words_and_positions method:%s'%wordcount)
        except Exception:
            logger.debug('\tsplit_into_words_and_positions method not usable, trying oldest method', exc_info=True)
            from calibre.utils.wordcount import get_wordcount_obj
            wordcount = get_wordcount_obj(book_text)
            wordcount = wordcount.words
            logger.debug('\tWord count - old method:%s'%wordcount)

    return wordcount
def _read_epub_contents(iterator, strip_html=False):
'''
Given an iterator for an ePub file, read the contents into a giant block of text
'''
book_files = []
for path in iterator.spine:
with open(path, 'rb') as f:
html = f.read().decode('utf-8', 'replace')
if strip_html:
html = unicode(_extract_body_text(html)).strip()
#print('FOUND HTML:', html)
book_files.append(html)
return ''.join(book_files)
def _extract_body_text(data):
    '''
    Get the body text of this html content with any html tags stripped.

    A space is inserted after every '.' in the result.  Returns '' when
    data has no <body>...</body> section.
    '''
    found = RE_HTML_BODY.search(data)
    if found is None:
        return ''
    inner_html = found.group(1)
    text_only = RE_STRIP_MARKUP.sub('', inner_html)
    return text_only.replace('.','. ')

View file

@ -646,10 +646,8 @@ class Story(Configurable):
elif self.metadata.has_key(key):
value = self.metadata[key]
if value:
if key == "numWords":
value = commaGroups(value)
if key == "numChapters":
value = commaGroups("%d"%value)
if key in ("numWords","numChapters"):
value = commaGroups(unicode(value))
if key in ("dateCreated"):
value = value.strftime(self.getConfig(key+"_format","%Y-%m-%d %H:%M:%S"))
if key in ("datePublished","dateUpdated"):