This commit is contained in:
Kovid Goyal 2018-09-11 10:54:01 +05:30
parent 70a131f04c
commit f91b9c8e51
No known key found for this signature in database
GPG key ID: 06BC317B515ACE7C
79 changed files with 194 additions and 160 deletions

View file

@ -162,7 +162,7 @@ def should_skip_article(self, soup):
def scrape_article_date(self, soup): def scrape_article_date(self, soup):
for span in soup.findAll('span'): for span in soup.findAll('span'):
txt = self.text(span) txt = self.text(span)
rgx = re.compile('Posted ([a-zA-Z]+ \d\d?, \d\d\d\d).*') rgx = re.compile(unicode(r'Posted ([a-zA-Z]+ \d\d?, \d\d\d\d).*'))
hit = rgx.match(txt) hit = rgx.match(txt)
if hit: if hit:
return self.date_from_string(txt) return self.date_from_string(txt)

View file

@ -106,5 +106,5 @@ class AppledailyTW(BasicNewsRecipe):
def preprocess_raw_html(self, raw_html, url): def preprocess_raw_html(self, raw_html, url):
raw_html = re.sub(unicode(r'<a href=".*?<br><br>.*?<\/a>'), '', raw_html) raw_html = re.sub(unicode(r'<a href=".*?<br><br>.*?<\/a>'), '', raw_html)
raw_html = re.sub( raw_html = re.sub(
unicode(r'<title>(.*?)[\s]+\|.*<\/title>', '<title>\1<\/title>'), raw_html) unicode(r'<title>(.*?)[\\s]+\|.*<\/title>', r'<title>\1<\/title>'), raw_html)
return raw_html return raw_html

View file

@ -78,7 +78,7 @@ def get_cover_url(self):
'http://cdn.images.express.co.uk/img/covers/')}) 'http://cdn.images.express.co.uk/img/covers/')})
cov = str(cov) cov = str(cov)
cov2 = re.findall( cov2 = re.findall(
'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov) 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
cov = str(cov2) cov = str(cov2)
cov = cov[2:len(cov) - 2] cov = cov[2:len(cov) - 2]

View file

@ -22,7 +22,7 @@ class FilmWebPl(BasicNewsRecipe):
'ul.sep-line > li + li::before {content: " | "} ' 'ul.sep-line > li + li::before {content: " | "} '
'ul.inline {padding:0px;} .vertical-align {display: inline-block;}') 'ul.inline {padding:0px;} .vertical-align {display: inline-block;}')
preprocess_regexps = [(re.compile(r'<body.+?</head>', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags... preprocess_regexps = [(re.compile(r'<body.+?</head>', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags...
(re.compile(u'(?:<sup>)?\(kliknij\,\ aby powiększyć\)(?:</sup>)?', re.IGNORECASE), lambda m: ''), (re.compile(u'(?:<sup>)?\\(kliknij\\,\\ aby powiększyć\\)(?:</sup>)?', re.IGNORECASE), lambda m: ''),
(re.compile(unicode(r'(<br ?/?>\s*?<br ?/?>\s*?)+'), re.IGNORECASE), lambda m: '<br />') (re.compile(unicode(r'(<br ?/?>\s*?<br ?/?>\s*?)+'), re.IGNORECASE), lambda m: '<br />')
] ]
remove_tags = [dict(attrs={'class':['infoParent', 'likeBar', remove_tags = [dict(attrs={'class':['infoParent', 'likeBar',

View file

@ -62,7 +62,7 @@ class HuffingtonPostRecipe(BasicNewsRecipe):
remove_tags = [] remove_tags = []
remove_tags.append(dict(name='a', attrs={'href': re.compile( remove_tags.append(dict(name='a', attrs={'href': re.compile(
'http://feedads\.g\.doubleclick.net.*')})) 'http://feedads\\.g\\.doubleclick.net.*')}))
remove_tags.append(dict(name='div', attrs={'class': 'feedflare'})) remove_tags.append(dict(name='div', attrs={'class': 'feedflare'}))
remove_tags.append(dict(name='a', attrs={'class': 'home_pixie'})) remove_tags.append(dict(name='a', attrs={'class': 'home_pixie'}))
remove_tags.append(dict(name='div', attrs={'id': [ remove_tags.append(dict(name='div', attrs={'id': [

View file

@ -144,7 +144,7 @@ def postprocess_html(self, soup, first_fetch):
# Place article date after header # Place article date after header
dates = soup.findAll(text=re.compile( dates = soup.findAll(text=re.compile(
'\d{2}\.\d{2}\.\d{4}, \d{2}:\d{2}:\d{2}')) r'\d{2}\.\d{2}\.\d{4}, \d{2}:\d{2}:\d{2}'))
if dates: if dates:
for date in dates: for date in dates:
for string in date: for string in date:

View file

@ -121,7 +121,7 @@ def parse_index(self):
if article_anchor: if article_anchor:
article_url = article_anchor.get('href') article_url = article_anchor.get('href')
if not article_url: if not article_url:
print('article_url is None for article_anchor "%s": "%s"' \ print('article_url is None for article_anchor "%s": "%s"'
% (str(article_anchor), article_title), file=sys.stderr) % (str(article_anchor), article_title), file=sys.stderr)
continue continue

View file

@ -84,11 +84,11 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
remove_tags = [ remove_tags = [
dict(name=['iframe', 'script', 'noscript', 'style']), dict(name=['iframe', 'script', 'noscript', 'style']),
dict(name='div', attrs={'class': ['fact-related-box', 'aside clearfix', 'aside clearfix middle-col-line', 'comments', 'share-tools', 'article-right-column', 'column-4-5', 'column-1-5', 'ad-msg', 'col-179 ', 'col-373 ', 'clear', 'ad', 'navigation', re.compile('share-tools(-top)?'), 'tools', 'metroCommentFormWrap', 'article-tools-below-title', 'related-links', 'padding-top-15', re.compile('^promo.*?$'), 'teaser-component', re.compile('fb(-comments|_iframe_widget)'), 'promos', 'header-links', 'promo-2']}), # noqa dict(name='div', attrs={'class': ['fact-related-box', 'aside clearfix', 'aside clearfix middle-col-line', 'comments', 'share-tools', 'article-right-column', 'column-4-5', 'column-1-5', 'ad-msg', 'col-179 ', 'col-373 ', 'clear', 'ad', 'navigation', re.compile('share-tools(-top)?'), 'tools', 'metroCommentFormWrap', 'article-tools-below-title', 'related-links', 'padding-top-15', re.compile('^promo.*?$'), 'teaser-component', re.compile('fb(-comments|_iframe_widget)'), 'promos', 'header-links', 'promo-2']}), # noqa
dict(id=['super-carousel', 'article-2', 'googleads', 'column-1-5-bottom', 'column-4-5', re.compile('^ad(\d+|adcomp.*?)?$'), 'adadcomp-4', 'margin-5', 'sidebar', re.compile('^article-\d'), 'comments', 'gallery-1', 'sharez_container', 'ts-container', 'topshares', 'ts-title']), # noqa dict(id=['super-carousel', 'article-2', 'googleads', 'column-1-5-bottom', 'column-4-5', re.compile('^ad(\\d+|adcomp.*?)?$'), 'adadcomp-4', 'margin-5', 'sidebar', re.compile('^article-\\d'), 'comments', 'gallery-1', 'sharez_container', 'ts-container', 'topshares', 'ts-title']), # noqa
dict(name='a', attrs={'name': 'comments'}), dict(name='a', attrs={'name': 'comments'}),
dict(name='img', attrs={'class': 'top-line', dict(name='img', attrs={'class': 'top-line',
'title': 'volledig scherm'}), 'title': 'volledig scherm'}),
dict(attrs={'style': re.compile('^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$'), 'title': 'volledig scherm'})] dict(attrs={'style': re.compile('^(.*(display\\s?:\\s?none|img-mask|white)\\s?;?.*)$'), 'title': 'volledig scherm'})]
'''removed by before/after: '''removed by before/after:
id: id:
@ -223,7 +223,7 @@ def removeArrayOfTags(self, souparray):
return self.myKiller.safeRemovePart(souparray, True) return self.myKiller.safeRemovePart(souparray, True)
def removeEmptyTags(self, soup, run=0): def removeEmptyTags(self, soup, run=0):
emptymatches = re.compile('^[&nbsp;\s\n\r\t ]*$') emptymatches = re.compile('^[&nbsp;\\s\n\r\t ]*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and ( emptytags = soup.findAll(lambda tag: tag.find(True) is None and (
tag.string is None or tag.string.strip() == "" or tag.string.strip() == emptymatches) and not tag.isSelfClosing) tag.string is None or tag.string.strip() == "" or tag.string.strip() == emptymatches) and not tag.isSelfClosing)
if emptytags and not (emptytags is None or emptytags == []): if emptytags and not (emptytags is None or emptytags == []):

View file

@ -29,7 +29,7 @@ def parse_index(self):
soup = self.index_to_soup( soup = self.index_to_soup(
'http://www.observatorcultural.ro/Arhiva*-archive.html') 'http://www.observatorcultural.ro/Arhiva*-archive.html')
issueTag = soup.find('a', href=re.compile( issueTag = soup.find('a', href=re.compile(
"observatorcultural.ro\/Numarul")) "observatorcultural.ro\\/Numarul"))
issueURL = issueTag['href'] issueURL = issueTag['href']
print(issueURL) print(issueURL)
issueSoup = self.index_to_soup(issueURL) issueSoup = self.index_to_soup(issueURL)

View file

@ -83,6 +83,7 @@ def get_icons(zfp, name_or_list_of_names):
ians = ians.pop(names[0]) ians = ians.pop(names[0])
return ians return ians
_translations_cache = {} _translations_cache = {}
@ -316,4 +317,3 @@ def _locate_code(self, zf, path_to_zip_file):
zf.write(os.path.join(x, y)) zf.write(os.path.join(x, y))
add_plugin(f.name) add_plugin(f.name)
print('Added plugin from', sys.argv[-1]) print('Added plugin from', sys.argv[-1])

View file

@ -40,5 +40,6 @@ def main():
show_stats(stats) show_stats(stats)
print('Stats saved to', stats) print('Stats saved to', stats)
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View file

@ -221,7 +221,7 @@ def main():
try: try:
d.startup() d.startup()
except: except:
print ('Startup failed for device plugin: %s'%d) print('Startup failed for device plugin: %s'%d)
if d.MANAGES_DEVICE_PRESENCE: if d.MANAGES_DEVICE_PRESENCE:
cd = d.detect_managed_devices(scanner.devices) cd = d.detect_managed_devices(scanner.devices)
if cd is not None: if cd is not None:
@ -395,5 +395,6 @@ def main():
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View file

@ -93,8 +93,8 @@ def main():
finally: finally:
dev.shutdown() dev.shutdown()
print ('Device connection shutdown') print('Device connection shutdown')
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View file

@ -211,7 +211,6 @@ def test_udisks(ver=None):
print('Ejecting:') print('Ejecting:')
u.eject(dev) u.eject(dev)
if __name__ == '__main__': if __name__ == '__main__':
test_udisks() test_udisks()

View file

@ -53,6 +53,7 @@ def __str__(self):
''.join(["%02x" % d for d in self.data4[2:]]), ''.join(["%02x" % d for d in self.data4[2:]]),
) )
CONFIGRET = DWORD CONFIGRET = DWORD
DEVINST = DWORD DEVINST = DWORD
LPDWORD = POINTER(DWORD) LPDWORD = POINTER(DWORD)
@ -70,6 +71,8 @@ def CTL_CODE(DeviceType, Function, Method, Access):
def USB_CTL(id): def USB_CTL(id):
# CTL_CODE(FILE_DEVICE_USB, (id), METHOD_BUFFERED, FILE_ANY_ACCESS) # CTL_CODE(FILE_DEVICE_USB, (id), METHOD_BUFFERED, FILE_ANY_ACCESS)
return CTL_CODE(0x22, id, 0, 0) return CTL_CODE(0x22, id, 0, 0)
IOCTL_USB_GET_ROOT_HUB_NAME = USB_CTL(258) IOCTL_USB_GET_ROOT_HUB_NAME = USB_CTL(258)
IOCTL_USB_GET_NODE_INFORMATION = USB_CTL(258) IOCTL_USB_GET_NODE_INFORMATION = USB_CTL(258)
IOCTL_USB_GET_NODE_CONNECTION_INFORMATION = USB_CTL(259) IOCTL_USB_GET_NODE_CONNECTION_INFORMATION = USB_CTL(259)
@ -108,6 +111,7 @@ class SP_DEVINFO_DATA(Structure):
def __str__(self): def __str__(self):
return "ClassGuid:%s DevInst:%s" % (self.ClassGuid, self.DevInst) return "ClassGuid:%s DevInst:%s" % (self.ClassGuid, self.DevInst)
PSP_DEVINFO_DATA = POINTER(SP_DEVINFO_DATA) PSP_DEVINFO_DATA = POINTER(SP_DEVINFO_DATA)
@ -122,6 +126,7 @@ class SP_DEVICE_INTERFACE_DATA(Structure):
def __str__(self): def __str__(self):
return "InterfaceClassGuid:%s Flags:%s" % (self.InterfaceClassGuid, self.Flags) return "InterfaceClassGuid:%s Flags:%s" % (self.InterfaceClassGuid, self.Flags)
ANYSIZE_ARRAY = 1 ANYSIZE_ARRAY = 1
@ -131,6 +136,7 @@ class SP_DEVICE_INTERFACE_DETAIL_DATA(Structure):
("DevicePath", c_wchar*ANYSIZE_ARRAY) ("DevicePath", c_wchar*ANYSIZE_ARRAY)
] ]
UCHAR = c_ubyte UCHAR = c_ubyte
@ -216,6 +222,7 @@ class SetupPacket(Structure):
('Data', USB_STRING_DESCRIPTOR), ('Data', USB_STRING_DESCRIPTOR),
) )
PUSB_DESCRIPTOR_REQUEST = POINTER(USB_DESCRIPTOR_REQUEST) PUSB_DESCRIPTOR_REQUEST = POINTER(USB_DESCRIPTOR_REQUEST)
PSP_DEVICE_INTERFACE_DETAIL_DATA = POINTER(SP_DEVICE_INTERFACE_DETAIL_DATA) PSP_DEVICE_INTERFACE_DETAIL_DATA = POINTER(SP_DEVICE_INTERFACE_DETAIL_DATA)
PSP_DEVICE_INTERFACE_DATA = POINTER(SP_DEVICE_INTERFACE_DATA) PSP_DEVICE_INTERFACE_DATA = POINTER(SP_DEVICE_INTERFACE_DATA)
@ -390,6 +397,7 @@ def config_err_check(result, func, args):
raise WindowsError(result, 'The cfgmgr32 function failed with err: %s' % CR_CODE_NAMES.get(result, result)) raise WindowsError(result, 'The cfgmgr32 function failed with err: %s' % CR_CODE_NAMES.get(result, result))
return args return args
GetLogicalDrives = cwrap('GetLogicalDrives', DWORD, errcheck=bool_err_check, lib=kernel32) GetLogicalDrives = cwrap('GetLogicalDrives', DWORD, errcheck=bool_err_check, lib=kernel32)
GetDriveType = cwrap('GetDriveTypeW', UINT, LPCWSTR, lib=kernel32) GetDriveType = cwrap('GetDriveTypeW', UINT, LPCWSTR, lib=kernel32)
GetVolumeNameForVolumeMountPoint = cwrap('GetVolumeNameForVolumeMountPointW', BOOL, LPCWSTR, LPWSTR, DWORD, errcheck=bool_err_check, lib=kernel32) GetVolumeNameForVolumeMountPoint = cwrap('GetVolumeNameForVolumeMountPointW', BOOL, LPCWSTR, LPWSTR, DWORD, errcheck=bool_err_check, lib=kernel32)
@ -675,6 +683,7 @@ def get_volume_pathnames(volume_id, buf=None):
# def scan_usb_devices(): {{{ # def scan_usb_devices(): {{{
_USBDevice = namedtuple('USBDevice', 'vendor_id product_id bcd devid devinst') _USBDevice = namedtuple('USBDevice', 'vendor_id product_id bcd devid devinst')
@ -1017,6 +1026,7 @@ def drives_for(vendor_id, product_id=None):
pprint(get_drive_letters_for_device(usbdev, debug=True)) pprint(get_drive_letters_for_device(usbdev, debug=True))
print('USB info:', get_usb_info(usbdev, debug=True)) print('USB info:', get_usb_info(usbdev, debug=True))
if __name__ == '__main__': if __name__ == '__main__':
develop() develop()
# }}} # }}}

View file

@ -32,11 +32,11 @@ def __init__(self, extra_opts=None, log=None):
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE) self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE) self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*" # noqa self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*" # noqa
self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>" self.line_close = "(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>"
self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE) self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">' self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
self.common_in_text_endings = u'[\"\'—’”,\.!\?\\)„\w]' self.common_in_text_endings = u'[\"\'—’”,\\.!\\?\\\\)„\\w]'
self.common_in_text_beginnings = u'[\w\'\"“‘‛]' self.common_in_text_beginnings = u'[\\w\'\"“‘‛]'
def is_pdftohtml(self, src): def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@ -54,10 +54,10 @@ def chapter_head(self, match):
" chapters. - " + unicode(chap)) " chapters. - " + unicode(chap))
return '<h2>'+chap+'</h2>\n' return '<h2>'+chap+'</h2>\n'
else: else:
delete_whitespace = re.compile('^\s*(?P<c>.*?)\s*$') delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
delete_quotes = re.compile('\'\"') delete_quotes = re.compile('\'\"')
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\g<c>', html2text(chap))) txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
txt_title = delete_quotes.sub('', delete_whitespace.sub('\g<c>', html2text(title))) txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
self.html_preprocess_sections = self.html_preprocess_sections + 1 self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log.debug("marked " + unicode(self.html_preprocess_sections) + self.log.debug("marked " + unicode(self.html_preprocess_sections) +
" chapters & titles. - " + unicode(chap) + ", " + unicode(title)) " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
@ -216,24 +216,24 @@ def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*" # noqa title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*" # noqa
chapter_header_open = r"(?P<chap>" chapter_header_open = r"(?P<chap>"
title_header_open = r"(?P<title>" title_header_open = r"(?P<title>"
chapter_header_close = ")\s*" chapter_header_close = ")\\s*"
title_header_close = ")" title_header_close = ")"
chapter_line_close = self.line_close chapter_line_close = self.line_close
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>" title_line_close = "(</(?P=inner6)>)?\\s*(</(?P=inner5)>)?\\s*(</(?P=inner4)>)?\\s*</(?P=outer2)>"
is_pdftohtml = self.is_pdftohtml(html) is_pdftohtml = self.is_pdftohtml(html)
if is_pdftohtml: if is_pdftohtml:
title_line_open = "<(?P<outer2>p)[^>]*>\s*" title_line_open = "<(?P<outer2>p)[^>]*>\\s*"
title_line_close = "\s*</(?P=outer2)>" title_line_close = "\\s*</(?P=outer2)>"
if blanks_between_paragraphs: if blanks_between_paragraphs:
blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*" blank_lines = "(\\s*<p[^>]*>\\s*</p>){0,2}\\s*"
else: else:
blank_lines = "" blank_lines = ""
opt_title_open = "(" opt_title_open = "("
opt_title_close = ")?" opt_title_close = ")?"
n_lookahead_open = "(?!\s*" n_lookahead_open = "(?!\\s*"
n_lookahead_close = ")\s*" n_lookahead_close = ")\\s*"
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)" default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)" simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
@ -358,12 +358,12 @@ def style_unwrap(match):
# define the pieces of the regex # define the pieces of the regex
# (?<!\&\w{4});) is a semicolon not part of an entity # (?<!\&\w{4});) is a semicolon not part of an entity
lookahead = "(?<=.{"+unicode(length)+u"}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" lookahead = "(?<=.{"+unicode(length)+u"}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IA\u00DF]|(?<!\\&\\w{4});))"
em_en_lookahead = "(?<=.{"+unicode(length)+u"}[\u2013\u2014])" em_en_lookahead = "(?<=.{"+unicode(length)+u"}[\u2013\u2014])"
soft_hyphen = u"\xad" soft_hyphen = u"\xad"
line_ending = "\s*(?P<style_close></(span|[iub])>)?\s*(</(p|div)>)?" line_ending = "\\s*(?P<style_close></(span|[iub])>)?\\s*(</(p|div)>)?"
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*" blanklines = "\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)</(span|p|div)>\\s*){0,3}\\s*"
line_opening = "<(p|div)[^>]*>\s*(?P<style_open><(span|[iub])[^>]*>)?\s*" line_opening = "<(p|div)[^>]*>\\s*(?P<style_open><(span|[iub])[^>]*>)?\\s*"
txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}" txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
if format == 'txt': if format == 'txt':
@ -414,8 +414,8 @@ def markup_pre(self, html):
return html return html
def arrange_htm_line_endings(self, html): def arrange_htm_line_endings(self, html):
html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html) html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\\g<tag>"+">\n", html)
html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html) html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\\g<tag>"+"\\g<style>"+">", html)
return html return html
def fix_nbsp_indents(self, html): def fix_nbsp_indents(self, html):
@ -432,7 +432,7 @@ def cleanup_markup(self, html):
# Get rid of empty <o:p> tags to simplify other processing # Get rid of empty <o:p> tags to simplify other processing
html = re.sub(unicode(r'\s*<o:p>\s*</o:p>'), ' ', html) html = re.sub(unicode(r'\s*<o:p>\s*</o:p>'), ' ', html)
# Delete microsoft 'smart' tags # Delete microsoft 'smart' tags
html = re.sub('(?i)</?st1:\w+>', '', html) html = re.sub('(?i)</?st1:\\w+>', '', html)
# Re-open self closing paragraph tags # Re-open self closing paragraph tags
html = re.sub('<p[^>/]*/>', '<p> </p>', html) html = re.sub('<p[^>/]*/>', '<p> </p>', html)
# Get rid of empty span, bold, font, em, & italics tags # Get rid of empty span, bold, font, em, & italics tags
@ -443,7 +443,7 @@ def cleanup_markup(self, html):
html = re.sub( html = re.sub(
r"\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}".format(open=open_fmt_pat, close=close_fmt_pat) , " ", html) r"\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}".format(open=open_fmt_pat, close=close_fmt_pat) , " ", html)
# delete surrounding divs from empty paragraphs # delete surrounding divs from empty paragraphs
html = re.sub('<div[^>]*>\s*<p[^>]*>\s*</p>\s*</div>', '<p> </p>', html) html = re.sub('<div[^>]*>\\s*<p[^>]*>\\s*</p>\\s*</div>', '<p> </p>', html)
# Empty heading tags # Empty heading tags
html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html) html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
self.deleted_nbsps = True self.deleted_nbsps = True
@ -527,7 +527,7 @@ def merge_header_whitespace(match):
elif content.find('scenebreak') != -1: elif content.find('scenebreak') != -1:
return content return content
else: else:
content = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content) content = re.sub('(?i)<h(?P<hnum>\\d+)[^>]*>', '\n\n<h'+'\\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
return content return content
html = blanks_around_headings.sub(merge_header_whitespace, html) html = blanks_around_headings.sub(merge_header_whitespace, html)
@ -540,15 +540,15 @@ def markup_whitespaces(match):
html = blanks_n_nopunct.sub(markup_whitespaces, html) html = blanks_n_nopunct.sub(markup_whitespaces, html)
if self.html_preprocess_sections > self.min_chapters: if self.html_preprocess_sections > self.min_chapters:
html = re.sub('(?si)^.*?(?=<h\d)', markup_whitespaces, html) html = re.sub('(?si)^.*?(?=<h\\d)', markup_whitespaces, html)
return html return html
def detect_soft_breaks(self, html): def detect_soft_breaks(self, html):
line = '(?P<initline>'+self.line_open+'\s*(?P<init_content>.*?)'+self.line_close+')' line = '(?P<initline>'+self.line_open+'\\s*(?P<init_content>.*?)'+self.line_close+')'
line_two = '(?P<line_two>'+re.sub('(ou|in|cha)', 'linetwo_', self.line_open)+ \ line_two = '(?P<line_two>'+re.sub('(ou|in|cha)', 'linetwo_', self.line_open)+ \
'\s*(?P<line_two_content>.*?)'+re.sub('(ou|in|cha)', 'linetwo_', self.line_close)+')' '\\s*(?P<line_two_content>.*?)'+re.sub('(ou|in|cha)', 'linetwo_', self.line_close)+')'
div_break_candidate_pattern = line+'\s*<div[^>]*>\s*</div>\s*'+line_two div_break_candidate_pattern = line+'\\s*<div[^>]*>\\s*</div>\\s*'+line_two
div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE) div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE)
def convert_div_softbreaks(match): def convert_div_softbreaks(match):
@ -571,9 +571,9 @@ def convert_div_softbreaks(match):
def detect_scene_breaks(self, html): def detect_scene_breaks(self, html):
scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+ \ scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+ \
'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close '<))(?P<break>((?P<break_char>((?!\\s)\\W))\\s*(?P=break_char)?)+)\\s*'+self.line_close
scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
html = scene_breaks.sub(self.scene_break_open+'\g<break>'+'</p>', html) html = scene_breaks.sub(self.scene_break_open+'\\g<break>'+'</p>', html)
return html return html
def markup_user_break(self, replacement_break): def markup_user_break(self, replacement_break):
@ -589,13 +589,13 @@ def markup_user_break(self, replacement_break):
if re.match('^<hr', replacement_break): if re.match('^<hr', replacement_break):
if replacement_break.find('width') != -1: if replacement_break.find('width') != -1:
try: try:
width = int(re.sub('.*?width(:|=)(?P<wnum>\d+).*', '\g<wnum>', replacement_break)) width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
except: except:
scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>' scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
self.log.warn('Invalid replacement scene break' self.log.warn('Invalid replacement scene break'
' expression, using default') ' expression, using default')
else: else:
replacement_break = re.sub('(?i)(width=\d+\%?|width:\s*\d+(\%|px|pt|em)?;?)', '', replacement_break) replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
divpercent = (100 - width) / 2 divpercent = (100 - width) / 2
hr_open = re.sub('45', unicode(divpercent), hr_open) hr_open = re.sub('45', unicode(divpercent), hr_open)
scene_break = hr_open+replacement_break+'</div>' scene_break = hr_open+replacement_break+'</div>'
@ -606,16 +606,16 @@ def markup_user_break(self, replacement_break):
else: else:
from calibre.utils.html2text import html2text from calibre.utils.html2text import html2text
replacement_break = html2text(replacement_break) replacement_break = html2text(replacement_break)
replacement_break = re.sub('\s', '&nbsp;', replacement_break) replacement_break = re.sub('\\s', '&nbsp;', replacement_break)
scene_break = self.scene_break_open+replacement_break+'</p>' scene_break = self.scene_break_open+replacement_break+'</p>'
else: else:
replacement_break = re.sub('\s', '&nbsp;', replacement_break) replacement_break = re.sub('\\s', '&nbsp;', replacement_break)
scene_break = self.scene_break_open+replacement_break+'</p>' scene_break = self.scene_break_open+replacement_break+'</p>'
return scene_break return scene_break
def check_paragraph(self, content): def check_paragraph(self, content):
content = re.sub('\s*</?span[^>]*>\s*', '', content) content = re.sub('\\s*</?span[^>]*>\\s*', '', content)
if re.match('.*[\"\'.!?:]$', content): if re.match('.*[\"\'.!?:]$', content):
# print "detected this as a paragraph" # print "detected this as a paragraph"
return True return True
@ -623,7 +623,7 @@ def check_paragraph(self, content):
return False return False
def abbyy_processor(self, html): def abbyy_processor(self, html):
abbyy_line = re.compile('((?P<linestart><p\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE) abbyy_line = re.compile('((?P<linestart><p\\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
empty_paragraph = '\n<p> </p>\n' empty_paragraph = '\n<p> </p>\n'
self.in_blockquote = False self.in_blockquote = False
self.previous_was_paragraph = False self.previous_was_paragraph = False
@ -669,7 +669,7 @@ def convert_styles(match):
if style == 'text-align' and setting != 'left': if style == 'text-align' and setting != 'left':
text_align = style+':'+setting+';' text_align = style+':'+setting+';'
if style == 'text-indent': if style == 'text-indent':
setting = int(re.sub('\s*pt\s*', '', setting)) setting = int(re.sub('\\s*pt\\s*', '', setting))
if 9 < setting < 14: if 9 < setting < 14:
text_indent = indented_text text_indent = indented_text
else: else:
@ -757,8 +757,8 @@ def __call__(self, html):
is_pdftohtml = self.is_pdftohtml(html) is_pdftohtml = self.is_pdftohtml(html)
if is_pdftohtml: if is_pdftohtml:
self.line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*" self.line_open = "<(?P<outer>p)[^>]*>(\\s*<[ibu][^>]*>)?\\s*"
self.line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>" self.line_close = "\\s*(</[ibu][^>]*>\\s*)?</(?P=outer)>"
# ADE doesn't render <br />, change to empty paragraphs # ADE doesn't render <br />, change to empty paragraphs
# html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html) # html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
@ -831,7 +831,7 @@ def __call__(self, html):
# headings and titles, images, etc # headings and titles, images, etc
doubleheading = re.compile( doubleheading = re.compile(
r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html) html = doubleheading.sub('\\g<firsthead>'+'\n<h3'+'\\g<secondhead>'+'</h3>', html)
# If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks, # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
# style it with the 'whitespace' class. All remaining blank lines are styled as softbreaks. # style it with the 'whitespace' class. All remaining blank lines are styled as softbreaks.
@ -839,7 +839,7 @@ def __call__(self, html):
# If non-blank scene breaks exist they are center aligned and styled with appropriate margins. # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
if getattr(self.extra_opts, 'format_scene_breaks', False): if getattr(self.extra_opts, 'format_scene_breaks', False):
self.log.debug('Formatting scene breaks') self.log.debug('Formatting scene breaks')
html = re.sub('(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html) html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>', '<p></p>', html)
html = self.detect_scene_breaks(html) html = self.detect_scene_breaks(html)
html = self.detect_whitespace(html) html = self.detect_whitespace(html)
html = self.detect_soft_breaks(html) html = self.detect_soft_breaks(html)
@ -856,9 +856,9 @@ def __call__(self, html):
replacement_break = self.markup_user_break(replacement_break) replacement_break = self.markup_user_break(replacement_break)
if scene_break_count >= 1: if scene_break_count >= 1:
html = detected_scene_break.sub(replacement_break, html) html = detected_scene_break.sub(replacement_break, html)
html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html) html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
else: else:
html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html) html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
if self.deleted_nbsps: if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs so they render correctly # put back non-breaking spaces in empty paragraphs so they render correctly

View file

@ -41,6 +41,6 @@ def dump(path):
print(path, 'dumped to', dest) print(path, 'dumped to', dest)
if __name__ == '__main__': if __name__ == '__main__':
dump(sys.argv[-1]) dump(sys.argv[-1])

View file

@ -165,6 +165,7 @@ def parse_text_assertion(self, raw, ans):
ans['text_assertion'] = ta ans['text_assertion'] = ta
return raw[1:] return raw[1:]
_parser = None _parser = None
@ -203,5 +204,3 @@ def cfi_sort_key(cfi, only_path=True):
step = steps[-1] if steps else {} step = steps[-1] if steps else {}
offsets = (step.get('temporal_offset', 0), tuple(reversed(step.get('spatial_offset', (0, 0)))), step.get('text_offset', 0), ) offsets = (step.get('temporal_offset', 0), tuple(reversed(step.get('spatial_offset', (0, 0)))), step.get('text_offset', 0), )
return (step_nums, offsets) return (step_nums, offsets)

View file

@ -100,5 +100,6 @@ def a(before=None, after=None, **params):
def find_tests(): def find_tests():
return unittest.TestLoader().loadTestsFromTestCase(Tests) return unittest.TestLoader().loadTestsFromTestCase(Tests)
if __name__ == '__main__': if __name__ == '__main__':
unittest.TextTestRunner(verbosity=2).run(find_tests()) unittest.TextTestRunner(verbosity=2).run(find_tests())

View file

@ -62,5 +62,6 @@ def main(args=sys.argv):
any2lit(opts, args[1]) any2lit(opts, args[1])
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())

View file

@ -104,6 +104,7 @@ def f60_79(B, C, D):
def f6_42(B, C, D): def f6_42(B, C, D):
return (B + C) ^ C return (B + C) ^ C
f = [f0_19]*20 + [f20_39]*20 + [f40_59]*20 + [f60_79]*20 f = [f0_19]*20 + [f20_39]*20 + [f40_59]*20 + [f60_79]*20
# ...and delightful changes # ...and delightful changes
@ -321,6 +322,7 @@ def new(arg=None):
return crypto return crypto
if __name__ == '__main__': if __name__ == '__main__':
def main(): def main():
import sys import sys

View file

@ -57,6 +57,7 @@ def invert_tag_map(tag_map):
tattrs[0] = dattrs tattrs[0] = dattrs
return tags, tattrs return tags, tattrs
OPF_MAP = invert_tag_map(maps.OPF_MAP) OPF_MAP = invert_tag_map(maps.OPF_MAP)
HTML_MAP = invert_tag_map(maps.HTML_MAP) HTML_MAP = invert_tag_map(maps.HTML_MAP)
@ -76,6 +77,7 @@ def packguid(guid):
values = [int(value, 16) for value in values] values = [int(value, 16) for value in values]
return pack("<LHHBBBBBBBB", *values) return pack("<LHHBBBBBBBB", *values)
FLAG_OPENING = (1 << 0) FLAG_OPENING = (1 << 0)
FLAG_CLOSING = (1 << 1) FLAG_CLOSING = (1 << 1)
FLAG_BLOCK = (1 << 2) FLAG_BLOCK = (1 << 2)

View file

@ -153,9 +153,9 @@ class HTMLConverter(object):
(re.compile('<hr>', re.IGNORECASE), (re.compile('<hr>', re.IGNORECASE),
lambda match : '<span style="page-break-after:always"> </span>'), lambda match : '<span style="page-break-after:always"> </span>'),
# Create header tags # Create header tags
(re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))), lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))), lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)), lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
@ -409,7 +409,7 @@ def parse_css(self, style):
selector name and the value is a dictionary of properties selector name and the value is a dictionary of properties
""" """
sdict, pdict = {}, {} sdict, pdict = {}, {}
style = re.sub('/\*.*?\*/', '', style) # Remove /*...*/ comments style = re.sub('/\\*.*?\\*/', '', style) # Remove /*...*/ comments
for sel in re.findall(HTMLConverter.SELECTOR_PAT, style): for sel in re.findall(HTMLConverter.SELECTOR_PAT, style):
for key in sel[0].split(','): for key in sel[0].split(','):
val = self.parse_style_properties(sel[1]) val = self.parse_style_properties(sel[1])

View file

@ -148,7 +148,7 @@ def format_results(self, reserveid, od_title, subtitle, series, publisher, creat
fix_slashes = re.compile(r'\\/') fix_slashes = re.compile(r'\\/')
thumbimage = fix_slashes.sub('/', thumbimage) thumbimage = fix_slashes.sub('/', thumbimage)
worldcatlink = fix_slashes.sub('/', worldcatlink) worldcatlink = fix_slashes.sub('/', worldcatlink)
cover_url = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', thumbimage) cover_url = re.sub('(?P<img>(Ima?g(eType-)?))200', '\\g<img>100', thumbimage)
social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid
series_num = '' series_num = ''
if not series: if not series:
@ -254,7 +254,7 @@ def overdrive_search(self, br, log, q, title, author):
def sort_ovrdrv_results(self, raw, log, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None): def sort_ovrdrv_results(self, raw, log, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
close_matches = [] close_matches = []
raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw) raw = re.sub('.*?\\[\\[(?P<content>.*?)\\]\\].*', '[[\\g<content>]]', raw)
results = json.loads(raw) results = json.loads(raw)
# log.error('raw results are:'+str(results)) # log.error('raw results are:'+str(results))
# The search results are either from a keyword search or a multi-format list from a single ID, # The search results are either from a keyword search or a multi-format list from a single ID,

View file

@ -149,7 +149,7 @@ def identify(self, log, result_queue, abort, title=None, authors=None,
# Redirect page: trying to extract ozon_id from javascript data # Redirect page: trying to extract ozon_id from javascript data
h = HTMLParser() h = HTMLParser()
entry_string = (h.unescape(etree.tostring(doc, pretty_print=True, encoding=unicode))) entry_string = (h.unescape(etree.tostring(doc, pretty_print=True, encoding=unicode)))
json_pat = re.compile(u'dataLayer\s*=\s*(.+)?;') json_pat = re.compile(r'dataLayer\s*=\s*(.+)?;')
json_info = re.search(json_pat, entry_string) json_info = re.search(json_pat, entry_string)
jsondata = json_info.group(1) if json_info else None jsondata = json_info.group(1) if json_info else None
if jsondata: if jsondata:
@ -344,7 +344,7 @@ def to_metadata(self, log, entry): # {{{
pub_year = None pub_year = None
pub_year_block = entry.xpath(u'.//div[@class="bOneTileProperty"]/text()') pub_year_block = entry.xpath(u'.//div[@class="bOneTileProperty"]/text()')
year_pattern = re.compile('\d{4}') year_pattern = re.compile(r'\d{4}')
if pub_year_block: if pub_year_block:
pub_year = re.search(year_pattern, pub_year_block[0]) pub_year = re.search(year_pattern, pub_year_block[0])
if pub_year: if pub_year:
@ -625,8 +625,8 @@ def _translageLanguageToCode(displayLang): # {{{
def _normalizeAuthorNameWithInitials(name): # {{{ def _normalizeAuthorNameWithInitials(name): # {{{
res = name res = name
if name: if name:
re1 = u'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$' re1 = r'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$'
re2 = u'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$' re2 = r'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$'
matcher = re.match(re1, unicode(name), re.UNICODE) matcher = re.match(re1, unicode(name), re.UNICODE)
if not matcher: if not matcher:
matcher = re.match(re2, unicode(name), re.UNICODE) matcher = re.match(re2, unicode(name), re.UNICODE)

View file

@ -370,6 +370,7 @@ def set_metadata(stream, mi):
mu.update(mi) mu.update(mi)
return return
if __name__ == '__main__': if __name__ == '__main__':
if False: if False:
# Test get_metadata() # Test get_metadata()
@ -388,4 +389,3 @@ def set_metadata(stream, mi):
updated_data = open(tokens[0]+'-updated' + '.' + tokens[2],'wb') updated_data = open(tokens[0]+'-updated' + '.' + tokens[2],'wb')
updated_data.write(stream.getvalue()) updated_data.write(stream.getvalue())
updated_data.close() updated_data.close()

View file

@ -45,6 +45,6 @@ def inspect_mobi(path_or_stream, ddir=None): # {{{
def main(): def main():
inspect_mobi(sys.argv[1]) inspect_mobi(sys.argv[1])
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View file

@ -350,13 +350,13 @@ def cleanup_html(self):
# Swap inline and block level elements, and order block level elements according to priority # Swap inline and block level elements, and order block level elements according to priority
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
self.processed_html = re.sub( self.processed_html = re.sub(
r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html) r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\\g<para>'+'\\g<styletags>', self.processed_html)
self.processed_html = re.sub( self.processed_html = re.sub(
r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html) r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', '\\g<styletags>'+'\\g<para>', self.processed_html)
self.processed_html = re.sub( self.processed_html = re.sub(
r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', '\g<para>'+'\g<blockquote>', self.processed_html) r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', '\\g<para>'+'\\g<blockquote>', self.processed_html)
self.processed_html = re.sub( self.processed_html = re.sub(
r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', '\g<blockquote>'+'\g<para>', self.processed_html) r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', '\\g<blockquote>'+'\\g<para>', self.processed_html)
bods = htmls = 0 bods = htmls = 0
for x in re.finditer(u'</body>|</html>', self.processed_html): for x in re.finditer(u'</body>|</html>', self.processed_html):
if x == '</body>': if x == '</body>':
@ -692,7 +692,7 @@ def create_opf(self, htmlfile, guide=None, root=None):
continue continue
if reached and x.tag == 'a': if reached and x.tag == 'a':
href = x.get('href', '') href = x.get('href', '')
if href and re.match('\w+://', href) is None: if href and re.match('\\w+://', href) is None:
try: try:
text = u' '.join([t.strip() for t in text = u' '.join([t.strip() for t in
x.xpath('descendant::text()')]) x.xpath('descendant::text()')])

View file

@ -374,6 +374,7 @@ class NonLinearNCXIndex(NCXIndex):
EndTagTable EndTagTable
))) )))
if __name__ == '__main__': if __name__ == '__main__':
# Generate a document with a large number of index entries using both # Generate a document with a large number of index entries using both
# calibre and kindlegen and compare the output # calibre and kindlegen and compare the output
@ -393,4 +394,3 @@ class NonLinearNCXIndex(NCXIndex):
from calibre.gui2.tweak_book.diff.main import main from calibre.gui2.tweak_book.diff.main import main
main(['cdiff', 'decompiled_index/mobi8/ncx.record', 'x/ncx.record']) main(['cdiff', 'decompiled_index/mobi8/ncx.record', 'x/ncx.record'])

View file

@ -114,6 +114,7 @@ def normalize_simple_composition(name, cssvalue, composition, check_inherit=True
break break
return style return style
font_composition = ('font-style', 'font-variant', 'font-weight', 'font-size', 'line-height', 'font-family') font_composition = ('font-style', 'font-variant', 'font-weight', 'font-size', 'line-height', 'font-family')
@ -144,6 +145,7 @@ def normalize_border(name, cssvalue):
style.update({k.replace(EDGES[0], edge):v for k, v in vals.iteritems()}) style.update({k.replace(EDGES[0], edge):v for k, v in vals.iteritems()})
return style return style
normalizers = { normalizers = {
'list-style': simple_normalizer('list-style', ('type', 'position', 'image')), 'list-style': simple_normalizer('list-style', ('type', 'position', 'image')),
'font': lambda prop, v: normalize_font(v), 'font': lambda prop, v: normalize_font(v),
@ -243,6 +245,7 @@ def condense_border(style, props):
style.removeProperty(prop.name) style.removeProperty(prop.name)
style.setProperty('border', edge_vals[0].value) style.setProperty('border', edge_vals[0].value)
condensers = {'margin': simple_condenser('margin', condense_edge), 'padding': simple_condenser('padding', condense_edge), 'border': condense_border} condensers = {'margin': simple_condenser('margin', condense_edge), 'padding': simple_condenser('padding', condense_edge), 'border': condense_border}
@ -430,5 +433,6 @@ def test_border_condensation(self):
unittest.TextTestRunner(verbosity=4).run(tests) unittest.TextTestRunner(verbosity=4).run(tests)
# }}} # }}}
if __name__ == '__main__': if __name__ == '__main__':
test_normalization() test_normalization()

View file

@ -82,6 +82,7 @@ def iterrules(container, sheet_name, rules=None, media_rule_ok=media_allowed, ru
importing.discard(sheet_name) importing.discard(sheet_name)
StyleDeclaration = namedtuple('StyleDeclaration', 'index declaration pseudo_element') StyleDeclaration = namedtuple('StyleDeclaration', 'index declaration pseudo_element')
Specificity = namedtuple('Specificity', 'is_style num_id num_class num_elem rule_index') Specificity = namedtuple('Specificity', 'is_style num_id num_class num_elem rule_index')
@ -224,6 +225,7 @@ def process_sheet(sheet, sheet_name):
return partial(resolve_property, style_map), partial(resolve_pseudo_property, style_map, pseudo_style_map), select return partial(resolve_property, style_map), partial(resolve_pseudo_property, style_map, pseudo_style_map), select
_defvals = None _defvals = None

View file

@ -115,6 +115,7 @@ def create_book(mi, path, fmt='epub', opf_name='metadata.opf', html_name='start.
zf.writestr(html_name, HTML) zf.writestr(html_name, HTML)
zf.writestr(toc_name, ncx) zf.writestr(toc_name, ncx)
if __name__ == '__main__': if __name__ == '__main__':
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
mi = Metadata('Test book', authors=('Kovid Goyal',)) mi = Metadata('Test book', authors=('Kovid Goyal',))

View file

@ -140,5 +140,6 @@ def main(args=sys.argv):
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())

View file

@ -134,7 +134,6 @@ def main():
print('PDF written to:', pdf) print('PDF written to:', pdf)
print('Image written to:', path) print('Image written to:', path)
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View file

@ -188,7 +188,7 @@ def clean_text(self, text):
text = text.replace('\\Q="%s"' % unused, '') text = text.replace('\\Q="%s"' % unused, '')
# Remove \Cn tags that are within \x and \Xn tags # Remove \Cn tags that are within \x and \Xn tags
text = re.sub(unicode(r'(?msu)(?P<t>\\(x|X[0-4]))(?P<a>.*?)(?P<c>\\C[0-4]\s*=\s*"[^"]*")(?P<b>.*?)(?P=t)'), '\g<t>\g<a>\g<b>\g<t>', text) text = re.sub(unicode(r'(?msu)(?P<t>\\(x|X[0-4]))(?P<a>.*?)(?P<c>\\C[0-4]\s*=\s*"[^"]*")(?P<b>.*?)(?P=t)'), '\\g<t>\\g<a>\\g<b>\\g<t>', text)
# Replace bad characters. # Replace bad characters.
text = text.replace(u'\xc2', '') text = text.replace(u'\xc2', '')
@ -206,7 +206,7 @@ def clean_text(self, text):
text = re.sub('[ ]{2,}', ' ', text) text = re.sub('[ ]{2,}', ' ', text)
# Condense excessive \c empty line sequences. # Condense excessive \c empty line sequences.
text = re.sub('(\\c\s*\\c\s*){2,}', '\\c \n\\c\n', text) text = re.sub('(\\c\\s*\\c\\s*){2,}', '\\c \n\\c\n', text)
# Remove excessive newlines. # Remove excessive newlines.
text = re.sub('\n[ ]+\n', '\n\n', text) text = re.sub('\n[ ]+\n', '\n\n', text)

View file

@ -142,7 +142,7 @@ def remove_newlines(self, text):
return text return text
def remove_tabs(self, text): def remove_tabs(self, text):
self.log.debug('\Replace tabs with space for processing...') self.log.debug('Replace tabs with space for processing...')
text = text.replace('\t', ' ') text = text.replace('\t', ' ')
return text return text

View file

@ -32,6 +32,7 @@ def check_encoding(self, path, encoding='us-ascii', verbose=True):
return True return True
return False return False
if __name__ == '__main__': if __name__ == '__main__':
check_encoding_obj = CheckEncoding() check_encoding_obj = CheckEncoding()
check_encoding_obj.check_encoding(sys.argv[1]) check_encoding_obj.check_encoding(sys.argv[1])

View file

@ -175,6 +175,7 @@ def _encoding(self):
elif enc == 'pca': elif enc == 'pca':
self.__code_page = '850' self.__code_page = '850'
if __name__ == '__main__': if __name__ == '__main__':
import sys import sys
encode_obj = DefaultEncoding( encode_obj = DefaultEncoding(

View file

@ -411,8 +411,6 @@ def __hyperlink_func(self, field_name, name, line):
line -- the string to be parse line -- the string to be parse
Retuns: Retuns:
The name of the field The name of the field
Logic:
self.__link_switch = re.compile(r'\\l\s{1,}(.*?)\s')
""" """
self.__link_switch = re.compile(r'\\l\s{1,}"{0,1}(.*?)"{0,1}\s') self.__link_switch = re.compile(r'\\l\s{1,}"{0,1}(.*?)"{0,1}\s')
the_string = name the_string = name

View file

@ -562,6 +562,8 @@ def convert_hex_2_utf8(self):
self.__convert_preamble() self.__convert_preamble()
else: else:
self.__convert_body() self.__convert_body()
""" """
how to swap case for non-capitals how to swap case for non-capitals
my_string.swapcase() my_string.swapcase()

View file

@ -120,7 +120,7 @@ def __found_list_func(self, line):
Requires: line -- line to process Requires: line -- line to process
Returns: nothing Returns: nothing
Logic: Logic:
I have found \list. I have found \\list.
Change the state to list Change the state to list
Get the open bracket count so you know when this state ends. Get the open bracket count so you know when this state ends.
Append an empty list to all lists. Append an empty list to all lists.
@ -162,7 +162,7 @@ def __found_level_func(self, line):
Requires: line -- line to process Requires: line -- line to process
Returns: nothing Returns: nothing
Logic: Logic:
I have found \listlevel. I have found \\listlevel.
Change the state to level Change the state to level
Get the open bracket count so you know when this state ends. Get the open bracket count so you know when this state ends.
Append an empty list to the last list inside all lists. Append an empty list to the last list inside all lists.
@ -285,7 +285,7 @@ def __parse_level_text_length(self, line):
Returns: Returns:
nothing nothing
Logic: Logic:
Method is used for to parse text in the \leveltext group. Method is used for to parse text in the \\leveltext group.
""" """
num = line[18:] num = line[18:]
the_num = int(num, 16) the_num = int(num, 16)

View file

@ -270,6 +270,8 @@ def parse_options(self):
return options_dict, arguments return options_dict, arguments
else: else:
return 0,0 return 0,0
if __name__ == '__main__': if __name__ == '__main__':
this_dict = { this_dict = {
'indents': [0, 'i'], 'indents': [0, 'i'],

View file

@ -52,7 +52,7 @@ def __override_func(self, line):
Returns: Returns:
nothing nothing
Logic: Logic:
The group {\override has been found. The group {\\override has been found.
Check for the end of the group. Check for the end of the group.
Otherwise, add appropriate tokens to the override dictionary. Otherwise, add appropriate tokens to the override dictionary.
""" """

View file

@ -128,7 +128,7 @@ def __initiate_values(self):
'list-conti' : 'list-continue', 'list-conti' : 'list-continue',
'list-hang_' : 'list-hang', 'list-hang_' : 'list-hang',
# 'list-tebef' : 'list-text-before', # 'list-tebef' : 'list-text-before',
'list-level' : 'level', # 'list-level' : 'level',
'list-id___' : 'list-id', 'list-id___' : 'list-id',
'list-start' : 'list-start', 'list-start' : 'list-start',
'nest-level' : 'nest-level', 'nest-level' : 'nest-level',
@ -198,7 +198,7 @@ def __initiate_values(self):
'bor-cel-to' : 'border-cell-top', 'bor-cel-to' : 'border-cell-top',
'bor-cel-le' : 'border-cell-left', 'bor-cel-le' : 'border-cell-left',
'bor-cel-ri' : 'border-cell-right', 'bor-cel-ri' : 'border-cell-right',
'bor-par-bo' : 'border-paragraph-bottom', # 'bor-par-bo' : 'border-paragraph-bottom',
'bor-par-to' : 'border-paragraph-top', 'bor-par-to' : 'border-paragraph-top',
'bor-par-le' : 'border-paragraph-left', 'bor-par-le' : 'border-paragraph-left',
'bor-par-ri' : 'border-paragraph-right', 'bor-par-ri' : 'border-paragraph-right',
@ -413,7 +413,7 @@ def __para_def_in_para_def_func(self, line):
Returns: Returns:
nothing nothing
Logic: Logic:
I have found a \pard while I am collecting tokens. I want to reset I have found a \\pard while I am collecting tokens. I want to reset
the dectionary and do nothing else. the dectionary and do nothing else.
""" """
# Change this # Change this

View file

@ -584,7 +584,7 @@ def initiate_token_dict(self):
} }
""" """
# unknown # unknown
# These must get passed on because they occure after \* # These must get passed on because they occure after \\*
'do' : ('un', 'unknown___', self.default_func), 'do' : ('un', 'unknown___', self.default_func),
'company' : ('un', 'company___', self.default_func), 'company' : ('un', 'company___', self.default_func),
'shpinst' : ('un', 'unknown___', self.default_func), 'shpinst' : ('un', 'unknown___', self.default_func),
@ -716,10 +716,10 @@ def __no_sup_sub_func(self, pre, token, num):
def divide_num(self, numerator, denominator): def divide_num(self, numerator, denominator):
try: try:
# calibre why ignore negative number? Wrong in case of \fi # calibre why ignore negative number? Wrong in case of \fi
numerator = float(re.search('[0-9.\-]+', numerator).group()) numerator = float(re.search('[0-9.\\-]+', numerator).group())
except TypeError as msg: except TypeError as msg:
if self.__run_level > 3: if self.__run_level > 3:
msg = ('No number to process?\nthis indicates that the token \(\\li\) \ msg = ('No number to process?\nthis indicates that the token \\(\\li\\) \
should have a number and does not\nnumerator is \ should have a number and does not\nnumerator is \
"%s"\ndenominator is "%s"\n') % (numerator, denominator) "%s"\ndenominator is "%s"\n') % (numerator, denominator)
raise self.__bug_handler(msg) raise self.__bug_handler(msg)

View file

@ -27,19 +27,19 @@ class Sections:
logic logic
--------------- ---------------
The tags for the first section breaks have already been written. The tags for the first section breaks have already been written.
RTF stores section breaks with the \sect tag. Each time this tag is RTF stores section breaks with the \\sect tag. Each time this tag is
encountered, add one to the counter. encountered, add one to the counter.
When I encounter the \sectd tag, I want to collect all the appropriate tokens When I encounter the \\sectd tag, I want to collect all the appropriate tokens
that describe the section. When I reach a \pard, I know I an stop collecting that describe the section. When I reach a \\pard, I know I an stop collecting
tokens and write the section tags. tokens and write the section tags.
The exception to this method occurs when sections occur in field blocks, such The exception to this method occurs when sections occur in field blocks, such
as the index. Normally, two section break occur within the index and other as the index. Normally, two section break occur within the index and other
field-blocks. (If less or more section breaks occurr, this code may not work.) field-blocks. (If less or more section breaks occur, this code may not work.)
I want the sections to occur outside of the index. That is, the index I want the sections to occur outside of the index. That is, the index
should be nested inside one section tag. After the index is complete, a new should be nested inside one section tag. After the index is complete, a new
section should begin. section should begin.
In order to write the sections outside of the field blocks, I have to store In order to write the sections outside of the field blocks, I have to store
all of the field block as a string. When I ecounter the \sect tag, add one to all of the field block as a string. When I ecounter the \\sect tag, add one to
the section counter, but store this number in a list. Likewise, store the the section counter, but store this number in a list. Likewise, store the
information describing the section in another list. information describing the section in another list.
When I reach the end of the field block, choose the first item from the When I reach the end of the field block, choose the first item from the
@ -243,7 +243,7 @@ def __end_sec_premature_func(self, line, name):
nothing nothing
Logic: Logic:
Text or control words indicating text have been found Text or control words indicating text have been found
before \pard. This shoud indicate older RTF. Reset the state before \\pard. This shoud indicate older RTF. Reset the state
Write the section defintion. Insert a paragraph definition. Write the section defintion. Insert a paragraph definition.
Insert {} to mark the end of a paragraph defintion Insert {} to mark the end of a paragraph defintion
""" """

View file

@ -121,7 +121,7 @@ def __initiate_values(self):
'list-conti' : 'list-continue', 'list-conti' : 'list-continue',
'list-hang_' : 'list-hang', 'list-hang_' : 'list-hang',
# 'list-tebef' : 'list-text-before', # 'list-tebef' : 'list-text-before',
'list-level' : 'level', # 'list-level' : 'level',
'list-id___' : 'list-id', 'list-id___' : 'list-id',
'list-start' : 'list-start', 'list-start' : 'list-start',
'nest-level' : 'nest-level', 'nest-level' : 'nest-level',
@ -192,7 +192,7 @@ def __initiate_values(self):
'bor-cel-to' : 'border-cell-top', 'bor-cel-to' : 'border-cell-top',
'bor-cel-le' : 'border-cell-left', 'bor-cel-le' : 'border-cell-left',
'bor-cel-ri' : 'border-cell-right', 'bor-cel-ri' : 'border-cell-right',
'bor-par-bo' : 'border-paragraph-bottom', # 'bor-par-bo' : 'border-paragraph-bottom',
'bor-par-to' : 'border-paragraph-top', 'bor-par-to' : 'border-paragraph-top',
'bor-par-le' : 'border-paragraph-left', 'bor-par-le' : 'border-paragraph-left',
'bor-par-ri' : 'border-paragraph-right', 'bor-par-ri' : 'border-paragraph-right',

View file

@ -333,6 +333,7 @@ def main():
return 1 return 1
return 0 return 0
if __name__ == "__main__": if __name__ == "__main__":
"""SNB file unit test""" """SNB file unit test"""
sys.exit(main()) sys.exit(main())

View file

@ -160,6 +160,7 @@ def event(self, ev):
self.window_unblocked.emit() self.window_unblocked.emit()
return QMainWindow.event(self, ev) return QMainWindow.event(self, ev)
app=QApplication([]) app=QApplication([])
app.setAttribute(Qt.AA_DontUseNativeMenuBar, False) app.setAttribute(Qt.AA_DontUseNativeMenuBar, False)
app.setApplicationName('com.calibre-ebook.DBusExportDemo') app.setApplicationName('com.calibre-ebook.DBusExportDemo')

View file

@ -456,6 +456,7 @@ def accept(self):
self.rule = ('', txt) self.rule = ('', txt)
QDialog.accept(self) QDialog.accept(self)
if __name__ == '__main__': if __name__ == '__main__':
app = QApplication([]) app = QApplication([])
from calibre.ebooks.metadata.book.base import field_metadata from calibre.ebooks.metadata.book.base import field_metadata

View file

@ -581,5 +581,6 @@ def __init__(self, string, width, height, ts, font, valign):
def main(args=sys.argv): def main(args=sys.argv):
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())

View file

@ -171,8 +171,8 @@ def reset_confirmation_dialogs(self, *args):
info_dialog(self, _('Done'), info_dialog(self, _('Done'),
_('Confirmation dialogs have all been reset'), show=True) _('Confirmation dialogs have all been reset'), show=True)
if __name__ == '__main__': if __name__ == '__main__':
from PyQt5.Qt import QApplication from PyQt5.Qt import QApplication
app = QApplication([]) app = QApplication([])
test_widget('Interface', 'Behavior') test_widget('Interface', 'Behavior')

View file

@ -190,6 +190,7 @@ def load_object(self, src, key):
return cls(builtin.gui, builtin.name, config=builtin.config, return cls(builtin.gui, builtin.name, config=builtin.config,
base_plugin=builtin.base_plugin), ver base_plugin=builtin.base_plugin), ver
if __name__ == '__main__': if __name__ == '__main__':
st = time.time() st = time.time()
count = 0 count = 0
@ -199,5 +200,3 @@ def load_object(self, src, key):
print(code.encode('utf-8')) print(code.encode('utf-8'))
print('\n', '_'*80, '\n', sep='') print('\n', '_'*80, '\n', sep='')
print ('Time to download all %d plugins: %.2f seconds'%(count, time.time() - st)) print ('Time to download all %d plugins: %.2f seconds'%(count, time.time() - st))

View file

@ -153,15 +153,15 @@ def is_filterable_query(self, query):
# Remove filter identifiers # Remove filter identifiers
# Remove the prefix. # Remove the prefix.
for loc in ('all', 'author', 'author2', 'authors', 'title', 'title2'): for loc in ('all', 'author', 'author2', 'authors', 'title', 'title2'):
query = re.sub(r'%s:"(?P<a>[^\s"]+)"' % loc, '\g<a>', query) query = re.sub(r'%s:"(?P<a>[^\s"]+)"' % loc, r'\g<a>', query)
query = query.replace('%s:' % loc, '') query = query.replace('%s:' % loc, '')
# Remove the prefix and search text. # Remove the prefix and search text.
for loc in ('cover', 'download', 'downloads', 'drm', 'format', 'formats', 'price', 'store'): for loc in ('cover', 'download', 'downloads', 'drm', 'format', 'formats', 'price', 'store'):
query = re.sub(r'%s:"[^"]"' % loc, '', query) query = re.sub(r'%s:"[^"]"' % loc, '', query)
query = re.sub(r'%s:[^\s]*' % loc, '', query) query = re.sub(r'%s:[^\s]*' % loc, '', query)
# Remove whitespace # Remove whitespace
query = re.sub('\s', '', query) query = re.sub(r'\s', '', query)
mod_query = re.sub('\s', '', mod_query) mod_query = re.sub(r'\s', '', mod_query)
# If mod_query and query are the same then there were no filter modifiers # If mod_query and query are the same then there were no filter modifiers
# so this isn't a filterable query. # so this isn't a filterable query.
if mod_query == query: if mod_query == query:

View file

@ -128,6 +128,7 @@ def set_use_primary_find_in_search(toWhat):
global pref_use_primary_find_in_search global pref_use_primary_find_in_search
pref_use_primary_find_in_search = toWhat pref_use_primary_find_in_search = toWhat
y, c, n, u = map(icu_lower, (_('yes'), _('checked'), _('no'), _('unchecked'))) y, c, n, u = map(icu_lower, (_('yes'), _('checked'), _('no'), _('unchecked')))
yes_vals = {y, c, 'true'} yes_vals = {y, c, 'true'}
no_vals = {n, u, 'false'} no_vals = {n, u, 'false'}
@ -1215,5 +1216,3 @@ def itervals(self, record):
# }}} # }}}
# }}} # }}}

View file

@ -241,7 +241,7 @@ def make_bibtex_citation(entry, template_citation, bibtexclass):
# define a function to replace the template entry by its value # define a function to replace the template entry by its value
def tpl_replace(objtplname) : def tpl_replace(objtplname) :
tpl_field = re.sub(u'[\{\}]', u'', objtplname.group()) tpl_field = re.sub(u'[\\{\\}]', u'', objtplname.group())
if tpl_field in TEMPLATE_ALLOWED_FIELDS : if tpl_field in TEMPLATE_ALLOWED_FIELDS :
if tpl_field in ['pubdate', 'timestamp'] : if tpl_field in ['pubdate', 'timestamp'] :
@ -258,14 +258,14 @@ def tpl_replace(objtplname) :
if len(template_citation) >0 : if len(template_citation) >0 :
tpl_citation = bibtexclass.utf8ToBibtex( tpl_citation = bibtexclass.utf8ToBibtex(
bibtexclass.ValidateCitationKey(re.sub(u'\{[^{}]*\}', bibtexclass.ValidateCitationKey(re.sub(u'\\{[^{}]*\\}',
tpl_replace, template_citation))) tpl_replace, template_citation)))
if len(tpl_citation) >0 : if len(tpl_citation) >0 :
return tpl_citation return tpl_citation
if len(entry["isbn"]) > 0 : if len(entry["isbn"]) > 0 :
template_citation = u'%s' % re.sub(u'[\D]',u'', entry["isbn"]) template_citation = u'%s' % re.sub(u'[\\D]',u'', entry["isbn"])
else : else :
template_citation = u'%s' % str(entry["id"]) template_citation = u'%s' % str(entry["id"])

View file

@ -154,9 +154,9 @@ def run(self, path_to_output, opts, db, notification=DummyReporter()):
# Convert HTML to markdown text # Convert HTML to markdown text
if type(item) is unicode: if type(item) is unicode:
opening_tag = re.search('<(\w+)(\x20|>)', item) opening_tag = re.search('<(\\w+)(\x20|>)', item)
if opening_tag: if opening_tag:
closing_tag = re.search('<\/%s>$' % opening_tag.group(1), item) closing_tag = re.search('<\\/%s>$' % opening_tag.group(1), item)
if closing_tag: if closing_tag:
item = html2text(item) item = html2text(item)

View file

@ -61,7 +61,7 @@ class EPUB_MOBI(CatalogPlugin):
"Default: '%default'\n" "Default: '%default'\n"
"Applies to: AZW3, EPUB, MOBI output formats")), "Applies to: AZW3, EPUB, MOBI output formats")),
Option('--exclude-genre', Option('--exclude-genre',
default='\[.+\]|^\+$', default='\\[.+\\]|^\\+$',
dest='exclude_genre', dest='exclude_genre',
action=None, action=None,
help=_("Regex describing tags to exclude as genres.\n" help=_("Regex describing tags to exclude as genres.\n"

View file

@ -1209,11 +1209,11 @@ def _normalize_tag(tag, max_len):
clipped to max_len clipped to max_len
""" """
normalized = massaged = re.sub('\s', '', ascii_text(tag).lower()) normalized = massaged = re.sub('\\s', '', ascii_text(tag).lower())
if re.search('\W', normalized): if re.search('\\W', normalized):
normalized = '' normalized = ''
for c in massaged: for c in massaged:
if re.search('\W', c): if re.search('\\W', c):
normalized += self.generate_unicode_name(c) normalized += self.generate_unicode_name(c)
else: else:
normalized += c normalized += c
@ -1376,7 +1376,7 @@ def generate_author_anchor(self, author):
Return: Return:
(str): asciized version of author (str): asciized version of author
""" """
return re.sub("\W", "", ascii_text(author)) return re.sub("\\W", "", ascii_text(author))
def generate_format_args(self, book): def generate_format_args(self, book):
""" Generate the format args for template substitution. """ Generate the format args for template substitution.
@ -4209,9 +4209,9 @@ def generate_series_anchor(self, series):
# Generate a legal XHTML id/href string # Generate a legal XHTML id/href string
if self.letter_or_symbol(series) == self.SYMBOLS: if self.letter_or_symbol(series) == self.SYMBOLS:
return "symbol_%s_series" % re.sub('\W', '', series).lower() return "symbol_%s_series" % re.sub('\\W', '', series).lower()
else: else:
return "%s_series" % re.sub('\W', '', ascii_text(series)).lower() return "%s_series" % re.sub('\\W', '', ascii_text(series)).lower()
def generate_short_description(self, description, dest=None): def generate_short_description(self, description, dest=None):
""" Generate a truncated version of the supplied string. """ Generate a truncated version of the supplied string.
@ -4292,7 +4292,7 @@ def generate_sort_title(self, title):
else: else:
if re.match('[0-9]+', word[0]): if re.match('[0-9]+', word[0]):
word = word.replace(',', '') word = word.replace(',', '')
suffix = re.search('[\D]', word) suffix = re.search('[\\D]', word)
if suffix: if suffix:
word = '%10.0f%s' % (float(word[:suffix.start()]), word[suffix.start():]) word = '%10.0f%s' % (float(word[:suffix.start()]), word[suffix.start():])
else: else:
@ -4308,7 +4308,7 @@ def generate_sort_title(self, title):
else: else:
if re.search('[0-9]+', word[0]): if re.search('[0-9]+', word[0]):
word = word.replace(',', '') word = word.replace(',', '')
suffix = re.search('[\D]', word) suffix = re.search('[\\D]', word)
if suffix: if suffix:
word = '%10.0f%s' % (float(word[:suffix.start()]), word[suffix.start():]) word = '%10.0f%s' % (float(word[:suffix.start()]), word[suffix.start():])
else: else:
@ -4638,7 +4638,7 @@ def massage_comments(self, comments):
# confusion with decimal points. # confusion with decimal points.
# Explode lost CRs to \n\n # Explode lost CRs to \n\n
for lost_cr in re.finditer('([a-z])([\.\?!])([A-Z])', comments): for lost_cr in re.finditer('([a-z])([\\.\\?!])([A-Z])', comments):
comments = comments.replace(lost_cr.group(), comments = comments.replace(lost_cr.group(),
'%s%s\n\n%s' % (lost_cr.group(1), '%s%s\n\n%s' % (lost_cr.group(1),
lost_cr.group(2), lost_cr.group(2),

View file

@ -90,8 +90,8 @@ def numberTranslate(self):
# Special case ordinals # Special case ordinals
if re.search('[st|nd|rd|th]',self.number): if re.search('[st|nd|rd|th]',self.number):
self.number = re.sub(',','',self.number) self.number = re.sub(',','',self.number)
ordinal_suffix = re.search('[\D]', self.number) ordinal_suffix = re.search('[\\D]', self.number)
ordinal_number = re.sub('\D','',re.sub(',','',self.number)) ordinal_number = re.sub('\\D','',re.sub(',','',self.number))
if self.verbose: if self.verbose:
self.log("Ordinal: %s" % ordinal_number) self.log("Ordinal: %s" % ordinal_number)
self.number_as_float = ordinal_number self.number_as_float = ordinal_number
@ -120,7 +120,7 @@ def numberTranslate(self):
self.text = NumberToText(self.number.replace('%',' percent')).text self.text = NumberToText(self.number.replace('%',' percent')).text
# Test for decimal # Test for decimal
elif re.search('\.',self.number): elif re.search('\\.',self.number):
if self.verbose: if self.verbose:
self.log("Decimal: %s" % self.number) self.log("Decimal: %s" % self.number)
self.number_as_float = self.number self.number_as_float = self.number
@ -151,12 +151,12 @@ def numberTranslate(self):
self.text = NumberToText(self.number_as_float).text self.text = NumberToText(self.number_as_float).text
# Test for hybrid e.g., 'K2, 2nd, 10@10' # Test for hybrid e.g., 'K2, 2nd, 10@10'
elif re.search('[\D]+', self.number): elif re.search('[\\D]+', self.number):
if self.verbose: if self.verbose:
self.log("Hybrid: %s" % self.number) self.log("Hybrid: %s" % self.number)
# Split the token into number/text # Split the token into number/text
number_position = re.search('\d',self.number).start() number_position = re.search('\\d',self.number).start()
text_position = re.search('\D',self.number).start() text_position = re.search('\\D',self.number).start()
if number_position < text_position: if number_position < text_position:
number = self.number[:text_position] number = self.number[:text_position]
text = self.number[text_position:] text = self.number[text_position:]
@ -225,4 +225,3 @@ def numberTranslate(self):
self.log(u'resultString: %s' % resultString) self.log(u'resultString: %s' % resultString)
self.text = resultString.strip().capitalize() self.text = resultString.strip().capitalize()
# }}} # }}}

View file

@ -16,7 +16,7 @@
# Hackish - ignoring sentences ending or beginning in numbers to avoid # Hackish - ignoring sentences ending or beginning in numbers to avoid
# confusion with decimal points. # confusion with decimal points.
lost_cr_pat = re.compile('([a-z])([\.\?!])([A-Z])') lost_cr_pat = re.compile('([a-z])([\\.\\?!])([A-Z])')
lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])') lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])')
sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe', sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe',
re.IGNORECASE) re.IGNORECASE)

View file

@ -657,7 +657,7 @@ def create_custom_column(self, label, name, datatype, is_multiple,
editable=True, display={}): editable=True, display={}):
if not label: if not label:
raise ValueError(_('No label was provided')) raise ValueError(_('No label was provided'))
if re.match('^\w*$', label) is None or not label[0].isalpha() or label.lower() != label: if re.match('^\\w*$', label) is None or not label[0].isalpha() or label.lower() != label:
raise ValueError(_('The label must contain only lower case letters, digits and underscores, and start with a letter')) raise ValueError(_('The label must contain only lower case letters, digits and underscores, and start with a letter'))
if datatype not in self.CUSTOM_DATA_TYPES: if datatype not in self.CUSTOM_DATA_TYPES:
raise ValueError('%r is not a supported data type'%datatype) raise ValueError('%r is not a supported data type'%datatype)
@ -809,5 +809,3 @@ def create_custom_column(self, label, name, datatype, is_multiple,
self.conn.executescript(script) self.conn.executescript(script)
self.conn.commit() self.conn.commit()
return num return num

View file

@ -50,7 +50,7 @@ def _connect(path):
conn = sqlite.connect(path, factory=Connection, detect_types=sqlite.PARSE_DECLTYPES|sqlite.PARSE_COLNAMES) conn = sqlite.connect(path, factory=Connection, detect_types=sqlite.PARSE_DECLTYPES|sqlite.PARSE_COLNAMES)
conn.row_factory = lambda cursor, row : list(row) conn.row_factory = lambda cursor, row : list(row)
conn.create_aggregate('concat', 1, Concatenate) conn.create_aggregate('concat', 1, Concatenate)
title_pat = re.compile('^(A|The|An)\s+', re.IGNORECASE) title_pat = re.compile('^(A|The|An)\\s+', re.IGNORECASE)
def title_sort(title): def title_sort(title):
match = title_pat.search(title) match = title_pat.search(title)
@ -1514,6 +1514,7 @@ def text_to_tokens(text):
continue continue
return ans, OR return ans, OR
if __name__ == '__main__': if __name__ == '__main__':
sqlite.enable_callback_tracebacks(True) sqlite.enable_callback_tracebacks(True)
db = LibraryDatabase('/home/kovid/temp/library1.db.orig') db = LibraryDatabase('/home/kovid/temp/library1.db.orig')

View file

@ -68,6 +68,7 @@ def _py_convert_timestamp(val):
return parse_date(val, as_utc=False) return parse_date(val, as_utc=False)
return None return None
convert_timestamp = _py_convert_timestamp if _c_speedup is None else \ convert_timestamp = _py_convert_timestamp if _c_speedup is None else \
_c_convert_timestamp _c_convert_timestamp
@ -75,6 +76,7 @@ def _py_convert_timestamp(val):
def adapt_datetime(dt): def adapt_datetime(dt):
return isoformat(dt, sep=' ') return isoformat(dt, sep=' ')
sqlite.register_adapter(datetime, adapt_datetime) sqlite.register_adapter(datetime, adapt_datetime)
sqlite.register_converter('timestamp', convert_timestamp) sqlite.register_converter('timestamp', convert_timestamp)
@ -82,6 +84,7 @@ def adapt_datetime(dt):
def convert_bool(val): def convert_bool(val):
return val != '0' return val != '0'
sqlite.register_adapter(bool, lambda x : 1 if x else 0) sqlite.register_adapter(bool, lambda x : 1 if x else 0)
sqlite.register_converter('bool', convert_bool) sqlite.register_converter('bool', convert_bool)
sqlite.register_converter('BOOL', convert_bool) sqlite.register_converter('BOOL', convert_bool)
@ -411,4 +414,3 @@ def test():
c = sqlite.connect(':memory:') c = sqlite.connect(':memory:')
if load_c_extensions(c, True): if load_c_extensions(c, True):
print('Loaded C extension successfully') print('Loaded C extension successfully')

View file

@ -64,6 +64,7 @@ def extract_member(filename, match=re.compile(r'\.(jpg|jpeg|gif|png)\s*$', re.I)
if match.search(name): if match.search(name):
return name, zf.read(name) return name, zf.read(name)
comic_exts = {'png', 'jpg', 'jpeg', 'gif', 'webp'} comic_exts = {'png', 'jpg', 'jpeg', 'gif', 'webp'}

View file

@ -14,6 +14,8 @@
def pre_activated_socket(): def pre_activated_socket():
return None return None
has_preactivated_support = False has_preactivated_support = False
if islinux: if islinux:

View file

@ -73,7 +73,7 @@ def check_for_critical_bugs():
print('WARNING: Translation errors detected') print('WARNING: Translation errors detected')
print('See the .errors directory and http://translate.sourceforge.net/wiki/toolkit/using_pofilter') print('See the .errors directory and http://translate.sourceforge.net/wiki/toolkit/using_pofilter')
if __name__ == '__main__': if __name__ == '__main__':
import sys import sys
import_from_launchpad(sys.argv[1]) import_from_launchpad(sys.argv[1])

View file

@ -146,7 +146,7 @@ def make(filename, outfile):
# This is a message with plural forms # This is a message with plural forms
elif l.startswith('msgid_plural'): elif l.startswith('msgid_plural'):
if section != ID: if section != ID:
print('msgid_plural not preceeded by msgid on %s:%d' %\ print('msgid_plural not preceeded by msgid on %s:%d' %
(infile, lno), file=sys.stderr) (infile, lno), file=sys.stderr)
sys.exit(1) sys.exit(1)
l = l[12:] l = l[12:]
@ -157,7 +157,7 @@ def make(filename, outfile):
section = STR section = STR
if l.startswith('msgstr['): if l.startswith('msgstr['):
if not is_plural: if not is_plural:
print('plural without msgid_plural on %s:%d' %\ print('plural without msgid_plural on %s:%d' %
(infile, lno), file=sys.stderr) (infile, lno), file=sys.stderr)
sys.exit(1) sys.exit(1)
l = l.split(']', 1)[1] l = l.split(']', 1)[1]
@ -165,7 +165,7 @@ def make(filename, outfile):
msgstr += '\0' # Separator of the various plural forms msgstr += '\0' # Separator of the various plural forms
else: else:
if is_plural: if is_plural:
print('indexed msgstr required for plural on %s:%d' %\ print('indexed msgstr required for plural on %s:%d' %
(infile, lno), file=sys.stderr) (infile, lno), file=sys.stderr)
sys.exit(1) sys.exit(1)
l = l[6:] l = l[6:]
@ -180,7 +180,7 @@ def make(filename, outfile):
elif section == STR: elif section == STR:
msgstr += l msgstr += l
else: else:
print('Syntax error on %s:%d' % (infile, lno), \ print('Syntax error on %s:%d' % (infile, lno),
'before:', file=sys.stderr) 'before:', file=sys.stderr)
print(l, file=sys.stderr) print(l, file=sys.stderr)
sys.exit(1) sys.exit(1)

View file

@ -1627,6 +1627,7 @@ def close(self):
# Test a few module features, including service registration, service # Test a few module features, including service registration, service
# query (for Zoe), and service unregistration. # query (for Zoe), and service unregistration.
if __name__ == '__main__': if __name__ == '__main__':
print("Multicast DNS Service Discovery for Python, version", __version__) print("Multicast DNS Service Discovery for Python, version", __version__)
r = Zeroconf() r = Zeroconf()

View file

@ -49,6 +49,7 @@ def allowed(x):
def py_clean_xml_chars(unicode_string): def py_clean_xml_chars(unicode_string):
return u''.join(filter(allowed, unicode_string)) return u''.join(filter(allowed, unicode_string))
clean_xml_chars = native_clean_xml_chars or py_clean_xml_chars clean_xml_chars = native_clean_xml_chars or py_clean_xml_chars
@ -85,5 +86,4 @@ def fixup(m, rm=rm, rchar=rchar):
if rm: if rm:
return rchar # replace by char return rchar # replace by char
return text # leave as is return text # leave as is
return re.sub("&#?\w+;", fixup, text) return re.sub("&#?\\w+;", fixup, text)

View file

@ -30,6 +30,7 @@ def check_thread(self, *args, **kwargs):
return func(self, *args, **kwargs) return func(self, *args, **kwargs)
return check_thread return check_thread
FreeTypeError = getattr(plugins['freetype'][0], 'FreeTypeError', Exception) FreeTypeError = getattr(plugins['freetype'][0], 'FreeTypeError', Exception)
@ -80,5 +81,3 @@ def __init__(self):
@same_thread @same_thread
def load_font(self, data): def load_font(self, data):
return Face(self.ft.load_font(data)) return Face(self.ft.load_font(data))

View file

@ -113,6 +113,7 @@ def width(self, string, pixel_size=12.0, stretch=1.0):
'The width of the string at the specified pixel size and stretch, in pixels' 'The width of the string at the specified pixel size and stretch, in pixels'
return sum(self.advance_widths(string, pixel_size, stretch)) return sum(self.advance_widths(string, pixel_size, stretch))
if __name__ == '__main__': if __name__ == '__main__':
import sys import sys
from calibre.utils.fonts.sfnt.container import Sfnt from calibre.utils.fonts.sfnt.container import Sfnt
@ -127,4 +128,3 @@ def width(self, string, pixel_size=12.0, stretch=1.0):
print('AvgWidth:', m.pdf_avg_width) print('AvgWidth:', m.pdf_avg_width)
print('ItalicAngle', m.post.italic_angle) print('ItalicAngle', m.post.italic_angle)
print('StemV', m.pdf_stemv) print('StemV', m.pdf_stemv)

View file

@ -640,7 +640,7 @@ class BuiltinReGroup(BuiltinFormatterFunction):
'the template and the eval functions, you use [[ for { and ]] for }.' 'the template and the eval functions, you use [[ for { and ]] for }.'
' The following example in template program mode looks for series ' ' The following example in template program mode looks for series '
'with more than one word and uppercases the first word: ' 'with more than one word and uppercases the first word: '
"{series:'re_group($, \"(\S* )(.*)\", \"[[$:uppercase()]]\", \"[[$]]\")'}") "{series:'re_group($, \"(\\S* )(.*)\", \"[[$:uppercase()]]\", \"[[$]]\")'}")
def evaluate(self, formatter, kwargs, mi, locals, val, pattern, *args): def evaluate(self, formatter, kwargs, mi, locals, val, pattern, *args):
from formatter import EvalFormatter from formatter import EvalFormatter
@ -924,9 +924,9 @@ class BuiltinSublist(BuiltinFormatterFunction):
'of zero is assumed to be the length of the list. Examples using ' 'of zero is assumed to be the length of the list. Examples using '
'basic template mode and assuming that the tags column (which is ' 'basic template mode and assuming that the tags column (which is '
'comma-separated) contains "A, B, C": ' 'comma-separated) contains "A, B, C": '
'{tags:sublist(0,1,\,)} returns "A". ' '{tags:sublist(0,1,\\,)} returns "A". '
'{tags:sublist(-1,0,\,)} returns "C". ' '{tags:sublist(-1,0,\\,)} returns "C". '
'{tags:sublist(0,-1,\,)} returns "A, B".' '{tags:sublist(0,-1,\\,)} returns "A, B".'
) )
def evaluate(self, formatter, kwargs, mi, locals, val, start_index, end_index, sep): def evaluate(self, formatter, kwargs, mi, locals, val, start_index, end_index, sep):

View file

@ -26,6 +26,7 @@ class DirTooLarge(ValueError):
def __init__(self, bdir): def __init__(self, bdir):
ValueError.__init__(self, 'The directory {0} is too large to monitor. Try increasing the value in /proc/sys/fs/inotify/max_user_watches'.format(bdir)) ValueError.__init__(self, 'The directory {0} is too large to monitor. Try increasing the value in /proc/sys/fs/inotify/max_user_watches'.format(bdir))
_inotify = None _inotify = None
@ -320,6 +321,7 @@ def __call__(self):
self.modified = set() self.modified = set()
return ret return ret
if __name__ == '__main__': if __name__ == '__main__':
w = INotifyTreeWatcher(sys.argv[-1]) w = INotifyTreeWatcher(sys.argv[-1])
w() w()

View file

@ -87,6 +87,7 @@ def returncode(self):
class CriticalError(Exception): class CriticalError(Exception):
pass pass
_name_counter = itertools.count() _name_counter = itertools.count()
if islinux: if islinux:
@ -384,4 +385,3 @@ def __enter__(self):
def __exit__(self, *args): def __exit__(self, *args):
self.close() self.close()

View file

@ -85,5 +85,6 @@ def main(args=sys.argv):
f.close() f.close()
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())

View file

@ -292,7 +292,7 @@ def print_result(request, result):
# this will be called when an exception occurs within a thread # this will be called when an exception occurs within a thread
def handle_exception(request, exc_info): def handle_exception(request, exc_info):
print("Exception occured in request #%s: %s" % \ print("Exception occured in request #%s: %s" %
(request.requestID, exc_info[1])) (request.requestID, exc_info[1]))
# assemble the arguments for each job to a list... # assemble the arguments for each job to a list...

View file

@ -15,7 +15,7 @@
__all__ = ['titlecase'] __all__ = ['titlecase']
__version__ = '0.5' __version__ = '0.5'
SMALL = 'a|an|and|as|at|but|by|en|for|if|in|of|on|or|the|to|v\.?|via|vs\.?' SMALL = 'a|an|and|as|at|but|by|en|for|if|in|of|on|or|the|to|v\\.?|via|vs\\.?'
PUNCT = r"""!"#$%&'()*+,\-‒–—―./:;?@[\\\]_`{|}~""" PUNCT = r"""!"#$%&'()*+,\-‒–—―./:;?@[\\\]_`{|}~"""
SMALL_WORDS = re.compile(r'^(%s)$' % SMALL, re.I) SMALL_WORDS = re.compile(r'^(%s)$' % SMALL, re.I)
@ -54,7 +54,7 @@ def titlecase(text):
all_caps = upper(text) == text all_caps = upper(text) == text
words = re.split('\s+', text) words = re.split('\\s+', text)
line = [] line = []
for word in words: for word in words:
if all_caps: if all_caps:

View file

@ -1,6 +1,5 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
from __future__ import with_statement from __future__ import with_statement, print_function
from __future__ import print_function
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
@ -110,7 +109,7 @@ def default_is_link_wanted(url, tag):
class RecursiveFetcher(object): class RecursiveFetcher(object):
LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$')) ('.exe\\s*$', '.mp3\\s*$', '.ogg\\s*$', '^\\s*mailto:', '^\\s*$'))
# ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in # ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
# ( # (
# #