diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index 4dd6e7c7ae..6f8e94f180 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -16,7 +16,6 @@
from lxml import etree
-from calibre import guess_type
from calibre import prepare_string_for_xml
from calibre.constants import __appname__, __version__
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
@@ -41,7 +40,7 @@ def reset_state(self):
# in different directories. FB2 images are all in a flat layout so we rename all images
# into a sequential numbering system to ensure there are no collisions between image names.
self.image_hrefs = {}
- # Mapping of toc items and their
+ # Mapping of toc items and their
self.toc = {}
# Used to see whether a new \s*
', '
\n\n', text) - + text = re.sub(r'(?miu)
some textile
' + """ + self.html_type = html_type + + # text = unicode(text) + text = _normalize_newlines(text) + + if self.restricted: + text = self.encode_html(text, quotes=False) + + if rel: + self.rel = ' rel="%s"' % rel + + text = self.getRefs(text) + + text = self.block(text, int(head_offset)) + + text = self.retrieve(text) + + return text + + def pba(self, input, element=None): + """ + Parse block attributes. + + >>> t = Textile() + >>> t.pba(r'\3') + '' + >>> t.pba(r'\\3', element='td') + ' colspan="3"' + >>> t.pba(r'/4', element='td') + ' rowspan="4"' + >>> t.pba(r'\\3/4', element='td') + ' colspan="3" rowspan="4"' + + >>> t.vAlign('^') + 'top' + + >>> t.pba('^', element='td') + ' style="vertical-align:top;"' + + >>> t.pba('{line-height:18px}') + ' style="line-height:18px;"' + + >>> t.pba('(foo-bar)') + ' class="foo-bar"' + + >>> t.pba('(#myid)') + ' id="myid"' + + >>> t.pba('(foo-bar#myid)') + ' class="foo-bar" id="myid"' + + >>> t.pba('((((') + ' style="padding-left:4em;"' + + >>> t.pba(')))') + ' style="padding-right:3em;"' + + >>> t.pba('[fr]') + ' lang="fr"' + + """ + style = [] + aclass = '' + lang = '' + colspan = '' + rowspan = '' + id = '' + + if not input: + return '' + + matched = input + if element == 'td': + m = re.search(r'\\(\d+)', matched) + if m: + colspan = m.group(1) + + m = re.search(r'/(\d+)', matched) + if m: + rowspan = m.group(1) + + if element == 'td' or element == 'tr': + m = re.search(r'(%s)' % self.vlgn, matched) + if m: + style.append("vertical-align:%s;" % self.vAlign(m.group(1))) + + m = re.search(r'\{([^}]*)\}', matched) + if m: + style.append(m.group(1).rstrip(';') + ';') + matched = matched.replace(m.group(0), '') + + m = re.search(r'\[([^\]]+)\]', matched, re.U) + if m: + lang = m.group(1) + matched = matched.replace(m.group(0), '') + + m = re.search(r'\(([^()]+)\)', matched, re.U) + if m: + aclass = m.group(1) + matched = matched.replace(m.group(0), '') + + m = re.search(r'([(]+)', matched) + if m: + style.append("padding-left:%sem;" 
% len(m.group(1))) + matched = matched.replace(m.group(0), '') + + m = re.search(r'([)]+)', matched) + if m: + style.append("padding-right:%sem;" % len(m.group(1))) + matched = matched.replace(m.group(0), '') + + m = re.search(r'(%s)' % self.hlgn, matched) + if m: + style.append("text-align:%s;" % self.hAlign(m.group(1))) + + m = re.search(r'^(.*)#(.*)$', aclass) + if m: + id = m.group(2) + aclass = m.group(1) + + if self.restricted: + if lang: + return ' lang="%s"' + else: + return '' + + result = [] + if style: + result.append(' style="%s"' % "".join(style)) + if aclass: + result.append(' class="%s"' % aclass) + if lang: + result.append(' lang="%s"' % lang) + if id: + result.append(' id="%s"' % id) + if colspan: + result.append(' colspan="%s"' % colspan) + if rowspan: + result.append(' rowspan="%s"' % rowspan) + return ''.join(result) + + def hasRawText(self, text): + """ + checks whether the text has text not already enclosed by a block tag + + >>> t = Textile() + >>> t.hasRawText('foo bar biz baz
') + False + + >>> t.hasRawText(' why yes, yes it does') + True + + """ + r = re.compile(r'<(p|blockquote|div|form|table|ul|ol|pre|h\d)[^>]*?>.*\1>', re.S).sub('', text.strip()).strip() + r = re.compile(r'<(hr|br)[^>]*?/>').sub('', r) + return '' != r + + def table(self, text): + r""" + >>> t = Textile() + >>> t.table('|one|two|three|\n|a|b|c|') + '\t| one | \n\t\t\ttwo | \n\t\t\tthree | \n\t\t
| a | \n\t\t\tb | \n\t\t\tc | \n\t\t
\\n', '\\t\\t') + + >>> t.fBlock("bq", "", None, "http://google.com", "Hello BlockQuote") + ('\\t', 'Hello BlockQuote', '
', '\\n\\t
\\n', '\\t\\t') + + >>> t.fBlock("bc", "", None, "", 'printf "Hello, World";') # doctest: +ELLIPSIS + ('', 'Hello BlockQuote', '
', '\\n\\t
', '', ..., '', '')
+
+ >>> t.fBlock("h1", "", None, "", "foobar")
+ ('', '\\t\n" % (cite, atts) + o2 = "\t\t" + + elif tag == 'bc': + o1 = "" % atts + c2 = "
" + c1 = "\n\t
" % atts
+ o2 = "" % atts
+ c2 = ""
+ c1 = ""
+ content = self.shelve(self.encode_html(content.rstrip("\n") + "\n"))
+
+ elif tag == 'notextile':
+ content = self.shelve(content)
+ o1 = o2 = ''
+ c1 = c2 = ''
+
+ elif tag == 'pre':
+ content = self.shelve(self.encode_html(content.rstrip("\n") + "\n"))
+ o1 = "" % atts + o2 = c2 = '' + c1 = '' + + else: + o2 = "\t<%s%s>" % (tag, atts) + c2 = "%s>" % tag + + content = self.graf(content) + return o1, o2, content, c2, c1 + + def footnoteRef(self, text): + """ + >>> t = Textile() + >>> t.footnoteRef('foo[1] ') # doctest: +ELLIPSIS + 'foo1 ' + """ + return re.sub(r'\b\[([0-9]+)\](\s)?', self.footnoteID, text) + + def footnoteID(self, match): + id, t = match.groups() + if id not in self.fn: + self.fn[id] = str(uuid.uuid4()) + fnid = self.fn[id] + if not t: + t = '' + return '%s%s' % (fnid, id, t) + + def glyphs(self, text): + """ + >>> t = Textile() + + >>> t.glyphs("apostrophe's") + 'apostrophe’s' + + >>> t.glyphs("back in '88") + 'back in ’88' + + >>> t.glyphs('foo ...') + 'foo …' + + >>> t.glyphs('--') + '—' + + >>> t.glyphs('FooBar[tm]') + 'FooBar™' + + >>> t.glyphs("
Cat's Cradle by Vonnegut
") + 'Cat’s Cradle by Vonnegut
' + + """ + # fix: hackish + text = re.sub(r'"\Z', '\" ', text) + + glyph_search = ( + re.compile(r"(\w)\'(\w)"), # apostrophe's + re.compile(r'(\s)\'(\d+\w?)\b(?!\')'), # back in '88 + re.compile(r'(\S)\'(?=\s|'+self.pnct+'|<|$)'), # single closing + re.compile(r'\'/'), # single opening + re.compile(r'(\S)\"(?=\s|'+self.pnct+'|<|$)'), # double closing + re.compile(r'"'), # double opening + re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), # 3+ uppercase acronym + re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), # 3+ uppercase + re.compile(r'\b(\s{0,1})?\.{3}'), # ellipsis + re.compile(r'(\s?)--(\s?)'), # em dash + re.compile(r'\s-(?:\s|$)'), # en dash + re.compile(r'(\d+)( ?)x( ?)(?=\d+)'), # dimension sign + re.compile(r'\b ?[([]TM[])]', re.I), # trademark + re.compile(r'\b ?[([]R[])]', re.I), # registered + re.compile(r'\b ?[([]C[])]', re.I), # copyright + ) + + glyph_replace = [x % dict(self.glyph_defaults) for x in ( + r'\1%(txt_apostrophe)s\2', # apostrophe's + r'\1%(txt_apostrophe)s\2', # back in '88 + r'\1%(txt_quote_single_close)s', # single closing + r'%(txt_quote_single_open)s', # single opening + r'\1%(txt_quote_double_close)s', # double closing + r'%(txt_quote_double_open)s', # double opening + r'\1', # 3+ uppercase acronym + r'\1', # 3+ uppercase + r'\1%(txt_ellipsis)s', # ellipsis + r'\1%(txt_emdash)s\2', # em dash + r' %(txt_endash)s ', # en dash + r'\1\2%(txt_dimension)s\3', # dimension sign + r'%(txt_trademark)s', # trademark + r'%(txt_registered)s', # registered + r'%(txt_copyright)s', # copyright + )] + + result = [] + for line in re.compile(r'(<.*?>)', re.U).split(text): + if not re.search(r'<.*>', line): + for s, r in zip(glyph_search, glyph_replace): + line = s.sub(r, line) + result.append(line) + return ''.join(result) + + def vAlign(self, input): + d = {'^':'top', '-':'middle', '~':'bottom'} + return d.get(input, '') + + def hAlign(self, input): + d = {'<':'left', '=':'center', '>':'right', '<>': 'justify'} + return d.get(input, 
'') + + def getRefs(self, text): + """ + what is this for? + """ + pattern = re.compile(r'(?:(?<=^)|(?<=\s))\[(.+)\]((?:http(?:s?):\/\/|\/)\S+)(?=\s|$)', re.U) + text = pattern.sub(self.refs, text) + return text + + def refs(self, match): + flag, url = match.groups() + self.urlrefs[flag] = url + return '' + + def checkRefs(self, url): + return self.urlrefs.get(url, url) + + def isRelURL(self, url): + """ + Identify relative urls. + + >>> t = Textile() + >>> t.isRelURL("http://www.google.com/") + False + >>> t.isRelURL("/foo") + True + + """ + (scheme, netloc) = urlparse(url)[0:2] + return not scheme and not netloc + + def relURL(self, url): + scheme = urlparse(url)[0] + if self.restricted and scheme and scheme not in self.url_schemes: + return '#' + return url + + def shelve(self, text): + id = str(uuid.uuid4()) + self.shelf[id] = text + return id + + def retrieve(self, text): + """ + >>> t = Textile() + >>> id = t.shelve("foobar") + >>> t.retrieve(id) + 'foobar' + """ + while True: + old = text + for k, v in self.shelf.items(): + text = text.replace(k, v) + if text == old: + break + return text + + def encode_html(self, text, quotes=True): + a = ( + ('&', '&'), + ('<', '<'), + ('>', '>') + ) + + if quotes: + a = a + ( + ("'", '''), + ('"', '"') + ) + + for k, v in a: + text = text.replace(k, v) + return text + + def graf(self, text): + if not self.lite: + text = self.noTextile(text) + text = self.code(text) + + text = self.links(text) + + if not self.noimage: + text = self.image(text) + + if not self.lite: + text = self.lists(text) + text = self.table(text) + + text = self.span(text) + text = self.footnoteRef(text) + text = self.glyphs(text) + + return text.rstrip('\n') + + def links(self, text): + """ + >>> t = Textile() + >>> t.links('fooobar "Google":http://google.com/foobar/ and hello world "flickr":http://flickr.com/photos/jsamsa/ ') # doctest: +ELLIPSIS + 'fooobar ... and hello world ...' 
+ """ + + punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' + + pattern = r''' + (?P [\s\[{(]|[%s] )?
+ " # start
+ (?P %s )
+ (?P [^"]+? )
+ \s?
+ (?: \(([^)]+?)\)(?=") )? # $title
+ ":
+ (?P (?:ftp|https?)? (?: :// )? [-A-Za-z0-9+&@#/?=~_()|!:,.;]*[-A-Za-z0-9+&@#/=~_()|] )
+ (?P [^\w\/;]*? )
+ (?=<|\s|$)
+ ''' % (re.escape(punct), self.c)
+
+ text = re.compile(pattern, re.X).sub(self.fLink, text)
+
+ return text
+
+    def fLink(self, match):
+        """
+        Build the anchor element for one match produced by links().
+
+        The match groups are, in order: leading text, block attributes,
+        link text, optional title, url and trailing text. The generated
+        markup is stored with shelve() and its placeholder id is returned
+        between the leading and trailing text.
+        """
+        pre, atts, text, title, url, post = match.groups()
+
+        if pre is None:
+            pre = ''
+
+        # assume ) at the end of the url is not actually part of the url
+        # unless the url also contains a (
+        if url.endswith(')') and '(' not in url:
+            post = url[-1] + post
+            url = url[:-1]
+
+        url = self.checkRefs(url)
+
+        atts = self.pba(atts)
+        if title:
+            atts = atts + ' title="%s"' % self.encode_html(title)
+
+        if not self.noimage:
+            text = self.image(text)
+
+        text = self.span(text)
+        text = self.glyphs(text)
+
+        url = self.relURL(url)
+        # four values are interpolated: href, attributes, rel and the link text
+        out = '<a href="%s"%s%s>%s</a>' % (self.encode_html(url), atts, self.rel, text)
+        out = self.shelve(out)
+        return ''.join([pre, out, post])
+
+    def span(self, text):
+        """
+        Replace inline phrase modifiers with the markup built by fSpan().
+
+        Each modifier is tried in turn, longer markers (``**``, ``__``)
+        before their single-character forms.
+
+        >>> t = Textile()
+        >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye")
+        'hello <span class="bob">span <strong>strong</strong> and <b>bold</b></span> goodbye'
+        """
+        markers = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^')
+        template = r"""
+            (?:^|(?<=[\s>%(pnct)s])|([\]}]))
+            (%(qtag)s)(?!%(qtag)s)
+            (%(c)s)
+            (?::(\S+))?
+            ([^\s%(qtag)s]+|\S[^%(qtag)s\n]*[^\s%(qtag)s\n])
+            ([%(pnct)s]*)
+            %(qtag)s
+            (?:$|([\]}])|(?=%(selfpnct)s{1,2}|\s))
+        """
+        fields = {'c': self.c, 'pnct': ".,\"'?!;:", 'selfpnct': self.pnct}
+
+        for marker in markers:
+            fields['qtag'] = marker
+            text = re.compile(template % fields, re.X).sub(self.fSpan, text)
+        return text
+
+
+    def fSpan(self, match):
+        """
+        Render one phrase-modifier match from span() as an HTML element.
+
+        The marker is mapped to its tag name, block attributes are parsed
+        with pba(), an optional cite is added as an attribute, and the
+        content is passed through span() again so nested modifiers are
+        converted too.
+        """
+        _, tag, atts, cite, content, end, _ = match.groups()
+
+        qtags = {
+            '*': 'strong',
+            '**': 'b',
+            '??': 'cite',
+            '_': 'em',
+            '__': 'i',
+            '-': 'del',
+            '%': 'span',
+            '+': 'ins',
+            '~': 'sub',
+            '^': 'sup',
+        }
+        tag = qtags[tag]
+        atts = self.pba(atts)
+        if cite:
+            # the leading space keeps cite apart from the tag name or the
+            # attribute that precedes it
+            atts = atts + ' cite="%s"' % cite
+
+        content = self.span(content)
+
+        out = "<%s%s>%s%s</%s>" % (tag, atts, content, end, tag)
+        return out
+
+    def image(self, text):
+        """
+        Replace ``!src(title)!:href`` image markup with the output of fImage().
+
+        >>> t = Textile()
+        >>> t.image('!/imgs/myphoto.jpg!:http://jsamsa.com')  # doctest: +ELLIPSIS
+        '<a href="http://jsamsa.com"...><img src="/imgs/myphoto.jpg" alt="" /></a>'
+        """
+        image_re = re.compile(r"""
+            (?:[\[{])?          # pre
+            \!                  # opening !
+            (%s)                # optional style,class atts
+            (?:\. )?            # optional dot-space
+            ([^\s(!]+)          # presume this is the src
+            \s?                 # optional space
+            (?:\(([^\)]+)\))?   # optional title
+            \!                  # closing
+            (?::(\S+))?         # optional href
+            (?:[\]}]|(?=\s|$))  # lookahead: space or end of string
+        """ % self.c, re.U | re.X)
+        return image_re.sub(self.fImage, text)
+
+    def fImage(self, match):
+        """
+        Build the ``<img>`` element (optionally wrapped in a link) for one
+        match produced by image().
+
+        The match groups are, in order: block attributes, image url,
+        optional title and optional link target.
+        """
+        atts, url, title, href = match.groups()
+        atts = self.pba(atts)
+
+        if title:
+            atts = atts + ' title="%s" alt="%s"' % (title, title)
+        else:
+            atts = atts + ' alt=""'
+
+        # image dimensions are only looked up for non-relative urls
+        if not self.isRelURL(url) and self.get_sizes:
+            size = getimagesize(url)
+            if size:
+                atts += " %s" % size
+
+        if href:
+            href = self.checkRefs(href)
+
+        url = self.checkRefs(url)
+        url = self.relURL(url)
+
+        out = []
+        if href:
+            out.append('<a href="%s" class="img">' % href)
+        if self.html_type == 'html':
+            out.append('<img src="%s"%s>' % (url, atts))
+        else:
+            # xhtml output needs the self-closing form
+            out.append('<img src="%s"%s />' % (url, atts))
+        if href:
+            out.append('</a>')
+
+        return ''.join(out)
+
+    def code(self, text):
+        """
+        Protect inline code and preformatted spans from further processing.
+
+        ``<code>...</code>`` and ``@...@`` are handled by fCode(),
+        ``<pre>...</pre>`` by fPre().
+        """
+        text = self.doSpecial(text, '<code>', '</code>', self.fCode)
+        text = self.doSpecial(text, '@', '@', self.fCode)
+        text = self.doSpecial(text, '<pre>', '</pre>', self.fPre)
+        return text
+
+    def fCode(self, match):
+        """
+        Wrap one doSpecial() match in ``<code>`` and shelve the result.
+        """
+        before, text, after = match.groups()
+        if after is None:
+            after = ''
+        # text needs to be escaped; in restricted mode the whole input has
+        # already been escaped by textile()
+        if not self.restricted:
+            text = self.encode_html(text)
+        return ''.join([before, self.shelve('<code>%s</code>' % text), after])
+
+    def fPre(self, match):
+        """
+        Wrap one doSpecial() match in ``<pre>``; only the content is shelved.
+        """
+        before, text, after = match.groups()
+        if after is None:
+            after = ''
+        # text needs to be escaped; in restricted mode the whole input has
+        # already been escaped by textile()
+        if not self.restricted:
+            text = self.encode_html(text)
+        return ''.join([before, '<pre>', self.shelve(text), '</pre>', after])
+
+    def doSpecial(self, text, start, end, method=None):
+        """
+        Run ``method`` over every ``start ... end`` delimited span in text.
+
+        The span must follow the start of a line, whitespace or an opening
+        bracket. ``method`` receives the match (groups: before, content,
+        after) and defaults to fSpecial.
+        """
+        handler = self.fSpecial if method is None else method
+        delimited = r'(^|\s|[\[({>])%s(.*?)%s(\s|$|[\])}])?' % (re.escape(start), re.escape(end))
+        return re.compile(delimited, re.M | re.S).sub(handler, text)
+
+    def fSpecial(self, match):
+        """
+        Default doSpecial() handler: escape the matched content, shelve it
+        and put the surrounding text back around the placeholder.
+        """
+        before, text, after = match.groups()
+        placeholder = self.shelve(self.encode_html(text))
+        return ''.join([before, placeholder, after or ''])
+
+    def noTextile(self, text):
+        """
+        Shelve spans that must not be processed as Textile:
+        ``<notextile>...</notextile>`` and ``==...==``.
+        """
+        text = self.doSpecial(text, '<notextile>', '</notextile>', self.fTextile)
+        return self.doSpecial(text, '==', '==', self.fTextile)
+
+    def fTextile(self, match):
+        """
+        doSpecial() handler for notextile spans: shelve the content unchanged.
+        """
+        before, notextile, after = match.groups()
+        placeholder = self.shelve(notextile)
+        return ''.join([before, placeholder, after or ''])
+
+
+def textile(text, head_offset=0, html_type='xhtml', encoding=None, output=None):
+    """
+    Convert Textile markup to HTML using a default Textile() instance.
+
+    this function takes additional parameters:
+    head_offset - offset to apply to heading levels (default: 0)
+    html_type - 'xhtml' or 'html' style tags (default: 'xhtml')
+
+    encoding and output are accepted but not used here.
+    """
+    converter = Textile()
+    return converter.textile(text, head_offset=head_offset, html_type=html_type)
+
+def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'):
+    """
+    Restricted version of Textile designed for weblog comments and other
+    untrusted input.
+
+    Raw HTML is escaped.
+    Style attributes are disabled.
+    rel='nofollow' is added to external links.
+
+    When lite=True is set (the default):
+    Block tags are restricted to p, bq, and bc.
+    Lists and tables are disabled.
+
+    When noimage=True is set (the default):
+    Image tags are disabled.
+
+    """
+    converter = Textile(restricted=True, lite=lite, noimage=noimage)
+    return converter.textile(text, rel='nofollow', html_type=html_type)
+
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index aaff8b55c0..0b0bd6d570 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -12,7 +12,7 @@
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
- convert_heuristic, normalize_line_endings
+ convert_heuristic, normalize_line_endings, convert_textile
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@@ -41,6 +41,7 @@ class TXTInput(InputFormatPlugin):
'paragraph and no styling is applied.\n'
'* heuristic: Process using heuristics to determine formatting such '
'as chapter headings and italic text.\n'
+ '* textile: Processing using textile formatting.\n'
'* markdown: Processing using markdown formatting. '
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
@@ -91,6 +92,9 @@ def convert(self, stream, options, file_ext, log,
except RuntimeError:
raise ValueError('This txt file has malformed markup, it cannot be'
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
+ elif options.formatting_type == 'textile':
+ log.debug('Running text though textile conversion...')
+ html = convert_textile(txt)
else:
# Determine the paragraph type of the document.
if options.paragraph_type == 'auto':
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 6a1a106681..3702bbfabe 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -7,7 +7,6 @@
import os, re
from calibre import prepare_string_for_xml, isbytestring
-from calibre.ebooks.markdown import markdown
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
from calibre.ebooks.conversion.preprocess import DocAnalysis
@@ -37,7 +36,7 @@ def clean_txt(txt):
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
txt = illegal_chars.sub('', txt)
-
+
return txt
def split_txt(txt, epub_split_size_kb=0):
@@ -74,12 +73,18 @@ def convert_heuristic(txt, title='', epub_split_size_kb=0):
return tp.convert(txt, title, epub_split_size_kb)
def convert_markdown(txt, title='', disable_toc=False):
+ from calibre.ebooks.markdown import markdown
md = markdown.Markdown(
extensions=['footnotes', 'tables', 'toc'],
extension_configs={"toc": {"disable_toc": disable_toc}},
safe_mode=False)
return HTML_TEMPLATE % (title, md.convert(txt))
+def convert_textile(txt, title=''):
+    """
+    Convert Textile formatted text to HTML and wrap it in HTML_TEMPLATE.
+    """
+    # imported here so the textile module is only loaded when it is needed
+    from calibre.ebooks.textile import textile
+    body = textile(txt, encoding='utf-8')
+    return HTML_TEMPLATE % (title, body)
+
def normalize_line_endings(txt):
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
@@ -115,66 +120,75 @@ def split_string_separator(txt, size) :
def detect_paragraph_type(txt):
'''
Tries to determine the formatting of the document.
-
+
block: Paragraphs are separated by a blank line.
single: Each line is a paragraph.
print: Each paragraph starts with a 2+ spaces or a tab
and ends when a new paragraph is reached.
unformatted: most lines have hard line breaks, few/no blank lines or indents
-
+
returns block, single, print, unformatted
'''
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
-
+
# Check for hard line breaks - true if 55% of the doc breaks in the same region
docanalysis = DocAnalysis('txt', txt)
hardbreaks = docanalysis.line_histogram(.55)
-
+
if hardbreaks:
# Determine print percentage
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
print_percent = tab_line_count / float(txt_line_count)
-
+
# Determine block percentage
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
block_percent = empty_line_count / float(txt_line_count)
-
+
# Compare the two types - the type with the larger number of instances wins
# in cases where only one or the other represents the vast majority of the document neither wins
if print_percent >= block_percent:
if .15 <= print_percent <= .75:
return 'print'
elif .15 <= block_percent <= .75:
- return 'block'
+ return 'block'
- # Assume unformatted text with hardbreaks if nothing else matches
+ # Assume unformatted text with hardbreaks if nothing else matches
return 'unformatted'
-
+
# return single if hardbreaks is false
return 'single'
def detect_formatting_type(txt):
+    '''
+    Guess the markup used by txt by counting markdown and textile constructs.
+
+    returns markdown, textile or heuristic. heuristic is returned when
+    neither count is above 5; when the counts are equal textile wins.
+    '''
+    markdown_count = 0
+    textile_count = 0
+
     # Check for markdown
     # Headings
-    if len(re.findall('(?mu)^#+', txt)) >= 5:
-        return 'markdown'
-    if len(re.findall('(?mu)^=+$', txt)) >= 5:
-        return 'markdown'
-    if len(re.findall('(?mu)^-+$', txt)) >= 5:
-        return 'markdown'
+    markdown_count += len(re.findall('(?mu)^#+', txt))
+    markdown_count += len(re.findall('(?mu)^=+$', txt))
+    markdown_count += len(re.findall('(?mu)^-+$', txt))
     # Images
-    if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5:
-        return 'markdown'
+    markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt))
     # Links
-    if len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5:
-        return 'markdown'
-    # Escaped characters
-    md_escapted_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
-    for c in md_escapted_characters:
-        if txt.count('\\'+c) > 10:
+    markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt))
+
+    # Check for textile
+    # Headings
+    textile_count += len(re.findall(r'(?mu)^h[1-6]\.', txt))
+    # Block quote.
+    textile_count += len(re.findall(r'(?mu)^bq\.', txt))
+    # Images: the closing ! is required so ordinary exclamation marks are not counted
+    textile_count += len(re.findall(r'(?u)\![^\s!]+(?:\s?\([^)]+\))?\!(?::\S+)?', txt))
+    # Links
+    textile_count += len(re.findall(r'"(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
+
+    if markdown_count > 5 or textile_count > 5:
+        if markdown_count > textile_count:
             return 'markdown'
-
+        else:
+            return 'textile'
+
     return 'heuristic'