mirror of
git://github.com/kovidgoyal/calibre.git
synced 2026-05-02 10:44:08 +02:00
Txt output: remove more tags, ensure no spaces at beginning and end of lines
This commit is contained in:
parent
11013c2665
commit
94c5e717a1
1 changed file with 7 additions and 3 deletions
|
|
@ -68,6 +68,9 @@ def strip_html(self, html):
|
|||
for tag in ['script', 'style']:
|
||||
text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
|
||||
text = re.sub('<!--.*-->', '', text)
|
||||
text = re.sub('<\?.*?\?>', '', text)
|
||||
text = re.sub('<@.*?@>', '', text)
|
||||
text = re.sub('<%.*?%>', '', text)
|
||||
|
||||
# Headings usually indicate Chapters.
|
||||
# We are going to use a marker to insert the proper number of
|
||||
|
|
@ -107,7 +110,6 @@ def cleanup_text(self, text):
|
|||
text = text.replace(u'\xa0', ' ')
|
||||
|
||||
# Replace tabs, vertical tabs and form feeds with a single space.
|
||||
#text = re.sub('\xc2\xa0', '', text)
|
||||
text = text.replace('\t+', ' ')
|
||||
text = text.replace('\v+', ' ')
|
||||
text = text.replace('\f+', ' ')
|
||||
|
|
@ -122,8 +124,6 @@ def cleanup_text(self, text):
|
|||
|
||||
# Remove multiple spaces.
|
||||
text = re.sub('[ ]+', ' ', text)
|
||||
text = re.sub('(?imu)^[ ]+', '', text)
|
||||
text = re.sub('(?imu)[ ]+$', '', text)
|
||||
|
||||
# Remove excessive newlines.
|
||||
text = re.sub('\n[ ]+\n', '\n\n', text)
|
||||
|
|
@ -133,6 +133,10 @@ def cleanup_text(self, text):
|
|||
text = text.replace('-vzxedxy-', '\n\n\n\n\n')
|
||||
text = text.replace('-vlgzxey-', '\n\n\n')
|
||||
|
||||
# Remove spaces at the beginning and end of lines.
|
||||
text = re.sub('(?imu)^[ ]+', '', text)
|
||||
text = re.sub('(?imu)[ ]+$', '', text)
|
||||
|
||||
return text
|
||||
|
||||
def unix_newlines(self, text):
|
||||
|
|
|
|||
Loading…
Reference in a new issue