1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 """Some text manipulation utility functions.
19
20
21 :group text formatting: normalize_text, normalize_paragraph, pretty_match,\
22 unquote, colorize_ansi
23 :group text manipulation: searchall, splitstrip
24 :sort: text formatting, text manipulation
25
26 :type ANSI_STYLES: dict(str)
27 :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code
28
29 :type ANSI_COLORS: dict(str)
30 :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code
31
32 :type ANSI_PREFIX: str
33 :var ANSI_PREFIX:
34 ANSI terminal code notifying the start of an ANSI escape sequence
35
36 :type ANSI_END: str
37 :var ANSI_END:
38 ANSI terminal code notifying the end of an ANSI escape sequence
39
40 :type ANSI_RESET: str
41 :var ANSI_RESET:
42 ANSI terminal code resetting format defined by a previous ANSI escape sequence
43 """
44 __docformat__ = "restructuredtext en"
45
46 import sys
47 import re
48 import os.path as osp
49 from warnings import warn
50 from unicodedata import normalize as _uninormalize
51 try:
52 from os import linesep
53 except ImportError:
54 linesep = '\n'
55
56 from logilab.common.deprecation import deprecated
57
# Hand-written ASCII replacements for characters looked up by `unormalize`
# before it falls back to NFKD decomposition. Values may be longer than one
# character (e.g. 'AE', '(c)').
MANUAL_UNICODE_MAP = {
    u'\xa1': u'!',       # INVERTED EXCLAMATION MARK
    u'\u0142': u'l',     # LATIN SMALL LETTER L WITH STROKE
    u'\u2044': u'/',     # FRACTION SLASH
    u'\xc6': u'AE',      # LATIN CAPITAL LETTER AE
    u'\xa9': u'(c)',     # COPYRIGHT SIGN
    u'\xab': u'"',       # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'\xe6': u'ae',      # LATIN SMALL LETTER AE
    u'\xae': u'(r)',     # REGISTERED SIGN
    u'\u0153': u'oe',    # LATIN SMALL LIGATURE OE
    u'\u0152': u'OE',    # LATIN CAPITAL LIGATURE OE
    u'\xd8': u'O',       # LATIN CAPITAL LETTER O WITH STROKE
    u'\xf8': u'o',       # LATIN SMALL LETTER O WITH STROKE
    u'\xbb': u'"',       # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'\xdf': u'ss',      # LATIN SMALL LETTER SHARP S
}
74
def unormalize(ustring, ignorenonascii=None, substitute=None):
    """replace diacritical characters with their corresponding ascii characters

    Each character is first looked up in `MANUAL_UNICODE_MAP`. When it is not
    found there, it is converted to its compatibility decomposition (normal
    form KD, NFKD), which may expand it into several characters, and only the
    first character of that decomposition is kept.

    :type ustring: unicode
    :param ustring: the string to convert

    :type ignorenonascii: bool or None
    :param ignorenonascii:
      deprecated; a true value is equivalent to `substitute=''`. Passing
      anything other than None emits a DeprecationWarning

    :type substitute: str
    :param substitute: replacement character to use if decomposition fails

    :raise ValueError:
      if a character has no ascii equivalent and `substitute` is None

    :rtype: unicode
    :return: the converted string

    :see: Another project about ASCII transliterations of Unicode text
          http://pypi.python.org/pypi/Unidecode
    """
    # backward compatibility: ignorenonascii used to be a boolean flag
    if ignorenonascii is not None:
        warn("ignorenonascii is deprecated, use substitute named parameter instead",
             DeprecationWarning, stacklevel=2)
        if ignorenonascii:
            substitute = ''
    pieces = []
    for char in ustring:
        if char in MANUAL_UNICODE_MAP:
            # manual replacements may be several characters long, so they are
            # taken verbatim and never go through the ord() check below
            pieces.append(MANUAL_UNICODE_MAP[char])
            continue
        base = _uninormalize('NFKD', char)[0]
        if ord(base) > 0x7f:
            if substitute is None:
                raise ValueError("can't deal with non-ascii based characters")
            base = substitute
        pieces.append(base)
    return u''.join(pieces)
107
109 """remove optional quotes (simple or double) from the string
110
111 :type string: str or unicode
112 :param string: an optionally quoted string
113
114 :rtype: str or unicode
115 :return: the unquoted string (or the input string if it wasn't quoted)
116 """
117 if not string:
118 return string
119 if string[0] in '"\'':
120 string = string[1:]
121 if string[-1] in '"\'':
122 string = string[:-1]
123 return string
124
125
126 _BLANKLINES_RGX = re.compile('\r?\n\r?\n')
127 _NORM_SPACES_RGX = re.compile('\s+')
128
def normalize_text(text, line_len=80, indent='', rest=False):
    """wrap a text so that it fits within a maximum line size, optionally
    prefixing every line with an indentation string (which may hold a
    comment (#) or quoting (>) mark for instance). Line jumps inside a
    paragraph are normalized; blank lines separating paragraphs are kept.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true, each paragraph is wrapped with `normalize_rest_paragraph`
      instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    wrap_paragraph = normalize_rest_paragraph if rest else normalize_paragraph
    paragraphs = [wrap_paragraph(chunk, line_len, indent)
                  for chunk in _BLANKLINES_RGX.split(text)]
    # paragraphs are separated by a line holding only the indentation string
    separator = '%s%s%s' % (linesep, indent, linesep)
    return separator.join(paragraphs)
158
159
161 """normalize a text to display it with a maximum line size and
162 optionally arbitrary indentation. Line jumps are normalized. The
163 indentation string may be used top insert a comment mark for
164 instance.
165
166 :type text: str or unicode
167 :param text: the input text to normalize
168
169 :type line_len: int
170 :param line_len: expected maximum line's length, default to 80
171
172 :type indent: str or unicode
173 :param indent: optional string to use as indentation
174
175 :rtype: str or unicode
176 :return:
177 the input text normalized to fit on lines with a maximized size
178 inferior to `line_len`, and optionally prefixed by an
179 indentation string
180 """
181 text = _NORM_SPACES_RGX.sub(' ', text)
182 line_len = line_len - len(indent)
183 lines = []
184 while text:
185 aline, text = splittext(text.strip(), line_len)
186 lines.append(indent + aline)
187 return linesep.join(lines)
188
190 """normalize a ReST text to display it with a maximum line size and
191 optionally arbitrary indentation. Line jumps are normalized. The
192 indentation string may be used top insert a comment mark for
193 instance.
194
195 :type text: str or unicode
196 :param text: the input text to normalize
197
198 :type line_len: int
199 :param line_len: expected maximum line's length, default to 80
200
201 :type indent: str or unicode
202 :param indent: optional string to use as indentation
203
204 :rtype: str or unicode
205 :return:
206 the input text normalized to fit on lines with a maximized size
207 inferior to `line_len`, and optionally prefixed by an
208 indentation string
209 """
210 toreport = ''
211 lines = []
212 line_len = line_len - len(indent)
213 for line in text.splitlines():
214 line = toreport + _NORM_SPACES_RGX.sub(' ', line.strip())
215 toreport = ''
216 while len(line) > line_len:
217
218 line, toreport = splittext(line, line_len)
219 lines.append(indent + line)
220 if toreport:
221 line = toreport + ' '
222 toreport = ''
223 else:
224 line = ''
225 if line:
226 lines.append(indent + line.strip())
227 return linesep.join(lines)
228
229
def splittext(text, line_len):
    """split the given text on space according to the given max line size

    The split happens on the last space found at an index in
    [1, line_len]. When there is none, the first space at or after
    `line_len` is used, and when the text holds no such space at all it is
    returned whole.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len:
      maximum size of the first returned line; values below zero are
      treated as zero (i.e. "break at the first space")

    :rtype: tuple
    :return: a 2-uple:

      * a line <= line_len if possible
      * the rest of the text which has to be reported on another line
    """
    if len(text) <= line_len:
        return text, ''
    # A negative width (e.g. an indentation string longer than the requested
    # line length) would otherwise be used as a negative index and silently
    # drop characters from the text.
    line_len = max(line_len, 0)
    # last space that still keeps the first line within line_len
    pos = text.rfind(' ', 1, line_len + 1)
    if pos == -1:
        # no break point early enough: overflow up to the next space, if any
        pos = text.find(' ', line_len)
        if pos == -1:
            pos = len(text)
    return text[:pos], text[pos + 1:].strip()
247
248
250 """return a list of stripped string by splitting the string given as
251 argument on `sep` (',' by default). Empty string are discarded.
252
253 >>> splitstrip('a, b, c , 4,,')
254 ['a', 'b', 'c', '4']
255 >>> splitstrip('a')
256 ['a']
257 >>>
258
259 :type string: str or unicode
260 :param string: a csv line
261
262 :type sep: str or unicode
263 :param sep: field separator, default to the comma (',')
264
265 :rtype: str or unicode
266 :return: the unquoted string (or the input string if it wasn't quoted)
267 """
268 return [word.strip() for word in string.split(sep) if word.strip()]
269
# Legacy name for `splitstrip`, wrapped with the `deprecated` helper imported
# from logilab.common.deprecation; the message tells callers to switch to
# `splitstrip`.
get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip)
271
272
274 """return the latest component of a string containing either an url of the
275 form <scheme>://<path> or a local file system path
276 """
277 if '://' in url_or_path:
278 return url_or_path.rstrip('/').rsplit('/', 1)
279 return osp.split(url_or_path.rstrip(osp.sep))
280
281
def text_to_dict(text):
    """parse multilines text containing simple 'key=value' lines and return a
    dict of {'key': 'value'}. When the same key is encountered multiple time,
    value is turned into a list containing all values.

    Blank lines and lines starting with '#' are ignored. Keys and values are
    stripped of surrounding whitespace, and only the first '=' of a line is
    used as separator.

    >>> text_to_dict('''multiple=1
    ... multiple= 2
    ... single =3
    ... ''') == {'single': '3', 'multiple': ['1', '2']}
    True

    :type text: str or unicode or None
    :param text: the text to parse; an empty or None value gives an empty dict

    :raise ValueError:
      if a non-blank, non-comment line does not contain a '=' separator

    :rtype: dict
    :return: mapping of each key to its value, or to the list of its values
    """
    res = {}
    if not text:
        return res
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        key, sep, value = line.partition('=')
        if not sep:
            # report the offending line instead of failing on tuple unpacking
            raise ValueError("invalid line %r: expected 'key=value'" % line)
        key, value = key.strip(), value.strip()
        if key not in res:
            res[key] = value
        elif isinstance(res[key], list):
            res[key].append(value)
        else:
            # second occurrence of the key: switch from a scalar to a list
            res[key] = [res[key], value]
    return res
309
310
# Separator characters between value/unit items: whitespace and commas.
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
# A numeric literal: optional minus sign, then either digits, a dot and
# optional decimals, or decimal digits optionally prefixed by '0' or '0x'.
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
# A unit name: one or more ASCII letters.
__UNITS_URE = r'[a-zA-Z]+'
# One item: a literal captured as "value", optionally followed by a name
# captured as "unit".
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE, __UNITS_URE))
# A whole string: any number of value+unit pairs, then an optional trailing
# bare value.
_VALIDATION_RE = re.compile(r'^((%s)(%s))*(%s)?$' % (__VALUE_URE, __UNITS_URE,
                                                    __VALUE_URE))
318
# Number of bytes per unit suffix; each step up the scale multiplies the
# previous one by 1024.
BYTE_UNITS = dict(
    (suffix, 1024 ** power)
    for power, suffix in enumerate(("b", "kb", "mb", "gb", "tb"))
)
326
# Number of seconds per unit suffix.
TIME_UNITS = {
    "ms": 0.001,  # a millisecond is one thousandth of a second
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 * 24,
}
334
337 """Parse the string applying the units defined in units
338 (e.g.: "1.5m",{'m',60} -> 80).
339
340 :type string: str or unicode
341 :param string: the string to parse
342
343 :type units: dict (or any object with __getitem__ using basestring key)
344 :param units: a dict mapping a unit string repr to its value
345
346 :type inter: type
347 :param inter: used to parse every intermediate value (need __sum__)
348
349 :type blank_reg: regexp
350 :param blank_reg: should match every blank char to ignore.
351
352 :type value_reg: regexp with "value" and optional "unit" group
353 :param value_reg: match a value and it's unit into the
354 """
355 if inter is None:
356 inter = final
357 fstring = _BLANK_RE.sub('', string)
358 if not (fstring and _VALIDATION_RE.match(fstring)):
359 raise ValueError("Invalid unit string: %r." % string)
360 values = []
361 for match in value_reg.finditer(fstring):
362 dic = match.groupdict()
363 lit, unit = dic["value"], dic.get("unit")
364 value = inter(lit)
365 if unit is not None:
366 try:
367 value *= units[unit.lower()]
368 except KeyError:
369 raise KeyError('invalid unit %s. valid units are %s' %
370 (unit, units.keys()))
371 values.append(value)
372 return final(sum(values))
373
374
375 _LINE_RGX = re.compile('\r\n|\r+|\n')
376
378 """return a string with the match location underlined:
379
380 >>> import re
381 >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
382 il mange du bacon
383 ^^^^^
384 >>>
385
386 :type match: _sre.SRE_match
387 :param match: object returned by re.match, re.search or re.finditer
388
389 :type string: str or unicode
390 :param string:
391 the string on which the regular expression has been applied to
392 obtain the `match` object
393
394 :type underline_char: str or unicode
395 :param underline_char:
396 character to use to underline the matched section, default to the
397 carret '^'
398
399 :rtype: str or unicode
400 :return:
401 the original string with an inserted line to underline the match
402 location
403 """
404 start = match.start()
405 end = match.end()
406 string = _LINE_RGX.sub(linesep, string)
407 start_line_pos = string.rfind(linesep, 0, start)
408 if start_line_pos == -1:
409 start_line_pos = 0
410 result = []
411 else:
412 result = [string[:start_line_pos]]
413 start_line_pos += len(linesep)
414 offset = start - start_line_pos
415 underline = ' ' * offset + underline_char * (end - start)
416 end_line_pos = string.find(linesep, end)
417 if end_line_pos == -1:
418 string = string[start_line_pos:]
419 result.append(string)
420 result.append(underline)
421 else:
422 end = string[end_line_pos + len(linesep):]
423 string = string[start_line_pos:end_line_pos]
424 result.append(string)
425 result.append(underline)
426 result.append(end)
427 return linesep.join(result).rstrip()
428
429
430
431
# Start of an ANSI escape sequence: ESC (octal 033) followed by '['.
ANSI_PREFIX = '\033['
# Character closing an ANSI escape sequence.
ANSI_END = 'm'
# Complete sequence (prefix, code "0", end) resetting any previous formatting.
ANSI_RESET = '\033[0m'
# Style identifier -> ANSI terminal code.
ANSI_STYLES = {
    'reset': "0",
    'bold': "1",
    'italic': "3",
    'underline': "4",
    'blink': "5",
    'inverse': "7",
    'strike': "9",
}
# Color identifier -> ANSI terminal code ('reset' shares code "0" with
# ANSI_STYLES['reset']).
ANSI_COLORS = {
    'reset': "0",
    'black': "30",
    'red': "31",
    'green': "32",
    'yellow': "33",
    'blue': "34",
    'magenta': "35",
    'cyan': "36",
    'white': "37",
}
455
457 """return ansi escape code corresponding to color and style
458
459 :type color: str or None
460 :param color:
461 the color name (see `ANSI_COLORS` for available values)
462 or the color number when 256 colors are available
463
464 :type style: str or None
465 :param style:
466 style string (see `ANSI_COLORS` for available values). To get
467 several style effects at the same time, use a coma as separator.
468
469 :raise KeyError: if an unexistent color or style identifier is given
470
471 :rtype: str
472 :return: the built escape code
473 """
474 ansi_code = []
475 if style:
476 style_attrs = splitstrip(style)
477 for effect in style_attrs:
478 ansi_code.append(ANSI_STYLES[effect])
479 if color:
480 if color.isdigit():
481 ansi_code.extend(['38', '5'])
482 ansi_code.append(color)
483 else:
484 ansi_code.append(ANSI_COLORS[color])
485 if ansi_code:
486 return ANSI_PREFIX + ';'.join(ansi_code) + ANSI_END
487 return ''
488
490 """colorize message by wrapping it with ansi escape codes
491
492 :type msg: str or unicode
493 :param msg: the message string to colorize
494
495 :type color: str or None
496 :param color:
497 the color identifier (see `ANSI_COLORS` for available values)
498
499 :type style: str or None
500 :param style:
501 style string (see `ANSI_COLORS` for available values). To get
502 several style effects at the same time, use a coma as separator.
503
504 :raise KeyError: if an unexistent color or style identifier is given
505
506 :rtype: str or unicode
507 :return: the ansi escaped string
508 """
509
510 if color is None and style is None:
511 return msg
512 escape_code = _get_ansi_code(color, style)
513
514 if escape_code:
515 return '%s%s%s' % (escape_code, msg, ANSI_RESET)
516 return msg
517
# Color name (each one a key of ANSI_COLORS) associated with the 'separator',
# 'remove' and 'add' categories.
# NOTE(review): the code consuming this mapping is not visible in this part of
# the file -- presumably a diff colorizer; confirm before changing the keys.
DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}
519
534