# re_parts(regex_list, text): an iterator that returns the entire text, split by which regex matched.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def re_parts(regex_list, text):
    """
    Yield the entire text as ``(index, piece)`` tuples, split by which regex
    matched each piece.

    ``index`` is the position of the matching regex in ``regex_list``, or -1
    for a piece that no regex matched. Concatenating every ``piece`` in order
    gives back ``text``.

    Matches are emitted in order of their start position. When two regexes
    match at the same start, the one listed earlier in ``regex_list`` wins.
    A match that starts inside text that has already been emitted (i.e. it
    overlaps an earlier match) is dropped.

    ``regex_list`` must contain compiled patterns (objects with a
    ``finditer`` method); ``text`` is the string they are run against.

    >>> import re
    >>> first_re = re.compile('asdf')
    >>> second_re = re.compile('an')
    >>> list(re_parts([first_re, second_re], 'This is an asdf test.'))
    [(-1, 'This is '), (1, 'an'), (-1, ' '), (0, 'asdf'), (-1, ' test.')]
    >>> list(re_parts([first_re, second_re], 'asdfasdfasdf'))
    [(0, 'asdf'), (0, 'asdf'), (0, 'asdf')]
    >>> list(re_parts([], 'This is an asdf test.'))
    [(-1, 'This is an asdf test.')]
    >>> third_re = re.compile('sdf')
    >>> list(re_parts([first_re, second_re, third_re], 'This is an asdf test.'))
    [(-1, 'This is '), (1, 'an'), (-1, ' '), (0, 'asdf'), (-1, ' test.')]
    >>> list(re_parts([re.compile('[ab]'), re.compile('d')], 'a b c d'))
    [(0, 'a'), (-1, ' '), (0, 'b'), (-1, ' c '), (1, 'd')]
    """
    import heapq

    # The heap holds at most one pending match per regex, keyed by
    # (start, regex index). That pair is unique per entry, so tuple
    # comparison never falls through to the (unorderable) match objects.
    pending = []
    for index, regex in enumerate(regex_list):
        iterator = regex.finditer(text)
        match = next(iterator, None)
        if match is not None:
            pending.append((match.start(), index, match, iterator))
    heapq.heapify(pending)

    prev_end = 0
    while pending:
        start, index, match, iterator = heapq.heappop(pending)

        # Refill from the same regex right away so the heap always knows the
        # earliest outstanding match of every regex that still has one.
        following = next(iterator, None)
        if following is not None:
            heapq.heappush(
                pending, (following.start(), index, following, iterator))

        if start < prev_end:
            # Overlaps text that was already emitted: drop it, and leave
            # prev_end alone so the cursor never moves backwards.
            continue
        if start > prev_end:
            yield (-1, text[prev_end:start])
        end = match.end()
        yield (index, text[start:end])
        prev_end = end

    last_bit = text[prev_end:]
    if last_bit:
        yield (-1, last_bit)