Skip to content

Instantly share code, notes, and snippets.

@Makman2
Last active August 29, 2015 14:16
Show Gist options
  • Save Makman2/28a71dd80b8ad29d282d to your computer and use it in GitHub Desktop.
Save Makman2/28a71dd80b8ad29d282d to your computer and use it in GitHub Desktop.
regex_proto.py
import re
# Just a forwarding method for re.split(), that splits unescaped.
# NOTE: If max_split is a number above 0, unescaped_split() appends the
# remaining unprocessed string to the result list!
def unescaped_split(pattern,
                    string,
                    max_split=0,
                    remove_empty_matches=False):
    """Split ``string`` at ``pattern`` without any escape handling.

    Thin wrapper around ``re.split()``.

    :param pattern: Regular expression that marks the separators.
    :param string: The text to split.
    :param max_split: Maximum number of splits; ``0`` splits everywhere.
        When the limit is hit, the unprocessed remainder of ``string`` is
        the last element of the result.
    :param remove_empty_matches: If true, falsy items (empty strings and the
        ``None`` entries ``re.split()`` emits for capturing groups that did
        not take part in a match) are dropped from the result.
    :return: List of the split pieces.
    """
    # maxsplit is passed by keyword: handing it over positionally is
    # deprecated since Python 3.13.
    pieces = re.split(pattern, string, maxsplit=max_split)

    if remove_empty_matches:
        pieces = [piece for piece in pieces if piece]

    return pieces
# Splits the given <string> by <pattern> <maxsplit> times. Setting <maxsplit>
# to 0 tries to find all splits. Properly escapes escaped <pattern>s.
# Matches also empty strings.
# NOTE: If max_split is a number above 0, escaped_split() appends the remaining
# unprocessed string to the result list!
def escaped_split(pattern, string, max_split=0, remove_empty_matches=False):
    """Split ``string`` at every unescaped occurrence of ``pattern``.

    A separator counts as escaped when it is preceded by an odd number of
    backslashes; such separators stay inside the surrounding piece. Escape
    sequences are not removed from the returned pieces.

    :param pattern: Regular expression that marks the separators. It is
        wrapped in a non-capturing group, so a top-level alternation such as
        ``a|b`` is treated as one separator expression.
    :param string: The text to split.
    :param max_split: Maximum number of splits; ``0`` splits everywhere.
        The unprocessed remainder of ``string`` is always appended as the
        last piece.
    :param remove_empty_matches: If true, empty pieces are left out.
    :return: List of the split pieces.
    """
    # Splitting directly is not possible: the separator must only match if
    # the number of backslashes in front of it is even. Lookbehind assertions
    # don't support quantifiers, so the run of escaped backslashes has to be
    # consumed by the regex. It is captured in group 2 so that it can be
    # glued back onto the text of group 1 afterwards.
    separator_regex = r"(.*?)(?<!\\)((?:\\\\)*)(?:" + pattern + ")"

    pieces = []
    last_pos = 0
    # re.DOTALL: pieces may span newlines.
    for match in search_for(separator_regex, string, max_split, re.DOTALL):
        # Groups 1 and 2 precede the user pattern, so capturing groups inside
        # the pattern cannot shift their indices. Both always take part in a
        # successful match.
        piece = match.group(1) + match.group(2)
        # Judge emptiness on the complete piece; a piece that consists only
        # of escaped backslashes is not empty.
        if piece or not remove_empty_matches:
            pieces.append(piece)
        last_pos = match.end()

    # Only text followed by a separator was captured above, so the rest of
    # the string still has to be added.
    rest = string[last_pos:]
    if rest or not remove_empty_matches:
        pieces.append(rest)

    return pieces
# Searches for the expression between <begin> and <end> in <string>.
# Matches <max_matches> times. If 0 is specified, matches all that can be found
# in <string>.
# Matches also empty strings.
# If <max_matches> is less than the maximum possible number of matches, the
# remaining string is not appended to the result.
def search_in_between(begin, end, string, max_matches=0):
    """Return the texts found between ``begin`` and ``end`` in ``string``.

    No escape handling is done. Empty texts are matched as well. If fewer
    matches are requested than exist, the remaining string is not appended
    to the result.

    :param begin: Regular expression marking the start of a match.
    :param end: Regular expression marking the end of a match.
    :param string: The text to search in.
    :param max_matches: Maximum number of matches; ``0`` finds all.
    :return: List with the text between each ``begin``/``end`` pair.
    """
    # The begin sequence is compiled on its own to count its capturing
    # groups: the content group is the first group after them.
    content_group = re.compile(begin).groups + 1

    # Non-capturing wrappers keep alternations inside begin/end from leaking
    # into the surrounding expression and don't change the group numbering.
    regex = "(?:" + begin + ")(.*?)(?:" + end + ")"

    # re.DOTALL: the text in between may span newlines.
    return [match.group(content_group)
            for match in search_for(regex, string, max_matches, re.DOTALL)]
# Searches for the expression between <begin> and <end> in <string>.
# Matches <max_matches> times. If 0 is specified, matches all that can be found
# in <string>.
# Matches also empty strings.
# If <max_matches> is less than the maximum possible number of matches, the
# remaining string is not appended to the result.
# Escapes escaped <begin> or <end> sequences.
def escaped_search_in_between(begin, end, string, max_matches=0):
    """Return the texts between unescaped ``begin`` and ``end`` sequences.

    A ``begin`` or ``end`` preceded by an odd number of backslashes counts as
    escaped and is not treated as a delimiter. Empty texts are matched as
    well. If fewer matches are requested than exist, the remaining string is
    not appended to the result. Escape sequences are not removed from the
    returned texts.

    :param begin: Regular expression marking the start of a match.
    :param end: Regular expression marking the end of a match.
    :param string: The text to search in.
    :param max_matches: Maximum number of matches; ``0`` finds all.
    :return: List with the text between each ``begin``/``end`` pair.
    """
    # The begin sequence is compiled on its own to count its capturing
    # groups: the content group is the first group after them, followed by
    # the group holding the escaped backslashes in front of ``end``.
    content_group = re.compile(begin).groups + 1
    escapes_group = content_group + 1

    # Non-capturing wrappers keep alternations inside begin/end from leaking
    # into the surrounding expression and don't change the group numbering.
    regex = (r"(?<!\\)(?:\\\\)*(?:" + begin + ")" +
             r"(.*?)(?<!\\)((?:\\\\)*)(?:" + end + ")")

    # The escaped backslashes in front of ``end`` are consumed by the regex
    # and belong to the content, so they are appended again. That group
    # always takes part in a successful match (it may be empty).
    # re.DOTALL: the text in between may span newlines.
    return [match.group(content_group) + match.group(escapes_group)
            for match in search_for(regex, string, max_matches, re.DOTALL)]
# Searches for a specified <pattern> in <string> and matches <max_matches>
# times. Optionally provide <flags>, which are passed to the python3 regex
# functions. Returns match objects to provide full access to found matches.
def search_for(pattern, string, max_matches=0, flags=0):
    """Search ``string`` for ``pattern`` and return the match objects.

    :param pattern: Regular expression to search for.
    :param string: The text to search in.
    :param max_matches: Maximum number of matches to return; ``0`` returns
        all matches.
    :param flags: Flags handed to the ``re`` functions.
    :return: An iterator of match objects if ``max_matches`` is ``0``, a list
        of at most ``max_matches`` match objects if it is positive, and
        ``None`` if it is negative (invalid).
    """
    if max_matches == 0:
        # Plain re.finditer() finds all matches.
        return re.finditer(pattern, string, flags)

    if max_matches < 0:
        # Invalid.
        return None

    # Take the first max_matches items of finditer() instead of restarting
    # search() at the end of the previous match: restarting would return the
    # same zero-length match again and again. zip() asks range() first, so
    # the scan stops right after the last wanted match.
    found = re.compile(pattern, flags).finditer(string)
    return [match for _, match in zip(range(max_matches), found)]
# Tests the split() functions with special strings to test every eventuality.
def test_splits():
    """Check unescaped_split() and escaped_split() against fixed samples.

    Every sample string is split at ``'`` with both functions and the whole
    result list is compared with the expected list, so missing as well as
    surplus items are reported. Failures are printed with print_error_msg().

    :return: ``True`` if every result matched, ``False`` otherwise.
    """
    separator = "'"

    test_strings = [
        r"out1 'escaped-escape: \\ ' out2",
        r"out1 'escaped-quote: \' ' out2",
        r"out1 'escaped-anything: \X ' out2",
        r"out1 'two escaped escapes: \\\\ ' out2",
        r"out1 'escaped-quote at end: \'' out2",
        r"out1 'escaped-escape at end: \\' out2",
        r"out1 'str1' out2 'str2' out2",
        r"out1 \' 'str1' out2 'str2' out2",
        r"out1 \\\' 'str1' out2 'str2' out2",
        r"out1 \\ 'str1' out2 'str2' out2",
        r"out1 \\\\ 'str1' out2 'str2' out2",
        r"out1 \\'str1' out2 'str2' out2",
        r"out1 \\\\'str1' out2 'str2' out2",
        r"out1 'str1''str2''str3' out2"
    ]

    # A raw string cannot end with a backslash right before the closing
    # quote, so the backslash is kept in a variable.
    bs = "\\"

    # Expected results for unescaped_split().
    unescaped_split_results = [
        [r"out1 ", r"escaped-escape: \\ ", r" out2"],
        [r"out1 ", r"escaped-quote: " + bs, r" ", r" out2"],
        [r"out1 ", r"escaped-anything: \X ", r" out2"],
        [r"out1 ", r"two escaped escapes: \\\\ ", r" out2"],
        [r"out1 ", r"escaped-quote at end: " + bs, r"", r" out2"],
        [r"out1 ", r"escaped-escape at end: " + 2 * bs, r" out2"],
        [r"out1 ", r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 " + bs, r" ", r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 " + 3 * bs, r" ", r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 \\ ", r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 \\\\ ", r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 " + 2 * bs, r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 " + 4 * bs, r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 ", r"str1", r"", r"str2", r"", r"str3", r" out2"]
    ]

    # Expected results for escaped_split().
    escaped_split_results = [
        [r"out1 ", r"escaped-escape: \\ ", r" out2"],
        [r"out1 ", r"escaped-quote: \' ", r" out2"],
        [r"out1 ", r"escaped-anything: \X ", r" out2"],
        [r"out1 ", r"two escaped escapes: \\\\ ", r" out2"],
        [r"out1 ", r"escaped-quote at end: \'", r" out2"],
        [r"out1 ", r"escaped-escape at end: " + 2 * bs, r" out2"],
        [r"out1 ", r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 \' ", r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 \\\' ", r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 \\ ", r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 \\\\ ", r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 " + 2 * bs, r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 " + 4 * bs, r"str1", r" out2 ", r"str2", r" out2"],
        [r"out1 ", r"str1", r"", r"str2", r"", r"str3", r" out2"]
    ]

    successful = True
    cases = zip(test_strings, unescaped_split_results, escaped_split_results)
    for string_nr, case in enumerate(cases, 1):
        test_string, unescaped_expected, escaped_expected = case

        # Test unescaped function. Whole lists are compared so that a result
        # with too few or too many items fails as well.
        unescaped_real_result = unescaped_split(separator, test_string)
        if unescaped_real_result != unescaped_expected:
            print_error_msg("unescaped_split()", string_nr,
                            unescaped_real_result, unescaped_expected)
            successful = False

        # Test escaped function.
        escaped_real_result = escaped_split(separator, test_string)
        if escaped_real_result != escaped_expected:
            print_error_msg("escaped_split()", string_nr,
                            escaped_real_result, escaped_expected)
            successful = False

    # Print summary
    if successful:
        print("Test successful.")
    else:
        print("Test exited unsuccessful.")

    return successful
# Tests the split functions while modifying the max_split parameter.
def test_splits2():
    """Check the split functions with max_split set to 1, 2 and 10.

    Every sample string is split at ``'`` with unescaped_split() and
    escaped_split() for each max_split value. The whole result list is
    compared with the expected list, so missing as well as surplus items are
    reported. Failures are printed with print_error_msg().

    :return: ``True`` if every result matched, ``False`` otherwise.
    """
    separator = "'"

    test_strings = [
        r"out1 'escaped-escape: \\ ' out2",
        r"out1 'escaped-quote: \' ' out2",
        r"out1 'escaped-anything: \X ' out2",
        r"out1 'two escaped escapes: \\\\ ' out2",
        r"out1 'escaped-quote at end: \'' out2",
        r"out1 'escaped-escape at end: \\' out2",
        r"out1 'str1' out2 'str2' out2",
        r"out1 \' 'str1' out2 'str2' out2",
        r"out1 \\\' 'str1' out2 'str2' out2",
        r"out1 \\ 'str1' out2 'str2' out2",
        r"out1 \\\\ 'str1' out2 'str2' out2",
        r"out1 \\'str1' out2 'str2' out2",
        r"out1 \\\\'str1' out2 'str2' out2",
        r"out1 'str1''str2''str3' out2"
    ]

    # A raw string cannot end with a backslash right before the closing
    # quote, so the backslash is kept in a variable.
    bs = "\\"

    # The max_split values under test; the result tables below hold one
    # sub-table per value, in this order.
    max_splits = [1, 2, 10]

    # Expected results for unescaped_split() when setting max_split to
    # 1, 2, and 10.
    unescaped_split_results = [
        [
            [r"out1 ", r"escaped-escape: \\ ' out2"],
            [r"out1 ", r"escaped-quote: \' ' out2"],
            [r"out1 ", r"escaped-anything: \X ' out2"],
            [r"out1 ", r"two escaped escapes: \\\\ ' out2"],
            [r"out1 ", r"escaped-quote at end: \'' out2"],
            [r"out1 ", r"escaped-escape at end: \\' out2"],
            [r"out1 ", r"str1' out2 'str2' out2"],
            [r"out1 " + bs, r" 'str1' out2 'str2' out2"],
            [r"out1 " + 3 * bs, r" 'str1' out2 'str2' out2"],
            [r"out1 \\ ", r"str1' out2 'str2' out2"],
            [r"out1 \\\\ ", r"str1' out2 'str2' out2"],
            [r"out1 " + 2 * bs, r"str1' out2 'str2' out2"],
            [r"out1 " + 4 * bs, r"str1' out2 'str2' out2"],
            [r"out1 ", r"str1''str2''str3' out2"]
        ], [
            [r"out1 ", r"escaped-escape: \\ ", r" out2"],
            [r"out1 ", r"escaped-quote: " + bs, r" ' out2"],
            [r"out1 ", r"escaped-anything: \X ", r" out2"],
            [r"out1 ", r"two escaped escapes: \\\\ ", r" out2"],
            [r"out1 ", r"escaped-quote at end: " + bs, r"' out2"],
            [r"out1 ", r"escaped-escape at end: " + 2 * bs, r" out2"],
            [r"out1 ", r"str1", r" out2 'str2' out2"],
            [r"out1 " + bs, r" ", r"str1' out2 'str2' out2"],
            [r"out1 " + 3 * bs, r" ", r"str1' out2 'str2' out2"],
            [r"out1 \\ ", r"str1", r" out2 'str2' out2"],
            [r"out1 \\\\ ", r"str1", r" out2 'str2' out2"],
            [r"out1 " + 2 * bs, r"str1", r" out2 'str2' out2"],
            [r"out1 " + 4 * bs, r"str1", r" out2 'str2' out2"],
            [r"out1 ", r"str1", r"'str2''str3' out2"]
        ], [
            [r"out1 ", r"escaped-escape: \\ ", r" out2"],
            [r"out1 ", r"escaped-quote: " + bs, r" ", r" out2"],
            [r"out1 ", r"escaped-anything: \X ", r" out2"],
            [r"out1 ", r"two escaped escapes: \\\\ ", r" out2"],
            [r"out1 ", r"escaped-quote at end: " + bs, r"", r" out2"],
            [r"out1 ", r"escaped-escape at end: " + 2 * bs, r" out2"],
            [r"out1 ", r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 " + bs, r" ", r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 " + 3 * bs, r" ", r"str1", r" out2 ", r"str2",
             r" out2"],
            [r"out1 \\ ", r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 \\\\ ", r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 " + 2 * bs, r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 " + 4 * bs, r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 ", r"str1", r"", r"str2", r"", r"str3", r" out2"]
        ]
    ]

    # Expected results for escaped_split() when setting max_split to
    # 1, 2, and 10.
    escaped_split_results = [
        [
            [r"out1 ", r"escaped-escape: \\ ' out2"],
            [r"out1 ", r"escaped-quote: \' ' out2"],
            [r"out1 ", r"escaped-anything: \X ' out2"],
            [r"out1 ", r"two escaped escapes: \\\\ ' out2"],
            [r"out1 ", r"escaped-quote at end: \'' out2"],
            [r"out1 ", r"escaped-escape at end: \\' out2"],
            [r"out1 ", r"str1' out2 'str2' out2"],
            [r"out1 \' ", r"str1' out2 'str2' out2"],
            [r"out1 \\\' ", r"str1' out2 'str2' out2"],
            [r"out1 \\ ", r"str1' out2 'str2' out2"],
            [r"out1 \\\\ ", r"str1' out2 'str2' out2"],
            [r"out1 " + 2 * bs, r"str1' out2 'str2' out2"],
            [r"out1 " + 4 * bs, r"str1' out2 'str2' out2"],
            [r"out1 ", r"str1''str2''str3' out2"]
        ], [
            [r"out1 ", r"escaped-escape: \\ ", r" out2"],
            [r"out1 ", r"escaped-quote: \' ", r" out2"],
            [r"out1 ", r"escaped-anything: \X ", r" out2"],
            [r"out1 ", r"two escaped escapes: \\\\ ", r" out2"],
            [r"out1 ", r"escaped-quote at end: \'", r" out2"],
            [r"out1 ", r"escaped-escape at end: " + 2 * bs, r" out2"],
            [r"out1 ", r"str1", r" out2 'str2' out2"],
            [r"out1 \' ", r"str1", r" out2 'str2' out2"],
            [r"out1 \\\' ", r"str1", r" out2 'str2' out2"],
            [r"out1 \\ ", r"str1", r" out2 'str2' out2"],
            [r"out1 \\\\ ", r"str1", r" out2 'str2' out2"],
            [r"out1 " + 2 * bs, r"str1", r" out2 'str2' out2"],
            [r"out1 " + 4 * bs, r"str1", r" out2 'str2' out2"],
            [r"out1 ", r"str1", r"'str2''str3' out2"]
        ], [
            [r"out1 ", r"escaped-escape: \\ ", r" out2"],
            [r"out1 ", r"escaped-quote: \' ", r" out2"],
            [r"out1 ", r"escaped-anything: \X ", r" out2"],
            [r"out1 ", r"two escaped escapes: \\\\ ", r" out2"],
            [r"out1 ", r"escaped-quote at end: \'", r" out2"],
            [r"out1 ", r"escaped-escape at end: " + 2 * bs, r" out2"],
            [r"out1 ", r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 \' ", r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 \\\' ", r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 \\ ", r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 \\\\ ", r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 " + 2 * bs, r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 " + 4 * bs, r"str1", r" out2 ", r"str2", r" out2"],
            [r"out1 ", r"str1", r"", r"str2", r"", r"str3", r" out2"]
        ]
    ]

    # Per sample string the unescaped function is tested first (all
    # max_split values), then the escaped one.
    candidates = (
        (unescaped_split, "unescaped_split", unescaped_split_results),
        (escaped_split, "escaped_split", escaped_split_results),
    )

    successful = True
    for index, test_string in enumerate(test_strings):
        for split_function, func_name, result_tables in candidates:
            for max_split, result_table in zip(max_splits, result_tables):
                expected = result_table[index]
                real_result = split_function(separator,
                                             test_string,
                                             max_split)
                # Whole lists are compared so that a result with too few or
                # too many items fails as well.
                if real_result != expected:
                    print_error_msg(func_name + "(max_split=" +
                                    str(max_split) + ")",
                                    index + 1,
                                    real_result,
                                    expected)
                    successful = False

    # Print summary
    if successful:
        print("Test successful.")
    else:
        print("Test exited unsuccessful.")

    return successful
# - Use multi-char-patterns
def test_splits3():
    """Placeholder for split() tests with multi-char patterns; does nothing."""
    return None
# - Use regex patterns for split
def test_splits4():
    """Placeholder for split() tests with regex patterns; does nothing."""
    return None
# - remove_empty_matches
def test_splits5():
    """Placeholder for split() tests of remove_empty_matches; does nothing."""
    return None
# Tests search_in_between() functions with special strings to test every
# eventuality.
def test_search_in_between():
    """Check search_in_between() and escaped_search_in_between().

    Every sample string is searched with ``'`` as both the begin and the end
    sequence. The whole result list is compared with the expected list, so
    missing as well as surplus items are reported. Failures are printed with
    print_error_msg().

    :return: ``True`` if every result matched, ``False`` otherwise.
    """
    # Use ' as beginning and also the end sequence.
    separator = "'"

    test_strings = [
        r"out1 'escaped-escape: \\ ' out2",
        r"out1 'escaped-quote: \' ' out2",
        r"out1 'escaped-anything: \X ' out2",
        r"out1 'two escaped escapes: \\\\ ' out2",
        r"out1 'escaped-quote at end: \'' out2",
        r"out1 'escaped-escape at end: \\' out2",
        r"out1 'str1' out2 'str2' out2",
        r"out1 \' 'str1' out2 'str2' out2",
        r"out1 \\\' 'str1' out2 'str2' out2",
        r"out1 \\ 'str1' out2 'str2' out2",
        r"out1 \\\\ 'str1' out2 'str2' out2",
        r"out1 \\'str1' out2 'str2' out2",
        r"out1 \\\\'str1' out2 'str2' out2",
        r"out1 'str1''str2''str3' out2"
    ]

    # A raw string cannot end with a backslash right before the closing
    # quote, so the backslash is kept in a variable.
    bs = "\\"

    # Expected results for search_in_between().
    unescaped_search_results = [
        [r"escaped-escape: \\ "],
        [r"escaped-quote: " + bs],
        [r"escaped-anything: \X "],
        [r"two escaped escapes: \\\\ "],
        [r"escaped-quote at end: " + bs],
        [r"escaped-escape at end: " + 2 * bs],
        [r"str1", r"str2"],
        [r" ", r" out2 "],
        [r" ", r" out2 "],
        [r"str1", r"str2"],
        [r"str1", r"str2"],
        [r"str1", r"str2"],
        [r"str1", r"str2"],
        [r"str1", r"str2", r"str3"]
    ]

    # Expected results for escaped_search_in_between().
    escaped_search_results = [
        [r"escaped-escape: \\ "],
        [r"escaped-quote: \' "],
        [r"escaped-anything: \X "],
        [r"two escaped escapes: \\\\ "],
        [r"escaped-quote at end: \'"],
        [r"escaped-escape at end: " + 2 * bs],
        [r"str1", r"str2"],
        [r"str1", r"str2"],
        [r"str1", r"str2"],
        [r"str1", r"str2"],
        [r"str1", r"str2"],
        [r"str1", r"str2"],
        [r"str1", r"str2"],
        [r"str1", r"str2", r"str3"]
    ]

    successful = True
    cases = zip(test_strings, unescaped_search_results, escaped_search_results)
    for string_nr, case in enumerate(cases, 1):
        test_string, unescaped_expected, escaped_expected = case

        # Test unescaped function. Whole lists are compared so that a result
        # with too few or too many items fails as well.
        unescaped_real_result = search_in_between(separator,
                                                  separator,
                                                  test_string)
        if unescaped_real_result != unescaped_expected:
            print_error_msg("search_in_between()", string_nr,
                            unescaped_real_result, unescaped_expected)
            successful = False

        # Test escaped function.
        escaped_real_result = escaped_search_in_between(separator,
                                                        separator,
                                                        test_string)
        if escaped_real_result != escaped_expected:
            print_error_msg("escaped_search_in_between()", string_nr,
                            escaped_real_result, escaped_expected)
            successful = False

    # Print summary
    if successful:
        print("Test successful.")
    else:
        print("Test exited unsuccessful.")

    return successful
def run_all_tests():
    """Announce and run every test routine of this module, one by one.

    The return values of the test routines are not evaluated.
    """
    # (announcement, routine) pairs in execution order.
    stages = (
        ("Run test for split() functions...",
         test_splits),
        ("Run test for split() functions using max_split...",
         test_splits2),
        ("Run test for split() using multi-char matching pattern...",
         test_splits3),
        ("Run test for split() functions using a regex as pattern...",
         test_splits4),
        ("Run test for split() functions using remove_empty_matches...",
         test_splits5),
        ("Run test for search_in_between() functions...",
         test_search_in_between),
    )

    for announcement, routine in stages:
        print(announcement)
        routine()
def print_error_msg(func_name, string_nr, result, expected):
    """Print a failure report for one test string to stdout.

    :param func_name: Name of the tested function (a string).
    :param string_nr: Number of the test string that failed.
    :param result: The value the tested function returned.
    :param expected: The value that was expected instead.
    """
    # The headline is assembled by a single print() call; sep="" keeps the
    # parts glued together without extra blanks.
    headline_start = "ERROR: " + func_name + " failed for "
    print(headline_start, string_nr, ". string.", sep="")

    for caption, value in (("Result was:", result), ("Expected:", expected)):
        print(caption)
        print(value)

    print("----------------------------------")
# Unescaping methods would use regexes from escaped_split.
# There's an interesting function in the 're' module in python3: re.escape().
# It doesn't unescape but does the opposite, maybe this is helpful too.
# Question:
# - Rename search_in_between into unescaped_search_in_between?
# What's coming next:
# - Fill the empty test functions for split() (test_splits3() until
# test_splits4()).
# - Also add such tests for search_in_between().
# regex_proto v0.2.6 "Even more tests..."
# Add test for split() functions while modifying the 'max_split' parameter
# (with 1, 2 and 10).
# Also move error printing to a separate function.
# (unofficial) Release history :P
# regex_proto v0.1 "broken"
# regex_proto v0.2 "Hope"
# regex_proto v0.2.1 "More than Hope"
# regex_proto v0.2.2 "7-10 Split"
# regex_proto v0.2.3 "What lies beneath..."
# regex_proto v0.2.4 "The first testament"
# regex_proto v0.2.5 "The second testament"
# regex_proto v0.2.6 "Even more tests..."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment