-
-
Save bgusach/a967e0587d6e01e889fd1d776c5f3729 to your computer and use it in GitHub Desktop.
def multireplace(string, replacements, ignore_case=False):
    """
    Given a string and a replacement map, it returns the replaced string.

    All replacements are applied in a single pass over ``string``, so the output
    of one replacement is never fed into another one (``{"a": "b", "b": "a"}``
    swaps the two letters instead of collapsing them).

    :param str string: string to execute replacements on
    :param dict replacements: replacement dictionary {value to find: value to replace}
    :param bool ignore_case: whether the match should be case insensitive
    :rtype: str
    """
    if not replacements:
        # Edge case: an empty map would produce an empty regex that matches everywhere
        return string

    if ignore_case:
        # Keys that differ only by case describe the same needle, so collapse them
        # (the last one wins), e.g. {"HEY": "lol"} must apply to "hey", "HEY", "hEy", etc.
        replacements = {key.lower(): val for key, val in replacements.items()}
        re_mode = re.IGNORECASE
    else:
        re_mode = 0

    # Place longer ones first to keep shorter substrings from matching where the longer ones
    # should take place. For instance given the replacements {'ab': 'AB', 'abc': 'ABC'} against
    # the string 'hey abc', it should produce 'hey ABC' and not 'hey ABc'
    rep_sorted = sorted(replacements, key=len, reverse=True)
    new_values = [replacements[old] for old in rep_sorted]

    # Create a big OR regex with one capture group per (escaped) substring to replace.
    # The escaped keys contain no groups of their own, so group N belongs to rep_sorted[N - 1].
    pattern = re.compile("|".join("(%s)" % re.escape(old) for old in rep_sorted), re_mode)

    # Pick the replacement by the group that matched rather than by lower-casing the matched
    # text: re.IGNORECASE also matches characters whose str.lower() differs from the key
    # (e.g. the long s U+017F matches "s"), and a text-based lookup raised KeyError for them.
    return pattern.sub(lambda match: new_values[match.lastindex - 1], string)
Based on the proposals by @bgusach and @elidchan, I have created a version that supports basic regex replacement. The main restriction is that the expressions must not contain subgroups, and there may be some edge cases:
import re
class StringReplacer:
    """
    Callable that applies several regex replacements to a string in one pass.

    ``replacements`` maps a regular expression to the literal text that replaces
    every match of it. Longer expressions are tried first. The compiled pattern is
    built once in the constructor, so an instance can be reused on many strings.

    :param dict replacements: {regular expression: replacement text}
    :param bool ignore_case: whether the matching should be case insensitive
    """

    def __init__(self, replacements, ignore_case=False):
        patterns = sorted(replacements, key=len, reverse=True)
        self.replacements = [replacements[k] for k in patterns]
        re_mode = re.IGNORECASE if ignore_case else 0

        # Every expression is wrapped in its own capture group. Record the number of
        # each wrapping group, skipping over any groups the expression defines itself,
        # so that the matching expression can be found without scanning all groups.
        self._slot_by_group = {}
        group_number = 1
        for slot, expression in enumerate(patterns):
            self._slot_by_group[group_number] = slot
            group_number += 1 + re.compile(expression, re_mode).groups

        self.pattern = re.compile('|'.join("({})".format(p) for p in patterns), re_mode)

    def tr(self, matcher):
        """Return the replacement text for the expression that produced ``matcher``."""
        # The wrapping group is the last group to close, so ``lastindex`` identifies it
        # even when the match is an empty string (which a truthiness test would miss).
        return self.replacements[self._slot_by_group[matcher.lastindex]]

    def __call__(self, string):
        """Return ``string`` with every match replaced by its mapped text."""
        if not self.replacements:
            # An empty table compiles to an empty pattern that matches everywhere
            # without any group taking part; there is nothing to replace.
            return string
        return self.pattern.sub(self.tr, string)
Tests
# Demo: three regex rules applied case-insensitively in a single pass.
table = {
    "aaa": "[This is three a]",
    "b+": "[This is one or more b]",
    r"<\w+>": "[This is a tag]",
}
replacer = StringReplacer(table, ignore_case=True)
sample1 = "whatever bb, aaa, <star> BBB <end>"
print(replacer(sample1))
# output: whatever [This is one or more b], [This is three a], [This is a tag] [This is one or more b] [This is a tag]
The trick is to identify the matched group by its position. The lookup is not super efficient (it scans the groups linearly, so it is O(n) in the number of patterns), but it works.
index = next((index for index,value in enumerate(matcher.groups()) if value), None)
Replacement is done in one pass.
How would one apply multireplace to strings in pandas dataframe?
Based on @mnesarco's approach, I tried a functional one with support for one subgroup per expression:
import re
from typing import Dict, Union
def multireplace(table: Dict[str, str], string: str, flags: Union[int, re.RegexFlag] = 0):
    """
    Apply several regex replacements to ``string`` in a single pass.

    :param table: {regular expression: replacement text}. A replacement may refer to
        named groups of its own expression with ``\\g<name>``; every other character of
        the replacement is used literally. Groups can only be referenced by name.
    :param string: text to run the replacements on
    :param flags: ``re`` flags used for the combined expression
    :return: the text with every match replaced
    """
    if not table:
        # An empty table would build an empty regex that matches everywhere
        # without any rule to look up.
        return string

    # Wrap every expression in a uniquely named group so the matching rule can be identified.
    patterns = {f"_g{n}": pattern for n, pattern in enumerate(table)}

    def repl(match: re.Match):
        # The wrapping group is the last one to close, so ``lastgroup`` names the rule.
        repval = table[patterns[match.lastgroup]]
        # Named groups defined by the expressions themselves that took part in the match.
        captured = {
            name: text
            for name, text in match.groupdict().items()
            if text is not None and name not in patterns
        }
        if not captured:
            return repval
        # Fill in all references in one pass so that captured text is never rescanned;
        # references to unknown or unmatched groups are left untouched.
        return re.sub(
            r"\\g<(\w+)>",
            lambda ref: captured.get(ref.group(1), ref.group(0)),
            repval,
        )

    regex = "|".join(fr"(?P<{group}>{rep})" for group, rep in patterns.items())
    return re.sub(regex, repl, string, flags=flags)
Test
# Demo 1: lookaround rule redacts the text between <spam> tags; tags themselves are replaced.
table = {
    "aaa": "[This is three a]",
    "b+": "[This is one or more b]",
    r"(?<=<spam>).+(?=</spam>)": "[REDACTED]",
    r"</?\w+>": "[This is a tag]",
}
txt = multireplace(
    table,
    "whatever bb, aaa, <star> BBB <end> <tag>keep me</tag> and <spam>delete me</spam>",
    flags=re.IGNORECASE,
)
print(txt)
# output: whatever [This is one or more b], [This is three a], [This is a tag] [This is one or more b] [This is a tag] [This is a tag]keep me[This is a tag] and [This is a tag][REDACTED][This is a tag]

# Demo 2: a rule with named groups reuses the captured element text in its replacement.
table = {
    "aaa": "[This is three a]",
    "b+": "[This is one or more b]",
    r"<(?P<name>\w+)>(?P<value>.+)</(?P=name)>": r"[This is an HTML tag with text (\g<value>)]",
    r"</?\w+>": "[This is a tag]",
}
txt = multireplace(
    table,
    "whatever bb, aaa, <star> BBB <end> <tag>keep me</tag> and <spam>delete me</spam>",
    flags=re.IGNORECASE,
)
print(txt)
# output: whatever [This is one or more b], [This is three a], [This is a tag] [This is one or more b] [This is a tag] [This is an HTML tag with text (keep me)] and [This is an HTML tag with text (delete me)]
It's still O(n). I don't know how priorities are set inside the main regex: they should be based on the dictionary order, but when there is competition (e.g. r"<(?P<name>\w+)>(?P<value>.+)</(?P=name)>" versus r"(?<=<spam>).+(?=</spam>)"), the first has precedence. Also, one cannot reference a group by its order, only by name.
@elidchan `re.compile` caches to a certain extent: https://docs.python.org/3/library/re.html#re.compile
So your approach may or may not make sense depending on the scenario (and it could be that you'd have to cache your StringReplacer instances)