Created
November 21, 2019 04:14
-
-
Save huynhducduy/696a090cac38ddac3c1d223f9b1d6eeb to your computer and use it in GitHub Desktop.
REGEX: Get start, stop of capture groups
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def addGroupToRegexString(str, start, end, groupsAdded):
    """Wrap ``str[start:end + 1]`` in a new pair of parentheses.

    ``start`` and ``end`` are inclusive offsets expressed in the coordinates
    of the *original* pattern.  Every group inserted so far added two
    characters (``(`` and ``)``) to the working pattern, so both offsets are
    shifted right by ``groupsAdded * 2`` before slicing.

    Args:
        str: The working pattern text.  The name shadows the builtin; it is
            kept so existing positional and keyword callers continue to work.
        start: Inclusive index of the first character to wrap.
        end: Inclusive index of the last character to wrap.
        groupsAdded: Number of groups already inserted into ``str``.

    Returns:
        A new string with ``(`` inserted before the span and ``)`` after it.
    """
    shift = groupsAdded * 2
    first = start + shift
    stop = end + shift + 1  # exclusive slice bound for the inclusive `end`
    return str[:first] + '(' + str[first:stop] + ')' + str[stop:]
def fillGroups(regex):
    """Rewrite a pattern so every stretch of it lies inside a capture group.

    The pattern is scanned for parentheses.  Text that is not already inside
    a capturing group (leading text, text between groups, text after a nested
    group) is wrapped in an extra group via ``addGroupToRegexString``.  With
    every character captured, the offset of an original group can later be
    computed by summing the lengths of the groups that precede it.

    Args:
        regex: A compiled ``re.Pattern`` (its pattern and flags are reused) or
            any object whose ``str()`` is the pattern text.

    Returns:
        A three-element list ``[compiled, groupIndexMapper,
        previousGroupsForGroup]`` where

        * ``compiled`` is the rewritten pattern, compiled with the flags of
          the input (``re.UNICODE`` for non-compiled input);
        * ``groupIndexMapper`` maps each original group number to its number
          in the rewritten pattern;
        * ``previousGroupsForGroup`` maps each original group number to the
          list of rewritten-pattern group numbers whose matched lengths add
          up to that group's offset from the start of the match.

    NOTE(review): every ``(?`` opener -- including ``(?P<name>...)`` -- is
    handled by the non-capturing branch below and does not increment the
    group counter, so patterns with named groups will be numbered differently
    from how ``re`` numbers them.
    """
    import re

    if isinstance(regex, re.Pattern):
        regexString = regex.pattern
        modifier = regex.flags
    else:
        regexString = str(regex)
        modifier = re.UNICODE  # same value (32) as the previous bare literal

    # Alternatives, in order: escaped "(", escaped ")", "(?" opener,
    # capturing "(", and ")" with an optional quantifier / lazy marker.
    tester = r"(\\\()|(\\\))|(\(\?)|(\()|(\)(?:\{\d+,?\d*}|[*+?])?\??)"

    modifiedRegex = regexString
    lastGroupStartPosition = -1
    lastGroupEndPosition = -1
    lastNonGroupStartPosition = -1
    lastNonGroupEndPosition = -1
    groupsAdded = 0
    groupCount = 0
    nonGroupPositions = []     # stack of open "(?" groups
    groupPositions = []        # stack of open capturing groups
    groupNumber = []           # rewritten numbers of the open capturing groups
    currentLengthIndexes = []  # groups whose lengths precede the cursor
    groupIndexMapper = {}
    previousGroupsForGroup = {}

    def wrapSpan(first, last):
        """Insert a helper group around original offsets first..last."""
        nonlocal modifiedRegex, groupsAdded
        modifiedRegex = addGroupToRegexString(
            modifiedRegex, first, last, groupsAdded)
        groupsAdded += 1
        currentLengthIndexes.append(groupCount + groupsAdded)

    for matchArr in re.finditer(tester, regexString):
        if matchArr.group(1) or matchArr.group(2):
            # Escaped parentheses are literals, not group delimiters.
            continue

        # Offset of the last character of the token in the original pattern.
        index = matchArr.end() - 1

        if matchArr.group(3):
            lastNonGroupStartPosition = index
            nonGroupPositions.append(index)

        elif matchArr.group(4):
            lastGroupPosition = max(
                lastGroupStartPosition, lastGroupEndPosition)

            if lastNonGroupStartPosition > lastGroupPosition:
                # A "(?" was opened after the last capturing paren.
                if lastGroupPosition < lastNonGroupEndPosition:
                    # A "(?...)" also closed in between: wrap the text before
                    # its ")" and the text between that ")" and the new "(?".
                    if (lastNonGroupEndPosition - 1) - (lastGroupPosition + 1) > 0:
                        wrapSpan(lastGroupPosition + 1,
                                 lastNonGroupEndPosition - 1)
                        lastGroupEndPosition = lastNonGroupEndPosition - 1
                    if (lastNonGroupStartPosition - 1) - (lastNonGroupEndPosition + 1) > 0:
                        wrapSpan(lastNonGroupEndPosition + 1,
                                 lastNonGroupStartPosition - 2)
                        lastGroupEndPosition = lastNonGroupStartPosition - 1
                else:
                    wrapSpan(lastGroupPosition + 1,
                             lastNonGroupStartPosition - 2)
                    lastGroupEndPosition = lastNonGroupStartPosition - 1

                # Text between the "(?" opener and this "(" needs a group too.
                if index > lastNonGroupStartPosition + 2:
                    wrapSpan(lastNonGroupStartPosition + 2, index - 1)
                    lastGroupEndPosition = index - 1
            elif lastGroupPosition < index - 1:
                # Plain text between the previous paren and this "(".
                wrapSpan(lastGroupPosition + 1, index - 1)
                lastGroupEndPosition = index - 1

            groupCount += 1
            lastGroupStartPosition = index
            groupPositions.append(index)
            groupNumber.append(groupCount + groupsAdded)
            groupIndexMapper[groupCount] = groupCount + groupsAdded
            previousGroupsForGroup[groupCount] = currentLengthIndexes[:]

        elif matchArr.group(5):
            # The ")" closes a capturing group only when one is open and it
            # was opened after the innermost open "(?" group.  Checking that
            # `groupPositions` is non-empty first avoids indexing an empty
            # list, which used to raise IndexError for inputs like "(?:a)".
            closesCapturing = bool(groupPositions) and (
                not nonGroupPositions
                or groupPositions[-1] > nonGroupPositions[-1])

            if closesCapturing:
                if lastGroupStartPosition < lastGroupEndPosition < index - 1:
                    # Trailing text after a nested group, before this ")".
                    wrapSpan(lastGroupEndPosition + 1, index - 1)
                groupPositions.pop()
                lastGroupEndPosition = index
                toPush = groupNumber.pop()
                currentLengthIndexes.append(toPush)
                # Drop groups nested inside the one just closed; its own
                # length already accounts for them.
                currentLengthIndexes = [
                    number for number in currentLengthIndexes
                    if number <= toPush
                ]
            elif nonGroupPositions:
                nonGroupPositions.pop()
                lastNonGroupEndPosition = index

    return [re.compile(modifiedRegex, flags=modifier),
            groupIndexMapper, previousGroupsForGroup]
class betterRegex:
    """Report the start/end offsets of capture groups in the first match.

    The pattern is rewritten by ``fillGroups`` so that every part of it is
    captured.  A group's start offset is the start of the whole match plus
    the lengths of the groups recorded as preceding it.

    Attributes:
        regexp: The rewritten, compiled pattern.
        groupIndexMapper: Original group number -> group number in ``regexp``.
        previousGroupsForGroup: Original group number -> list of ``regexp``
            group numbers whose lengths sum to that group's offset.
    """

    def __init__(self, baseRegExp):
        """Build the rewritten pattern and lookup tables for ``baseRegExp``."""
        (self.regexp,
         self.groupIndexMapper,
         self.previousGroupsForGroup) = fillGroups(baseRegExp)

    @staticmethod
    def _groupInfo(matches, mapped, previousGroups):
        """Return the match/start/end dict for one group of ``matches``.

        Groups that did not participate (``None``) or matched the empty
        string contribute a length of zero.
        """
        start = matches.start() + sum(
            len(matches.group(number) or '') for number in previousGroups)
        text = matches.group(mapped)
        return {
            'match': text,
            'start': start,
            'end': start + len(text or ''),
        }

    def execForAllGroups(self, string, includeFullMatch=False):
        """Return offset info for every original group of the first match.

        Args:
            string: Text to search.
            includeFullMatch: When true, the result starts with an entry for
                the whole match (group 0).

        Returns:
            A list of ``{'match', 'start', 'end'}`` dicts in group order, or
            ``None`` when the pattern does not match ``string``.
        """
        matches = self.regexp.search(string)
        if matches is None:
            return None

        results = []
        if includeFullMatch:
            results.append(self._groupInfo(matches, 0, []))
        results.extend(
            self._groupInfo(matches,
                            self.groupIndexMapper[group],
                            self.previousGroupsForGroup[group])
            for group in self.groupIndexMapper
        )
        return results

    def execForGroup(self, string, group):
        """Return offset info for a single original group of the first match.

        Args:
            string: Text to search.
            group: Original group number; ``0`` selects the whole match.

        Returns:
            A ``{'match', 'start', 'end'}`` dict, or ``None`` when the
            pattern does not match ``string``.

        Raises:
            KeyError: If ``group`` is non-zero and not a known group number.
        """
        matches = self.regexp.search(string)
        if matches is None:
            return None

        if group == 0:
            return self._groupInfo(matches, 0, [])
        return self._groupInfo(matches,
                               self.groupIndexMapper[group],
                               self.previousGroupsForGroup[group])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demo: locate the three capture groups of a sample pattern in a sample line.
# The pattern is a raw string so the regex escapes `\s` and `\-` reach the
# regex engine unchanged instead of being parsed as (invalid) string escapes.
test_patt = r'^([^\s]*)\-at job ([^\s]*) ([^\s]*)$'
test_str = 'failure-at job qa-k8s-build_designer-be 8'
test = betterRegex(test_patt)
# Offsets for groups 1..3 only.
print(test.execForAllGroups(test_str))
# Same, preceded by an entry for the whole match.
print(test.execForAllGroups(test_str, includeFullMatch=True))
# Offsets for the second group alone.
print(test.execForGroup(test_str, group=2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment