Skip to content

Instantly share code, notes, and snippets.

@huynhducduy
Created November 21, 2019 04:14
Show Gist options
  • Save huynhducduy/696a090cac38ddac3c1d223f9b1d6eeb to your computer and use it in GitHub Desktop.
Save huynhducduy/696a090cac38ddac3c1d223f9b1d6eeb to your computer and use it in GitHub Desktop.
REGEX: Get the start and end positions of capture groups
def addGroupToRegexString(str, start, end, groupsAdded):
    """Return ``str`` with the inclusive span ``start``..``end`` parenthesised.

    ``start`` and ``end`` are both moved right by two characters for every
    group counted in ``groupsAdded`` before the parentheses are inserted.

    Args:
        str: Pattern text to modify (the name shadows the builtin; it is kept
            because it is part of the signature).
        start: Index of the first character to enclose.
        end: Index of the last character to enclose (inclusive).
        groupsAdded: Number of groups accounted for in the two-character shift.

    Returns:
        A new string with "(" inserted before the span and ")" after it.
    """
    shift = 2 * groupsAdded
    open_at = start + shift
    close_at = end + shift + 1  # exclusive upper bound of the enclosed span
    head, body, tail = str[:open_at], str[open_at:close_at], str[close_at:]
    return head + '(' + body + ')' + tail
def fillGroups(regex):
    """Add filler capture groups around the text between existing groups.

    The pattern is scanned for group delimiters. Text lying before or between
    capturing groups is wrapped in extra groups with
    ``addGroupToRegexString`` and the resulting renumbering is recorded.

    Args:
        regex: A compiled ``re.Pattern`` (its ``pattern`` and ``flags`` are
            reused), or any other object, which is converted with ``str()``
            and compiled with ``re.UNICODE``.

    Returns:
        A three-element list ``[compiled, groupIndexMapper,
        previousGroupsForGroup]`` where

        * ``compiled`` is the rewritten pattern, compiled;
        * ``groupIndexMapper`` maps each original group number to its number
          in the rewritten pattern;
        * ``previousGroupsForGroup`` maps each original group number to a
          snapshot of the rewritten-pattern group numbers tracked in
          ``currentLengthIndexes`` at the moment that group opened.

    Raises:
        re.error: If the rewritten pattern does not compile.

    NOTE(review): the delimiter scanner treats every "(?" as a non-capturing
    opener and does not special-case brackets inside character classes, so
    patterns using "(?P<name>...)" or "[(]" are presumably not handled
    correctly -- confirm before relying on them.
    """
    import re

    if isinstance(regex, re.Pattern):
        regexString = regex.pattern
        modifier = regex.flags
    else:
        regexString = str(regex)
        modifier = re.UNICODE  # numerically 32, the value used historically

    # Alternatives, in order: escaped "(", escaped ")", "(?" opener, plain "("
    # opener, and ")" together with an optional quantifier / lazy marker.
    tester = r"(\\\()|(\\\))|(\(\?)|(\()|(\)(?:\{\d+,?\d*}|[*+?])?\??)"

    modifiedRegex = regexString
    # All positions below are indexes into the ORIGINAL regexString;
    # addGroupToRegexString compensates for parentheses already inserted.
    lastGroupStartPosition = -1
    lastGroupEndPosition = -1
    lastNonGroupStartPosition = -1
    lastNonGroupEndPosition = -1
    groupsAdded = 0
    groupCount = 0
    nonGroupPositions = []  # stack of currently open "(?" groups
    groupPositions = []  # stack of currently open capturing groups
    groupNumber = []  # rewritten numbers of the open capturing groups
    currentLengthIndexes = []
    groupIndexMapper = {}
    previousGroupsForGroup = {}

    for matchArr in re.finditer(tester, regexString):
        if matchArr.group(1) or matchArr.group(2):
            # Escaped bracket: a literal character, not a delimiter.
            continue

        # Index of the last character of the matched delimiter.
        index = matchArr.end() - 1

        if matchArr.group(3):
            lastNonGroupStartPosition = index
            nonGroupPositions.append(index)

        elif matchArr.group(4):
            lastGroupPosition = max(
                lastGroupStartPosition, lastGroupEndPosition)

            if lastNonGroupStartPosition > lastGroupPosition:
                # A "(?" opener sits between the last capturing-group
                # boundary and this "(".
                if lastGroupPosition < lastNonGroupEndPosition:
                    # A "(?" group also closed in between: wrap the text on
                    # each side of that closing bracket separately.
                    if (lastNonGroupEndPosition - 1) - (lastGroupPosition + 1) > 0:
                        modifiedRegex = addGroupToRegexString(
                            modifiedRegex, lastGroupPosition + 1,
                            lastNonGroupEndPosition - 1, groupsAdded)
                        groupsAdded += 1
                        lastGroupEndPosition = lastNonGroupEndPosition - 1
                        currentLengthIndexes.append(groupCount + groupsAdded)
                    if (lastNonGroupStartPosition - 1) - (lastNonGroupEndPosition + 1) > 0:
                        modifiedRegex = addGroupToRegexString(
                            modifiedRegex, lastNonGroupEndPosition + 1,
                            lastNonGroupStartPosition - 2, groupsAdded)
                        groupsAdded += 1
                        lastGroupEndPosition = lastNonGroupStartPosition - 1
                        currentLengthIndexes.append(groupCount + groupsAdded)
                else:
                    # Wrap everything up to just before the "(?" opener.
                    modifiedRegex = addGroupToRegexString(
                        modifiedRegex, lastGroupPosition + 1,
                        lastNonGroupStartPosition - 2, groupsAdded)
                    groupsAdded += 1
                    lastGroupEndPosition = lastNonGroupStartPosition - 1
                    currentLengthIndexes.append(groupCount + groupsAdded)

                if index > lastNonGroupStartPosition + 2:
                    # Wrap the text between the "(?" opener and this "(".
                    modifiedRegex = addGroupToRegexString(
                        modifiedRegex, lastNonGroupStartPosition + 2,
                        index - 1, groupsAdded)
                    groupsAdded += 1
                    lastGroupEndPosition = index - 1
                    currentLengthIndexes.append(groupCount + groupsAdded)

            elif lastGroupPosition < index - 1:
                # Plain text between the previous group boundary and this "(".
                modifiedRegex = addGroupToRegexString(
                    modifiedRegex, lastGroupPosition + 1, index - 1,
                    groupsAdded)
                groupsAdded += 1
                lastGroupEndPosition = index - 1
                currentLengthIndexes.append(groupCount + groupsAdded)

            groupCount += 1
            lastGroupStartPosition = index
            groupPositions.append(index)
            groupNumber.append(groupCount + groupsAdded)
            groupIndexMapper[groupCount] = groupCount + groupsAdded
            previousGroupsForGroup[groupCount] = currentLengthIndexes[:]

        elif matchArr.group(5):
            # The bracket closes a capturing group only if one is open and it
            # was opened after the innermost open "(?" group. Guarding on a
            # non-empty groupPositions avoids indexing an empty list when
            # only "(?" groups are open (e.g. the pattern "(?:x)").
            closesCapturingGroup = bool(groupPositions) and (
                not nonGroupPositions
                or groupPositions[-1] > nonGroupPositions[-1])

            if closesCapturingGroup:
                if lastGroupStartPosition < lastGroupEndPosition and lastGroupEndPosition < index - 1:
                    # Text after a nested group, before this closing bracket.
                    modifiedRegex = addGroupToRegexString(
                        modifiedRegex, lastGroupEndPosition + 1, index - 1,
                        groupsAdded)
                    groupsAdded += 1
                    currentLengthIndexes.append(groupCount + groupsAdded)
                groupPositions.pop()
                lastGroupEndPosition = index
                toPush = groupNumber.pop()
                currentLengthIndexes.append(toPush)
                # Keep only group numbers up to the one that just closed.
                currentLengthIndexes = [
                    i for i in currentLengthIndexes if i <= toPush]
            elif nonGroupPositions:
                nonGroupPositions.pop()
                lastNonGroupEndPosition = index

    return [re.compile(modifiedRegex, flags=modifier), groupIndexMapper, previousGroupsForGroup]
class betterRegex:
    """Report the text and start/end offsets of a pattern's capture groups.

    The pattern is first rewritten by ``fillGroups``. A group's start offset
    is then computed as the start of the overall match plus the lengths of
    the rewritten-pattern groups listed for it in ``previousGroupsForGroup``.

    NOTE(review): ``re.Match.start(group)`` / ``end(group)`` report group
    offsets directly; consider whether this reconstruction is still needed.

    Attributes set by ``__init__`` (all produced by ``fillGroups``):
        regexp: Compiled, rewritten pattern.
        groupIndexMapper: Original group number -> rewritten group number.
        previousGroupsForGroup: Original group number -> rewritten group
            numbers whose matched lengths are summed to obtain the offset.
    """

    def __init__(self, baseRegExp):
        """Rewrite ``baseRegExp`` with ``fillGroups`` and keep the result."""
        self.regexp, self.groupIndexMapper, self.previousGroupsForGroup = fillGroups(
            baseRegExp)

    def _describeGroup(self, matches, group):
        """Build the ``{'match', 'start', 'end'}`` record for one group.

        ``group`` is an original group number; ``0`` denotes the whole match.
        Raises ``KeyError`` if ``group`` is not a known group number.
        """
        if group == 0:
            mapped, previousGroups = 0, []
        else:
            mapped = self.groupIndexMapper[group]
            previousGroups = self.previousGroupsForGroup[group]

        # Groups that did not participate (None) contribute zero length.
        start = matches.start() + sum(
            len(matches.group(i) or "") for i in previousGroups)
        text = matches.group(mapped)
        return {
            'match': text,
            'start': start,
            'end': start + len(text or ""),
        }

    def execForAllGroups(self, string, includeFullMatch=False):
        """Describe every original capture group for the first match.

        Args:
            string: Text to search.
            includeFullMatch: When true, a record for the whole match (group
                0) is placed first in the result.

        Returns:
            A list of ``{'match', 'start', 'end'}`` dicts, one per group in
            group-number order, or ``None`` if the pattern does not match.
        """
        matches = self.regexp.search(string)
        if matches is None:
            return None
        groups = list(self.groupIndexMapper)
        if includeFullMatch:
            groups = [0] + [g for g in groups if g != 0]
        return [self._describeGroup(matches, g) for g in groups]

    def execForGroup(self, string, group):
        """Describe a single capture group for the first match.

        Args:
            string: Text to search.
            group: Original group number; ``0`` selects the whole match.

        Returns:
            A ``{'match', 'start', 'end'}`` dict, or ``None`` if the pattern
            does not match.

        Raises:
            KeyError: If ``group`` is non-zero and not a known group number.
        """
        matches = self.regexp.search(string)
        if matches is None:
            return None
        return self._describeGroup(matches, group)
# Demo: locate the capture groups of a sample pattern in a sample string.
# The pattern is a raw string so that "\s" and "\-" reach the regex engine
# unchanged instead of being parsed as (invalid) string escape sequences.
test_patt = r'^([^\s]*)\-at job ([^\s]*) ([^\s]*)$'
test_str = 'failure-at job qa-k8s-build_designer-be 8'
test = betterRegex(test_patt)
print(test.execForAllGroups(test_str))
print(test.execForAllGroups(test_str, includeFullMatch=True))
print(test.execForGroup(test_str, group=2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment