huynhducduy/better_regex.py

## better_regex.py
def addGroupToRegexString(str, start, end, groupsAdded):
    """Return ``str`` with the inclusive span ``start..end`` wrapped in ``( )``.

    ``start`` and ``end`` are shifted right by two characters for every
    group counted in ``groupsAdded`` (one "(" and one ")" per group) before
    the parentheses are inserted.
    """
    shift = 2 * groupsAdded
    openAt = start + shift
    closeAfter = end + shift + 1
    pieces = (str[:openAt], '(', str[openAt:closeAfter], ')', str[closeAfter:])
    return ''.join(pieces)


def fillGroups(regex):
    """Rewrite a pattern so that text outside its capture groups is captured too.

    The pattern is scanned for parentheses. Stretches that are not already
    inside a capture group are wrapped in additional capture groups, and
    bookkeeping is kept so that the offset of an original group inside a
    match can later be computed by summing the lengths of the groups
    recorded as preceding it.

    Args:
        regex: A compiled pattern (its ``pattern`` and ``flags`` are reused)
            or any object whose ``str()`` is the pattern text.

    Returns:
        A three-item list:
        ``[compiledModifiedPattern, groupIndexMapper, previousGroupsForGroup]``
        where ``groupIndexMapper`` maps each original group number to its
        number in the modified pattern, and ``previousGroupsForGroup`` maps
        each original group number to the list of modified-pattern group
        numbers recorded as preceding it.

    Limitations visible from the scanner expression below: every ``(?``
    opener is treated as non-capturing (so named groups are not counted),
    parentheses inside character classes are not special-cased, and a
    backslash directly before a parenthesis always marks it as escaped.
    """
    import re

    # isinstance instead of comparing the repr of the type.
    if isinstance(regex, re.Pattern):
        regexString = regex.pattern
        modifier = regex.flags
    else:
        regexString = str(regex)
        modifier = re.UNICODE  # numeric value 32, as previously hard-coded

    # Alternatives, in order: escaped "(", escaped ")", a "(?" opener,
    # a capturing "(", and ")" together with any trailing quantifier.
    tester = r"(\\\()|(\\\))|(\(\?)|(\()|(\)(?:\{\d+,?\d*}|[*+?])?\??)"

    modifiedRegex = regexString

    lastGroupStartPosition = -1
    lastGroupEndPosition = -1
    lastNonGroupStartPosition = -1
    lastNonGroupEndPosition = -1
    groupsAdded = 0             # capture groups inserted so far
    groupCount = 0              # original capture groups seen so far
    nonGroupPositions = []      # stack of currently open "(?" openers
    groupPositions = []         # stack of currently open capturing "("
    groupNumber = []            # modified-pattern numbers of the open groups
    currentLengthIndexes = []   # modified-pattern groups preceding the cursor
    groupIndexMapper = {}
    previousGroupsForGroup = {}

    for matchArr in re.finditer(tester, regexString):
        if matchArr.group(1) or matchArr.group(2):
            # Escaped parenthesis: a literal character, not structure.
            continue

        # Position of the last character of this token in the original text.
        index = matchArr.end() - 1

        if matchArr.group(3):
            lastNonGroupStartPosition = index
            nonGroupPositions.append(index)
        elif matchArr.group(4):
            # Furthest position already accounted for by a capture group.
            lastGroupPosition = max(
                lastGroupStartPosition, lastGroupEndPosition)
            if lastNonGroupStartPosition > lastGroupPosition:
                # A "(?" opener was seen after the last capture-group boundary.
                if lastGroupPosition < lastNonGroupEndPosition:
                    if (lastNonGroupEndPosition - 1) - (lastGroupPosition + 1) > 0:
                        modifiedRegex = addGroupToRegexString(
                            modifiedRegex, lastGroupPosition + 1,
                            lastNonGroupEndPosition - 1, groupsAdded)
                        groupsAdded += 1
                        lastGroupEndPosition = lastNonGroupEndPosition - 1
                        currentLengthIndexes.append(groupCount + groupsAdded)
                    if (lastNonGroupStartPosition - 1) - (lastNonGroupEndPosition + 1) > 0:
                        modifiedRegex = addGroupToRegexString(
                            modifiedRegex, lastNonGroupEndPosition + 1,
                            lastNonGroupStartPosition - 2, groupsAdded)
                        groupsAdded += 1
                        lastGroupEndPosition = lastNonGroupStartPosition - 1
                        currentLengthIndexes.append(groupCount + groupsAdded)
                else:
                    modifiedRegex = addGroupToRegexString(
                        modifiedRegex, lastGroupPosition + 1,
                        lastNonGroupStartPosition - 2, groupsAdded)
                    groupsAdded += 1
                    lastGroupEndPosition = lastNonGroupStartPosition - 1
                    currentLengthIndexes.append(groupCount + groupsAdded)
                if index > lastNonGroupStartPosition + 2:
                    modifiedRegex = addGroupToRegexString(
                        modifiedRegex, lastNonGroupStartPosition + 2,
                        index - 1, groupsAdded)
                    groupsAdded += 1
                    lastGroupEndPosition = index - 1
                    currentLengthIndexes.append(groupCount + groupsAdded)
            elif lastGroupPosition < index - 1:
                # Plain text sits between the last group boundary and this "(".
                modifiedRegex = addGroupToRegexString(
                    modifiedRegex, lastGroupPosition + 1, index - 1, groupsAdded)
                groupsAdded += 1
                lastGroupEndPosition = index - 1
                currentLengthIndexes.append(groupCount + groupsAdded)

            groupCount += 1
            lastGroupStartPosition = index
            groupPositions.append(index)
            groupNumber.append(groupCount + groupsAdded)
            groupIndexMapper[groupCount] = groupCount + groupsAdded
            previousGroupsForGroup[groupCount] = currentLengthIndexes[:]
        elif matchArr.group(5):
            # Decide which kind of opener this ")" closes. The emptiness
            # checks come first so that a ")" seen while no capture group is
            # open (e.g. the pattern "(?:a)") no longer raises IndexError.
            closesCaptureGroup = bool(groupPositions) and (
                not nonGroupPositions
                or groupPositions[-1] > nonGroupPositions[-1])
            if closesCaptureGroup:
                if lastGroupStartPosition < lastGroupEndPosition and lastGroupEndPosition < index - 1:
                    modifiedRegex = addGroupToRegexString(
                        modifiedRegex, lastGroupEndPosition + 1, index - 1, groupsAdded)
                    groupsAdded += 1
                    currentLengthIndexes.append(groupCount + groupsAdded)

                groupPositions.pop()
                lastGroupEndPosition = index

                toPush = groupNumber.pop()
                currentLengthIndexes.append(toPush)
                # Keep only entries numbered no higher than the group that
                # just closed.
                currentLengthIndexes = [
                    i for i in currentLengthIndexes if i <= toPush]
            elif nonGroupPositions:
                nonGroupPositions.pop()
                lastNonGroupEndPosition = index

    return [re.compile(modifiedRegex, flags=modifier), groupIndexMapper, previousGroupsForGroup]


class betterRegex:
    """Report the text and the start/end offsets of each capture group.

    The pattern is passed through ``fillGroups``; the resulting compiled
    pattern and its two lookup tables are stored on the instance and used to
    compute each group's offsets by summing the lengths of the groups
    recorded as preceding it.
    """

    def __init__(self, baseRegExp):
        """Prepare the augmented pattern for ``baseRegExp``.

        Args:
            baseRegExp: Pattern text or compiled pattern, forwarded unchanged
                to ``fillGroups``.
        """
        self.regexp, self.groupIndexMapper, self.previousGroupsForGroup = fillGroups(
            baseRegExp)

    @staticmethod
    def _describeGroup(matches, mapped, precedingGroups):
        """Build the ``{'match', 'start', 'end'}`` record for one group.

        ``start`` is the start of the whole match plus the lengths of all
        groups in ``precedingGroups``; groups that did not participate in the
        match count as length 0.
        """
        start = matches.start() + sum(
            len(matches.group(i) or '') for i in precedingGroups)
        text = matches.group(mapped)
        return {
            'match': text,
            'start': start,
            'end': start + len(text or ''),
        }

    def execForAllGroups(self, string, includeFullMatch=False):
        """Describe every original capture group of the first match.

        Args:
            string: Text to search.
            includeFullMatch: When true, a record for group 0 (the whole
                match) is placed first in the result.

        Returns:
            A list of ``{'match', 'start', 'end'}`` dicts, one per group in
            ``groupIndexMapper`` order, or ``None`` when nothing matches.
        """
        matches = self.regexp.search(string)
        if matches is None:
            return None

        results = []
        if includeFullMatch:
            results.append(self._describeGroup(matches, 0, []))
        for group, mapped in self.groupIndexMapper.items():
            results.append(self._describeGroup(
                matches, mapped, self.previousGroupsForGroup[group]))
        return results

    def execForGroup(self, string, group):
        """Describe a single capture group of the first match.

        Args:
            string: Text to search.
            group: Original group number; 0 selects the whole match.

        Returns:
            A ``{'match', 'start', 'end'}`` dict, or ``None`` when nothing
            matches.

        Raises:
            KeyError: If ``group`` is neither 0 nor a key of
                ``groupIndexMapper``.
        """
        matches = self.regexp.search(string)
        if matches is None:
            return None

        if group == 0:
            return self._describeGroup(matches, 0, [])
        return self._describeGroup(
            matches, self.groupIndexMapper[group],
            self.previousGroupsForGroup[group])

## how_to_use.py
# Usage example: build a betterRegex from a three-group pattern and print the
# per-group results for one sample line.
# Raw string: "\s" and "\-" are regex escapes, not Python string escapes, and
# are invalid escape sequences in a non-raw literal.
test_patt = r'^([^\s]*)\-at job ([^\s]*) ([^\s]*)$'
test_str = 'failure-at job qa-k8s-build_designer-be 8'

test = betterRegex(test_patt)

# Every capture group, without and then with the whole-match entry.
print(test.execForAllGroups(test_str))
print(test.execForAllGroups(test_str, includeFullMatch=True))
# Only the second capture group.
print(test.execForGroup(test_str, group=2))
	def addGroupToRegexString(str, start, end, groupsAdded):
	    """Return ``str`` with the inclusive span ``start..end`` wrapped in ``( )``.

	    ``start`` and ``end`` are shifted right by two characters for every
	    group counted in ``groupsAdded`` (one "(" and one ")" per group) before
	    the parentheses are inserted.
	    """
	    # Body indentation restored; it had been flattened to the def's level.
	    start += groupsAdded * 2
	    end += groupsAdded * 2
	    return str[0:start] + '(' + str[start:end+1] + ')' + str[end+1:]


	def fillGroups(regex):
	    """Rewrite a pattern so that text outside its capture groups is captured too.

	    The pattern is scanned for parentheses. Stretches that are not already
	    inside a capture group are wrapped in additional capture groups, and
	    bookkeeping is kept so that the offset of an original group inside a
	    match can later be computed by summing the lengths of the groups
	    recorded as preceding it.

	    Args:
	        regex: A compiled pattern (its ``pattern`` and ``flags`` are reused)
	            or any object whose ``str()`` is the pattern text.

	    Returns:
	        A three-item list:
	        ``[compiledModifiedPattern, groupIndexMapper, previousGroupsForGroup]``
	        where ``groupIndexMapper`` maps each original group number to its
	        number in the modified pattern, and ``previousGroupsForGroup`` maps
	        each original group number to the list of modified-pattern group
	        numbers recorded as preceding it.

	    Limitations visible from the scanner expression below: every ``(?``
	    opener is treated as non-capturing (so named groups are not counted),
	    parentheses inside character classes are not special-cased, and a
	    backslash directly before a parenthesis always marks it as escaped.
	    """
	    import re

	    # isinstance instead of comparing the repr of the type.
	    if isinstance(regex, re.Pattern):
	        regexString = regex.pattern
	        modifier = regex.flags
	    else:
	        regexString = str(regex)
	        modifier = re.UNICODE  # numeric value 32, as previously hard-coded

	    # Alternatives, in order: escaped "(", escaped ")", a "(?" opener,
	    # a capturing "(", and ")" together with any trailing quantifier.
	    # The "|" separators must be unescaped and the "*" quantifiers present,
	    # otherwise the expression only matches literal pipe characters.
	    tester = r"(\\\()|(\\\))|(\(\?)|(\()|(\)(?:\{\d+,?\d*}|[*+?])?\??)"

	    modifiedRegex = regexString

	    lastGroupStartPosition = -1
	    lastGroupEndPosition = -1
	    lastNonGroupStartPosition = -1
	    lastNonGroupEndPosition = -1
	    groupsAdded = 0             # capture groups inserted so far
	    groupCount = 0              # original capture groups seen so far
	    nonGroupPositions = []      # stack of currently open "(?" openers
	    groupPositions = []         # stack of currently open capturing "("
	    groupNumber = []            # modified-pattern numbers of the open groups
	    currentLengthIndexes = []   # modified-pattern groups preceding the cursor
	    groupIndexMapper = {}
	    previousGroupsForGroup = {}

	    for matchArr in re.finditer(tester, regexString):
	        if matchArr.group(1) or matchArr.group(2):
	            # Escaped parenthesis: a literal character, not structure.
	            continue

	        # Position of the last character of this token in the original text.
	        index = matchArr.end() - 1

	        if matchArr.group(3):
	            lastNonGroupStartPosition = index
	            nonGroupPositions.append(index)
	        elif matchArr.group(4):
	            # Furthest position already accounted for by a capture group.
	            lastGroupPosition = max(
	                lastGroupStartPosition, lastGroupEndPosition)
	            if lastNonGroupStartPosition > lastGroupPosition:
	                # A "(?" opener was seen after the last capture-group boundary.
	                if lastGroupPosition < lastNonGroupEndPosition:
	                    if (lastNonGroupEndPosition - 1) - (lastGroupPosition + 1) > 0:
	                        modifiedRegex = addGroupToRegexString(
	                            modifiedRegex, lastGroupPosition + 1,
	                            lastNonGroupEndPosition - 1, groupsAdded)
	                        groupsAdded += 1
	                        lastGroupEndPosition = lastNonGroupEndPosition - 1
	                        currentLengthIndexes.append(groupCount + groupsAdded)
	                    if (lastNonGroupStartPosition - 1) - (lastNonGroupEndPosition + 1) > 0:
	                        modifiedRegex = addGroupToRegexString(
	                            modifiedRegex, lastNonGroupEndPosition + 1,
	                            lastNonGroupStartPosition - 2, groupsAdded)
	                        groupsAdded += 1
	                        lastGroupEndPosition = lastNonGroupStartPosition - 1
	                        currentLengthIndexes.append(groupCount + groupsAdded)
	                else:
	                    modifiedRegex = addGroupToRegexString(
	                        modifiedRegex, lastGroupPosition + 1,
	                        lastNonGroupStartPosition - 2, groupsAdded)
	                    groupsAdded += 1
	                    lastGroupEndPosition = lastNonGroupStartPosition - 1
	                    currentLengthIndexes.append(groupCount + groupsAdded)
	                if index > lastNonGroupStartPosition + 2:
	                    modifiedRegex = addGroupToRegexString(
	                        modifiedRegex, lastNonGroupStartPosition + 2,
	                        index - 1, groupsAdded)
	                    groupsAdded += 1
	                    lastGroupEndPosition = index - 1
	                    currentLengthIndexes.append(groupCount + groupsAdded)
	            elif lastGroupPosition < index - 1:
	                # Plain text sits between the last group boundary and this "(".
	                modifiedRegex = addGroupToRegexString(
	                    modifiedRegex, lastGroupPosition + 1, index - 1, groupsAdded)
	                groupsAdded += 1
	                lastGroupEndPosition = index - 1
	                currentLengthIndexes.append(groupCount + groupsAdded)

	            groupCount += 1
	            lastGroupStartPosition = index
	            groupPositions.append(index)
	            groupNumber.append(groupCount + groupsAdded)
	            groupIndexMapper[groupCount] = groupCount + groupsAdded
	            previousGroupsForGroup[groupCount] = currentLengthIndexes[:]
	        elif matchArr.group(5):
	            # Decide which kind of opener this ")" closes. The emptiness
	            # checks come first so that a ")" seen while no capture group is
	            # open (e.g. the pattern "(?:a)") does not raise IndexError.
	            closesCaptureGroup = bool(groupPositions) and (
	                not nonGroupPositions
	                or groupPositions[-1] > nonGroupPositions[-1])
	            if closesCaptureGroup:
	                if lastGroupStartPosition < lastGroupEndPosition and lastGroupEndPosition < index - 1:
	                    modifiedRegex = addGroupToRegexString(
	                        modifiedRegex, lastGroupEndPosition + 1, index - 1, groupsAdded)
	                    groupsAdded += 1
	                    currentLengthIndexes.append(groupCount + groupsAdded)

	                groupPositions.pop()
	                lastGroupEndPosition = index

	                toPush = groupNumber.pop()
	                currentLengthIndexes.append(toPush)
	                # Keep only entries numbered no higher than the group that
	                # just closed.
	                currentLengthIndexes = [
	                    i for i in currentLengthIndexes if i <= toPush]
	            elif nonGroupPositions:
	                nonGroupPositions.pop()
	                lastNonGroupEndPosition = index

	    return [re.compile(modifiedRegex, flags=modifier), groupIndexMapper, previousGroupsForGroup]


	class betterRegex:
	    """Report the text and the start/end offsets of each capture group.

	    The pattern is passed through ``fillGroups``; the resulting compiled
	    pattern and its two lookup tables are stored on the instance and used to
	    compute each group's offsets by summing the lengths of the groups
	    recorded as preceding it.
	    """

	    def __init__(self, baseRegExp):
	        """Prepare the augmented pattern for ``baseRegExp``.

	        Args:
	            baseRegExp: Pattern text or compiled pattern, forwarded unchanged
	                to ``fillGroups``.
	        """
	        self.regexp, self.groupIndexMapper, self.previousGroupsForGroup = fillGroups(
	            baseRegExp)

	    @staticmethod
	    def _describeGroup(matches, mapped, precedingGroups):
	        """Build the ``{'match', 'start', 'end'}`` record for one group.

	        ``start`` is the start of the whole match plus the lengths of all
	        groups in ``precedingGroups``; groups that did not participate in the
	        match count as length 0.
	        """
	        start = matches.start() + sum(
	            len(matches.group(i) or '') for i in precedingGroups)
	        text = matches.group(mapped)
	        return {
	            'match': text,
	            'start': start,
	            'end': start + len(text or ''),
	        }

	    def execForAllGroups(self, string, includeFullMatch=False):
	        """Describe every original capture group of the first match.

	        Args:
	            string: Text to search.
	            includeFullMatch: When true, a record for group 0 (the whole
	                match) is placed first in the result.

	        Returns:
	            A list of ``{'match', 'start', 'end'}`` dicts, one per group in
	            ``groupIndexMapper`` order, or ``None`` when nothing matches.
	        """
	        matches = self.regexp.search(string)
	        if matches is None:
	            return None

	        results = []
	        if includeFullMatch:
	            results.append(self._describeGroup(matches, 0, []))
	        for group, mapped in self.groupIndexMapper.items():
	            results.append(self._describeGroup(
	                matches, mapped, self.previousGroupsForGroup[group]))
	        return results

	    def execForGroup(self, string, group):
	        """Describe a single capture group of the first match.

	        Args:
	            string: Text to search.
	            group: Original group number; 0 selects the whole match.

	        Returns:
	            A ``{'match', 'start', 'end'}`` dict, or ``None`` when nothing
	            matches.

	        Raises:
	            KeyError: If ``group`` is neither 0 nor a key of
	                ``groupIndexMapper``.
	        """
	        matches = self.regexp.search(string)
	        if matches is None:
	            return None

	        if group == 0:
	            return self._describeGroup(matches, 0, [])
	        return self._describeGroup(
	            matches, self.groupIndexMapper[group],
	            self.previousGroupsForGroup[group])
	# Usage example: build a betterRegex from a three-group pattern and print the
	# per-group results for one sample line.
	# Each character class needs its "*" quantifier for the pattern to match
	# test_str; a raw string keeps "\s" and "\-" as regex escapes.
	test_patt = r'^([^\s]*)\-at job ([^\s]*) ([^\s]*)$'
	test_str = 'failure-at job qa-k8s-build_designer-be 8'

	test = betterRegex(test_patt)

	# Every capture group, without and then with the whole-match entry.
	print(test.execForAllGroups(test_str))
	print(test.execForAllGroups(test_str, includeFullMatch=True))
	# Only the second capture group.
	print(test.execForGroup(test_str, group=2))