Skip to content

Instantly share code, notes, and snippets.

@pschwede
Last active September 24, 2015 22:58
Show Gist options
  • Save pschwede/823057 to your computer and use it in GitHub Desktop.
Save pschwede/823057 to your computer and use it in GitHub Desktop.
Reduces a set of strings to one regular expression string. Respects a black list.
'''
===RegexReduce===
Goal is to generate regular expressions out of lists of strings to match
all strings in that list.
This happens by reducing strings to their unique substrings..
@author: spazzpp2
'''
import re
class RegexReduce:
def __init__(self):
self.regex = ""
self.allowed = []
self.forbidden = []
pass
def whitestr(self, str):
whitelist([str])
def whitelist(self, listOfStrings):
self.allowed += listOfStrings
def blackstr(self, str):
self.blacklist([str])
def blacklist(self, listOfStrings):
self.forbidden += listOfStrings
def __same(self, a, b):
if len(a) < len(b):
a, b = b, a
res = ""
i = 0
while i < len(a)-len(res):
j = b.find(a[i])
if j > -1:
c = 0
while i+c < len(a) and j+c < len(b) and a[i+c] == b[j+c]:
c += 1
if len(res) < c:
res = b[j:j+c]
i += 1
return res
def __do(self, l, nolist):
# remove double strings
l = list(set(l))
nolist = list(set(nolist))
# gather similarites between the strings in l
eq = set([self.__same(a,b) for a in l for b in l if a!=b]) - set([""])
# gather similarities between the strings in l and the strings in nolist
neq = set([self.__same(a,b) for a in l for b in nolist if a!=b]) - set([""])
# remove similarities from eq which may lead to allow forbidden strings
eq -= neq
# make sure all allowed strings are still recognized
# for that we need to gather all strings which arent covered by eq
#eqall = [self.__same(a,b) for a in l+nolist for b in l+nolist]
res = set([])
for k in l:
for e in eq:
if e in k:
break
else:
t = str(k)
allsame = list(set([self.__same(a,b) for a in neq|set(l) for b in neq|set(l) if a!=b])-set(['']))
allsame.sort(lambda a,b: cmp(len(a), len(b)))
while len(allsame) and len(allsame[0]) > 2:
t = t.replace(self.__escape(allsame[0]),".+",1)
allsame.remove(allsame[0])
res |= set([t])
s = "("
if len(res):
s += "|".join(map(lambda a: self.__escape(a), res))
if len(eq):
return s + "|" + ".*" + ".*|.*".join(map(lambda a: self.__escape(a), eq)) + ".*)"
else:
return s + ")"
def __escape(self, word):
return word.replace(".","\.").replace("^","\^")
def __str__(self):
self.regex = self.__do(self.allowed, self.forbidden)
return self.regex
if __name__ == "__main__":
rg = RegexGen()
#rg.addlist(["xx","abcabc", "efabc","ref","abef"])
#rg.forbidlist(["xxx","cabc"])
rg.addlist(["Python - Mozilla Firefox", "Python.org - Mozilla Firefox", "Pythonizer - Mozilla Firefox"])
rg.forbidlist(["Ministry of Silly Walking (Monty) - Mozilla Firefox"])
print rg
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment