Skip to content

Instantly share code, notes, and snippets.

@fcicq
Last active July 14, 2021 23:37
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save fcicq/3394647 to your computer and use it in GitHub Desktop.
Save fcicq/3394647 to your computer and use it in GitHub Desktop.
Regexp::Trie in Python, by rex @ iregex.org. similar with Aho-Corasick
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
#author: rex
#blog: http://iregex.org
#filename tr.py
#created: 2010-08-01 20:24
#source uri: http://iregex.org/blog/trie-in-python.html
# escape bug fix by fcicq @ 2012.8.19
import re
class Trie():
"""Regexp::Trie in python"""
def __init__(self):
self.data={}
def add(self, word):
ref=self.data
for char in word:
ref[char]=ref.has_key(char) and ref[char] or {}
ref=ref[char]
ref['']=1
def dump(self):
return self.data
def quote(self, char):
return re.escape(char)
def _regexp(self, pData):
data=pData
if data.has_key("") and len(data.keys())==1:
return None
alt=[]
cc=[]
q=0
for char in sorted(data.keys()):
if isinstance(data[char],dict):
try:
recurse=self._regexp(data[char])
alt.append(self.quote(char)+recurse)
except:
cc.append(self.quote(char))
else:
q=1
cconly=not len(alt)>0 # backport from atiking's gist
if len(cc)>0:
if len(cc)==1:
alt.append(cc[0])
else:
alt.append('['+''.join(cc)+']')
if len(alt)==1:
result=alt[0]
else:
result="(?:"+"|".join(alt)+")"
if q:
if cconly:
result+="?"
else:
result="(?:%s)?" % result
return result
def regexp(self):
# return "(?-xism:%s)" % self._regexp(self.dump()) # fcicq: not sure what is it...
return self._regexp(self.dump())
if __name__ == '__main__':
a=Trie()
for w in ['foobar', 'foobah', 'fooxar', 'foozap', 'fooza']:
a.add(w)
print a.regexp()
@henix
Copy link

henix commented Feb 21, 2014

发现了一个 bug :'a' 和 'abc' 这组数据会给出 'abc?'

@henix
Copy link

henix commented Feb 21, 2014

好吧,已经有人修了:https://gist.github.com/atiking/7525326

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment