Created
April 24, 2013 17:57
-
-
Save apiguy/5454125 to your computer and use it in GitHub Desktop.
Grouping strings by prefixes in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Gribouillis at www.daniweb.com | |
""" | |
The class Grouper implements grouping strings by prefixes. | |
A Grouper object is created with a sequence of prefixes | |
G = Grouper(["pref1", "pref2", ...]) | |
(an empty prefix is automatically added). The Grouper object | |
is a dictionary "prefix --> set of strings". Initially, it looks | |
like | |
{ | |
"pref1" : set([]), | |
"pref2" : set([]), | |
} | |
The method 'add_strings' can be used to add a sequence of strings | |
to the grouper. Each string goes in one of the sets according to | |
the longest prefix it contains. | |
After | |
G.add_strings(["pref1aaaaa", "pref2jjj", "pref1s"]) | |
it's content will be | |
{ | |
"pref1" : set(["pref1aaaaa", "pref1s"]), | |
"pref2" : set(["pref2jjj"]), | |
} | |
More strings can be added. | |
""" | |
from collections import deque | |
from bisect import bisect_left ,bisect_right | |
class Grouper (dict ): | |
def __init__ (self ,iterable ): | |
prefixes =set (iterable ) | |
prefixes .add ("") | |
dict .__init__ (self ,[(p ,set ())for p in prefixes ]) | |
self .skeys =[] | |
def add_strings (self ,iterable ): | |
strings =sorted (set (iterable )) | |
if len (self .skeys )!=len (self ): | |
self .skeys =sorted (self )[1 :] | |
stack =deque ([("",chr (255 ))]) | |
k =0 | |
for p in self .skeys : | |
while stack [-1 ][1 ]<=p : | |
q ,qs =stack .pop () | |
k =self ._forward (q ,k ,qs ,strings ) | |
k =self ._forward (stack [-1 ][0 ],k ,p ,strings ) | |
stack .append ((p ,p [:-1 ]+chr (ord (p [-1 ])+1 ))) | |
while stack : | |
q ,qs =stack .pop () | |
k =self ._forward (q ,k ,qs ,strings ) | |
def _forward (self ,q ,k ,qs ,strings ): | |
ks =bisect_left (strings ,qs ,k ) | |
self [q ].update (strings [i ]for i in xrange (k ,ks )) | |
return ks | |
if __name__ =="__main__": | |
grouper =Grouper (["/usr","/usr/lib","hello","heat"]) | |
grouper .add_strings ([ | |
"/usr/bin/python","/usr/lib/python2.5","/dev", | |
"hello world","hello","heat foo","heat","her", | |
"/usr/liberation", | |
]) | |
for k ,s in grouper .items (): | |
print ("%s: %s"%(repr (k ),str (s ))) | |
print "Adding more strings" | |
grouper .add_strings ([ | |
"hello fred","/usrssss", | |
]) | |
for k ,s in grouper .items (): | |
print ("%s: %s"%(repr (k ),str (s ))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment