Skip to content

Instantly share code, notes, and snippets.

@pythononwheels
Created April 16, 2016 20:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pythononwheels/37f5570affe643b626358ba45799b764 to your computer and use it in GitHub Desktop.
Save pythononwheels/37f5570affe643b626358ba45799b764 to your computer and use it in GitHub Desktop.
compare tuples of strings with a given pattern in python using difflib.
#
# Test string comparison of string tuples
# python 3.4
# using difflib.
#
# see also: http://stackoverflow.com/questions/36643618/scoring-consistency-within-dataset
# khz 04/2016
#
import collections
import difflib
import functools
import json
import random
import string
import time
import tracemalloc
# Number of rows (tuples of slightly modified strings) that build() generates.
NUM_ROWS = 200000
# NOTE(review): not referenced anywhere in the visible code -- presumably a
# leftover progress-print interval; confirm before relying on or removing it.
PRINT_PERCENT = 10
def timing(f):
    """
    Decorator that reports how long each call to ``f`` took.

    The wrapped function receives exactly the positional and keyword
    arguments the caller supplied and its return value is passed through
    unchanged. After every call two lines are printed to stdout: the
    ``[ TIMIMG ]`` marker (spelling kept as emitted so far) and the
    elapsed time in milliseconds together with the function name.
    """
    @functools.wraps(f)  # keep f's __name__ / __doc__ on the wrapper
    def wrap(*args, **kwargs):
        # perf_counter is monotonic and high resolution; time.time() can
        # jump when the system clock is adjusted.
        start = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - start) * 1000.0
        print("[ TIMIMG ]")
        print("{0:20s} function took {1:5f} ms".format(f.__name__, elapsed_ms))
        return ret
    return wrap
def shuffle(inlist):
    """
    Return a tuple holding one randomly modified copy of every string
    in ``inlist``, in the same order.

    Each element is passed through shuffle_string(), so the result
    differs only slightly from the given default strings.
    """
    return tuple(shuffle_string(original) for original in inlist)
def shuffle_string(instr):
    """
    Return a copy of ``instr`` whose leading characters may have been
    replaced by random lowercase ASCII letters.

    Two random draws decide how much changes: first an upper bound below
    ``len(instr)``, then the actual count below that bound. Only the
    first ``count`` characters are overwritten, so the length never
    changes and at most ``len(instr) - 2`` characters are replaced.

    An empty string is returned unchanged (previously it raised
    ValueError because ``random.randrange(0)`` has an empty range).
    """
    if not instr:
        return instr
    # randomly choose how many chars to change at most (upper bound)
    randlimit = random.randrange(len(instr))
    if not randlimit:
        # a bound of 0 means "change nothing" this time
        return instr
    # choose how many leading chars are replaced exactly this time
    change_chars = random.randrange(randlimit)
    replacement = "".join(
        random.choice(string.ascii_lowercase) for _ in range(change_chars)
    )
    return replacement + instr[change_chars:]
@timing
def build(inlist):
    """
    Build the whole dataset: NUM_ROWS rows, each a tuple of randomly
    (slightly) modified copies of the strings in ``inlist``, e.g.
    [
        ("aaa", "bbb", "ccc"),
        ("aaa", "byb", "cac"),
        ...
    ]
    Returns a list of tuples.
    """
    return [shuffle(inlist) for _ in range(NUM_ROWS)]
def compare(intuple, pattern_list):
    """
    Score an n-tuple of strings against a pattern list with difflib.

    The string at each position of ``pattern_list`` is compared
    case-insensitively with the string at the same position of
    ``intuple``; the tuple must be at least as long as the pattern list.
    Returns an OrderedDict with the keys "tuple" (the input tuple) and
    "score" (list of SequenceMatcher ratios, one per pattern).
    """
    ratios = [
        difflib.SequenceMatcher(
            None, intuple[position].lower(), pattern.lower()
        ).ratio()
        for position, pattern in enumerate(pattern_list)
    ]
    return collections.OrderedDict([("tuple", intuple), ("score", ratios)])
@timing
def compare_all(list, pattern_list):
    """
    Run compare() for every tuple in ``list`` against ``pattern_list``.

    Returns a list with one compare() result per input tuple, in input
    order.
    """
    # NOTE: the parameter name shadows the builtin ``list``; it is kept so
    # the signature stays unchanged.
    return [compare(row, pattern_list) for row in list]
if __name__ == "__main__":
    # Trace allocations across the whole build + compare run.
    tracemalloc.start()

    COMPARE_LIST = ["foofoo", "bar", "lorem"]
    olist = build(COMPARE_LIST)
    ret = compare_all(olist, COMPARE_LIST)

    snapshot = tracemalloc.take_snapshot()
    top_stats = snapshot.statistics('lineno')

    print("[ INFO ]")
    print("num rows: ", NUM_ROWS, sep="")
    print("pattern: ", COMPARE_LIST, sep="")

    print("[ SHOWING 10 random results ]")
    for counter in range(10):
        # pick a random row index; ret has one entry per row of olist
        chosen = random.randrange(len(olist))
        print(counter, ": chosen: : ", json.dumps(ret[chosen]), sep="")

    print("[ Memory usage ]")
    # only the single biggest allocation site is reported
    for stat in top_stats[:1]:
        print(stat)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment