Created
April 16, 2016 20:32
-
-
Save pythononwheels/37f5570affe643b626358ba45799b764 to your computer and use it in GitHub Desktop.
compare tuples of strings with a given pattern in python using difflib.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Test string comparison of string tuples
# python 3.4
# using difflib.
#
# see also: http://stackoverflow.com/questions/36643618/scoring-consistency-within-dataset
# khz 04/2016
#
import collections
import difflib
import functools
import json
import random
import string
import time
import tracemalloc
# Number of rows build() puts into the generated dataset.
NUM_ROWS = 200000
# NOTE(review): not referenced anywhere in the visible part of this file.
PRINT_PERCENT = 10
def timing(f):
    """
    Decorator that reports how long each call to ``f`` took.

    Every call prints a "[ TIMIMG ]" header line followed by the wrapped
    function's name and the elapsed time in milliseconds, then returns
    whatever ``f`` returned. Nothing is printed when ``f`` raises.

    Positional and keyword arguments are forwarded to ``f`` unchanged, and
    ``functools.wraps`` keeps ``f``'s name and docstring on the wrapper.
    """
    @functools.wraps(f)
    def wrap(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval is not
        # disturbed by system clock adjustments the way time.time() is.
        started = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - started) * 1000.0
        print("[ TIMIMG ]")
        print("{0:20s} function took {1:5f} ms".format(f.__name__, elapsed_ms))
        return ret
    return wrap
def shuffle(inlist):
    """
    Return a tuple with a randomly altered copy of every string in inlist.

    Each element is passed through shuffle_string() in order, so the
    result has exactly as many entries as inlist.
    """
    return tuple(shuffle_string(item) for item in inlist)
def shuffle_string(instr):
    """
    Return a copy of instr whose leading characters were randomly replaced.

    An upper bound is drawn from [0, len(instr)). When it is non-zero, the
    number of characters to change is drawn from [0, bound), so at most
    len(instr) - 2 characters change. That many characters, counted from
    the start of the string, are replaced by random lowercase ASCII
    letters; the rest of the string is kept as is.

    The empty string is returned unchanged, because random.randrange(0)
    would otherwise raise ValueError.
    """
    if not instr:
        return instr
    # randomly choose how many chars to change at most (upper bound)
    randlimit = random.randrange(len(instr))
    if not randlimit:
        return instr
    # choose how many exactly this time
    change_chars = random.randrange(randlimit)
    replaced = "".join(
        random.choice(string.ascii_lowercase) for _ in range(change_chars)
    )
    return replaced + instr[change_chars:]
@timing
def build(inlist):
    """
    Build the whole dataset: NUM_ROWS randomly (slightly) modified
    variations of inlist, e.g.

        [
            ("aaa", "bbb", "ccc"),
            ("aaa", "byb", "c1c"),
            ...
        ]

    Every row is produced by shuffle(inlist). Returns a list of tuples.
    """
    return [shuffle(inlist) for _ in range(NUM_ROWS)]
def compare(intuple, pattern_list):
    """
    Score each string of intuple against the pattern at the same position.

    Both strings are lower-cased and compared with
    difflib.SequenceMatcher(...).ratio(). One score is produced per entry
    of pattern_list, so intuple must have at least as many entries as
    pattern_list (the two are meant to be of the same length).

    Returns an OrderedDict with the keys "tuple" (the input tuple) and
    "score" (the list of ratios), in that order.
    """
    ratios = [
        difflib.SequenceMatcher(
            None, intuple[pos].lower(), pattern.lower()
        ).ratio()
        for pos, pattern in enumerate(pattern_list)
    ]
    result = collections.OrderedDict()
    result["tuple"] = intuple
    result["score"] = ratios
    return result
@timing
def compare_all(list, pattern_list):
    """
    Run compare() for every tuple of the dataset against pattern_list.

    Returns the compare() results as a list, in dataset order.

    NOTE: the first parameter shadows the builtin ``list``; the name is
    kept because it is part of the function's signature.
    """
    return [compare(row, pattern_list) for row in list]
if __name__ == "__main__":
    # Start tracing before any data is created so the snapshot taken
    # below covers both the dataset build and the comparison run.
    tracemalloc.start()

    COMPARE_LIST = ["foofoo", "bar", "lorem"]
    olist = build(COMPARE_LIST)
    ret = compare_all(olist, COMPARE_LIST)

    top_stats = tracemalloc.take_snapshot().statistics('lineno')

    print("[ INFO ]")
    print("num rows: " + str(NUM_ROWS))
    print("pattern: " + str(COMPARE_LIST))

    print("[ SHOWING 10 random results ]")
    for counter in range(10):
        # pick a random row index and show its comparison result as JSON
        chosen = random.randrange(len(olist))
        print(str(counter) + ": chosen: : " + json.dumps(ret[chosen]))

    print("[ Memory usage ]")
    # only the first statistics entry is printed
    for stat in top_stats[:1]:
        print(stat)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment