Skip to content

Instantly share code, notes, and snippets.

@thecapacity
Last active August 29, 2015 14:00
Show Gist options
  • Save thecapacity/80cf9b4a3809a05ce248 to your computer and use it in GitHub Desktop.
Save thecapacity/80cf9b4a3809a05ce248 to your computer and use it in GitHub Desktop.
Testing parms for efficiently searching a given population for duplicates with a given sample size and number of iterations
#!/usr/bin/env python
from optparse import OptionParser
import collections
import itertools
import random
import math
import profile
import sys
def has_dups(sample):
if len(sample) != len(set(sample)): return True
else: return False
def run_test(population, overlap, sample_size, iterations):
results = { 'items_seen': set(), 'dupes_found': set(),
'pop_size': len(population) }
add = math.floor(len(pop) * overlap)
results['dupes_added'] = int(add)
overlap = (random.choice(population) for i in xrange(int(add)))
population.extend(overlap)
random.shuffle(population)
samples = (itertools.combinations(population, sample_size))
s = None
for i in xrange(iterations):
try:
s = next(samples)
except Exception as e:
print "Sample:", s
print "Cobinations Exhausted @Itr:", i
print e
break
results['items_seen'] = results['items_seen'].union(s)
if has_dups(s):
dupes = [x for x, y in collections.Counter(s).items() if y > 1]
results['dupes_found'] = results['dupes_found'].union(dupes)
results['dupes_found'] = len(results['dupes_found'] )
results['items_seen'] = len(results['items_seen'] )
results['work'] = sample_size * iterations
results['success_rate'] = results['dupes_found'] / add
results['efficiency'] = results['success_rate'] / results['work']
return results
if __name__ == "__main__":
parser = OptionParser()
parser.add_option("-n", "--num_sample", dest="n", type="int",
help="How many items are sampled in each iteration",
default = 10)
parser.add_option("-p", "--pop_size", dest="pop_size", type="int",
help="How big is the overall population",
default = 100)
parser.add_option("-i", "--iterations", dest="i", type="int",
help="How many times to sample (i.e. total iterations)",
default = 1000)
parser.add_option("-o", "--overlaop", dest="overlap", type="float",
help="Decimal amount of overlap to create (i.e. dupes)",
default = .10)
(options, args) = parser.parse_args()
pop = range(options.pop_size)
results = run_test(pop, options.overlap, options.n, options.i)
print results
#profile.run("results = run_test(pop, overlap, n, i)")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment