Skip to content

Instantly share code, notes, and snippets.

@ebrard
Created February 2, 2018 19:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ebrard/611eb0519c52f8fdbb194cc5ba5f6138 to your computer and use it in GitHub Desktop.
Save ebrard/611eb0519c52f8fdbb194cc5ba5f6138 to your computer and use it in GitHub Desktop.
import pprint
# Sample records used by the checks below. Each inner list is laid out as
# ["id", "col1", "col2"]; only the first element (the id) is inspected when
# counting. Id 1 appears four times and id 2 twice, so both are duplicated.
dataset = [ [1,2,3], [1,6,8], [2,37,0], [1,2,3], [1,6,8], [2,60,0] ]
def duplicate_finder(subdataset):
    """Count how many records carry each id.

    Args:
        subdataset: Iterable of records. The first element of each record
            (``record[0]``) is taken as its id and must be hashable.

    Returns:
        A plain dict mapping each id to the number of records in which it
        appears. An id whose count is greater than 1 is duplicated within
        ``subdataset``. An empty input yields an empty dict.
    """
    ids = {}
    for record in subdataset:
        record_id = record[0]
        # dict.get covers both the "first occurrence" and "seen before"
        # cases with a single lookup expression.
        ids[record_id] = ids.get(record_id, 0) + 1
    return ids
def merge(d1, d2):
    """Combine two id-count dicts by summing the counts key by key.

    This allows counts computed independently on separate slices of a
    dataset to be folded into the counts for the whole dataset.

    Args:
        d1: Dict mapping ids to counts.
        d2: Dict mapping ids to counts.

    Returns:
        A new dict containing every key found in ``d1`` or ``d2``. A key
        present in both maps to the sum of its two counts; a key present in
        only one keeps that count. Neither input is modified.
    """
    # Start from a shallow copy so the caller's d1 is left untouched.
    d_merge = dict(d1)
    for a_key, count in d2.items():
        d_merge[a_key] = d_merge.get(a_key, 0) + count
    return d_merge
# Reference result: count ids over the whole dataset in a single pass.
global_ids = duplicate_finder(dataset)
assert global_ids[1] == 4
assert global_ids[2] == 2
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("Not split method")
pprint.pprint(global_ids)

# Split approach: count each slice of the dataset independently ...
global1_ids = duplicate_finder(dataset[0:2])
global2_ids = duplicate_finder(dataset[2:6])
print("Intermediate")
pprint.pprint(global1_ids)
pprint.pprint(global2_ids)

# ... then merge the partial counts; they must match the single-pass result.
final_ids = merge(global1_ids, global2_ids)
print("Split method")
print(final_ids)
assert final_ids[1] == 4
assert final_ids[2] == 2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment