Created
February 2, 2018 19:13
-
-
Save ebrard/611eb0519c52f8fdbb194cc5ba5f6138 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint

# Sample records; each row has the form ["id", "col1", "col2"].
dataset = [
    [1, 2, 3],
    [1, 6, 8],
    [2, 37, 0],
    [1, 2, 3],
    [1, 6, 8],
    [2, 60, 0],
]
def duplicate_finder(subdataset):
    """Count how many records share each id.

    The id of a record is its first element (``record[0]``).

    Args:
        subdataset: Iterable of indexable records.

    Returns:
        A dict mapping each id to the number of records carrying it.
        An empty input yields an empty dict.
    """
    ids = {}
    for record in subdataset:
        record_id = record[0]
        # dict.get supplies the starting count of 0 for ids not seen yet.
        ids[record_id] = ids.get(record_id, 0) + 1
    return ids
def merge(d1, d2):
    """Combine two count dictionaries by summing values key by key.

    A key present in only one of the inputs keeps that input's value.
    Neither input is modified.

    Args:
        d1: Dict mapping keys to numeric counts.
        d2: Dict mapping keys to numeric counts.

    Returns:
        A new dict holding, for every key found in ``d1`` or ``d2``, the sum
        of the values associated with that key.
    """
    all_keys = set(d1) | set(d2)
    # A missing key contributes 0, so keys unique to one side pass through.
    return {key: d1.get(key, 0) + d2.get(key, 0) for key in all_keys}
# Reference result: count ids over the whole dataset in one pass.
global_ids = duplicate_finder(dataset)
assert global_ids[1] == 4
assert global_ids[2] == 2

print("Not split method")
pprint.pprint(global_ids)

# Count the two slices of the dataset independently...
global1_ids = duplicate_finder(dataset[0:2])
global2_ids = duplicate_finder(dataset[2:6])

print("Intermediate")
pprint.pprint(global1_ids)
pprint.pprint(global2_ids)

# ...then merge the partial counts; the totals must match the one-pass result.
final_ids = merge(global1_ids, global2_ids)

print("Split method")
print(final_ids)
assert final_ids[1] == 4
assert final_ids[2] == 2
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment