Created
June 2, 2018 18:56
-
-
Save duhaime/30251966f56e243a4f04740840ed2f3d to your computer and use it in GitHub Desktop.
k-merge : given n files with sorted keys and their values, identify all instances of each key quickly
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# run this script first to generate dummy data
from random import random, randint
import json
import os

# make sure the output directory exists so open(..., 'w') below can't fail
os.makedirs('data', exist_ok=True)

# imagine we need to k-merge 10 files
for file_id in range(10):
    file_keys = []
    # each of the 10 files will have keys drawn from 0..99
    for j in range(100):
        # add some randomness so all files don't have all keys
        if random() < 0.3:
            file_keys.append(j)
    # store the key | the file id, one pair per line;
    # keys are appended in increasing j, so each file is sorted
    with open('data/' + str(file_id) + '.txt', 'w') as out:
        for k in file_keys:
            out.write(str(k) + '|' + str(file_id) + '\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob, sys | |
# helper: find the smallest unconsumed key across all files and collect its values
def get_min_key_values(positions, lines):
    """Pop the smallest current key across all input files.

    Args:
        positions: dict mapping file id -> index of the next unconsumed
            line in that file; advanced in place for every file that
            contributes a value this call.
        lines: dict mapping file id -> list of 'key|value' lines, each
            file sorted by numeric key.

    Returns:
        (min_key, vals): the numerically smallest key (as a string) among
        the files' current lines, and the list of values found for it.

    Exits the process via sys.exit() once every file is fully consumed.
    """
    # map each input file to its current (smallest unconsumed) key
    id_to_key = {}
    for i in positions.keys():
        # a position past the end of the file means that file is consumed;
        # catch only IndexError so real bugs aren't silently swallowed
        try:
            id_to_key[i] = lines[i][positions[i]].split('|')[0]
        except IndexError:
            pass
    keys = list(id_to_key.values())
    # if there are no keys left to process, we're done!
    if not keys:
        sys.exit()
    # keys are strings, so compare numerically: otherwise '10' < '2'
    # lexicographically and instances of one key get split across calls
    min_key = min(keys, key=int)
    # pull out all values for the current smallest key
    vals = []
    for i in id_to_key.keys():
        if id_to_key[i] == min_key:
            vals.append(lines[i][positions[i]].split('|')[1].rstrip())
            # advance this file past the consumed line
            positions[i] += 1
    # pass back the key and all accumulated values for the key
    return min_key, vals
# main
# sort the file list so the merge output is deterministic across runs
# (glob.glob returns files in arbitrary order)
files = sorted(glob.glob('data/*'))
# positions[file] = index of the next unconsumed line in that file
positions = {}
# lines[file] = every line of that file, read eagerly into memory
lines = {}
for i in files:
    positions[i] = 0
    with open(i) as f:
        lines[i] = f.readlines()
# process all keys; get_min_key_values() calls sys.exit() once
# every file has been fully consumed, which ends this loop
while True:
    # `values` contains all values found for `key` across the files
    key, values = get_min_key_values(positions, lines)
    print(key, values)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment