Skip to content

Instantly share code, notes, and snippets.

@duhaime
Created June 2, 2018 18:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save duhaime/30251966f56e243a4f04740840ed2f3d to your computer and use it in GitHub Desktop.
Save duhaime/30251966f56e243a4f04740840ed2f3d to your computer and use it in GitHub Desktop.
k-merge : given n files with sorted keys and their values, identify all instances of each key quickly
# run this script first to generate dummy data
from random import random, randint
import json
# image we need to k-merge 10 files
for file_id in range(10):
file_keys = []
with open('data/' + str(file_id) + '.txt', 'w') as out:
# each of the 10 files will have 0:100 keys
for j in range(100):
# add some randomness so all files don't have all keys
if random() < 0.3:
file_keys.append(j)
# store the key | the file id
for k in file_keys:
out.write(str(k) + '|' + str(file_id) + '\n')
import glob, sys
# map from each filename to the line number for that file
def get_min_key_values(positions, lines):
# map each intput file to its smallest key
id_to_key = {}
for i in positions.keys():
# try catch because fully consumed lists have positions > their
# number of lines in the file
try:
id_to_key[i] = lines[i][positions[i]].split('|')[0]
# any ids that hit the except block have been consumed
except:
pass
# find the smallest among all available keys
keys = list(id_to_key.values())
# if there are no keys left to process, we're done!
if not keys:
sys.exit()
min_key = min(keys)
# pull out all values for the current smallest key
vals = []
for i in id_to_key.keys():
if id_to_key[i] == min_key:
vals.append(lines[i][positions[i]].split('|')[1].rstrip())
# increment all positions for the consumed keys
positions[i] += 1
# pass back the key and all accumulated values for the key
return min_key, vals
# main
files = glob.glob('data/*')
positions = {}
lines = {}
# store a generator with the file lines in lines[file_id]
for i in files:
positions[i] = 0
with open(i) as f:
lines[i] = f.readlines()
# process all keys
while True:
# `values` contains all values for `key`
key, values = get_min_key_values(positions, lines)
print(key, values)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment