Created
June 2, 2018 18:56
-
-
Save duhaime/30251966f56e243a4f04740840ed2f3d to your computer and use it in GitHub Desktop.
k-merge : given n files with sorted keys and their values, identify all instances of each key quickly
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# run this script first to generate dummy data
from random import random, randint
import json
import os

# make sure the output directory exists so open(..., 'w') below can't fail
os.makedirs('data', exist_ok=True)

# imagine we need to k-merge 10 files
for file_id in range(10):
    file_keys = []
    # each of the 10 files will have keys drawn from 0..99
    for j in range(100):
        # add some randomness so all files don't have all keys
        if random() < 0.3:
            file_keys.append(j)
    # store the key | the file id, one pair per line;
    # keys are appended in increasing j, so each file is sorted
    with open('data/' + str(file_id) + '.txt', 'w') as out:
        for k in file_keys:
            out.write(str(k) + '|' + str(file_id) + '\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob, sys | |
# helper: find the smallest unconsumed key across all files and collect its values
def get_min_key_values(positions, lines):
    """Pop the smallest current key across all input files.

    Args:
        positions: dict mapping file id -> index of the next unconsumed
            line in that file; advanced in place for every file that
            contributes a value this call.
        lines: dict mapping file id -> list of 'key|value' lines, each
            file sorted by numeric key.

    Returns:
        (min_key, vals): the numerically smallest key (as a string) among
        the files' current lines, and the list of values found for it.

    Exits the process via sys.exit() once every file is fully consumed.
    """
    # map each input file to its current (smallest unconsumed) key
    id_to_key = {}
    for i in positions.keys():
        # a position past the end of the file means that file is consumed;
        # catch only IndexError so real bugs aren't silently swallowed
        try:
            id_to_key[i] = lines[i][positions[i]].split('|')[0]
        except IndexError:
            pass
    keys = list(id_to_key.values())
    # if there are no keys left to process, we're done!
    if not keys:
        sys.exit()
    # keys are strings, so compare numerically: otherwise '10' < '2'
    # lexicographically and instances of one key get split across calls
    min_key = min(keys, key=int)
    # pull out all values for the current smallest key
    vals = []
    for i in id_to_key.keys():
        if id_to_key[i] == min_key:
            vals.append(lines[i][positions[i]].split('|')[1].rstrip())
            # advance this file past the consumed line
            positions[i] += 1
    # pass back the key and all accumulated values for the key
    return min_key, vals
# main
# sort the file list so the merge output is deterministic across runs
# (glob.glob returns files in arbitrary order)
files = sorted(glob.glob('data/*'))
# positions[file] = index of the next unconsumed line in that file
positions = {}
# lines[file] = every line of that file, read eagerly into memory
lines = {}
for i in files:
    positions[i] = 0
    with open(i) as f:
        lines[i] = f.readlines()
# process all keys; get_min_key_values() calls sys.exit() once
# every file has been fully consumed, which ends this loop
while True:
    # `values` contains all values found for `key` across the files
    key, values = get_min_key_values(positions, lines)
    print(key, values)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment