Skip to content

Instantly share code, notes, and snippets.

@Cryolite
Created January 31, 2022 10:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Cryolite/d3814188079c975f5d6d6c840a5abc74 to your computer and use it in GitHub Desktop.
Save Cryolite/d3814188079c975f5d6d6c840a5abc74 to your computer and use it in GitHub Desktop.
Merges two or more shuffled, gzip-compressed text files into a single output stream that is shuffled as a whole.
#!/usr/bin/env python3
import gzip
import random
import sys
from contextlib import ExitStack
from pathlib import Path
def main() -> None:
    """Merge several shuffled gzip text files into one shuffled stream.

    The input paths are taken from ``sys.argv[1:]``; at least two are
    required.  Every file is read twice: a first pass counts its lines (the
    count is reported on stderr), and a second pass writes the lines to
    stdout.  At each step the next line is taken from a file chosen at random
    with probability proportional to the number of lines that file still has
    left, which is what keeps the merged stream shuffled as a whole.

    Raises:
        RuntimeError: If fewer than two paths are given, or if a path does
            not exist or is not a file.
        AssertionError: If a file still has lines left after the merge
            (e.g. it grew between the two passes).
    """
    if len(sys.argv) <= 2:
        raise RuntimeError('Too few arguments.')

    paths = [Path(p) for p in sys.argv[1:]]
    for path in paths:
        if not path.exists():
            raise RuntimeError(f'{path}: Does not exist.')
        if not path.is_file():
            raise RuntimeError(f'{path}: Not a file.')

    # First pass: count the lines, keeping only one file open at a time.
    num_lines = []
    for path in paths:
        with gzip.open(path, 'rt', encoding='UTF-8') as f:
            count = sum(1 for _ in f)
        print(f'{path}: {count}', file=sys.stderr)
        num_lines.append(count)

    # Second pass: all files must be open simultaneously; ExitStack makes
    # sure every one of them is closed even if an error interrupts the merge.
    with ExitStack() as stack:
        files = [
            stack.enter_context(gzip.open(p, 'rt', encoding='UTF-8'))
            for p in paths
        ]
        indices = range(len(paths))
        remaining = sum(num_lines)
        while remaining > 0:
            # Weighting by the remaining line counts means an exhausted file
            # (weight 0) is never picked again.
            (i,) = random.choices(indices, weights=num_lines)
            line = next(files[i])
            if not line.endswith('\n'):
                # The last line of a file may lack a terminator; add one so
                # it is not glued to the next record from another file.
                line += '\n'
            print(line, end='')
            num_lines[i] -= 1
            remaining -= 1

        # Sanity check: the counts from the first pass must match what the
        # second pass actually found.
        for path, f in zip(paths, files):
            if next(f, None) is not None:
                raise AssertionError(f'{path}: Has not been fully consumed.')
# Script entry point: run the merge and finish with a success exit status.
if __name__ == '__main__':
    main()
    raise SystemExit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment