Skip to content

Instantly share code, notes, and snippets.

@jsmolina
Created November 14, 2018 15:10
Show Gist options
  • Save jsmolina/38c191c5aca70f186554d59e743c13be to your computer and use it in GitHub Desktop.
Save jsmolina/38c191c5aca70f186554d59e743c13be to your computer and use it in GitHub Desktop.
Splits parquet files that were accidentally merged into one file by `hadoop fs -getmerge` back into separate files
"""
Obtained from
https://stackoverflow.com/questions/41564291/parquet-build-with-hdfs-getmerge-recovery
"""
import sys
import locale
import os
import re
import io
# The merged file to split is taken from the first command-line argument.
# Exit with a usage message instead of an IndexError traceback when it is
# missing.
if len(sys.argv) < 2:
    sys.exit("usage: {} <merged-parquet-file>".format(sys.argv[0]))
filename = sys.argv[1]

# Reported in the banner for information only.
currencode = locale.getpreferredencoding()

print("=====================================================================")
print("Create parquet from: ", filename)
print("defautl buffer size: ", io.DEFAULT_BUFFER_SIZE)
print("default encoding of the system: ", currencode)
print("=====================================================================")
def write_to_binfile(parquetfile, contents):
    """Write *contents* to *parquetfile* in binary mode.

    The file is created if it does not exist and truncated if it does.

    Args:
        parquetfile: Path of the file to write.
        contents: Bytes-like payload, written with a single ``write`` call.
    """
    # The context manager guarantees the handle is closed even if the write
    # fails part-way.
    with io.open(parquetfile, mode='wb') as f:
        f.write(contents)
# Marker used to find file boundaries inside the merged blob. It must be a
# bytes object because the input is read in binary mode; a str marker cannot
# be split on or concatenated with bytes on Python 3.
magicnum = b"PAR1"

with io.open(filename, mode='rb') as f:
    content = f.read()

# Split on the marker, drop the empty spans found between two adjacent markers
# (and at both ends of the blob), and put the marker back on both sides of
# every remaining chunk so each one is a complete file again.
# NOTE(review): a chance occurrence of the bytes "PAR1" inside a file's data
# would be treated as a boundary and split that file in two.
res = [magicnum + chunk + magicnum
       for chunk in content.split(magicnum) if chunk]

# The first two chunks are written out as the metadata files; fail with a
# clear message before touching the filesystem if they are not both present.
if len(res) < 2:
    sys.exit("{}: expected at least 2 PAR1-delimited chunks, found {}".format(
        filename, len(res)))

# Write every piece straight into "<filename>.parquet" instead of creating
# them in the working directory and shelling out to `mv` with globs: that
# broke on file names containing spaces or shell metacharacters, ignored
# failures, and also moved unrelated "_*" / "part*" files.
out_dir = "{}.parquet".format(filename)
if not os.path.isdir(out_dir):
    os.mkdir(out_dir)

# Presumably getmerge concatenated the inputs in name order, which puts
# _common_metadata and _metadata ahead of the part files -- verify for your
# data set.
write_to_binfile(os.path.join(out_dir, "_common_metadata"), res[0])
write_to_binfile(os.path.join(out_dir, "_metadata"), res[1])
for i, part in enumerate(res[2:]):
    write_to_binfile(os.path.join(out_dir, "part-{:05d}".format(i)), part)

# Only after all pieces were written successfully is the merged input moved
# aside as "<filename>.backup".
os.rename(filename, "{}.backup".format(filename))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment