Skip to content

Instantly share code, notes, and snippets.

@cicorias
Created July 22, 2022 20:47
Show Gist options
  • Save cicorias/106d59b9eb133fe6399f322a316a7135 to your computer and use it in GitHub Desktop.
Save cicorias/106d59b9eb133fe6399f322a316a7135 to your computer and use it in GitHub Desktop.
Parses the output of the Hadoop DFSIO utility and converts it to CSV.
#!/usr/bin/env python3
def parse_file(file_name):
    """Parse a log of ``key: value`` lines into a list of records.

    Every line that contains a colon is split on its first colon into a
    key and a value, both stripped of surrounding whitespace. Any line
    without a colon (for example a blank line) terminates the current
    record.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A ``(keys, rows)`` tuple. ``keys`` is the list of distinct keys in
        the order they were first seen; ``rows`` is a list of dicts, one
        per record, mapping key to value.
    """
    keys = []
    seen_keys = set()  # O(1) membership test; ``keys`` preserves the order
    rows = []
    row = {}
    with open(file_name, 'r') as log:
        for line in log:
            key, colon, value = line.partition(':')
            if not colon:
                # Separator line: close the current record, but do not emit
                # empty records for consecutive separators.
                if row:
                    rows.append(row)
                    row = {}
                continue
            key = key.strip()
            if key not in seen_keys:
                seen_keys.add(key)
                keys.append(key)
            row[key] = value.strip()
    # Flush the last record when the file does not end with a separator.
    if row:
        rows.append(row)
    return keys, rows
def write_csv(file_name, keys, rows):
    """Write records to a CSV file with a header row.

    Args:
        file_name: Path of the CSV file to create or overwrite.
        keys: Column names, in output order.
        rows: Iterable of dicts mapping column name to value. Columns
            missing from a row are written as empty fields.
    """
    import csv
    # newline='' lets the csv module control line endings itself; without
    # it, platforms that translate '\n' would emit '\r\r\n' row endings.
    with open(file_name, 'w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=keys)
        writer.writeheader()
        writer.writerows(rows)
def parse_arguments():
    """Parse the command-line options of the script.

    Both options are mandatory: argparse reports a usage error and exits
    when one is missing, instead of letting ``None`` reach ``open()``.

    Returns:
        The ``argparse.Namespace`` with ``file`` (input path) and
        ``output`` (output path) attributes.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--file', required=True, help='File to parse')
    parser.add_argument('-o', '--output', required=True, help='Output file')
    return parser.parse_args()
def main():
    """Run the script: read the input log and write it back out as CSV."""
    options = parse_arguments()
    # parse_file returns (keys, rows), which are exactly the trailing
    # positional arguments of write_csv.
    write_csv(options.output, *parse_file(options.file))


# Only run when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment