Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Splits a giant jsonL file into multipble jsonL files.
# Gabriel Fair
# Please suggest any improvements: http://keybase.io/gabefair
import mmap
import re
import argparse
import sys
import progressbar
from time import sleep
import os
from time import clock
import atexit
from time import time
from datetime import timedelta
from time import perf_counter
#import pdb
current_comment_count = 0
global_comment_count = 0
file_count = 0
comments_per_file = 500000
output_file_contents = '' # The new file will build in RAM before writing to disk. Limiting the number of disk bottlenecks
file_size = 0
bar = progressbar.ProgressBar(maxval=100,widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
def split_json(file_argument):
global output_file_contents
global current_comment_count
last_chunk = False
global json_pattern
global bar
global global_comment_count
global file_size
file_size = os.path.getsize(sys.argv[1])
print("Reading file: " + file_argument + " Splitting every: " + str(comments_per_file) + " comments")
bar = progressbar.ProgressBar(redirect_stdout=True)
bar.start()
lap_time = perf_counter()
with open(file_argument,'r',encoding="utf-8") as file:
for line in file:
#pdb.set_trace()
#if (line == '\x00\n'):
# continue
current_comment_count = current_comment_count + 1
global_comment_count = global_comment_count + 1
output_file_contents = output_file_contents + line
#if (current_comment_count < comments_per_file):
# output_file_contents = output_file_contents + '\n'
if ( current_comment_count % 400 == 0):
write_file(file_argument, 1)
if ( current_comment_count % comments_per_file == 0):
print("Total comments proccessed: "+ str(global_comment_count) + ' and the time since last update: ' + str(timedelta(seconds=perf_counter() - lap_time)))
lap_time = perf_counter()
bar.update(int((global_comment_count/file_size)*100))
if (current_comment_count >= comments_per_file):
write_file(file_argument, 0)
write_file(file_argument, 0)
bar.finish()
print("Bytes successfully read: "+ str(int(file.tell())) + '/' + str(os.path.getsize(file_argument)) + ' ('+ str((file.tell()//os.path.getsize(file_argument))*100) + '%)')
print("Total files: ", file_count)
print("Total comments: ", global_comment_count)
return
def write_file(file_name, leave_open_flag):
global current_comment_count
global file_count
global global_comment_count
global output_file_contents
f = open(file_name + '_%04d' % file_count, 'a')
f.write(output_file_contents)
f.close()
output_file_contents = ''
if(leave_open_flag == 0):
file_count += 1
current_comment_count = 0
def secondsToStr(t):
return str(timedelta(seconds=t))
progress_bar_line = "="*40
def log(s, elapsed=None):
print(progress_bar_line)
print(secondsToStr(time()), '-', s)
if elapsed:
print("Elapsed time:", elapsed)
print(progress_bar_line)
print()
def endlog(start):
end = time()
elapsed = end-start
log("End Program", secondsToStr(elapsed))
def now():
return secondsToStr(time())
def main():
# parser = argparse.ArgumentParser(description='Splits giant file with many JSON objects.')
# parser.add_argument('json file', metavar='F', type=open, help='a file containing valid json' required=True)
# args = parser.parse_args()
start_time = time()
atexit.register(endlog)
log("Start Program")
split_json(sys.argv[1])
endlog(start_time)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.