Skip to content

Instantly share code, notes, and snippets.

@seanbenhur
Created December 6, 2021 12:48
Show Gist options
  • Save seanbenhur/6f19d8e16834b1da130ad8415d8ab5d5 to your computer and use it in GitHub Desktop.
Save seanbenhur/6f19d8e16834b1da130ad8415d8ab5d5 to your computer and use it in GitHub Desktop.
import time
import json
import multiprocessing
from multiprocessing import Pool
# Input text file; passed to convert_txt_to_json() when run as a script.
txt_path = "tamil_dataset.txt"
# Output file that receives one JSON object per input line.
json_path = "tamil_final_dataset.json"
def write_single_json_record(idx, record, save_path):
    """Append one already-serialized record as a line of ``save_path``.

    Args:
        idx: Position of the record; used only in the progress message.
        record: Text to write. A trailing newline is added after it.
        save_path: Path of the file to append to (created if missing).
    """
    # Pin the encoding so records containing non-ASCII text are written the
    # same way on every platform instead of depending on the locale default.
    with open(save_path, 'a', encoding='utf-8') as obj:
        obj.write(record + '\n')
    # '\r' returns the cursor to the line start so the counter overwrites
    # itself instead of scrolling the terminal.
    print(f"Processed: {idx}", end='\r')
def convert_txt_to_json(text_path, json_path):
    """Convert a text file into a JSON Lines file, one record per input line.

    Each line of ``text_path`` is wrapped as ``{"text": <line>}``, serialized
    with ``json.dumps`` and appended to ``json_path`` in input order.

    Args:
        text_path: Path of the text file to read (decoded as UTF-8).
        json_path: Path of the file the records are appended to.

    Returns:
        ``json_path``, unchanged.
    """
    # Decode explicitly as UTF-8: relying on the locale default breaks
    # non-ASCII corpora on platforms whose default encoding is not UTF-8.
    with open(text_path, 'r', encoding='utf-8') as fp:
        data = fp.read().splitlines()

    # Write every record through a single handle in this process. Fanning the
    # records out to a process pool where each worker re-opens and appends to
    # the same file yields a non-deterministic line order, risks interleaved
    # writes, and pays an open/close plus IPC round-trip per record for work
    # that is I/O-bound anyway.
    # Append mode is kept so an existing output file is extended, as before.
    with open(json_path, 'a', encoding='utf-8') as out:
        for idx, text in enumerate(data):
            out.write(json.dumps({'text': text}) + '\n')
            print(f"Processed: {idx}", end='\r')

    print("-"*30)
    print("Json file written to disk")
    print("-"*30)
    return json_path
if __name__ == "__main__":
    # Script entry point: convert txt_path into json_path and report how
    # long the conversion took.
    print("-"*30)
    # perf_counter() is monotonic, so the measured duration cannot be
    # distorted by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    print("Writing data to json")
    json_file_path = convert_txt_to_json(txt_path, json_path)
    end = time.perf_counter()
    print(f"Completed in: {(end-start):.4}s")
    print("-"*30)
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment