Skip to content

Instantly share code, notes, and snippets.

@packmad
Created November 26, 2020 09:40
Show Gist options
  • Save packmad/c7c79d60d36c43798344dd7a31b5bca1 to your computer and use it in GitHub Desktop.
Save packmad/c7c79d60d36c43798344dd7a31b5bca1 to your computer and use it in GitHub Desktop.
Load all JSON files from an input folder using multiprocessing
import json
import os
import sys
import time
from multiprocessing import Pool
from os.path import abspath, isdir, join
from typing import Dict, List, Optional
def os_listdir_json(folder: str) -> List[str]:
    """Return the absolute paths of the ``.json`` files directly inside *folder*.

    Only the top level of *folder* is inspected (no recursion). Entries are
    matched by the case-sensitive ``.json`` suffix and must be regular files,
    so a sub-directory (or dangling symlink) that merely has a ``.json`` name
    is skipped instead of being handed to a reader that would fail on it.

    The result is sorted by file name: ``os.listdir`` returns entries in
    arbitrary order, and sorting keeps the output reproducible across runs.

    Raises:
        OSError: if *folder* cannot be listed (propagated from ``os.listdir``).
    """
    json_paths = []
    for name in sorted(os.listdir(folder)):
        if not name.endswith('.json'):
            continue
        path = join(folder, name)
        # A directory or broken link named "*.json" is not a loadable file.
        if os.path.isfile(path):
            json_paths.append(abspath(path))
    return json_paths
def read_json(file_path: str) -> Dict:
    """Parse the JSON document stored in the UTF-8 text file at *file_path*.

    The decoded value is returned as produced by the ``json`` module. The
    annotation says ``Dict``, but nothing here enforces that the top-level
    JSON value is an object.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def load_dataset(src_folder: str, processes: Optional[int] = None) -> List[Dict]:
    """Load every JSON file found in *src_folder* using a pool of worker processes.

    The files are discovered with ``os_listdir_json`` and each one is parsed by
    ``read_json`` in a ``multiprocessing.Pool``. A one-line summary (file
    count, elapsed seconds, process count) is printed to stdout.

    Args:
        src_folder: Folder whose JSON files are loaded.
        processes: Number of worker processes. When ``None``, the CPU count
            reported by ``os.cpu_count()`` is used, falling back to 1 if the
            count cannot be determined.

    Returns:
        The parsed documents, in the same order as the paths returned by
        ``os_listdir_json`` (``Pool.map`` preserves input order).

    Raises:
        Exceptions raised while listing the folder, or raised by ``read_json``
        inside a worker, propagate to the caller.
    """
    if processes is None:
        # os.cpu_count() returns None when the count is undetermined; fall back
        # to one worker so the summary never reports "None processes".
        processes = os.cpu_count() or 1
    with Pool(processes) as pool:
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        res = pool.map(read_json, os_listdir_json(src_folder))
        elapsed = time.perf_counter() - start_time
    print(f'> Loaded {len(res)} files in {round(elapsed, 1)} sec using {processes} processes')
    return res
if __name__ == "__main__":
    # Benchmark driver: load the folder given as the single command-line
    # argument once per process count, from 2 up to the machine's CPU count,
    # so the timings printed by load_dataset can be compared.
    if len(sys.argv) != 2:
        sys.exit('Wrong number of args')
    src_folder = sys.argv[1]
    # Validate explicitly rather than with `assert`, which is removed when
    # Python runs with -O and would let a bad path through.
    if not isdir(src_folder):
        sys.exit(f'Not a directory: {src_folder}')
    # os.cpu_count() may return None; treat that as a single CPU. The upper
    # bound is clamped to 2 so the loop performs at least one run even on a
    # single-CPU machine instead of silently doing nothing.
    max_processes = max(2, os.cpu_count() or 1)
    for cpu_i in range(2, max_processes + 1):
        load_dataset(src_folder, cpu_i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment