Last active
April 9, 2023 10:42
-
-
Save Nakroma/d40c798137f7d99c53abee17da0d5c7b to your computer and use it in GitHub Desktop.
Script for multiprocessing huge CSV/TSV files, originally made to process the MS-Celeb-1M dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import csv | |
import multiprocessing as mp | |
# Placeholder for the shared row counter. It stays None until `init`
# rebinds it (the pool runs `init` once in every worker process).
counter = None
def init(c):
    """
    Pool initializer. Store `c` in the module-level `counter` so that
    `process` can reach it from inside a worker process.
    """
    global counter
    counter = c
def gen_chunks(reader, chunksize=100):
    """
    Chunk generator. Take a CSV `reader` (or any iterable) and yield
    lists of at most `chunksize` items, preserving input order.

    Every chunk is a freshly allocated list, so a chunk stays valid after
    the generator has moved on. This matters when a chunk is handed to
    something that reads it later, e.g. `multiprocessing.Pool.apply_async`,
    which serializes its arguments asynchronously.

    The last chunk may be shorter than `chunksize`. Nothing is yielded
    when `reader` is empty.

    Raises ValueError (on first iteration) if `chunksize` is less than 1.

    Adapted from: https://gist.github.com/miku/820490
    """
    if chunksize < 1:
        raise ValueError("chunksize must be at least 1")
    chunk = []
    for line in reader:
        chunk.append(line)
        if len(chunk) >= chunksize:
            yield chunk
            # Rebind instead of clearing in place: the consumer may still
            # hold a reference to the list that was just yielded.
            chunk = []
    if chunk:
        # Trailing partial chunk.
        yield chunk
def process(rows):
    """
    Worker task: handle one chunk of parsed rows.

    Adds one to the shared module-level `counter` for each row, taking the
    counter's lock around every increment. Returns None.
    """
    shared = counter
    for _row in rows:
        lock = shared.get_lock()
        with lock:
            shared.value = shared.value + 1
        # processing
if __name__ == '__main__':
    file = "FaceImageCroppedWithOutAlignment.tsv"
    # newline='' is what the csv module expects: it lets the reader handle
    # line endings itself, so newlines inside quoted fields parse correctly.
    with open(file, encoding='utf-8', newline='') as tsv_file:  # change encoding if necessary
        tsv_reader = csv.reader(tsv_file, delimiter='\t')  # change delimiter for normal csv files
        counter = mp.Value('i', 0)  # number of rows processed
        # The context manager guarantees the workers are torn down even if
        # reading the file or a worker task fails.
        with mp.Pool(processes=4, initializer=init, initargs=(counter,)) as pool:
            chunks = gen_chunks(tsv_reader, chunksize=10000)
            # imap_unordered feeds chunks to the workers as the generator
            # produces them, rather than queueing a task for every chunk of
            # the file up front. Iterating the results also re-raises any
            # exception raised inside `process`, which a discarded
            # apply_async result would hide.
            for _ in pool.imap_unordered(process, chunks):
                pass
            pool.close()
            pool.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment