@Nakroma
Last active April 9, 2023 10:42
Script for multiprocessing huge CSV/TSV files, originally made to process the MS-Celeb-1M dataset
#!/usr/bin/env python
import csv
import multiprocessing as mp

counter = None


def init(c):
    """Store the shared counter in each worker process."""
    global counter
    counter = c


def gen_chunks(reader, chunksize=100):
    """
    Chunk generator. Take a CSV `reader` and yield
    `chunksize` sized slices.
    Source: https://gist.github.com/miku/820490
    """
    chunk = []
    for index, line in enumerate(reader):
        if index % chunksize == 0 and index > 0:
            yield chunk
            chunk = []  # start a fresh list; clearing the yielded list in place could corrupt a chunk the pool has not pickled yet
        chunk.append(line)
    yield chunk


def process(rows):
    """Process one chunk of rows in a worker, incrementing the shared row counter."""
    global counter
    for row in rows:
        with counter.get_lock():
            counter.value += 1
        # processing
    return


if __name__ == '__main__':
    file = "FaceImageCroppedWithOutAlignment.tsv"
    with open(file, encoding='utf-8') as tsv_file:  # change encoding if necessary
        tsv_reader = csv.reader(tsv_file, delimiter='\t')  # change delimiter for normal csv files
        counter = mp.Value('i', 0)  # number of rows processed
        pool = mp.Pool(processes=4, initializer=init, initargs=(counter,))
        for tsv_chunk in gen_chunks(tsv_reader, chunksize=10000):
            pool.apply_async(process, args=(tsv_chunk,))
        pool.close()
        pool.join()
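The shared counter above is only ever written; nothing reads it back while the pool runs. If you want live progress, a minimal sketch (reusing `counter`, `pool`, `process`, and `gen_chunks` from the script above; the 5-second interval is an arbitrary choice) is to keep the AsyncResult handles returned by `apply_async` and poll the counter from the main process until every submitted chunk has finished:

import time

# Sketch only: replaces the submission loop at the bottom of the script.
results = []
for tsv_chunk in gen_chunks(tsv_reader, chunksize=10000):
    results.append(pool.apply_async(process, args=(tsv_chunk,)))
pool.close()

# Poll the shared counter until all submitted chunks are done.
while not all(r.ready() for r in results):
    with counter.get_lock():
        print(f"rows processed so far: {counter.value}")
    time.sleep(5)
pool.join()

Polling instead of calling `pool.join()` right away works because `join()` would block the main process, leaving no one to report progress; the `ready()` check on each AsyncResult tells you when it is safe to join without waiting.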