Skip to content

Instantly share code, notes, and snippets.

@mpangrazzi
Created January 22, 2023 13:09
Show Gist options
  • Save mpangrazzi/bddc3addf00e4fb72c2547168b6e9015 to your computer and use it in GitHub Desktop.
Save mpangrazzi/bddc3addf00e4fb72c2547168b6e9015 to your computer and use it in GitHub Desktop.
Process-based parallelism with Python when processing a 500k+ rows dataset
from multiprocessing import cpu_count, get_context
from tqdm.auto import tqdm
from time import sleep
# A fake function to simulate some work
def process_data(x: str) -> str:
    """Simulate per-row work by sleeping briefly.

    Args:
        x: One row of the dataset (a string).

    Returns:
        The input string with " processed" appended.
    """
    sleep(0.001)  # stand-in for real CPU/IO work per row
    return x + " processed"
def main():
    """Process a 550k-row synthetic dataset with process-based parallelism.

    Builds the dataset, fans work out over one worker process per CPU core,
    and collects the results while showing a tqdm progress bar.
    """
    # The 500k+ rows dataset we want to process
    sample_dataset = [f"Sentence {i}" for i in range(550_000)]

    # The output data. NOTE: imap_unordered yields results in completion
    # order, so output_data is NOT in input order.
    output_data = []

    # "spawn" starts fresh interpreter processes — the safest context and
    # the only one available on every platform (default on macOS/Windows).
    ctx = get_context("spawn")

    # Process-based parallelism: one worker per CPU core. imap_unordered
    # streams results lazily, which lets tqdm advance as items finish.
    with ctx.Pool(cpu_count()) as pool:
        for data in tqdm(
            pool.imap_unordered(process_data, sample_dataset),
            total=len(sample_dataset),
            desc="Processing data",
        ):
            output_data.append(data)

    # Done! Because results arrive unordered, output_data[0] is whichever
    # item finished first — not necessarily "Sentence 0 processed".
    print(output_data[0])  # e.g. "Sentence 17 processed"
if __name__ == "__main__":
    # Entry-point guard is REQUIRED with the "spawn" start method: each
    # worker re-imports this module, and unguarded top-level code would
    # recursively spawn new pools.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment