Skip to content

Instantly share code, notes, and snippets.

@mpangrazzi
Created January 22, 2023 13:09
Show Gist options
  • Save mpangrazzi/bddc3addf00e4fb72c2547168b6e9015 to your computer and use it in GitHub Desktop.
Save mpangrazzi/bddc3addf00e4fb72c2547168b6e9015 to your computer and use it in GitHub Desktop.
Process-based parallelism with Python when processing a 500k+ rows dataset
from multiprocessing import cpu_count, get_context
from tqdm.auto import tqdm
from time import sleep
# A fake function to simulate some work
def process_data(x: str) -> str:
    """Simulate per-row work by sleeping briefly.

    Args:
        x: One row of the dataset (a string).

    Returns:
        The input string with " processed" appended.
    """
    sleep(0.001)  # stand-in for real CPU/IO work per row
    return x + " processed"
def main():
    """Process a 550k-row synthetic dataset with process-based parallelism.

    Builds the dataset, fans work out over one worker process per CPU core,
    and collects the results while showing a tqdm progress bar.
    """
    # The 500k+ rows dataset we want to process
    sample_dataset = [f"Sentence {i}" for i in range(550_000)]

    # The output data. NOTE: imap_unordered yields results in completion
    # order, so output_data is NOT in input order.
    output_data = []

    # "spawn" starts fresh interpreter processes — the safest context and
    # the only one available on every platform (default on macOS/Windows).
    ctx = get_context("spawn")

    # Process-based parallelism: one worker per CPU core. imap_unordered
    # streams results lazily, which lets tqdm advance as items finish.
    with ctx.Pool(cpu_count()) as pool:
        for data in tqdm(
            pool.imap_unordered(process_data, sample_dataset),
            total=len(sample_dataset),
            desc="Processing data",
        ):
            output_data.append(data)

    # Done! Because results arrive unordered, output_data[0] is whichever
    # item finished first — not necessarily "Sentence 0 processed".
    print(output_data[0])  # e.g. "Sentence 17 processed"
if __name__ == "__main__":
    # Entry-point guard is REQUIRED with the "spawn" start method: each
    # worker re-imports this module, and unguarded top-level code would
    # recursively spawn new pools.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment