Created
May 10, 2024 08:24
-
-
Save do-me/a51d4c3c04aef7a0c7adebd238bb3ce7 to your computer and use it in GitHub Desktop.
Pandas multiprocessing with pandarallel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
from pandarallel import pandarallel
# Set up pandarallel before any parallel_* call is made; progress_bar=True
# asks it to display progress while the workers run.
pandarallel.initialize(progress_bar=True)
# Create a sample dataframe with 10,000,000 rows and a single "numbers" column
# holding random integers drawn from [1, 100).
np.random.seed(0)  # for reproducibility
df = pd.DataFrame({'numbers': np.random.randint(1, 100, size=10_000_000)})
# You might need to wrap a function in a locally defined one (as below) if it comes from an external source
def wrap_func(n):
    """Return the square of ``n``.

    Module-level wrapper that is applied element-wise to the "numbers"
    column via ``parallel_apply``.
    """
    return n**2
# Apply wrap_func to every value of "numbers" using pandarallel's
# parallel_apply and store the results in a new "square_numbers" column.
df["square_numbers"] = df["numbers"].parallel_apply(wrap_func)
# Bare expression: displays the dataframe when run as the last line of a
# notebook cell (it has no visible effect in a plain script).
df
#INFO: Pandarallel will run on 16 workers. | |
#INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers. | |
#numbers square_numbers | |
#0 45 2025 | |
#1 48 2304 | |
#2 65 4225 | |
#3 68 4624 | |
#4 68 4624 | |
#... ... ... | |
#9999995 19 361 | |
#9999996 42 1764 | |
#9999997 24 576 | |
#9999998 35 1225 | |
#9999999 48 2304 | |
#10000000 rows × 2 columns |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment