Skip to content

Instantly share code, notes, and snippets.

@fclesio
Created April 1, 2020 20:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
Save fclesio/3410bdb2da864c493e51afbc93f1d505 to your computer and use it in GitHub Desktop.
import time

import numpy as np
import pandas as pd

# Internal NLP lib that provides the text pre-processing pipeline.
import nlp_pre_processing

# Multiprocessing pool used to distribute the work across processors.
from multiprocessing import Pool

# BUG FIX: the original called `nlp_pre_processing_library.NLPPreprocessor()`
# but the module was imported as `nlp_pre_processing` (NameError).
nlp = nlp_pre_processing.NLPPreprocessor()

# Track wall-clock time for the whole run.
# BUG FIX: the original printed `time.time() - start_time` BEFORE
# `start_time` was assigned (NameError); start the clock first.
start_time = time.time()
print(f'Start processing...: {(time.time() - start_time)}')

# Number of chunks the DataFrame will be split into for parallel processing.
num_partitions = 20
# Number of worker processes in the pool.
num_cores = 16
print(f'Partition Number: {num_partitions} - Number of Cores: {num_cores}...')
def main_process_pipeline(df, func, num_partitions=20, num_cores=16):
    """Apply ``func`` to a DataFrame in parallel over a process pool.

    Splits ``df`` into ``num_partitions`` chunks, maps ``func`` over the
    chunks with ``num_cores`` worker processes, and concatenates the
    processed chunks back into a single DataFrame.

    Args:
        df (pd.DataFrame): DataFrame to be split and processed.
        func (callable): Picklable function applied to each chunk; must
            accept a DataFrame and return a DataFrame.
        num_partitions (int): Number of chunks ``df`` is split into.
            Defaults to 20 (the original module-level setting).
        num_cores (int): Number of worker processes. Defaults to 16
            (the original module-level setting).

    Returns:
        pd.DataFrame: Concatenation of the processed chunks, in the
        original chunk order (``Pool.map`` preserves ordering).
    """
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    try:
        df = pd.concat(pool.map(func, df_split))
    finally:
        # Ensure worker processes are reaped even if `func` raises
        # inside a worker (the original leaked the pool on error).
        pool.close()
        pool.join()
    return df
def pre_process_wrapper(df):
    """Run the NLP pre-processing pipeline over the ``text`` column.

    Transforms every entry of ``df['text']`` with
    ``nlp.pre_processing_pipeline`` (the module-level preprocessor) and
    returns the same DataFrame with the column replaced in place.
    """
    df['text'] = df['text'].map(nlp.pre_processing_pipeline)
    return df
# Run the pre-processing wrapper over the DataFrame in parallel.
# NOTE(review): `df` is not defined anywhere in this snippet — it is
# presumably loaded earlier in the original script; confirm before running.
processed_df = main_process_pipeline(df, pre_process_wrapper)
print(f'Processing finished in seconds: {(time.time() - start_time)}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment