Skip to content

Instantly share code, notes, and snippets.

@satkr7
Created July 4, 2021 18:10
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save satkr7/1087afdd4291638122186f5741564dd9 to your computer and use it in GitHub Desktop.
Save satkr7/1087afdd4291638122186f5741564dd9 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
@author: satyam.kumar
"""
'''
Import necessary packages
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from datetime import datetime
import multiprocessing
from functools import partial
# Small epsilon; presumably a guard against division by zero (inferred from the
# name only). Not referenced anywhere in the visible part of this file.
SAFE_DIV=0.0001
# Number of rows run_process() slices off per call. The __main__ benchmark loop
# rebinds this global before each run.
BUCKET_SIZE = 5000
# NLTK's English stop-word list, loaded once at import time. Not referenced
# anywhere in the visible part of this file.
STOP_WORDS=stopwords.words('english')
# Ordered (old, new) literal substitutions applied by preprocess().
# The order is significant and mirrors the original chained .replace() calls:
# specific contractions are expanded before the generic suffix rules
# ("n't", "'s", ...) are applied.
_REPLACEMENTS = (
    (",000,000", "m"),
    (",000", "k"),
    ("′", "'"),
    ("’", "'"),
    ("won't", "will not"),
    ("cannot", "can not"),
    ("can't", "can not"),
    ("n't", " not"),
    ("what's", "what is"),
    ("it's", "it is"),
    ("'ve", " have"),
    ("i'm", "i am"),
    ("'re", " are"),
    ("he's", "he is"),
    ("she's", "she is"),
    ("'s", " own"),
    ("%", " percent "),
    ("₹", " rupee "),
    ("$", " dollar "),
    ("€", " euro "),
    ("'ll", " will"),
)

# Compiled once at import time instead of on every preprocess() call.
_MILLIONS_RE = re.compile(r"([0-9]+)000000")
_THOUSANDS_RE = re.compile(r"([0-9]+)000")
_NON_WORD_RE = re.compile(r"\W")
_STEMMER = PorterStemmer()


def preprocess(x):
    """Normalise one question string for the benchmark.

    Steps, in order:
      1. Coerce *x* to ``str`` and lower-case it.
      2. Apply the literal substitutions in ``_REPLACEMENTS`` (number
         suffixes, quote normalisation, contraction expansion, currency and
         percent symbols).
      3. Abbreviate trailing zeros: ``<digits>000000`` -> ``<digits>m`` and
         ``<digits>000`` -> ``<digits>k``.
      4. Replace every non-word character with a space.
      5. Pass the string through the Porter stemmer.
      6. Parse with BeautifulSoup (lxml) and return ``get_text()``.

    Args:
        x: Any value; it is converted with ``str()`` first, so non-string
            inputs (e.g. NaN) are accepted.

    Returns:
        The normalised text as a ``str``.
    """
    x = str(x).lower()
    for old, new in _REPLACEMENTS:
        x = x.replace(old, new)

    # Keep the captured leading digits: the previous replacement strings
    # ("lm" / "lk") dropped the number entirely, turning "5000000" into "lm".
    x = _MILLIONS_RE.sub(r"\1m", x)
    x = _THOUSANDS_RE.sub(r"\1k", x)

    x = _NON_WORD_RE.sub(" ", x)

    # NOTE(review): stem() receives the whole sentence rather than individual
    # tokens; per-word stemming was presumably intended. Left as-is so the
    # benchmark workload stays comparable -- confirm before changing.
    x = _STEMMER.stem(x)

    return BeautifulSoup(x, 'lxml').get_text()
def run_process(df, start):
    """Preprocess a single bucket of rows from *df*.

    Used as the per-task function for the multiprocessing benchmark: slices
    ``BUCKET_SIZE`` rows starting at row *start*, reports the range on stdout,
    and applies ``preprocess`` to the ``"question"`` column of that slice.
    The processed values are discarded; the function returns ``None``.

    Args:
        df: DataFrame with a ``"question"`` column.
        start: Row offset at which this bucket begins.
    """
    stop = start + BUCKET_SIZE
    bucket_rows = df[start:stop]
    print(start, "to ", stop)
    bucket_rows["question"].apply(preprocess)
def _init_worker(bucket_size):
    """Set the module-level BUCKET_SIZE inside a pool worker process.

    With the "spawn" and "forkserver" start methods a worker re-imports this
    module and therefore sees the default BUCKET_SIZE, not the value the
    parent assigned inside the benchmark loop. Running this as the Pool
    initializer keeps the slice width used by run_process() in step with the
    chunk offsets computed by the parent.
    """
    global BUCKET_SIZE
    BUCKET_SIZE = bucket_size


if __name__ == "__main__":
    # Benchmark input: every distinct question from both question columns.
    df = pd.read_csv("train/train.csv")
    list_of_questions = set(list(df['question1']) + list(df['question2']))
    print(len(list_of_questions))
    df_temp = pd.DataFrame()
    df_temp['question'] = list(list_of_questions)
    df_temp = df_temp.fillna("")

    # Each sample fraction is paired with the bucket size at the same index.
    sample_size = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    bucket = [1000, 2500, 5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000]
    single_process_time = []
    multi_process_time = []

    for frac, b in zip(sample_size, bucket):
        sample_df = df_temp.sample(frac=frac)
        # Rebind the global for this process; workers get it via _init_worker.
        BUCKET_SIZE = b
        print("Data Shape: ", sample_df.shape)
        print("BUCKET SIZE: ", BUCKET_SIZE)

        print("Single Processing Starting")
        st = datetime.now()
        sample_df["question"].apply(preprocess)
        end = datetime.now()
        print("Single processing Time: ", end - st)
        single_process_time.append(end - st)

        print("Multiprocessing Starting")
        st = datetime.now()
        chunk_starts = list(range(0, sample_df.shape[0], BUCKET_SIZE))
        # The context manager guarantees the pool is torn down even if a
        # worker raises; close()/join() stay inside the timed region so the
        # measurement covers the same work as before.
        with multiprocessing.Pool(initializer=_init_worker,
                                  initargs=(BUCKET_SIZE,)) as pool:
            pool.map(partial(run_process, sample_df), chunk_starts)
            pool.close()
            pool.join()
        end = datetime.now()
        print("Multiprocessing Time: ", end - st)
        multi_process_time.append(end - st)
        print()
        print()

    # One row per (sample size, bucket size) pair with both timings.
    out_time = pd.DataFrame()
    out_time['sample size'] = sample_size
    out_time['bucket size'] = bucket
    out_time['single_process_time'] = single_process_time
    out_time['multi_process_time'] = multi_process_time
    out_time.to_csv("Benchmark.csv", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment