# satkr7/multiprocessing_benchmark.py

## multiprocessing_benchmark.py
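# -*- coding: utf-8 -*-
"""
Benchmark single-process versus multiprocessing execution of the text
preprocessing routine on the questions read from train/train.csv; the
timings for each sample size are written to Benchmark.csv.

@author: satyam.kumar
"""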

'''
Import necessary packages
'''

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from datetime import datetime
import multiprocessing
from functools import partial


# Not referenced in the visible part of this file; presumably a
# divide-by-zero guard (name suggests it) -- TODO confirm or remove.
SAFE_DIV=0.0001
# Default number of rows per chunk read by run_process(); the __main__ loop
# rebinds it for every benchmark configuration.
BUCKET_SIZE = 5000
# English stop-word list loaded from NLTK; not referenced in the visible part
# of this file.
STOP_WORDS=stopwords.words('english')

# Patterns are compiled once at import time rather than on every call.
_MILLIONS_RE = re.compile(r"([0-9]+)000000")
_THOUSANDS_RE = re.compile(r"([0-9]+)000")
_NON_WORD_RE = re.compile(r"\W")

# Literal substitutions, applied strictly in this order. Earlier entries can
# change what later ones still match (e.g. "can't" must run before "n't",
# otherwise "can't" would become "ca not").
_LITERAL_REPLACEMENTS = (
    (",000,000", "m"),
    (",000", "k"),
    ("′", "'"),
    ("’", "'"),
    ("won't", "will not"),
    ("cannot", "can not"),
    ("can't", "can not"),
    ("n't", " not"),
    ("what's", "what is"),
    ("it's", "it is"),
    ("'ve", " have"),
    ("i'm", "i am"),
    ("'re", " are"),
    ("he's", "he is"),
    ("she's", "she is"),
    ("'s", " own"),
    ("%", " percent "),
    ("₹", " rupee "),
    ("$", " dollar "),
    ("€", " euro "),
    ("'ll", " will"),
)


def preprocess(x):
    """Normalise a single question for the benchmark.

    Steps, in order: convert to a lower-cased string, expand contractions
    and currency/percent symbols, abbreviate round thousands and millions,
    replace every non-word character with a space, run the Porter stemmer
    on the result, and finally pass it through BeautifulSoup's text
    extraction.

    Args:
        x: Any value; it is converted with ``str()`` first.

    Returns:
        The normalised text as a ``str``.
    """
    text = str(x).lower()
    for old, new in _LITERAL_REPLACEMENTS:
        text = text.replace(old, new)

    # Keep the leading digits through the \1 backreference, so "5000000"
    # becomes "5m" and "5000" becomes "5k". The previous replacement strings
    # were the literals "lm"/"lk", which discarded the number entirely.
    text = _MILLIONS_RE.sub(r"\1m", text)
    text = _THOUSANDS_RE.sub(r"\1k", text)

    text = _NON_WORD_RE.sub(" ", text)

    # NOTE(review): stem() receives the whole string, not individual tokens;
    # confirm whether per-word stemming was intended.
    text = PorterStemmer().stem(text)

    # NOTE(review): non-word characters (including "<" and ">") were already
    # replaced by spaces above, so no markup is left for BeautifulSoup to
    # strip at this point; kept to preserve the existing pipeline order.
    return BeautifulSoup(text, 'lxml').get_text()

def run_process(df, start, bucket_size=None):
    """Preprocess one contiguous chunk of ``df``; worker for ``Pool.map``.

    Args:
        df: DataFrame that has a "question" column.
        start: Positional index of the first row of the chunk.
        bucket_size: Number of rows in the chunk. When omitted, the
            module-level ``BUCKET_SIZE`` is used, as seen by the process
            that executes this function.

    Returns:
        None. The preprocessed values are discarded; only the work itself
        matters to the benchmark.
    """
    if bucket_size is None:
        bucket_size = BUCKET_SIZE
    stop = start + bucket_size
    print(start, "to ", stop)
    # Slicing past the end of the frame is fine: the last chunk is just shorter.
    df.iloc[start:stop]["question"].apply(preprocess)


def _set_worker_bucket_size(size):
    """Pool initializer: set this process's ``BUCKET_SIZE`` to ``size``.

    Worker processes started with the "spawn" or "forkserver" methods
    re-import this module and would otherwise keep the default
    ``BUCKET_SIZE`` instead of the value chosen in the benchmark loop,
    so their slices would not line up with the chunk starts.
    """
    global BUCKET_SIZE
    BUCKET_SIZE = size


if __name__=="__main__":

    raw = pd.read_csv("train/train.csv")
    # Unique question texts across both question columns.
    unique_questions = set(raw['question1']).union(raw['question2'])
    print(len(unique_questions))

    df_temp = pd.DataFrame({'question': list(unique_questions)}).fillna("")

    # The two lists are paired element-wise: each sample fraction is
    # benchmarked with the bucket size at the same position.
    sample_size = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    bucket = [1000, 2500, 5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000]
    single_process_time = []
    multi_process_time = []

    for frac, bucket_size in zip(sample_size, bucket):
        df = df_temp.sample(frac=frac)
        BUCKET_SIZE = bucket_size
        print("Data Shape: ", df.shape)
        print("BUCKET SIZE: ", BUCKET_SIZE)

        print("Single Processing Starting")
        st = datetime.now()
        df["question"].apply(preprocess)
        end = datetime.now()
        print("Single processing Time: ",end-st)
        single_process_time.append(end-st)

        print("Multiprocessing Starting")

        # Pool start-up and shutdown are deliberately inside the timed region.
        st = datetime.now()
        chunk_starts = list(range(0, df.shape[0], bucket_size))
        func = partial(run_process, df)
        # The context manager guarantees the workers are released even if
        # map() raises; the initializer hands the bucket size to every worker.
        with multiprocessing.Pool(
            initializer=_set_worker_bucket_size,
            initargs=(bucket_size,),
        ) as pool:
            pool.map(func, chunk_starts)
            pool.close()
            pool.join()
        end = datetime.now()
        print("Multiprocessing Time: ",end-st)
        multi_process_time.append(end-st)
        print()
        print()

    out_time = pd.DataFrame({
        'sample size': sample_size,
        'bucket size': bucket,
        'single_process_time': single_process_time,
        'multi_process_time': multi_process_time,
    })
    out_time.to_csv("Benchmark.csv", index=False)
	# -*- coding: utf-8 -*-
	"""

	@author: satyam.kumar
	"""

	'''
	Import necessary packages
	'''

	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	import os
	import gc
	import re
	from nltk.corpus import stopwords
	from nltk.stem import PorterStemmer
	from bs4 import BeautifulSoup
	from datetime import datetime
	import multiprocessing
	from functools import partial




	# Not referenced in the visible part of this file; presumably a
	# divide-by-zero guard (name suggests it) -- TODO confirm or remove.
	SAFE_DIV=0.0001
	# Default number of rows per chunk read by run_process(); the __main__ loop
	# rebinds it for every benchmark configuration.
	BUCKET_SIZE = 5000
	# English stop-word list loaded from NLTK; not referenced in the visible part
	# of this file.
	STOP_WORDS=stopwords.words('english')

	# Patterns are compiled once at import time rather than on every call.
	_MILLIONS_RE = re.compile(r"([0-9]+)000000")
	_THOUSANDS_RE = re.compile(r"([0-9]+)000")
	_NON_WORD_RE = re.compile(r"\W")

	# Literal substitutions, applied strictly in this order. Earlier entries can
	# change what later ones still match (e.g. "can't" must run before "n't",
	# otherwise "can't" would become "ca not").
	_LITERAL_REPLACEMENTS = (
	    (",000,000", "m"),
	    (",000", "k"),
	    ("′", "'"),
	    ("’", "'"),
	    ("won't", "will not"),
	    ("cannot", "can not"),
	    ("can't", "can not"),
	    ("n't", " not"),
	    ("what's", "what is"),
	    ("it's", "it is"),
	    ("'ve", " have"),
	    ("i'm", "i am"),
	    ("'re", " are"),
	    ("he's", "he is"),
	    ("she's", "she is"),
	    ("'s", " own"),
	    ("%", " percent "),
	    ("₹", " rupee "),
	    ("$", " dollar "),
	    ("€", " euro "),
	    ("'ll", " will"),
	)


	def preprocess(x):
	    """Normalise a single question for the benchmark.

	    Steps, in order: convert to a lower-cased string, expand contractions
	    and currency/percent symbols, abbreviate round thousands and millions,
	    replace every non-word character with a space, run the Porter stemmer
	    on the result, and finally pass it through BeautifulSoup's text
	    extraction.

	    Args:
	        x: Any value; it is converted with ``str()`` first.

	    Returns:
	        The normalised text as a ``str``.
	    """
	    text = str(x).lower()
	    for old, new in _LITERAL_REPLACEMENTS:
	        text = text.replace(old, new)

	    # Keep the leading digits through the \1 backreference, so "5000000"
	    # becomes "5m" and "5000" becomes "5k". The previous replacement strings
	    # were the literals "lm"/"lk", which discarded the number entirely.
	    text = _MILLIONS_RE.sub(r"\1m", text)
	    text = _THOUSANDS_RE.sub(r"\1k", text)

	    text = _NON_WORD_RE.sub(" ", text)

	    # NOTE(review): stem() receives the whole string, not individual tokens;
	    # confirm whether per-word stemming was intended.
	    text = PorterStemmer().stem(text)

	    # NOTE(review): non-word characters (including "<" and ">") were already
	    # replaced by spaces above, so no markup is left for BeautifulSoup to
	    # strip at this point; kept to preserve the existing pipeline order.
	    return BeautifulSoup(text, 'lxml').get_text()

	def run_process(df, start, bucket_size=None):
	    """Preprocess one contiguous chunk of ``df``; worker for ``Pool.map``.

	    Args:
	        df: DataFrame that has a "question" column.
	        start: Positional index of the first row of the chunk.
	        bucket_size: Number of rows in the chunk. When omitted, the
	            module-level ``BUCKET_SIZE`` is used, as seen by the process
	            that executes this function.

	    Returns:
	        None. The preprocessed values are discarded; only the work itself
	        matters to the benchmark.
	    """
	    if bucket_size is None:
	        bucket_size = BUCKET_SIZE
	    stop = start + bucket_size
	    print(start, "to ", stop)
	    # Slicing past the end of the frame is fine: the last chunk is just shorter.
	    df.iloc[start:stop]["question"].apply(preprocess)


	def _set_worker_bucket_size(size):
	    """Pool initializer: set this process's ``BUCKET_SIZE`` to ``size``.

	    Worker processes started with the "spawn" or "forkserver" methods
	    re-import this module and would otherwise keep the default
	    ``BUCKET_SIZE`` instead of the value chosen in the benchmark loop,
	    so their slices would not line up with the chunk starts.
	    """
	    global BUCKET_SIZE
	    BUCKET_SIZE = size


	if __name__=="__main__":

	    raw = pd.read_csv("train/train.csv")
	    # Unique question texts across both question columns.
	    unique_questions = set(raw['question1']).union(raw['question2'])
	    print(len(unique_questions))

	    df_temp = pd.DataFrame({'question': list(unique_questions)}).fillna("")

	    # The two lists are paired element-wise: each sample fraction is
	    # benchmarked with the bucket size at the same position.
	    sample_size = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
	    bucket = [1000, 2500, 5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000]
	    single_process_time = []
	    multi_process_time = []

	    for frac, bucket_size in zip(sample_size, bucket):
	        df = df_temp.sample(frac=frac)
	        BUCKET_SIZE = bucket_size
	        print("Data Shape: ", df.shape)
	        print("BUCKET SIZE: ", BUCKET_SIZE)

	        print("Single Processing Starting")
	        st = datetime.now()
	        df["question"].apply(preprocess)
	        end = datetime.now()
	        print("Single processing Time: ",end-st)
	        single_process_time.append(end-st)

	        print("Multiprocessing Starting")

	        # Pool start-up and shutdown are deliberately inside the timed region.
	        st = datetime.now()
	        chunk_starts = list(range(0, df.shape[0], bucket_size))
	        func = partial(run_process, df)
	        # The context manager guarantees the workers are released even if
	        # map() raises; the initializer hands the bucket size to every worker.
	        with multiprocessing.Pool(
	            initializer=_set_worker_bucket_size,
	            initargs=(bucket_size,),
	        ) as pool:
	            pool.map(func, chunk_starts)
	            pool.close()
	            pool.join()
	        end = datetime.now()
	        print("Multiprocessing Time: ",end-st)
	        multi_process_time.append(end-st)
	        print()
	        print()

	    out_time = pd.DataFrame({
	        'sample size': sample_size,
	        'bucket size': bucket,
	        'single_process_time': single_process_time,
	        'multi_process_time': multi_process_time,
	    })
	    out_time.to_csv("Benchmark.csv", index=False)