Last active: April 22, 2019 07:53
Save ashutoshsingh25/d33eaedf0252b5e6b879356e6784ad3a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import pandas as pd | |
import numpy as np | |
import nltk | |
from bs4 import BeautifulSoup | |
from nltk.tokenize import WordPunctTokenizer | |
from nltk.corpus import stopwords | |
from nltk.tokenize import word_tokenize | |
# Downloading the common English-language stopwords from the nltk module
# (one-time setup; uncomment on a fresh machine).
#nltk.download('stopwords')
stop_words = stopwords.words('english')
# Extra stopwords identified for the business use-case: units/quantities and
# generic supplier/marketing words.
# NOTE(review): multi-word entries such as "for medicine" can never match a
# single whitespace-split token downstream — confirm whether phrase-level
# filtering was intended.
stop_words.extend(("for medicine","medicine","for medicines","medicines","mg","mgs","ml","mls","kg","kgs","degree","degrees","g","gms","gm","mm","gram","grams","ft","cm","cms","m","cu","we","are","dealing","quality","manufacturers","manufacturer","exporters","supplier","dealer","good","topmost","business","trusted","finest","offer","offering","involved","provide","reputed","company","organization","trader","trading","li","pvt.","ltd","pvt","ltd."))
# Buylead-specific stopwords.
stop_words.extend(["i","want","to","buy","setup","am","looking","service provider","need","will","samples","before","purchasing","see","product","starting","business venture","kindly","share","details","via","whatsapp","whatsaap","sms","email","know","price","requirement","send","interested","my","good quality","would","would like","feet","bore","size","mr","rs","per","inch","indian rupee","total order value","rupee","l","xl","xxl","also","usage","basis","use","reselling","buying","nos","discussed","personal","personally","installation","required","quote","asap","indiamart","what","where","why","how","then","decide","kindly","piece","stock","length","usd","diameter","lot","kilogram","upto","km","peices","indian","purchase","meter","medium","pair","like","full","lakh","rupees","delivery","rpm","litre","ton","easy","yes","thickness","composite","liter","including","pack","venture","marketing","features","condition","hour","high","age group","packaging","dimension","star rating","making","searching","products","services","things","resell","suppliers"])
# Plain NLTK English stopword set (without the business extensions).
stop_words1 = set(stopwords.words('english'))
token1 = WordPunctTokenizer()
# Regular expressions for @-handles, hyperlinks and numerics.
# (name kept as-is for compatibility; "ditits" is a typo for "digits")
ditits_and_num1 = r'@[A-Za-z0-9_]+'
hyperlink_2 = r'https?://[^ ]+'
numeric_3 = r'[0-9]+'
combined_pattern1 = r'|'.join((ditits_and_num1, hyperlink_2, numeric_3))
# FIX: escape the dot — the original r'www.[^ ]+' let '.' match ANY character,
# so ordinary words starting with "www" followed by anything were stripped too.
www_pattern = r'www\.[^ ]+'
# Any run of non-alphanumeric characters (used as a '#' placeholder later).
pat_3 = r'[^A-Za-z0-9]+'
# Mapping of negative contractions to their expanded two-word forms.
negative_word_list = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                    "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                    "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                    "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                    "mustn't":"must not"}
# Compiled alternation over every contraction, anchored on word boundaries.
neg_pattern = re.compile(r'\b(' + '|'.join(negative_word_list.keys()) + r')\b')
def cleaned_and_processed(text):
    """Clean one raw (possibly HTML) text snippet for NLP processing.

    Pipeline: strip HTML with BeautifulSoup, drop @-handles / http(s) links /
    numbers and www links, lower-case, expand negative contractions
    ("isn't" -> "is not"), collapse remaining non-alphanumeric runs to '#',
    tokenize and keep only tokens longer than one character.

    Parameters
    ----------
    text : str
        Raw input text (may contain HTML markup).

    Returns
    -------
    str
        Space-joined cleaned tokens.
    """
    soup_text = BeautifulSoup(text, 'lxml').get_text()
    # FIX: str.decode("utf-8-sig") does not exist in Python 3; the old bare
    # `except` swallowed the AttributeError on every call, so the \ufffd
    # replacement never actually happened. Replace the char directly.
    no_bom = soup_text.replace(u"\ufffd", "?")
    stripped_w = re.sub(combined_pattern1, '', no_bom)
    stripped_w = re.sub(www_pattern, '', stripped_w)
    lower_case = stripped_w.lower()
    # FIX: contraction expansion must run BEFORE pat_3 replaces apostrophes
    # with '#'; in the original order neg_pattern could never match.
    negative_handling = neg_pattern.sub(lambda m: negative_word_list[m.group()], lower_case)
    # Collapse every non-alphanumeric run (spaces, punctuation, ...) to '#'.
    # The previous ':' / '::' substitutions were dead code after this step
    # (every ':' is already a '#') and have been removed.
    hashed = re.sub(pat_3, '#', negative_handling)
    # Tokenizing discards the '#' placeholders (single-char tokens) and
    # normalizes whitespace on the re-join.
    words = [w for w in token1.tokenize(hashed) if len(w) > 1]
    return " ".join(words).strip()
def clean(text):
    """Normalize *text*: strip HTML tags, turn punctuation and '@' into
    spaces, remove "<digit> <word>" fragments (two passes, as before),
    squeeze runs of spaces, and return the lower-cased result."""
    out = re.sub('<.*?>', '', text)        # drop HTML tags
    out = re.sub(r'[^\w\s]', " ", out)     # punctuation -> space
    out = re.sub("@", " ", out)            # '@' already covered above; kept for parity
    for _ in range(2):                     # same pattern applied twice, as in the original
        out = re.sub("[0-9] \\w+ *", " ", out)
    out = re.sub(" +", " ", out)           # collapse space runs
    return out.lower()
# Snapshot the module-level stopword list BEFORE `def` rebinds the name
# `stop_words` to the function below. In the original code the function body
# iterated `stop_words` — i.e. the function object itself — which raised
# "TypeError: argument of type 'function' is not iterable" on every call.
_STOP_WORDS_SNAPSHOT = globals().get("stop_words") or []
def stop_words(text, stop_list=None):
    """Remove digit characters and stop words from *text*.

    Parameters
    ----------
    text : str
        Input text.
    stop_list : collection of str, optional
        Stop words to filter out; defaults to the module-level list captured
        before this function shadowed its name.

    Returns
    -------
    str
        Space-joined surviving tokens.
    """
    if stop_list is None:
        stop_list = _STOP_WORDS_SNAPSHOT
    # Drop every digit character first, then filter whitespace-split tokens.
    no_digits = ''.join(ch for ch in text if not ch.isdigit())
    return ' '.join(word for word in no_digits.split() if word not in stop_list)
def rem_dup(l):
    """Return a new list with duplicates removed, preserving first-seen order.

    Parameters
    ----------
    l : iterable
        Items to de-duplicate.

    Returns
    -------
    list
        Unique items in order of first appearance.
    """
    # FIX: the original abused a list comprehension for its side effect and
    # did an O(n^2) membership scan. dict.fromkeys is an O(n) order-preserving
    # dedupe for hashable items; fall back to the quadratic scan otherwise.
    try:
        return list(dict.fromkeys(l))
    except TypeError:
        unique = []
        for item in l:
            if item not in unique:
                unique.append(item)
        return unique
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment