Last active: April 22, 2019 07:53
Save ashutoshsingh25/d33eaedf0252b5e6b879356e6784ad3a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import pandas as pd | |
import numpy as np | |
import nltk | |
from bs4 import BeautifulSoup | |
from nltk.tokenize import WordPunctTokenizer | |
from nltk.corpus import stopwords | |
from nltk.tokenize import word_tokenize | |
# Downloading the common English-language stopwords from the nltk module
# (one-time setup; uncomment on a fresh machine).
#nltk.download('stopwords')
stop_words = stopwords.words('english')
# Extra stopwords identified for the business use-case: units/quantities and
# generic supplier/marketing words.
# NOTE(review): multi-word entries such as "for medicine" can never match a
# single whitespace-split token downstream — confirm whether phrase-level
# filtering was intended.
stop_words.extend(("for medicine","medicine","for medicines","medicines","mg","mgs","ml","mls","kg","kgs","degree","degrees","g","gms","gm","mm","gram","grams","ft","cm","cms","m","cu","we","are","dealing","quality","manufacturers","manufacturer","exporters","supplier","dealer","good","topmost","business","trusted","finest","offer","offering","involved","provide","reputed","company","organization","trader","trading","li","pvt.","ltd","pvt","ltd."))
# Buylead-specific stopwords.
stop_words.extend(["i","want","to","buy","setup","am","looking","service provider","need","will","samples","before","purchasing","see","product","starting","business venture","kindly","share","details","via","whatsapp","whatsaap","sms","email","know","price","requirement","send","interested","my","good quality","would","would like","feet","bore","size","mr","rs","per","inch","indian rupee","total order value","rupee","l","xl","xxl","also","usage","basis","use","reselling","buying","nos","discussed","personal","personally","installation","required","quote","asap","indiamart","what","where","why","how","then","decide","kindly","piece","stock","length","usd","diameter","lot","kilogram","upto","km","peices","indian","purchase","meter","medium","pair","like","full","lakh","rupees","delivery","rpm","litre","ton","easy","yes","thickness","composite","liter","including","pack","venture","marketing","features","condition","hour","high","age group","packaging","dimension","star rating","making","searching","products","services","things","resell","suppliers"])
# Plain NLTK English stopword set (without the business extensions).
stop_words1 = set(stopwords.words('english'))
token1 = WordPunctTokenizer()
# Regular expressions for @-handles, hyperlinks and numerics.
# (name kept as-is for compatibility; "ditits" is a typo for "digits")
ditits_and_num1 = r'@[A-Za-z0-9_]+'
hyperlink_2 = r'https?://[^ ]+'
numeric_3 = r'[0-9]+'
combined_pattern1 = r'|'.join((ditits_and_num1, hyperlink_2, numeric_3))
# FIX: escape the dot — the original r'www.[^ ]+' let '.' match ANY character,
# so ordinary words starting with "www" followed by anything were stripped too.
www_pattern = r'www\.[^ ]+'
# Any run of non-alphanumeric characters (used as a '#' placeholder later).
pat_3 = r'[^A-Za-z0-9]+'
# Mapping of negative contractions to their expanded two-word forms.
negative_word_list = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                    "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                    "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                    "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                    "mustn't":"must not"}
# Compiled alternation over every contraction, anchored on word boundaries.
neg_pattern = re.compile(r'\b(' + '|'.join(negative_word_list.keys()) + r')\b')
def cleaned_and_processed(text):
    """Clean one raw (possibly HTML) text snippet for NLP processing.

    Pipeline: strip HTML with BeautifulSoup, drop @-handles / http(s) links /
    numbers and www links, lower-case, expand negative contractions
    ("isn't" -> "is not"), collapse remaining non-alphanumeric runs to '#',
    tokenize and keep only tokens longer than one character.

    Parameters
    ----------
    text : str
        Raw input text (may contain HTML markup).

    Returns
    -------
    str
        Space-joined cleaned tokens.
    """
    soup_text = BeautifulSoup(text, 'lxml').get_text()
    # FIX: str.decode("utf-8-sig") does not exist in Python 3; the old bare
    # `except` swallowed the AttributeError on every call, so the \ufffd
    # replacement never actually happened. Replace the char directly.
    no_bom = soup_text.replace(u"\ufffd", "?")
    stripped_w = re.sub(combined_pattern1, '', no_bom)
    stripped_w = re.sub(www_pattern, '', stripped_w)
    lower_case = stripped_w.lower()
    # FIX: contraction expansion must run BEFORE pat_3 replaces apostrophes
    # with '#'; in the original order neg_pattern could never match.
    negative_handling = neg_pattern.sub(lambda m: negative_word_list[m.group()], lower_case)
    # Collapse every non-alphanumeric run (spaces, punctuation, ...) to '#'.
    # The previous ':' / '::' substitutions were dead code after this step
    # (every ':' is already a '#') and have been removed.
    hashed = re.sub(pat_3, '#', negative_handling)
    # Tokenizing discards the '#' placeholders (single-char tokens) and
    # normalizes whitespace on the re-join.
    words = [w for w in token1.tokenize(hashed) if len(w) > 1]
    return " ".join(words).strip()
def clean(text):
    """Normalize *text*: strip HTML tags, turn punctuation and '@' into
    spaces, remove "<digit> <word>" fragments (two passes, as before),
    squeeze runs of spaces, and return the lower-cased result."""
    out = re.sub('<.*?>', '', text)        # drop HTML tags
    out = re.sub(r'[^\w\s]', " ", out)     # punctuation -> space
    out = re.sub("@", " ", out)            # '@' already covered above; kept for parity
    for _ in range(2):                     # same pattern applied twice, as in the original
        out = re.sub("[0-9] \\w+ *", " ", out)
    out = re.sub(" +", " ", out)           # collapse space runs
    return out.lower()
# Snapshot the module-level stopword list BEFORE `def` rebinds the name
# `stop_words` to the function below. In the original code the function body
# iterated `stop_words` — i.e. the function object itself — which raised
# "TypeError: argument of type 'function' is not iterable" on every call.
_STOP_WORDS_SNAPSHOT = globals().get("stop_words") or []
def stop_words(text, stop_list=None):
    """Remove digit characters and stop words from *text*.

    Parameters
    ----------
    text : str
        Input text.
    stop_list : collection of str, optional
        Stop words to filter out; defaults to the module-level list captured
        before this function shadowed its name.

    Returns
    -------
    str
        Space-joined surviving tokens.
    """
    if stop_list is None:
        stop_list = _STOP_WORDS_SNAPSHOT
    # Drop every digit character first, then filter whitespace-split tokens.
    no_digits = ''.join(ch for ch in text if not ch.isdigit())
    return ' '.join(word for word in no_digits.split() if word not in stop_list)
def rem_dup(l):
    """Return a new list with duplicates removed, preserving first-seen order.

    Parameters
    ----------
    l : iterable
        Items to de-duplicate.

    Returns
    -------
    list
        Unique items in order of first appearance.
    """
    # FIX: the original abused a list comprehension for its side effect and
    # did an O(n^2) membership scan. dict.fromkeys is an O(n) order-preserving
    # dedupe for hashable items; fall back to the quadratic scan otherwise.
    try:
        return list(dict.fromkeys(l))
    except TypeError:
        unique = []
        for item in l:
            if item not in unique:
                unique.append(item)
        return unique
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment