import re
import pandas as pd
import numpy as np
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer, word_tokenize
# Download the common English-language stop words shipped with nltk (run once)
# nltk.download('stopwords')
stop_words = stopwords.words('english')
# Add extra stop words identified for the business use-case (units, seller/marketing terms)
stop_words.extend(("for medicine","medicine","for medicines","medicines","mg","mgs","ml","mls","kg","kgs","degree","degrees","g","gms","gm","mm","gram","grams","ft","cm","cms","m","cu","we","are","dealing","quality","manufacturers","manufacturer","exporters","supplier","dealer","good","topmost","business","trusted","finest","offer","offering","involved","provide","reputed","company","organization","trader","trading","li","pvt.","ltd","pvt","ltd."))
# Buy-lead-specific stop words; note that multi-word phrases here (e.g. "good quality")
# will never match the single tokens filtered in remove_stop_words below
stop_words.extend(["i","want","to","buy","setup","am","looking","service provider","need","will","samples","before","purchasing","see","product","starting","business venture","kindly","share","details","via","whatsapp","whatsaap","sms","email","know","price","requirement","send","interested","my","good quality","would","would like","feet","bore","size","mr","rs","per","inch","indian rupee","total order value","rupee","l","xl","xxl","also","usage","basis","use","reselling","buying","nos","discussed","personal","personally","installation","required","quote","asap","indiamart","what","where","why","how","then","decide","kindly","piece","stock","length","usd","diameter","lot","kilogram","upto","km","peices","indian","purchase","meter","medium","pair","like","full","lakh","rupees","delivery","rpm","litre","ton","easy","yes","thickness","composite","liter","including","pack","venture","marketing","features","condition","hour","high","age group","packaging","dimension","star rating","making","searching","products","services","things","resell","suppliers"])
stop_words1 = set(stopwords.words('english'))  # plain English stop words, kept separate from the extended list
token1 = WordPunctTokenizer()
# Regular expressions for @handles, hyperlinks and bare numbers
handle_pattern_1 = r'@[A-Za-z0-9_]+'  # @mentions / handles
hyperlink_2 = r'https?://[^ ]+'       # http(s) links
numeric_3 = r'[0-9]+'                 # bare numbers
combined_pattern1 = r'|'.join((handle_pattern_1, hyperlink_2, numeric_3))
www_pattern = r'www\.[^ ]+'           # scheme-less links (dot escaped)
pat_3 = r'[^A-Za-z0-9]+'              # any run of non-alphanumeric characters
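# Illustrative behaviour of the combined pattern (made-up input string; the
# shown output is what the alternation should produce, not a verified fixture):
#   re.sub(combined_pattern1, '', "ping @user see https://x.co lot of 500 units")
#   -> "ping  see  lot of  units"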
# Expand negated contractions into their full two-word forms
negative_word_list = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
"haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
"wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
"can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
"mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negative_word_list.keys()) + r')\b')
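# For example (illustrative; this is the same substitution used inside
# cleaned_and_processed below, and it only matches lowercase text because the
# dict keys are lowercase):
#   neg_pattern.sub(lambda m: negative_word_list[m.group()], "i don't know")
#   -> "i do not know"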
def cleaned_and_processed(text):
    """Strip HTML, handles, links and numbers, expand contractions, and re-tokenize."""
    b_soup = BeautifulSoup(text, 'lxml')
    b_souped = b_soup.get_text()
    try:
        # Strip a UTF-8 BOM if one is present and normalize replacement characters
        b_removed = b_souped.encode('utf-8').decode('utf-8-sig').replace(u"\ufffd", "?")
    except UnicodeError:
        b_removed = b_souped
    stripped_w = re.sub(combined_pattern1, '', b_removed)
    stripped_w = re.sub(www_pattern, '', stripped_w)
    lower_case = stripped_w.lower()
    # Expand contractions before the catch-all substitution below strips apostrophes
    negative_handling = neg_pattern.sub(lambda x: negative_word_list[x.group()], lower_case)
    negative_handling = re.sub(r':+', ' ', negative_handling)
    negative_handling = re.sub(pat_3, '#', negative_handling)
    # The substitutions above leave stray separators; tokenize and re-join to
    # normalize whitespace, dropping single-character tokens (including '#')
    words = [x for x in token1.tokenize(negative_handling) if len(x) > 1]
    return (" ".join(words)).strip()
def clean(text):
    """Lightweight cleaner: strip HTML tags, punctuation, digit-led pairs and extra spaces."""
    cleanr = re.compile('<.*?>')
    text = re.sub(cleanr, '', text)
    text = re.sub(r'[^\w\s]', " ", text)
    text = re.sub("@", " ", text)
    # Drop a digit followed by the next word (e.g. "5 kg"); run twice so that
    # new digit-word adjacencies created by the first pass are also removed
    text = re.sub(r"[0-9] \w+ *", " ", text)
    text = re.sub(r"[0-9] \w+ *", " ", text)
    text = re.sub(" +", " ", text)
    text = text.lower()
    return text
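# Illustrative example (made-up input): tags and punctuation become spaces and
# digit-led pairs like "5 kg" are dropped:
#   clean("Buy 5 kg rice <b>now</b>!")  ->  "buy rice now " (a trailing space survives)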
def remove_stop_words(text):
    """Drop digits and any token found in the extended stop-word list."""
    # Named distinctly so it does not shadow the stop_words list it filters against
    text = ''.join([i for i in text if not i.isdigit()])
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text
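# Illustrative example: digits go first, then every token in the extended list:
#   remove_stop_words("i want to buy 100 kg rice")  ->  "rice"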
def rem_dup(l):
    """Remove duplicates from a list while preserving first-seen order."""
    ulist = []
    for x in l:
        if x not in ulist:
            ulist.append(x)
    return ulist
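# Illustrative end-to-end run (a sketch: assumes the nltk 'stopwords' corpus has
# been downloaded and lxml is installed; the sample text is made up and the
# commented outputs are indicative, not verified fixtures).
if __name__ == "__main__":
    raw = "<p>I don't want samples, I want to buy 500 kg rice @dealer https://example.com</p>"
    step1 = cleaned_and_processed(raw)  # strip HTML/handles/links/numbers, expand "don't"
    step2 = remove_stop_words(step1)    # drop digits and business stop words, e.g. -> "rice"
    tokens = rem_dup(step2.split())     # de-duplicate tokens, preserving order
    print(step1)
    print(step2)
    print(tokens)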