arif9799/preprocessing.py

## preprocessing.py
!pip install Unidecode
from unidecode import unidecode
from gensim.parsing.preprocessing import remove_stopwords
from gensim import utils
import nltk
nltk.download('words')
global_words = set(nltk.corpus.words.words())

data = pd.read_csv("Whatever_your_path_is", sep='\t', error_bad_lines=False, skip_blank_lines=True)

def preprocess(text):
    """Turn one raw review into a list of cleaned, lower-case word tokens.

    Steps, in order: transliterate to ASCII, lower-case, drop gensim
    stopwords, discard alphabetic tokens that are not in ``global_words``,
    then tokenize with gensim's ``simple_preprocess``.

    Args:
        text: The review text. Non-string values are converted with ``str``;
            ``None`` and float NaN (e.g. an empty cell in a DataFrame column)
            are treated as an empty review.

    Returns:
        A list of token strings, each 2 to 100 characters long. The list is
        empty when the input is missing.
    """
    # A missing cell is not a str and would crash unidecode, so bail out early.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Coerce before transliterating so other non-string cells are safe too.
    # unidecode returns an ASCII-only str (not bytes).
    text = unidecode(str(text)).lower()
    text = remove_stopwords(text)

    # Alphabetic tokens must be known words; anything non-alphabetic passes
    # through here and is left for simple_preprocess to handle. The text is
    # already lower-case, so tokens are compared as they are.
    text = " ".join(
        w for w in nltk.wordpunct_tokenize(text)
        if not w.isalpha() or w in global_words
    )
    return utils.simple_preprocess(text, min_len=2, max_len=100, deacc=True)


# Run preprocess on every entry of the review_body column and store the
# resulting token lists in a new 'tokenized' column.
data['tokenized'] = data['review_body'].apply(preprocess)
	!pip install Unidecode
	from unidecode import unidecode
	from gensim.parsing.preprocessing import remove_stopwords
	from gensim import utils
	import nltk
	nltk.download('words')
	global_words = set(nltk.corpus.words.words())

	data = pd.read_csv("Whatever_your_path_is", sep='\t', error_bad_lines=False, skip_blank_lines=True)

	def preprocess(text):
		"""Turn one raw review into a list of cleaned, lower-case word tokens.

		Steps, in order: transliterate to ASCII, lower-case, drop gensim
		stopwords, discard alphabetic tokens that are not in ``global_words``,
		then tokenize with gensim's ``simple_preprocess``.

		Args:
			text: The review text. Non-string values are converted with
				``str``; ``None`` and float NaN (e.g. an empty cell in a
				DataFrame column) are treated as an empty review.

		Returns:
			A list of token strings, each 2 to 100 characters long. The list
			is empty when the input is missing.
		"""
		# A missing cell is not a str and would crash unidecode, so bail out early.
		if text is None or (isinstance(text, float) and text != text):
			return []

		# Coerce before transliterating so other non-string cells are safe too.
		# unidecode returns an ASCII-only str (not bytes).
		text = unidecode(str(text)).lower()
		text = remove_stopwords(text)

		# Alphabetic tokens must be known words; anything non-alphabetic passes
		# through here and is left for simple_preprocess to handle. The text is
		# already lower-case, so tokens are compared as they are.
		text = " ".join(
			w for w in nltk.wordpunct_tokenize(text)
			if not w.isalpha() or w in global_words
		)
		return utils.simple_preprocess(text, min_len=2, max_len=100, deacc=True)


	# Run preprocess on every entry of the review_body column and store the
	# resulting token lists in a new 'tokenized' column.
	data['tokenized'] = data['review_body'].apply(preprocess)