Skip to content

Instantly share code, notes, and snippets.

@arif9799
Last active May 12, 2022 22:17
Show Gist options
  • Save arif9799/a7e4e3b3f9d2be6b0034165ebd82de1a to your computer and use it in GitHub Desktop.
Save arif9799/a7e4e3b3f9d2be6b0034165ebd82de1a to your computer and use it in GitHub Desktop.
!pip install Unidecode
import nltk
import pandas as pd
from gensim import utils
from gensim.parsing.preprocessing import remove_stopwords
from unidecode import unidecode
# Fetch the NLTK "words" corpus and keep its entries in a set so the
# membership test in preprocess() is a constant-time lookup.
nltk.download('words')
global_words = set(nltk.corpus.words.words())

# Load the tab-separated dataset. `error_bad_lines` was deprecated in
# pandas 1.3 and removed in 2.0; on_bad_lines='warn' keeps the previous
# behaviour (malformed rows are dropped and a warning is emitted for each)
# instead of failing with an unexpected-keyword TypeError on current pandas.
data = pd.read_csv(
    "Whatever_your_path_is",
    sep='\t',
    on_bad_lines='warn',
    skip_blank_lines=True,
)
def preprocess(text):
    """Clean one raw review and split it into a list of tokens.

    Pipeline: transliterate to ASCII, lower-case, drop stopwords, drop
    alphabetic tokens that are not in ``global_words``, then tokenize with
    gensim's ``simple_preprocess`` (token length 2..100, accents removed).

    Args:
        text: Raw review text. ``None`` and NaN (presumably how a missing
            cell reaches this function via ``Series.apply``) are treated as
            an empty review; any other non-string value is converted with
            ``str()`` first.

    Returns:
        list[str]: The surviving tokens; ``[]`` for missing input.
    """
    # NaN is the only float that is not equal to itself; guard here so a
    # missing value does not crash unidecode or become the literal "nan".
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Convert to str BEFORE unidecode (it needs a string), then
    # transliterate non-ASCII characters to their closest ASCII text.
    ascii_text = unidecode(str(text))
    without_stopwords = remove_stopwords(ascii_text.lower())

    # Keep dictionary words; non-alphabetic tokens (digits, punctuation) pass
    # this filter and are left for simple_preprocess to deal with. The text
    # is already lower-cased, so no per-token lower() is needed.
    filtered = " ".join(
        token
        for token in nltk.wordpunct_tokenize(without_stopwords)
        if token in global_words or not token.isalpha()
    )
    return utils.simple_preprocess(filtered, min_len=2, max_len=100, deacc=True)
# Run preprocess() on every entry of the review_body column and store the
# resulting token lists in a new 'tokenized' column.
data["tokenized"] = data["review_body"].apply(preprocess)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment