This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Read the tab-separated reviews file (path is relative to the current
# working directory) into a DataFrame, then preview its first rows.
df = pd.read_csv(
    filepath_or_buffer='./amazonreviews.tsv',
    sep='\t',
)
df.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#importing all the required libraries | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.tokenize import word_tokenize | |
from nltk.stem import WordNetLemmatizer | |
from nltk import tokenize |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!pip install contractions | |
import contractions | |
# Expanding contractions | |
def con(text):
    """Return ``text`` with its contractions expanded by ``contractions.fix``."""
    return contractions.fix(text)
# Expand contractions in every review.
df['review'] = df['review'].apply(con)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Lower-case every review with the vectorised string accessor. Unlike a
# per-row ``x.lower()`` call, it passes missing values through as NaN
# instead of raising AttributeError on a non-string cell.
df['review'] = df['review'].str.lower()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string

# Translation table mapping every ASCII punctuation character to a single
# space. Built once so each call is one pass over the text rather than one
# ``str.replace`` pass per punctuation character.
_PUNCT_TO_SPACE = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation)
)


def remove_punc(text):
    """Return ``text`` with every ``string.punctuation`` character replaced by a space.

    Each punctuation character becomes exactly one space, so the length of
    the string is unchanged and runs of whitespace are not collapsed.
    """
    return text.translate(_PUNCT_TO_SPACE)
# Replace punctuation with spaces in every review.
df['review'] = df['review'].apply(remove_punc)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetch the Punkt tokenizer data that word_tokenize depends on. Recent NLTK
# releases (3.9+) load the 'punkt_tab' resource instead of the pickled
# 'punkt' one, so request both to work across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Replace each review string with its list of word tokens.
df['review'] = df['review'].apply(word_tokenize)
df['review'][0]  # notebook preview: tokens of the row labelled 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# WordNet data is required by WordNetLemmatizer.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the lemma of each token in ``tokens``, preserving order."""
    return [lemmatizer.lemmatize(token) for token in tokens]


# Lemmatize every token of every (already tokenised) review.
df['review'] = df['review'].apply(_lemmatize_tokens)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Re-assemble each token list into one space-separated string for the
# vectoriser. ``astype(str)`` would instead store the Python list repr
# ("['a', 'b']"), whose brackets, quotes and escape sequences for
# non-printable characters leak into the text being vectorised.
df['review'] = df['review'].apply(' '.join)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import train_test_split

# Columns are selected by position: column 1 supplies the features and
# column 0 the target.
x = df.iloc[:, 1].values  # Features variable
y = df.iloc[:, 0].values  # Target variable

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

# Fit the vectoriser on the training split only, then reuse that fitted
# vectoriser to transform the test split.
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)
OlderNewer