Skip to content

Instantly share code, notes, and snippets.

View aravindpai's full-sized avatar

Aravind Pai aravindpai

View GitHub Profile
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# NOTE(review): presumably a project-local attention.py next to this script -- confirm.
from attention import AttentionLayer

# Load at most the first 100,000 rows of the reviews CSV.
# This has to come after the imports: the previous ordering called
# pd.read_csv before `import pandas as pd` had run, which raises NameError
# when the file is executed top to bottom.
data = pd.read_csv("../input/amazon-fine-food-reviews/Reviews.csv", nrows=100000)

data.drop_duplicates(subset=['Text'], inplace=True)  # keep one row per distinct review text
data.dropna(axis=0, inplace=True)  # drop every row that has any missing value
data.info()  # print a summary of the remaining frame
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                          "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                          "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                          "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                          "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
# Peek at the first ten review texts. This is a bare expression, so it only
# shows output in an interactive session (notebook/REPL), not in a script.
data['Text'][:10]
# English stop-word list from NLTK, held as a set because text_cleaner does a
# membership test against it for every token.
stop_words = set(stopwords.words('english'))
def text_cleaner(text):
   """Normalise one raw review text and split it into non-stop-word tokens.

   Steps, in order: lowercase the text, parse it with BeautifulSoup (lxml
   parser) and keep only the text content, remove parenthesised spans,
   remove double quotes, expand tokens found in ``contraction_mapping``,
   drop possessive ``'s``, replace every non-letter character with a space,
   and finally filter out words present in ``stop_words``.

   NOTE(review): as shown, the body ends after building ``tokens`` and has
   no ``return`` statement, so the call evaluates to ``None``. The function
   looks cut off -- confirm against the complete source before relying on it.
   """
   newString = text.lower()
   newString = BeautifulSoup(newString, "lxml").text
   # Remove "( ... )" spans, parentheses included.
   newString = re.sub(r'\([^)]*\)', '', newString)
   newString = re.sub('"','', newString)
   # Contractions are looked up per single-space-separated token, so only
   # tokens that match a mapping key exactly are expanded.
   newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])    
   # Strip a trailing possessive 's.
   newString = re.sub(r"'s\b","",newString)
   # Everything that is not an ASCII letter (digits, punctuation, ...) becomes a space.
   newString = re.sub("[^a-zA-Z]", " ", newString)
   # Whitespace-split and keep only the words that are not stop words.
   tokens = [w for w in newString.split() if not w in stop_words]
# Peek at the first ten summaries. This is a bare expression, so it only
# shows output in an interactive session (notebook/REPL), not in a script.
data['Summary'][:10]
def summary_cleaner(text):
   newString = re.sub('"','', text)
   newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])    
   newString = re.sub(r"'s\b","",newString)
   newString = re.sub("[^a-zA-Z]", " ", newString)
   newString = newString.lower()
   tokens=newString.split()
   newString=''
   for i in tokens:
       if len(i)>1: