Skip to content

Instantly share code, notes, and snippets.

View aravindpai's full-sized avatar

Aravind Pai aravindpai

View GitHub Profile
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# Read at most the first 100,000 rows of the reviews CSV into `data`.
# This has to come after the pandas import: calling pd.read_csv before
# `import pandas as pd` has executed raises NameError.
data = pd.read_csv("../input/amazon-fine-food-reviews/Reviews.csv", nrows=100000)
# Drop rows whose 'Text' value repeats an earlier row, then drop every row
# that still contains a missing value. Both calls modify `data` in place and
# leave the surviving rows' index labels unchanged.
data.drop_duplicates(subset=["Text"], inplace=True)
data.dropna(axis="index", inplace=True)

# Print the frame summary, then look at the first ten review texts (the
# expression result is only shown in an interactive/notebook session).
data.info()
data["Text"].head(10)

# English stop words held in a set for membership tests.
stop_words = set(stopwords.words("english"))
def text_cleaner(text):
    """Normalize a raw review string and return its remaining tokens.

    Steps, in order: lowercase the text, strip markup with BeautifulSoup,
    remove parenthesised spans and double quotes, expand contractions via
    ``contraction_mapping``, remove possessive ``'s``, replace every
    non-letter character with a space, and drop tokens present in
    ``stop_words``.

    ``contraction_mapping`` and ``stop_words`` are read from module scope;
    ``contraction_mapping`` is not defined in this part of the file and is
    presumably declared elsewhere -- confirm before running.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        no token survives).
    """
    cleaned = text.lower()
    cleaned = BeautifulSoup(cleaned, "lxml").text
    # Remove "( ... )" spans, then every double-quote character.
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)
    cleaned = re.sub('"', '', cleaned)
    # Expand contractions word by word; this must happen before the
    # non-letter pass below, which turns apostrophes into spaces.
    cleaned = ' '.join(
        [contraction_mapping[t] if t in contraction_mapping else t
         for t in cleaned.split(" ")]
    )
    cleaned = re.sub(r"'s\b", "", cleaned)
    cleaned = re.sub("[^a-zA-Z]", " ", cleaned)
    tokens = [w for w in cleaned.split() if w not in stop_words]
    # Previously the function ended here and implicitly returned None;
    # code that later calls .split() on the cleaned text needs a string.
    return " ".join(tokens)
# Look at the first ten raw summaries (shown only in an interactive session).
data['Summary'].head(10)

# Surround every cleaned summary with the '_START_ ' / ' _END_' marker tokens.
data['cleaned_summary'] = data['cleaned_summary'].apply(
    lambda summary: '_START_ ' + summary + ' _END_'
)
# Print the first five cleaned review/summary pairs.
# The rows are taken by position with head(5) rather than by label `[i]`:
# rows are dropped from `data` in place without resetting the index, so the
# labels 0..4 are not guaranteed to exist and a label lookup can raise
# KeyError. head(5) also copes with fewer than five rows.
for review, summary in zip(data['cleaned_text'].head(5),
                           data['cleaned_summary'].head(5)):
    print("Review:", review)
    print("Summary:", summary)
    print("\n")
import matplotlib.pyplot as plt
# Number of whitespace-separated words in each cleaned review and in each
# cleaned summary, in row order.
text_word_count = [len(review.split()) for review in data['cleaned_text']]
summary_word_count = [len(summary.split()) for summary in data['cleaned_summary']]