# Aravind Pai aravindpai

## readfile.py
# Imported here because this snippet runs before the shared import block below.
import pandas as pd

# Load the reviews CSV; nrows caps the read at the first 100,000 rows.
data = pd.read_csv("../input/amazon-fine-food-reviews/Reviews.csv", nrows=100000)

## libraries.py
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

## duplicates.py
# Keep one row per distinct 'Text' value, then discard every row that still
# contains a missing value. Both calls modify `data` in place.
data.drop_duplicates(inplace=True, subset=['Text'])
data.dropna(inplace=True, axis='index')

## info.py
# Print pandas' built-in summary of the DataFrame (columns, non-null counts, dtypes).
data.info()

## text.py
# Peek at the first ten review texts (selected by position, not by index label).
data['Text'].iloc[:10]

## textcleaning.py
# English stop words as a set so membership tests are O(1).
stop_words = set(stopwords.words('english'))


def text_cleaner(text):
    """Normalize a raw text string and return it as space-joined tokens.

    Steps, in order: lowercase; take the text content of the HTML parse;
    remove parenthesised spans; remove double quotes; replace every
    space-separated token found in ``contraction_mapping`` with its mapped
    value; remove ``'s`` at a word boundary; turn every non-letter into a
    space; drop tokens present in ``stop_words``.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        no token survives).
    """
    cleaned = text.lower()
    cleaned = BeautifulSoup(cleaned, "lxml").text
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)
    cleaned = re.sub('"', '', cleaned)
    # NOTE(review): contraction_mapping is not defined in this snippet;
    # presumably a mapping defined elsewhere in the project - confirm.
    cleaned = ' '.join(
        contraction_mapping[t] if t in contraction_mapping else t
        for t in cleaned.split(" ")
    )
    cleaned = re.sub(r"'s\b", "", cleaned)
    cleaned = re.sub("[^a-zA-Z]", " ", cleaned)
    tokens = [w for w in cleaned.split() if w not in stop_words]
    # The original computed `tokens` and then fell off the end of the
    # function, so callers always received None.
    return " ".join(tokens)

## readsummary.py
# Peek at the first ten summaries (selected by position, not by index label).
data['Summary'].iloc[:10]

## append.py
# Surround each cleaned summary with the _START_ / _END_ marker tokens.
data['cleaned_summary'] = data['cleaned_summary'].apply(
    lambda summary: '_START_ ' + summary + ' _END_'
)

## display.py
# Print the first five cleaned review/summary pairs.
# Rows are selected by position: data[col][i] is a label lookup and raises
# KeyError when index labels 0-4 are missing (e.g. after rows were dropped
# without resetting the index). Slicing also copes with fewer than five rows.
preview = zip(data['cleaned_text'].iloc[:5], data['cleaned_summary'].iloc[:5])
for review, summary in preview:
    print("Review:", review)
    print("Summary:", summary)
    print("\n")

## distribution.py
import matplotlib.pyplot as plt

# Word counts (whitespace-separated tokens) for every cleaned review and
# every cleaned summary, in row order.
text_word_count = [len(review.split()) for review in data['cleaned_text']]
summary_word_count = [len(summary.split()) for summary in data['cleaned_summary']]
	# Consolidated pipeline: imports, de-duplication, text cleaning, preview, word counts.
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

data.drop_duplicates(subset=['Text'], inplace=True)  # dropping duplicates
data.dropna(axis=0, inplace=True)  # dropping na

# English stop words as a set so membership tests are O(1).
stop_words = set(stopwords.words('english'))


def text_cleaner(text):
    """Normalize a raw text string and return it as space-joined tokens.

    Lowercases, takes the text content of the HTML parse, removes
    parenthesised spans and double quotes, substitutes tokens found in
    ``contraction_mapping``, removes ``'s`` at a word boundary, replaces
    non-letters with spaces and drops tokens present in ``stop_words``.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = text.lower()
    cleaned = BeautifulSoup(cleaned, "lxml").text
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)
    cleaned = re.sub('"', '', cleaned)
    # NOTE(review): contraction_mapping is not defined in this file section;
    # presumably a mapping defined elsewhere - confirm.
    cleaned = ' '.join(
        contraction_mapping[t] if t in contraction_mapping else t
        for t in cleaned.split(" ")
    )
    cleaned = re.sub(r"'s\b", "", cleaned)
    cleaned = re.sub("[^a-zA-Z]", " ", cleaned)
    tokens = [w for w in cleaned.split() if w not in stop_words]
    # Return the result; without this the function yields None.
    return " ".join(tokens)


# Print the first five cleaned review/summary pairs, selected by position so
# that gaps in the index labels (left by the in-place drops) cannot raise KeyError.
preview = zip(data['cleaned_text'].iloc[:5], data['cleaned_summary'].iloc[:5])
for review, summary in preview:
    print("Review:", review)
    print("Summary:", summary)
    print("\n")

import matplotlib.pyplot as plt

# populate the lists with sentence lengths (whitespace-separated token counts)
text_word_count = [len(review.split()) for review in data['cleaned_text']]
summary_word_count = [len(summary.split()) for summary in data['cleaned_summary']]