# preparing the submission file
# Note: Sequential.predict_classes() was removed in recent TensorFlow/Keras releases;
# for a single sigmoid output, thresholding predict() at 0.5 gives the same labels.
final_prediction = (model.predict(clean_test_data_pad) > 0.5).astype("int32").ravel()
test_tweets['label'] = final_prediction
test_predictions = test_tweets[['id', 'label']]
test_predictions.to_csv('LSTM3.csv', index=False)
# Loading the test data
test_tweets = pd.read_csv("test_tweets_anuFYb8.csv")
test_tweets.shape
# cleaning the text
test_data = test_tweets['tweet']
clean_test_data = clean_corpus(test_data)
# text to sequence and padding
clean_test_data_token = tokenizer.texts_to_sequences(clean_test_data)
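# The padding call is cut off in the snippet above; a minimal sketch of the missing
# step, assuming the test sequences are padded to the same max_length (25) used for
# the training corpus:
clean_test_data_pad = keras.preprocessing.sequence.pad_sequences(
    clean_test_data_token, maxlen=max_length)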
# Train the model
model.fit(X_train, y_train, batch_size=10, epochs=2, verbose=2)
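# A possible follow-up (not in the original snippet): scoring the model on the
# held-out split created earlier; assumes X_test/y_test from the train_test_split
# cell and an accuracy metric at compile time.
val_loss, val_acc = model.evaluate(X_test, y_test, verbose=0)
print("Validation loss:", val_loss, "Validation accuracy:", val_acc)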
# Building & Compiling the model
vocab_size = len(tokenizer.word_index) + 1
max_length = 25
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=50, input_length=max_length))
model.add(keras.layers.LSTM(units=50, dropout=0.2, recurrent_dropout=0.2))
model.add(keras.layers.Dense(units=1, activation='sigmoid'))
# compile the model
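# The compile call itself is cut off in the snippet; a minimal sketch, assuming the
# usual settings for a single sigmoid output (binary cross-entropy, Adam, accuracy):
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])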
# Creating Validation Set
X_train, X_test, y_train, y_test = train_test_split(corpus_pad, y, test_size=0.2, random_state=101)
X_train.shape, X_test.shape
# finding the average number of words per tweet
print(corpus[0])
print(corpus_tokens[0:2])
num_of_words_in_doc = []
for doc in corpus_tokens:
    num_of_words_in_doc.append(len(doc))
print("Average number of words: ", np.average(num_of_words_in_doc))
# check how many individual words are present in the corpus
word_dict = {}
for doc in corpus:
    words = nltk.word_tokenize(doc)
    for word in words:
        if word not in word_dict:
            word_dict[word] = 1
        else:
            word_dict[word] += 1
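# The snippet stops before reporting the count; a small hedged addition that prints
# the number of distinct words collected above:
print("Number of unique words in the corpus:", len(word_dict))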
# Separating features and target: the 3rd column holds the tweet text, the 2nd the label
X = tweets.iloc[:, 2].values
y = tweets.iloc[:, 1].values
def clean_corpus(text):
    corpus = []
    for i in range(len(text)):
        # remove t.co short links at the start, middle, or end of the tweet
        tweet = re.sub(r"^https://t\.co/[a-zA-Z0-9]*\s", " ", str(text[i]))
        tweet = re.sub(r"\s+https://t\.co/[a-zA-Z0-9]*\s", " ", tweet)
        tweet = re.sub(r"\s+https://t\.co/[a-zA-Z0-9]*$", " ", tweet)
        tweet = tweet.lower()
        # expand common contractions/abbreviations; \b keeps the substitution
        # from touching substrings inside longer words (e.g. "ur" in "turn")
        tweet = re.sub(r"\bcan't\b", "can not", tweet)
        tweet = re.sub(r"\bhv\b", "have", tweet)
        tweet = re.sub(r"\bur\b", "your", tweet)
# Loading the training data
tweets = pd.read_csv('/content/train_E6oV3lV.csv')
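# These snippets assume the usual imports were made earlier in the notebook; a
# hedged sketch of what they would look like, inferred from the names used above:
import re
import nltk
import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer  # fitted as `tokenizer`
# nltk.download('punkt') is needed once for nltk.word_tokenize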