Ankit Choudhary (frenzy2106)
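These snippets assume roughly the following imports; this is a minimal sketch inferred from the calls used below (using the TensorFlow-bundled Keras is an assumption):

import re
import nltk
import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
# nltk.download('punkt')  # run once so nltk.word_tokenize works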
# preparing the submission file
# predict_classes was removed from recent Keras; thresholding the sigmoid output at 0.5 is the equivalent
final_prediction = (model.predict(clean_test_data_pad) > 0.5).astype("int32").ravel()
test_tweets['label'] = final_prediction
test_predictions = test_tweets[['id', 'label']]
test_predictions.to_csv('LSTM3.csv', index=False)
# Loading the test data
test_tweets = pd.read_csv("test_tweets_anuFYb8.csv")
test_tweets.shape
# cleaning the test tweets
test_data = test_tweets['tweet']
clean_test_data = clean_corpus(test_data)
# text to sequence and padding
clean_test_data_token = tokenizer.texts_to_sequences(clean_test_data)
# the padding settings must match the training data; maxlen of 25 is taken from the model cell
clean_test_data_pad = keras.preprocessing.sequence.pad_sequences(clean_test_data_token, maxlen=max_length)
# Train the model
model.fit(X_train, y_train, batch_size=10, epochs=2, verbose=2)
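Two epochs is a quick sanity run, so a held-out check against the validation split created below is worth doing; a minimal sketch, assuming the compiled metrics include accuracy:

loss, acc = model.evaluate(X_test, y_test, verbose=0)
print("Validation loss: %.4f, accuracy: %.4f" % (loss, acc))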
# Building & Compiling the model
vocab_size = len(tokenizer.word_index) + 1
max_length = 25
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=50, input_length=max_length))
model.add(keras.layers.LSTM(units=50, dropout=0.2, recurrent_dropout=0.2))
model.add(keras.layers.Dense(units=1, activation='sigmoid'))
# compile the model: binary cross-entropy pairs with the sigmoid output; the optimizer choice is an assumption
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
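To verify the layer shapes and parameter counts before training, a quick look at the architecture:

model.summary()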
# Creating Validation Set
X_train, X_test, y_train, y_test = train_test_split(corpus_pad, y, test_size=0.2, random_state=101)
X_train.shape, X_test.shape
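The cells above use tokenizer, corpus_tokens, and corpus_pad, but the cell that creates them is not part of this gist; a plausible reconstruction, assuming the cleaned training corpus from clean_corpus below and the same max_length of 25 used by the model:

corpus = clean_corpus(X)
corpus_tokens = [nltk.word_tokenize(doc) for doc in corpus]
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus_seq = tokenizer.texts_to_sequences(corpus)
corpus_pad = keras.preprocessing.sequence.pad_sequences(corpus_seq, maxlen=max_length)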
# finding the average number of words per tweet
print(corpus[0])
print(corpus_tokens[0:2])
num_of_words_in_doc = []
for doc in corpus_tokens:
    num_of_words_in_doc.append(len(doc))
print("Average number of words: ", np.average(num_of_words_in_doc))
# check how many individual words are present in the corpus
word_dict = {}
for doc in corpus:
    words = nltk.word_tokenize(doc)
    for word in words:
        if word not in word_dict:
            word_dict[word] = 1
        else:
            word_dict[word] += 1
print("Number of unique words:", len(word_dict))
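This count is what drives vocab_size in the model cell: once the Keras Tokenizer is fitted, len(tokenizer.word_index) reports the unique words it saw (a sketch, assuming the fitted tokenizer from the reconstruction above):

print("Tokenizer vocabulary size:", len(tokenizer.word_index))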
X = tweets.iloc[:, 2].values  # tweet text
y = tweets.iloc[:, 1].values  # label
def clean_corpus(text):
    corpus = []
    for i in range(len(text)):
        # strip t.co links at the start, middle, and end of the tweet
        tweet = re.sub(r"^https://t\.co/[a-zA-Z0-9]*\s", " ", str(text[i]))
        tweet = re.sub(r"\s+https://t\.co/[a-zA-Z0-9]*\s", " ", tweet)
        tweet = re.sub(r"\s+https://t\.co/[a-zA-Z0-9]*$", " ", tweet)
        tweet = tweet.lower()
        # expand contractions/abbreviations; \b keeps "ur" from matching inside words like "your"
        tweet = re.sub(r"can't", "can not", tweet)
        tweet = re.sub(r"\bhv\b", "have", tweet)
        tweet = re.sub(r"\bur\b", "your", tweet)
        corpus.append(tweet)
    return corpus
# Loading the training data
tweets = pd.read_csv('/content/train_E6oV3lV.csv')