import streamlit as st

def main():
    # Retrieve or initialize user input information from session state
    first_name = st.session_state.get("first_name", "")
    last_name = st.session_state.get("last_name", "")
    preferred_languages = st.session_state.get("preferred_languages", [])

    st.title("User Profile")
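The preview cuts off after the title. A minimal sketch of how the profile fields might be rendered and persisted, assuming widgets keyed into st.session_state; the labels and language options here are illustrative, not from the gist:

    # Hypothetical continuation of main(): each widget writes its value back
    # into st.session_state under the same key read above, so it survives reruns.
    st.text_input("First name", key="first_name")
    st.text_input("Last name", key="last_name")
    st.multiselect(
        "Preferred languages",
        options=["Python", "JavaScript", "Go"],  # illustrative options
        key="preferred_languages",
    )
    st.write(f"Hello, {st.session_state.get('first_name', '')}!")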
# BERT-based classifier head: pooled BERT token embeddings -> dense layers -> softmax.
# Assumes `bert` is a pretrained Hugging Face TFBertModel loaded elsewhere.
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalMaxPool1D

max_len = 70
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

embeddings = bert(input_ids, attention_mask=input_mask)[0]  # last hidden states
out = GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)
y = Dense(2, activation='softmax')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
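A minimal sketch of how this model might be compiled and trained. The input array names and the training hyperparameters below are assumptions for illustration, not taken from the gist:

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        loss='categorical_crossentropy',            # matches the 2-unit softmax head
        metrics=[tf.keras.metrics.AUC(name='auc')],
    )
    model.fit(
        [train_input_ids, train_attention_mask],    # illustrative tokenizer outputs
        train_labels,                               # one-hot labels, shape (n, 2)
        validation_split=0.1,
        epochs=3,
        batch_size=32,
    )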
import numpy as np

# Per-identity bias metrics (the Jigsaw benchmark helpers are defined elsewhere in the notebook).
bias_metrics_train_df = compute_bias_metrics_for_model(df_train, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
bias_metrics_cv_df = compute_bias_metrics_for_model(df_cv, identity_columns, MODEL_NAME, TOXICITY_COLUMN)

print("Overall AUC for Train Data : ", get_final_metric(bias_metrics_train_df, calculate_overall_auc(df_train, MODEL_NAME)))
print("#" * 100)
print("Overall AUC for CV Data : ", get_final_metric(bias_metrics_cv_df, calculate_overall_auc(df_cv, MODEL_NAME)))
print("#" * 100)

print(" Train Data : ")
print("#" * 100)
# Threshold predicted probabilities at 0.5 and plot the train confusion matrix.
tr_pred = np.where(df_train[MODEL_NAME] >= 0.5, 1, 0)
plot_confusion_matrix(train_labels[:, 1], tr_pred)
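plot_confusion_matrix is called throughout these snippets but never shown. A minimal sketch of such a helper, assuming scikit-learn and seaborn; the actual implementation in the gist may differ:

    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import confusion_matrix

    def plot_confusion_matrix(y_true, y_pred):
        # Heatmap of the 2x2 confusion matrix for binary toxicity labels.
        cm = confusion_matrix(y_true, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted label')
        plt.ylabel('True label')
        plt.show()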
# Create CNN layers on top of pretrained word embeddings.
# Assumes Input/Embedding from tensorflow.keras.layers, a fitted `tokenizer`,
# and a precomputed `embedding_matrix`.
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.00005
NUM_EPOCHS = 10
BATCH_SIZE = 128

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                            EMBEDDINGS_DIMENSION,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)  # freezing pretrained embeddings (assumed)
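The preview stops at the embedding layer. A sketch of how the CNN might continue, in the style of the reference Conv1D architecture from the Jigsaw unintended-bias benchmark; the filter counts and kernel sizes here are assumptions:

    from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
    from tensorflow.keras.models import Model

    x = embedding_layer(sequence_input)
    x = Conv1D(128, 5, activation='relu')(x)   # illustrative filters/kernel size
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    x = Dropout(DROPOUT_RATE)(x)
    x = Dense(128, activation='relu')(x)
    preds = Dense(2, activation='softmax')(x)
    cnn_model = Model(sequence_input, preds)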
print(" Train Data : ")
print("#"*100)
randomlist = np.random.rand(train_data.shape[0])
pred_y = [ 1 if val > 0.5 else 0 for val in randomlist]
print(" Train AUC Score Random Model : ", roc_auc_score(train_labels[:, 0], pred_y))
print(" Train F1 - Score Random Model : ", f1_score(train_labels[:, 0], pred_y))
plot_confusion_matrix(train_labels[:, 1], pred_y)
print(" CV Data : ")
print("#"*100)
# Lemmatization helper (used as the final preprocessing step below).
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

def text_preprocessing(df):
    # Remove punctuation
    df['clean_text'] = df['comment_text'].apply(lambda x: remove_punctuation(x))
    # Make lower case
    df['clean_text_lower'] = df['clean_text'].apply(lambda x: x.lower())
    # Tokenize the string
    df['text_tokenized'] = df['clean_text_lower'].apply(lambda x: tokenization(x))
    # Remove stop words
    df['text_tokenized_no_stopwords'] = df['text_tokenized'].apply(lambda x: remove_stopwords(x))
    # Lemmatize
    df['text_lemmatized'] = df['text_tokenized_no_stopwords'].apply(lambda x: lemmatizer(x))
    return df
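A quick usage sketch, assuming remove_punctuation, tokenization, and remove_stopwords are defined earlier in the gist and that the WordNet corpus has been downloaded; the sample comment and the text_lemmatized column name are illustrative:

    import nltk
    import pandas as pd

    nltk.download('wordnet')  # required once for WordNetLemmatizer

    sample = pd.DataFrame({'comment_text': ["This is a perfectly harmless comment!"]})
    processed = text_preprocessing(sample)
    print(processed['text_lemmatized'].iloc[0])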