This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import streamlit as st | |
def main(): | |
# Retrieve or initialize user input information | |
first_name = st.session_state.get("first_name", "") | |
last_name = st.session_state.get("last_name", "") | |
preferred_languages = st.session_state.get("preferred_languages", []) | |
st.title("User Profile") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build a two-class classifier on top of an existing `bert` encoder.
# `tf`, `Input`, `Dense` and `bert` must already be in scope.
max_len = 70  # fixed sequence length shared by both input tensors

# Two int32 inputs of shape (max_len,): the token ids and their attention mask.
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

# Only the first element of the encoder output is used.
# NOTE(review): presumably the per-token hidden states -- confirm against
# the outputs of the `bert` model that is loaded elsewhere.
embeddings = bert(input_ids, attention_mask=input_mask)[0]

# Max-pool the encoder output, then a small dense head with light dropout.
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)

# Two-unit softmax output layer.
y = Dense(2, activation='softmax')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Report the combined metric for the train and CV splits.
# `bias_metrics_train_df` and the helper functions are defined elsewhere.
bias_metrics_cv_df = compute_bias_metrics_for_model(df_cv, identity_columns, MODEL_NAME, TOXICITY_COLUMN)

print("Overall AUC for Train Data : ", get_final_metric(bias_metrics_train_df, calculate_overall_auc(df_train, MODEL_NAME)))
print("#"*100)
print("Overall AUC for CV Data : ", get_final_metric(bias_metrics_cv_df, calculate_overall_auc(df_cv, MODEL_NAME)))
print("#"*100)

print(" Train Data : ")
print("#"*100)
# Turn the model's scores into hard 0/1 predictions; 0.5 itself counts as 1.
tr_pred = np.where(df_train[MODEL_NAME] >= 0.5, 1, 0)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Training hyper-parameters.
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.00005
NUM_EPOCHS = 10
BATCH_SIZE = 128

# Model input: one int32 vector of MAX_SEQUENCE_LENGTH entries per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer = Embedding(len(tokenizer.word_index) + 1, | |
EMBEDDINGS_DIMENSION, | |
weights=[embedding_matrix], | |
input_length=MAX_SEQUENCE_LENGTH, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Training hyper-parameters.
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.00005
NUM_EPOCHS = 10
BATCH_SIZE = 128

# Model input: one int32 vector of MAX_SEQUENCE_LENGTH entries per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer = Embedding(len(tokenizer.word_index) + 1, | |
EMBEDDINGS_DIMENSION, | |
weights=[embedding_matrix], | |
input_length=MAX_SEQUENCE_LENGTH, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Report the combined metric for the train and CV splits, then plot the
# confusion matrix for the train split.
# `bias_metrics_train_df` and the helper functions are defined elsewhere.
bias_metrics_cv_df = compute_bias_metrics_for_model(df_cv, identity_columns, MODEL_NAME, TOXICITY_COLUMN)

print("Overall AUC for Train Data : ", get_final_metric(bias_metrics_train_df, calculate_overall_auc(df_train, MODEL_NAME)))
print("#"*100)
print("Overall AUC for CV Data : ", get_final_metric(bias_metrics_cv_df, calculate_overall_auc(df_cv, MODEL_NAME)))
print("#"*100)

print(" Train Data : ")
print("#"*100)
# Turn the model's scores into hard 0/1 predictions; 0.5 itself counts as 1.
tr_pred = np.where(df_train[MODEL_NAME] >= 0.5, 1, 0)
# Column 1 of the label matrix is compared against the 0/1 predictions.
# NOTE(review): presumably the positive-class column of one-hot labels -- confirm.
plot_confusion_matrix(train_labels[:, 1], tr_pred)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Training hyper-parameters.
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.00005
NUM_EPOCHS = 10
BATCH_SIZE = 128

# Create CNN layers.
# Model input: one int32 vector of MAX_SEQUENCE_LENGTH entries per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer = Embedding(len(tokenizer.word_index) + 1, | |
EMBEDDINGS_DIMENSION, | |
weights=[embedding_matrix], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Baseline: score the train split with a model that predicts at random.
print(" Train Data : ")
print("#"*100)

# One uniform draw in [0, 1) per training row, thresholded into 0/1 labels.
randomlist = np.random.rand(train_data.shape[0])
pred_y = np.where(randomlist > 0.5, 1, 0)

# All three reports use label column 1 so they describe the same target;
# previously AUC and F1 read column 0 while the confusion matrix read column 1.
# NOTE(review): presumably column 1 is the positive class of one-hot labels -- confirm.
print(" Train AUC Score Random Model : ", roc_auc_score(train_labels[:, 1], pred_y))
print(" Train F1 - Score Random Model : ", f1_score(train_labels[:, 1], pred_y))
plot_confusion_matrix(train_labels[:, 1], pred_y)

print(" CV Data : ")
print("#"*100)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def text_preprocessing(df): | |
#Remove punctuations | |
df['clean_text'] = df['comment_text'].apply(lambda x:remove_punctuation(x)) | |
#make lower case | |
df['clean_text_lower']= df['clean_text'].apply(lambda x: x.lower()) | |
#tokenize the string | |
df['text_tokenied']= df['clean_text_lower'].apply(lambda x: tokenization(x)) | |
#remove stop words | |
df['text_tokenized_no_stopwords']= df['text_tokenied'].apply(lambda x:remove_stopwords(x)) | |
#lemmatize |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Lemmatization
from nltk.stem import WordNetLemmatizer

# Single shared instance, reused by every lemmatizer() call.
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatizer(text):
    """Lemmatize every item of *text* and return the results as a list.

    Args:
        text: Iterable of words, e.g. a list of tokens. A plain string
            would be iterated character by character, so pass tokens.

    Returns:
        A new list holding one ``wordnet_lemmatizer.lemmatize`` result per
        input item, in the original order.
    """
    return [wordnet_lemmatizer.lemmatize(word) for word in text]
Newer Older