prafulgondane

## gist:304846fa66cb79e34a6c8037d99cab4a
import streamlit as st

def main():
    """Read any saved profile fields and render the "User Profile" page title.

    Looks up ``first_name``, ``last_name`` and ``preferred_languages`` in
    ``st.session_state``, falling back to ``""``, ``""`` and ``[]`` when a key
    is absent, then emits the page title.
    """
    state = st.session_state

    # Stored values, or empty defaults when the key is absent.
    # NOTE(review): these three locals are not used again within this
    # snippet — presumably the rest of the page consumes them; confirm.
    first_name = state.get("first_name", "")
    last_name = state.get("last_name", "")
    preferred_languages = state.get("preferred_languages", [])

    st.title("User Profile")

## bert.py
# Assemble a Keras model: BERT encoder -> max-pool -> two dense layers -> 2-way softmax.
# Both inputs are int32 vectors of length max_len (token ids and attention mask).
max_len = 70
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# Element [0] of the encoder output is used — presumably the per-token hidden
# states; TODO confirm against the `bert` object, which is created outside this snippet.
embeddings = bert(input_ids,attention_mask = input_mask)[0]
# Collapse the encoder output with a global max-pool before the dense head.
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
# Dropout with rate 0.1 between the two hidden dense layers.
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)
# Two output units with softmax activation.
y = Dense(2,activation = 'softmax')(out)
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)

## gist:fda2e6b606cbea9ed635673fa18c3177
# Compute the bias-metrics table for the CV frame, then print the final metric
# for the train and CV frames.
# NOTE(review): bias_metrics_train_df is read below but is not assigned in this
# snippet — it must already exist when this runs.
bias_metrics_cv_df = compute_bias_metrics_for_model(df_cv, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
# NOTE(review): the label says "Overall AUC", but the printed value is
# get_final_metric(bias metrics, overall AUC) — confirm the label matches what that helper returns.
print("Overall AUC for Train Data : ", get_final_metric(bias_metrics_train_df, calculate_overall_auc(df_train, MODEL_NAME)))
print("#"*100)
print("Overall AUC for CV Data : ", get_final_metric(bias_metrics_cv_df, calculate_overall_auc(df_cv, MODEL_NAME)))
print("#"*100)


print(" Train Data : ")
print("#"*100)
# Binarize the values in the MODEL_NAME column: >= 0.5 becomes 1, everything else 0.
tr_pred = np.where(df_train[MODEL_NAME] >= 0.5, 1, 0)

## bidirectionalLSTM.py
# Module-level constants; the code that consumes them is outside this snippet.
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.00005
NUM_EPOCHS = 10
BATCH_SIZE = 128

# Integer input of length MAX_SEQUENCE_LENGTH.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# Embedding initialised from embedding_matrix; the first argument is one more
# than the number of entries in tokenizer.word_index (presumably to leave
# index 0 free for padding — confirm).
# NOTE(review): this call is not closed within the snippet; the remaining
# arguments and the closing parenthesis are missing here.
embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                EMBEDDINGS_DIMENSION,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,

## LSTM.py
# Module-level constants; the code that consumes them is outside this snippet.
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.00005
NUM_EPOCHS = 10
BATCH_SIZE = 128

# Integer input of length MAX_SEQUENCE_LENGTH.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# Embedding initialised from embedding_matrix; the first argument is one more
# than the number of entries in tokenizer.word_index (presumably to leave
# index 0 free for padding — confirm).
# NOTE(review): this call is not closed within the snippet; the remaining
# arguments and the closing parenthesis are missing here.
embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                EMBEDDINGS_DIMENSION,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,

## CNN1.py
# Compute the bias-metrics table for the CV frame, print the final metric for
# the train and CV frames, then plot the train confusion matrix.
# NOTE(review): bias_metrics_train_df is read below but is not assigned in this
# snippet — it must already exist when this runs.
bias_metrics_cv_df = compute_bias_metrics_for_model(df_cv, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
print("Overall AUC for Train Data : ", get_final_metric(bias_metrics_train_df, calculate_overall_auc(df_train, MODEL_NAME)))
print("#"*100)
print("Overall AUC for CV Data : ", get_final_metric(bias_metrics_cv_df, calculate_overall_auc(df_cv, MODEL_NAME)))
print("#"*100)

print(" Train Data : ")
print("#"*100)
# Binarize the values in the MODEL_NAME column: >= 0.5 becomes 1, everything else 0.
tr_pred = np.where(df_train[MODEL_NAME] >= 0.5, 1, 0)
# Compare the binarized predictions with column 1 of train_labels
# (presumably the positive-class column of a two-column label array — confirm).
plot_confusion_matrix(train_labels[:,1], tr_pred)

## CNN.py
# Module-level constants; the code that consumes them is outside this snippet.
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.00005
NUM_EPOCHS = 10
BATCH_SIZE = 128

# Create CNN layers: integer input of length MAX_SEQUENCE_LENGTH followed by
# an embedding initialised from embedding_matrix.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# The first argument is one more than the number of entries in
# tokenizer.word_index (presumably to leave index 0 free for padding — confirm).
# NOTE(review): this call is not closed within the snippet; the remaining
# arguments and the closing parenthesis are missing here.
embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                EMBEDDINGS_DIMENSION,
                                weights=[embedding_matrix],

## baseline.py
# Random-guess baseline on the training split: draw one uniform score per
# training row, threshold it at 0.5, and report AUC, F1 and the confusion matrix.
print(" Train Data : ")
print("#"*100)
randomlist = np.random.rand(train_data.shape[0])
pred_y = [1 if val > 0.5 else 0 for val in randomlist]
# All three metrics compare pred_y against column 1 of train_labels, the column
# the confusion matrix already used. Previously AUC and F1 were computed
# against column 0 while the confusion matrix used column 1, so the printed
# scores and the plotted matrix described different targets.
# NOTE(review): assumes column 1 is the positive class — confirm against how
# train_labels is built.
print(" Train AUC Score Random Model : ", roc_auc_score(train_labels[:, 1], pred_y))
print(" Train F1 - Score Random Model : ", f1_score(train_labels[:, 1], pred_y))
plot_confusion_matrix(train_labels[:, 1], pred_y)

print(" CV Data : ")
print("#"*100)

## text_preprocessing.py
def text_preprocessing(df):
  """Add cleaned, lower-cased, tokenized and stop-word-free text columns to ``df``.

  Starting from ``df['comment_text']``, each step reads the column written by
  the previous step and stores its result in a new column. ``df`` is modified
  in place and nothing is returned.
  """
  # (source column, destination column, per-value transform), applied in order.
  # The 'text_tokenied' spelling is kept as-is because it is a column name.
  pipeline = (
    ('comment_text', 'clean_text', remove_punctuation),
    ('clean_text', 'clean_text_lower', lambda value: value.lower()),
    ('clean_text_lower', 'text_tokenied', tokenization),
    ('text_tokenied', 'text_tokenized_no_stopwords', remove_stopwords),
  )
  for source_col, target_col, transform in pipeline:
    df[target_col] = df[source_col].apply(transform)
  # NOTE(review): the snippet ends with a "lemmatize" marker, but no
  # lemmatization step is present in the visible body.

## lemmatization.py
# Lemmatization: one shared WordNet lemmatizer instance, reused by every call.
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
  """Return a list holding the WordNet lemma of each item in ``text``, in order.

  ``text`` is iterated item by item, so a raw string would be processed one
  character at a time; presumably callers pass a list of word tokens.
  """
  return list(map(wordnet_lemmatizer.lemmatize, text))
	# NOTE(review): everything from this line to the end of the file repeats the
	# snippets above with every line flattened to a single tab of indentation.
	# Function bodies are no longer nested under their `def` lines and the
	# `Embedding(` call below is never closed, so this region is not valid
	# Python as written.
	import streamlit as st

	def main():
	# Retrieve or initialize user input information
	first_name = st.session_state.get("first_name", "")
	last_name = st.session_state.get("last_name", "")
	preferred_languages = st.session_state.get("preferred_languages", [])

	st.title("User Profile")
	# BERT encoder -> max-pool -> dense head -> 2-way softmax.
	max_len = 70
	input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
	input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
	embeddings = bert(input_ids,attention_mask = input_mask)[0]
	out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
	out = Dense(128, activation='relu')(out)
	out = tf.keras.layers.Dropout(0.1)(out)
	out = Dense(32,activation = 'relu')(out)
	y = Dense(2,activation = 'softmax')(out)
	model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
	# Bias metrics for the CV frame, then the final metric for train and CV.
	bias_metrics_cv_df = compute_bias_metrics_for_model(df_cv, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
	print("Overall AUC for Train Data : ", get_final_metric(bias_metrics_train_df, calculate_overall_auc(df_train, MODEL_NAME)))
	print("#"*100)
	print("Overall AUC for CV Data : ", get_final_metric(bias_metrics_cv_df, calculate_overall_auc(df_cv, MODEL_NAME)))
	print("#"*100)


	print(" Train Data : ")
	print("#"*100)
	# Binarize the values in the MODEL_NAME column: >= 0.5 becomes 1, everything else 0.
	tr_pred = np.where(df_train[MODEL_NAME] >= 0.5, 1, 0)
	DROPOUT_RATE = 0.3
	LEARNING_RATE = 0.00005
	NUM_EPOCHS = 10
	BATCH_SIZE = 128

	sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
	# NOTE(review): the call opened on the next line has no closing parenthesis.
	embedding_layer = Embedding(len(tokenizer.word_index) + 1,
	EMBEDDINGS_DIMENSION,
	weights=[embedding_matrix],
	input_length=MAX_SEQUENCE_LENGTH,
	print(" Train Data : ")
	print("#"*100)
	randomlist = np.random.rand(train_data.shape[0])
	pred_y = [ 1 if val > 0.5 else 0 for val in randomlist]
	# NOTE(review): AUC and F1 below use column 0 of train_labels while the
	# confusion matrix uses column 1 for the same pred_y — confirm which column
	# is the intended target.
	print(" Train AUC Score Random Model : ", roc_auc_score(train_labels[:, 0], pred_y))
	print(" Train F1 - Score Random Model : ", f1_score(train_labels[:, 0], pred_y))
	plot_confusion_matrix(train_labels[:, 1], pred_y)

	print(" CV Data : ")
	print("#"*100)
	def text_preprocessing(df):
	#Remove punctuations
	df['clean_text'] = df['comment_text'].apply(lambda x:remove_punctuation(x))
	#make lower case
	df['clean_text_lower']= df['clean_text'].apply(lambda x: x.lower())
	#tokenize the string
	df['text_tokenied']= df['clean_text_lower'].apply(lambda x: tokenization(x))
	#remove stop words
	df['text_tokenized_no_stopwords']= df['text_tokenied'].apply(lambda x:remove_stopwords(x))
	#lemmatize
	# Lemmatization
	from nltk.stem import WordNetLemmatizer
	wordnet_lemmatizer = WordNetLemmatizer()

	def lemmatizer(text):
	lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
	return lemm_text