This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Maps a column name to the list of aggregation function names to compute
# for that column (presumably consumed by a pandas ``groupby(...).agg(...)``
# call -- confirm against the caller).
#
# NOTE: 'NUM_INSTALMENT_VERSION' used to appear twice ('nunique' and 'max').
# In a dict literal a repeated key keeps only the last value, so 'nunique'
# was silently dropped. Both statistics now live under a single key.
aggregation_dict = {
    'NUM_INSTALMENT_VERSION': ['nunique', 'max'],
    'DPD': ['max', 'mean', 'median', 'sum'],
    'DBD': ['max', 'mean', 'median', 'sum'],
    'PAYMENT_PERCENTAGE': ['max', 'mean', 'median', 'sum', 'var'],
    'PAYMENT_DIFFERENCE': ['max', 'mean', 'median', 'sum', 'var'],
    'AMT_INSTALMENT': ['max', 'mean', 'median', 'sum'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'median', 'sum'],
    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'median', 'sum'],
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string

# Removing punctuations like . , ! $( ) * % @
# Translation table that deletes every character of ``string.punctuation``.
# Built once at import time so it is not recomputed for every row of text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return *text* with every ``string.punctuation`` character removed.

    Args:
        text: The string to clean.

    Returns:
        A new string containing the characters of *text*, in order, except
        those found in ``string.punctuation``. Whitespace, letters, digits
        and non-ASCII characters are left untouched.
    """
    # A single C-level pass instead of a Python-level per-character scan.
    return text.translate(_PUNCTUATION_TABLE)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tokenization
def tokenization(text):
    """Split *text* into tokens on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the whitespace-separated pieces of *text*. Leading and
        trailing whitespace produce no empty tokens, so an empty or
        all-whitespace string yields an empty list.
    """
    return text.split()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove stopwords from tokenized text
def remove_stopwords(text, stop_words=None):
    """Return the tokens of *text* that are not stop words.

    Args:
        text: An iterable of tokens (strings).
        stop_words: Optional iterable of words to drop. When omitted, the
            first 140 entries of the module-level ``stopwords`` sequence are
            used. The original author's note says entries past index 140 are
            contraction-style forms such as "don't" / "haven't", which is
            why they are excluded (not verified here).

    Returns:
        A list of the tokens of *text*, in their original order, with every
        token found in the stop-word collection removed.
    """
    if stop_words is None:
        stop_words = stopwords[0:140]
    # Build the lookup once per call: previously the slice was re-created and
    # scanned linearly for every single token.
    excluded = frozenset(stop_words)
    return [token for token in text if token not in excluded]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stemming
from nltk.stem.porter import PorterStemmer

# One stemmer instance shared by every call to ``stemming``.
porter_stemmer = PorterStemmer()


def stemming(text):
    """Return the Porter stem of every word in *text*.

    Args:
        text: An iterable of word strings (e.g. a token list).

    Returns:
        A list, in the same order as *text*, holding the result of
        ``porter_stemmer.stem`` for each word.
    """
    return [porter_stemmer.stem(word) for word in text]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Lemmatization
from nltk.stem import WordNetLemmatizer

# One lemmatizer instance shared by every call to ``lemmatizer``.
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatizer(text):
    """Return the WordNet lemma of every word in *text*.

    No part-of-speech tag is passed, so each word is lemmatized with the
    lemmatizer's default settings.

    Args:
        text: An iterable of word strings (e.g. a token list).

    Returns:
        A list, in the same order as *text*, holding the result of
        ``wordnet_lemmatizer.lemmatize`` for each word.
    """
    return [wordnet_lemmatizer.lemmatize(word) for word in text]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def text_preprocessing(df):
    """Add cleaned and tokenized text columns to *df* in place.

    Reads ``df['comment_text']`` and writes, in order:

    * ``clean_text`` -- punctuation removed,
    * ``clean_text_lower`` -- ``clean_text`` lower-cased,
    * ``text_tokenied`` -- ``clean_text_lower`` split into tokens,
    * ``text_tokenized_no_stopwords`` -- tokens with stop words removed.

    Args:
        df: A DataFrame-like object with a ``comment_text`` column of strings.
    """
    # Remove punctuation
    df['clean_text'] = df['comment_text'].apply(remove_punctuation)
    # Make lower case
    df['clean_text_lower'] = df['clean_text'].apply(str.lower)
    # Tokenize the string. The column name's spelling ('text_tokenied') is
    # kept as-is because other code may look the column up by this name.
    df['text_tokenied'] = df['clean_text_lower'].apply(tokenization)
    # Remove stop words
    df['text_tokenized_no_stopwords'] = df['text_tokenied'].apply(remove_stopwords)
    # Lemmatize
    # NOTE(review): no lemmatization step (and no return statement) follows
    # this marker in the visible source -- confirm whether one is missing.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Baseline: score a "random model" that predicts class 1 whenever a uniform
# random draw exceeds 0.5, one draw per training row.
print(" Train Data : ")
print("#"*100)
randomlist = np.random.rand(train_data.shape[0])
pred_y = np.where(randomlist > 0.5, 1, 0)
# AUC and F1 were previously computed against train_labels[:, 0] while the
# confusion matrix used train_labels[:, 1]. All three now use column 1 so the
# numbers describe the same target.
# NOTE(review): assumes column 1 is the positive class -- confirm.
print(" Train AUC Score Random Model : ", roc_auc_score(train_labels[:, 1], pred_y))
print(" Train F1 - Score Random Model : ", f1_score(train_labels[:, 1], pred_y))
plot_confusion_matrix(train_labels[:, 1], pred_y)
print(" CV Data : ")
print("#"*100)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Training hyper-parameters.
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.00005
NUM_EPOCHS = 10
BATCH_SIZE = 128
# Create CNN layers
# Model input: one int32 vector of length MAX_SEQUENCE_LENGTH per sample
# (presumably padded token ids -- confirm against the tokenizer step).
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer = Embedding(len(tokenizer.word_index) + 1, | |
EMBEDDINGS_DIMENSION, | |
weights=[embedding_matrix], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Per-identity bias metrics for the CV split. The matching train-split frame
# (bias_metrics_train_df) is expected to exist already.
bias_metrics_cv_df = compute_bias_metrics_for_model(df_cv, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
# The printed value is get_final_metric(...) applied to the bias metrics and
# the overall AUC of each split.
print("Overall AUC for Train Data : ", get_final_metric(bias_metrics_train_df, calculate_overall_auc(df_train, MODEL_NAME)))
print("#"*100)
print("Overall AUC for CV Data : ", get_final_metric(bias_metrics_cv_df, calculate_overall_auc(df_cv, MODEL_NAME)))
print("#"*100)
print(" Train Data : ")
print("#"*100)
# Turn the model scores into hard 0/1 predictions at a 0.5 threshold and
# compare them with column 1 of the training labels.
tr_pred = np.where(df_train[MODEL_NAME] >= 0.5, 1, 0)
plot_confusion_matrix(train_labels[:, 1], tr_pred)
OlderNewer