# Aggregations to apply per column of the installments-payments table.
# Note: a Python dict cannot hold duplicate keys, so the two separate
# NUM_INSTALMENT_VERSION entries ('nunique' and 'max') are merged into one;
# as written, the second silently overwrote the first.
aggregation_dict = {
    'NUM_INSTALMENT_VERSION': ['nunique', 'max'],
    'DPD': ['max', 'mean', 'median', 'sum'],
    'DBD': ['max', 'mean', 'median', 'sum'],
    'PAYMENT_PERCENTAGE': ['max', 'mean', 'median', 'sum', 'var'],
    'PAYMENT_DIFFERENCE': ['max', 'mean', 'median', 'sum', 'var'],
    'AMT_INSTALMENT': ['max', 'mean', 'median', 'sum'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'median', 'sum'],
    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'median', 'sum']
}
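
# Hypothetical usage sketch (not in the original gist): apply the
# aggregation dictionary per client with a pandas groupby. The DataFrame
# name `installments_df` and the key 'SK_ID_CURR' are assumptions.
import pandas as pd

agg_df = installments_df.groupby('SK_ID_CURR').agg(aggregation_dict)
# Flatten the resulting MultiIndex columns, e.g. ('DPD', 'max') -> 'INST_DPD_MAX'
agg_df.columns = ['INST_' + '_'.join(col).upper() for col in agg_df.columns]
agg_df = agg_df.reset_index()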
import string

# Remove punctuation like . , ! $ ( ) * % @
def remove_punctuation(text):
    punctuation_free = "".join([ch for ch in text if ch not in string.punctuation])
    return punctuation_free
# Tokenization: split on whitespace
def tokenization(text):
    return text.split()
# Remove stopwords from tokenized text. Only the first 140 NLTK English
# stopwords are used so that negations such as "don't" and "haven't",
# which appear later in the list, are kept.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def remove_stopwords(text):
    return [i for i in text if i not in stop_words[0:140]]
# Stemming
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

def stemming(text):
    return [porter_stemmer.stem(word) for word in text]
# Lemmatization (requires nltk.download('wordnet'))
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    return [wordnet_lemmatizer.lemmatize(word) for word in text]
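
# Quick illustration (not in the original gist) of why both functions
# exist: stemming chops suffixes crudely, while lemmatization maps
# words to dictionary forms.
words = ['studies', 'running', 'better']
print(stemming(words))    # -> ['studi', 'run', 'better']
print(lemmatizer(words))  # -> ['study', 'running', 'better']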
def text_preprocessing(df):
    # Remove punctuation
    df['clean_text'] = df['comment_text'].apply(lambda x: remove_punctuation(x))
    # Lowercase
    df['clean_text_lower'] = df['clean_text'].apply(lambda x: x.lower())
    # Tokenize the string
    df['text_tokenized'] = df['clean_text_lower'].apply(lambda x: tokenization(x))
    # Remove stop words
    df['text_tokenized_no_stopwords'] = df['text_tokenized'].apply(lambda x: remove_stopwords(x))
    # Lemmatize
    df['text_lemmatized'] = df['text_tokenized_no_stopwords'].apply(lambda x: lemmatizer(x))
    return df
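
# Smoke test (an assumption, not from the gist): run the pipeline on a
# one-row DataFrame with the 'comment_text' column the function expects.
import pandas as pd

sample = pd.DataFrame({'comment_text': ["This movie was GREAT, wasn't it?!"]})
print(text_preprocessing(sample)['text_lemmatized'].iloc[0])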
print(" Train Data : ")
print("#"*100)
randomlist = np.random.rand(train_data.shape[0])
pred_y = [ 1 if val > 0.5 else 0 for val in randomlist]
print(" Train AUC Score Random Model : ", roc_auc_score(train_labels[:, 0], pred_y))
print(" Train F1 - Score Random Model : ", f1_score(train_labels[:, 0], pred_y))
plot_confusion_matrix(train_labels[:, 1], pred_y)
print(" CV Data : ")
print("#"*100)
# CNN hyperparameters
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.00005
NUM_EPOCHS = 10
BATCH_SIZE = 128

# Create CNN layers: integer token ids -> frozen pre-trained embeddings
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                            EMBEDDINGS_DIMENSION,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
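
# The gist truncates here; a sketch of how such a Keras CNN text
# classifier is typically completed (filter sizes, layer stack, and the
# sigmoid head are assumptions, not the author's exact architecture):
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

x = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dropout(DROPOUT_RATE)(x)
x = Dense(128, activation='relu')(x)
preds = Dense(1, activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer=Adam(learning_rate=LEARNING_RATE),
              metrics=['accuracy'])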
# Bias AUC metrics for the train and CV predictions.
bias_metrics_train_df = compute_bias_metrics_for_model(df_train, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
bias_metrics_cv_df = compute_bias_metrics_for_model(df_cv, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
print("Overall AUC for Train Data : ", get_final_metric(bias_metrics_train_df, calculate_overall_auc(df_train, MODEL_NAME)))
print("#" * 100)
print("Overall AUC for CV Data : ", get_final_metric(bias_metrics_cv_df, calculate_overall_auc(df_cv, MODEL_NAME)))
print("#" * 100)
print(" Train Data : ")
print("#" * 100)
# Hard predictions at a 0.5 threshold for the confusion matrix.
tr_pred = np.where(df_train[MODEL_NAME] >= 0.5, 1, 0)
plot_confusion_matrix(train_labels[:, 1], tr_pred)
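
# `get_final_metric` is not defined in the gist; in the Jigsaw
# "Unintended Bias" benchmark it is typically a weighted blend of the
# overall AUC with power means of the per-subgroup bias AUCs. A sketch
# (column names follow the benchmark kernel; treat details as assumptions):
import numpy as np

def power_mean(series, p):
    # Generalized power mean; a strongly negative p is dominated by the
    # worst-performing subgroups.
    return np.power(np.mean(np.power(series, p)), 1.0 / p)

def get_final_metric(bias_df, overall_auc, power=-5, overall_weight=0.25):
    bias_score = np.mean([
        power_mean(bias_df['subgroup_auc'], power),
        power_mean(bias_df['bpsn_auc'], power),
        power_mean(bias_df['bnsp_auc'], power),
    ])
    return overall_weight * overall_auc + (1 - overall_weight) * bias_score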