# Aggregations to apply per column of the installments-payments table.
# Note: a Python dict cannot hold duplicate keys, so the two separate
# NUM_INSTALMENT_VERSION entries ('nunique' and 'max') are merged into one;
# as written, the second silently overwrote the first.
aggregation_dict = {
    'NUM_INSTALMENT_VERSION': ['nunique', 'max'],
    'DPD': ['max', 'mean', 'median', 'sum'],
    'DBD': ['max', 'mean', 'median', 'sum'],
    'PAYMENT_PERCENTAGE': ['max', 'mean', 'median', 'sum', 'var'],
    'PAYMENT_DIFFERENCE': ['max', 'mean', 'median', 'sum', 'var'],
    'AMT_INSTALMENT': ['max', 'mean', 'median', 'sum'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'median', 'sum'],
    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'median', 'sum']
}
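
# Hypothetical usage sketch (not in the original gist): apply the
# aggregation dictionary per client with a pandas groupby. The DataFrame
# name `installments_df` and the key 'SK_ID_CURR' are assumptions.
import pandas as pd

agg_df = installments_df.groupby('SK_ID_CURR').agg(aggregation_dict)
# Flatten the resulting MultiIndex columns, e.g. ('DPD', 'max') -> 'INST_DPD_MAX'
agg_df.columns = ['INST_' + '_'.join(col).upper() for col in agg_df.columns]
agg_df = agg_df.reset_index()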
import string

# Remove punctuation like . , ! $ ( ) * % @
def remove_punctuation(text):
    punctuation_free = "".join([ch for ch in text if ch not in string.punctuation])
    return punctuation_free
# Tokenization: split on whitespace
def tokenization(text):
    return text.split()
# Remove stopwords from tokenized text. Only the first 140 NLTK English
# stopwords are used so that negations such as "don't" and "haven't",
# which appear later in the list, are kept.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def remove_stopwords(text):
    return [i for i in text if i not in stop_words[0:140]]
# Stemming
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

def stemming(text):
    return [porter_stemmer.stem(word) for word in text]
# Lemmatization (requires nltk.download('wordnet'))
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    return [wordnet_lemmatizer.lemmatize(word) for word in text]
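
# Quick illustration (not in the original gist) of why both functions
# exist: stemming chops suffixes crudely, while lemmatization maps
# words to dictionary forms.
words = ['studies', 'running', 'better']
print(stemming(words))    # -> ['studi', 'run', 'better']
print(lemmatizer(words))  # -> ['study', 'running', 'better']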
def text_preprocessing(df):
    # Remove punctuation
    df['clean_text'] = df['comment_text'].apply(lambda x: remove_punctuation(x))
    # Lowercase
    df['clean_text_lower'] = df['clean_text'].apply(lambda x: x.lower())
    # Tokenize the string
    df['text_tokenized'] = df['clean_text_lower'].apply(lambda x: tokenization(x))
    # Remove stop words
    df['text_tokenized_no_stopwords'] = df['text_tokenized'].apply(lambda x: remove_stopwords(x))
    # Lemmatize
    df['text_lemmatized'] = df['text_tokenized_no_stopwords'].apply(lambda x: lemmatizer(x))
    return df
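
# Smoke test (an assumption, not from the gist): run the pipeline on a
# one-row DataFrame with the 'comment_text' column the function expects.
import pandas as pd

sample = pd.DataFrame({'comment_text': ["This movie was GREAT, wasn't it?!"]})
print(text_preprocessing(sample)['text_lemmatized'].iloc[0])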
print(" Train Data : ")
print("#"*100)
randomlist = np.random.rand(train_data.shape[0])
pred_y = [ 1 if val > 0.5 else 0 for val in randomlist]
print(" Train AUC Score Random Model : ", roc_auc_score(train_labels[:, 0], pred_y))
print(" Train F1 - Score Random Model : ", f1_score(train_labels[:, 0], pred_y))
plot_confusion_matrix(train_labels[:, 1], pred_y)
print(" CV Data : ")
print("#"*100)
# CNN hyperparameters
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.00005
NUM_EPOCHS = 10
BATCH_SIZE = 128

# Create CNN layers: integer token ids -> frozen pre-trained embeddings
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                            EMBEDDINGS_DIMENSION,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
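
# The gist truncates here; a sketch of how such a Keras CNN text
# classifier is typically completed (filter sizes, layer stack, and the
# sigmoid head are assumptions, not the author's exact architecture):
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

x = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dropout(DROPOUT_RATE)(x)
x = Dense(128, activation='relu')(x)
preds = Dense(1, activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer=Adam(learning_rate=LEARNING_RATE),
              metrics=['accuracy'])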
# Bias AUC metrics for the train and CV predictions.
bias_metrics_train_df = compute_bias_metrics_for_model(df_train, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
bias_metrics_cv_df = compute_bias_metrics_for_model(df_cv, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
print("Overall AUC for Train Data : ", get_final_metric(bias_metrics_train_df, calculate_overall_auc(df_train, MODEL_NAME)))
print("#" * 100)
print("Overall AUC for CV Data : ", get_final_metric(bias_metrics_cv_df, calculate_overall_auc(df_cv, MODEL_NAME)))
print("#" * 100)
print(" Train Data : ")
print("#" * 100)
# Hard predictions at a 0.5 threshold for the confusion matrix.
tr_pred = np.where(df_train[MODEL_NAME] >= 0.5, 1, 0)
plot_confusion_matrix(train_labels[:, 1], tr_pred)
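
# `get_final_metric` is not defined in the gist; in the Jigsaw
# "Unintended Bias" benchmark it is typically a weighted blend of the
# overall AUC with power means of the per-subgroup bias AUCs. A sketch
# (column names follow the benchmark kernel; treat details as assumptions):
import numpy as np

def power_mean(series, p):
    # Generalized power mean; a strongly negative p is dominated by the
    # worst-performing subgroups.
    return np.power(np.mean(np.power(series, p)), 1.0 / p)

def get_final_metric(bias_df, overall_auc, power=-5, overall_weight=0.25):
    bias_score = np.mean([
        power_mean(bias_df['subgroup_auc'], power),
        power_mean(bias_df['bpsn_auc'], power),
        power_mean(bias_df['bnsp_auc'], power),
    ])
    return overall_weight * overall_auc + (1 - overall_weight) * bias_score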