Skip to content

Instantly share code, notes, and snippets.

@antonioFlavio
Created May 28, 2019 02:57
Show Gist options
  • Save antonioFlavio/45a1d32d6d2d85b1f4d86a3e32332c43 to your computer and use it in GitHub Desktop.
Save antonioFlavio/45a1d32d6d2d85b1f4d86a3e32332c43 to your computer and use it in GitHub Desktop.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.metrics import f1_score
def crie_features(row, coluna1, coluna2):
    """Return ``row[coluna1] / row[coluna2]``, or 0 when either value is 0.

    The zero check protects against division by zero and also short-circuits
    the trivial case of a zero numerator.
    """
    if row[coluna1] != 0 and row[coluna2] != 0:
        return row[coluna1] / row[coluna2]
    return 0
def _razao_segura(numerador, denominador):
    """Element-wise ``numerador / denominador``, with 0 where either side is 0."""
    razao = numerador / denominador
    # Rows with a zero operand would give 0, inf or NaN; force them to 0.
    return razao.where((numerador != 0) & (denominador != 0), 0)


def cri_features_na_base(base):
    """Add the engineered ratio/product columns to ``base`` and return it.

    The frame is modified in place; the returned object is the same frame.
    Ratios are computed column-wise instead of with a Python call per row.

    Args:
        base: DataFrame with the columns 'Balance', 'Tenure',
            'EstimatedSalary', 'Age', 'NumOfProducts' and 'CreditScore'.

    Returns:
        ``base`` with seven extra columns.
    """
    # (new column, numerator column, denominator column)
    # 'balance_by__salary' keeps its double underscore: other code refers to
    # the column by that exact name.
    razoes = (
        ('balance_by_time', 'Balance', 'Tenure'),
        ('salary_by_age', 'EstimatedSalary', 'Age'),
        ('products_by_month', 'NumOfProducts', 'Tenure'),
        ('balance_by__salary', 'Balance', 'EstimatedSalary'),
        ('balance_by_age', 'Balance', 'Age'),
        ('balance_by_products', 'Balance', 'NumOfProducts'),
    )
    for nova_coluna, col_numerador, col_denominador in razoes:
        base[nova_coluna] = _razao_segura(base[col_numerador], base[col_denominador])
    base['score_balance'] = base['CreditScore'] * base['Balance']
    return base
def salve_predicao(df_validacao, previsao,
                   caminho="D:\\Projetos\\RepositoriosGit\\Analise_Churn\\resultado.csv"):
    """Write the predictions next to their 'RowNumber' as a two-column CSV.

    Args:
        df_validacao: DataFrame containing a 'RowNumber' column. An 'Exited'
            column is added to (or overwritten in) this frame as a side effect.
        previsao: array-like with one prediction per row of ``df_validacao``;
            any shape that flattens to that length is accepted.
        caminho: destination CSV path. Defaults to the location the script
            has always written to.
    """
    # Flatten so that (n,), (n, 1) arrays and plain lists are all accepted.
    df_validacao['Exited'] = np.asarray(previsao).reshape(-1)
    resultado = df_validacao[['RowNumber', 'Exited']].reset_index(drop=True)
    resultado.to_csv(caminho, index=False)
def realizeTransformacao_1(df):
    """Prepare a raw churn frame for modelling.

    Steps: add the engineered columns via ``cri_features_na_base``, one-hot
    encode 'Geography' and 'Gender' (first level of each dropped), then
    remove the 'Surname' column.

    Returns:
        The transformed DataFrame.
    """
    com_features = cri_features_na_base(df)
    codificado = pd.get_dummies(
        com_features,
        columns=['Geography', 'Gender'],
        drop_first=True,
    )
    # Further drops that were tried and are currently disabled:
    #   'Geography_Spain', 'HasCrCard', 'Gender_Male', 'Tenure'
    return codificado.drop('Surname', axis=1)
def realize_predicao(metodo_transformacao, modelo, features,
                     caminho_validacao='D:\\Projetos\\RepositoriosGit\\Analise_Churn\\valid.csv'):
    """Predict on the validation CSV and save the result with ``salve_predicao``.

    Args:
        metodo_transformacao: callable taking the raw validation DataFrame and
            returning the transformed DataFrame.
        modelo: fitted estimator exposing ``predict``.
        features: column names passed to ``modelo.predict``.
        caminho_validacao: path of the validation CSV. Defaults to the
            location the script has always read from.
    """
    df_validacao = pd.read_csv(caminho_validacao)
    df_transformado = metodo_transformacao(df_validacao)
    # NOTE(review): the training frame in this script is additionally rescaled
    # with normalize_base; nothing here applies the same scaling unless
    # metodo_transformacao does it -- confirm before trusting the output.
    predicao = modelo.predict(df_transformado[features])
    salve_predicao(df_transformado, predicao)
def f1_score_measure(nome_algoritmo, y_true, y_pred):
    """Print the weighted-average F1 score of ``y_pred`` against ``y_true``.

    ``nome_algoritmo`` is printed as the label in front of the score.
    """
    pontuacao = f1_score(y_true, y_pred, average='weighted')
    print(nome_algoritmo, ": ", pontuacao)
def normalize_base(base, features):
    """Scale each listed column by its own maximum, in place.

    Args:
        base: DataFrame to modify.
        features: names of the columns to scale.

    Returns:
        ``base`` itself, with every listed column converted to float and
        divided by its maximum. Columns whose maximum is 0 are left unscaled
        instead of being turned into NaN/inf by a division by zero.
    """
    for coluna in features:
        # Cast first: boolean dummy columns do not support division in pandas.
        valores = base[coluna].astype(float)
        maximo = valores.max()
        if maximo != 0:
            valores = valores / maximo
        base[coluna] = valores
    return base
# Load the training data.
df = pd.read_csv('D:\\Projetos\\RepositoriosGit\\Analise_Churn\\train.csv')
# Alternative locations used at other times:
#df = pd.read_csv('D:\\Projetos\\VisualStudioCode\\AnaliseChurn\\train.csv')
#df = pd.read_csv('train.csv')

# Earlier feature selections, kept for reference:
#features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography_Germany', 'Geography_Spain', 'Gender_Male']
#features = ['balance_by__salary','products_by_month','salary_by_age','balance_by_time','CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography_Germany', 'Geography_Spain', 'Gender_Male']
#features = ['balance_by__salary','products_by_month','salary_by_age','balance_by_time','CreditScore', 'Age', 'Balance', 'NumOfProducts', 'IsActiveMember', 'EstimatedSalary', 'Geography_Germany', 'balance_by_age', 'balance_by_products', 'score_balance']

# Columns fed to the model: engineered columns plus raw and one-hot columns.
features = [
    'balance_by__salary', 'products_by_month', 'salary_by_age', 'balance_by_time',
    'CreditScore', 'Age', 'Balance', 'NumOfProducts', 'IsActiveMember',
    'EstimatedSalary', 'Geography_Germany', 'balance_by_age',
    'balance_by_products', 'score_balance', 'Geography_Spain', 'HasCrCard',
    'Gender_Male', 'Tenure',
]

# Optional pair plot of the raw columns:
# feature_plot = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'Exited']
# sns.pairplot(df[feature_plot])
# plt.show()

# Transform, then scale every model column by its maximum.
df_2 = normalize_base(realizeTransformacao_1(df), features)

# 65/35 split with a fixed seed; only the model columns are kept in X.
X_train, X_test, y_train, y_test = train_test_split(
    df_2[features], df_2.Exited, test_size=0.35, random_state=40
)
def crie_matriz_classificacao_binaria(vetor):
    """Encode each label as a two-element row.

    A value equal to 1 becomes ``[1, 0]``; any other value becomes ``[0, 1]``.
    Returns a list with one freshly created row per input value.
    """
    return [[1, 0] if rotulo == 1 else [0, 1] for rotulo in vetor]
def deserialize_classificacao_binaria(matriz):
    """Collapse two-element rows back into 0/1 labels.

    A row maps to 1 when its first element is strictly greater than its
    second, and to 0 otherwise (ties included).
    """
    return [1 if par[0] > par[1] else 0 for par in matriz]
def rede_keras():
    """Train, evaluate and plot a small dense Keras classifier.

    Reads the module-level ``X_train``, ``X_test``, ``y_train``, ``y_test``
    and ``features``. Prints the test loss, MAE, accuracy and the weighted F1
    score, then shows a plot of the per-epoch training/validation curves.
    Returns nothing.
    """
    X = np.array(X_train)
    X_teste = np.array(X_test)

    # Targets as two-column rows: [1, 0] for label 1, [0, 1] otherwise.
    y = np.array(crie_matriz_classificacao_binaria(y_train))
    matriz_y_test = crie_matriz_classificacao_binaria(y_test)
    y_de_teste = np.array(matriz_y_test)

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(56, input_dim=len(features), activation='elu'))
    model.add(tf.keras.layers.Dense(28, activation='relu'))
    model.add(tf.keras.layers.Dense(2, activation='softmax'))

    # NOTE(review): `lr` and `decay` are legacy argument names; confirm they
    # are accepted by the installed TensorFlow/Keras version.
    Adagrad = tf.keras.optimizers.Adagrad(lr=0.01, epsilon=0.00001, decay=0.0)
    model.compile(loss='mean_squared_error', optimizer=Adagrad, metrics=['mae', 'acc'])

    epochs = 100
    H = model.fit(X, y, batch_size=40, epochs=epochs, verbose=0, validation_data=(X_teste, y_de_teste))

    # score follows the compile order: loss first, then 'mae', then 'acc'.
    score = model.evaluate(X_teste, y_de_teste, verbose=1)
    print('Test loss:', score[0])
    print('Test mae:', score[1])
    print('Test accuracy:', score[2])

    model_pred = model.predict(X_teste)
    f1_score_measure('Rede Keras: ', deserialize_classificacao_binaria(matriz_y_test), deserialize_classificacao_binaria(model_pred))

    # Plot loss and accuracy for the training and test sets.
    eixo_epocas = np.arange(0, epochs)
    plt.figure()
    plt.plot(eixo_epocas, H.history["loss"], label="Perda no treino")
    plt.plot(eixo_epocas, H.history["val_loss"], label="Perda no teste")
    plt.plot(eixo_epocas, H.history["acc"], label="Acurácia de Treino")
    plt.plot(eixo_epocas, H.history["val_acc"], label="Acurácia de teste")
    plt.title("Métricas do modelo")
    plt.xlabel("Épocas #")
    plt.ylabel("Perda/Acurácia")
    plt.legend()
    plt.show()
# Train and evaluate the Keras network as soon as the script is executed.
rede_keras()
# Before proceeding with the predictions, let's analyse the data.
# About our inputs:
# CustomerId: customer identifier
# Surname: customer's name
# CreditScore: current credit score
# Geography: country
# Gender: gender
# Age: age
# Tenure: number of months the customer has been with the company
# Balance: not yet clear what this represents
# NumOfProducts: number of products
# HasCrCard: credit-card indicator
# IsActiveMember: whether the customer is active at the bank
# EstimatedSalary: salary
# Exited: whether the customer left or not
# Our target is the Exited variable.
# # Conhecendo nossos dados
# print(df.head())
# print(df.describe())
# # Pode-se perceber que é uma base bem tratada, com pouca ou nenhuma variável com problemas.
# # Visualizando variáveis categóricas
# print(df['Geography'].unique())
# print(df['Gender'].unique())
# # Exploração dos dados
# # As classes estão desbalanceadas...
# #print(df['Exited'].value_counts())
# # Verificando algumas médias
# print("Médias por Exited")
# print(df.groupby('Exited').mean())
# # Podemos perceber que a média do saldo dos clientes Alemães é maior,
# # mesmo que o salário estimado seja semelhante. Talvez existam outliers.
# print("Médias por localização")
# print(df.groupby('Geography').mean())
# print("Médias por gênero")
# print(df.groupby('Gender').mean())
# print("Médias por idade")
# print(df.groupby('Age').mean())
# print("Média por Cartão de Crédito")
# print(df.groupby('HasCrCard').mean())
# print("Média por Tenure")
# print(df.groupby('Tenure').mean())
# print("Média por IsActiveMember")
# print(df.groupby('IsActiveMember').mean())
# # Correlações por localização
# # df_germany = df.loc[df['Geography'] == 'France']
# # print(df_germany.corr())
# #'Spain' 'France', Germany
# def mostre_crosstab(coluna):
# pd.crosstab(df[coluna], df.Exited).plot(kind='bar')
# plt.xlabel(coluna)
# plt.ylabel('Frequency of Purchase')
# plt.savefig('pur_dayofweek_bar')
# def mostre_crosstab_formato_diferente(coluna):
# table=pd.crosstab(df[coluna],df.Exited)
# table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
# plt.xlabel(coluna)
# plt.ylabel('Exited')
# plt.savefig('mariral_vs_pur_stack')
#mostre_crosstab('Gender')
#mostre_crosstab_formato_diferente('Gender')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment