Skip to content

Instantly share code, notes, and snippets.

@cnmoro
Created January 4, 2022 23:06
Show Gist options
  • Save cnmoro/7ba9541af975dbb4fe1eb08658f97641 to your computer and use it in GitHub Desktop.
SKLearn Snippets
# CLUSTERING
# Davies-Bouldin index -> lower is better when choosing K
# Per-cluster descriptive statistics for every feature
df.groupby("cluster").describe()

centroids = kmeans.cluster_centers_
# BUG FIX: original did `max = centroids[0]` (a centroid row, not a count —
# range() over it raises TypeError, and it shadowed the builtin `max`).
# We want the number of features, i.e. the number of centroid columns.
n_features = centroids.shape[1]
for i in range(n_features):
    # Variance of each feature across the centroids
    print(df.columns.values[i], "{:.4f}".format(centroids[:, i].var()))
# The features with the highest variance across centroids are the ones
# that differentiate the clusters the most
# Example: feature IDADE (age)
df.groupby("cluster")["IDADE"].describe()

# Analyze clusters separately for each feature.
# FIX: column selection must be a list — tuple indexing after groupby
# is deprecated/removed in modern pandas.
description = df.groupby("cluster")[["IDADE", "X", "Y", "Z"]]
num_instances = description.size()
description = description.mean()
description['n_instances'] = num_instances
print(description)
##############################################
# CROSS-VALIDATION (K-FOLD VALIDATION)
# Plain 5-fold CV; keep only the validation scores.
from sklearn.model_selection import cross_validate

results = cross_validate(model, X, y, cv=5, return_train_score=False)
test_scores = results['test_score']
avg_result = test_scores.mean()
print(avg_result)
# With randomness: shuffle the rows before splitting into folds.
SEED = 301
np.random.seed(SEED)

from sklearn.model_selection import KFold, cross_validate

cv = KFold(n_splits=5, shuffle=True)
results = cross_validate(model, X, y, cv=cv, return_train_score=False)
avg_result = results['test_score'].mean()
print(avg_result)
# With randomness while preserving the label proportions in each fold.
SEED = 301
np.random.seed(SEED)

from sklearn.model_selection import StratifiedKFold, cross_validate

cv = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_validate(model, X, y, cv=cv, return_train_score=False)
avg_result = results['test_score'].mean()
print(avg_result)
# Pipeline construction (scaler + estimator evaluated together per CV fold,
# so the scaler is fit only on each fold's training split — no data leakage).
# FIX: original comment was garbled ("PipelineSEED = 301") and the snippet
# used StandardScaler/SVC without importing them.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_validate

SEED = 301
np.random.seed(SEED)

scaler = StandardScaler()
modelo = SVC()
pipeline = Pipeline([
    ('transformacao', scaler),
    ('estimador', modelo)
])
cv = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_validate(pipeline, X, y, cv=cv, return_train_score=False)
avg_result = results['test_score'].mean()
print(avg_result)
# GridSearchCV: exhaustive search over every combination in the grid.
# FIX: DecisionTreeClassifier and StratifiedKFold were used without imports.
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

SEED = 301
np.random.seed(SEED)

parameter_grid = {
    "max_depth": [3, 5],
    "min_samples_split": [32, 64, 128],
    "min_samples_leaf": [32, 64, 128],
    "criterion": ["gini", "entropy"],
}
busca = GridSearchCV(DecisionTreeClassifier(),
                     parameter_grid,
                     cv=StratifiedKFold(n_splits=10))
busca.fit(X, y)
print(busca.best_params_)
print(busca.best_score_ * 100)  # mean CV accuracy of the best combination, as %
resultados = pd.DataFrame(busca.cv_results_)
resultados.head()
# RandomizedSearchCV: samples random parameter combinations instead of
# trying them all.
# FIX: DecisionTreeClassifier and StratifiedKFold were used without imports;
# n_iter made explicit (10 is the default — the grid has 2*3*3*2 = 36 combos).
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

SEED = 301
np.random.seed(SEED)

parameter_grid = {
    # distributions or random value lists can also be supplied here
    "max_depth": [3, 5],
    "min_samples_split": [32, 64, 128],
    "min_samples_leaf": [32, 64, 128],
    "criterion": ["gini", "entropy"],
}
busca = RandomizedSearchCV(DecisionTreeClassifier(),
                           parameter_grid,
                           n_iter=10,
                           cv=StratifiedKFold(n_splits=10))
busca.fit(X, y)
print(busca.best_params_)
print(busca.best_score_ * 100)
resultados = pd.DataFrame(busca.cv_results_)
resultados.head()
####################################################
# Multiclass OneVsRest classifier: fits one model per label.
# FIX: sklearn has no `LogisticRegressionClassifier` — the class is
# `LogisticRegression` (sklearn.linear_model).
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np

reg_log = LogisticRegression()
classificador_onevsrest = OneVsRestClassifier(reg_log)
# y must be a binary indicator matrix (e.g. output of MultiLabelBinarizer)
y = np.asarray(y)
classificador_onevsrest.fit(X, y)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment