Created
November 12, 2018 04:05
-
-
Save socratesk/dffa1ff9d1b7bdb48bb68649104c3614 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

# Load the breast cancer dataset object that ships with scikit-learn.
breast_cancer = datasets.load_breast_cancer()

# Column labels for the 30 feature columns of the dataset, in the order
# used to label the columns of `breast_cancer.data` below.
feature_names = [
    'mean radius', 'mean texture', 'mean perimeter', 'mean area',
    'mean smoothness', 'mean compactness', 'mean concavity',
    'mean concave points', 'mean symmetry', 'mean fractal dimension',
    'radius error', 'texture error', 'perimeter error', 'area error',
    'smoothness error', 'compactness error', 'concavity error',
    'concave points error', 'symmetry error', 'fractal dimension error',
    'worst radius', 'worst texture', 'worst perimeter', 'worst area',
    'worst smoothness', 'worst compactness', 'worst concavity',
    'worst concave points', 'worst symmetry', 'worst fractal dimension',
]

# Wrap the raw feature matrix in a DataFrame with the labels above.
fulldata = pd.DataFrame(breast_cancer.data, columns=feature_names)

# Print the shape of the dataframe (rows, columns).
print(fulldata.shape)
# Recorded output:
# >> (569, 30)
# Extract the breast cancer dataset's target labels.
target = breast_cancer.target

# Number of cross-validation folds.
NO_SPLITS = 5

# Create the KFold splitter. No `random_state` is passed: without
# `shuffle=True` the folds are contiguous and deterministic, and recent
# scikit-learn releases raise ValueError if a seed is given while shuffling
# is off. Use `KFold(n_splits=NO_SPLITS, shuffle=True, random_state=111)`
# instead if shuffled-but-reproducible folds are wanted.
kf = KFold(n_splits=NO_SPLITS)

# Create a Logistic Regression estimator (re-fitted from scratch on each fold).
logistic_regression = LogisticRegression()

# Running sum of the per-fold accuracy scores.
accuracy_score_sum = 0

# Split the data using the KFold object, train on each training fold, and
# score on the corresponding held-out fold.
for train_index, test_index in kf.split(fulldata):
    # Select train and test rows using the fold's positional indices.
    X_train, X_test = fulldata.iloc[train_index], fulldata.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # Train the Logistic Regression model on the training fold.
    logistic_regression.fit(X_train, y_train)

    # Compute the accuracy on the held-out fold and print it.
    accuracy_score = logistic_regression.score(X_test, y_test)
    print(accuracy_score)

    # Accumulate this fold's accuracy.
    accuracy_score_sum += accuracy_score

# Compute the mean accuracy over all folds and print it.
mean_accuracy = accuracy_score_sum / NO_SPLITS
print ("Final accuracy: ", mean_accuracy)

# Output recorded by the original author (exact values may differ between
# scikit-learn versions):
# >> 0.912280701754
# >> 0.947368421053
# >> 0.973684210526
# >> 0.973684210526
# >> 0.955752212389
# >> Final accuracy: 0.95255395125
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment