socratesk/KFold.py

## KFold.py
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

# Load the breast cancer dataset object bundled with scikit-learn
breast_cancer = datasets.load_breast_cancer()

# Column names used to label the 30 feature columns of breast_cancer.data
feature_names = ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
        'mean smoothness', 'mean compactness', 'mean concavity',
        'mean concave points', 'mean symmetry', 'mean fractal dimension',
        'radius error', 'texture error', 'perimeter error', 'area error',
        'smoothness error', 'compactness error', 'concavity error',
        'concave points error', 'symmetry error', 'fractal dimension error',
        'worst radius', 'worst texture', 'worst perimeter', 'worst area',
        'worst smoothness', 'worst compactness', 'worst concavity',
        'worst concave points', 'worst symmetry', 'worst fractal dimension']

# Build a dataframe from the feature matrix, labelled with feature_names
fulldata = pd.DataFrame(breast_cancer.data, columns=feature_names)

# Print the shape of the dataframe
print(fulldata.shape)
# >> (569, 30)

# Extract the breast cancer dataset's target labels
target = breast_cancer.target

# Set number of splits
NO_SPLITS = 5

# Create the KFold splitter with the requested number of splits.
# random_state is deliberately not passed: it only has an effect when
# shuffle=True, and recent scikit-learn releases raise a ValueError when it
# is set while shuffle is left at its default of False. Without shuffling the
# folds are consecutive slices of the data, so the split is already
# deterministic.
kf = KFold(n_splits=NO_SPLITS)

# Create a Logistic Regression object; the same object is re-fitted per fold
logistic_regression = LogisticRegression()

# Running sum of the per-fold accuracy scores
accuracy_score_sum = 0

# Split the data using the KFold object, fit the model on each training
# partition, and score it on the matching held-out partition
for train_index, test_index in kf.split(fulldata):

    # Select the train and test rows using the positional fold indices
    X_train, X_test = fulldata.iloc[train_index], fulldata.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # Train the Logistic Regression model on the training partition
    logistic_regression.fit(X_train, y_train)

    # Compute the accuracy score on the held-out partition and print it
    accuracy_score = logistic_regression.score(X_test, y_test)
    print(accuracy_score)

    # Accumulate the accuracy of each iteration
    accuracy_score_sum += accuracy_score

# Compute the mean accuracy score over the K folds and print it
mean_accuracy = accuracy_score_sum / NO_SPLITS
print("Final accuracy: ", mean_accuracy)

# Output recorded from a run of this script:
# >> 0.912280701754
# >> 0.947368421053
# >> 0.973684210526
# >> 0.973684210526
# >> 0.955752212389
# >> Final accuracy:  0.95255395125
	import pandas as pd

	from sklearn import datasets
	from sklearn.model_selection import KFold
	from sklearn.linear_model import LogisticRegression

	# Load the breast cancer dataset object bundled with scikit-learn
	breast_cancer = datasets.load_breast_cancer()

	# Column names used to label the 30 feature columns of breast_cancer.data
	feature_names = ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
		'mean smoothness', 'mean compactness', 'mean concavity',
		'mean concave points', 'mean symmetry', 'mean fractal dimension',
		'radius error', 'texture error', 'perimeter error', 'area error',
		'smoothness error', 'compactness error', 'concavity error',
		'concave points error', 'symmetry error', 'fractal dimension error',
		'worst radius', 'worst texture', 'worst perimeter', 'worst area',
		'worst smoothness', 'worst compactness', 'worst concavity',
		'worst concave points', 'worst symmetry', 'worst fractal dimension']

	# Build a dataframe from the feature matrix, labelled with feature_names
	fulldata = pd.DataFrame(breast_cancer.data, columns=feature_names)

	# Print the shape of the dataframe
	print(fulldata.shape)
	# >> (569, 30)

	# Extract the breast cancer dataset's target labels
	target = breast_cancer.target

	# Set number of splits
	NO_SPLITS = 5

	# Create the KFold splitter with the requested number of splits.
	# random_state is deliberately not passed: it only has an effect when
	# shuffle=True, and recent scikit-learn releases raise a ValueError when it
	# is set while shuffle is left at its default of False. Without shuffling the
	# folds are consecutive slices of the data, so the split is already
	# deterministic.
	kf = KFold(n_splits=NO_SPLITS)

	# Create a Logistic Regression object; the same object is re-fitted per fold
	logistic_regression = LogisticRegression()

	# Running sum of the per-fold accuracy scores
	accuracy_score_sum = 0

	# Split the data using the KFold object, fit the model on each training
	# partition, and score it on the matching held-out partition
	for train_index, test_index in kf.split(fulldata):

		# Select the train and test rows using the positional fold indices
		X_train, X_test = fulldata.iloc[train_index], fulldata.iloc[test_index]
		y_train, y_test = target[train_index], target[test_index]

		# Train the Logistic Regression model on the training partition
		logistic_regression.fit(X_train, y_train)

		# Compute the accuracy score on the held-out partition and print it
		accuracy_score = logistic_regression.score(X_test, y_test)
		print(accuracy_score)

		# Accumulate the accuracy of each iteration
		accuracy_score_sum += accuracy_score

	# Compute the mean accuracy score over the K folds and print it
	mean_accuracy = accuracy_score_sum / NO_SPLITS
	print("Final accuracy: ", mean_accuracy)

	# Output recorded from a run of this script:
	# >> 0.912280701754
	# >> 0.947368421053
	# >> 0.973684210526
	# >> 0.973684210526
	# >> 0.955752212389
	# >> Final accuracy:  0.95255395125