Created
August 21, 2018 00:08
-
-
Save mikebenfield/8441218aa49161500dbfd942adf7377b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import pathlib | |
from time import perf_counter | |
import numpy as np | |
import sklearn.datasets as datasets | |
from sklearn.metrics import accuracy_score | |
from sklearn.ensemble import ExtraTreesClassifier | |
def arg_generate(args):
    """Generate a synthetic classification problem and write it to disk.

    Reads ``directory``, ``train_size``, ``test_size``, ``n_features`` and
    ``n_classes`` from *args* and writes three space-delimited text files
    into ``directory``:

    * ``data.train`` -- header line, feature columns ``a0..aN`` and the
      true ``target`` column.
    * ``data.test`` -- same layout, but the ``target`` column is all zeros
      so the labels are not leaked to the model.
    * ``data.target`` -- the true test labels, one per line, no header.
    """
    directory = pathlib.Path(args.directory)
    # Create the output directory before the data is generated, so a missing
    # directory does not surface only when the first file is written.
    directory.mkdir(parents=True, exist_ok=True)

    X, y = datasets.make_classification(
        n_samples=args.train_size + args.test_size,
        n_features=args.n_features,
        n_informative=int(args.n_features * 0.8),
        n_redundant=int(args.n_features * 0.1),
        n_classes=args.n_classes,
    )

    # The first ``test_size`` samples form the test set, the rest the
    # training set.
    X_test = X[:args.test_size]
    X_train = X[args.test_size:]
    y_test = y[:args.test_size]
    y_train = y[args.test_size:]

    # Labels become a trailing column of the training matrix.
    train = np.concatenate([X_train, y_train[:, np.newaxis]], axis=1)
    # The test file keeps the same column layout with a placeholder target.
    zeros = np.zeros((len(y_test), 1))
    test = np.concatenate([X_test, zeros], axis=1)

    test_path = directory / 'data.test'
    train_path = directory / 'data.train'
    result_path = directory / 'data.target'

    # Features are written as floats, the target column as an integer.
    fmt = ['%f'] * args.n_features + ['%d']
    header = ' '.join(
        ['a' + str(i) for i in range(args.n_features)] + ['target'])

    # comments='' writes the header without a leading comment marker.
    np.savetxt(
        test_path, test, delimiter=' ', fmt=fmt, header=header, comments='')
    np.savetxt(result_path, y_test, delimiter=' ', fmt='%d')
    np.savetxt(
        train_path, train, delimiter=' ', fmt=fmt, header=header, comments='')
def train_predict(args, model, prediction_f):
    """Fit *model* on ``data.train``, predict ``data.test`` and save the result.

    Args:
        args: namespace providing ``directory`` (where ``data.train`` and
            ``data.test`` are read from) and ``prediction_file`` (where the
            predictions are written).
        model: object exposing ``fit(X, y)``.
        prediction_f: callable ``(model, X_test) -> predictions`` used to
            obtain the predictions from the fitted model.

    Both input files are expected to start with one header line, which is
    skipped; the last column is the target and every other column is a
    feature. The elapsed time of each phase (parse, train, parse, predict)
    is printed. Predictions are written as integers under a ``Pred`` header.
    """
    directory = args.directory
    prediction_file = args.prediction_file
    test_path = pathlib.Path(directory, 'data.test')
    train_path = pathlib.Path(directory, 'data.train')

    time_1 = perf_counter()
    # ndmin=2 keeps a one-row file two-dimensional; without it loadtxt
    # returns a 1-D array and the column slicing below raises IndexError.
    train = np.loadtxt(train_path, skiprows=1, ndmin=2)
    X_train = train[:, :-1]
    y_train = train[:, -1]
    time_2 = perf_counter()
    print('{} seconds to parse training data'.format(time_2 - time_1))

    model.fit(X_train, y_train)
    time_3 = perf_counter()
    print('{} seconds to train'.format(time_3 - time_2))

    # Drop the trailing target column of the test file.
    X_test = np.loadtxt(test_path, skiprows=1, ndmin=2)[:, :-1]
    time_4 = perf_counter()
    print('{} seconds to parse testing data'.format(time_4 - time_3))

    y_pred = prediction_f(model, X_test)
    time_5 = perf_counter()
    print('{} seconds to predict'.format(time_5 - time_4))

    np.savetxt(prediction_file, y_pred, delimiter=' ', header='Pred', fmt='%d')
def arg_prob_train_predict(args):
    """Run the train/predict pipeline with an ExtraTreesClassifier.

    The classifier is configured from ``tree_count``, ``split_tries``,
    ``min_samples_split`` and ``thread_count`` on *args*; predictions are
    obtained through the fitted model's ``predict`` method.
    """
    forest_options = dict(
        n_estimators=args.tree_count,
        max_features=args.split_tries,
        min_samples_split=args.min_samples_split,
        n_jobs=args.thread_count,
    )
    forest = ExtraTreesClassifier(**forest_options)

    def predict_labels(fitted, features):
        # Plain label prediction on the fitted model.
        return fitted.predict(features)

    train_predict(args, forest, predict_labels)
def arg_evaluate(args):
    """Print the accuracy of a prediction file against the true labels.

    ``args.target`` names the file holding the true labels (no header) and
    ``args.prediction`` the file holding the predictions, whose first line
    is skipped.
    """
    truth_path, guess_path = args.target, args.prediction
    expected = np.loadtxt(truth_path)
    predicted = np.loadtxt(guess_path, skiprows=1)
    print('accuracy: {}'.format(accuracy_score(expected, predicted)))
def add_train_predict_args(parser):
    """Register the mandatory train/predict options on *parser*."""
    # Location options are kept as plain strings.
    for flag in ('--directory', '--prediction_file'):
        parser.add_argument(flag, required=True)

    # Model and runtime settings are all integers.
    integer_flags = (
        '--tree_count',
        '--thread_count',
        '--min_samples_split',
        '--split_tries',
    )
    for flag in integer_flags:
        parser.add_argument(flag, type=int, required=True)
def add_generate_args(parser):
    """Register the data-generation options on *parser*.

    ``--directory`` and ``--n_features`` are mandatory; the two set sizes
    are optional integers with defaults.
    """
    parser.add_argument('--directory', required=True)

    # (flag, default, help text) for each optional size option.
    size_options = (
        ('--train_size', 40000, 'Number of samples in the training set'),
        ('--test_size', 10000, 'Number of samples in the test set'),
    )
    for flag, default_size, description in size_options:
        parser.add_argument(
            flag, type=int, default=default_size, help=description)

    parser.add_argument('--n_features', type=int, required=True)
if __name__ == '__main__':
    # Command-line entry point. Three sub-commands are available:
    #   generate            -- write a synthetic data set to a directory
    #   prob_train_predict  -- fit an ExtraTreesClassifier and save predictions
    #   evaluate            -- print the accuracy of a prediction file
    parser = argparse.ArgumentParser()

    # A sub-command is mandatory. If it were optional, running the script
    # with no sub-command would leave ``func`` unset on the namespace and
    # the dispatch below would fail with an AttributeError instead of a
    # usage message. ``required`` is set as an attribute (together with
    # ``dest``) so this also works on Python versions whose
    # ``add_subparsers`` lacks the ``required`` keyword.
    subparsers = parser.add_subparsers(dest='command')
    subparsers.required = True

    prob_generate_parser = subparsers.add_parser('generate')
    add_generate_args(prob_generate_parser)
    prob_generate_parser.add_argument('--n_classes', type=int, required=True)
    prob_generate_parser.set_defaults(func=arg_generate)

    prob_train_predict_parser = subparsers.add_parser('prob_train_predict')
    add_train_predict_args(prob_train_predict_parser)
    prob_train_predict_parser.set_defaults(func=arg_prob_train_predict)

    evaluate_parser = subparsers.add_parser('evaluate')
    evaluate_parser.add_argument('--target', required=True)
    evaluate_parser.add_argument('--prediction', required=True)
    evaluate_parser.set_defaults(func=arg_evaluate)

    # Each sub-parser stored its handler under ``func``; dispatch to it.
    args = parser.parse_args()
    args.func(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment