Skip to content

Instantly share code, notes, and snippets.

@mikebenfield
Created August 21, 2018 00:08
Show Gist options
  • Save mikebenfield/8441218aa49161500dbfd942adf7377b to your computer and use it in GitHub Desktop.
Save mikebenfield/8441218aa49161500dbfd942adf7377b to your computer and use it in GitHub Desktop.
import argparse
import pathlib
from time import perf_counter
import numpy as np
import sklearn.datasets as datasets
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
def arg_generate(args):
    """Generate a synthetic classification data set and write it to disk.

    The samples come from ``sklearn.datasets.make_classification``; the first
    ``args.test_size`` rows become the test set and the remaining rows the
    training set.  Three space-delimited text files are written into
    ``args.directory``:

    * ``data.train`` -- features plus the true label in a ``target`` column,
      preceded by a header line ``a0 a1 ... target``.
    * ``data.test`` -- same layout, but the ``target`` column is all zeros.
    * ``data.target`` -- the true test labels, one per line, no header.

    Args:
        args: Namespace providing ``directory``, ``train_size``,
            ``test_size``, ``n_features`` and ``n_classes``.
    """
    directory = pathlib.Path(args.directory)
    # np.savetxt does not create missing folders, so make sure the output
    # directory exists before writing anything.
    directory.mkdir(parents=True, exist_ok=True)

    X, y = datasets.make_classification(
        n_samples=args.train_size + args.test_size,
        n_features=args.n_features,
        n_informative=int(args.n_features * 0.8),
        n_redundant=int(args.n_features * 0.1),
        n_classes=args.n_classes,
    )

    X_test, X_train = X[:args.test_size], X[args.test_size:]
    y_test, y_train = y[:args.test_size], y[args.test_size:]

    # Labels are appended as the last column of the training matrix.
    train = np.concatenate([X_train, y_train[:, np.newaxis]], axis=1)
    # The test file keeps the same column layout, with a zero placeholder
    # where the label would be; the real labels go to data.target instead.
    placeholder = np.zeros((len(y_test), 1))
    test = np.concatenate([X_test, placeholder], axis=1)

    test_path = directory / 'data.test'
    train_path = directory / 'data.train'
    result_path = directory / 'data.target'

    # Features are written as floats, the trailing target column as an int.
    fmt = ['%f'] * args.n_features + ['%d']
    header = ' '.join(
        ['a' + str(i) for i in range(args.n_features)] + ['target'])

    np.savetxt(
        test_path, test, delimiter=' ', fmt=fmt, header=header, comments='')
    np.savetxt(result_path, y_test, delimiter=' ', fmt='%d')
    np.savetxt(
        train_path, train, delimiter=' ', fmt=fmt, header=header, comments='')
def train_predict(args, model, prediction_f):
    """Fit *model* on ``data.train``, predict ``data.test`` and save the output.

    Both data files are read from ``args.directory``.  Their first line is
    skipped as a header, and the last column is treated as the target (used
    as the label for training, discarded for testing).  The elapsed time of
    each phase (parsing, training, parsing test data, predicting) is printed.

    Args:
        args: Namespace providing ``directory`` and ``prediction_file``.
        model: Object exposing ``fit(X, y)``.
        prediction_f: Callable taking ``(model, X_test)`` and returning the
            predictions to store.

    The predictions are written to ``args.prediction_file`` with ``%d``
    formatting under a ``Pred`` header line.
    """
    directory = args.directory
    prediction_file = args.prediction_file
    test_path = pathlib.Path(directory, 'data.test')
    train_path = pathlib.Path(directory, 'data.train')

    time_1 = perf_counter()
    # ndmin=2 keeps the array two-dimensional even when the file holds a
    # single sample; without it the column slicing below raises IndexError.
    train = np.loadtxt(train_path, skiprows=1, ndmin=2)
    X_train = train[:, :-1]
    y_train = train[:, -1]
    time_2 = perf_counter()
    print('{} seconds to parse training data'.format(time_2 - time_1))

    model.fit(X_train, y_train)
    time_3 = perf_counter()
    print('{} seconds to train'.format(time_3 - time_2))

    # Drop the trailing target column of the test file.
    X_test = np.loadtxt(test_path, skiprows=1, ndmin=2)[:, :-1]
    time_4 = perf_counter()
    print('{} seconds to parse testing data'.format(time_4 - time_3))

    y_pred = prediction_f(model, X_test)
    time_5 = perf_counter()
    print('{} seconds to predict'.format(time_5 - time_4))

    np.savetxt(prediction_file, y_pred, delimiter=' ', header='Pred', fmt='%d')
def arg_prob_train_predict(args):
    """Train an ExtraTreesClassifier and store its ``predict`` output.

    The forest is configured from ``args.tree_count``, ``args.split_tries``,
    ``args.min_samples_split`` and ``args.thread_count``; loading the data,
    timing and writing the predictions are delegated to ``train_predict``.
    """
    forest_options = {
        'n_estimators': args.tree_count,
        'max_features': args.split_tries,
        'min_samples_split': args.min_samples_split,
        'n_jobs': args.thread_count,
    }
    forest = ExtraTreesClassifier(**forest_options)

    def predict_labels(fitted, features):
        # Prediction callback handed to train_predict.
        return fitted.predict(features)

    train_predict(args, forest, predict_labels)
def arg_evaluate(args):
    """Print the accuracy of a prediction file against the true labels.

    Args:
        args: Namespace providing ``target`` (path to the label file, read
            in full) and ``prediction`` (path to the prediction file, whose
            first line is skipped as a header).
    """
    # ndmin=1 keeps single-sample files as one-element arrays; np.loadtxt
    # would otherwise squeeze them to 0-d scalars.
    y_target = np.loadtxt(args.target, ndmin=1)
    y_pred = np.loadtxt(args.prediction, skiprows=1, ndmin=1)
    accuracy = accuracy_score(y_target, y_pred)
    print('accuracy: {}'.format(accuracy))
def add_train_predict_args(parser):
    """Register the mandatory options of the train/predict command.

    Adds two string options (``--directory``, ``--prediction_file``) followed
    by four integer options (``--tree_count``, ``--thread_count``,
    ``--min_samples_split``, ``--split_tries``); all of them are required.
    """
    text_flags = ('--directory', '--prediction_file')
    integer_flags = (
        '--tree_count',
        '--thread_count',
        '--min_samples_split',
        '--split_tries',
    )
    for flag in text_flags:
        parser.add_argument(flag, required=True)
    for flag in integer_flags:
        parser.add_argument(flag, type=int, required=True)
def add_generate_args(parser):
    """Register the options shared by data-generation commands.

    ``--directory`` and ``--n_features`` are required; ``--train_size`` and
    ``--test_size`` are optional integers defaulting to 40000 and 10000.
    """
    parser.add_argument('--directory', required=True)

    # (flag, default sample count, help text) for the two split sizes.
    split_sizes = (
        ('--train_size', 40000, 'Number of samples in the training set'),
        ('--test_size', 10000, 'Number of samples in the test set'),
    )
    for flag, default_count, description in split_sizes:
        parser.add_argument(
            flag, type=int, default=default_count, help=description)

    parser.add_argument('--n_features', type=int, required=True)
if __name__ == '__main__':
    # Command-line entry point.  Three sub-commands are available:
    #   generate            -- write a synthetic data set to a directory
    #   prob_train_predict  -- train the classifier and save its predictions
    #   evaluate            -- print the accuracy of a prediction file
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    prob_generate_parser = subparsers.add_parser('generate')
    add_generate_args(prob_generate_parser)
    prob_generate_parser.add_argument('--n_classes', type=int, required=True)
    prob_generate_parser.set_defaults(func=arg_generate)

    prob_train_predict_parser = subparsers.add_parser('prob_train_predict')
    add_train_predict_args(prob_train_predict_parser)
    prob_train_predict_parser.set_defaults(func=arg_prob_train_predict)

    evaluate_parser = subparsers.add_parser('evaluate')
    evaluate_parser.add_argument('--target', required=True)
    evaluate_parser.add_argument('--prediction', required=True)
    evaluate_parser.set_defaults(func=arg_evaluate)

    args = parser.parse_args()
    # Sub-commands are optional by default on Python 3, so invoking the
    # script without one leaves ``func`` unset.  Report a usage error
    # instead of crashing with AttributeError.
    if not hasattr(args, 'func'):
        parser.error(
            'a sub-command is required: '
            'generate, prob_train_predict or evaluate')
    args.func(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment