Last active Dec 30, 2015
Snippet for reading in a table of numbers and predicting the last column as a function of the others, using a constant, plain linear regression, or linear regression regularised with the elastic net. Uses NumPy and scikit-learn.
 #!/usr/bin/env python
"""Predict the last column of a numeric table from the other columns.

Three baselines are fitted and their train/test errors printed: a constant
(the mean of the training targets), ordinary linear regression, and linear
regression regularised with the elastic net.

Usage:
    script.py DATA_FILE              # shuffle, then split 50/50 train/test
    script.py TRAIN_FILE TEST_FILE   # use the given train/test files as-is
"""

from __future__ import print_function

import sys

import numpy as np
from sklearn.linear_model import ElasticNet, LinearRegression

# James McDermott (c) 2013
# Hosted at https://gist.github.com/jmmcd/7790588
# Requires Numpy and Scikit-learn


def mae(y, yhat):
    """Calculate mean absolute error between inputs."""
    # asarray lets callers pass plain lists as well as arrays.
    return np.mean(np.abs(np.asarray(y) - np.asarray(yhat)))


def rmse(y, yhat):
    """Calculate root mean square error between inputs."""
    return np.sqrt(np.mean(np.square(np.asarray(y) - np.asarray(yhat))))


# Error measure used by the fit_* functions. It is looked up at call time,
# so it can be rebound (e.g. to `mae`) to change what gets reported. Binding
# it at module level keeps fit_* usable when this file is imported rather
# than run as a script.
error = rmse


def _split_Xy(Xy):
    """Split a 2-D table into X (all columns but last) and y (last column)."""
    return Xy[:, :-1], Xy[:, -1]


def _print_errors(train_y, train_yhat, test_y, test_yhat):
    """Print the train and test error under the module-level `error`."""
    print("Train error =", error(train_y, train_yhat))
    print("Test error =", error(test_y, test_yhat))


def get_Xy_train_test(filename, randomise=True, test_proportion=0.5,
                      skip_header=0):
    """Read a table of numbers and split it into train and test subsets.

    X is all columns up to the last and y is the last column. If
    `randomise` is true the rows are shuffled (using NumPy's global random
    state) before splitting. The first ``int((1 - test_proportion) * rows)``
    rows form the training set and the rest form the test set.

    `skip_header` is passed through to np.genfromtxt.

    Returns the tuple (train_X, train_y, test_X, test_y).

    Raises ValueError if `test_proportion` is outside [0, 1].
    """
    if not 0.0 <= test_proportion <= 1.0:
        raise ValueError("test_proportion must be between 0 and 1, got %r"
                         % (test_proportion,))
    Xy = np.genfromtxt(filename, skip_header=skip_header)
    if randomise:
        np.random.shuffle(Xy)  # shuffles rows in place
    X, y = _split_Xy(Xy)
    idx = int((1.0 - test_proportion) * len(y))
    return X[:idx], y[:idx], X[idx:], y[idx:]


def get_Xy_train_test_separate(train_filename, test_filename, skip_header=0):
    """Read separate training and testing data files.

    Each file is split into X (all columns up to the last) and y (last
    column). `skip_header` is passed through to np.genfromtxt for both
    files.

    Returns the tuple (train_X, train_y, test_X, test_y).
    """
    train_X, train_y = _split_Xy(
        np.genfromtxt(train_filename, skip_header=skip_header))
    test_X, test_y = _split_Xy(
        np.genfromtxt(test_filename, skip_header=skip_header))
    return train_X, train_y, test_X, test_y


def fit_const(train_X, train_y, test_X, test_y):
    """Use the mean of the y training values as a predictor.

    Prints the constant and its train/test errors. train_X and test_X are
    accepted only so the signature matches the other fit_* functions.
    """
    mn = np.mean(train_y)
    print("Predicting constant", mn)
    _print_errors(train_y, np.full(len(train_y), mn),
                  test_y, np.full(len(test_y), mn))


def fit_lr(train_X, train_y, test_X, test_y):
    """Use linear regression to predict.

    Fits on the training data, then prints the fitted intercept and
    coefficients followed by the train/test errors.
    """
    lr = LinearRegression()
    lr.fit(train_X, train_y)
    print("LR predicting intercept", lr.intercept_, "and coefs", lr.coef_)
    _print_errors(train_y, lr.predict(train_X), test_y, lr.predict(test_X))


def fit_enet(train_X, train_y, test_X, test_y):
    """Use linear regression to predict -- elastic net is LR with L1 and
    L2 regularisation.

    Fits an ElasticNet with its default settings on the training data, then
    prints the fitted intercept and coefficients followed by the train/test
    errors.
    """
    enet = ElasticNet()
    enet.fit(train_X, train_y)
    print("ElasticNet predicting intercept", enet.intercept_,
          "and coefs", enet.coef_)
    _print_errors(train_y, enet.predict(train_X),
                  test_y, enet.predict(test_X))


def main(argv=None):
    """Command-line entry point.

    `argv` defaults to sys.argv. With exactly two file arguments they are
    read as separate train and test files; otherwise the first file
    argument is shuffled and split in half. Exits with a usage message if
    no file is given.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        sys.exit("usage: %s DATA_FILE | TRAIN_FILE TEST_FILE"
                 % (args[0] if args else "script",))
    if len(args) == 3:
        data = get_Xy_train_test_separate(args[1], args[2])
    else:
        data = get_Xy_train_test(args[1])
    fit_const(*data)
    fit_lr(*data)
    fit_enet(*data)


if __name__ == "__main__":
    main()