Regression
"""
There are four relationships we are interested in modeling:
1. The amount charged for room and board, expressed as a function of the number of accepted students.
2. The number of enrolled students per college, expressed as a function of the number of accepted students.
3. The number of failed undergraduate students per college, expressed as a function of the number of accepted students.
4. The amount charged for room and board coupled with the number of enrolled students, expressed as a function of the number of accepted students.
"""
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import linear_model
from sklearn.model_selection import train_test_split
matplotlib.style.use('ggplot') # Look Pretty
def drawLine(model, X_test, y_test, title, R2):
    # This convenience method will take care of plotting the
    # test observations, comparing them to the regression line,
    # and displaying the R2 coefficient
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X_test, y_test, c='g', marker='o')
    ax.plot(X_test, model.predict(X_test), color='orange', linewidth=1, alpha=0.7)
    title += " R2: " + str(R2)
    ax.set_title(title)
    print(title)
    print("Intercept(s): ", model.intercept_)
    plt.show()
def drawPlane(model, X_test, y_test, title, R2):
    # This convenience method will take care of plotting the
    # test observations, comparing them to the regression plane,
    # and displaying the R2 coefficient
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.set_zlabel('prediction')
    # You might have passed in a DataFrame, a Series (slice),
    # an NDArray, or a Python List... so let's keep it simple:
    X_test = np.array(X_test)
    col1 = X_test[:, 0]
    col2 = X_test[:, 1]
    # Set up a Grid. We could have predicted on the actual
    # col1, col2 values directly; but that would have generated
    # a mesh with WAY too fine a grid, which would have detracted
    # from the visualization
    x_min, x_max = col1.min(), col1.max()
    y_min, y_max = col2.min(), col2.max()
    x = np.arange(x_min, x_max, (x_max - x_min) / 10)
    y = np.arange(y_min, y_max, (y_max - y_min) / 10)
    x, y = np.meshgrid(x, y)
    # Predict based on possible input values that span the domain
    # of the x and y inputs:
    z = model.predict(np.c_[x.ravel(), y.ravel()])
    z = z.reshape(x.shape)
    ax.scatter(col1, col2, y_test, c='g', marker='o')
    ax.plot_wireframe(x, y, z, color='orange', alpha=0.7)
    title += " R2: " + str(R2)
    ax.set_title(title)
    print(title)
    print("Intercept(s): ", model.intercept_)
    plt.show()
#
# INFO: Let's get started!
# The first column is both unique (the name of each college)
# and unlabeled. This is a HINT that it must be the
# index column. If you do not indicate to Pandas that you already
# have an index column, it'll create one for you, which would be
# undesirable since you already have one.
#
# load up the College dataset into a variable
# called X:
#
X = pd.read_csv("Datasets/college.csv", index_col=0)
#
# The .map() method is like .apply(), but instead of taking in a
# lambda / function, you simply provide a mapping of keys:values.
X.Private = X.Private.map({'Yes':1, 'No':0})
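# For example, on a throwaway Series (hypothetical values, just to show the
# keys:values mapping):
#   pd.Series(['Yes', 'No', 'Yes']).map({'Yes': 1, 'No': 0})  ->  1, 0, 1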
#
# : Create the linear regression model
#
model = linear_model.LinearRegression()
#
# INFO: The first relationship we're interested in is the
# number of accepted students, as a function of the amount
# charged for room and board.
#
# : Using indexing, create two slices (series). One will just
# store the room and board column, the other will store the accepted
# students column. Then use train_test_split to cut the data up
# into X_train, X_test, y_train, y_test, with a test_size of 30% and
# a random_state of 7.
#
X_rb = X['Room.Board'] # series
y = X['Accept']
X_train, X_test, y_train, y_test = train_test_split(X_rb, y,
                                                    test_size=0.3, random_state=7)
#
# : Fit and score the model appropriately.
#
# fit(), score() and predict() expect 2D arrays
model.fit(X_train.values.reshape(-1, 1), y_train)
X_test = X_test.values.reshape(-1, 1)
score = model.score(X_test, y_test)
drawLine(model, X_test, y_test, "Accept(Room&Board)", score)
#
# : Duplicate the process above; this time, model the number of
# accepted students, as a function of the number of enrolled students
# per college.
#
X_en = X['Enroll'] # series
X_train, X_test, y_train, y_test = train_test_split(X_en, y,
                                                    test_size=0.3, random_state=7)
model.fit(X_train.values.reshape(-1, 1), y_train)
X_test = X_test.values.reshape(-1, 1)
score = model.score(X_test, y_test)
drawLine(model, X_test, y_test, "Accept(Enroll)", score)
#
# : Duplicate the process above; this time, model the number of
# accepted students, as a function of the number of failed undergraduate
# students per college.
#
X_fu = X['F.Undergrad'] # series
X_train, X_test, y_train, y_test = train_test_split(X_fu, y,
                                                    test_size=0.3, random_state=7)
model.fit(X_train.values.reshape(-1, 1), y_train)
X_test = X_test.values.reshape(-1, 1)
score = model.score(X_test, y_test)
drawLine(model, X_test, y_test, "Accept(F.Undergrad)", score)
#
# : Duplicate the process above (almost). This time it's going to be
# a bit more complicated. Instead of modeling one feature as a function
# of another, we will attempt multivariate linear regression:
# modeling one feature as a function of TWO other features.
#
# Model the amount charged for room and board AND the number of enrolled
# students, as a function of the number of accepted students. To do
# this, instead of creating a regular slice for a single-feature input,
# simply create a slice that contains both columns you wish to use as
# inputs. Your training labels will remain a single slice.
#
X_rb_en = X[['Room.Board', 'Enroll']] # data frame
X_train, X_test, y_train, y_test = train_test_split(X_rb_en, y,
                                                    test_size=0.3, random_state=7)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
drawPlane(model, X_test, y_test, "Accept(Room&Board,Enroll)", score)
# INFO + HINT On Fitting, Scoring, and Predicting:
# When you use .fit(), .score(), and .predict() on
# your model, SciKit-Learn expects your training data to be in
# spreadsheet (2D Array-Like) form. This means you can't simply
# pass in a 1D Array (slice) and get away with it.
#
# To properly prep your data, you have to pass in a 2D Numpy Array,
# or a dataframe. But what happens if you really only want to pass
# in a single feature?
#
# If you slice your dataframe using df[['ColumnName']] syntax, the
# result that comes back is actually a *dataframe*. Go ahead and do
# a type() on it to check it out. Since it's already a dataframe,
# you're good -- no further changes needed.
#
# But if you slice your dataframe using the df.ColumnName syntax,
# OR if you call df['ColumnName'], the result that comes back is
# actually a series (1D Array)! This will cause SKLearn to bug out.
# So if you are slicing using either of those two techniques, before
# sending your training or testing data to .fit / .score, do a
# my_column = my_column.values.reshape(-1,1). This will convert your 1D
# array of [n_samples] to a 2D array shaped like [n_samples, 1].
# A single feature, with many samples.
#
# If you did something like my_column = [my_column], that would produce
# an array in the shape of [1, n_samples], which is incorrect because
# SKLearn expects your data to be arranged as [n_samples, n_features].
# Keep in mind, all of the above only relates to your "X" or input
# data, and does not apply to your "y" or labels.
#
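# A quick demonstration of the above, using this dataset's own
# 'Room.Board' column:
print(type(X[['Room.Board']]))  # <class 'pandas.core.frame.DataFrame'> -- 2D, fine as-is
print(type(X['Room.Board']))    # <class 'pandas.core.series.Series'>   -- 1D, needs reshaping
print(X['Room.Board'].values.reshape(-1, 1).shape)  # (n_samples, 1)
#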
# Extra
# ========================
#
#
# What happens if you apply scaling to your data before doing
# linear regression? Would it alter the quality of your results?
# Do the scalers that work on a per-feature basis, such as MinMaxScaler,
# behave differently than those that work on a multi-feature basis, such
# as normalize? And moreover, once your features have been scaled, you
# won't be able to use the resulting regression directly... unless you're
# able to .inverse_transform() the scaling. Do all of the SciKit-Learn
# scalers support that?
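#
# A minimal sketch of that experiment (an illustration, not part of the
# original exercise), reusing the Room.Board/Enroll split from above:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit the scaler on training data only...
X_test_scaled = scaler.transform(X_test)        # ...then apply that same scaling to test
model.fit(X_train_scaled, y_train)
print("R2 with MinMaxScaler: ", model.score(X_test_scaled, y_test))
# MinMaxScaler does support .inverse_transform(). Not every preprocessing
# tool does: normalize() is a plain function, and its transformer
# counterpart, Normalizer, has no inverse_transform (the per-sample norms
# are discarded), so not all SciKit-Learn scalers can be inverted.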
#
"""
Using linear regression, extrapolate how long people will live in the future.
The dataset "Life expectancy at birth, at age 65, and at age 75, by sex, race, and origin" is provided courtesy of the Centers for Disease Control and Prevention's National Center for Health Statistics; page: http://www.cdc.gov/nchs/data_access/ftp_data.htm
"""
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn import linear_model
matplotlib.style.use('ggplot') # Look Pretty
def drawLine(model, X_test, y_test, title):
    # This convenience method will take care of plotting the
    # test observations, comparing them to the regression line,
    # and displaying the R2 coefficient
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X_test, y_test, c='g', marker='o')
    ax.plot(X_test, model.predict(X_test), color='orange', linewidth=1, alpha=0.7)
    print("Est 2014 " + title + " Life Expectancy: ", model.predict([[2014]])[0])
    print("Est 2030 " + title + " Life Expectancy: ", model.predict([[2030]])[0])
    print("Est 2045 " + title + " Life Expectancy: ", model.predict([[2045]])[0])
    score = model.score(X_test, y_test)
    title += " R2: " + str(score)
    ax.set_title(title)
    plt.show()
#
# : Load up the data into a variable called 'X'.
#
X = pd.read_csv("Datasets/life_expectancy.csv", sep='\t')
#
# : Create the linear regression model
#
model = linear_model.LinearRegression()
#
# : Slice out the data manually (i.e., without using train_test_split).
# Set X_train to be the year values
# LESS than 1986, and y_train to be the corresponding WhiteMale age values.
#
# INFO You might also want to read the note about slicing on the bottom
# of this document.
#
X_train = X.loc[X['Year']<1986, ['Year']] # data frame
y_trainWM = X[X['Year'] < 1986].WhiteMale # series
#
# : Train the model then pass it into drawLine with the training
# set and labels. drawLine will output
# to the console a 2014 extrapolation / approximation for what it
# believes the WhiteMale's life expectancy in the U.S. will be...
# given the pre-1986 data you trained it with. It'll also produce a
# 2030 and 2045 extrapolation.
#
model.fit(X_train, y_trainWM)
drawLine(model, X_train, y_trainWM, "WhiteMale")
#
# : Print the actual 2014 WhiteMale life expectancy
print("Actual 2014 WhiteMale Life Expectancy = ",
X[X.Year == 2014].WhiteMale.iloc[0])
#
# : Repeat the process, but instead of for WhiteMale, this time
# select BlackFemale. Create a slice for BlackFemales, fit the
# model, and then call drawLine. Lastly, print out the actual 2014
# BlackFemale life expectancy
#
y_trainBF = X[X['Year'] < 1986].BlackFemale # series
model.fit(X_train, y_trainBF)
drawLine(model, X_train, y_trainBF, "BlackFemale")
print("Actual 2014 BlackFemale Life Expectancy = ",
X[X.Year == 2014].BlackFemale.iloc[0])
#
# : Lastly, print out a correlation matrix for the entire
# dataset, and display a visualization of the correlation
# matrix
#
print("Correlation matrix: ")
print(X.corr())
fig, ax = plt.subplots()
im = ax.imshow(X.corr(), cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar(im)
tick_marks = range(len(X.columns))
plt.xticks(tick_marks, X.columns, rotation='vertical')
plt.yticks(tick_marks, X.columns)
ax.set_title("Correlation matrix")
plt.show()
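# Note (an observation, not part of the original exercise): Year increases
# monotonically while the life-expectancy series all trend upward over the
# period, so Year correlates strongly with every column and dominates the
# matrix. One variant worth trying is correlating only the demographic series:
#   print(X.drop('Year', axis=1).corr())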
#
# INFO On Fitting, Scoring, and Predicting:
#
# When you use .fit(), .score(), and .predict() on
# your model, SciKit-Learn expects your training data to be in
# spreadsheet (2D Array-Like) form. This means you can't simply
# pass in a 1D Array (slice) and get away with it.
#
# To properly prep your data, you have to pass in a 2D Numpy Array,
# or a dataframe. But what happens if you really only want to pass
# in a single feature?
#
# If you slice your dataframe using df[['ColumnName']] syntax, the
# result that comes back is actually a *dataframe*. Go ahead and do
# a type() on it to check it out. Since it's already a dataframe,
# you're good -- no further changes needed.
#
# But if you slice your dataframe using the df.ColumnName syntax,
# OR if you call df['ColumnName'], the result that comes back is
# actually a series (1D Array)! This will cause SKLearn to bug out.
# So if you are slicing using either of those two techniques, before
# sending your training or testing data to .fit / .score, do a
# my_column = my_column.values.reshape(-1,1). This will convert your 1D
# array of [n_samples] to a 2D array shaped like [n_samples, 1].
# A single feature, with many samples.
#
# If you did something like my_column = [my_column], that would produce
# an array in the shape of [1, n_samples], which is incorrect because
# SKLearn expects your data to be arranged as [n_samples, n_features].
# Keep in mind, all of the above only relates to your "X" or input
# data, and does not apply to your "y" or labels.
Examples of regression models for prediction
"""
Use linear regression to recover, or 'fill out', a completely deleted portion of an audio file!
This uses the FSDD (Free Spoken Digit Dataset), an audio dataset put together by Zohar Jackson:
cleaned-up audio samples (no dead space, roughly the same length, same bitrate, same samples-per-second rate, etc.) ready for machine learning.
"""
import numpy as np
import pandas as pd
import os
from sklearn import linear_model
import scipy.io.wavfile as wavfile
#
# INFO:
# Samples = Observations. Each audio file is a single sample
# in the dataset.
#
# Audio Samples = https://en.wikipedia.org/wiki/Sampling_(signal_processing)
# Each .wav file is actually just a bunch of numeric samples, "sampled"
# from the analog signal. Sampling is a type of discretization. When we
# mention 'samples', we mean observations. When we mention 'audio samples',
# we mean the actual "features" of the audio file.
#
#
# The goal of this gist is to use multi-target linear regression to generate,
# by extrapolation, the missing portion of the test audio file.
#
# Each audio_sample feature will be the output of an equation,
# which is a function of the provided portion of the audio_samples:
#
# missing_samples = f(provided_samples)
#
# You can experiment with how much of the audio you want to chop off
# and have the computer generate using the Provided_Portion parameter.
#
# Play with this. This is how much of the audio file will
# be provided, as a fraction. The remaining portion of the file will
# be generated via linear extrapolation.
Provided_Portion = 0.25
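# A toy sanity check (hypothetical numbers, not from the dataset) that
# sklearn's LinearRegression handles a 2D, multi-target y -- it fits one set
# of coefficients per output column:
#   X_toy = np.array([[1.], [2.], [3.]])                  # "provided" features
#   y_toy = np.array([[2., 10.], [4., 20.], [6., 30.]])   # "missing" features
#   linear_model.LinearRegression().fit(X_toy, y_toy).predict([[4.]])
#   # -> array([[ 8., 40.]])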
# INFO: You have to download the dataset (audio files) from the website:
# https://github.com/Jakobovski/free-spoken-digit-dataset
#
# Create a regular ol' Python List called 'zero'
# Loop through the dataset and load up all 50 of the 0_jackson*.wav files
# For each audio file, simply append the audio data (not the sample_rate,
# just the data!) to the Python list 'zero':
#
zero = []
directory = "Datasets/free-spoken-digit-dataset-master/recordings/"
for fname in os.listdir(directory):
    if fname.startswith("0_jackson"):
        fullname = os.path.join(directory, fname)
        sample_rate, data = wavfile.read(fullname)
        zero.append(data)
#
# Convert zero into a DataFrame and set the dtype to
# np.int16, since the input audio files are 16
# bits per sample. This is important: otherwise the produced audio samples
# would be encoded as 64 bits per sample and would not play back correctly.
zeroDF = pd.DataFrame(zero, dtype=np.int16)
#
# Since these audio clips are unfortunately not length-normalized,
# we're going to have to just hard chop them to all be the same length.
# Since Pandas would have inserted NaNs at any spot to make zero a
# perfectly rectangular [n_observed_samples, n_audio_samples] array,
# do a dropna on the Y axis here. Then, convert zero back into an
# NDArray using .values
#
if zeroDF.isnull().values.any():
    print("Preprocessing data: dropping all NaN")
    zeroDF.dropna(axis=1, inplace=True)
else:
    print("Preprocessing data: No NaN found!")
zero = zeroDF.values  # now a 2D NumPy array, shaped [n_samples, n_audio_samples]
#
# It's important to know how long (in audio samples) the
# data is now. 'zero' is currently shaped [n_samples, n_audio_samples]
#
n_audio_samples = zero.shape[1]
#
# Create the linear regression model
#
model = linear_model.LinearRegression()
#
# INFO: There are 50 takes of each clip. You want to pull out just one
# of them, randomly, and that one will NOT be used in the training of
# the model. In other words, the one file we'll be testing / scoring
# on will be an unseen sample, independent of the rest of the
# training set:
from sklearn.utils.validation import check_random_state
rng = check_random_state(7)
random_idx = rng.randint(zero.shape[0])
test = zero[random_idx] # the test sample
train = np.delete(zero, [random_idx], axis=0)
#
# Print out the shape of train, and the shape of test
# train will be shaped: [n_samples, n_audio_samples], where
# n_audio_samples are the 'features' of the audio file
# test will be shaped [n_audio_samples], since it is a single
# sample (audio file, i.e. observation).
#
print(train.shape)
print(test.shape)
#
# INFO: The test data will have two parts, X_test and y_test. X_test is
# going to be the first portion of the test audio file, which we will
# be providing the computer as input. y_test, the "label" if you will,
# is going to be the remaining portion of the audio file. This way,
# the computer will use linear regression to derive the missing
# portion of the sound file based on the training data it has received!
#
# Save the original 'test' clip, the one you're about to delete
# a portion of, so that you can compare it to the 'patched' clip once
# you've generated it.
# This assumes the sample rate is the same for all samples.
wavfile.write('OriginalTestClip.wav', sample_rate, test)
#
# Prepare the TEST data by creating a slice called X_test. It
# should have Provided_Portion * n_audio_samples audio sample features,
# taken from the test audio file, currently stored in the variable
# 'test'.
#
test_samples = int(Provided_Portion * n_audio_samples)
X_test = test[0:test_samples] # first ones
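# e.g. with Provided_Portion = 0.25 and a hypothetical n_audio_samples of
# 4000, test_samples = int(0.25 * 4000) = 1000: X_test gets the first 1000
# audio samples and y_test (below) gets the remaining 3000.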
#
# If the first Provided_Portion * n_audio_samples features were
# stored in X_test, then we also need to grab the *remaining* audio
# features and store them in y_test. With the remaining features stored
# in there, we will be able to R^2 "score" how well our algorithm did
# in completing the sound file.
#
y_test = test[test_samples:] # remaining ones
#
# Duplicate the same process for X_train, y_train.
#
X_train = train[:, 0:test_samples] # first ones
y_train = train[:, test_samples:]
#
# SciKit-Learn gets mad if you don't supply your training
# data in the form of a 2D array: [n_samples, n_features].
#
# So if you only have one SAMPLE, such as is our case with X_test
# and y_test, then by calling .reshape(1, -1), you can turn
# [n_features] into [1, n_features].
#
#
X_test = X_test.reshape(1,-1)
y_test = y_test.reshape(1,-1)
#
# Fit the model using the training data and label:
#
model.fit(X_train, y_train)
#
# Use the model to predict the 'label' of X_test.
#
y_test_prediction = model.predict(X_test)
# INFO: SciKit-Learn will use float64 to generate the predictions
# so let's take those values back to int16:
y_test_prediction = y_test_prediction.astype(dtype=np.int16)
#
# Score how well the prediction would do for some good laughs,
# by passing in the test data and test label (y_test).
#
score = model.score(X_test, y_test)
print ("Extrapolation R^2 Score: ", score)
#
# First, take the first Provided_Portion portion of the test clip, the
# part you fed into your linear regression model. Then, stitch that
# together with the abomination the predictor model generated for you,
# and then save the completed audio clip:
completed_clip = np.hstack((X_test, y_test_prediction))
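# Shapes, for reference: X_test is [1, test_samples] and y_test_prediction is
# [1, n_audio_samples - test_samples], so completed_clip is [1, n_audio_samples];
# completed_clip[0] below flattens it back to the 1D signal wavfile.write expects.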
wavfile.write('Extrapolated Clip.wav', sample_rate, completed_clip[0])