-
-
Save clettieri/7124e5ea5d9bd78bfe50e99ed60a1b54 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
def get_column_names(file_name, first_col_is_label=True):
    '''(string, bool) -> list of strings
    Read column names from a text file holding one name per line.
    Blank (whitespace-only) lines are skipped, and everything from
    the first ':' onward on a line is discarded.

    If first_col_is_label is True, the first name in the file is
    treated as the label: it is removed from the front of the list
    and 'is_ad' is appended at the end, because the data file stores
    the label as its last column. An empty names file then yields
    just ['is_ad'].

    Returns the list of column names.
    '''
    # Local import so this function does not depend on the file's import block.
    import io

    col_names = []
    # The names file may use bare carriage returns as line endings.
    # The old 'rU' mode was removed in Python 3.11; io.open() performs
    # universal-newline translation by default on both Python 2 and 3.
    with io.open(file_name, 'r') as f:
        for line in f:
            stripped = line.strip()
            # Skip blank lines
            if stripped:
                # Keep only the text before the first ':'
                col_names.append(stripped.split(':')[0])
    if first_col_is_label:
        # The names file lists the label first; guard against an empty file
        if col_names:
            col_names.pop(0)
        # The DataFrame has the label as its last column
        col_names.append('is_ad')
    return col_names
def convert_label_to_numeric(df):
    '''(DataFrame) -> DataFrame
    Recode the 'is_ad' label column in place: rows whose label is
    exactly 'ad.' become 1, every other row becomes 0. The same
    DataFrame object that was passed in is returned.
    '''
    ad_mask = df['is_ad'] == 'ad.'
    df['is_ad'] = np.where(ad_mask, 1, 0)
    return df
def load_data_with_col_names(data_file, column_file):
    '''(string, string) -> DataFrame
    Read the column names from column_file, read data_file as a csv
    using those names, and return the resulting DataFrame with its
    'is_ad' label column converted to 1/0.
    '''
    header = get_column_names(column_file)
    # A '?' (possibly padded with whitespace) marks a missing value
    raw = pd.read_csv(
        data_file,
        names=header,
        na_values=['?'],
        skipinitialspace=True,
    )
    return convert_label_to_numeric(raw)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from sklearn.preprocessing import Imputer | |
from sklearn.externals import joblib | |
from sklearn.metrics import accuracy_score | |
def load_data(file_name, label_col_index=-1):
    '''(string, int) -> array, array
    Load a csv file that is in the SAME FORMAT as the test data set
    and return (X, y): the feature values with missing entries
    replaced by the column median, and the labels encoded as 1 for
    'ad.' and 0 for anything else. label_col_index is the positional
    index of the label column; the default -1 means the last column.
    '''
    # '?' (with optional leading whitespace) is read as a missing value
    frame = pd.read_csv(file_name, na_values=['?'], skipinitialspace=True)

    # Features: every column except the label column
    label_name = frame.columns[label_col_index]
    features = frame.drop(label_name, axis=1).values

    # Fill missing feature values with the per-column median
    features = Imputer(strategy="median", axis=0).fit_transform(features)

    # Labels: write the 1/0 encoding back into the frame, then read it out
    is_ad = frame.iloc[:, label_col_index] == 'ad.'
    frame.iloc[:, label_col_index] = np.where(is_ad, 1, 0)
    labels = frame.iloc[:, label_col_index].values

    return features, labels
def load_model(file_name):
    '''(string) -> sklearn classifier
    Load the object stored in the joblib file at file_name and
    return it; the file is expected to hold a fitted sklearn
    classifier.
    '''
    return joblib.load(file_name)
def run_model(data_file, model_file, label_col_index=-1):
    '''(string, string, int) -> None
    Load the data file (same form as the test data set, labels
    included), load the final model from the joblib file, predict on
    the data and print the model's accuracy.

    label_col_index is passed through to load_data and gives the
    position of the label column (default -1, the last column).

    If the model file cannot be loaded, a hint is printed and the
    original exception is re-raised.
    '''
    # Load data
    X, y = load_data(data_file, label_col_index=label_col_index)

    # Load model
    try:
        clf = load_model(model_file)
    except Exception:
        # Print the hint, then re-raise: continuing without a model would
        # only fail later with a misleading NameError on 'clf'.
        print("Error loading model file")
        print("Did you unzip 'final_model.zip'?")
        raise

    # Make predictions
    predictions = clf.predict(X)

    # Calculate score; accuracy_score expects (y_true, y_pred)
    score = accuracy_score(y, predictions)
    print("Model Accuracy: " + str(score))
if __name__ == "__main__":
    # When run as a script, score the model file against the data file,
    # both looked up by relative path.
    run_model(data_file="data.csv", model_file="final_model.pkl")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment