Skip to content

Instantly share code, notes, and snippets.

@clettieri
Created August 23, 2017 11:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save clettieri/7124e5ea5d9bd78bfe50e99ed60a1b54 to your computer and use it in GitHub Desktop.
Save clettieri/7124e5ea5d9bd78bfe50e99ed60a1b54 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
def get_column_names(file_name, first_col_is_label=True):
    '''(string, bool) -> list of strings
    Will open a text file assumed to contain 1 column name
    per line. Blank (whitespace-only) lines are skipped, and
    everything from the first ':' onward is stripped from each
    remaining line.

    If first_col_is_label is True, the first name read is taken
    to be the label entry: it is removed from the front of the
    list and 'is_ad' is appended to the end instead. In that mode
    a file with no non-blank lines raises IndexError.

    Returns a list of column names.
    '''
    #Local import keeps this function self-contained
    import io
    col_names = []
    #The names file may use bare carriage returns as line endings.
    #io.open in text mode translates '\r', '\n' and '\r\n' alike on
    #both Python 2 and 3; the old 'rU' flag is rejected by Python 3.11+.
    with io.open(file_name, 'r') as f:
        for line in f:
            stripped = line.strip()
            #If not blank line
            if stripped:
                #Get column name until ':'
                col_names.append(stripped.split(':')[0])
    if first_col_is_label:
        #Col List has label as first column
        col_names.pop(0)
        #DataFrame has label as last column
        col_names.append('is_ad')
    return col_names
def convert_label_to_numeric(df):
    '''(DataFrame) -> DataFrame
    Replace the string labels in column 'is_ad' with integers:
    rows whose value equals 'ad.' become 1, every other row
    becomes 0. The given DataFrame is modified in place and
    also returned.
    '''
    #Boolean mask: True exactly where the row is labelled as an ad
    ad_mask = df['is_ad'] == 'ad.'
    df['is_ad'] = np.where(ad_mask, 1, 0)
    return df
def load_data_with_col_names(data_file, column_file):
    '''(string, string) -> DataFrame
    Read the column names from column_file, read data_file as
    csv using those names as the header, and return the
    resulting DataFrame with its 'is_ad' label column
    converted to 1/0.
    '''
    header = get_column_names(column_file)
    #'?' marks a missing value in the data file; skipinitialspace
    #drops the varying whitespace that follows each delimiter
    frame = pd.read_csv(
        data_file,
        names=header,
        skipinitialspace=True,
        na_values=['?'],
    )
    return convert_label_to_numeric(frame)
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
def load_data(file_name, label_col_index=-1):
    '''(string, int) -> array, array
    Given a file path and name to a csv file in the SAME FORMAT
    as the test data set, this will load the file and return two
    arrays of values: X (the feature values, with missing entries
    imputed) and y (integer labels, 1 where the label equals 'ad.'
    and 0 otherwise).

    Label_col_index is the numerical index of the label column,
    default is -1 (last column in data file).

    NOTE: read_csv is called without header/names arguments, so the
    first line of the file is consumed as the column header.
    '''
    #'?' marks a missing value; skipinitialspace drops the
    #whitespace that follows each delimiter
    df = pd.read_csv(file_name, skipinitialspace=True, na_values=['?'])
    #Get Features: every column except the label column
    x_df = df.drop(df.columns[label_col_index], axis=1)
    #Impute Missing x values with the median of each column
    imp = Imputer(strategy="median", axis=0)
    X = imp.fit_transform(x_df.values)
    #Get Labels. Build the array directly instead of assigning it back
    #through df.iloc and re-reading it: that round trip can leave the
    #labels with dtype object (pandas >= 2.0 sets iloc values in place),
    #while np.where always yields an integer array.
    y = np.where(df.iloc[:, label_col_index] == 'ad.', 1, 0)
    return X, y
def load_model(file_name):
    '''(string) -> sklearn classifier
    Deserialize the joblib file at file_name and return the
    object stored in it (expected to be a fitted sklearn
    classifier).

    NOTE: joblib.load unpickles the file, so only load model
    files from a trusted source.
    '''
    return joblib.load(file_name)
def run_model(data_file, model_file, label_col_index=-1):
    '''(string, string, int) -> None
    Given a path to a data file in the same form as test data set,
    and to a joblib file containing the final model, this function will
    load the data, run the model on it and print the accuracy score.

    This function assumes that the labels will be included with the data
    file; label_col_index is the position of the label column
    (default -1, the last column).

    If the model file cannot be loaded, a hint is printed and the
    original exception is re-raised.
    '''
    #Load data
    X, y = load_data(data_file, label_col_index=label_col_index)
    #Load model
    try:
        clf = load_model(model_file)
    except Exception:
        print("Error loading model file")
        print("Did you unzip 'final_model.zip'?")
        #Re-raise: there is no model to predict with, and continuing
        #would only fail later with an unrelated NameError on clf
        raise
    #Make predictions
    predictions = clf.predict(X)
    #Calculate score (true labels first, then predictions)
    score = accuracy_score(y, predictions)
    print("Model Accuracy: " + str(score))
if __name__ == "__main__":
    #Script entry point: score the bundled model against data.csv,
    #with both files looked up relative to the working directory
    run_model(data_file="data.csv", model_file="final_model.pkl")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment