clettieri/InternetAds Challenge.ipynb Secret

## data_functions.py
import pandas as pd
import numpy as np

def get_column_names(file_name, first_col_is_label=True):
    '''(string, bool) -> list of strings

    Read column names from a text file holding one name per line.

    Blank lines are skipped and everything from the first ':' on a
    line is discarded.  Lines may end in LF, CRLF or a bare CR.

    When first_col_is_label is True the first name in the file is
    treated as the label column: it is dropped and 'is_ad' is
    appended instead, because the data file keeps the label in its
    last column.  A file with no names then yields ['is_ad'].

    On Python 2 the names are returned as unicode strings.
    '''
    #io.open gives universal-newline text mode on Python 2 and 3;
    #the old 'rU' mode flag is rejected by Python 3.11+
    import io

    with io.open(file_name, 'r') as f:
        #Keep the text before the first ':' of every non-blank line
        col_names = [line.strip().split(':')[0] for line in f if line.strip()]
    if first_col_is_label:
        #Slicing (rather than pop(0)) tolerates a file with no names
        col_names = col_names[1:] + ['is_ad']
    return col_names

def convert_label_to_numeric(df):
    '''(DataFrame) -> DataFrame

    Recode the 'is_ad' label column as integers: rows whose label is
    exactly 'ad.' become 1, every other row becomes 0.  The column is
    overwritten on the frame that was passed in, and that same frame
    is returned.
    '''
    ad_rows = df['is_ad'] == 'ad.'
    df['is_ad'] = np.where(ad_rows, 1, 0)
    return df

def load_data_with_col_names(data_file, column_file):
    '''(string, string) -> DataFrame

    Read data_file as a csv whose columns are named from the names
    listed in column_file, then recode the 'is_ad' label column to
    1/0.  Returns the resulting DataFrame.
    '''
    #A '?' marks a missing value; it may be padded with whitespace,
    #hence skipinitialspace
    csv_options = dict(skipinitialspace=True, na_values=['?'])
    names = get_column_names(column_file)
    frame = pd.read_csv(data_file, names=names, **csv_options)
    return convert_label_to_numeric(frame)

## InternetAds Challenge.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              InternetAds Challenge.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## run_model.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score

def load_data(file_name, label_col_index=-1):
    '''(string, int) -> array, array

    Load a csv file in the SAME FORMAT as the test data set and
    return (X, y): the feature matrix and the label vector.

    label_col_index is the positional index of the label column,
    default is -1 (last column in data file).  '?' entries are read
    as missing values, and missing features are replaced by the
    median of their column.  y holds 1 where the label is 'ad.' and
    0 everywhere else.

    NOTE: read_csv is called without names/header arguments, so the
    first row of the file is consumed as the header row.
    '''
    df = pd.read_csv(file_name, skipinitialspace=True, na_values=['?'])
    label_name = df.columns[label_col_index]

    #Get Features
    X = df.drop(label_name, axis=1).values

    #Impute Missing x values
    #NOTE: the medians are fitted on this file itself
    imp = Imputer(strategy="median", axis=0)
    X = imp.fit_transform(X)

    #Get Labels
    #Built straight from the comparison so y is always an integer
    #array and df is left untouched
    y = np.where(df.iloc[:, label_col_index] == 'ad.', 1, 0)

    return X, y

def load_model(file_name):
    '''(string) -> sklearn classifier

    Deserialize the joblib file at file_name and return the sklearn
    classifier stored in it as a python object.
    '''
    #NOTE: joblib.load unpickles the file, so only load trusted files
    return joblib.load(file_name)

def run_model(data_file, model_file, label_col_index=-1):
    '''(string, string, int) -> None

    Given a path to a data file in the same form as test data set,
    and to a joblib file containing the final model, this function will
    load the data, run the model on it and print the accuracy.

    This function assumes that the labels will be included with the data
    file.  label_col_index is forwarded to load_data.

    If the model file cannot be loaded, a hint is printed and the
    function returns without scoring.
    '''
    #Load data
    X, y = load_data(data_file, label_col_index=label_col_index)
    #Load model
    try:
        clf = load_model(model_file)
    except Exception:
        print("Error loading model file")
        print("Did you unzip 'final_model.zip'?")
        #Without a model there is nothing to score; stop here instead
        #of failing later with a NameError on clf
        return
    #Make predictions
    predictions = clf.predict(X)
    #Calculate score (accuracy_score takes the true labels first)
    score = accuracy_score(y, predictions)
    print("Model Accuracy: " + str(score))

#When executed as a script, score data.csv with final_model.pkl
if __name__ == "__main__":
    run_model(data_file="data.csv", model_file="final_model.pkl")
	import pandas as pd
	import numpy as np

	def get_column_names(file_name, first_col_is_label=True):
	    '''(string, bool) -> list of strings

	    Read column names from a text file holding one name per line.

	    Blank lines are skipped and everything from the first ':' on a
	    line is discarded.  Lines may end in LF, CRLF or a bare CR.

	    When first_col_is_label is True the first name in the file is
	    treated as the label column: it is dropped and 'is_ad' is
	    appended instead, because the data file keeps the label in its
	    last column.  A file with no names then yields ['is_ad'].

	    On Python 2 the names are returned as unicode strings.
	    '''
	    #io.open gives universal-newline text mode on Python 2 and 3;
	    #the old 'rU' mode flag is rejected by Python 3.11+
	    import io

	    with io.open(file_name, 'r') as f:
	        #Keep the text before the first ':' of every non-blank line
	        col_names = [line.strip().split(':')[0] for line in f if line.strip()]
	    if first_col_is_label:
	        #Slicing (rather than pop(0)) tolerates a file with no names
	        col_names = col_names[1:] + ['is_ad']
	    return col_names

	def convert_label_to_numeric(df):
	    '''(DataFrame) -> DataFrame

	    Will convert the label column 'is_ad' into numeric
	    values. 1 representing an ad ('ad.'), 0 representing non-ad.
	    The column is overwritten on the frame that was passed in,
	    and that same frame is returned.
	    '''
	    df['is_ad'] = np.where(df['is_ad'] == 'ad.', 1, 0)
	    return df

	def load_data_with_col_names(data_file, column_file):
	    '''(string, string) -> DataFrame

	    Will load column names from a text file and then read the
	    data file as csv. Return a dataframe of data file
	    with proper column names and the 'is_ad' label recoded to 1/0.
	    '''
	    col_names = get_column_names(column_file)
	    #Missing values in the data file are a '?' with varying whitespace
	    df = pd.read_csv(data_file, names=col_names, skipinitialspace=True, na_values=['?'])
	    df = convert_label_to_numeric(df)
	    return df
	import pandas as pd
	import numpy as np
	from sklearn.preprocessing import Imputer
	from sklearn.externals import joblib
	from sklearn.metrics import accuracy_score

	def load_data(file_name, label_col_index=-1):
	    '''(string, int) -> array, array

	    Load a csv file in the SAME FORMAT as the test data set and
	    return (X, y): the feature matrix and the label vector.

	    label_col_index is the positional index of the label column,
	    default is -1 (last column in data file).  '?' entries are read
	    as missing values, and missing features are replaced by the
	    median of their column.  y holds 1 where the label is 'ad.' and
	    0 everywhere else.

	    NOTE: read_csv is called without names/header arguments, so the
	    first row of the file is consumed as the header row.
	    '''
	    df = pd.read_csv(file_name, skipinitialspace=True, na_values=['?'])
	    label_name = df.columns[label_col_index]

	    #Get Features
	    X = df.drop(label_name, axis=1).values

	    #Impute Missing x values
	    #NOTE: the medians are fitted on this file itself
	    imp = Imputer(strategy="median", axis=0)
	    X = imp.fit_transform(X)

	    #Get Labels
	    #Built straight from the comparison so y is always an integer
	    #array and df is left untouched
	    y = np.where(df.iloc[:, label_col_index] == 'ad.', 1, 0)

	    return X, y

	def load_model(file_name):
	    '''(string) -> sklearn classifier

	    Given a file path and name to a joblib file, will return
	    the sklearn classifier as python object.
	    '''
	    #NOTE: joblib.load unpickles the file, so only load trusted files
	    clf = joblib.load(file_name)
	    return clf

	def run_model(data_file, model_file, label_col_index=-1):
	    '''(string, string, int) -> None

	    Given a path to a data file in the same form as test data set,
	    and to a joblib file containing the final model, this function will
	    load the data, run the model on it and print the accuracy.

	    This function assumes that the labels will be included with the data
	    file.  label_col_index is forwarded to load_data.

	    If the model file cannot be loaded, a hint is printed and the
	    function returns without scoring.
	    '''
	    #Load data
	    X, y = load_data(data_file, label_col_index=label_col_index)
	    #Load model
	    try:
	        clf = load_model(model_file)
	    except Exception:
	        print("Error loading model file")
	        print("Did you unzip 'final_model.zip'?")
	        #Without a model there is nothing to score; stop here instead
	        #of failing later with a NameError on clf
	        return
	    #Make predictions
	    predictions = clf.predict(X)
	    #Calculate score (accuracy_score takes the true labels first)
	    score = accuracy_score(y, predictions)
	    print("Model Accuracy: " + str(score))

	#When executed as a script, score data.csv with final_model.pkl
	if __name__ == "__main__":
	    run_model("data.csv", "final_model.pkl")