Skip to content

Instantly share code, notes, and snippets.

@mason-stewart
Created July 9, 2018 20:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mason-stewart/d359fece05eeb2172e188da1bed590fe to your computer and use it in GitHub Desktop.
Save mason-stewart/d359fece05eeb2172e188da1bed590fe to your computer and use it in GitHub Desktop.
This generates the following error:
# sentio model v3
# use:
# Sentio gives a client's best employees the test.
# Those employees are compared to previous Sentio test takers
# to determine which characteristics are most similar to
# best employees.
# output:
# 1) output scores (0-10) for total and for each input variable
# for the "best" employees and the employees which need to be scored
# 2) parameters from regression
# 3) scaling parameters
# diagnostics:
# provided in a non-production file (sentio_diagnostics.r)
#
# Data file requirements
# d: file of example people and example target, see training_data_v2.csv
# Wordcount all greater than 100
# No NA/Missing scores.
# Top:
# 1, if one of the targets,
# 0, if example data,
# -1, if it should be excluded from the regression but scored on the resulting model
# Estimation process
#
# Use target and example population Top %in% c(0,1)
# 1. Target variable: 1 if in target, -1 if not in target
#
# 2. For each explanatory variable, calculate empirical distribution and label each observation
# with its empirical percentile (will be between 0 and 1 inclusive)
#
# 3. Standardize explanatory variables by subtracting mean (0.5) and dividing by sd (always .2887ish for uniform [0,1])
#
# 4. regress target on each variable independently without an intercept
#
# 5. Generate predictions from each variable by multiplying the coefficient by the score for each individual
#
# 6. Find the mean and sd of all the scores (across all people and variables)
# standardize: z = (x-mean)/sd
# scale: pnorm(z)*10 to convert it to a score between 0 and 10
# Doing this across all scores allows you to see easily that a 10 is really good and a 0 is really bad
# the weighting of the variable is implied by the score
#
# 7. Do the same scaling (0-10) for the final score
# Start with a clean environment.
# NOTE(review): this also wipes the caller's workspace if the file is ever
# source()d from an interactive session; kept so the script's behaviour is
# unchanged when launched from the command line.
rm(list = ls(all.names = TRUE))
# For the %>% (pipe) operator
library(magrittr)
# Working directory at launch. Not used at the moment.
app_dir <- getwd()
# Command-line contract: <input csv path> <output csv path>, in that order.
args <- commandArgs(trailingOnly = TRUE)
# Fail fast with a readable message instead of letting read.csv() or
# write.csv() fail later on an NA path.
if (length(args) < 2L) {
  stop(
    "Expected two arguments: <input csv path> <output csv path>",
    call. = FALSE
  )
}
inputFile <- args[1]
outputFile <- args[2]
# For some manipulation functions
library(tidyr)
library(dplyr)
# Read in training data
d <- read.csv(inputFile)
# 1 for rows that take part in the regression (Top is 0 or 1), 0 otherwise.
use_in_estimation <- as.numeric(d$Top %in% c(0, 1))
# Top recoded to +1 / -1 on estimation rows; every other row gets 0.
target <- ifelse(use_in_estimation == 1, 2 * d$Top - 1, 0)
# Every column from the 6th onward is treated as a score variable.
watson_vars <- names(d[6:ncol(d)])
# Per-variable mean and standard deviation; the SD is floored at 0.20.
var_mean <- apply(d[, watson_vars], 2, mean)
var_sd <- pmax(apply(d[, watson_vars], 2, sd), 0.20)
# One row per variable, columns "Mean" and "SD".
var_stats <- cbind(Mean = var_mean, SD = var_sd)
# Convert raw values into standardized empirical percentiles.
# Each value is replaced by its empirical CDF value (the share of
# observations less than or equal to it), then centered at 0.5 and divided
# by 0.2887 (approximately the SD of a uniform [0, 1] variable).
z_scorer <- function(x) {
  percentile <- ecdf(x)(x)
  (percentile - 0.5) / 0.2887
}
# Standardize x against a table of per-variable statistics.
# y must be a matrix or data frame covering all of the variables, with
# columns "Mean" and "SD"; it defaults to the global var_stats table.
f <- function(x, y = var_stats) {
  centered <- x - y[, "Mean"]
  centered / y[, "SD"]
}
# Score columns: everything from the 6th column onward. drop = FALSE keeps a
# data frame even if only one score column is present.
raw_scores <- d[, 6:length(d), drop = FALSE]
# Apply z_scorer column by column. lapply() is used instead of apply()
# because apply() collapses its result to a plain vector when the input has
# a single row, which turned z_score into a one-column data frame and made
# the colnames() assignment below fail.
z_score <- as.data.frame(lapply(raw_scores, z_scorer))
colnames(z_score) <- colnames(raw_scores)
# Zero out variables which are bad
# Bad optics, plus the score has little variation
#z_score = sign(z_score)*pmin(abs(z_score),2.0) %>% as.data.frame
#z_data=z_score[,1]
## target = d$Top
# Regress the target on a single predictor through the origin.
#
# z_data: numeric vector of scores for one variable.
# target: numeric vector of response values (expected to line up with
#         z_data element by element).
# Returns an unnamed numeric vector c(coefficient, r_squared) taken from
# lm(y ~ 0 + x).
estimation <- function(z_data, target) {
  frame <- as.data.frame(cbind(x = z_data, y = target))
  # Logistic alternative left disabled by the original author:
  # glm(y ~ 0 + x, data = frame, family = binomial(link = "logit"))
  fit <- lm(y ~ 0 + x, data = frame)
  unname(c(fit$coefficients, summary(fit)$r.squared))
}
# Response for the regressions: Top recoded to +1 / -1, restricted to the
# rows flagged for estimation.
target_var <- 2 * d$Top[use_in_estimation == 1] - 1
x_vars <- z_score[use_in_estimation == 1, ]
# One through-the-origin regression per variable. Row 1 of the result holds
# the coefficients and row 2 the R-squared values; columns are variables.
regression_params <- as.data.frame(
  vapply(x_vars, estimation, numeric(2), target = target_var)
)
betas <- unlist(regression_params[1, ])
rsq <- unlist(regression_params[2, ])
# Square root of each R-squared value.
r <- rsq^(1 / 2)
# Per-variable contribution for every person: the z-score multiplied by that
# variable's regression coefficient (rows = people, columns = variables).
preds <- as.data.frame(sweep(as.matrix(z_score), 2, betas, `*`))
# Pool every contribution (all people, all variables) to get one common
# location and spread; the pooled SD is multiplied by 1.5.
stage1_mean <- mean(unlist(preds))
stage1_sd <- 1.5 * sd(unlist(preds))
preds_normalized <- as.data.frame(
  (as.matrix(preds) - stage1_mean) / stage1_sd
)
# Regression parameters, one row per variable.
params_regression <- as.data.frame(cbind(betas = betas, rsq = rsq))
rownames(params_regression) <- names(betas)
# Overall score per person: average of the normalized contributions, then
# standardized again across people.
total <- rowSums(preds_normalized) / ncol(preds_normalized)
total_mean <- mean(total)
total_sd <- sd(total)
# Scaling constants collected in a one-row table.
params_scaling <- data.frame(
  stage1_mean = stage1_mean,
  stage1_sd = stage1_sd,
  total_mean = total_mean,
  total_sd = total_sd
)
total_normalized <- (total - total_mean) / total_sd
# Map standardized values onto a 0-10 scale via the normal CDF, one decimal.
total_score <- round(10 * pnorm(total_normalized), 1)
preds_score <- round(10 * pnorm(as.matrix(preds_normalized)), 1)
# Assemble the report: identifying columns, the overall score, then one
# score column per variable. Only rows with Top equal to 1 or -1 are kept;
# rows with Top == 0 are dropped from the output.
output_scores <- dplyr::filter(
  cbind(
    d[, c("Id", "Name", "Top", "Department", "WordCount")],
    Total = total_score,
    preds_score
  ),
  Top %in% c(1, -1)
)
write.csv(output_scores, file = outputFile)
# alternative output format, not currently used.
# ask Bill West for more details
# write.csv(
# params_regression[sort.list(params_regression$rsq,decreasing=T),],file="output_params_regression.csv"
# )
# write.csv(
# params_scaling,file="output_params_scaling.csv"
# )
Id Name Top Department WordCount openness adventurousness artistic_interests emotionality imagination intellect liberalism conscientiousness achievement_striving cautiousness dutifulness orderliness self_discipline self_efficacy extraversion activity_level assertiveness cheerfulness excitement_seeking friendliness gregariousness agreeableness altruism cooperation modesty morality sympathy trust neuroticism anger anxiety depression immoderation self_consciousness vulnerability challenge closeness curiosity excitement harmony ideal liberty love practicality self_expression stability structure conservation openness_to_change hedonism self_enhancement self_transcendence
7e1416bf-b1db-456e-aec2-3215a384825c AJ Richichi -1 Entry-Level Sales 828 0.98 0.668 0.346 0.094 0.434 0.97 0.72 0.875 0.981 0.904 0.421 0.332 0.873 0.971 0.985 0.989 0.988 0.357 0.162 0.427 0.234 0.213 0.565 0.462 0.038 0.538 0.572 0.815 0.963 0.286 0.102 0.434 0.134 0.211 0.057 0.512 0.039 0.574 0.113 0.06 0.269 0.233 0.095 0.497 0.184 0.225 0.761 0.037 0.431 0.019 0.275 0.149
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment