mason-stewart/analyzer.r

## analyzer.r
# sentio model v3
#   use:
#     Sentio gives a client's best employees the test
#     Those employees are compared to previous sentio test takers
#     to determine which characteristics are most similar to
#     best employees.
#   output:
#     1) output scores (0-10) for total and for each input variable
#     for the "best" employees and the employees which need to be scored
#     2) parameters from regression
#     3) scaling parameters
#   diagnostics:
#     provided in a non-production file (sentio_diagnostics.r)
#

# Data file requirements
#  d: file of example people and example target, see training_data_v2.csv
#  Wordcount all greater than 100
#  No NA/Missing scores.
#  Top:
#   1, if one of the targets,
#   0, if example data,
#   -1, if it should be excluded from the regression but scored on the resulting model

# Estimation process
#
# Use target and example population Top %in% c(0,1)
#  1. Target variable:  1 if in target, -1 if not in target
#
#  2. For each explanatory variable, calculate empirical distribution and label each observation
#  with its empirical percentile (will be between 0 and 1 inclusive)
#
#  3. Standardize explanatory variables by subtracting mean (0.5) and dividing by sd (always .2887ish for uniform [0,1])
#
#  4. regress target on each variable independently without an intercept
#
#  5. Generate predictions from each variable by multiplying the coefficient by the score for each individual
#
#  6. Find the mean and sd of all the scores (across all people and variables)
#    standardize: z = (x-mean)/sd
#    scale:  pnorm(z)*10 to convert it to a score between 0 and 10
#    Doing this across all scores allows you to see easily that a 10 is really good and a 0 is really bad
#    the weighting of the variable is implied by the score
#
#  7 Do the same scaling (0-10)for the final score

# Start with a clean environment: remove every object in the current
# environment, including names that begin with a dot.
# `all.names = TRUE` is spelled out instead of `all = T`, which depended on
# partial argument matching and on `T` not having been reassigned.
rm(list = ls(all.names = TRUE))

# For the %>% (pipe) operator
library(magrittr)

# Working directory at launch. Not used at the moment.
app_dir <- getwd()

# Command-line interface: Rscript analyzer.r <input.csv> <output.csv>
args <- commandArgs(trailingOnly = TRUE)
# Fail fast with a usage message. Without this check a missing argument
# becomes NA and only surfaces later as an obscure error from read.csv()
# or, after all the work is done, from write.csv().
if (length(args) < 2L) {
  stop(
    "usage: Rscript analyzer.r <input.csv> <output.csv>",
    call. = FALSE
  )
}
inputFile <- args[1]
outputFile <- args[2]

# For some manipulation functions
library(tidyr)
library(dplyr)

# Load the input population (training rows and rows to be scored).
d <- read.csv(inputFile)

# 1 for rows that take part in the regression (Top is 0 or 1), else 0.
use_in_estimation <- ifelse(d$Top %in% c(0, 1), 1, 0)
# Top recoded to +1 / -1 for estimation rows; 0 for every other row.
target <- ifelse(use_in_estimation, 2 * d$Top - 1, 0)
# Every column from the sixth onward is treated as a score variable.
watson_vars <- names(d[6:ncol(d)])

# Per-variable mean and SD of the raw scores; the SD is floored at 0.20.
var_mean <- apply(d[, watson_vars], 2, mean)
var_sd <- pmax(apply(d[, watson_vars], 2, sd), 0.20)
var_stats <- cbind(Mean = var_mean, SD = var_sd)

# Convert a numeric vector to standardized empirical percentiles.
#   x: numeric vector
# Each value is replaced by its empirical CDF value within x (the share of
# observations <= it), then centred at 0.5 and divided by 0.2887.
# Returns a numeric vector the same length as x.
z_scorer <- function(x) {
  percentile <- ecdf(x)(x)
  (percentile - 0.5) / 0.2887
}

# Standardize x using per-variable statistics.
#   x: numeric vector, one entry per variable
#   y: matrix or data frame with columns "Mean" and "SD", one row per
#      variable; defaults to the global var_stats
# Returns (x - Mean) / SD.
f <- function(x, y = var_stats) {
  centre <- y[, "Mean"]
  spread <- y[, "SD"]
  (x - centre) / spread
}
# Raw score columns (sixth column onward), converted column by column
# into standardized empirical percentiles.
raw_scores <- d[, 6:ncol(d)]
z_score <- as.data.frame(apply(raw_scores, 2, z_scorer))
colnames(z_score) <- colnames(raw_scores)

# Zero out variables which are bad
# Bad optics, plus the score has little variation

#z_score = sign(z_score)*pmin(abs(z_score),2.0) %>% as.data.frame

#z_data=z_score[,1]
## target = d$Top
# Fit a no-intercept linear regression of target on a single variable.
#   z_data: numeric vector of standardized scores for one variable
#   target: numeric vector of the same length as z_data
# Returns an unnamed numeric vector c(slope, r_squared).
# Stops with an error when the two inputs differ in length.
estimation <- function(z_data, target) {
  # cbind() would recycle the shorter input (silently when the lengths
  # divide), producing a fit on misaligned data.
  if (length(z_data) != length(target)) {
    stop("z_data and target must have the same length", call. = FALSE)
  }
  dat <- as.data.frame(cbind(x = z_data, y = target))
  # r = glm(y~0+x,data=dat,family = binomial(link="logit"))
  fit <- lm(y ~ 0 + x, data = dat)
  # coef() replaces fit$coef, which only resolved through `$` partial
  # matching of the `coefficients` element.
  unname(c(coef(fit), summary(fit)$r.squared))
}

# Estimation sample: rows flagged in use_in_estimation, with Top recoded
# to +1 / -1.
target_var <- d$Top[use_in_estimation == 1] * 2 - 1
x_vars <- z_score[use_in_estimation == 1, ]

# One no-intercept regression per variable. Row 1 of the result holds the
# slopes and row 2 the R-squared values.
regression_params <- as.data.frame(
  apply(x_vars, 2, estimation, target = target_var)
)
betas <- unlist(regression_params[1, ])
rsq <- unlist(regression_params[2, ])
r <- rsq^(1 / 2)

# Per-person, per-variable prediction: standardized score times the
# slope for that variable (computed for every row, not just the sample).
preds <- as.data.frame(
  t(apply(z_score, 1, function(person) person * betas))
)

# Pool every per-variable prediction to get one common location and
# scale; the pooled SD is multiplied by 1.5.
stage1_mean <- mean(unlist(preds))
stage1_sd <- sd(unlist(preds)) * 1.5

preds_normalized <- as.data.frame(
  t(apply(preds, 1, function(person) (person - stage1_mean) / stage1_sd))
)

# Regression parameters, one row per variable.
params_regression <- as.data.frame(cbind(betas = betas, rsq = rsq))
rownames(params_regression) <- names(betas)

# Overall score: mean of the normalized per-variable predictions.
total <- rowSums(preds_normalized) / ncol(preds_normalized)
total_mean <- mean(total)
total_sd <- sd(total)

# Scaling parameters collected in a one-row data frame.
params_scaling <- as.data.frame(cbind(
  stage1_mean = stage1_mean,
  stage1_sd = stage1_sd,
  total_mean = total_mean,
  total_sd = total_sd
))

total_normalized <- (total - total_mean) / total_sd

# Map z-scores through the normal CDF onto 0-10, rounded to one decimal.
total_score <- round(pnorm(total_normalized) * 10, 1)
preds_score <- round(pnorm(as.matrix(preds_normalized)) * 10, 1)

# Put the identifying columns next to the total and per-variable scores,
# then keep only rows with Top equal to 1 or -1.
output_scores <- filter(
  cbind(
    d[, c("Id", "Name", "Top", "Department", "WordCount")],
    Total = total_score,
    preds_score
  ),
  Top %in% c(1, -1)
)

# Write the scored rows to the path given as the second CLI argument.
write.csv(output_scores, file = outputFile)

# alternative output format, not currently used.
# ask Bill West for more details
# write.csv(
#   params_regression[sort.list(params_regression$rsq,decreasing=T),],file="output_params_regression.csv"
# )
# write.csv(
#   params_scaling,file="output_params_scaling.csv"
# )


## input.csv

          
            Id
            Name
            Top
            Department
            WordCount
            openness
            adventurousness
            artistic_interests
            emotionality
            imagination
            intellect
            liberalism
            conscientiousness
            achievement_striving
            cautiousness
            dutifulness
            orderliness
            self_discipline
            self_efficacy
            extraversion
            activity_level
            assertiveness
            cheerfulness
            excitement_seeking
            friendliness
            gregariousness
            agreeableness
            altruism
            cooperation
            modesty
            morality
            sympathy
            trust
            neuroticism
            anger
            anxiety
            depression
            immoderation
            self_consciousness
            vulnerability
            challenge
            closeness
            curiosity
            excitement
            harmony
            ideal
            liberty
            love
            practicality
            self_expression
            stability
            structure
            conservation
            openness_to_change
            hedonism
            self_enhancement
            self_transcendence

            
              7e1416bf-b1db-456e-aec2-3215a384825c
              AJ Richichi
              -1
              Entry-Level Sales
              828
              0.98
              0.668
              0.346
              0.094
              0.434
              0.97
              0.72
              0.875
              0.981
              0.904
              0.421
              0.332
              0.873
              0.971
              0.985
              0.989
              0.988
              0.357
              0.162
              0.427
              0.234
              0.213
              0.565
              0.462
              0.038
              0.538
              0.572
              0.815
              0.963
              0.286
              0.102
              0.434
              0.134
              0.211
              0.057
              0.512
              0.039
              0.574
              0.113
              0.06
              0.269
              0.233
              0.095
              0.497
              0.184
              0.225
              0.761
              0.037
              0.431
              0.019
              0.275
              0.149
	# sentio model v3
	# use:
	# Sentio gives a client's best employees the test
	# Those employees are compared to previous sentio test takers
	# to determine which characteristics are most similar to
	# best employees.
	# output:
	# 1) output scores (0-10) for total and for each input variable
	# for the "best" employees and the employees which need to be scored
	# 2) parameters from regression
	# 3) scaling parameters
	# diagnostics:
	# provided in a non-production file (sentio_diagnostics.r)
	#

	# Data file requirements
	# d: file of example people and example target, see training_data_v2.csv
	# Wordcount all greater than 100
	# No NA/Missing scores.
	# Top:
	# 1, if one of the targets,
	# 0, if example data,
	# -1, if it should be excluded from the regression but scored on the resulting model

	# Estimation process
	#
	# Use target and example population Top %in% c(0,1)
	# 1. Target variable: 1 if in target, -1 if not in target
	#
	# 2. For each explanatory variable, calculate empirical distribution and label each observation
	# with its empirical percentile (will be between 0 and 1 inclusive)
	#
	# 3. Standardize explanatory variables by subtracting mean (0.5) and dividing by sd (always .2887ish for uniform [0,1])
	#
	# 4. regress target on each variable independently without an intercept
	#
	# 5. Generate predictions from each variable by multiplying the coefficient by the score for each individual
	#
	# 6. Find the mean and sd of all the scores (across all people and variables)
	# standardize: z = (x-mean)/sd
	# scale: pnorm(z)*10 to convert it to a score between 0 and 10
	# Doing this across all scores allows you to see easily that a 10 is really good and a 0 is really bad
	# the weighting of the variable is implied by the score
	#
	# 7 Do the same scaling (0-10)for the final score

	# Start with a clean environment: remove every object in the current
	# environment, including names that begin with a dot.
	# `all.names = TRUE` is spelled out instead of `all = T`, which depended on
	# partial argument matching and on `T` not having been reassigned.
	rm(list = ls(all.names = TRUE))

	# For the %>% (pipe) operator
	library(magrittr)

	# Working directory at launch. Not used at the moment.
	app_dir <- getwd()

	# Command-line interface: Rscript analyzer.r <input.csv> <output.csv>
	args <- commandArgs(trailingOnly = TRUE)
	# Fail fast with a usage message. Without this check a missing argument
	# becomes NA and only surfaces later as an obscure error from read.csv()
	# or, after all the work is done, from write.csv().
	if (length(args) < 2L) {
	  stop(
	    "usage: Rscript analyzer.r <input.csv> <output.csv>",
	    call. = FALSE
	  )
	}
	inputFile <- args[1]
	outputFile <- args[2]

	# For some manipulation functions
	library(tidyr)
	library(dplyr)

	# Load the input population (training rows and rows to be scored).
	d <- read.csv(inputFile)

	# 1 for rows that take part in the regression (Top is 0 or 1), else 0.
	use_in_estimation <- ifelse(d$Top %in% c(0, 1), 1, 0)
	# Top recoded to +1 / -1 for estimation rows; 0 for every other row.
	target <- ifelse(use_in_estimation, 2 * d$Top - 1, 0)
	# Every column from the sixth onward is treated as a score variable.
	watson_vars <- names(d[6:ncol(d)])

	# Per-variable mean and SD of the raw scores; the SD is floored at 0.20.
	var_mean <- apply(d[, watson_vars], 2, mean)
	var_sd <- pmax(apply(d[, watson_vars], 2, sd), 0.20)
	var_stats <- cbind(Mean = var_mean, SD = var_sd)

	# Convert a numeric vector to standardized empirical percentiles.
	#   x: numeric vector
	# Each value is replaced by its empirical CDF value within x (the share of
	# observations <= it), then centred at 0.5 and divided by 0.2887.
	# Returns a numeric vector the same length as x.
	z_scorer <- function(x) {
	  percentile <- ecdf(x)(x)
	  (percentile - 0.5) / 0.2887
	}

	# Standardize x using per-variable statistics.
	#   x: numeric vector, one entry per variable
	#   y: matrix or data frame with columns "Mean" and "SD", one row per
	#      variable; defaults to the global var_stats
	# Returns (x - Mean) / SD.
	f <- function(x, y = var_stats) {
	  centre <- y[, "Mean"]
	  spread <- y[, "SD"]
	  (x - centre) / spread
	}
	# Raw score columns (sixth column onward), converted column by column
	# into standardized empirical percentiles.
	raw_scores <- d[, 6:ncol(d)]
	z_score <- as.data.frame(apply(raw_scores, 2, z_scorer))
	colnames(z_score) <- colnames(raw_scores)

	# Zero out variables which are bad
	# Bad optics, plus the score has little variation

	#z_score = sign(z_score)*pmin(abs(z_score),2.0) %>% as.data.frame

	#z_data=z_score[,1]
	## target = d$Top
	# Fit a no-intercept linear regression of target on a single variable.
	#   z_data: numeric vector of standardized scores for one variable
	#   target: numeric vector of the same length as z_data
	# Returns an unnamed numeric vector c(slope, r_squared).
	# Stops with an error when the two inputs differ in length.
	estimation <- function(z_data, target) {
	  # cbind() would recycle the shorter input (silently when the lengths
	  # divide), producing a fit on misaligned data.
	  if (length(z_data) != length(target)) {
	    stop("z_data and target must have the same length", call. = FALSE)
	  }
	  dat <- as.data.frame(cbind(x = z_data, y = target))
	  # r = glm(y~0+x,data=dat,family = binomial(link="logit"))
	  fit <- lm(y ~ 0 + x, data = dat)
	  # coef() replaces fit$coef, which only resolved through `$` partial
	  # matching of the `coefficients` element.
	  unname(c(coef(fit), summary(fit)$r.squared))
	}

	# Estimation sample: rows flagged in use_in_estimation, with Top recoded
	# to +1 / -1.
	target_var <- d$Top[use_in_estimation == 1] * 2 - 1
	x_vars <- z_score[use_in_estimation == 1, ]

	# One no-intercept regression per variable. Row 1 of the result holds the
	# slopes and row 2 the R-squared values.
	regression_params <- as.data.frame(
	  apply(x_vars, 2, estimation, target = target_var)
	)
	betas <- unlist(regression_params[1, ])
	rsq <- unlist(regression_params[2, ])
	r <- rsq^(1 / 2)

	# Per-person, per-variable prediction: standardized score times the
	# slope for that variable (computed for every row, not just the sample).
	preds <- as.data.frame(
	  t(apply(z_score, 1, function(person) person * betas))
	)

	# Pool every per-variable prediction to get one common location and
	# scale; the pooled SD is multiplied by 1.5.
	stage1_mean <- mean(unlist(preds))
	stage1_sd <- sd(unlist(preds)) * 1.5

	preds_normalized <- as.data.frame(
	  t(apply(preds, 1, function(person) (person - stage1_mean) / stage1_sd))
	)

	# Regression parameters, one row per variable.
	params_regression <- as.data.frame(cbind(betas = betas, rsq = rsq))
	rownames(params_regression) <- names(betas)

	# Overall score: mean of the normalized per-variable predictions.
	total <- rowSums(preds_normalized) / ncol(preds_normalized)
	total_mean <- mean(total)
	total_sd <- sd(total)

	# Scaling parameters collected in a one-row data frame.
	params_scaling <- as.data.frame(cbind(
	  stage1_mean = stage1_mean,
	  stage1_sd = stage1_sd,
	  total_mean = total_mean,
	  total_sd = total_sd
	))

	total_normalized <- (total - total_mean) / total_sd

	# Map z-scores through the normal CDF onto 0-10, rounded to one decimal.
	total_score <- round(pnorm(total_normalized) * 10, 1)
	preds_score <- round(pnorm(as.matrix(preds_normalized)) * 10, 1)

	# Put the identifying columns next to the total and per-variable scores,
	# then keep only rows with Top equal to 1 or -1.
	output_scores <- filter(
	  cbind(
	    d[, c("Id", "Name", "Top", "Department", "WordCount")],
	    Total = total_score,
	    preds_score
	  ),
	  Top %in% c(1, -1)
	)

	# Write the scored rows to the path given as the second CLI argument.
	write.csv(output_scores, file = outputFile)

	# alternative output format, not currently used.
	# ask Bill West for more details
	# write.csv(
	# params_regression[sort.list(params_regression$rsq,decreasing=T),],file="output_params_regression.csv"
	# )
	# write.csv(
	# params_scaling,file="output_params_scaling.csv"
	# )
	Id	Name	Top	Department	WordCount	openness	adventurousness	artistic_interests	emotionality	imagination	intellect	liberalism	conscientiousness	achievement_striving	cautiousness	dutifulness	orderliness	self_discipline	self_efficacy	extraversion	activity_level	assertiveness	cheerfulness	excitement_seeking	friendliness	gregariousness	agreeableness	altruism	cooperation	modesty	morality	sympathy	trust	neuroticism	anger	anxiety	depression	immoderation	self_consciousness	vulnerability	challenge	closeness	curiosity	excitement	harmony	ideal	liberty	love	practicality	self_expression	stability	structure	conservation	openness_to_change	hedonism	self_enhancement	self_transcendence
	7e1416bf-b1db-456e-aec2-3215a384825c	AJ Richichi	-1	Entry-Level Sales	828	0.98	0.668	0.346	0.094	0.434	0.97	0.72	0.875	0.981	0.904	0.421	0.332	0.873	0.971	0.985	0.989	0.988	0.357	0.162	0.427	0.234	0.213	0.565	0.462	0.038	0.538	0.572	0.815	0.963	0.286	0.102	0.434	0.134	0.211	0.057	0.512	0.039	0.574	0.113	0.06	0.269	0.233	0.095	0.497	0.184	0.225	0.761	0.037	0.431	0.019	0.275	0.149