# taxometric_function.R
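###########################################################################################################
#
# Note: the programs below call several helpers -- Remove.Missing, GenData, Skew, and Kurtosis -- that
# this file assumes are defined elsewhere in the full source. GenData (Ruscio's bootstrap routine for
# generating comparison populations) is not sketched here. For the simpler helpers, minimal sketches
# consistent with how the programs below use them might look as follows; these are illustrative
# assumptions, not the author's own definitions.
#
###########################################################################################################
Remove.Missing <- function(Data.Set) # listwise deletion, as described at the call sites below
{
Data.Set <- as.matrix(Data.Set)
Data.Set[complete.cases(Data.Set), , drop = FALSE] # keep only rows with no missing values
}
Skew <- function(x) # conventional third standardized moment (an assumed definition)
mean((x - mean(x))^3) / sd(x)^3
Kurtosis <- function(x) # conventional excess kurtosis (an assumed definition)
mean((x - mean(x))^4) / sd(x)^4 - 3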
###########################################################################################################
#
# MAXEIG -- performs MAXEIG, but can also be used to perform MAXCOV.
#
# Written by John Ruscio
# Last modified January 9, 2012
#
###########################################################################################################
#
# Preliminary program to read research data and submit it for analysis and then, if requested, analyze
# comparison data to test the fit of the averaged curves.
#
###########################################################################################################
MAXEIG <- function(Data.Set, Comp.Data = T, N.Pop = 100000, N.Samples = 100, Supplied.Class = F,
Supplied.P = 0, Ind.Triplets = F, Ind.Comp = F, Intervals = F, N.Int = 15,
Windows = 25, Overlap = .90, Calc.Cov = F, St.Ind = F, Replications = 0,
Gr.Orient = 1, Gr.Rows = 3, Gr.Cols = 2, Gr.Smooth = 0, Gr.Common.Y = F, Gr.Ref = 2,
Gr.Avg = F, Gr.Ind = F, Classify = 1, Gr.Bayes = F, Save.Class = 0, File.Output = F,
File.Name = "Output.txt", Seed = 1)
{
Data.Set <- Remove.Missing(Data.Set) # remove any missing data (listwise deletion)
N <- dim(Data.Set)[1] # determine sample size
I <- dim(Data.Set)[2] # determine number of indicators; adjust if one column contains class assignments
if (Supplied.Class) I <- I - 1 # subtract to allow for class assignment data, if provided
if ((version$major == 1) & (version$minor < 9)) library(eda)
else library(stats) # load R code for running medians smoother
set.seed(Seed) # set random number seed to allow exact replications of results
# automatically fix some potential parameter inconsistencies or omissions
if (Comp.Data) Gr.Avg <- T # show the averaged curve that will be compared to those of comparison data
if ((Comp.Data) & (N.Samples == 1)) N.Samples <- 100 # do not allow a single sample of comparison data
if (Ind.Comp) Ind.Triplets <- T # for this input method, triplets must be used
if (Ind.Comp & (I < 4)) Ind.Comp <- F # cannot form composites using individual variables
if (Calc.Cov) Ind.Triplets <- T # covariances can only be calculated using two outputs
if (Ind.Comp) Classify <- 1 # cannot use Bayes' Theorem with composite input indicators
if (Ind.Comp | !Ind.Triplets) Gr.Ind <- F # cannot average curves for each indicator
if ((Classify == 1) & (Save.Class == 2)) Save.Class <- 1 # cannot return Bayesian prob if not generated
if (Gr.Avg) St.Ind <- T # always standardize indicators if graphs will be averaged
if (St.Ind) Data.Set[,1:I] <- apply(Data.Set[,1:I], 2, scale) # standardize data if requested
# check for tied scores and set number of replications
Ties <- F
for (i in 1:I)
if (length(unique(Data.Set[,i])) < N) Ties <- T
if ((Replications == 0) & (Ties)) Replications <- 10 # default is 10 replications if any tied scores
if (Replications == 0) Replications <- 1
if ((Replications > 1) & (!Ties)) Replications <- 1 # eliminate unnecessary replications
# if output to file is requested, redirect all text output
if (File.Output) sink(File.Name)
# label supplied data and submit to main program for analysis; class assignments will be returned
Research.Results <- MAXEIG.Main(Data.Set, Research.Data = T, Comp.Data, N.Pop, N.Samples,
Supplied.P, Supplied.Class, N, I, Ind.Triplets, Ind.Comp, Intervals,
N.Int, Windows, Overlap, Calc.Cov, St.Ind, Replications,
Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth,Gr.Common.Y, Gr.Ref, Gr.Avg,
Gr.Ind, Classify, Save.Class, Gr.Bayes, File.Output, File.Name)
if (Comp.Data)
{
P.Est.M <- Research.Results[[1]]
P.Est.SD <- Research.Results[[2]]
Class.Assign <- Research.Results[[3]]
MAXEIG.Results <- Research.Results[[4]]
}
else
{
P.Est.M <- Research.Results[[1]]
Class.Assign <- Research.Results[[2]]
}
# if comparison data requested, then generate and submit samples for analysis
if (Comp.Data)
{
# create matrices to store results for each comparison data set
if (Intervals) N.Subsamples <- N.Int
else N.Subsamples <- Windows
Tax.Y.Values <- matrix(nrow = N.Samples, ncol = N.Subsamples)
Dim.Y.Values <- matrix(nrow = N.Samples, ncol = N.Subsamples)
if (Ind.Comp) N.Curves <- (I * (I - 1)) / 2
else if (Ind.Triplets) N.Curves <- (I * (I - 1) * (I - 2)) / 2
else N.Curves <- I
Tax.P.Ests <- matrix(nrow = N.Samples, ncol = N.Curves)
Dim.P.Ests <- matrix(nrow = N.Samples, ncol = N.Curves)
# generate and analyze categorical data
set.seed(Seed)
if (!Supplied.Class) # if class assignments were not provided, generate using base rate
{
Data.Set <- cbind(Data.Set, rep(0, N))
if (Supplied.P == 0) P <- P.Est.M # use estimated P in research data if no P was provided
else P <- Supplied.P
if ((P * N) < 20) P <- 20 / N # ensure at least 20 taxon cases in the generated population
if ((N - (P * N)) < 20) P <- (N - 20) / N # ensure at least 20 complement cases
for (i in 1:I)
Data.Set[, (I + 1)] <- Data.Set[, (I + 1)] + Data.Set[,i]
Data.Set <- Data.Set[sort.list(Data.Set[,(I + 1)]),]
P.Est.Series <- c(rep(2, round(N * P)))
Q.Est.Series <- c(rep(1, N - length(P.Est.Series)))
Data.Set[,(I + 1)] <- c(Q.Est.Series, P.Est.Series)
}
Tax.Data <- Data.Set[(Data.Set[,(I + 1)] == 2), 1:I]
Com.Data <- Data.Set[(Data.Set[,(I + 1)] == 1), 1:I]
P.Data <- dim(Tax.Data)[1] / (dim(Tax.Data)[1] + dim(Com.Data)[1])
cat("\nGenerating and analyzing comparison data\n")
Sim.Tax <- GenData(Tax.Data, round(N.Pop * P.Data))
Sim.Com <- GenData(Com.Data, (N.Pop - round(N.Pop * P.Data)))
Sim.Pop <- rbind(Sim.Tax, Sim.Com)
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))]
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I)))
cat(" Categorical population generated, RMSR r =",round(RMSR,3),"\n")
for (Sims in 1:N.Samples)
{
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),]
MAXEIG.Tax.Results <- MAXEIG.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop, N.Samples,
Supplied.P, Supplied.Class, N, I, Ind.Triplets, Ind.Comp,
Intervals, N.Int, Windows, Overlap, Calc.Cov,
St.Ind, Replications, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth,
Gr.Common.Y, Gr.Ref, Gr.Avg, Gr.Ind, Classify, Save.Class,
Gr.Bayes, File.Output, File.Name)
Tax.Y.Values[Sims,] <- MAXEIG.Tax.Results[[2]]
Tax.P.Ests[Sims,] <- MAXEIG.Tax.Results[[3]]
}
cat(" Analysis of",N.Samples,"samples of categorical comparison data completed\n")
# generate and analyze dimensional samples
set.seed(Seed)
Sim.Pop <- GenData(Data.Set[, 1:I], N.Pop)
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))]
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I)))
cat("\n Dimensional population generated, RMSR r =",round(RMSR,3),"\n")
for (Sims in 1:N.Samples)
{
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),]
MAXEIG.Dim.Results <- MAXEIG.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop, N.Samples,
Supplied.P, Supplied.Class, N, I, Ind.Triplets, Ind.Comp,
Intervals, N.Int, Windows, Overlap, Calc.Cov,
St.Ind, Replications, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth,
Gr.Common.Y, Gr.Ref, Gr.Avg, Gr.Ind, Classify, Save.Class,
Gr.Bayes, File.Output, File.Name)
Dim.Y.Values[Sims,] <- MAXEIG.Dim.Results[[2]]
Dim.P.Ests[Sims,] <- MAXEIG.Dim.Results[[3]]
}
cat(" Analysis of",N.Samples,"samples of dimensional comparison data completed\n")
# summarize base rate estimates for comparison data
P.Est.Summary <- matrix(nrow = (N.Samples + 2), ncol = 4)
for (Sims in 1:N.Samples)
{
P.Est.Summary[Sims,1] <- mean(Tax.P.Ests[Sims,], na.rm = T)
P.Est.Summary[Sims,2] <- sd(Tax.P.Ests[Sims,], na.rm = T)
P.Est.Summary[Sims,3] <- mean(Dim.P.Ests[Sims,], na.rm = T)
P.Est.Summary[Sims,4] <- sd(Dim.P.Ests[Sims,], na.rm = T)
}
for (i in 1:4)
{
P.Est.Summary[Sims + 1,i] <- mean(P.Est.Summary[1:Sims,i])
P.Est.Summary[Sims + 2,i] <- sd(P.Est.Summary[1:Sims,i])
}
Row.Labels <- c(1:Sims,paste("M for",N.Samples,"samples"),paste("SD for",N.Samples,"samples"))
Col.Labels <- c("Categorical M","SD","Dimensional M","SD")
dimnames(P.Est.Summary) <- list(Row.Labels,Col.Labels)
cat("\n\nBase rate estimates for comparison data:\n\n")
if (N.Samples == 1)
{
cat(" Categorical sample: M =",round(P.Est.Summary[1,1],4),", SD =",
round(P.Est.Summary[1,2],4),"\n")
cat(" Dimensional sample: M =",round(P.Est.Summary[1,3],4),", SD =",
round(P.Est.Summary[1,4],4),"\n")
}
else print(round(P.Est.Summary[((N.Samples+1):(N.Samples+2)),],4))
# plot sampling distributions of Ms/SDs of base rate estimates
if (N.Samples > 1)
{
dev.new(width=7, height=3.5)
par(mfrow=c(1,2), omi=c(.25,.25,.25,.25), cex=.75)
Y.M.Max <- max(c(density(P.Est.Summary[1:Sims,1])$y,density(P.Est.Summary[1:Sims,3])$y)) * 1.25
Y.SD.Max <- max(c(density(P.Est.Summary[1:Sims,2])$y,density(P.Est.Summary[1:Sims,4])$y)) * 1.25
plot(density(P.Est.Summary[1:Sims,1], adjust = .75), xlim = c(0,1), ylim = c(0, Y.M.Max),
xlab = "Ms of Base Rate Estimates", ylab = "", main = "", lwd = 2, axes = F)
axis(1)
lines(density(P.Est.Summary[1:Sims,3], adjust = .75), lwd = 2, lty = 2)
abline(v = P.Est.M, lwd = 2, lty = 3)
plot(density(P.Est.Summary[1:Sims,2], adjust = .75), xlim = c(0,.5), ylim = c(0, Y.SD.Max),
xlab = "SDs of Base Rate Estimates", ylab = "", main = "", lwd = 2, axes = F)
axis(1)
lines(density(P.Est.Summary[1:Sims,4], adjust = .75), lwd = 2, lty = 2)
abline(v = P.Est.SD, lwd = 2, lty = 3)
mtext("BASE RATE ESTIMATES FOR COMPARISON DATA",outer=T)
}
# plot averaged curves for comparison data
X.Values <- MAXEIG.Dim.Results[[1]]
X.Range <- c(min(X.Values), max(X.Values))
Tax.Y <- vector("numeric", length(X.Values))
for (i in 1:length(Tax.Y))
Tax.Y[i] <- mean(Tax.Y.Values[,i],na.rm=T)
Dim.Y <- vector("numeric", length(X.Values))
for (i in 1:length(Dim.Y))
Dim.Y[i] <- mean(Dim.Y.Values[,i],na.rm=T)
Y.Values <- c(MAXEIG.Results,Tax.Y.Values,Dim.Y.Values)
Y.Range <- c(min(0, min(Y.Values)), max(Y.Values) * 1.10)
# label the x axis in accord with input method and indicator
if (Intervals) X.Label <- paste(N.Int,"Intervals")
else X.Label <- paste(Windows,"Windows")
# label the y axis in accord with calculation type and output indicators
if (Calc.Cov) Y.Label <- "Covariance"
else Y.Label <- "Eigenvalue"
# 2-panel plot for comparison data
Tax.Sum <- Dim.Sum <- matrix(0, nrow = 6, ncol = length(X.Values))
for (i in 1:length(X.Values))
{
Tax.Sum[,i] <- summary(Tax.Y.Values[,i])
Dim.Sum[,i] <- summary(Dim.Y.Values[,i])
}
dev.new(width=7, height=3.5)
par(mfrow=c(1,2), omi=c(.25,.25,.25,.25), cex=.75)
plot(X.Values, MAXEIG.Results, type = "o", pch = 16, lwd = 2, ylim = Y.Range,
xlim = X.Range, ylab = Y.Label, xlab = X.Label, main = "Categorical Comparison Data")
for (i in 1:(length(X.Values)-1))
polygon(x = c(X.Values[i], X.Values[i], X.Values[i+1], X.Values[i+1]),
y = c(Tax.Sum[2,i], Tax.Sum[5,i], Tax.Sum[5,i+1], Tax.Sum[2,i+1]), col = 8, lty = 0)
lines(X.Values, Tax.Sum[1,])
lines(X.Values, Tax.Sum[6,])
points(X.Values, MAXEIG.Results, type = "o", pch = 16)
lines(X.Values, MAXEIG.Results, lwd = 2)
if ((Gr.Ref == 1) | ((Gr.Ref == 2) & (Y.Range[1] < 0))) abline(h = 0)
plot(X.Values, MAXEIG.Results, type = "o", pch = 16, lwd = 2, ylim = Y.Range,
xlim = X.Range, ylab = Y.Label, xlab = X.Label, main = "Dimensional Comparison Data")
for (i in 1:(length(X.Values)-1))
polygon(x = c(X.Values[i], X.Values[i], X.Values[i+1], X.Values[i+1]),
y = c(Dim.Sum[2,i], Dim.Sum[5,i], Dim.Sum[5,i+1], Dim.Sum[2,i+1]), col = 8, lty = 0)
lines(X.Values, Dim.Sum[1,])
lines(X.Values, Dim.Sum[6,])
points(X.Values, MAXEIG.Results, type = "o", pch = 16)
lines(X.Values, MAXEIG.Results, lwd = 2)
if ((Gr.Ref == 1) | ((Gr.Ref == 2) & (Y.Range[1] < 0))) abline(h = 0)
# calculate and print comparison curve fit index (CCFI)
Fit.RMSR.t <- sqrt(sum((Tax.Y - MAXEIG.Results)^2)/N.Subsamples)
Fit.RMSR.d <- sqrt(sum((Dim.Y - MAXEIG.Results)^2)/N.Subsamples)
CCFI <- Fit.RMSR.d / (Fit.RMSR.d + Fit.RMSR.t)
cat("\n\nComparison curve fit index (CCFI) =",round(Fit.RMSR.d,4),"/ (",round(Fit.RMSR.d,4),
"+",round(Fit.RMSR.t,4),") =",round(CCFI,3),"\n")
cat(" Note: CCFI values can range from 0 (dimensional) to 1 (categorical). The more a CCFI\n")
cat(" value deviates from .50, the stronger the result. When .40 < CCFI < .60,\n")
cat(" this should be interpreted with caution.\n\n")
}
if (File.Output) sink()
if (Save.Class > 0) return(Class.Assign)
}
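###########################################################################################################
#
# Illustrative call (hypothetical data, not part of the original file). With four indicators and the
# defaults shown above, one might run:
#
#   dat <- matrix(rnorm(300 * 4), ncol = 4)   # N = 300 cases, I = 4 indicators
#   classes <- MAXEIG(dat, N.Samples = 100, Save.Class = 1, Seed = 1)
#
# This analyzes the research data, generates and analyzes 100 categorical and 100 dimensional
# comparison samples, prints the CCFI, and returns class assignments because Save.Class > 0.
#
###########################################################################################################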
###########################################################################################################
#
# Main program shows the overall structure through calls to four subsidiary programs
#
###########################################################################################################
MAXEIG.Main <- function(Data, Research.Data, Comp.Data, N.Pop, N.Samples, Supplied.P, Supplied.Class, N, I,
Ind.Triplets, Ind.Comp, Intervals, N.Int, Windows, Overlap, Calc.Cov,
St.Ind, Replications, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y, Gr.Ref,
Gr.Avg, Gr.Ind, Classify, Save.Class, Gr.Bayes, File.Output, File.Name)
{
# set up matrix to store input/output configurations for each curve to be generated
Analysis.Structure <- MAXEIG.Structure(Ind.Triplets, Ind.Comp, I, Intervals)
Curve.Info <- Analysis.Structure[[1]]
Structure <- Analysis.Structure[[2]]
Input.Type <- Analysis.Structure[[3]]
# carry out analyses by calling calculation procedure once per curve
Curves <- list(1)
for (Curve.Num in 1:length(Curve.Info[,1]))
Curves[[Curve.Num]] <- MAXEIG.Calculate(Data[,1:I], N, I, Curve.Info, Curve.Num, Structure, Input.Type,
Ind.Triplets, Ind.Comp, Intervals, N.Int, Windows, Overlap, Calc.Cov, Replications)
if (!Research.Data | Comp.Data)
{
# calculate averaged curve
X.Values <- rep(0,length(Curves[[1]][,1]))
for (i in 1:length(Curve.Info[,1]))
X.Values <- X.Values + Curves[[i]][,1]
X.Values <- X.Values / length(Curve.Info[,1])
Y.Values <- rep(0,length(X.Values))
for (i in 1:length(Curve.Info[,1]))
Y.Values <- Y.Values + Curves[[i]][,3]
Y.Values <- Y.Values / length(Curve.Info[,1])
# calculate base rate estimates
Data.Matrix <- cbind(Data[,1:I], 1:N, rep(0,N), rep(0,N))
N.Curves <- length(Curve.Info[,1])
P.Estimates <- vector("numeric",N.Curves) # store base rate estimates
for (i in 1:N.Curves)
{
if (Structure < 3) Input <- Curve.Info[i,2]
else
{
Data.Matrix[,(I + 2)] <- rep(0, N)
for (j in 1:I)
Data.Matrix[,(I + 2)] <- Data.Matrix[,(I + 2)] + Data.Matrix[,j]
Input <- I + 2
}
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,Input]),]
# determine curve data to use (smoothed or unsmoothed)
if (Gr.Smooth == 0) Curve.Data <- cbind(Curves[[i]][,1], Curves[[i]][,3], Curves[[i]][,2])
if (Gr.Smooth == 1)
{
Lowess.Y <- lowess(Curves[[i]][,1], Curves[[i]][,3])$y
for (j in 1:(length(Curves[[i]][,1])-1))
if (Curves[[i]][j,1] == Curves[[i]][(j + 1),1])
Lowess.Y <- c(Lowess.Y[1:j],Lowess.Y[j],Lowess.Y[(j+1):length(Lowess.Y)])
Lowess.Y <- Lowess.Y[1:length(Curves[[i]][,1])]
Curve.Data <- cbind(Curves[[i]][,1], Lowess.Y, Curves[[i]][,2])
}
if (Gr.Smooth == 2) Curve.Data <- cbind(Curves[[i]][,1], smooth(Curves[[i]][,3]), Curves[[i]][,2])
# estimate hitmax, P
Hitmax.Value <- max(Curve.Data[,2]) # y value at peak
Hitmax.Locs <- (1:(length(Curve.Data[,1])))[Curve.Data[,2] == Hitmax.Value] # peak subsample(s)
Hitmax.Loc <- round(median(Hitmax.Locs))
Hitmax.Case <- trunc(mean(Curves[[i]][Hitmax.Loc,4:5])) # case at midpoint of peak subsample(s)
Ints <- length(Curve.Data[,1])
p <- vector("numeric",Ints)
n <- vector("numeric",Ints)
K <- 4 * Hitmax.Value
for (j in 1:Ints)
{
if ((K ^ 2 - 4 * K * Curve.Data[j,2]) >= 0)
quad1 <- (K + sqrt(K ^ 2 - 4 * K * Curve.Data[j,2])) / ( 2 * K)
else quad1 <- 1
if ((K ^ 2 - 4 * K * Curve.Data[j,2]) >= 0)
quad2 <- (K - sqrt(K ^ 2 - 4 * K * Curve.Data[j,2])) / ( 2 * K)
else quad2 <- 0
if (j < Hitmax.Loc) p[j] <- min(1,quad2)
if (j == Hitmax.Loc) p[j] <- .5
if (j > Hitmax.Loc) p[j] <- max(quad1,0)
if ((Curve.Data[j,2] < 0) & (j < Hitmax.Loc)) p[j] <- 0
if ((Curve.Data[j,2] < 0) & (j > Hitmax.Loc)) p[j] <- 1
n[j] <- Curve.Data[j,3]
}
P.Estimates[i] <- sum(p * n) / sum(n)
}
}
# plot the results
if (Research.Data) P.Avg <- MAXEIG.Plot(I, Curve.Info, Curves, Structure, Input.Type, Ind.Triplets,
Ind.Comp, Intervals, Calc.Cov, Gr.Orient, Gr.Rows, Gr.Cols,
Gr.Smooth, Gr.Common.Y, Gr.Ref, Gr.Avg, Gr.Ind)
# summarize analytic specifications
if (Research.Data) MAXEIG.Summarize(Data[,1:I], N, I, Curve.Info, Curves, Structure, Input.Type, Ind.Triplets,
Ind.Comp, Intervals, N.Int, Windows, Overlap, Calc.Cov, Replications,
Gr.Smooth, Gr.Avg, P.Avg, Classify, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Bayes, Comp.Data,
N.Pop, N.Samples, Supplied.P, Supplied.Class)
# generate and report estimates of latent parameters and, if requested, classify cases
if (Research.Data) Class.Assign <- MAXEIG.P.Est(Data, N, I, Curve.Info, Curves, Structure, Input.Type,
Ind.Triplets, Ind.Comp, Intervals, N.Int, Windows, Overlap,
Calc.Cov, Replications, Gr.Smooth, Gr.Avg, P.Avg, Classify, Gr.Orient, Gr.Rows,
Gr.Cols, Gr.Bayes, Save.Class, Comp.Data, Supplied.P, Supplied.Class)
if (Research.Data & Comp.Data) return(list(Class.Assign[[2]],Class.Assign[[3]],Class.Assign[[1]],Y.Values))
if (Research.Data & !Comp.Data) return(list(Class.Assign[[2]],Class.Assign[[1]]))
if (!Research.Data) return(list(X.Values,Y.Values,P.Estimates))
}
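###########################################################################################################
#
# Note on the base rate estimation above (a reading of the code, for reference): each subsample's
# eigenvalue (or covariance) y is treated as y = K * p * (1 - p), where p is the taxon proportion in
# that subsample and K = 4 * Hitmax.Value, so that y attains its maximum K / 4 when p = .5 at the
# hitmax. Solving this quadratic for p gives
#
#   p = (K +/- sqrt(K^2 - 4 * K * y)) / (2 * K)
#
# The code takes the smaller root (quad2) below the hitmax and the larger root (quad1) above it, and
# the curve's base rate estimate is the n-weighted mean of the subsample p values, sum(p * n) / sum(n).
#
###########################################################################################################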
###########################################################################################################
#
# Program to determine structure of analysis and set up matrix to contain relevant curve information
#
###########################################################################################################
MAXEIG.Structure <- function(Ind.Triplets, Ind.Comp, I, Intervals)
{
# determine structure of the requested analysis:
# 1 = each indicator serves as input once, with all others as output
# 2 = triplets of indicators serve in all (input, output, output) combinations
# 3 = triplets of indicators, with each possible pair removed as outputs, input = sum of others
if (Ind.Comp) Structure <- 3
else if (Ind.Triplets) Structure <- 2
else Structure <- 1
# based on structure, create matrix listing input and/or output indicators to use for each curve
if (Structure == 1)
{
N.Curves <- I
Curve.Info <- matrix(nrow = N.Curves, ncol = 2)
Curve.Info[,1] <- 1:N.Curves
Curve.Info[,2] <- 1:N.Curves
}
if (Structure == 2)
{
if (I == 3) N.Curves <- 3
else N.Curves <- I * (I - 1) * (I - 2) / 2 # one curve per (input, output, output) combination
Curve.Info <- matrix(nrow = N.Curves, ncol = 4)
Curve.Info[,1] <- 1:N.Curves
Counter <- 1
for (Triplet.A in 1:I)
for (Triplet.B in 1:I)
for (Triplet.C in 1:I)
if ((Triplet.A != Triplet.B) & (Triplet.A != Triplet.C) & (Triplet.B < Triplet.C))
{
Curve.Info[Counter,2] <- Triplet.A
Curve.Info[Counter,3] <- Triplet.B
Curve.Info[Counter,4] <- Triplet.C
Counter <- Counter + 1
}
}
if (Structure == 3)
{
N.Curves <- (I * (I - 1)) / 2
Curve.Info <- matrix(nrow = N.Curves, ncol = 3)
Curve.Info[,1] <- 1:N.Curves
Outputs <- 1
for (Output.A in 1:(I - 1))
for (Output.B in (Output.A + 1):I)
{
Curve.Info[Outputs,2] <- Output.A
Curve.Info[Outputs,3] <- Output.B
Outputs <- Outputs + 1
}
}
# determine type of input divisions:
# 1 = equal-sized intervals
# 2 = SD intervals (no longer allowed)
# 3 = overlapping windows
if (Intervals) Input.Type <- 1
else Input.Type <- 3
return(list(Curve.Info, Structure, Input.Type))
}
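###########################################################################################################
#
# Worked example (not part of the original file): for I = 4 indicators, MAXEIG.Structure yields
#
#   Structure 1: 4 curves; Curve.Info[,2] lists the input indicator 1..4 (all others serve as outputs)
#   Structure 2: 4 * 3 * 2 / 2 = 12 curves; rows hold (input, output, output) triplets such as
#                (1,2,3), (1,2,4), (1,3,4), (2,1,3), ... with the two outputs unordered
#   Structure 3: (4 * 3) / 2 = 6 curves, one per output pair; input = sum of the remaining indicators
#
###########################################################################################################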
###########################################################################################################
#
# Program to subdivide the input and calculate eigenvalues (or, if requested, covariances)
#
###########################################################################################################
MAXEIG.Calculate <- function(Data, N, I, Curve.Info, Curve.Num, Structure, Input.Type, Ind.Triplets,
Ind.Comp, Intervals, N.Int, Windows, Overlap, Calc.Cov,
Replications)
{
# make room in data for summed indicator plus random numbers for resorting at each replication
Data.Matrix <- cbind(Data, rep(0,N), rnorm(N, mean = 0, sd = 1))
# determine input column, creating sum of other variables if necessary, and sort by input
if (Structure == 3)
{
Output1 <- Curve.Info[Curve.Num,2]
Output2 <- Curve.Info[Curve.Num,3]
Input <- I + 1
for (i in 1:I)
Data.Matrix[,(I + 1)] <- Data.Matrix[,(I + 1)] + Data.Matrix[,i]
Data.Matrix[,(I + 1)] <- Data.Matrix[,(I + 1)] - Data.Matrix[,Output1] - Data.Matrix[,Output2]
}
else Input <- Curve.Info[Curve.Num, 2]
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,Input]),]
# divide input as appropriate for a given input method
# generate matrix containing case numbers that form endpoints of each subsample
# generate appropriate labels for x axis of graph
if (Input.Type == 1) # create equal-sized intervals; label = mean value within each interval
{
Cuts <- matrix(nrow = 2, ncol = N.Int)
temp <- seq(from = 1, to = N, by = round(N / N.Int))
Cuts[1,] <- temp[1:N.Int]
Cuts[2,] <- c((temp[2:N.Int] - 1),N)
X.Labels <- vector("numeric", N.Int)
for (i in 1:N.Int)
X.Labels[i] <- round(mean(Data.Matrix[Cuts[1,i]:Cuts[2,i],Input]),2)
}
if (Input.Type == 3) # create overlapping windows; label = mean value within each window
{
N.Win <- N / (Windows * (1 - Overlap) + Overlap) # determine total n per window
N.Overlap <- N.Win * Overlap # determine how many cases overlap with adjacent windows
Cuts <- matrix(nrow = 2, ncol = Windows)
Cuts[1,] <- round(seq(from = 1, by = (N.Win - N.Overlap), length = Windows))
Cuts[2,] <- round(seq(from = N.Win, to = N, by = (N.Win - N.Overlap)))
X.Labels <- vector("numeric", Windows)
for (i in 1:Windows)
X.Labels[i] <- round(mean(Data.Matrix[Cuts[1,i]:Cuts[2,i],Input]),2)
}
# create matrix to store curve results
N.Subsamples <- length(X.Labels)
Curve.Results <- matrix(rep(0, N.Subsamples * 5), nrow = N.Subsamples, ncol = 5)
# having set up subsamples along the input and created a storage structure, perform the calculations
# replications loop; results are cumulated, subsequently divided by number of replications to yield average
for (Rep in 1:Replications)
{
# shuffle cases, then sort by input
Data.Matrix <- Data.Matrix[sample(1:N, N, replace = F),]
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,Input]),]
# begin calculations, storing results in either matrix or array as follows:
# 1st column = labels for x axis
# 2nd column = n per interval
# 3rd column = eigenvalues (or covariances)
# 4th, 5th columns = endpoints of each interval (case numbers)
# calculate values for one series of subsamples
Curve.Results[,4] <- Cuts[1,]
Curve.Results[,5] <- Cuts[2,]
Curve.Results[,1] <- X.Labels
if (Calc.Cov) # calculate covariances
{
if (Structure == 2) Outputs <- c(Curve.Info[Curve.Num,3], Curve.Info[Curve.Num,4])
if (Structure == 3) Outputs <- c(Curve.Info[Curve.Num,2], Curve.Info[Curve.Num,3])
for (i in 1:N.Subsamples)
{
Curve.Results[i,2] <- Cuts[2,i] - Cuts[1,i] + 1
Curve.Results[i,3] <- Curve.Results[i,3] + var(Data.Matrix[Cuts[1,i]:Cuts[2,i],Outputs])[2]
}
}
else # calculate eigenvalues
{
if (Structure == 1)
{
Outputs <- c(1:I)
Outputs <- Outputs[Outputs != Input]
}
if (Structure == 2) Outputs <- c(Curve.Info[Curve.Num,3], Curve.Info[Curve.Num,4])
if (Structure == 3) Outputs <- c(Curve.Info[Curve.Num,2], Curve.Info[Curve.Num,3])
for (i in 1:N.Subsamples)
{
Curve.Results[i,2] <- Cuts[2,i] - Cuts[1,i] + 1
Cov.Matrix <- var(Data.Matrix[Cuts[1,i]:Cuts[2,i],Outputs])
Data.No.Diag <- Cov.Matrix - diag(diag(Cov.Matrix))
Curve.Results[i,3] <- Curve.Results[i,3] + eigen(Data.No.Diag)$values[1]
}
}
}
# divide cumulated covariances or eigenvalues by number of replications to yield average values
Curve.Results[,3] <- Curve.Results[,3] / Replications
return(Curve.Results)
}
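###########################################################################################################
#
# Worked example of the overlapping-windows arithmetic above (not part of the original file): with
# N = 600 cases, Windows = 25, and Overlap = .90,
#
#   N.Win     = 600 / (25 * (1 - .90) + .90) = 600 / 3.4 = 176.5 cases per window
#   N.Overlap = 176.5 * .90 = 158.8 cases shared with each adjacent window
#
# so consecutive windows begin about N.Win - N.Overlap = 17.6 cases apart and the 25 rounded windows
# jointly span all 600 sorted cases.
#
###########################################################################################################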
###########################################################################################################
#
# Program to plot and label a panel of curves
#
###########################################################################################################
MAXEIG.Plot <- function(I, Curve.Info, Curves, Structure, Input.Type, Ind.Triplets, Ind.Comp, Intervals,
Calc.Cov, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y,
Gr.Ref, Gr.Avg, Gr.Ind)
{
P.Avg <- 0
# create and label new graph sheet
if (Gr.Orient == 1) dev.new(width=8.5, height=11)
else dev.new(width=11, height=8.5)
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25))
Page <- 1
N.Curves <- length(Curve.Info[,1])
# if applying a common Y scale for all graphs, set the range here (algorithm accentuates peak/flatness)
if (Gr.Common.Y)
{
Ys <- 0
for (i in 1:N.Curves)
Ys <- c(Ys, Curves[[i]][,3])
Y.Min <- min(Ys[2:length(Ys)])
Y.Max <- max(Ys[2:length(Ys)])
Y.M <- mean(Ys[2:length(Ys)])
Y.SD <- sd(Ys[2:length(Ys)])
Y.Range <- c(min(0, Y.Min), max(((Y.Max / Y.SD) * (Y.M / 1.5)), (Y.Max * 1.10)))
}
# plot each curve
for (i in 1:N.Curves)
{
if (Structure == 3) Input <- I + 1
else Input <- Curve.Info[i,2]
if (Structure == 1)
{
Outputs <- c(1:I)
Outputs <- Outputs[Outputs != Input]
}
if (Structure == 2) Outputs <- c(Curve.Info[i,3], Curve.Info[i,4])
if (Structure == 3) Outputs <- c(Curve.Info[i,2], Curve.Info[i,3])
X.Values <- Curves[[i]][,1]
Y.Values <- Curves[[i]][,3]
X.Range <- c(min(X.Values), max(X.Values))
if (!Gr.Common.Y)
Y.Range <- c(min(0, min(Y.Values)), max(((max(Y.Values) / sd(Y.Values))
* (mean(Y.Values) / 1.5)), max(Y.Values) * 1.10))
# label the x axis in accord with input method and indicator
if (Input.Type == 1) X.Label <- paste(length(X.Values),"Intervals, Indicator",Input)
if (Input.Type == 3) X.Label <- paste(length(X.Values),"Windows, Indicator",Input)
if (Ind.Comp)
{
if (Input.Type == 1) X.Label <- paste(length(X.Values),"Intervals")
if (Input.Type == 3) X.Label <- paste(length(X.Values),"Windows")
}
# label the y axis in accord with calculation type and output indicators
if (Calc.Cov) Y.Label <- paste("Covariance, Ind",Outputs[1],"and",Outputs[2])
else if (Ind.Triplets) Y.Label <- paste("Eigenvalue, Ind",Outputs[1],"and",Outputs[2])
else Y.Label <- "Eigenvalue, All Other Vars"
# check to see whether new graph sheet needs to be created
if ((i > 1) & ((i - 1) %% (Gr.Rows * Gr.Cols) == 0))
{
if (Calc.Cov) mtext(paste("MAXCOV CURVES, page ",Page,sep=""),outer=T)
else mtext(paste("MAXEIG CURVES, page ",Page,sep=""),outer=T)
if (Gr.Orient == 1) dev.new(width=8.5, height=11)
else dev.new(width=11, height=8.5)
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25))
Page <- Page + 1
}
# plot the graph, plus smoothed curve and/or reference line at 0 if requested
plot(X.Values, Y.Values, type = "p", pch = 16, ylim = Y.Range, xlim = X.Range,
ylab = Y.Label, xlab = X.Label)
if (Gr.Smooth == 0) # no smoothing, connect "raw" data points
lines(X.Values, Y.Values) # solid line for raw data only
else
lines(X.Values, Y.Values, lty = 3) # dotted line for raw data, solid for smoothed
if (Gr.Smooth == 1) # LOWESS smoother
lines(lowess(X.Values, Y.Values))
if (Gr.Smooth == 2) # running medians smoother
lines(X.Values,smooth(Y.Values))
if ((Gr.Ref == 1) | ((Gr.Ref == 2) & (Y.Range[1] < 0))) abline(h = 0)
}
if (Calc.Cov) mtext(paste("MAXCOV CURVES, page ",Page,sep=""),outer=T)
else mtext(paste("MAXEIG CURVES, page ",Page,sep=""),outer=T)
# plot curves averaged for each indicator
if (Gr.Ind)
{
# create a new graph sheet
if (Gr.Orient == 1) dev.new(width=8.5, height=11)
else dev.new(width=11, height=8.5)
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25))
Page <- 1
# average y values for curves using the same input indicator
N.Subsamples <- length(Curves[[1]][,1])
Ind.Graphs <- matrix(nrow = I, ncol = N.Subsamples)
N.Each.Ind <- N.Curves / I
for (i in 1:I)
{
Ind.Curves <- ((i - 1) * N.Each.Ind + 1):(i * N.Each.Ind)
Ys <- vector("numeric",N.Subsamples)
for (j in 1:N.Subsamples)
for (k in Ind.Curves)
Ys[j] <- Ys[j] + Curves[[k]][j,3]
Ind.Graphs[i,] <- Ys / N.Each.Ind
}
# plot a curve for each indicator
for (i in 1:I)
{
Input <- i
X.Values <- Curves[[i * N.Each.Ind]][,1]
X.Range <- c(min(X.Values), max(X.Values))
Ind.Curves <- ((i - 1) * N.Each.Ind + 1):(i * N.Each.Ind)
Y.Values <- Ind.Graphs[i,]
for (j in Ind.Curves)
Y.Values <- c(Y.Values,Curves[[j]][,3])
Y.Range <- c(min(0, min(Y.Values)), max(((max(Y.Values) / sd(Y.Values))
* (mean(Y.Values) / 1.5)), max(Y.Values) * 1.10))
# label the x axis in accord with input method and indicator
if (Input.Type == 1) X.Label <- paste(length(X.Values),"Intervals, Indicator",Input)
if (Input.Type == 3) X.Label <- paste(length(X.Values),"Windows, Indicator",Input)
# label the y axis in accord with calculation type
if (Calc.Cov) Y.Label <- "Covariance"
else Y.Label <- "Eigenvalue"
# check to see whether new graph sheet needs to be created
if ((i > 1) & ((i - 1) %% (Gr.Rows * Gr.Cols) == 0))
{
if (Calc.Cov) mtext(paste("MAXCOV INDICATOR CURVES, page ",Page,sep=""),outer=T)
else mtext(paste("MAXEIG INDICATOR CURVES, page ",Page,sep=""),outer=T)
if (Gr.Orient == 1) dev.new(width=8.5, height=11)
else dev.new(width=11, height=8.5)
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25))
Page <- Page + 1
}
# plot the averaged graph, with dotted lines for each individual curve
plot(X.Values, Ind.Graphs[i,], type = "l", pch = 16, ylim = Y.Range, xlim = X.Range,
ylab = Y.Label, xlab = X.Label, lwd = 2)
if ((Gr.Ref == 1) | ((Gr.Ref == 2) & (Y.Range[1] < 0))) abline(h = 0)
for (j in Ind.Curves)
lines(Curves[[j]][,1],Curves[[j]][,3], lty = 3, lwd = 2)
}
if (Calc.Cov) mtext(paste("MAXCOV INDICATOR CURVES, page ",Page,sep=""),outer=T)
else mtext(paste("MAXEIG INDICATOR CURVES, page ",Page,sep=""),outer=T)
}
if (Gr.Avg)
{
# create and label new graph sheet
dev.new(width=3.7, height=3.5)
par(omi=c(.25,.25,.25,.25), cex = .75)
X.Values <- rep(0,length(Curves[[1]][,1]))
for (i in 1:length(Curve.Info[,1]))
X.Values <- X.Values + Curves[[i]][,1]
X.Values <- X.Values / length(Curve.Info[,1])
Y.Values <- rep(0,length(X.Values))
for (i in 1:length(Curve.Info[,1]))
Y.Values <- Y.Values + Curves[[i]][,3]
Y.Values <- Y.Values / length(Curve.Info[,1])
Hitmax.Value <- max(Y.Values) # y value at peak
Hitmax.Locs <- (1:(length(Y.Values)))[Y.Values == Hitmax.Value] # peak subsample(s)
Hitmax.Loc <- round(median(Hitmax.Locs))
Hitmax.Case <- trunc(mean(Curves[[1]][Hitmax.Loc,4:5])) # case at midpoint of peak subsample(s)
Ints <- length(X.Values)
p <- vector("numeric",Ints)
n <- vector("numeric",Ints)
K <- 4 * Hitmax.Value
for (j in 1:Ints)
{
if ((K ^ 2 - 4 * K * Y.Values[j]) >= 0)
quad1 <- (K + sqrt(K ^ 2 - 4 * K * Y.Values[j])) / ( 2 * K)
else quad1 <- 1
if ((K ^ 2 - 4 * K * Y.Values[j]) >= 0)
quad2 <- (K - sqrt(K ^ 2 - 4 * K * Y.Values[j])) / ( 2 * K)
else quad2 <- 0
if (j < Hitmax.Loc) p[j] <- min(1,quad2)
if (j == Hitmax.Loc) p[j] <- .5
if (j > Hitmax.Loc) p[j] <- max(quad1,0)
}
P.Avg <- mean(p)
X.Range <- c(min(X.Values), max(X.Values))
Y.Values.Avg <- Y.Values
for (i in 1:N.Curves)
Y.Values <- c(Y.Values,Curves[[i]][,3])
Y.Range <- c(min(0, min(Y.Values)), max(((max(Y.Values) / sd(Y.Values))
* (mean(Y.Values) / 1.5)), max(Y.Values) * 1.10))
# label the x axis in accord with input method and indicator
if (Input.Type == 1) X.Label <- paste(length(X.Values),"Intervals")
if (Input.Type == 3) X.Label <- paste(length(X.Values),"Windows")
# label the y axis in accord with calculation type and output indicators
if (Calc.Cov) Y.Label <- "Covariance"
else Y.Label <- "Eigenvalue"
# plot the averaged graph, with dotted lines for each individual curve
if (Calc.Cov) Title <- "Averaged MAXCOV Curve"
else Title <- "Averaged MAXEIG Curve"
plot(X.Values, Y.Values.Avg, type = "l", pch = 16, ylim = Y.Range, xlim = X.Range,
ylab = Y.Label, xlab = X.Label, main = Title, lwd = 2)
if ((Gr.Ref == 1) | ((Gr.Ref == 2) & (Y.Range[1] < 0))) abline(h = 0)
for (i in 1:N.Curves)
lines(Curves[[i]][,1],Curves[[i]][,3], lty = 3, lwd = 2)
}
if (Gr.Avg) return(P.Avg)
}
###########################################################################################################
#
# Program to produce text summary (data parameters, analytic overview)
#
###########################################################################################################
MAXEIG.Summarize <- function(Data, N, I, Curve.Info, Curves, Structure, Input.Type, Ind.Triplets, Ind.Comp,
Intervals, N.Int, Windows, Overlap, Calc.Cov, Replications, Gr.Smooth, Gr.Avg,
P.Avg, Classify, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Bayes, Comp.Data, N.Pop, N.Samples,
Supplied.P, Supplied.Class)
{
# report relevant analytic information and program specifications
if (Calc.Cov) cat("\n\nSUMMARY OF MAXCOV ANALYTIC SPECIFICATIONS\n")
else cat("\n\nSUMMARY OF MAXEIG ANALYTIC SPECIFICATIONS\n")
cat("\nSample size: ",N,"\n")
cat("Number of indicator variables: ",I,"\n")
cat("Replications: ",Replications,"\n")
if (Intervals) Subsample.Label <- paste(N.Int,"intervals")
if (!Intervals) Subsample.Label <- paste(Windows,"windows with",Overlap,"overlap")
cat("Subsamples: ",Subsample.Label,"\n")
if (!Intervals) cat("n per window at",Windows,"windows: ", round(N /
(Windows * (1 - Overlap) + Overlap)),"\n")
if (Input.Type == 1) cat("n per interval: ",round(N / N.Int),"\n")
if (Ind.Triplets & !Ind.Comp) Ind.Label <-
"Variables serve in all possible (input, output, output) triplets"
if (Ind.Triplets & Ind.Comp) Ind.Label <-
paste("Two variables at a time serve as outputs, input = sum of remaining",(I-2),"variables")
if (!Ind.Triplets) Ind.Label <- "Each variable serves once as input, with all other variables as outputs"
cat("Indicators: ",Ind.Label,"\n")
cat("Total number of curves: ",length(Curve.Info[,1]),"\n")
if (Gr.Smooth == 0) cat("Y values smoothed for graphing and estimation: No\n")
if (Gr.Smooth == 1) cat("Y values smoothed for graphing and estimation: Yes, LOWESS method\n")
if (Gr.Smooth == 2) cat("Y values smoothed for graphing and estimation: Yes, running medians\n")
if (Calc.Cov) cat("Base rate estimation: General covariance mixture theorem\n")
else cat("Base rate estimation: Adapted general covariance mixture theorem\n")
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 1))
cat("Classification of cases: Cut total score at estimated base rate\n")
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 2))
cat("Classification of cases: Bayes' Theorem\n")
if (Supplied.P > 0) cat("Classification of cases: Cut total score at supplied base rate\n")
if (Supplied.Class) cat("Classification of cases: Supplied with data\n")
if (Comp.Data)
{
cat("Size of finite populations of comparison data: ",N.Pop,"\n")
cat("Number of samples of comparison data drawn from each population: ",N.Samples,"\n")
}
cat("\n")
}
###########################################################################################################
#
# Function to generate and report estimates of the taxon base rate and other latent parameters
#
###########################################################################################################
MAXEIG.P.Est <- function(Data, N, I, Curve.Info, Curves, Structure, Input.Type, Ind.Triplets, Ind.Comp,
Intervals, N.Int, Windows, Overlap, Calc.Cov, Replications, Gr.Smooth, Gr.Avg,
P.Avg, Classify, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Bayes, Save.Class, Comp.Data, Supplied.P,
Supplied.Class)
{
if (Calc.Cov) cat("\n\nSUMMARY OF MAXCOV PARAMETER ESTIMATES\n")
else cat("\n\nSUMMARY OF MAXEIG PARAMETER ESTIMATES\n")
# estimate input hitmax and base rate estimate for each curve
Data.Matrix <- cbind(Data[,1:I], 1:N, rep(0,N), rep(0,N)) # data plus case identifiers (for sorting into original
# form), summed indicators, class assignments
N.Curves <- length(Curve.Info[,1])
Curve.Est <- matrix(nrow = N.Curves, ncol = 2) # set up matrix to hold hitmax and base rate estimates
# Estimate P via general covariance mixture theorem (whether doing MAXCOV or MAXEIG)
VP.FP <- matrix(nrow = N.Curves, ncol = 2) # set up matrices to hold VP, FP estimates
for (i in 1:N.Curves)
{
if (Structure < 3) Input <- Curve.Info[i,2]
else
{
Data.Matrix[,(I + 2)] <- rep(0, N)
for (j in 1:I)
Data.Matrix[,(I + 2)] <- Data.Matrix[,(I + 2)] + Data.Matrix[,j]
Input <- I + 2
}
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,Input]),]
# determine curve data to use (smoothed or unsmoothed)
if (Gr.Smooth == 0) Curve.Data <- cbind(Curves[[i]][,1], Curves[[i]][,3], Curves[[i]][,2])
if (Gr.Smooth == 1)
{
Lowess.Y <- lowess(Curves[[i]][,1], Curves[[i]][,3])$y
for (j in 1:(length(Curves[[i]][,1])-1))
if (Curves[[i]][j,1] == Curves[[i]][(j + 1),1])
Lowess.Y <- c(Lowess.Y[1:j],Lowess.Y[j],Lowess.Y[(j+1):length(Lowess.Y)])
Lowess.Y <- Lowess.Y[1:length(Curves[[i]][,1])]
Curve.Data <- cbind(Curves[[i]][,1], Lowess.Y, Curves[[i]][,2])
}
if (Gr.Smooth == 2) Curve.Data <- cbind(Curves[[i]][,1], smooth(Curves[[i]][,3]), Curves[[i]][,2])
# estimate hitmax, P, VP, FP
Hitmax.Value <- max(Curve.Data[,2]) # y value at peak
Hitmax.Locs <- (1:(length(Curve.Data[,1])))[Curve.Data[,2] == Hitmax.Value] # peak subsample(s)
Hitmax.Loc <- round(median(Hitmax.Locs))
Hitmax.Case <- trunc(mean(Curves[[i]][Hitmax.Loc,4:5])) # case at midpoint of peak subsample(s)
Ints <- length(Curve.Data[,1])
p <- vector("numeric",Ints)
n <- vector("numeric",Ints)
K <- 4 * Hitmax.Value
VP <- 0
FP <- 0
for (j in 1:Ints)
{
if ((K ^ 2 - 4 * K * Curve.Data[j,2]) >= 0)
quad1 <- (K + sqrt(K ^ 2 - 4 * K * Curve.Data[j,2])) / ( 2 * K)
else quad1 <- 1
if ((K ^ 2 - 4 * K * Curve.Data[j,2]) >= 0)
quad2 <- (K - sqrt(K ^ 2 - 4 * K * Curve.Data[j,2])) / ( 2 * K)
else quad2 <- 0
if (j < Hitmax.Loc) p[j] <- min(1,quad2)
if (j == Hitmax.Loc) p[j] <- .5
if (j > Hitmax.Loc) p[j] <- max(quad1,0)
if ((Curve.Data[j,2] < 0) & (j < Hitmax.Loc)) p[j] <- 0
if ((Curve.Data[j,2] < 0) & (j > Hitmax.Loc)) p[j] <- 1
n[j] <- Curve.Data[j,3]
if (Structure < 3)
{
if (j >= Hitmax.Loc) VP <- VP + p[j] * n[j]
if (j >= Hitmax.Loc) FP <- FP + (1 - p[j]) * n[j]
}
}
Curve.Est[i,1] <- Data.Matrix[Hitmax.Case,Input]
Curve.Est[i,2] <- sum(p * n) / sum(n)
VP.FP[i,1] <- VP / sum(p * n)
VP.FP[i,2] <- FP / sum((1 - p) * n)
}
Row.Labels <- paste("Curve",1:N.Curves)
Col.Labels <- c("Hitmax","P")
dimnames(Curve.Est) <- list(Row.Labels, Col.Labels)
cat("\nEstimated hitmax values and taxon base rates for each curve:\n\n")
print(round(Curve.Est,3))
P.Est.M <- mean(Curve.Est[,2], na.rm = T)
P.Est.SD <- sd(Curve.Est[,2], na.rm = T)
cat("\nSummary of base rate estimates across curves:\n M =",round(P.Est.M,3),
"\n SD =",round(P.Est.SD,3),"\n\n")
# if more than one curve per indicator, gather hitmax, P, VP, FP estimates for each indicator
if (Structure < 3)
{
Ind.Est <- matrix(nrow = I, ncol = 4)
N.Each.Ind <- N.Curves / I
for (i in 1:I)
{
Ind.Curves <- ((i - 1) * N.Each.Ind + 1):(i * N.Each.Ind)
Ind.Est[i,1] <- median(Curve.Est[Ind.Curves,1], na.rm = T)
Ind.Est[i,2] <- mean(Curve.Est[Ind.Curves,2], na.rm = T)
Ind.Est[i,3] <- mean(VP.FP[Ind.Curves,1], na.rm = T)
Ind.Est[i,4] <- mean(VP.FP[Ind.Curves,2], na.rm = T)
}
if ((Structure == 2) & (I > 3))
{
Row.Labels <- paste("Indicator",1:I)
Col.Labels <- c("Hitmax","P","VP","FP")
dimnames(Ind.Est) <- list(Row.Labels, Col.Labels)
cat("\nEstimated hitmax, base rate, VP, FP estimates for each indicator:\n\n")
print(round(Ind.Est,9))
cat("\nSummary of base rate estimates across indicators:\n M =",
round(mean(Ind.Est[,2], na.rm = T),3), "\n SD =",
round(sd(Ind.Est[,2], na.rm = T),3),"\n\n")
}
if ((Structure == 1) | (I == 3))
{
Row.Labels <- paste("Indicator",1:I)
Col.Labels <- c("Hitmax","P","VP","FP")
dimnames(Ind.Est) <- list(Row.Labels, Col.Labels)
cat("\nEstimated VP, FP values at each indicator's hitmax cut:\n\n")
print(round(Ind.Est[,3:4],9))
cat("\n\n")
}
}
# print base rate estimate for averaged curve(s) if appropriate
if (Gr.Avg) cat("\nBase rate estimate for averaged curve =",round(P.Avg, 3),"\n\n")
# assign cases to classes
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 1)) # use estimated base rate to classify
{
for (i in 1:I)
Data.Matrix[, (I + 2)] <- Data.Matrix[, (I + 2)] + Data.Matrix[,i]
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 2)]),]
P.Est.Series <- c(rep(2, trunc(N * P.Est.M + .5)))
Q.Est.Series <- c(rep(1, N - length(P.Est.Series)))
Data.Matrix[,(I + 3)] <- c(Q.Est.Series, P.Est.Series)
}
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 2)) # use Bayes' Theorem to classify
{
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 1)]),]
P.Term <- rep(P.Est.M,N)
Q.Term <- rep(1 - P.Est.M,N)
for (i in 1:I)
if ((Ind.Est[i,3] < .9999) & (Ind.Est[i,4] < .9999))
{
P.Term <- (Data.Matrix[,i] >= Ind.Est[i,1]) * P.Term * Ind.Est[i,3] +
(Data.Matrix[,i] < Ind.Est[i,1]) * P.Term * (1 - Ind.Est[i,3])
Q.Term <- (Data.Matrix[,i] >= Ind.Est[i,1]) * Q.Term * Ind.Est[i,4] +
(Data.Matrix[,i] < Ind.Est[i,1]) * Q.Term * (1 - Ind.Est[i,4])
}
Bayes.Prob <- P.Term / (P.Term + Q.Term)
Data.Matrix[,(I + 3)] <- (Bayes.Prob >= .50) + 1
if (Gr.Bayes)
{
if (Gr.Orient == 1) dev.new(width=8.5, height=11)
else dev.new(width=11, height=8.5)
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25))
hist(Bayes.Prob, breaks = seq(from = 0, to = 1, by = .10), xlab = "Prob(Taxon Member)",
ylab = "Frequency", main="")
mtext("DISTRIBUTION OF BAYESIAN PROBABILITIES", outer=T)
}
}
if (Supplied.P > 0) # use supplied base rate to classify
{
for (i in 1:I)
Data.Matrix[, (I + 2)] <- Data.Matrix[, (I + 2)] + Data.Matrix[,i]
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 2)]),]
P.Est.Series <- c(rep(2, trunc(N * Supplied.P + .5)))
Q.Est.Series <- c(rep(1, N - length(P.Est.Series)))
Data.Matrix[,(I + 3)] <- c(Q.Est.Series, P.Est.Series)
}
if (Supplied.Class) # use supplied classification
Data.Matrix <- cbind(Data[,1:I], (1:N), rep(0,N), Data[, (I + 1)])
# Construct full-sample and within-groups data files
Data.f <- as.matrix(Data.Matrix[,(1:I)])
Nt <- sum(Data.Matrix[,(I + 3)]) - N
Nc <- N - Nt
Data.t <- Data.Matrix[(Data.Matrix[,(I + 3)] == 2),1:I]
Data.c <- Data.Matrix[(Data.Matrix[,(I + 3)] == 1),1:I]
for (i in 1:I)
{
if (min(Data.t[,i]) == max(Data.t[,i])) Data.t[,i] <- Data.t[,i] + rnorm(Nt, 0, .0001)
if (min(Data.c[,i]) == max(Data.c[,i])) Data.c[,i] <- Data.c[,i] + rnorm(Nc, 0, .0001)
}
# Calculate distributions for each indicator (full sample, taxon, complement)
Dist.Full <- matrix(nrow = I + 2, ncol = 4)
Dist.Tax <- matrix(nrow = I + 2, ncol = 4)
Dist.Comp <- matrix(nrow = I + 2, ncol = 4)
Rows <- c(paste("Ind",1:I), "M", "SD")
Cols <- c("M","SD","Skew","Kurtosis")
dimnames(Dist.Full) <- list(Rows, Cols)
dimnames(Dist.Tax) <- list(Rows, Cols)
dimnames(Dist.Comp) <- list(Rows, Cols)
for (i in 1:I)
{
Dist.Full[i,1] <- mean(Data.f[,i])
Dist.Tax[i,1] <- mean(Data.t[,i])
Dist.Comp[i,1] <- mean(Data.c[,i])
Dist.Full[i,2] <- sd(Data.f[,i])
Dist.Tax[i,2] <- sd(Data.t[,i])
Dist.Comp[i,2] <- sd(Data.c[,i])
Dist.Full[i,3] <- Skew(Data.f[,i])
Dist.Tax[i,3] <- Skew(Data.t[,i])
Dist.Comp[i,3] <- Skew(Data.c[,i])
Dist.Full[i,4] <- Kurtosis(Data.f[,i])
Dist.Tax[i,4] <- Kurtosis(Data.t[,i])
Dist.Comp[i,4] <- Kurtosis(Data.c[,i])
}
for (i in 1:4)
{
Dist.Full[I + 1,i] <- mean(Dist.Full[1:I,i])
Dist.Tax[I + 1,i] <- mean(Dist.Tax[1:I,i])
Dist.Comp[I + 1,i] <- mean(Dist.Comp[1:I,i])
Dist.Full[I + 2,i] <- sd(Dist.Full[1:I,i])
Dist.Tax[I + 2,i] <- sd(Dist.Tax[1:I,i])
Dist.Comp[I + 2,i] <- sd(Dist.Comp[1:I,i])
}
cat("\nIndicator distributions in the full sample ( N =",N,"):\n\n")
print(round(Dist.Full,3))
cat("\nIndicator distributions in the taxon ( n =",Nt,"):\n\n")
print(round(Dist.Tax,3))
cat("\nIndicator distributions in the complement ( n =",Nc,"):\n\n")
print(round(Dist.Comp,3))
# calculate validity for each indicator
Ind.Val <- matrix(nrow = I + 2, ncol = 2)
Rows <- c(paste("Ind",1:I), "M", "SD")
Cols <- c("Raw Units","Cohen's d")
dimnames(Ind.Val) <- list(Rows, Cols)
for (i in 1:I)
{
Ind.Val[i,1] <- Dist.Tax[i,1] - Dist.Comp[i,1]
Ind.Val[i,2] <- Ind.Val[i,1] / sqrt(((Nt - 1) * Dist.Tax[i,2]^2 + (Nc - 1) * Dist.Comp[i,2]^2)
/ (Nt + Nc - 2))
}
Ind.Val[I + 1,1] <- mean(Ind.Val[1:I,1])
Ind.Val[I + 1,2] <- mean(Ind.Val[1:I,2])
Ind.Val[I + 2,1] <- sd(Ind.Val[1:I,1])
Ind.Val[I + 2,2] <- sd(Ind.Val[1:I,2])
cat("\nBetween-group validity:\n\n")
print(round(Ind.Val,3))
# Calculate, print, and summarize full-sample and within-group correlations
Cor.Matrix <- cor(Data.f[,1:I])
Rows <- paste("Ind",1:I)
Cols <- paste("Ind",1:I)
dimnames(Cor.Matrix) <- list(Rows, Cols)
cat("\nIndicator correlations:\n\nFull Sample ( N =",N,"):\n\n")
print(round(Cor.Matrix,3))
Cor.Taxon <- cor(Data.t)
Cor.Comp <- cor(Data.c)
dimnames(Cor.Taxon) <- list(Rows, Cols)
dimnames(Cor.Comp) <- list(Rows, Cols)
cat("\nTaxon ( n =",Nt,"):\n\n")
print(round(Cor.Taxon,3))
cat("\nComplement ( n =",Nc,"):\n\n")
print(round(Cor.Comp,3))
Cor.Summary <- matrix(nrow = 3, ncol = 2)
dimnames(Cor.Summary) <- list(c("Full Sample","Taxon", "Complement"), c("M", "SD"))
Cor.Summary[1,1] <- mean(Cor.Matrix[lower.tri(Cor.Matrix)])
Cor.Summary[2,1] <- mean(Cor.Taxon[lower.tri(Cor.Taxon)])
Cor.Summary[3,1] <- mean(Cor.Comp[lower.tri(Cor.Comp)])
Cor.Summary[1,2] <- sd(Cor.Matrix[lower.tri(Cor.Matrix)])
Cor.Summary[2,2] <- sd(Cor.Taxon[lower.tri(Cor.Taxon)])
Cor.Summary[3,2] <- sd(Cor.Comp[lower.tri(Cor.Comp)])
cat("\nSummary of indicator correlations:\n\n")
print(round(Cor.Summary,3))
cat("\n")
# return list of class assignments and mean base rate estimate (after resorting by original case numbers)
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 1)]),]
if (Save.Class < 2) return(list(Data.Matrix[,(I + 3)],P.Est.M,P.Est.SD))
else return(list(round(Bayes.Prob,9),P.Est.M,P.Est.SD))
}
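###########################################################################################################
#
# Note on the Bayesian classification above (a reading of the code, for reference): each indicator is
# dichotomized at its hitmax cut, VP and FP serve as P(score >= hitmax | taxon) and
# P(score >= hitmax | complement), and the indicator likelihoods are multiplied into the prior base
# rate under an assumption of independence:
#
#   Prob(taxon | scores) = P.Term / (P.Term + Q.Term)
#
# Cases with Bayes.Prob >= .50 are assigned to the taxon (class code 2), the rest to the complement.
#
###########################################################################################################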
###########################################################################################################
#
# MAXSLOPE
#
# Written by John Ruscio
# Last modified January 9, 2012
#
###########################################################################################################
#
# Preliminary program to read data, call for comparison data if/as requested, submit each data set to
# main program along with an identifying label.
#
###########################################################################################################
MAXSLOPE <- function(Data.Set, Comp.Data = T, N.Pop = 100000, N.Samples = 100, Supplied.Class = F,
Supplied.P = 0, File.Output = F, File.Name = "Output.txt", Seed = 1)
{
Data.Set <- Remove.Missing(Data.Set) # remove any missing data (listwise deletion)
N <- dim(Data.Set)[1] # determine sample size
I <- dim(Data.Set)[2] # determine number of indicators; adjust if one column contains class assignments
if (Supplied.Class) I <- I - 1
if (I > 2)
{
cat("*** More than two indicators were submitted for analysis; only the first two indicators will be used ***\n\n")
if (Supplied.Class) Data.Set <- Data.Set[,c(1,2,(I + 1))]
else Data.Set <- Data.Set[,1:2]
I <- 2
}
Data.Set[,1:2] <- apply(Data.Set[,1:2], 2, scale) # standardize data
set.seed(Seed) # set random number seed to allow exact replications of results
if ((Comp.Data) & (N.Samples == 1)) N.Samples <- 100 # do not allow a single sample of comparison data
# if output to file is requested, redirect all text output
if (File.Output) sink(File.Name)
Research.Results <- MAXSLOPE.Main(Data.Set, Research.Data = T, Comp.Data, N.Pop, N.Samples, N,
Supplied.P, Supplied.Class)
Xs <- Research.Results[[1]]
Slopes <- Research.Results[[2]]
P.Est <- Research.Results[[3]]
# if comparison data were requested, then generate and submit data for analysis
if (Comp.Data)
{
# create matrices to store results for comparison
Tax.Slopes <- Dim.Slopes <- matrix(0, nrow = N.Samples, ncol = N - 1)
# generate and analyze categorical data
set.seed(Seed)
if (!Supplied.Class) # if class assignments were not provided, generate using base rate
{
Data.Set <- cbind(Data.Set, rep(0, N))
if (Supplied.P == 0) P <- P.Est # use estimated P in research data if no P was provided
else P <- Supplied.P
if ((P * N) < 20) P <- 20 / N # ensure at least 20 taxon cases in the generated population
if ((N - (P * N)) < 20) P <- (N - 20) / N # ensure at least 20 complement cases
for (i in 1:I)
Data.Set[, (I + 1)] <- Data.Set[, (I + 1)] + Data.Set[,i]
Data.Set <- Data.Set[sort.list(Data.Set[,(I + 1)]),]
P.Est.Series <- c(rep(2, round(N * P)))
Q.Est.Series <- c(rep(1, N - length(P.Est.Series)))
Data.Set[,(I + 1)] <- c(Q.Est.Series, P.Est.Series)
}
Tax.Data <- Data.Set[(Data.Set[,(I + 1)] == 2), 1:I]
Com.Data <- Data.Set[(Data.Set[,(I + 1)] == 1), 1:I]
P.Data <- dim(Tax.Data)[1] / (dim(Tax.Data)[1] + dim(Com.Data)[1])
cat("\nGenerating and analyzing comparison data\n")
Sim.Tax <- GenData(Tax.Data, round(N.Pop * P.Data))
Sim.Com <- GenData(Com.Data, (N.Pop - round(N.Pop * P.Data)))
Sim.Pop <- rbind(Sim.Tax, Sim.Com)
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))]
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I)))
cat(" Categorical population generated, RMSR r =",round(RMSR,3),"\n")
for (Sims in 1:N.Samples)
{
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),]
Tax.Slopes[Sims,] <- MAXSLOPE.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop,
N.Samples, N, Supplied.P, Supplied.Class)
}
cat(" Analysis of",N.Samples,"samples of categorical comparison data completed\n")
# generate and analyze dimensional samples
set.seed(Seed)
Sim.Pop <- GenData(Data.Set[, 1:I], N.Pop)
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))]
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I)))
cat("\n Dimensional population generated, RMSR r =",round(RMSR,3),"\n")
for (Sims in 1:N.Samples)
{
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),]
Dim.Slopes[Sims,] <- MAXSLOPE.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop,
N.Samples, N, Supplied.P, Supplied.Class)
}
cat(" Analysis of",N.Samples,"samples of dimensional comparison data completed\n")
# 2-panel plot for comparison data
Tax.Min <- Tax.Max <- Tax.Q1 <- Tax.Q2 <- Tax.Q3 <- Dim.Min <- Dim.Max <- Dim.Q1 <- Dim.Q2 <- Dim.Q3 <- rep(0, length(Xs))
for (i in 1:length(Xs))
{
Tax.Min[i] <- min(Tax.Slopes[(Tax.Slopes[,i] != -Inf),i])
Tax.Max[i] <- max(Tax.Slopes[(Tax.Slopes[,i] != Inf),i])
Tax.Q1[i] <- sort(Tax.Slopes[,i])[N.Samples * .25]
Tax.Q2[i] <- sort(Tax.Slopes[,i])[N.Samples * .50]
Tax.Q3[i] <- sort(Tax.Slopes[,i])[N.Samples * .75]
Dim.Min[i] <- min(Dim.Slopes[(Dim.Slopes[,i] != -Inf),i])
Dim.Max[i] <- max(Dim.Slopes[(Dim.Slopes[,i] != Inf),i])
Dim.Q1[i] <- sort(Dim.Slopes[,i])[N.Samples * .25]
Dim.Q2[i] <- sort(Dim.Slopes[,i])[N.Samples * .50]
Dim.Q3[i] <- sort(Dim.Slopes[,i])[N.Samples * .75]
}
Min.X <- min(Xs)
Max.X <- max(Xs)
Min.Y <- min(c(Slopes, Tax.Min[Tax.Min != -Inf], Dim.Min[Dim.Min != -Inf]), na.rm = T)
Max.Y <- max(c(Slopes, Tax.Max[Tax.Max != Inf], Dim.Max[Dim.Max != Inf]), na.rm = T)
dev.new(width=7, height=3.5)
par(mfrow=c(1,2), omi=c(.25,.25,.25,.25), cex=.75)
plot(x = Xs, y = Slopes, type = "o", pch = 16, lwd = 2, xlim = c(Min.X, Max.X),
ylim = c(Min.Y, Max.Y), xlab = "Indicator Score", ylab = "LOWESS Slope", main = "Categorical Comparison Data")
for (i in 1:(length(Xs)-1))
polygon(x = c(Xs[i], Xs[i], Xs[i+1], Xs[i+1]),
y = c(Tax.Q1[i], Tax.Q3[i], Tax.Q3[i+1], Tax.Q1[i+1]), col = 8, lty = 0)
lines(Xs, Tax.Min)
lines(Xs, Tax.Max)
points(Xs, Slopes, type = "o", pch = 16)
lines(Xs, Slopes, lwd = 2)
plot(x = Xs, y = Slopes, type = "o", pch = 16, lwd = 2, xlim = c(Min.X, Max.X),
ylim = c(Min.Y, Max.Y), xlab = "Indicator Score", ylab = "LOWESS Slope", main = "Dimensional Comparison Data")
for (i in 1:(length(Xs)-1))
polygon(x = c(Xs[i], Xs[i], Xs[i+1], Xs[i+1]),
y = c(Dim.Q1[i], Dim.Q3[i], Dim.Q3[i+1], Dim.Q1[i+1]), col = 8, lty = 0)
lines(Xs, Dim.Min)
lines(Xs, Dim.Max)
points(Xs, Slopes, type = "o", pch = 16)
lines(Xs, Slopes, lwd = 2)
# calculate and print comparison curve fit index (CCFI)
Fit.RMSR.t <- sqrt(sum((Slopes - Tax.Q2)^2)/(N - 1))
Fit.RMSR.d <- sqrt(sum((Slopes - Dim.Q2)^2)/(N - 1))
CCFI <- Fit.RMSR.d / (Fit.RMSR.d + Fit.RMSR.t)
cat("\n\nComparison curve fit index (CCFI) =",round(Fit.RMSR.d,4),"/ (",round(Fit.RMSR.d,4),
"+",round(Fit.RMSR.t,4),") =",round(CCFI,3),"\n")
cat(" Note: CCFI values can range from 0 (dimensional) to 1 (categorical). The more a CCFI\n")
cat(" value deviates from .50, the stronger the result. When .40 < CCFI < .60,\n")
cat(" this should be interpreted with caution.\n\n")
}
cat("\n")
if (File.Output) sink()
}
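###########################################################################################################
#
# Illustrative call (hypothetical data, not part of the original file). MAXSLOPE uses exactly two
# indicators, regressing each on the other with LOWESS and averaging the two slope curves:
#
#   dat <- matrix(rnorm(300 * 2), ncol = 2)   # N = 300 cases, 2 indicators
#   MAXSLOPE(dat, N.Samples = 100, Seed = 1)
#
###########################################################################################################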
###########################################################################################################
#
# Main program -- calculate, plot, and summarize results -- no subsidiary programs are called
#
###########################################################################################################
MAXSLOPE.Main <- function(Data, Research.Data, Comp.Data, N.Pop, N.Samples, N, Supplied.P, Supplied.Class)
{
# perform two MAXSLOPE analyses
x1 <- y2 <- Data[,1]
x2 <- y1 <- Data[,2]
low1 <- lowess(y1 ~ x1)
low2 <- lowess(y2 ~ x2)
slope1 <- (low1$y[2:N] - low1$y[1:(N - 1)]) /
(low1$x[2:N] - low1$x[1:(N - 1)])
slope2 <- (low2$y[2:N] - low2$y[1:(N - 1)]) /
(low2$x[2:N] - low2$x[1:(N - 1)])
# replace missing slopes with nearest neighbor
while(sum(is.nan(slope1)) > 0)
{
for (i in 2:(N - 1))
if (is.nan(slope1[i]) & !is.nan(slope1[i - 1])) slope1[i] <- slope1[i - 1]
for (i in 1:(N - 2))
if (is.nan(slope1[i]) & !is.nan(slope1[i + 1])) slope1[i] <- slope1[i + 1]
}
while(sum(is.nan(slope2)) > 0)
{
for (i in 2:(N - 1))
if (is.nan(slope2[i]) & !is.nan(slope2[i - 1])) slope2[i] <- slope2[i - 1]
for (i in 1:(N - 2))
if (is.nan(slope2[i]) & !is.nan(slope2[i + 1])) slope2[i] <- slope2[i + 1]
}
# average the curves, estimate the base rate, and return results
Y <- (slope1 + slope2) / 2
X <- (low1$x[1:(N - 1)] + low2$x[1:(N - 1)]) / 2
my <- max(Y)
my.x <- max(X[Y == my])
my.n <- sum(X >= my.x) - .5 * (sum(Y == my) - 1)
P.Est <- my.n / N
if (Research.Data)
{
# create and label new graph sheet
dev.new(width=3.7, height=3.5)
par(omi=c(.25,.25,.25,.25), cex = .75)
# plot results as slopes by x values
X.Range <- c(min(X), max(X))
Y.Range <- c(min(0, min(Y)), max(((max(Y) / sd(Y))
* (mean(Y) / 1.5)), max(Y) * 1.10))
plot(X, Y, type = "o", pch = 16, xlim = X.Range, ylim = Y.Range,
main = "MAXSLOPE Curve", xlab = "Indicator Score", ylab = "LOWESS Slope")
# report relevant analytic information and program specifications
cat("\n\nSUMMARY OF MAXSLOPE ANALYTIC SPECIFICATIONS\n")
cat("\nSample size: ",N,"\n")
if ((Supplied.P == 0) & (!Supplied.Class))
cat("Classification of cases: Cut total score at estimated base rate\n")
if (Supplied.P > 0) cat("Classification of cases: Cut total score at supplied base rate\n")
if (Supplied.Class) cat("Classification of cases: Supplied with data\n")
if (Comp.Data)
{
cat("Size of finite populations of comparison data: ",N.Pop,"\n")
cat("Number of samples of comparison data drawn from each population: ",N.Samples,"\n")
}
cat("\n")
# report estimated taxon base rate
cat("\n\nSUMMARY OF MAXSLOPE PARAMETER ESTIMATES\n")
cat("\nEstimated taxon base rate =", round(P.Est, 3), "\n\n")
# assign cases to classes using estimated base rate
if ((Supplied.P == 0) & (!Supplied.Class))
{
Data.Matrix <- cbind(Data, 1:N)
Data.Matrix[,3] <- apply(Data.Matrix[,1:2], 1, sum)
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,3]),]
P.Est.Series <- c(rep(2, round(N * P.Est)))
Q.Est.Series <- c(rep(1, N - length(P.Est.Series)))
Data.Matrix[,3] <- c(Q.Est.Series, P.Est.Series)
}
# assign cases to classes using supplied base rate
if ((Supplied.P > 0) & (!Supplied.Class))
{
Data.Matrix <- cbind(Data, 1:N)
Data.Matrix[,3] <- apply(Data.Matrix[,1:2], 1, sum)
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,3]),]
P.Est.Series <- c(rep(2, round(N * Supplied.P)))
Q.Est.Series <- c(rep(1, N - length(P.Est.Series)))
Data.Matrix[,3] <- c(Q.Est.Series, P.Est.Series)
}
# assign cases to classes using supplied classification
if (Supplied.Class)
Data.Matrix <- Data
# Construct full-sample and within-groups data files
Data.f <- as.matrix(Data.Matrix[,1:2])
Nt <- sum(Data.Matrix[,3]) - N
Nc <- N - Nt
Data.t <- Data.Matrix[(Data.Matrix[,3] == 2),1:2]
Data.c <- Data.Matrix[(Data.Matrix[,3] == 1),1:2]
for (i in 1:2)
{
if (min(Data.t[,i]) == max(Data.t[,i])) Data.t[,i] <- Data.t[,i] + rnorm(Nt, 0, .0001)
if (min(Data.c[,i]) == max(Data.c[,i])) Data.c[,i] <- Data.c[,i] + rnorm(Nc, 0, .0001)
}
# Calculate distributions for each indicator (full sample, taxon, complement)
Dist.Full <- Dist.Tax <- Dist.Comp <- matrix(nrow = 4, ncol = 4)
Rows <- c(paste("Ind",1:2), "M", "SD")
Cols <- c("M","SD","Skew","Kurtosis")
dimnames(Dist.Full) <- list(Rows, Cols)
dimnames(Dist.Tax) <- list(Rows, Cols)
dimnames(Dist.Comp) <- list(Rows, Cols)
for (i in 1:2)
{
Dist.Full[i,1] <- mean(Data.f[,i])
Dist.Tax[i,1] <- mean(Data.t[,i])
Dist.Comp[i,1] <- mean(Data.c[,i])
Dist.Full[i,2] <- sd(Data.f[,i])
Dist.Tax[i,2] <- sd(Data.t[,i])
Dist.Comp[i,2] <- sd(Data.c[,i])
Dist.Full[i,3] <- Skew(Data.f[,i])
Dist.Tax[i,3] <- Skew(Data.t[,i])
Dist.Comp[i,3] <- Skew(Data.c[,i])
Dist.Full[i,4] <- Kurtosis(Data.f[,i])
Dist.Tax[i,4] <- Kurtosis(Data.t[,i])
Dist.Comp[i,4] <- Kurtosis(Data.c[,i])
}
for (i in 1:4)
{
Dist.Full[3,i] <- mean(Dist.Full[1:2,i])
Dist.Tax[3,i] <- mean(Dist.Tax[1:2,i])
Dist.Comp[3,i] <- mean(Dist.Comp[1:2,i])
Dist.Full[4,i] <- sd(Dist.Full[1:2,i])
Dist.Tax[4,i] <- sd(Dist.Tax[1:2,i])
Dist.Comp[4,i] <- sd(Dist.Comp[1:2,i])
}
cat("\nIndicator distributions in the full sample ( N =",N,"):\n\n")
print(round(Dist.Full,3))
cat("\nIndicator distributions in the taxon ( n =",Nt,"):\n\n")
print(round(Dist.Tax,3))
cat("\nIndicator distributions in the complement ( n =",Nc,"):\n\n")
print(round(Dist.Comp,3))
# calculate validity for each indicator
Ind.Val <- matrix(nrow = 4, ncol = 2)
Rows <- c(paste("Ind",1:2), "M", "SD")
Cols <- c("Raw Units","Cohen's d")
dimnames(Ind.Val) <- list(Rows, Cols)
for (i in 1:2)
{
Ind.Val[i,1] <- Dist.Tax[i,1] - Dist.Comp[i,1]
Ind.Val[i,2] <- Ind.Val[i,1] / sqrt(((Nt - 1) * Dist.Tax[i,2]^2 + (Nc - 1) * Dist.Comp[i,2]^2)
/ (Nt + Nc - 2))
}
Ind.Val[3,1] <- mean(Ind.Val[1:2,1])
Ind.Val[3,2] <- mean(Ind.Val[1:2,2])
Ind.Val[4,1] <- sd(Ind.Val[1:2,1])
Ind.Val[4,2] <- sd(Ind.Val[1:2,2])
cat("\nBetween-group validity:\n\n")
print(round(Ind.Val,3))
# Calculate, print, and summarize full-sample and within-group correlations
cat("\nIndicator correlations:\n\n")
cat("Full Sample r =",round(cor(Data.f)[1,2],3),"\n")
cat(" Taxon r =",round(cor(Data.t)[1,2],3),"\n")
cat(" Complement r =",round(cor(Data.c)[1,2],3),"\n\n")
}
# if research data, return x values, slopes, and base rate estimate;
# if comparison data, return only slopes
if (Research.Data) return(list(X, Y, P.Est))
else return(Y)
}
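# Worked illustration of the base rate estimate above (a sketch with made-up numbers,
# not part of the original program): with 100 curve points and a unique peak at the
# 80th sorted score, 21 points lie at or above the peak, so P.Est = 21/100 = .21.
if (FALSE)
{
    X <- 1:100                                # hypothetical sorted input scores
    Y <- dnorm(X, mean = 80, sd = 10)         # hypothetical slope curve peaking at X = 80
    my <- max(Y)
    my.x <- max(X[Y == my])
    (sum(X >= my.x) - .5 * (sum(Y == my) - 1)) / length(X)   # = .21
}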
###########################################################################################################
#
# L-Mode
#
# Written by John Ruscio
# Last modified January 9, 2012
#
###########################################################################################################
#
# Preliminary program to read data, call for comparison data if/as requested, submit each data set to
# main program along with an identifying label.
#
###########################################################################################################
LMode <- function(Data.Set, Comp.Data = T, N.Pop = 100000, N.Samples = 100, Supplied.Class = F,
Supplied.P = 0, Mode.L = -.001, Mode.R = .001, St.Ind = T, Classify = 1, Save.Class = F,
File.Output = F, File.Name = "Output.txt", Seed = 1)
{
Data.Set <- Remove.Missing(Data.Set) # remove any missing data (listwise deletion)
N <- dim(Data.Set)[1] # determine sample size
I <- dim(Data.Set)[2] # determine number of indicators; adjust if one column contains class assignments
if (Supplied.Class) I <- I - 1
if (St.Ind) Data.Set[,1:I] <- apply(Data.Set[,1:I], 2, scale) # standardize data if requested
set.seed(Seed) # set random number seed to allow exact replications of results
if ((Comp.Data) & (N.Samples == 1)) N.Samples <- 100 # do not allow a single sample of comparison data
# if output to file is requested, redirect all text output
if (File.Output) sink(File.Name)
Research.Results <- LMode.Main(Data.Set, Research.Data = T, Comp.Data, N.Pop, N.Samples, N, I,
Mode.L, Mode.R, Classify, Save.Class, Supplied.P, Supplied.Class)
Scores <- Research.Results[[1]]
P.Est <- Research.Results[[2]]
Class.Assign <- Research.Results[[3]]
# if comparison data were requested, then generate and submit data for analysis
if (Comp.Data)
{
# create matrices to store results for comparison
Tax.Scores <- Dim.Scores <- matrix(0, nrow = N.Samples, ncol = N)
# generate and analyze categorical data
set.seed(Seed)
if (!Supplied.Class) # if class assignments were not provided, generate using base rate
{
Data.Set <- cbind(Data.Set, rep(0, N))
if (Supplied.P == 0) P <- P.Est # use estimated P in research data if no P was provided
else P <- Supplied.P
if ((P * N) < 20) P <- 20 / N               # ensure at least 20 taxon members in the comparison population
if ((N - (P * N)) < 20) P <- (N - 20) / N   # ensure at least 20 complement members
for (i in 1:I)
Data.Set[, (I + 1)] <- Data.Set[, (I + 1)] + Data.Set[,i]
Data.Set <- Data.Set[sort.list(Data.Set[,(I + 1)]),]
P.Est.Series <- c(rep(2, round(N * P)))
Q.Est.Series <- c(rep(1, N - length(P.Est.Series)))
Data.Set[,(I + 1)] <- c(Q.Est.Series, P.Est.Series)
}
Tax.Data <- Data.Set[(Data.Set[,(I + 1)] == 2), 1:I]
Com.Data <- Data.Set[(Data.Set[,(I + 1)] == 1), 1:I]
P.Data <- dim(Tax.Data)[1] / (dim(Tax.Data)[1] + dim(Com.Data)[1])
cat("\nGenerating and analyzing comparison data\n")
Sim.Tax <- GenData(Tax.Data, round(N.Pop * P.Data))
Sim.Com <- GenData(Com.Data, (N.Pop - round(N.Pop * P.Data)))
Sim.Pop <- rbind(Sim.Tax, Sim.Com)
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))]
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I)))
cat(" Categorical population generated, RMSR r =",round(RMSR,3),"\n")
for (Sims in 1:N.Samples)
{
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),]
Tax.Scores[Sims,] <- LMode.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop,
                                N.Samples, N, I, Mode.L, Mode.R, Classify, Save.Class,
                                Supplied.P, Supplied.Class)
}
cat(" Analysis of",N.Samples,"samples of categorical comparison data completed\n")
# generate and analyze dimensional samples
set.seed(Seed)
Sim.Pop <- GenData(Data.Set[, 1:I], N.Pop)
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))]
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I)))
cat("\n Dimensional population generated, RMSR r =",round(RMSR,3),"\n")
for (Sims in 1:N.Samples)
{
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),]
Dim.Scores[Sims,] <- LMode.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop,
                                N.Samples, N, I, Mode.L, Mode.R, Classify, Save.Class,
                                Supplied.P, Supplied.Class)
}
cat(" Analysis of",N.Samples,"samples of dimensional comparison data completed\n")
Res.Density <- density(Scores)
Tax.Density <- Dim.Density <- matrix(0, nrow = N.Samples, ncol = 512)
for (i in 1:N.Samples)
{
Tax.Density[i, ] <- density(Tax.Scores[i,], from = min(Res.Density$x), to = max(Res.Density$x))$y
Dim.Density[i, ] <- density(Dim.Scores[i,], from = min(Res.Density$x), to = max(Res.Density$x))$y
}
Tax.Density.M <- apply(Tax.Density, 2, mean)
Dim.Density.M <- apply(Dim.Density, 2, mean)
# 2-panel plot for comparison data
Tax.Sum <- Dim.Sum <- matrix(0, nrow = 6, ncol = 512)
for (i in 1:512)
{
Tax.Sum[,i] <- summary(Tax.Density[,i])
Dim.Sum[,i] <- summary(Dim.Density[,i])
}
dev.new(width=7, height=3.5)
par(mfrow=c(1,2), omi=c(.25,.25,.25,.25), cex=.75)
Max.Y <- max(c(Res.Density$y, Tax.Sum[6,], Dim.Sum[6,]))
Min.X <- min(Res.Density$x)
Max.X <- max(Res.Density$x)
plot(Res.Density, type = "l", xlim = c(Min.X, Max.X), ylim = c(0, Max.Y), xlab = "Factor Scores",
ylab = "Density", main = "Categorical Comparison Data", lwd = 2)
for (i in 1:511)
polygon(x = c(Res.Density$x[i], Res.Density$x[i], Res.Density$x[i+1], Res.Density$x[i+1]),
y = c(Tax.Sum[2,i], Tax.Sum[5,i], Tax.Sum[5,i+1], Tax.Sum[2,i+1]), col = 8, lty = 0)
lines(Res.Density$x, Tax.Sum[1,])
lines(Res.Density$x, Tax.Sum[6,])
lines(Res.Density, lwd = 2)
plot(Res.Density, type = "l", xlim = c(Min.X, Max.X), ylim = c(0, Max.Y), xlab = "Factor Scores",
ylab = "Density", main = "Dimensional Comparison Data", lwd = 2)
for (i in 1:511)
polygon(x = c(Res.Density$x[i], Res.Density$x[i], Res.Density$x[i+1], Res.Density$x[i+1]),
y = c(Dim.Sum[2,i], Dim.Sum[5,i], Dim.Sum[5,i+1], Dim.Sum[2,i+1]), col = 8, lty = 0)
lines(Res.Density$x, Dim.Sum[1,])
lines(Res.Density$x, Dim.Sum[6,])
lines(Res.Density, lwd = 2)
# calculate and print comparison curve fit index (CCFI)
data <- list(Res.Density, Dim.Density.M)
Fit.RMSR.d <- optimize(f = Fit.Densities, interval = c(-1,1), data)$objective
data <- list(Res.Density, Tax.Density.M)
Fit.RMSR.t <- optimize(f = Fit.Densities, interval = c(-1,1), data)$objective
CCFI <- Fit.RMSR.d / (Fit.RMSR.d + Fit.RMSR.t)
cat("\n\nComparison curve fit index (CCFI) =",round(Fit.RMSR.d,4),"/ (",round(Fit.RMSR.d,4),
"+",round(Fit.RMSR.t,4),") =",round(CCFI,3),"\n")
cat(" Note: CCFI values can range from 0 (dimensional) to 1 (categorical). The more a CCFI\n")
cat(" value deviates from .50, the stronger the result. When .40 < CCFI < .60,\n")
cat(" this should be interpreted with caution.\n\n")
}
cat("\n")
if (File.Output) sink()
if (Save.Class) return(Class.Assign)
}
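# Example usage (a sketch, not part of the original program; assumes this entire file
# has been sourced; CreateData, defined below, is used only to simulate illustrative data):
if (FALSE)
{
    Sim <- CreateData("Tax", N = 600, k = 4, P = .50, d = 2.00)   # indicators in columns 1:4
    LMode(Sim[, 1:4], Comp.Data = TRUE, N.Samples = 100)
}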
###########################################################################################################
#
# Main program -- calculate, plot, and summarize results -- no subsidiary programs are called
#
###########################################################################################################
LMode.Main <- function(Data, Research.Data, Comp.Data, N.Pop, N.Samples, N, I, Mode.L, Mode.R, Classify,
Save.Class, Supplied.P, Supplied.Class)
{
# load R code for running medians smoother
if ((version$major == 1) & (version$minor < 9)) library(eda)
else library(stats) # load R code for running medians smoother
# make room in data file for factor scores (col = I + 1) and class assignments (col = I + 2)
Data.Matrix <- cbind(Data[,1:I], rep(0, N), rep(0, N))
# submit all indicators to factor analysis and store factor scores
Factor.Results <- Factor.Analysis.LMode(Data.Matrix[,1:I])
Data.Matrix[,(I + 1)] <- Factor.Results$scores
Neg.Loadings <- (mean(Factor.Results$loadings) < 0)
if (Neg.Loadings) Data.Matrix[,(I + 1)] <- Data.Matrix[,(I + 1)] * -1
Score.Density <- density(Data.Matrix[,(I + 1)])
if (Research.Data)
{
# compute location of estimated latent modes
Max.L <- max(smooth(Score.Density$y)[Score.Density$x <= Mode.L])
Loc.L <- (1: length(Score.Density$x))[smooth(Score.Density$y) == Max.L]
if (length(Loc.L) > 1) Loc.L <- Loc.L[1]
Max.R <- max(smooth(Score.Density$y)[Score.Density$x >= Mode.R])
Loc.R <- (1: length(Score.Density$x))[smooth(Score.Density$y) == Max.R]
if (length(Loc.R) > 1) Loc.R <- Loc.R[1]
# create and label new graph sheet
dev.new(width=3.7, height=3.5)
par(omi=c(.25,.25,.25,.25), cex = .75)
# plot results as frequency distribution of factor scores, with vertical lines at estimated latent modes
Max.Y <- max(smooth(Score.Density$y))
Min.X <- min(smooth(Score.Density$x))
Max.X <- max(smooth(Score.Density$x))
plot(Score.Density, type = "l", xlim = c(Min.X, Max.X), ylim = c(0, Max.Y),
main = "L-Mode Curve", xlab = "Factor Scores", ylab = "Density")
abline(v = Score.Density$x[Loc.L])
abline(v = Score.Density$x[Loc.R])
# report relevant analytic information and program specifications
cat("\n\nSUMMARY OF L-MODE ANALYTIC SPECIFICATIONS\n")
cat("\nSample size: ",N,"\n")
cat("Number of indicator variables: ",I,"\n")
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 1))
cat("Classification of cases: Cut total score at estimated base rate\n")
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 2))
cat("Classification of cases: Nearest mode\n")
if (Supplied.P > 0) cat("Classification of cases: Cut total score at supplied base rate\n")
if (Supplied.Class) cat("Classification of cases: Supplied with data\n")
if (Comp.Data)
{
cat("Size of finite populations of comparison data: ",N.Pop,"\n")
cat("Number of samples of comparison data drawn from each population: ",N.Samples,"\n")
}
cat("\n")
# estimate (and report) taxon base rate from location of latent modes
cat("\n\nSUMMARY OF L-MODE PARAMETER ESTIMATES\n")
P.R <- 1 / (1 + Score.Density$x[Loc.R] ^ 2)
P.L <- 1 - (1 / (1 + Score.Density$x[Loc.L] ^ 2))
P.Est <- mean(c(P.L, P.R))
cat("\nSummary of taxon base rate estimates:\n")
cat(" Based on location of left mode: ",round(P.L,3),"\n")
cat(" Based on location of right mode: ",round(P.R,3),"\n")
cat(" M =",round(P.Est,3),"\n\n")
# estimate (and report) latent class M on each indicator (using factor loadings)
Ms <- apply(Data.Matrix[,1:I], 2, mean)
SDs <- apply(Data.Matrix[,1:I], 2, sd)
Q.Est <- 1 - P.Est
M.SD <- matrix(nrow = I, ncol = 2)
M.SD[,1] <- Ms + SDs * (Q.Est / sqrt(P.Est * Q.Est) * as.vector(Factor.Results$loadings))
M.SD[,2] <- Ms + SDs * (-P.Est / sqrt(P.Est * Q.Est) * as.vector(Factor.Results$loadings))
Row.Labels <- paste("Indicator",1:I)
Col.Labels <- c("Taxon","Complement")
dimnames(M.SD) <- list(Row.Labels, Col.Labels)
cat("\nEstimated latent group M on each indicator (via factor loadings):\n\n")
print(round(M.SD,3))
cat("\n")
# assign cases to classes using estimated base rate
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 1))
{
Data.Matrix <- cbind(Data.Matrix, 1:N)
Data.Matrix[,(I + 2)] <- apply(Data.Matrix[,1:I], 1, sum)
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 2)]),]
P.Est.Series <- c(rep(2, round(N * P.Est)))
Q.Est.Series <- c(rep(1, N - length(P.Est.Series)))
Data.Matrix[,(I + 2)] <- c(Q.Est.Series, P.Est.Series)
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 3)]),]
Data.Matrix <- Data.Matrix[,1:(I + 2)]
}
# assign cases to classes and print base rate estimate from empirical classification
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 2))
{
Mean.Diff.C <- abs(Data.Matrix[,(I + 1)] - Score.Density$x[Loc.L])
Mean.Diff.T <- abs(Data.Matrix[,(I + 1)] - Score.Density$x[Loc.R])
for (i in 1:N)
if (Mean.Diff.T[i] <= Mean.Diff.C[i]) Data.Matrix[i,(I + 2)] <- 2
else Data.Matrix[i,(I + 2)] <- 1
cat("\nBase rate estimated from classification of cases:\n")
cat(" P =",round((mean(Data.Matrix[,(I + 2)]) - 1),3),"\n\n")
}
# assign cases to classes using supplied base rate
if (Supplied.P > 0)
{
Data.Matrix <- cbind(Data.Matrix, 1:N)
Data.Matrix[,(I + 2)] <- apply(Data.Matrix[,1:I], 1, sum)
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 2)]),]
P.Est.Series <- c(rep(2, round(N * Supplied.P)))
Q.Est.Series <- c(rep(1, N - length(P.Est.Series)))
Data.Matrix[,(I + 2)] <- c(Q.Est.Series, P.Est.Series)
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 3)]),]
Data.Matrix <- Data.Matrix[,1:(I + 2)]
}
# assign cases to classes using supplied classification
if (Supplied.Class)
Data.Matrix <- cbind(Data[,1:I],Data.Matrix[,(I + 1)],Data[,(I + 1)])
# Construct full-sample and within-groups data files
Data.f <- as.matrix(Data.Matrix[,(1:I)])
Nt <- sum(Data.Matrix[,(I + 2)]) - N
Nc <- N - Nt
Data.t <- Data.Matrix[(Data.Matrix[,(I + 2)] == 2),1:I]
Data.c <- Data.Matrix[(Data.Matrix[,(I + 2)] == 1),1:I]
for (i in 1:I)
{
if (min(Data.t[,i]) == max(Data.t[,i])) Data.t[,i] <- Data.t[,i] + rnorm(Nt, 0, .0001)
if (min(Data.c[,i]) == max(Data.c[,i])) Data.c[,i] <- Data.c[,i] + rnorm(Nc, 0, .0001)
}
# Calculate distributions for each indicator (full sample, taxon, complement)
Dist.Full <- matrix(nrow = I + 2, ncol = 4)
Dist.Tax <- matrix(nrow = I + 2, ncol = 4)
Dist.Comp <- matrix(nrow = I + 2, ncol = 4)
Rows <- c(paste("Ind",1:I), "M", "SD")
Cols <- c("M","SD","Skew","Kurtosis")
dimnames(Dist.Full) <- list(Rows, Cols)
dimnames(Dist.Tax) <- list(Rows, Cols)
dimnames(Dist.Comp) <- list(Rows, Cols)
for (i in 1:I)
{
Dist.Full[i,1] <- mean(Data.f[,i])
Dist.Tax[i,1] <- mean(Data.t[,i])
Dist.Comp[i,1] <- mean(Data.c[,i])
Dist.Full[i,2] <- sd(Data.f[,i])
Dist.Tax[i,2] <- sd(Data.t[,i])
Dist.Comp[i,2] <- sd(Data.c[,i])
Dist.Full[i,3] <- Skew(Data.f[,i])
Dist.Tax[i,3] <- Skew(Data.t[,i])
Dist.Comp[i,3] <- Skew(Data.c[,i])
Dist.Full[i,4] <- Kurtosis(Data.f[,i])
Dist.Tax[i,4] <- Kurtosis(Data.t[,i])
Dist.Comp[i,4] <- Kurtosis(Data.c[,i])
}
for (i in 1:4)
{
Dist.Full[I + 1,i] <- mean(Dist.Full[1:I,i])
Dist.Tax[I + 1,i] <- mean(Dist.Tax[1:I,i])
Dist.Comp[I + 1,i] <- mean(Dist.Comp[1:I,i])
Dist.Full[I + 2,i] <- sd(Dist.Full[1:I,i])
Dist.Tax[I + 2,i] <- sd(Dist.Tax[1:I,i])
Dist.Comp[I + 2,i] <- sd(Dist.Comp[1:I,i])
}
cat("\nIndicator distributions in the full sample ( N =",N,"):\n\n")
print(round(Dist.Full,3))
cat("\nIndicator distributions in the taxon ( n =",Nt,"):\n\n")
print(round(Dist.Tax,3))
cat("\nIndicator distributions in the complement ( n =",Nc,"):\n\n")
print(round(Dist.Comp,3))
# calculate validity for each indicator
Ind.Val <- matrix(nrow = I + 2, ncol = 2)
Rows <- c(paste("Ind",1:I), "M", "SD")
Cols <- c("Raw Units","Cohen's d")
dimnames(Ind.Val) <- list(Rows, Cols)
for (i in 1:I)
{
Ind.Val[i,1] <- Dist.Tax[i,1] - Dist.Comp[i,1]
Ind.Val[i,2] <- Ind.Val[i,1] / sqrt(((Nt - 1) * Dist.Tax[i,2]^2 + (Nc - 1) * Dist.Comp[i,2]^2)
/ (Nt + Nc - 2))
}
Ind.Val[I + 1,1] <- mean(Ind.Val[1:I,1])
Ind.Val[I + 1,2] <- mean(Ind.Val[1:I,2])
Ind.Val[I + 2,1] <- sd(Ind.Val[1:I,1])
Ind.Val[I + 2,2] <- sd(Ind.Val[1:I,2])
cat("\nBetween-group validity:\n\n")
print(round(Ind.Val,3))
# Print factor score validity estimate
Factor.Val <- matrix(nrow = 1, ncol = 6)
Factor.Val[1,1] <- mean(Data.Matrix[(Data.Matrix[,(I + 2)] == 2),(I + 1)])
Factor.Val[1,2] <- sd(Data.Matrix[(Data.Matrix[,(I + 2)] == 2),(I + 1)])
Factor.Val[1,3] <- mean(Data.Matrix[(Data.Matrix[,(I + 2)] == 1),(I + 1)])
Factor.Val[1,4] <- sd(Data.Matrix[(Data.Matrix[,(I + 2)] == 1),(I + 1)])
Factor.Val[1,5] <- Factor.Val[1,1] - Factor.Val[1,3]
Factor.Val[1,6] <- Factor.Val[1,5] / sqrt(((Nt - 1) * Factor.Val[1,2]^2 + (Nc - 1) * Factor.Val[1,4]^2)
/ (Nt + Nc - 2))
Row.Labels <- "Factor Scores"
Col.Labels <- c("Taxon M", "Taxon SD", "Comp M", "Comp SD", "Raw Units", "Cohen's d")
dimnames(Factor.Val) <- list(Row.Labels, Col.Labels)
cat("\nBetween-group validity on factor scores:\n\n")
print(round(Factor.Val[,5:6],3))
# Calculate, print, and summarize full-sample and within-group correlations
Cor.Matrix <- cor(Data.f[,1:I])
Rows <- paste("Ind",1:I)
Cols <- paste("Ind",1:I)
dimnames(Cor.Matrix) <- list(Rows, Cols)
cat("\nIndicator correlations:\n\nFull Sample ( N =",N,"):\n\n")
print(round(Cor.Matrix,3))
Cor.Taxon <- cor(Data.t)
Cor.Comp <- cor(Data.c)
dimnames(Cor.Taxon) <- list(Rows, Cols)
dimnames(Cor.Comp) <- list(Rows, Cols)
cat("\nTaxon ( n =",Nt,"):\n\n")
print(round(Cor.Taxon,3))
cat("\nComplement ( n =",Nc,"):\n\n")
print(round(Cor.Comp,3))
Cor.Summary <- matrix(nrow = 3, ncol = 2)
dimnames(Cor.Summary) <- list(c("Full Sample","Taxon", "Complement"), c("M", "SD"))
Cor.Summary[1,1] <- mean(Cor.Matrix[lower.tri(Cor.Matrix)])
Cor.Summary[2,1] <- mean(Cor.Taxon[lower.tri(Cor.Taxon)])
Cor.Summary[3,1] <- mean(Cor.Comp[lower.tri(Cor.Comp)])
Cor.Summary[1,2] <- sd(Cor.Matrix[lower.tri(Cor.Matrix)])
Cor.Summary[2,2] <- sd(Cor.Taxon[lower.tri(Cor.Taxon)])
Cor.Summary[3,2] <- sd(Cor.Comp[lower.tri(Cor.Comp)])
cat("\nSummary of indicator correlations:\n\n")
print(round(Cor.Summary,3))
cat("\n")
}
# if research data, return factor scores, base rate estimate, and class assignments;
# if comparison data, return only factor scores
if (Research.Data) return(list(Data.Matrix[,(I + 1)], P.Est, Data.Matrix[,(I + 2)]))
else return(Data.Matrix[,(I + 1)])
}
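# Worked illustration of the mode-location base rate estimates above (a sketch with
# made-up mode locations, not part of the original program):
if (FALSE)
{
    x.L <- -0.5                       # hypothetical left mode location
    x.R <- 1.5                        # hypothetical right mode location
    P.R <- 1 / (1 + x.R ^ 2)          # .308, estimate from right mode
    P.L <- 1 - (1 / (1 + x.L ^ 2))    # .200, estimate from left mode
    mean(c(P.L, P.R))                 # .254, averaged estimate
}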
###########################################################################################################
#
# MAMBAC
#
# Written by John Ruscio
# Last modified January 9, 2012
#
###########################################################################################################
#
# Preliminary program to read and submit research data to analysis and then, if requested, perform analyses
# of comparison data to test fit of averaged curves.
#
###########################################################################################################
MAMBAC <- function(Data.Set, Comp.Data = T, N.Pop = 100000, N.Samples = 100, Supplied.Class = F,
Supplied.P = 0, All.Pairs = T, Ind.Comp = F, N.End = 25, N.Cuts = 50, St.Ind = F,
Replications = 0, Gr.Orient = 1, Gr.Rows = 3, Gr.Cols = 2, Gr.Smooth = F,
Gr.Common.Y = F, Gr.Avg = F, Gr.Ind = F, File.Output = F, File.Name = "Output.txt", Seed = 1)
{
Data.Set <- Remove.Missing(Data.Set) # remove any missing data (listwise deletion)
N <- dim(Data.Set)[1] # determine sample size
I <- dim(Data.Set)[2] # determine number of indicators; adjust if one column contains class assignments
if (Supplied.Class) I <- I - 1 # subtract to allow for class assignment data, if provided
set.seed(Seed) # set random number seed to allow exact replications of results
# automatically fix some potential parameter inconsistencies or omissions
if (Comp.Data) Gr.Avg <- T # need averaged curve to compare to those for comparison data
if ((Comp.Data) & (N.Samples == 1)) N.Samples <- 100 # do not allow a single sample of comparison data
if (Ind.Comp)
{
All.Pairs <- F # cannot use both methods
Gr.Ind <- F # cannot average by input indicator if composites are used
}
if (All.Pairs) Ind.Comp <- F # cannot use both methods
if (Ind.Comp & (I < 3))
{
Ind.Comp <- F # cannot form composites using individual variables
All.Pairs <- T
}
if (Gr.Avg) St.Ind <- T # always standardize indicators if graphs will be averaged
if (St.Ind) Data.Set[,1:I] <- apply(Data.Set[,1:I], 2, scale) # standardize data if requested
# check for tied scores and set number of replications
Ties <- F
for (i in 1:I)
if (length(unique(Data.Set[,i])) < N) Ties <- T
if ((Replications == 0) & (Ties)) Replications <- 10 # default is 10 replications if any tied scores
if (Replications == 0) Replications <- 1
if ((Replications > 1) & (!Ties)) Replications <- 1 # eliminate unnecessary replications
# if output to file is requested, redirect all text output
if (File.Output) sink(File.Name)
# submit data to main program for analysis; taxon base rate estimate will be returned
MAMBAC.Results <- MAMBAC.Main(Data.Set, Research.Data = T, Comp.Data, N.Pop, N.Samples, N, I,
All.Pairs, Ind.Comp, N.End, N.Cuts, Replications, Gr.Orient, Gr.Rows,
Gr.Cols, Gr.Smooth, Gr.Common.Y, Gr.Avg, Gr.Ind, Supplied.P,
Supplied.Class)
P.Est.M <- MAMBAC.Results[[3]]
P.Est.SD <- MAMBAC.Results[[4]]
# if comparison data requested, then generate and submit samples for analysis
if (Comp.Data)
{
# create matrices to store results for comparison data
Tax.Y.Values <- matrix(nrow = N.Samples, ncol = N.Cuts)
Dim.Y.Values <- matrix(nrow = N.Samples, ncol = N.Cuts)
if (Ind.Comp) N.Curves <- I
else N.Curves <- I * (I - 1)
Tax.P.Ests <- matrix(nrow = N.Samples, ncol = N.Curves)
Dim.P.Ests <- matrix(nrow = N.Samples, ncol = N.Curves)
# generate and analyze categorical data
set.seed(Seed)
if (!Supplied.Class) # if class assignments were not provided, generate using base rate
{
Data.Set <- cbind(Data.Set, rep(0, N))
if (Supplied.P == 0) P <- P.Est.M # use estimated P in research data if no P was provided
else P <- Supplied.P
if ((P * N) < 20) P <- 20 / N               # ensure at least 20 taxon members in the comparison population
if ((N - (P * N)) < 20) P <- (N - 20) / N   # ensure at least 20 complement members
for (i in 1:I)
Data.Set[, (I + 1)] <- Data.Set[, (I + 1)] + Data.Set[,i]
Data.Set <- Data.Set[sort.list(Data.Set[,(I + 1)]),]
P.Est.Series <- c(rep(2, round(N * P)))
Q.Est.Series <- c(rep(1, N - length(P.Est.Series)))
Data.Set[,(I + 1)] <- c(Q.Est.Series, P.Est.Series)
}
Tax.Data <- Data.Set[(Data.Set[,(I + 1)] == 2), 1:I]
Com.Data <- Data.Set[(Data.Set[,(I + 1)] == 1), 1:I]
P.Data <- dim(Tax.Data)[1] / (dim(Tax.Data)[1] + dim(Com.Data)[1])
cat("\nGenerating and analyzing comparison data\n")
Sim.Tax <- GenData(Tax.Data, round(N.Pop * P.Data))
Sim.Com <- GenData(Com.Data, (N.Pop - round(N.Pop * P.Data)))
Sim.Pop <- rbind(Sim.Tax, Sim.Com)
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))]
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I)))
cat(" Categorical population generated, RMSR r =",round(RMSR,3),"\n")
for (Sims in 1:N.Samples)
{
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),]
MAMBAC.Tax.Results <- MAMBAC.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop, N.Samples,
N, I, All.Pairs, Ind.Comp, N.End, N.Cuts, Replications,
Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y,
Gr.Avg, Gr.Ind, Supplied.P, Supplied.Class)
Tax.Y.Values[Sims,] <- MAMBAC.Tax.Results[[1]]
Tax.P.Ests[Sims,] <- MAMBAC.Tax.Results[[2]]
}
cat(" Analysis of",N.Samples,"samples of categorical comparison data completed\n")
# generate and analyze dimensional samples
set.seed(Seed)
Sim.Pop <- GenData(Data.Set[, 1:I], N.Pop)
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))]
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I)))
cat("\n Dimensional population generated, RMSR r =",round(RMSR,3),"\n")
for (Sims in 1:N.Samples)
{
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),]
MAMBAC.Dim.Results <- MAMBAC.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop, N.Samples,
N, I, All.Pairs, Ind.Comp, N.End, N.Cuts, Replications,
Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y,
Gr.Avg, Gr.Ind, Supplied.P, Supplied.Class)
Dim.Y.Values[Sims,] <- MAMBAC.Dim.Results[[1]]
Dim.P.Ests[Sims,] <- MAMBAC.Dim.Results[[2]]
}
cat(" Analysis of",N.Samples,"samples of dimensional comparison data completed\n")
# summarize base rate estimates for comparison data
P.Est.Summary <- matrix(nrow = (N.Samples + 2), ncol = 4)
for (Sims in 1:N.Samples)
{
P.Est.Summary[Sims,1] <- mean(Tax.P.Ests[Sims,], na.rm = T)
P.Est.Summary[Sims,2] <- sd(Tax.P.Ests[Sims,], na.rm = T)
P.Est.Summary[Sims,3] <- mean(Dim.P.Ests[Sims,], na.rm = T)
P.Est.Summary[Sims,4] <- sd(Dim.P.Ests[Sims,], na.rm = T)
}
for (i in 1:4)
{
P.Est.Summary[Sims + 1,i] <- mean(P.Est.Summary[1:Sims,i])
P.Est.Summary[Sims + 2,i] <- sd(P.Est.Summary[1:Sims,i])
}
Row.Labels <- c(1:Sims,paste("M for",N.Samples,"samples"),paste("SD for",N.Samples,"samples"))
Col.Labels <- c("Categorical M","SD","Dimensional M","SD")
dimnames(P.Est.Summary) <- list(Row.Labels,Col.Labels)
cat("\n\nBase rate estimates for comparison data:\n\n")
if (N.Samples == 1)
{
cat(" Categorical sample: M =",round(P.Est.Summary[1,1],4),", SD =",
round(P.Est.Summary[1,2],4),"\n")
cat(" Dimensional sample: M =",round(P.Est.Summary[1,3],4),", SD =",
round(P.Est.Summary[1,4],4),"\n")
}
else
print(round(P.Est.Summary[((N.Samples+1):(N.Samples+2)),],4))
# plot sampling distributions of Ms/SDs of base rate estimates
if (N.Samples > 1)
{
dev.new(width=7, height=3.5)
par(mfrow=c(1,2), omi=c(.25,.25,.25,.25), cex=.75)
Y.M.Max <- max(c(density(P.Est.Summary[1:Sims,1])$y,density(P.Est.Summary[1:Sims,3])$y)) * 1.25
Y.SD.Max <- max(c(density(P.Est.Summary[1:Sims,2])$y,density(P.Est.Summary[1:Sims,4])$y)) * 1.25
plot(density(P.Est.Summary[1:Sims,1], adjust = .75), xlim = c(0,1), ylim = c(0, Y.M.Max),
xlab = "Ms of Base Rate Estimates", ylab = "", main = "", lwd = 2, axes = F)
axis(1)
lines(density(P.Est.Summary[1:Sims,3], adjust = .75), lwd = 2, lty = 2)
abline(v = P.Est.M, lwd = 2, lty = 3)
plot(density(P.Est.Summary[1:Sims,2], adjust = .75), xlim = c(0,.5), ylim = c(0, Y.SD.Max),
xlab = "SDs of Base Rate Estimates", ylab = "", main = "", lwd = 2, axes = F)
axis(1)
lines(density(P.Est.Summary[1:Sims,4], adjust = .75), lwd = 2, lty = 2)
abline(v = P.Est.SD, lwd = 2, lty = 3)
mtext("BASE RATE ESTIMATES FOR COMPARISON DATA",outer=T)
}
# plot panel of averaged curves
MAMBAC.Y <- MAMBAC.Results[[2]]
X.Values <- MAMBAC.Results[[1]]
X.Range <- c(min(X.Values), max(X.Values))
Tax.Y <- vector("numeric", length(X.Values))
for (i in 1:length(Tax.Y))
Tax.Y[i] <- mean(Tax.Y.Values[,i])
Dim.Y <- vector("numeric", length(X.Values))
for (i in 1:length(Dim.Y))
Dim.Y[i] <- mean(Dim.Y.Values[,i])
Y.Range <- c(min(MAMBAC.Y,Tax.Y.Values,Dim.Y.Values),max(MAMBAC.Y,Tax.Y.Values,Dim.Y.Values))
# label the x and y axes
X.Label <- paste(N.Cuts,"Cuts")
Y.Label <- "Mean Difference"
# 2-panel plot for comparison data
Tax.Sum <- Dim.Sum <- matrix(0, nrow = 6, ncol = length(X.Values))
for (i in 1:length(X.Values))
{
Tax.Sum[,i] <- summary(Tax.Y.Values[,i])
Dim.Sum[,i] <- summary(Dim.Y.Values[,i])
}
dev.new(width=7, height=3.5)
par(mfrow=c(1,2), omi=c(.25,.25,.25,.25), cex=.75)
plot(X.Values, MAMBAC.Y, type = "o", pch = 16, lwd = 2, ylim = Y.Range,
xlim = X.Range, ylab = Y.Label, xlab = X.Label, main = "Categorical Comparison Data")
for (i in 1:(length(X.Values)-1))
polygon(x = c(X.Values[i], X.Values[i], X.Values[i+1], X.Values[i+1]),
y = c(Tax.Sum[2,i], Tax.Sum[5,i], Tax.Sum[5,i+1], Tax.Sum[2,i+1]), col = 8, lty = 0)
lines(X.Values, Tax.Sum[1,])
lines(X.Values, Tax.Sum[6,])
points(X.Values, MAMBAC.Y, type = "o", pch = 16)
lines(X.Values, MAMBAC.Y, lwd = 2)
plot(X.Values, MAMBAC.Y, type = "o", pch = 16, lwd = 2, ylim = Y.Range,
xlim = X.Range, ylab = Y.Label, xlab = X.Label, main = "Dimensional Comparison Data")
for (i in 1:(length(X.Values)-1))
polygon(x = c(X.Values[i], X.Values[i], X.Values[i+1], X.Values[i+1]),
y = c(Dim.Sum[2,i], Dim.Sum[5,i], Dim.Sum[5,i+1], Dim.Sum[2,i+1]), col = 8, lty = 0)
lines(X.Values, Dim.Sum[1,])
lines(X.Values, Dim.Sum[6,])
points(X.Values, MAMBAC.Y, type = "o", pch = 16)
lines(X.Values, MAMBAC.Y, lwd = 2)
# calculate and print comparison curve fit index (CCFI)
Fit.RMSR.t <- sqrt(sum((Tax.Y - MAMBAC.Y)^2)/N.Cuts)
Fit.RMSR.d <- sqrt(sum((Dim.Y - MAMBAC.Y)^2)/N.Cuts)
CCFI <- Fit.RMSR.d / (Fit.RMSR.d + Fit.RMSR.t)
cat("\n\nComparison curve fit index (CCFI) =",round(Fit.RMSR.d,4),"/ (",round(Fit.RMSR.d,4),
"+",round(Fit.RMSR.t,4),") =",round(CCFI,3),"\n")
cat(" Note: CCFI values can range from 0 (dimensional) to 1 (categorical). The more a CCFI\n")
cat(" value deviates from .50, the stronger the result. When .40 < CCFI < .60,\n")
cat(" this should be interpreted with caution.\n\n")
}
if (File.Output) sink()
}
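# Example usage (a sketch, not part of the original program; assumes this entire file
# has been sourced; CreateData is used only to simulate illustrative data):
if (FALSE)
{
    Sim <- CreateData("Tax", N = 600, k = 4, P = .50, d = 2.00)   # indicators in columns 1:4
    MAMBAC(Sim[, 1:4], Comp.Data = TRUE, N.Samples = 100)
}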
###########################################################################################################
#
# Main program shows the overall structure through calls to four subsidiary programs
#
###########################################################################################################
MAMBAC.Main <- function(Data, Research.Data, Comp.Data, N.Pop, N.Samples, N, I, All.Pairs, Ind.Comp, N.End,
N.Cuts, Replications, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y,
Gr.Avg, Gr.Ind, Supplied.P, Supplied.Class)
{
# set up matrix to store input/output configurations for each curve to be generated
Curve.Info <- MAMBAC.Structure(I, All.Pairs)
# carry out analyses by calling calculation procedure once per curve
Curves <- list(1)
for (Curve.Num in 1:length(Curve.Info[,1]))
Curves[[Curve.Num]] <- MAMBAC.Calculate(Data[,1:I], N, I, Curve.Info, Curve.Num, All.Pairs, Ind.Comp,
N.End, N.Cuts, Replications)
# plot the results (returns P.Avg for research data, X.Values & Y.Values for comparison data)
MAMBAC.Results <- MAMBAC.Plot(Research.Data, I, Curve.Info, Curves, All.Pairs, Ind.Comp, N.End,
N.Cuts, Replications, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y,
Gr.Avg, Gr.Ind)
if (Research.Data) P.Avg <- MAMBAC.Results[[1]]
# print summary of analytic specification (for research data only)
if (Research.Data) MAMBAC.Summarize(Data[,1:I], N, I, Curve.Info, Curves, All.Pairs, Ind.Comp, N.End,
N.Cuts, Replications, Gr.Smooth, Gr.Avg, Comp.Data, N.Pop, N.Samples,
Supplied.P, Supplied.Class)
# generate estimates of taxon base rate (for research data only)
if (Research.Data) P.Est <- MAMBAC.P.Est(Data, N, I, Curve.Info, Curves, All.Pairs, Ind.Comp, N.End,
N.Cuts, Replications, Gr.Smooth, Gr.Avg, P.Avg, Supplied.P, Supplied.Class)
if (!Research.Data)
{
N.Curves <- length(Curve.Info[,1])
P.Estimates <- vector("numeric", N.Curves)
for (i in 1:N.Curves)
{
P.Estimates[i] <- 1/((Curves[[i]][length(Curves[[i]][,2]),2]/Curves[[i]][1, 2]) + 1)
if (P.Estimates[i] < 0) P.Estimates[i] <- 0 # correct for possibilities of
if (P.Estimates[i] > 1) P.Estimates[i] <- 1 # negative values at extremes
}
}
if (Research.Data) return(list(MAMBAC.Results[[2]], MAMBAC.Results[[3]], P.Est[[1]], P.Est[[2]]))
else return(list(MAMBAC.Results, P.Estimates))
}
###########################################################################################################
#
# Program to determine structure of analysis and set up matrix to contain relevant curve information
#
###########################################################################################################
MAMBAC.Structure <- function(I, All.Pairs)
{
# based on indicator configuration, create matrix listing input/output indicators to use for each curve
if (All.Pairs)
{
N.Curves <- I * (I - 1)
Curve.Info <- matrix(nrow = N.Curves, ncol = 3)
Curve.Info[,1] <- 1:N.Curves
Counter <- 0
for (i in 1:(I - 1))
{
for (j in (i + 1):I)
{
Curve.Info[Counter + 1, 2:3] <- c(i, j)
Curve.Info[Counter + 2, 2:3] <- c(j, i)
Counter <- Counter + 2
}
}
}
else
{
N.Curves <- I
Curve.Info <- matrix(nrow = N.Curves, ncol = 2)
Curve.Info[,1] <- 1:N.Curves
Curve.Info[,2] <- 1:N.Curves
}
return(Curve.Info)
}
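# Illustration (a sketch, not part of the original program): with I = 3 and
# All.Pairs = T, the matrix lists 3 * 2 = 6 curves, one per ordered input/output
# pair: (1,2), (2,1), (1,3), (3,1), (2,3), (3,2).
if (FALSE)
{
    MAMBAC.Structure(3, All.Pairs = TRUE)
}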
###########################################################################################################
#
# Program to locate cutting scores along the input and calculate mean differences
#
###########################################################################################################
MAMBAC.Calculate <- function(Data, N, I, Curve.Info, Curve.Num, All.Pairs, Ind.Comp, N.End, N.Cuts,
Replications)
{
# make room in data for composite indicator plus random numbers for resorting at each replication
Data.Matrix <- cbind(Data, rep(0,N), rnorm(N, mean = 0, sd = 1))
# determine input column, creating composite indicator if necessary, and sort by input
if (All.Pairs)
{
Input <- Curve.Info[Curve.Num, 2]
Output <- Curve.Info[Curve.Num, 3]
}
else
{
Output <- Curve.Info[Curve.Num,2]
Input <- I + 1
for (i in 1:I)
Data.Matrix[,(I + 1)] <- Data.Matrix[,(I + 1)] + Data.Matrix[,i]
Data.Matrix[,(I + 1)] <- Data.Matrix[,(I + 1)] - Data.Matrix[,Output]
}
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,Input]),]
Cuts <- seq(from = N.End, to = (N - N.End), length = N.Cuts)
X.Labels <- Cuts
# create matrix to store curve results
Num.Cuts <- length(Cuts)
Curve.Results <- matrix(rep(0, Num.Cuts * 2), nrow = Num.Cuts, ncol = 2)
# having set up subsamples along the input and created a storage structure, perform the calculations
# replications loop; results are cumulated, subsequently divided by number of replications to yield average
for (Rep in 1:Replications)
{
# shuffle cases, then sort by input
Data.Matrix <- Data.Matrix[sample(1:N, N, replace = F),]
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,Input]),]
# begin calculations, storing results in either matrix or array as follows:
# 1st column = labels for x axis
# 2nd column = MAMBAC values
Curve.Results[,1] <- X.Labels
for (i in 1:Num.Cuts)
Curve.Results[i,2] <- Curve.Results[i,2] + mean(Data.Matrix[(Cuts[i]+1):N,Output]) -
                      mean(Data.Matrix[1:Cuts[i],Output])
}
# divide cumulated mean differences by number of replications to yield average values
Curve.Results[,2] <- Curve.Results[,2] / Replications
return(Curve.Results)
}
###########################################################################################################
#
# Program to plot and label a panel of curves
#
###########################################################################################################
MAMBAC.Plot <- function(Research.Data, I, Curve.Info, Curves, All.Pairs, Ind.Comp, N.End, N.Cuts,
Replications, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y, Gr.Avg, Gr.Ind)
{
P.Avg <- 0
N.Curves <- length(Curve.Info[,1])
# create and label new graph sheet
if (Research.Data)
{
if (Gr.Orient == 1) dev.new(width=8.5, height=11)
else dev.new(width=11, height=8.5)
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25))
Page <- 1
# if applying a common Y scale for all graphs, set the range here (algorithm accentuates peak/flatness)
if (Gr.Common.Y)
{
Ys <- 0
for (i in 1:N.Curves)
Ys <- c(Ys, Curves[[i]][,2])
Y.Min <- min(Ys[2:length(Ys)])
Y.Max <- max(Ys[2:length(Ys)])
Y.Range <- c(Y.Min, Y.Max)
}
# plot each curve
for (i in 1:N.Curves)
{
if (All.Pairs)
{
Input <- Curve.Info[i,2]
Output <- Curve.Info[i,3]
}
else
{
Input <- Curve.Info[i,1]
Output <- Curve.Info[i,2]
}
X.Values <- Curves[[i]][,1]
Y.Values <- Curves[[i]][,2]
X.Range <- c(min(X.Values), max(X.Values))
if (!Gr.Common.Y) Y.Range <- c(min(Y.Values), max(Y.Values))
# label the x axis in accord with input method and indicator
if (All.Pairs) X.Label <- paste(N.Cuts,"Cuts on Indicator",Input)
else X.Label <- paste(N.Cuts,"Cuts")
# label the y axis
Y.Label <- paste("Mean Difference, Indicator",Output)
# check to see whether new graph sheet needs to be created
if ((i > 1) & ((i - 1) %% (Gr.Rows * Gr.Cols) == 0))
{
mtext(paste("MAMBAC CURVES, page ",Page,sep=""),outer=T)
if (Gr.Orient == 1) dev.new(width=8.5, height=11)
else dev.new(width=11, height=8.5)
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25))
Page <- Page + 1
}
# plot the graph, plus smoothed curve if requested
plot(X.Values, Y.Values, type = "p", pch = 16, ylim = Y.Range, xlim = X.Range,
ylab = Y.Label, xlab = X.Label)
if (!Gr.Smooth) # no smoothing, connect "raw" data points
lines(X.Values, Y.Values) # solid line for raw data only
else
lines(lowess(X.Values, Y.Values)) # dotted line for LOWESS smoothed curve
}
mtext(paste("MAMBAC CURVES, page ",Page,sep=""),outer=T)
}
if (Gr.Ind & Research.Data)
{
if (Gr.Orient == 1) dev.new(width=8.5, height=11)
else dev.new(width=11, height=8.5)
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25))
# average y values for curves using the same input indicator
Ind.Graphs <- matrix(nrow = I, ncol = N.Cuts)
N.Each.Ind <- N.Curves / I
for (i in 1:I)
{
Ind.Curves <- Curve.Info[,1][Curve.Info[,2] == i]
Ys <- vector("numeric",N.Cuts)
for (j in 1:N.Cuts)
for (k in Ind.Curves)
Ys[j] <- Ys[j] + Curves[[k]][j,2]
Ind.Graphs[i,] <- Ys / N.Each.Ind
}
# plot a curve for each indicator
Page <- 1
for (i in 1:I)
{
Input <- i
X.Values <- Curves[[i * N.Each.Ind]][,1]
X.Range <- c(min(X.Values), max(X.Values))
Ind.Curves <- Curve.Info[,1][Curve.Info[,2] == i]
Y.Values <- Ind.Graphs[i,]
for (j in Ind.Curves)
Y.Values <- c(Y.Values,Curves[[j]][,2])
Y.Range <- c(min(Y.Values), max(Y.Values))
X.Label <- paste(N.Cuts,"Cuts on Indicator",i)
Y.Label <- "Mean Difference"
# check to see whether new graph sheet needs to be created
if ((i > 1) & ((i - 1) %% (Gr.Rows * Gr.Cols) == 0))
{
mtext(paste("MAMBAC INDICATOR CURVES, page ",Page,sep=""),outer=T)
if (Gr.Orient == 1) dev.new(width=8.5, height=11)
else dev.new(width=11, height=8.5)
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25))
Page <- Page + 1
}
# plot the averaged graph, with dotted lines for each individual curve
plot(X.Values, Ind.Graphs[i,], type = "l", pch = 16, ylim = Y.Range, xlim = X.Range,
ylab = Y.Label, xlab = X.Label, lwd = 2)
for (j in Ind.Curves)
lines(Curves[[j]][,1],Curves[[j]][,2], lty = 3, lwd = 2)
}
mtext(paste("MAMBAC INDICATOR CURVES, page",Page),outer=T)
}
if (Gr.Avg)
{
if (Research.Data)
{
dev.new(width=3.7, height=3.5)
par(omi=c(.25,.25,.25,.25), cex = .75)
}
X.Values <- Curves[[1]][,1]
X.Range <- c(min(X.Values), max(X.Values))
Y.Values <- rep(0,length(X.Values))
for (i in 1:N.Curves)
Y.Values <- Y.Values + Curves[[i]][,2]
Y.Values <- Y.Values / N.Curves
Y.Vals <- Y.Values
for (j in 1:N.Curves)
Y.Vals <- c(Y.Vals,Curves[[j]][,2])
Y.Range <- c(min(Y.Vals), max(Y.Vals))
X.Label <- paste(N.Cuts,"Cuts")
Y.Label <- "Mean Difference"
P.Avg <- 1/((Y.Values[length(Y.Values)]/Y.Values[1]) + 1)
if (P.Avg < 0) P.Avg <- 0 # correct for possibilities of negative values at extremes
if (P.Avg > 1) P.Avg <- 1
if (Research.Data)
{
plot(X.Values, Y.Values, type = "l", pch = 16, ylim = Y.Range, xlim = X.Range,
ylab = Y.Label, xlab = X.Label, main = "Averaged MAMBAC Curve", lwd = 2)
for (j in 1:N.Curves)
lines(Curves[[j]][,1],Curves[[j]][,2], lty = 3, lwd = 2)
}
}
if (!Research.Data) return(Y.Values)
else return(list(P.Avg,X.Values,Y.Values))
}
###########################################################################################################
#
# Function to produce text summary of analytic specifications
#
###########################################################################################################
MAMBAC.Summarize <- function(Data, N, I, Curve.Info, Curves, All.Pairs, Ind.Comp, N.End, N.Cuts,
Replications, Gr.Smooth, Gr.Avg, Comp.Data, N.Pop, N.Samples, Supplied.P,
Supplied.Class)
{
# report relevant analytic information and program specifications
cat("\n\nSUMMARY OF MAMBAC ANALYTIC SPECIFICATIONS\n")
cat("\nSample size: ",N,"\n")
cat("Number of indicator variables: ",I,"\n")
cat("Replications: ",Replications,"\n")
cat("Cuts: ",N.Cuts,"evenly-spaced cuts beginning",N.End,"cases from either extreme\n")
if (All.Pairs) Ind.Label <- "Variables serve in all possible input-output pairs"
else Ind.Label <- paste("One variable serves as output, input = sum of remaining",(I-1),"variables")
cat("Indicators: ",Ind.Label,"\n")
cat("Total number of curves: ",length(Curve.Info[,1]),"\n")
if (Gr.Smooth) cat("Y values smoothed for graphing and P estimation: Yes, LOWESS method\n")
else cat("Y values smoothed for graphing and P estimation: No\n")
if ((Supplied.P == 0) & (!Supplied.Class))
cat("Classification of cases: Cut total score at estimated base rate\n")
if (Supplied.P > 0) cat("Classification of cases: Cut total score at supplied base rate\n")
if (Supplied.Class) cat("Classification of cases: Supplied with data\n")
if (Comp.Data)
{
cat("Size of finite populations of comparison data: ",N.Pop,"\n")
cat("Number of samples of comparison data drawn from each population: ",N.Samples,"\n")
}
cat("\n")
}
###########################################################################################################
#
# Function to generate and report estimates of the taxon base rate and other latent parameters
#
###########################################################################################################
MAMBAC.P.Est <- function(Data, N, I, Curve.Info, Curves, All.Pairs, Ind.Comp, N.End, N.Cuts, Replications,
Gr.Smooth, Gr.Avg, P.Avg, Supplied.P, Supplied.Class)
{
cat("\n\nSUMMARY OF MAMBAC PARAMETER ESTIMATES\n")
# estimate base rate for each curve
N.Curves <- length(Curve.Info[,1])
P.Estimates <- matrix(nrow = N.Curves, ncol = 1)
if (Gr.Smooth)
for (i in 1:N.Curves)
{
Num.Y <- length(Curves[[i]][,1])
Lowess.Y <- lowess(Curves[[i]][,1],Curves[[i]][,2])$y
for (j in 1:(Num.Y-1))
if (Curves[[i]][j,1] == Curves[[i]][(j + 1),1])
Lowess.Y <- c(Lowess.Y[1:j],Lowess.Y[j],Lowess.Y[(j+1):length(Lowess.Y)])
Lowess.Y <- Lowess.Y[1:Num.Y]
P.Estimates[i,1] <- 1/((Lowess.Y[Num.Y]/Lowess.Y[1]) + 1)
if (P.Estimates[i,1] < 0) P.Estimates[i,1] <- 0 # correct for possibilities of
if (P.Estimates[i,1] > 1) P.Estimates[i,1] <- 1 # negative values at extremes
}
else
for (i in 1:N.Curves)
{
P.Estimates[i,1] <- 1/((Curves[[i]][length(Curves[[i]][,2]),2]/Curves[[i]][1, 2]) + 1)
if (P.Estimates[i,1] < 0) P.Estimates[i,1] <- 0 # correct for possibilities of
if (P.Estimates[i,1] > 1) P.Estimates[i,1] <- 1 # negative values at extremes
}
Row.Labels <- paste("Curve",1:N.Curves)
Col.Labels <- c("P")
dimnames(P.Estimates) <- list(Row.Labels, Col.Labels)
cat("\nEstimated taxon base rates for each curve:\n\n")
print(round(P.Estimates,3))
P.Est.M <- mean(P.Estimates[,1])
P.Est.SD <- sd(P.Estimates[,1])
cat("\nSummary of base rate estimates across curves:\n M =",round(P.Est.M,3),
"\n SD =",round(P.Est.SD,3),"\n\n")
# if more than one curve per indicator, gather P estimates for each indicator
if (All.Pairs & (I > 2))
{
Ind.Est <- matrix(rep(0, I), nrow = I, ncol = 1)
N.Each.Ind <- N.Curves / I
for (i in 1:N.Curves)
Ind.Est[Curve.Info[i,2]] <- Ind.Est[Curve.Info[i,2]] + P.Estimates[i]
Ind.Est[,1] <- Ind.Est[,1] / N.Each.Ind
Row.Labels <- paste("Indicator",1:I)
Col.Labels <- c("P")
dimnames(Ind.Est) <- list(Row.Labels, Col.Labels)
cat("Estimated taxon base rates for each indicator:\n\n")
print(round(Ind.Est,3))
cat("\n")
cat("Summary of base rate estimates across indicators:\n M =",round(mean(Ind.Est[,1]),3),
"\n SD =",round(sd(Ind.Est[,1]),3),"\n\n")
}
# print base rate estimate for averaged curve if appropriate
if (Gr.Avg)
cat("Base rate estimate for averaged curve =",round(P.Avg, 3),"\n\n")
# assign cases to classes using base-rate classification method with estimated base rate
if ((Supplied.P == 0) & (!Supplied.Class))
{
Data.Matrix <- cbind(Data[,1:I],rep(0,N),rep(0,N))
for (i in 1:I)
Data.Matrix[, (I + 1)] <- Data.Matrix[, (I + 1)] + Data.Matrix[,i]
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 1)]),]
P.Est.Series <- c(rep(2, trunc(N * P.Est.M + .5)))
Q.Est.Series <- c(rep(1, N - length(P.Est.Series)))
Data.Matrix[,(I + 2)] <- c(Q.Est.Series, P.Est.Series)
}
# assign cases to classes using base-rate classification method with supplied base rate
if (Supplied.P > 0)
{
Data.Matrix <- cbind(Data[,1:I],rep(0,N),rep(0,N))
for (i in 1:I)
Data.Matrix[, (I + 1)] <- Data.Matrix[, (I + 1)] + Data.Matrix[,i]
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 1)]),]
P.Est.Series <- c(rep(2, trunc(N * Supplied.P + .5)))
Q.Est.Series <- c(rep(1, N - length(P.Est.Series)))
Data.Matrix[,(I + 2)] <- c(Q.Est.Series, P.Est.Series)
}
# assign cases to classes using supplied classification
if (Supplied.Class)
Data.Matrix <- cbind(Data[,1:I], rep(0,N), Data[, (I + 1)])
# Construct full-sample and within-groups data files
Data.f <- as.matrix(Data.Matrix[,(1:I)])
Nt <- sum(Data.Matrix[,(I + 2)]) - N
Nc <- N - Nt
Data.t <- Data.Matrix[(Data.Matrix[,(I + 2)] == 2),1:I]
Data.c <- Data.Matrix[(Data.Matrix[,(I + 2)] == 1),1:I]
for (i in 1:I)
{
if (min(Data.t[,i]) == max(Data.t[,i])) Data.t[,i] <- Data.t[,i] + rnorm(Nt, 0, .0001)
if (min(Data.c[,i]) == max(Data.c[,i])) Data.c[,i] <- Data.c[,i] + rnorm(Nc, 0, .0001)
}
# Calculate distributions for each indicator (full sample, taxon, complement)
Dist.Full <- matrix(nrow = I + 2, ncol = 4)
Dist.Tax <- matrix(nrow = I + 2, ncol = 4)
Dist.Comp <- matrix(nrow = I + 2, ncol = 4)
Rows <- c(paste("Ind",1:I), "M", "SD")
Cols <- c("M","SD","Skew","Kurtosis")
dimnames(Dist.Full) <- list(Rows, Cols)
dimnames(Dist.Tax) <- list(Rows, Cols)
dimnames(Dist.Comp) <- list(Rows, Cols)
for (i in 1:I)
{
Dist.Full[i,1] <- mean(Data.f[,i])
Dist.Tax[i,1] <- mean(Data.t[,i])
Dist.Comp[i,1] <- mean(Data.c[,i])
Dist.Full[i,2] <- sd(Data.f[,i])
Dist.Tax[i,2] <- sd(Data.t[,i])
Dist.Comp[i,2] <- sd(Data.c[,i])
Dist.Full[i,3] <- Skew(Data.f[,i])
Dist.Tax[i,3] <- Skew(Data.t[,i])
Dist.Comp[i,3] <- Skew(Data.c[,i])
Dist.Full[i,4] <- Kurtosis(Data.f[,i])
Dist.Tax[i,4] <- Kurtosis(Data.t[,i])
Dist.Comp[i,4] <- Kurtosis(Data.c[,i])
}
for (i in 1:4)
{
Dist.Full[I + 1,i] <- mean(Dist.Full[1:I,i])
Dist.Tax[I + 1,i] <- mean(Dist.Tax[1:I,i])
Dist.Comp[I + 1,i] <- mean(Dist.Comp[1:I,i])
Dist.Full[I + 2,i] <- sd(Dist.Full[1:I,i])
Dist.Tax[I + 2,i] <- sd(Dist.Tax[1:I,i])
Dist.Comp[I + 2,i] <- sd(Dist.Comp[1:I,i])
}
cat("\nIndicator distributions in the full sample ( N =",N,"):\n\n")
print(round(Dist.Full,3))
cat("\nIndicator distributions in the taxon ( n =",Nt,"):\n\n")
print(round(Dist.Tax,3))
cat("\nIndicator distributions in the complement ( n =",Nc,"):\n\n")
print(round(Dist.Comp,3))
# calculate validity for each indicator
Ind.Val <- matrix(nrow = I + 2, ncol = 2)
Rows <- c(paste("Ind",1:I), "M", "SD")
Cols <- c("Raw Units","Cohen's d")
dimnames(Ind.Val) <- list(Rows, Cols)
for (i in 1:I)
{
Ind.Val[i,1] <- Dist.Tax[i,1] - Dist.Comp[i,1]
Ind.Val[i,2] <- Ind.Val[i,1] / sqrt(((Nt - 1) * Dist.Tax[i,2]^2 + (Nc - 1) * Dist.Comp[i,2]^2)
/ (Nt + Nc - 2))
}
Ind.Val[I + 1,1] <- mean(Ind.Val[1:I,1])
Ind.Val[I + 1,2] <- mean(Ind.Val[1:I,2])
Ind.Val[I + 2,1] <- sd(Ind.Val[1:I,1])
Ind.Val[I + 2,2] <- sd(Ind.Val[1:I,2])
cat("\nBetween-group validity:\n\n")
print(round(Ind.Val,3))
# Calculate, print, and summarize full-sample and within-group correlations
Cor.Matrix <- cor(Data.f[,1:I])
Rows <- paste("Ind",1:I)
Cols <- paste("Ind",1:I)
dimnames(Cor.Matrix) <- list(Rows, Cols)
cat("\nIndicator correlations:\n\nFull Sample ( N =",N,"):\n\n")
print(round(Cor.Matrix,3))
Cor.Taxon <- cor(Data.t)
Cor.Comp <- cor(Data.c)
dimnames(Cor.Taxon) <- list(Rows, Cols)
dimnames(Cor.Comp) <- list(Rows, Cols)
cat("\nTaxon ( n =",Nt,"):\n\n")
print(round(Cor.Taxon,3))
cat("\nComplement ( n =",Nc,"):\n\n")
print(round(Cor.Comp,3))
Cor.Summary <- matrix(nrow = 3, ncol = 2)
dimnames(Cor.Summary) <- list(c("Full Sample","Taxon", "Complement"), c("M", "SD"))
Cor.Summary[1,1] <- mean(Cor.Matrix[lower.tri(Cor.Matrix)])
Cor.Summary[2,1] <- mean(Cor.Taxon[lower.tri(Cor.Taxon)])
Cor.Summary[3,1] <- mean(Cor.Comp[lower.tri(Cor.Comp)])
Cor.Summary[1,2] <- sd(Cor.Matrix[lower.tri(Cor.Matrix)])
Cor.Summary[2,2] <- sd(Cor.Taxon[lower.tri(Cor.Taxon)])
Cor.Summary[3,2] <- sd(Cor.Comp[lower.tri(Cor.Comp)])
cat("\nSummary of indicator correlations:\n\n")
print(round(Cor.Summary,3))
cat("\n")
# return M and SD of base rate estimates
return(list(P.Est.M,P.Est.SD))
}
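# Worked illustration of the curve-end base rate formula above (a sketch with made-up
# y values, not part of the original program): a right endpoint three times the left
# endpoint implies a taxon base rate of 1 / (3 + 1) = .25.
if (FALSE)
{
    y.first <- .2
    y.last <- .6
    1 / ((y.last / y.first) + 1)   # = .25
}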
################################################################################################################
GenData <- function(Supplied.Data, N, N.Factors = 0, Max.Trials = 5, Initial.Multiplier = 1, seed = 0)
{
# Initialize variables and (if applicable) set random number seed (step 1) -------------------------------------
k <- dim(Supplied.Data)[2] # Number of variables
Data <- matrix(0, nrow = N, ncol = k) # Matrix to store the simulated data
Distributions <- matrix(0, nrow = N, ncol = k) # Matrix to store each variable's score distribution
Iteration <- 0 # Iteration counter
Best.RMSR <- 1 # Lowest RMSR correlation
Trials.Without.Improvement <- 0 # Trial counter
if (seed != 0) set.seed(seed) # If user specified a random seed, set it
# Generate distribution for each variable (step 2) -------------------------------------------------------------
for (i in 1:k)
{
if (min(Supplied.Data[,i]) == max(Supplied.Data[,i]))
Supplied.Data[,i] <- Supplied.Data[,i] + rnorm(dim(Supplied.Data)[1], 0, .0001)
V <- F
while (!V)
{
Distributions[,i] <- sort(sample(Supplied.Data[,i], N, replace = T))
V <- min(Distributions[,i]) != max(Distributions[,i])
}
}
# Calculate and store a copy of the target correlation matrix (step 3) -----------------------------------------
Target.Corr <- cor(Supplied.Data)
Intermediate.Corr <- Target.Corr
# If number of latent factors was not specified, determine it through parallel analysis (step 4) ---------------
if (N.Factors == 0)
{
Eigenvalues.Observed <- eigen(Intermediate.Corr)$values
Eigenvalues.Random <- matrix(0, nrow = 100, ncol = k)
Random.Data <- matrix(0, nrow = N, ncol = k)
for (i in 1:100)
{
for (j in 1:k)
{
V <- F
while (!V)
{
Random.Data[,j] <- sample(Distributions[,j], size = N, replace = TRUE)
V <- min(Random.Data[,j]) != max(Random.Data[,j])
}
}
Eigenvalues.Random[i,] <- eigen(cor(Random.Data))$values
}
Eigenvalues.Random <- apply(Eigenvalues.Random, 2, mean) # calculate mean eigenvalue for each factor
N.Factors <- max(1, sum(Eigenvalues.Observed > Eigenvalues.Random))
}
# Generate random normal data for shared and unique components, initialize factor loadings (steps 5, 6) --------
Shared.Comp <- matrix(rnorm(N * N.Factors, 0, 1), nrow = N, ncol = N.Factors)
Unique.Comp <- matrix(rnorm(N * k, 0, 1), nrow = N, ncol = k)
Shared.Load <- matrix(0, nrow = k, ncol = N.Factors)
Unique.Load <- matrix(0, nrow = k, ncol = 1)
# Begin loop that ends when specified number of iterations pass without improvement in RMSR correlation --------
while (Trials.Without.Improvement < Max.Trials)
{
Iteration <- Iteration + 1
# Calculate factor loadings and apply to reproduce desired correlations (steps 7, 8) ---------------------------
Fact.Anal <- Factor.Analysis(Intermediate.Corr, Corr.Matrix = TRUE, N.Factors = N.Factors)
if (N.Factors == 1) Shared.Load[,1] <- Fact.Anal$loadings
else
for (i in 1:N.Factors)
Shared.Load[,i] <- Fact.Anal$loadings[,i]
Shared.Load[Shared.Load > 1] <- 1
Shared.Load[Shared.Load < -1] <- -1
if (Shared.Load[1,1] < 0) Shared.Load <- Shared.Load * -1
for (i in 1:k)
if (sum(Shared.Load[i,] * Shared.Load[i,]) < 1) Unique.Load[i,1] <-
(1 - sum(Shared.Load[i,] * Shared.Load[i,]))
else Unique.Load[i,1] <- 0
Unique.Load <- sqrt(Unique.Load)
for (i in 1:k)
Data[,i] <- (Shared.Comp %*% t(Shared.Load))[,i] + Unique.Comp[,i] * Unique.Load[i,1]
# Replace normal with nonnormal distributions (step 9) ---------------------------------------------------------
for (i in 1:k)
{
Data <- Data[sort.list(Data[,i]),]
Data[,i] <- Distributions[,i]
}
# Calculate RMSR correlation, compare to lowest value, take appropriate action (steps 10, 11, 12) --------------
Reproduced.Corr <- cor(Data)
Residual.Corr <- Target.Corr - Reproduced.Corr
RMSR <- sqrt(sum(Residual.Corr[lower.tri(Residual.Corr)] * Residual.Corr[lower.tri(Residual.Corr)]) /
(.5 * (k * k - k)))
if (RMSR < Best.RMSR)
{
Best.RMSR <- RMSR
Best.Corr <- Intermediate.Corr
Best.Res <- Residual.Corr
Intermediate.Corr <- Intermediate.Corr + Initial.Multiplier * Residual.Corr
Trials.Without.Improvement <- 0
}
else
{
Trials.Without.Improvement <- Trials.Without.Improvement + 1
Current.Multiplier <- Initial.Multiplier * .5 ^ Trials.Without.Improvement
Intermediate.Corr <- Best.Corr + Current.Multiplier * Best.Res
}
}
# Construct the data set with the lowest RMSR correlation (step 13) --------------------------------------------
Fact.Anal <- Factor.Analysis(Best.Corr, Corr.Matrix = TRUE, N.Factors = N.Factors)
if (N.Factors == 1) Shared.Load[,1] <- Fact.Anal$loadings
else
for (i in 1:N.Factors)
Shared.Load[,i] <- Fact.Anal$loadings[,i]
Shared.Load[Shared.Load > 1] <- 1
Shared.Load[Shared.Load < -1] <- -1
if (Shared.Load[1,1] < 0) Shared.Load <- Shared.Load * -1
for (i in 1:k)
if (sum(Shared.Load[i,] * Shared.Load[i,]) < 1) Unique.Load[i,1] <-
(1 - sum(Shared.Load[i,] * Shared.Load[i,]))
else Unique.Load[i,1] <- 0
Unique.Load <- sqrt(Unique.Load)
for (i in 1:k)
Data[,i] <- (Shared.Comp %*% t(Shared.Load))[,i] + Unique.Comp[,i] * Unique.Load[i,1]
Data <- apply(Data, 2, scale) # standardizes each variable in the matrix
for (i in 1:k)
{
Data <- Data[sort.list(Data[,i]),]
Data[,i] <- Distributions[,i]
}
# Report the results and return the simulated data set (step 14) -----------------------------------------------
return(Data)
}
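# Example usage (a sketch, not part of the original program; "My.Data" is assumed to be
# a matrix or data frame holding indicator variables only):
if (FALSE)
{
    Sim.Pop <- GenData(My.Data, N = 100000)
    round(cor(My.Data) - cor(Sim.Pop), 3)   # residual correlations should be near zero
}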
###########################################################################################################
# Factor analysis programs for R
# Written by Niels Waller, adapted for use with John Ruscio's taxometric programs
###########################################################################################################
Factor.Analysis <- function(Data, Corr.Matrix = FALSE, Max.Iter = 50, N.Factors = 0)
{
Data <- as.matrix(Data)
k <- dim(Data)[2]
if (N.Factors == 0) N.Factors <- k
if (!Corr.Matrix) Cor.Matrix <- cor(Data)
else Cor.Matrix <- Data
Criterion <- .001
Old.H2 <- rep(99, k)
H2 <- rep(0, k)
Change <- 1
Iter <- 0
Factor.Loadings <- matrix(nrow = k, ncol = N.Factors)
while ((Change >= Criterion) & (Iter < Max.Iter))
{
Iter <- Iter + 1
Eig <- eigen(Cor.Matrix)
L <- sqrt(Eig$values[1:N.Factors])
for (i in 1:N.Factors)
Factor.Loadings[,i] <- Eig$vectors[,i] * L[i]
for (i in 1:k)
H2[i] <- sum(Factor.Loadings[i,] * Factor.Loadings[i,])
Change <- max(abs(Old.H2 - H2))
Old.H2 <- H2
diag(Cor.Matrix) <- H2
}
if (N.Factors == k) N.Factors <- sum(Eig$values > 1)
return(list(loadings = Factor.Loadings[,1:N.Factors], factors = N.Factors))
}
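# Example usage (a sketch, not part of the original program): iterative principal-factor
# analysis of a hypothetical equicorrelated matrix, retaining one factor:
if (FALSE)
{
    R <- matrix(.5, nrow = 4, ncol = 4)
    diag(R) <- 1
    Factor.Analysis(R, Corr.Matrix = TRUE, N.Factors = 1)$loadings
}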
Factor.Analysis.LMode <- function(Data, Max.Iter = 50)
{
Data <- as.matrix(Data)
Cor.Matrix <- cor(Data)
Criterion <- .001
Old.H2 <- rep(99,dim(Cor.Matrix)[2])
Change <- 1
Iter <- 0
while ((Change >= Criterion) & (Iter < Max.Iter))
{
Iter <- Iter + 1
Eig <- eigen(Cor.Matrix)
L <- sqrt(Eig$values[1])
Factor.Loadings <- Eig$vectors[,1] * L
H2 <- Factor.Loadings * Factor.Loadings
Change <- max(abs(Old.H2 - H2))
Old.H2 <- H2
diag(Cor.Matrix) <- H2
}
Factor.Scores <- apply(Data %*% Factor.Loadings %*% solve(t(Factor.Loadings) %*% Factor.Loadings), 2, scale)
return(list(loadings = Factor.Loadings, scores = Factor.Scores))
}
###########################################################################################################
# Program used by L-Mode to calculate best RMSR fit for two densities, allowing horizontal shifts (a)
###########################################################################################################
Fit.Densities <- function(a, data)
{
x1 <- x2 <- data[[1]]$x
y1 <- data[[1]]$y
y2 <- data[[2]]
Nx <- length(x1)
w <- (max(y1)-min(y1)) / (max(x1)-min(x1))
x1 <- x1 * w
x2 <- x2 * w
dist <- rep(0, Nx)
for (i in 1:Nx)
dist[i] <- min(sqrt((a + x1[i] - x2)^2 + (y1[i] - y2)^2))
fit <- sqrt(sum(dist^2) / Nx)
return(fit)
}
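# Example usage (a sketch, not part of the original program): RMSR fit between two
# kernel densities evaluated on a common grid, minimized over the horizontal shift a:
if (FALSE)
{
    d1 <- density(rnorm(1000))
    d2 <- density(rnorm(1000, mean = .5), from = min(d1$x), to = max(d1$x))
    optimize(f = Fit.Densities, interval = c(-1, 1), list(d1, d2$y))$objective
}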
###########################################################################################################
#
# CreateData routine for simulating data from a handful of basic parameters
#
# Written by John Ruscio
# Last modified April 5, 2008
#
###########################################################################################################
CreateData <- function (Str, N = 600, k = 4, P = .50, d = 2.00, r = .00, Tax.r = .00, Comp.r = .00,
Skew = 0, Cuts = 0)
{
if (Skew < 0) P <- 1 - P
if (Str == "Dim")
{
newdata <- matrix(rep(0, N * k), ncol = k)
if (r > 0)
R.expected <- r
else
{
r.wg <- mean(c(Tax.r, Comp.r))
R.expected <- (P * (1 - P) * d ^ 2 + r.wg) / (1 + (P * (1 - P) * d ^ 2))
}
Loading.Common <- sqrt(R.expected)
Loading.Error <- sqrt(1 - R.expected)
Random.Common <- rnorm(N, mean = 0, sd = 1)
for (i in 1:k)
newdata[,i] <- Loading.Common * Random.Common + Loading.Error * rnorm(N, mean = 0, sd = 1)
}
else # create categorical data
{
newdata <- matrix(rep(0, N * (k + 1)), ncol = (k + 1))
N.Tax <- round(N * P)
N.Comp <- N - N.Tax
newdata[,(k + 1)] <- c(rep(2, N.Tax),rep(1, N.Comp))
# create taxon (with desired within-group correlation)
Loading.Common <- sqrt(Tax.r)
Loading.Error <- sqrt(1 - Tax.r)
Random.Common <- rnorm(N.Tax, mean = 0, sd = 1)
for (i in 1:k)
newdata[1:N.Tax,i] <- Loading.Common * Random.Common + Loading.Error * rnorm(N.Tax, mean = 0, sd = 1)
# create complement (with desired within-group correlation)
Loading.Common <- sqrt(Comp.r)
Loading.Error <- sqrt(1 - Comp.r)
Random.Common <- rnorm(N.Comp, mean = 0, sd = 1)
for (i in 1:k)
newdata[(N.Tax + 1):N,i] <- Loading.Common * Random.Common + Loading.Error * rnorm(N.Comp, mean = 0, sd = 1)
# add in categorical separation
if (Skew >= 0)
for (i in 1:k)
newdata[,i] <- newdata[,i] + (d * (newdata[, (k + 1)] - 1.5))
else
for (i in 1:k)
newdata[,i] <- newdata[,i] - (d * (newdata[, (k + 1)] - 1.5))
}
if (Skew > 0)
for (i in 1:k)
newdata[,i] <- exp(newdata[,i]/Skew)
if (Skew < 0)
{
for (i in 1:k)
newdata[,i] <- -exp(newdata[,i]/Skew)
if (Str == "Tax") newdata[,(k + 1)] <- 3 - newdata[,(k + 1)]
}
newdata[,1:k] <- apply(newdata[,1:k], 2, scale)
if (Cuts > 0)
{
for (i in 1:k)
{
Sorted.X <- newdata[sort.list(newdata[,i]),i]
Trim <- Sorted.X[round(.01 * N):round(.99 * N)]
Dens <- density(Trim, from = min(Trim), to = max(Trim))
Dens.N <- length(Dens$y)
Cut.Pts <- round(seq(from = 0, to = Dens.N, length = (Cuts + 1)))
Cut.Vals <- vector("numeric",Cuts)
for (j in 1:Cuts)
Cut.Vals[j] <- sum(Dens$y[(Cut.Pts[j] + 1):Cut.Pts[j + 1]])
Vals <- round(Cut.Vals * (N / sum(Cut.Vals)))
if (sum(Vals) != N) Vals[round(Cuts/2)] <- Vals[round(Cuts/2)] + (N - sum(Vals))
for (j in 1:Cuts)
Cut.Vals[j] <- Sorted.X[sum(Vals[1:j])]
Cut.Vals <- c(-100,Cut.Vals,100)
newdata[,i] <- cut(newdata[,i], breaks = Cut.Vals)[1:N]
}
}
newdata <- as.data.frame(newdata)
return(newdata)
}
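# Usage sketch (illustrative; parameter values are arbitrary assumptions): taxonic data carry the
# class codes (1 = complement, 2 = taxon) in the final column; dimensional data have no class column.
if (FALSE)
{
set.seed(1)
Tax.Sample <- CreateData("Tax", N = 600, k = 4, P = .25, d = 2)
Dim.Sample <- CreateData("Dim", N = 600, k = 4, r = .30)
table(Tax.Sample[,5]) # 450 complement (1) and 150 taxon (2) members
}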
###########################################################################################################
#
# Indicator.Dist
#
# This replaces the older "Validity.Est" program
#
# Written by John Ruscio
# Last modified November 10, 2008
#
###########################################################################################################
#
# Program to calculate indicator distributions and correlations in the full sample and within groups,
# as well as between-group separation. Uses the final column of data as a classification variable (1 =
# complement, 2 = taxon). Prints raw and standardized (using pooled variances, weighted by df) validity
# values for each indicator, followed by distributional summary statistics for each indicator and the
# indicator correlation matrix (in the full sample and within groups).
#
###########################################################################################################
Indicator.Dist <- function(Data)
{
# Read N (# cases), I (# items), Nt (number taxon members), Nc (number complement members)
Data <- as.matrix(Data)
N <- dim(Data)[1]
I <- dim(Data)[2] - 1
Nt <- sum(Data[,(I + 1)]) - N
Nc <- N - Nt
Data.f <- Data[,1:I]
Data.t <- Data[(Data[,(I + 1)] == 2),1:I]
Data.c <- Data[(Data[,(I + 1)] == 1),1:I]
for (i in 1:I)
{
if (min(Data.t[,i]) == max(Data.t[,i])) Data.t[,i] <- Data.t[,i] + rnorm(Nt, 0, .0001)
if (min(Data.c[,i]) == max(Data.c[,i])) Data.c[,i] <- Data.c[,i] + rnorm(Nc, 0, .0001)
}
# Calculate distributions for each indicator (full sample, taxon, complement)
Dist.Full <- matrix(nrow = I + 2, ncol = 4)
Dist.Tax <- matrix(nrow = I + 2, ncol = 4)
Dist.Comp <- matrix(nrow = I + 2, ncol = 4)
Rows <- c(paste("Ind",1:I), "M", "SD")
Cols <- c("M","SD","Skew","Kurtosis")
dimnames(Dist.Full) <- list(Rows, Cols)
dimnames(Dist.Tax) <- list(Rows, Cols)
dimnames(Dist.Comp) <- list(Rows, Cols)
for (i in 1:I)
{
Dist.Full[i,1] <- mean(Data.f[,i])
Dist.Tax[i,1] <- mean(Data.t[,i])
Dist.Comp[i,1] <- mean(Data.c[,i])
Dist.Full[i,2] <- sd(Data.f[,i])
Dist.Tax[i,2] <- sd(Data.t[,i])
Dist.Comp[i,2] <- sd(Data.c[,i])
Dist.Full[i,3] <- Skew(Data.f[,i])
Dist.Tax[i,3] <- Skew(Data.t[,i])
Dist.Comp[i,3] <- Skew(Data.c[,i])
Dist.Full[i,4] <- Kurtosis(Data.f[,i])
Dist.Tax[i,4] <- Kurtosis(Data.t[,i])
Dist.Comp[i,4] <- Kurtosis(Data.c[,i])
}
for (i in 1:4)
{
Dist.Full[I + 1,i] <- mean(Dist.Full[1:I,i])
Dist.Tax[I + 1,i] <- mean(Dist.Tax[1:I,i])
Dist.Comp[I + 1,i] <- mean(Dist.Comp[1:I,i])
Dist.Full[I + 2,i] <- sd(Dist.Full[1:I,i])
Dist.Tax[I + 2,i] <- sd(Dist.Tax[1:I,i])
Dist.Comp[I + 2,i] <- sd(Dist.Comp[1:I,i])
}
cat("\nIndicator distributions in the full sample ( N =",N,"):\n\n")
print(round(Dist.Full,3))
cat("\nIndicator distributions in the taxon ( n =",Nt,"):\n\n")
print(round(Dist.Tax,3))
cat("\nIndicator distributions in the complement ( n =",Nc,"):\n\n")
print(round(Dist.Comp,3))
# calculate validity for each indicator
Ind.Val <- matrix(nrow = I + 2, ncol = 2)
Rows <- c(paste("Ind",1:I), "M", "SD")
Cols <- c("Raw Units","Cohen's d")
dimnames(Ind.Val) <- list(Rows, Cols)
for (i in 1:I)
{
Ind.Val[i,1] <- Dist.Tax[i,1] - Dist.Comp[i,1]
Ind.Val[i,2] <- Ind.Val[i,1] / sqrt(((Nt - 1) * Dist.Tax[i,2]^2 + (Nc - 1) * Dist.Comp[i,2]^2)
/ (Nt + Nc - 2))
}
Ind.Val[I + 1,1] <- mean(Ind.Val[1:I,1])
Ind.Val[I + 1,2] <- mean(Ind.Val[1:I,2])
Ind.Val[I + 2,1] <- sd(Ind.Val[1:I,1])
Ind.Val[I + 2,2] <- sd(Ind.Val[1:I,2])
cat("\nBetween-group validity:\n\n")
print(round(Ind.Val,3))
# Calculate, print, and summarize full-sample and within-group correlations
Cor.Matrix <- cor(Data.f[,1:I])
Rows <- paste("Ind",1:I)
Cols <- paste("Ind",1:I)
dimnames(Cor.Matrix) <- list(Rows, Cols)
cat("\nIndicator correlations:\n\nFull Sample ( N =",N,"):\n\n")
print(round(Cor.Matrix,3))
Cor.Taxon <- cor(Data.t)
Cor.Comp <- cor(Data.c)
dimnames(Cor.Taxon) <- list(Rows, Cols)
dimnames(Cor.Comp) <- list(Rows, Cols)
cat("\nTaxon ( n =",Nt,"):\n\n")
print(round(Cor.Taxon,3))
cat("\nComplement ( n =",Nc,"):\n\n")
print(round(Cor.Comp,3))
Cor.Summary <- matrix(nrow = 3, ncol = 2)
dimnames(Cor.Summary) <- list(c("Full Sample","Taxon", "Complement"), c("M", "SD"))
Cor.Summary[1,1] <- mean(Cor.Matrix[lower.tri(Cor.Matrix)])
Cor.Summary[2,1] <- mean(Cor.Taxon[lower.tri(Cor.Taxon)])
Cor.Summary[3,1] <- mean(Cor.Comp[lower.tri(Cor.Comp)])
Cor.Summary[1,2] <- sd(Cor.Matrix[lower.tri(Cor.Matrix)])
Cor.Summary[2,2] <- sd(Cor.Taxon[lower.tri(Cor.Taxon)])
Cor.Summary[3,2] <- sd(Cor.Comp[lower.tri(Cor.Comp)])
cat("\nSummary of indicator correlations:\n\n")
print(round(Cor.Summary,3))
cat("\n\n")
}
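# Usage sketch (illustrative only): Indicator.Dist expects indicators plus a final column coded
# 1 (complement) / 2 (taxon), e.g. the output of CreateData("Tax", ...) above.
if (FALSE)
{
set.seed(1)
Sample <- CreateData("Tax", N = 600, k = 4, P = .25, d = 2)
Indicator.Dist(Sample) # prints validities, distributions, and correlation summaries
}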
###########################################################################################################
# Skewness and kurtosis programs for R
# Written by John Ruscio
###########################################################################################################
Skew <- function(x)
{
N <- length(x)
m1 <- mean(x)
m2 <- sum((x - m1) ^ 2) / N
m3 <- sum((x - m1) ^ 3) / N
g1 <- m3 / (m2 ^ (3/2))
G1 <- (sqrt(N * (N - 1)) / (N - 2)) * g1
return(G1)
}
Kurtosis <- function(x)
{
N <- length(x)
m1 <- mean(x)
m2 <- sum((x - m1) ^ 2) / N
m4 <- sum((x - m1) ^ 4) / N
g2 <- (m4 / (m2 ^ 2)) - 3
G2 <- ((N - 1) / ((N - 2) * (N - 3))) * ((N + 1) * g2 + 6)
return(G2)
}
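# Sanity check (illustrative): G1 and G2 are the bias-adjusted statistics most packages report;
# both should be near 0 for a large normal sample and Skew should be clearly positive for a lognormal one.
if (FALSE)
{
set.seed(1)
z <- rnorm(10000)
Skew(z) # approximately 0
Kurtosis(z) # approximately 0 (excess kurtosis)
Skew(exp(z)) # strongly positive
}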
###########################################################################################################
#
# P.Classify program to implement the base-rate classification technique
#
# Written by John Ruscio
# Last modified November 13, 2011
#
###########################################################################################################
P.Classify <- function(x, P.est, cols = 0)
{
N <- dim(x)[1]
if (cols[1] == 0) cols <- 1:dim(x)[2]
s <- apply(x[,cols], 1, sum)
y <- cbind(x, rep(0,N), 1:N, s)
N.Tax <- round(P.est * N)
N.Comp <- N - N.Tax
Classes <- c(rep(1,N.Comp), rep(2,N.Tax))
last.col <- dim(y)[2]
y <- y[sort.list(y[,last.col]),]
y[,last.col - 2] <- Classes
if (max(y[(y[,last.col - 2] == 1), last.col]) == min(y[(y[,last.col - 2] == 2), last.col]))
{
tied.score <- max(y[(y[,last.col - 2] == 1), last.col])
tied.cases <- (y[,last.col] == tied.score)
n.tied1 <- sum((y[,last.col - 2] == 1) & (tied.cases))
n.tied2 <- sum((y[,last.col - 2] == 2) & (tied.cases))
tied.min <- tied.score == min(y[,last.col])
tied.max <- tied.score == max(y[,last.col])
if (n.tied1 > n.tied2)
{
if (!tied.max) y[tied.cases, last.col - 2] <- 1
else y[tied.cases, last.col - 2] <- 2
}
else
{
if (!tied.min) y[tied.cases, last.col - 2] <- 2
else y[tied.cases, last.col - 2] <- 1
}
}
y <- y[sort.list(y[,last.col - 1]),]
return(cbind(x, y[,last.col - 2]))
}
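# Usage sketch (illustrative; P.est and the data are assumptions): the highest-scoring 25% of cases,
# by total score across all indicator columns, are assigned to the taxon (code 2).
if (FALSE)
{
set.seed(1)
X <- matrix(rnorm(200 * 4), ncol = 4)
Classified <- P.Classify(X, P.est = .25)
table(Classified[,5]) # 150 complement (1) and 50 taxon (2) cases
}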
###########################################################################################################
#
# Remove.Missing program to remove missing data (listwise deletion)
#
# Written by John Ruscio
# Last modified May 12, 2010
#
###########################################################################################################
Remove.Missing <- function(x)
{
N <- dim(x)[1]
k <- dim(x)[2]
complete <- rep(T, N)
for (i in 1:N)
for (j in 1:k)
if (is.na(x[i,j])) complete[i] <- F
x.k <- x[complete,]
n <- dim(x.k)[1]
if (n < N)
cat("\n*** ",n," out of ",N," cases (",round(100*n/N,2),"%) retained for analysis; ",
N-n," cases (",round(100*(N-n)/N,2),"%) contained missing data.\n", sep = "")
return(x.k)
}
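# Usage sketch (illustrative): one incomplete row is dropped and a retention note is printed.
if (FALSE)
{
X <- matrix(1:12, ncol = 3)
X[2,3] <- NA
Remove.Missing(X) # returns the 3 complete rows
}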