Created
October 16, 2014 16:38
-
-
Save joaovissoci/a202849f6d34e5099002 to your computer and use it in GitHub Desktop.
taxometric_function.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################################################################################################### | |
# | |
# MAXEIG -- performs MAXEIG, but can also be used to perform MAXCOV. | |
# | |
# Written by John Ruscio | |
# Last modified January 9, 2012 | |
# | |
########################################################################################################### | |
# | |
# Preliminary program to read and submit research data to analysis and then, if requested, perform analyses | |
# of comparison data to test fit of averaged curves. | |
# | |
########################################################################################################### | |
MAXEIG <- function(Data.Set, Comp.Data = T, N.Pop = 100000, N.Samples = 100, Supplied.Class = F, | |
Supplied.P = 0, Ind.Triplets = F, Ind.Comp = F, Intervals = F, N.Int = 15, | |
Windows = 25, Overlap = .90, Calc.Cov = F, St.Ind = F, Replications = 0, | |
Gr.Orient = 1, Gr.Rows = 3, Gr.Cols = 2, Gr.Smooth = 0, Gr.Common.Y = F, Gr.Ref = 2, | |
Gr.Avg = F, Gr.Ind = F, Classify = 1, Gr.Bayes = F, Save.Class = 0, File.Output = F, | |
File.Name = "Output.txt", Seed = 1) | |
{ | |
Data.Set <- Remove.Missing(Data.Set) # remove any missing data (listwise deletion) | |
N <- dim(Data.Set)[1] # determine sample size | |
I <- dim(Data.Set)[2] # determine number of indicators; adjust if one column contains class assignments | |
if (Supplied.Class) I <- I - 1 # subtract to allow for class assignment data, if provided | |
if ((version$major == 1) & (version$minor < 9)) library(eda) | |
else library(stats) # load R code for running medians smoother | |
set.seed(Seed) # set random number seed to allow exact replications of results | |
# automatically fix some potential parameter inconsistencies or omissions | |
if (Comp.Data) Gr.Avg <- T # show the averaged curve that will be compared to those of comparison data | |
if ((Comp.Data) & (N.Samples == 1)) N.Samples <- 100 # do not allow a single sample of comparison data | |
if (Ind.Comp) Ind.Triplets <- T # for this input method, triplets must be used | |
if (Ind.Comp & (I < 4)) Ind.Comp <- F # cannot form composites using individual variables | |
if (Calc.Cov) Ind.Triplets <- T # covariances can only be calculated using two outputs | |
if (Ind.Comp) Classify <- 1 # cannot use Bayes' Theorem with composite input indicators | |
if (Ind.Comp | !Ind.Triplets) Gr.Ind <- F # cannot average curves for each indicator | |
if ((Classify == 1) & (Save.Class == 2)) Save.Class <- 1 # cannot return Bayesian prob if not generated | |
if (Gr.Avg) St.Ind <- T # always standardize indicators if graphs will be averaged | |
if (St.Ind) Data.Set[,1:I] <- apply(Data.Set[,1:I], 2, scale) # standardize data if requested | |
# check for tied scores and set number of replications | |
Ties <- F | |
for (i in 1:I) | |
if (length(unique(Data.Set[,I])) < N) Ties <- T | |
if ((Replications == 0) & (Ties)) Replications <- 10 # default is 10 replications if any tied scores | |
if (Replications == 0) Replications <- 1 | |
if ((Replications > 1) & (!Ties)) Replications <- 1 # eliminate unnecessary replications | |
# if output to file is requested, redirect all text output | |
if (File.Output) sink(File.Name) | |
# label supplied data and submit to main program for analysis; class assignments will be returned | |
Research.Results <- MAXEIG.Main(Data.Set, Research.Data = T, Comp.Data, N.Pop, N.Samples, | |
Supplied.P, Supplied.Class, N, I, Ind.Triplets, Ind.Comp, Intervals, | |
N.Int, Windows, Overlap, Calc.Cov, St.Ind, Replications, | |
Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth,Gr.Common.Y, Gr.Ref, Gr.Avg, | |
Gr.Ind, Classify, Save.Class, Gr.Bayes, File.Output, File.Name) | |
if (Comp.Data) | |
{ | |
P.Est.M <- Research.Results[[1]] | |
P.Est.SD <- Research.Results[[2]] | |
Class.Assign <- Research.Results[[3]] | |
MAXEIG.Results <- Research.Results[[4]] | |
} | |
else | |
{ | |
P.Est.M <- Research.Results[[1]] | |
Class.Assign <- Research.Results[[2]] | |
} | |
# if comparison data requested, then generate and submit samples for analysis | |
if (Comp.Data) | |
{ | |
# create matrices to store results for each comparison data set | |
if (Intervals) N.Subsamples <- N.Int | |
else N.Subsamples <- Windows | |
Tax.Y.Values <- matrix(nrow = N.Samples, ncol = N.Subsamples) | |
Dim.Y.Values <- matrix(nrow = N.Samples, ncol = N.Subsamples) | |
if (Ind.Comp) N.Curves <- (I * (I - 1)) / 2 | |
else if (Ind.Triplets) N.Curves <- (I * (I - 1) * (I - 2)) / 2 | |
else N.Curves <- I | |
Tax.P.Ests <- matrix(nrow = N.Samples, ncol = N.Curves) | |
Dim.P.Ests <- matrix(nrow = N.Samples, ncol = N.Curves) | |
# generate and analyze categorical data | |
set.seed(Seed) | |
if (!Supplied.Class) # if class assignments were not provided, generate using base rate | |
{ | |
Data.Set <- cbind(Data.Set, rep(0, N)) | |
if (Supplied.P == 0) P <- P.Est.M # use estimated P in research data if no P was provided | |
else P <- Supplied.P | |
if ((P.Est.M * N) < 20) P.Est.M <- 20 / N | |
if ((N - (P.Est.M * N)) < 20) P.Est.M <- N - (20 / N) | |
for (i in 1:I) | |
Data.Set[, (I + 1)] <- Data.Set[, (I + 1)] + Data.Set[,i] | |
Data.Set <- Data.Set[sort.list(Data.Set[,(I + 1)]),] | |
P.Est.Series <- c(rep(2, round(N * P))) | |
Q.Est.Series <- c(rep(1, N - length(P.Est.Series))) | |
Data.Set[,(I + 1)] <- c(Q.Est.Series, P.Est.Series) | |
} | |
Tax.Data <- Data.Set[(Data.Set[,(I + 1)] == 2), 1:I] | |
Com.Data <- Data.Set[(Data.Set[,(I + 1)] == 1), 1:I] | |
P.Data <- dim(Tax.Data)[1] / (dim(Tax.Data)[1] + dim(Com.Data)[1]) | |
cat("\nGenerating and analyzing comparison data\n") | |
Sim.Tax <- GenData(Tax.Data, round(N.Pop * P.Data)) | |
Sim.Com <- GenData(Com.Data, (N.Pop - round(N.Pop * P.Data))) | |
Sim.Pop <- rbind(Sim.Tax, Sim.Com) | |
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))] | |
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I))) | |
cat(" Categorical population generated, RMSR r =",round(RMSR,3),"\n") | |
for (Sims in 1:N.Samples) | |
{ | |
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),] | |
MAXEIG.Tax.Results <- MAXEIG.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop, N.Samples, | |
Supplied.P, Supplied.Class, N, I, Ind.Triplets, Ind.Comp, | |
Intervals, N.Int, Windows, Overlap, Calc.Cov, | |
St.Ind, Replications, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, | |
Gr.Common.Y, Gr.Ref, Gr.Avg, Gr.Ind, Classify, Save.Class, | |
Gr.Bayes, File.Output, File.Name) | |
Tax.Y.Values[Sims,] <- MAXEIG.Tax.Results[[2]] | |
Tax.P.Ests[Sims,] <- MAXEIG.Tax.Results[[3]] | |
} | |
cat(" Analysis of",N.Samples,"samples of categorical comparison data completed\n") | |
# generate and analyze dimensional samples | |
set.seed(Seed) | |
Sim.Pop <- GenData(Data.Set[, 1:I], N.Pop) | |
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))] | |
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I))) | |
cat("\n Dimensional population generated, RMSR r =",round(RMSR,3),"\n") | |
for (Sims in 1:N.Samples) | |
{ | |
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),] | |
MAXEIG.Dim.Results <- MAXEIG.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop, N.Samples, | |
Supplied.P, Supplied.Class, N, I, Ind.Triplets, Ind.Comp, | |
Intervals, N.Int, Windows, Overlap, Calc.Cov, | |
St.Ind, Replications, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, | |
Gr.Common.Y, Gr.Ref, Gr.Avg, Gr.Ind, Classify, Save.Class, | |
Gr.Bayes, File.Output, File.Name) | |
Dim.Y.Values[Sims,] <- MAXEIG.Dim.Results[[2]] | |
Dim.P.Ests[Sims,] <- MAXEIG.Dim.Results[[3]] | |
} | |
cat(" Analysis of",N.Samples,"samples of dimensional comparison data completed\n") | |
# summarize base rate estimates for comparison data | |
P.Est.Summary <- matrix(nrow = (N.Samples + 2), ncol = 4) | |
for (Sims in 1:N.Samples) | |
{ | |
P.Est.Summary[Sims,1] <- mean(Tax.P.Ests[Sims,], na.rm = T) | |
P.Est.Summary[Sims,2] <- sd(Tax.P.Ests[Sims,], na.rm = T) | |
P.Est.Summary[Sims,3] <- mean(Dim.P.Ests[Sims,], na.rm = T) | |
P.Est.Summary[Sims,4] <- sd(Dim.P.Ests[Sims,], na.rm = T) | |
} | |
for (i in 1:4) | |
{ | |
P.Est.Summary[Sims + 1,i] <- mean(P.Est.Summary[1:Sims,i]) | |
P.Est.Summary[Sims + 2,i] <- sd(P.Est.Summary[1:Sims,i]) | |
} | |
Row.Labels <- c(1:Sims,paste("M for",N.Samples,"samples"),paste("SD for",N.Samples,"samples")) | |
Col.Labels <- c("Categorical M","SD","Dimensional M","SD") | |
dimnames(P.Est.Summary) <- list(Row.Labels,Col.Labels) | |
cat("\n\nBase rate estimates for comparison data:\n\n") | |
if (N.Samples == 1) | |
{ | |
cat(" Categorical sample: M =",round(P.Est.Summary[1,1],4),", SD =", | |
round(P.Est.Summary[1,2],4),"\n") | |
cat(" Dimensional sample: M =",round(P.Est.Summary[1,3],4),", SD =", | |
round(P.Est.Summary[1,4],4),"\n") | |
} | |
else print(round(P.Est.Summary[((N.Samples+1):(N.Samples+2)),],4)) | |
# plot sampling distributions of Ms/SDs of base rate estimates | |
if (N.Samples > 1) | |
{ | |
dev.new(width=7, height=3.5) | |
par(mfrow=c(1,2), omi=c(.25,.25,.25,.25), cex=.75) | |
Y.M.Max <- max(c(density(P.Est.Summary[1:Sims,1])$y,density(P.Est.Summary[1:Sims,3])$y)) * 1.25 | |
Y.SD.Max <- max(c(density(P.Est.Summary[1:Sims,2])$y,density(P.Est.Summary[1:Sims,4])$y)) * 1.25 | |
plot(density(P.Est.Summary[1:Sims,1], adjust = .75), xlim = c(0,1), ylim = c(0, Y.M.Max), | |
xlab = "Ms of Base Rate Estimates", ylab = "", main = "", lwd = 2, axes = F) | |
axis(1) | |
lines(density(P.Est.Summary[1:Sims,3], adjust = .75), lwd = 2, lty = 2) | |
abline(v = P.Est.M, lwd = 2, lty = 3) | |
plot(density(P.Est.Summary[1:Sims,2], adjust = .75), xlim = c(0,.5), ylim = c(0, Y.SD.Max), | |
xlab = "SDs of Base Rate Estimates", ylab = "", main = "", lwd = 2, axes = F) | |
axis(1) | |
lines(density(P.Est.Summary[1:Sims,4], adjust = .75), lwd = 2, lty = 2) | |
abline(v = P.Est.SD, lwd = 2, lty = 3) | |
mtext("BASE RATE ESTIMATES FOR COMPARISON DATA",outer=T) | |
} | |
# plot averaged curves for comparison data | |
X.Values <- MAXEIG.Dim.Results[[1]] | |
X.Range <- c(min(X.Values), max(X.Values)) | |
Tax.Y <- vector("numeric", length(X.Values)) | |
for (i in 1:length(Tax.Y)) | |
Tax.Y[i] <- mean(Tax.Y.Values[,i],na.rm=T) | |
Dim.Y <- vector("numeric", length(X.Values)) | |
for (i in 1:length(Dim.Y)) | |
Dim.Y[i] <- mean(Dim.Y.Values[,i],na.rm=T) | |
Y.Values <- c(MAXEIG.Results,Tax.Y.Values,Dim.Y.Values) | |
Y.Range <- c(min(0, min(Y.Values)), max(Y.Values) * 1.10) | |
# label the x axis in accord with input method and indicator | |
if (Intervals) X.Label <- paste(N.Int,"Intervals") | |
else X.Label <- paste(Windows,"Windows") | |
if (Ind.Comp) | |
if (Intervals) X.Label <- paste(N.Int,"Intervals") | |
else X.Label <- paste(Windows,"Windows") | |
# label the y axis in accord with calculation type and output indicators | |
if (Calc.Cov) Y.Label <- "Covariance" | |
else Y.Label <- "Eigenvalue" | |
# 2-panel plot for comparison data | |
Tax.Sum <- Dim.Sum <- matrix(0, nrow = 6, ncol = length(X.Values)) | |
for (i in 1:length(X.Values)) | |
{ | |
Tax.Sum[,i] <- summary(Tax.Y.Values[,i]) | |
Dim.Sum[,i] <- summary(Dim.Y.Values[,i]) | |
} | |
dev.new(width=7, height=3.5) | |
par(mfrow=c(1,2), omi=c(.25,.25,.25,.25), cex=.75) | |
plot(X.Values, MAXEIG.Results, type = "o", pch = 16, lwd = 2, ylim = Y.Range, | |
xlim = X.Range, ylab = Y.Label, xlab = X.Label, main = "Categorical Comparison Data") | |
for (i in 1:(length(X.Values)-1)) | |
polygon(x = c(X.Values[i], X.Values[i], X.Values[i+1], X.Values[i+1]), | |
y = c(Tax.Sum[2,i], Tax.Sum[5,i], Tax.Sum[5,i+1], Tax.Sum[2,i+1]), col = 8, lty = 0) | |
lines(X.Values, Tax.Sum[1,]) | |
lines(X.Values, Tax.Sum[6,]) | |
points(X.Values, MAXEIG.Results, type = "o", pch = 16) | |
lines(X.Values, MAXEIG.Results, lwd = 2) | |
if ((Gr.Ref == 1) | ((Gr.Ref == 2) & (Y.Range[1] < 0))) abline(h = 0) | |
plot(X.Values, MAXEIG.Results, type = "o", pch = 16, lwd = 2, ylim = Y.Range, | |
xlim = X.Range, ylab = Y.Label, xlab = X.Label, main = "Dimensional Comparison Data") | |
for (i in 1:(length(X.Values)-1)) | |
polygon(x = c(X.Values[i], X.Values[i], X.Values[i+1], X.Values[i+1]), | |
y = c(Dim.Sum[2,i], Dim.Sum[5,i], Dim.Sum[5,i+1], Dim.Sum[2,i+1]), col = 8, lty = 0) | |
lines(X.Values, Dim.Sum[1,]) | |
lines(X.Values, Dim.Sum[6,]) | |
points(X.Values, MAXEIG.Results, type = "o", pch = 16) | |
lines(X.Values, MAXEIG.Results, lwd = 2) | |
if ((Gr.Ref == 1) | ((Gr.Ref == 2) & (Y.Range[1] < 0))) abline(h = 0) | |
# calculate and print comparison curve fit index (CCFI) | |
Fit.RMSR.t <- sqrt(sum((Tax.Y - MAXEIG.Results)^2)/N.Subsamples) | |
Fit.RMSR.d <- sqrt(sum((Dim.Y - MAXEIG.Results)^2)/N.Subsamples) | |
CCFI <- Fit.RMSR.d / (Fit.RMSR.d + Fit.RMSR.t) | |
cat("\n\nComparison curve fit index (CCFI) =",round(Fit.RMSR.d,4),"/ (",round(Fit.RMSR.d,4), | |
"+",round(Fit.RMSR.t,4),") =",round(CCFI,3),"\n") | |
cat(" Note: CCFI values can range from 0 (dimensional) to 1 (categorical). The more a CCFI\n") | |
cat(" value deviates from .50, the stronger the result. When .40 < CCFI < .60,\n") | |
cat(" this should be interpreted with caution.\n\n") | |
} | |
if (File.Output) sink() | |
if (Save.Class > 0) return(Class.Assign) | |
} | |
########################################################################################################### | |
# | |
# Main program shows the overall structure through calls to four subsidiary programs | |
# | |
########################################################################################################### | |
MAXEIG.Main <- function(Data, Research.Data, Comp.Data, N.Pop, N.Samples, Supplied.P, Supplied.Class, N, I, | |
Ind.Triplets, Ind.Comp, Intervals, N.Int, Windows, Overlap, Calc.Cov, | |
St.Ind, Replications, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y, Gr.Ref, | |
Gr.Avg, Gr.Ind, Classify, Save.Class, Gr.Bayes, File.Output, File.Name) | |
{ | |
# set up matrix to store input/output configurations for each curve to be generated | |
Analysis.Structure <- MAXEIG.Structure(Ind.Triplets, Ind.Comp, I, Intervals) | |
Curve.Info <- Analysis.Structure[[1]] | |
Structure <- Analysis.Structure[[2]] | |
Input.Type <- Analysis.Structure[[3]] | |
# carry out analyses by calling calculation procedure once per curve | |
Curves <- list(1) | |
for (Curve.Num in 1:length(Curve.Info[,1])) | |
Curves[[Curve.Num]] <- MAXEIG.Calculate(Data[,1:I], N, I, Curve.Info, Curve.Num, Structure, Input.Type, | |
Ind.Triplets, Ind.Comp, Intervals, N.Int, Windows, Overlap, Calc.Cov, Replications) | |
if (!Research.Data | Comp.Data) | |
{ | |
# calculate averaged curve | |
X.Values <- rep(0,length(Curves[[1]][,1])) | |
for (i in 1:length(Curve.Info[,1])) | |
X.Values <- X.Values + Curves[[i]][,1] | |
X.Values <- X.Values / length(Curve.Info[,1]) | |
Y.Values <- rep(0,length(X.Values)) | |
for (i in 1:length(Curve.Info[,1])) | |
Y.Values <- Y.Values + Curves[[i]][,3] | |
Y.Values <- Y.Values / length(Curve.Info[,1]) | |
# calculate base rate estimates | |
Data.Matrix <- cbind(Data[,1:I], 1:N, rep(0,N), rep(0,N)) | |
N.Curves <- length(Curve.Info[,1]) | |
P.Estimates <- vector("numeric",N.Curves) # store base rate estimates | |
for (i in 1:N.Curves) | |
{ | |
if (Structure < 3) Input <- Curve.Info[i,2] | |
else | |
{ | |
Data.Matrix[,(I + 2)] <- rep(0, N) | |
for (j in 1:I) | |
Data.Matrix[,(I + 2)] <- Data.Matrix[,(I + 2)] + Data.Matrix[,j] | |
Input <- I + 2 | |
} | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,Input]),] | |
# determine curve data to use (smoothed or unsmoothed) | |
if (Gr.Smooth == 0) Curve.Data <- cbind(Curves[[i]][,1], Curves[[i]][,3], Curves[[i]][,2]) | |
if (Gr.Smooth == 1) | |
{ | |
Lowess.Y <- lowess(Curves[[i]][,1], Curves[[i]][,3])$y | |
for (j in 1:(length(Curves[[i]][,1])-1)) | |
if (Curves[[i]][j,1] == Curves[[i]][(j + 1),1]) | |
Lowess.Y <- c(Lowess.Y[1:j],Lowess.Y[j],Lowess.Y[j+1:length(Lowess.Y)]) | |
Lowess.Y <- Lowess.Y[1:length(Curves[[i]][,1])] | |
Curve.Data <- cbind(Curves[[i]][,1], Lowess.Y, Curves[[i]][,2]) | |
} | |
if (Gr.Smooth == 2) Curve.Data <- cbind(Curves[[i]][,1], smooth(Curves[[i]][,3]), Curves[[i]][,2]) | |
# estimate hitmax, P | |
Hitmax.Value <- max(Curve.Data[,2]) # y value at peak | |
Hitmax.Locs <- (1:(length(Curve.Data[,1])))[Curve.Data[,2] == Hitmax.Value] # peak subsample(s) | |
Hitmax.Loc <- round(median(Hitmax.Locs)) | |
Hitmax.Case <- trunc(mean(Curves[[i]][Hitmax.Loc,4:5])) # case at midpoint of peak subsample(s) | |
Ints <- length(Curve.Data[,1]) | |
p <- vector("numeric",Ints) | |
n <- vector("numeric",Ints) | |
K <- 4 * Hitmax.Value | |
for (j in 1:Ints) | |
{ | |
if ((K ^ 2 - 4 * K * Curve.Data[j,2]) >= 0) | |
quad1 <- (K + sqrt(K ^ 2 - 4 * K * Curve.Data[j,2])) / ( 2 * K) | |
else quad1 <- 1 | |
if ((K ^ 2 - 4 * K * Curve.Data[j,2]) >= 0) | |
quad2 <- (K - sqrt(K ^ 2 - 4 * K * Curve.Data[j,2])) / ( 2 * K) | |
else quad2 <- 0 | |
if (j < Hitmax.Loc) p[j] <- min(1,quad2) | |
if (j == Hitmax.Loc) p[j] <- .5 | |
if (j > Hitmax.Loc) p[j] <- max(quad1,0) | |
if ((Curve.Data[j,2] < 0) & (j < Hitmax.Loc)) p[j] <- 0 | |
if ((Curve.Data[j,2] < 0) & (j > Hitmax.Loc)) p[j] <- 1 | |
n[j] <- Curve.Data[j,3] | |
} | |
P.Estimates[i] <- sum(p * n) / sum(n) | |
} | |
} | |
# plot the results | |
if (Research.Data) P.Avg <- MAXEIG.Plot(I, Curve.Info, Curves, Structure, Input.Type, Ind.Triplets, | |
Ind.Comp, Intervals, Calc.Cov, Gr.Orient, Gr.Rows, Gr.Cols, | |
Gr.Smooth, Gr.Common.Y, Gr.Ref, Gr.Avg, Gr.Ind) | |
# summarize analytic specifications | |
if (Research.Data) MAXEIG.Summarize(Data[,1:I], N, I, Curve.Info, Curves, Structure, Input.Type, Ind.Triplets, | |
Ind.Comp, Intervals, N.Int, Windows, Overlap, Calc.Cov, Replications, | |
Gr.Smooth, Gr.Avg, P.Avg, Classify, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Bayes, Comp.Data, | |
N.Pop, N.Samples, Supplied.P, Supplied.Class) | |
# generate and report estimates of latent parameters and, if requested, classify cases | |
if (Research.Data) Class.Assign <- MAXEIG.P.Est(Data, N, I, Curve.Info, Curves, Structure, Input.Type, | |
Ind.Triplets, Ind.Comp, Intervals, N.Int, Windows, Overlap, | |
Calc.Cov, Replications, Gr.Smooth, Gr.Avg, P.Avg, Classify, Gr.Orient, Gr.Rows, | |
Gr.Cols, Gr.Bayes, Save.Class, Comp.Data, Supplied.P, Supplied.Class) | |
if (Research.Data & Comp.Data) return(list(Class.Assign[[2]],Class.Assign[[3]],Class.Assign[[1]],Y.Values)) | |
if (Research.Data & !Comp.Data) return(list(Class.Assign[[2]],Class.Assign[[1]])) | |
if (!Research.Data) return(list(X.Values,Y.Values,P.Estimates)) | |
} | |
########################################################################################################### | |
# | |
# Program to determine structure of analysis and set up matrix to contain relevant curve information | |
# | |
########################################################################################################### | |
MAXEIG.Structure <- function(Ind.Triplets, Ind.Comp, I, Intervals) | |
{ | |
# determine structure of the requested analysis: | |
# 1 = each indicator serves as input once, with all others as output | |
# 2 = triplets of indicators serve in all (input, output, output) combinations | |
# 3 = triplets of indicators, with each possible pair removed as outputs, input = sum of others | |
if (Ind.Comp) Structure <- 3 | |
else if (Ind.Triplets) Structure <- 2 | |
else Structure <- 1 | |
# based on structure, create matrix listing input and/or output indicators to use for each curve | |
if (Structure == 1) | |
{ | |
N.Curves <- I | |
Curve.Info <- matrix(nrow = N.Curves, ncol = 2) | |
Curve.Info[,1] <- 1:N.Curves | |
Curve.Info[,2] <- 1:N.Curves | |
} | |
if (Structure == 2) | |
{ | |
if (I == 3) N.Curves <- 3 | |
else N.Curves <- I * (prod(seq(1:(I - 1)))) / ((prod(seq(1:(I - 3)))) * 2) | |
Curve.Info <- matrix(nrow = N.Curves, ncol = 4) | |
Curve.Info[,1] <- 1:N.Curves | |
Counter <- 1 | |
for (Triplet.A in 1:I) | |
for (Triplet.B in 1:I) | |
for (Triplet.C in 1:I) | |
if ((Triplet.A != Triplet.B) & (Triplet.A != Triplet.C) & (Triplet.B < Triplet.C)) | |
{ | |
Curve.Info[Counter,2] <- Triplet.A | |
Curve.Info[Counter,3] <- Triplet.B | |
Curve.Info[Counter,4] <- Triplet.C | |
Counter <- Counter + 1 | |
} | |
} | |
if (Structure == 3) | |
{ | |
N.Curves <- (I * (I - 1)) / 2 | |
Curve.Info <- matrix(nrow = N.Curves, ncol = 3) | |
Curve.Info[,1] <- 1:N.Curves | |
Outputs <- 1 | |
for (Output.A in 1:(I - 1)) | |
for (Output.B in (Output.A + 1):I) | |
{ | |
Curve.Info[Outputs,2] <- Output.A | |
Curve.Info[Outputs,3] <- Output.B | |
Outputs <- Outputs + 1 | |
} | |
} | |
# determine type of input divisions: | |
# 1 = equal-sized intervals | |
# 2 = SD intervals (no longer allowed) | |
# 3 = overlapping windows | |
if (Intervals) Input.Type <- 1 | |
else Input.Type <- 3 | |
return(list(Curve.Info, Structure, Input.Type)) | |
} | |
########################################################################################################### | |
# | |
# Program to subdivide the input and calculate eigenvalues (or, if requested, covariances) | |
# | |
########################################################################################################### | |
MAXEIG.Calculate <- function(Data, N, I, Curve.Info, Curve.Num, Structure, Input.Type, Ind.Triplets, | |
Ind.Comp, Intervals, N.Int, Windows, Overlap, Calc.Cov, | |
Replications) | |
{ | |
# make room in data for summed indicator plus random numbers for resorting at each replication | |
Data.Matrix <- cbind(Data, rep(0,N), rnorm(N, mean = 0, sd = 1)) | |
# determine input column, creating sum of other variables if necessary, and sort by input | |
if (Structure == 3) | |
{ | |
Output1 <- Curve.Info[Curve.Num,2] | |
Output2 <- Curve.Info[Curve.Num,3] | |
Input <- I + 1 | |
for (i in 1:I) | |
Data.Matrix[,(I + 1)] <- Data.Matrix[,(I + 1)] + Data.Matrix[,i] | |
Data.Matrix[,(I + 1)] <- Data.Matrix[,(I + 1)] - Data.Matrix[,Output1] - Data.Matrix[,Output2] | |
} | |
else Input <- Curve.Info[Curve.Num, 2] | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,Input]),] | |
# divide input as appropriate for a given input method | |
# generate matrix containing case numbers that form endpoints of each subsample | |
# generate appropriate labels for x axis of graph | |
if (Input.Type == 1) # create equal-sized intervals; label = mean value within each interval | |
{ | |
Cuts <- matrix(nrow = 2, ncol = N.Int) | |
temp <- seq(from = 1, to = N, by = round(N / N.Int)) | |
Cuts[1,] <- temp[1:N.Int] | |
Cuts[2,] <- c((temp[2:N.Int] - 1),N) | |
X.Labels <- vector("numeric", N.Int) | |
for (i in 1:N.Int) | |
X.Labels[i] <- round(mean(Data.Matrix[Cuts[1,i]:Cuts[2,i],Input]),2) | |
} | |
if (Input.Type == 3) # create overlapping windows; label = mean value within each window | |
{ | |
N.Win <- N / (Windows * (1 - Overlap) + Overlap) # determine total n per window | |
N.Overlap <- N.Win * Overlap # determine how many cases overlap with adjacent windows | |
Cuts <- matrix(nrow = 2, ncol = Windows) | |
Cuts[1,] <- round(seq(from = 1, by = (N.Win - N.Overlap), length = Windows)) | |
Cuts[2,] <- round(seq(from = N.Win, to = N, by = (N.Win - N.Overlap))) | |
X.Labels <- vector("numeric", Windows) | |
for (i in 1:Windows) | |
X.Labels[i] <- round(mean(Data.Matrix[Cuts[1,i]:Cuts[2,i],Input]),2) | |
} | |
# create matrix to store curve results | |
N.Subsamples <- length(X.Labels) | |
Curve.Results <- matrix(rep(0, N.Subsamples * 5), nrow = N.Subsamples, ncol = 5) | |
# having set up subsamples along the input and created a storage structure, perform the calculations | |
# replications loop; results are cumulated, subsequently divided by number of replications to yield average | |
for (Rep in 1:Replications) | |
{ | |
# shuffle cases, then sort by input | |
Data.Matrix <- Data.Matrix[sample(1:N, N, replace = F),] | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,Input]),] | |
# begin calculations, storing results in either matrix or array as follows: | |
# 1st column = labels for x axis | |
# 2nd column = n per interval | |
# 3rd column = eigenvalues (or covariances) | |
# 4th, 5th columns = endpoints of each interval (case numbers) | |
# calculate values for one series of subsamples | |
Curve.Results[,4] <- Cuts[1,] | |
Curve.Results[,5] <- Cuts[2,] | |
Curve.Results[,1] <- X.Labels | |
if (Calc.Cov) # calculate covariances | |
{ | |
if (Structure == 2) Outputs <- c(Curve.Info[Curve.Num,3], Curve.Info[Curve.Num,4]) | |
if (Structure == 3) Outputs <- c(Curve.Info[Curve.Num,2], Curve.Info[Curve.Num,3]) | |
for (i in 1:N.Subsamples) | |
{ | |
Curve.Results[i,2] <- Cuts[2,i] - Cuts[1,i] + 1 | |
Curve.Results[i,3] <- Curve.Results[i,3] + var(Data.Matrix[Cuts[1,i]:Cuts[2,i],Outputs])[2] | |
} | |
} | |
else # calculate eigenvalues | |
{ | |
if (Structure == 1) | |
{ | |
Outputs <- c(1:I) | |
Outputs <- Outputs[Outputs != Input] | |
} | |
if (Structure == 2) Outputs <- c(Curve.Info[Curve.Num,3], Curve.Info[Curve.Num,4]) | |
if (Structure == 3) Outputs <- c(Curve.Info[Curve.Num,2], Curve.Info[Curve.Num,3]) | |
for (i in 1:N.Subsamples) | |
{ | |
Curve.Results[i,2] <- Cuts[2,i] - Cuts[1,i] + 1 | |
Cov.Matrix <- var(Data.Matrix[Cuts[1,i]:Cuts[2,i],Outputs]) | |
Data.No.Diag <- Cov.Matrix - diag(diag(Cov.Matrix)) | |
Curve.Results[i,3] <- Curve.Results[i,3] + eigen(Data.No.Diag)$values[1] | |
} | |
} | |
} | |
# divide cumulated covariances or eigenvalues by number of replications to yield average values | |
Curve.Results[,3] <- Curve.Results[,3] / Replications | |
return(Curve.Results) | |
} | |
########################################################################################################### | |
# | |
# Program to plot and label a panel of curves | |
# | |
########################################################################################################### | |
MAXEIG.Plot <- function(I, Curve.Info, Curves, Structure, Input.Type, Ind.Triplets, Ind.Comp, Intervals, | |
Calc.Cov, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y, | |
Gr.Ref, Gr.Avg, Gr.Ind) | |
{ | |
P.Avg <- 0 | |
# create and label new graph sheet | |
if (Gr.Orient == 1) dev.new(width=8.5, height=11) | |
else dev.new(width=11, height=8.5) | |
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25)) | |
Page <- 1 | |
N.Curves <- length(Curve.Info[,1]) | |
# if applying a common Y scale for all graphs, set the range here (algorithm accentuates peak/flatness) | |
if (Gr.Common.Y) | |
{ | |
Ys <- 0 | |
for (i in 1:N.Curves) | |
Ys <- c(Ys, Curves[[i]][,3]) | |
Y.Min <- min(Ys[2:length(Ys)]) | |
Y.Max <- max(Ys[2:length(Ys)]) | |
Y.M <- mean(Ys[2:length(Ys)]) | |
Y.SD <- sd(Ys[2:length(Ys)]) | |
Y.Range <- c(min(0, Y.Min), max(((Y.Max / Y.SD) * (Y.M / 1.5)), (Y.Max * 1.10))) | |
} | |
# plot each curve | |
for (i in 1:N.Curves) | |
{ | |
if (Structure == 3) Input <- I + 1 | |
else Input <- Curve.Info[i,2] | |
if (Structure == 1) | |
{ | |
Outputs <- c(1:I) | |
Outputs <- Outputs[Outputs != Input] | |
} | |
if (Structure == 2) Outputs <- c(Curve.Info[i,3], Curve.Info[i,4]) | |
if (Structure == 3) Outputs <- c(Curve.Info[i,2], Curve.Info[i,3]) | |
X.Values <- Curves[[i]][,1] | |
Y.Values <- Curves[[i]][,3] | |
X.Range <- c(min(X.Values), max(X.Values)) | |
if (!Gr.Common.Y) | |
Y.Range <- c(min(0, min(Y.Values)), max(((max(Y.Values) / sd(Y.Values)) | |
* (mean(Y.Values) / 1.5)), max(Y.Values) * 1.10)) | |
# label the x axis in accord with input method and indicator | |
if (Input.Type == 1) X.Label <- paste(length(X.Values),"Intervals, Indicator",Input) | |
if (Input.Type == 3) X.Label <- paste(length(X.Values),"Windows, Indicator",Input) | |
if (Ind.Comp) | |
{ | |
if (Input.Type == 1) X.Label <- paste(length(X.Values),"Intervals") | |
if (Input.Type == 3) X.Label <- paste(length(X.Values),"Windows") | |
} | |
# label the y axis in accord with calculation type and output indicators | |
if (Calc.Cov) Y.Label <- paste("Covariance, Ind",Outputs[1],"and",Outputs[2]) | |
else if (Ind.Triplets) Y.Label <- paste("Eigenvalue, Ind",Outputs[1],"and",Outputs[2]) | |
else Y.Label <- "Eigenvalue, All Other Vars" | |
# check to see whether new graph sheet needs to be created | |
if ((i > 1) & ((i - 1) %% (Gr.Rows * Gr.Cols) == 0)) | |
{ | |
if (Calc.Cov) mtext(paste("MAXCOV CURVES, page ",Page,sep=""),outer=T) | |
else mtext(paste("MAXEIG CURVES, page ",Page,sep=""),outer=T) | |
if (Gr.Orient == 1) dev.new(width=8.5, height=11) | |
else dev.new(width=11, height=8.5) | |
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25)) | |
Page <- Page + 1 | |
} | |
# plot the graph, plus smoothed curve and/or reference line at 0 if requested | |
plot(X.Values, Y.Values, type = "p", pch = 16, ylim = Y.Range, xlim = X.Range, | |
ylab = Y.Label, xlab = X.Label) | |
if (Gr.Smooth == 0) # no smoothing, connect "raw" data points | |
lines(X.Values, Y.Values) # solid line for raw data only | |
else | |
lines(X.Values, Y.Values, lty = 3) # dotted line for raw data, solid for smoothed | |
if (Gr.Smooth == 1) # LOWESS smoother | |
lines(lowess(X.Values, Y.Values)) | |
if (Gr.Smooth == 2) # running medians smoother | |
lines(X.Values,smooth(Y.Values)) | |
if ((Gr.Ref == 1) | ((Gr.Ref == 2) & (Y.Range[1] < 0))) abline(h = 0) | |
} | |
if (Calc.Cov) mtext(paste("MAXCOV CURVES, page ",Page,sep=""),outer=T) | |
else mtext(paste("MAXEIG CURVES, page ",Page,sep=""),outer=T) | |
# plot curves averaged for each indicator | |
if (Gr.Ind) | |
{ | |
# create a new graph sheet | |
if (Gr.Orient == 1) dev.new(width=8.5, height=11) | |
else dev.new(width=11, height=8.5) | |
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25)) | |
Page <- 1 | |
# average y values for curves using the same input indicator | |
N.Subsamples <- length(Curves[[1]][,1]) | |
Ind.Graphs <- matrix(nrow = I, ncol = N.Subsamples) | |
N.Each.Ind <- N.Curves / I | |
for (i in 1:I) | |
{ | |
Ind.Curves <- ((i - 1) * N.Each.Ind + 1):(i * N.Each.Ind) | |
Ys <- vector("numeric",N.Subsamples) | |
for (j in 1:N.Subsamples) | |
for (k in Ind.Curves) | |
Ys[j] <- Ys[j] + Curves[[k]][j,3] | |
Ind.Graphs[i,] <- Ys / N.Each.Ind | |
} | |
# plot a curve for each indicator | |
for (i in 1:I) | |
{ | |
Input <- i | |
X.Values <- Curves[[i * N.Each.Ind]][,1] | |
X.Range <- c(min(X.Values), max(X.Values)) | |
Ind.Curves <- ((i - 1) * N.Each.Ind + 1):(i * N.Each.Ind) | |
Y.Values <- Ind.Graphs[i,] | |
for (j in Ind.Curves) | |
Y.Values <- c(Y.Values,Curves[[j]][,3]) | |
Y.Range <- c(min(0, min(Y.Values)), max(((max(Y.Values) / sd(Y.Values)) | |
* (mean(Y.Values) / 1.5)), max(Y.Values) * 1.10)) | |
# label the x axis in accord with input method and indicator | |
if (Input.Type == 1) X.Label <- paste(length(X.Values),"Intervals, Indicator",Input) | |
if (Input.Type == 3) X.Label <- paste(length(X.Values),"Windows, Indicator",Input) | |
# label the y axis in accord with calculation type | |
if (Calc.Cov) Y.Label <- "Covariance" | |
else Y.Label <- "Eigenvalue" | |
# check to see whether new graph sheet needs to be created | |
if ((i > 1) & ((i - 1) %% (Gr.Rows * Gr.Cols) == 0)) | |
{ | |
if (Calc.Cov) mtext(paste("MAXCOV INDICATOR CURVES, page ",Page,sep=""),outer=T) | |
else mtext(paste("MAXEIG INDICATOR CURVES, page ",Page,sep=""),outer=T) | |
if (Gr.Orient == 1) dev.new(width=8.5, height=11) | |
else dev.new(width=11, height=8.5) | |
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25)) | |
Page <- Page + 1 | |
} | |
# plot the averaged graph, with dotted lines for each individual curve | |
plot(X.Values, Ind.Graphs[i,], type = "l", pch = 16, ylim = Y.Range, xlim = X.Range, | |
ylab = Y.Label, xlab = X.Label, lwd = 2) | |
if ((Gr.Ref == 1) | ((Gr.Ref == 2) & (Y.Range[1] < 0))) abline(h = 0) | |
for (j in Ind.Curves) | |
lines(Curves[[j]][,1],Curves[[j]][,3], lty = 3, lwd = 2) | |
} | |
if (Calc.Cov) mtext(paste("MAXCOV INDICATOR CURVES, page ",Page,sep=""),outer=T) | |
else mtext(paste("MAXEIG INDICATOR CURVES, page ",Page,sep=""),outer=T) | |
} | |
if (Gr.Avg) | |
{ | |
# create and label new graph sheet | |
dev.new(width=3.7, height=3.5) | |
par(omi=c(.25,.25,.25,.25), cex = .75) | |
X.Values <- rep(0,length(Curves[[1]][,1])) | |
for (i in 1:length(Curve.Info[,1])) | |
X.Values <- X.Values + Curves[[i]][,1] | |
X.Values <- X.Values / length(Curve.Info[,1]) | |
Y.Values <- rep(0,length(X.Values)) | |
for (i in 1:length(Curve.Info[,1])) | |
Y.Values <- Y.Values + Curves[[i]][,3] | |
Y.Values <- Y.Values / length(Curve.Info[,1]) | |
Hitmax.Value <- max(Y.Values) # y value at peak | |
Hitmax.Locs <- (1:(length(Y.Values)))[Y.Values == Hitmax.Value] # peak subsample(s) | |
Hitmax.Loc <- round(median(Hitmax.Locs)) | |
Hitmax.Case <- trunc(mean(Curves[[1]][Hitmax.Loc,4:5])) # case at midpoint of peak subsample(s) | |
Ints <- length(X.Values) | |
p <- vector("numeric",Ints) | |
n <- vector("numeric",Ints) | |
K <- 4 * Hitmax.Value | |
for (j in 1:Ints) | |
{ | |
if ((K ^ 2 - 4 * K * Y.Values[j]) >= 0) | |
quad1 <- (K + sqrt(K ^ 2 - 4 * K * Y.Values[j])) / ( 2 * K) | |
else quad1 <- 1 | |
if ((K ^ 2 - 4 * K * Y.Values[j]) >= 0) | |
quad2 <- (K - sqrt(K ^ 2 - 4 * K * Y.Values[j])) / ( 2 * K) | |
else quad2 <- 0 | |
if (j < Hitmax.Loc) p[j] <- min(1,quad2) | |
if (j == Hitmax.Loc) p[j] <- .5 | |
if (j > Hitmax.Loc) p[j] <- max(quad1,0) | |
} | |
P.Avg <- mean(p) | |
X.Range <- c(min(X.Values), max(X.Values)) | |
Y.Values.Avg <- Y.Values | |
for (i in 1:N.Curves) | |
Y.Values <- c(Y.Values,Curves[[i]][,3]) | |
Y.Range <- c(min(0, min(Y.Values)), max(((max(Y.Values) / sd(Y.Values)) | |
* (mean(Y.Values) / 1.5)), max(Y.Values) * 1.10)) | |
# label the x axis in accord with input method and indicator | |
if (Input.Type == 1) X.Label <- paste(length(X.Values),"Intervals") | |
if (Input.Type == 3) X.Label <- paste(length(X.Values),"Windows") | |
if (Ind.Comp) | |
{ | |
if (Input.Type == 1) X.Label <- paste(length(X.Values),"Intervals") | |
if (Input.Type == 3) X.Label <- paste(length(X.Values),"Windows") | |
} | |
# label the y axis in accord with calculation type and output indicators | |
if (Calc.Cov) Y.Label <- "Covariance" | |
else Y.Label <- "Eigenvalue" | |
# plot the averaged graph, with dotted lines for each individual curve | |
if (Calc.Cov) Title <- "Averaged MAXCOV Curve" | |
else Title <- "Averaged MAXEIG Curve" | |
plot(X.Values, Y.Values.Avg, type = "l", pch = 16, ylim = Y.Range, xlim = X.Range, | |
ylab = Y.Label, xlab = X.Label, main = Title, lwd = 2) | |
if ((Gr.Ref == 1) | ((Gr.Ref == 2) & (Y.Range[1] < 0))) abline(h = 0) | |
for (i in 1:N.Curves) | |
lines(Curves[[i]][,1],Curves[[i]][,3], lty = 3, lwd = 2) | |
} | |
if (Gr.Avg) return(P.Avg) | |
} | |
########################################################################################################### | |
# | |
# Program to produce text summary (data parameters, analytic overview) | |
# | |
########################################################################################################### | |
MAXEIG.Summarize <- function(Data, N, I, Curve.Info, Curves, Structure, Input.Type, Ind.Triplets, Ind.Comp, | |
Intervals, N.Int, Windows, Overlap, Calc.Cov, Replications, Gr.Smooth, Gr.Avg, | |
P.Avg, Classify, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Bayes, Comp.Data, N.Pop, N.Samples, | |
Supplied.P, Supplied.Class) | |
{ | |
# report relevant analytic information and program specifications | |
if (Calc.Cov) cat("\n\nSUMMARY OF MAXCOV ANALYTIC SPECIFICATIONS\n") | |
else cat("\n\nSUMMARY OF MAXEIG ANALYTIC SPECIFICATIONS\n") | |
cat("\nSample size: ",N,"\n") | |
cat("Number of indicator variables: ",I,"\n") | |
cat("Replications: ",Replications,"\n") | |
if (Intervals) Subsample.Label <- paste(N.Int,"intervals") | |
if (!Intervals) Subsample.Label <- paste(Windows,"windows with",Overlap,"overlap") | |
cat("Subsamples: ",Subsample.Label,"\n") | |
if (!Intervals) cat("n per window at",Windows,"windows: ", round(N / | |
(Windows * (1 - Overlap) + Overlap)),"\n") | |
if (Input.Type == 1) cat("n per interval: ",round(N / N.Int),"\n") | |
if (Ind.Triplets & !Ind.Comp) Ind.Label <- | |
"Variables serve in all possible (input, output, output) triplets" | |
if (Ind.Triplets & Ind.Comp) Ind.Label <- | |
paste("Two variables at a time serve as outputs, input = sum of remaining",(I-2),"variables") | |
if (!Ind.Triplets) Ind.Label <- "Each variable serves once as input, with all other variables as outputs" | |
cat("Indicators: ",Ind.Label,"\n") | |
cat("Total number of curves: ",length(Curve.Info[,1]),"\n") | |
if (Gr.Smooth == 0) cat("Y values smoothed for graphing and estimation: No\n") | |
if (Gr.Smooth == 1) cat("Y values smoothed for graphing and estimation: Yes, LOWESS method\n") | |
if (Gr.Smooth == 2) cat("Y values smoothed for graphing and estimation: Yes, running medians\n") | |
if (Calc.Cov) cat("Base rate estimation: General covariance mixture theorem\n") | |
else cat("Base rate estimation: Adapted general covariance mixture theorem\n") | |
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 1)) | |
cat("Classification of cases: Cut total score at estimated base rate\n") | |
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 2)) | |
cat("Classification of cases: Bayes' Theorem\n") | |
if (Supplied.P > 0) cat("Classification of cases: Cut total score at supplied base rate\n") | |
if (Supplied.Class) cat("Classification of cases: Supplied with data\n") | |
if (Comp.Data) | |
{ | |
cat("Size of finite populations of comparison data: ",N.Pop,"\n") | |
cat("Number of samples of comparison data drawn from each population: ",N.Samples,"\n") | |
} | |
cat("\n") | |
} | |
########################################################################################################### | |
# | |
# Function to generate and report estimates of the taxon base rate and other latent parameters | |
# | |
########################################################################################################### | |
MAXEIG.P.Est <- function(Data, N, I, Curve.Info, Curves, Structure, Input.Type, Ind.Triplets, Ind.Comp, | |
Intervals, N.Int, Windows, Overlap, Calc.Cov, Replications, Gr.Smooth, Gr.Avg, | |
P.Avg, Classify, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Bayes, Save.Class, Comp.Data, Supplied.P, | |
Supplied.Class) | |
{ | |
if (Calc.Cov) cat("\n\nSUMMARY OF MAXCOV PARAMETER ESTIMATES\n") | |
else cat("\n\nSUMMARY OF MAXEIG PARAMETER ESTIMATES\n") | |
# estimate input hitmax and base rate estimate for each curve | |
Data.Matrix <- cbind(Data[,1:I], 1:N, rep(0,N), rep(0,N)) # data plus case identifiers (for sorting into original | |
# form), summed indicators, class assignments | |
N.Curves <- length(Curve.Info[,1]) | |
Curve.Est <- matrix(nrow = N.Curves, ncol = 2) # set up matrix to hold hitmax and base rate estimates | |
# Estimate P via general covariance mixture theorem (whether doing MAXCOV or MAXEIG) | |
VP.FP <- matrix(nrow = N.Curves, ncol = 2) # set up matrices to hold VP, FP estimates | |
for (i in 1:N.Curves) | |
{ | |
if (Structure < 3) Input <- Curve.Info[i,2] | |
else | |
{ | |
Data.Matrix[,(I + 2)] <- rep(0, N) | |
for (j in 1:I) | |
Data.Matrix[,(I + 2)] <- Data.Matrix[,(I + 2)] + Data.Matrix[,j] | |
Input <- I + 2 | |
} | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,Input]),] | |
# determine curve data to use (smoothed or unsmoothed) | |
if (Gr.Smooth == 0) Curve.Data <- cbind(Curves[[i]][,1], Curves[[i]][,3], Curves[[i]][,2]) | |
if (Gr.Smooth == 1) | |
{ | |
Lowess.Y <- lowess(Curves[[i]][,1], Curves[[i]][,3])$y | |
for (j in 1:(length(Curves[[i]][,1])-1)) | |
if (Curves[[i]][j,1] == Curves[[i]][(j + 1),1]) | |
Lowess.Y <- c(Lowess.Y[1:j],Lowess.Y[j],Lowess.Y[j+1:length(Lowess.Y)]) | |
Lowess.Y <- Lowess.Y[1:length(Curves[[i]][,1])] | |
Curve.Data <- cbind(Curves[[i]][,1], Lowess.Y, Curves[[i]][,2]) | |
} | |
if (Gr.Smooth == 2) Curve.Data <- cbind(Curves[[i]][,1], smooth(Curves[[i]][,3]), Curves[[i]][,2]) | |
# estimate hitmax, P, VP, FP | |
Hitmax.Value <- max(Curve.Data[,2]) # y value at peak | |
Hitmax.Locs <- (1:(length(Curve.Data[,1])))[Curve.Data[,2] == Hitmax.Value] # peak subsample(s) | |
Hitmax.Loc <- round(median(Hitmax.Locs)) | |
Hitmax.Case <- trunc(mean(Curves[[i]][Hitmax.Loc,4:5])) # case at midpoint of peak subsample(s) | |
Ints <- length(Curve.Data[,1]) | |
p <- vector("numeric",Ints) | |
n <- vector("numeric",Ints) | |
K <- 4 * Hitmax.Value | |
VP <- 0 | |
FP <- 0 | |
for (j in 1:Ints) | |
{ | |
if ((K ^ 2 - 4 * K * Curve.Data[j,2]) >= 0) | |
quad1 <- (K + sqrt(K ^ 2 - 4 * K * Curve.Data[j,2])) / ( 2 * K) | |
else quad1 <- 1 | |
if ((K ^ 2 - 4 * K * Curve.Data[j,2]) >= 0) | |
quad2 <- (K - sqrt(K ^ 2 - 4 * K * Curve.Data[j,2])) / ( 2 * K) | |
else quad2 <- 0 | |
if (j < Hitmax.Loc) p[j] <- min(1,quad2) | |
if (j == Hitmax.Loc) p[j] <- .5 | |
if (j > Hitmax.Loc) p[j] <- max(quad1,0) | |
if ((Curve.Data[j,2] < 0) & (j < Hitmax.Loc)) p[j] <- 0 | |
if ((Curve.Data[j,2] < 0) & (j > Hitmax.Loc)) p[j] <- 1 | |
n[j] <- Curve.Data[j,3] | |
if (Structure < 3) | |
{ | |
if (j >= Hitmax.Loc) VP <- VP + p[j] * n[j] | |
if (j >= Hitmax.Loc) FP <- FP + (1 - p[j]) * n[j] | |
} | |
} | |
Curve.Est[i,1] <- Data.Matrix[Hitmax.Case,Input] | |
Curve.Est[i,2] <- sum(p * n) / sum(n) | |
VP.FP[i,1] <- VP / sum(p * n) | |
VP.FP[i,2] <- FP / sum((1 - p) * n) | |
} | |
Row.Labels <- paste("Curve",1:N.Curves) | |
Col.Labels <- c("Hitmax","P") | |
dimnames(Curve.Est) <- list(Row.Labels, Col.Labels) | |
cat("\nEstimated hitmax values and taxon base rates for each curve:\n\n") | |
print(round(Curve.Est,3)) | |
P.Est.M <- mean(Curve.Est[,2], na.rm = T) | |
P.Est.SD <- sd(Curve.Est[,2], na.rm = T) | |
cat("\nSummary of base rate estimates across curves:\n M =",round(P.Est.M,3), | |
"\n SD =",round(P.Est.SD,3),"\n\n") | |
# if more than one curve per indicator, gather hitmax, P, VP, FP estimates for each indicator | |
if (Structure < 3) | |
{ | |
Ind.Est <- matrix(nrow = I, ncol = 4) | |
N.Each.Ind <- N.Curves / I | |
for (i in 1:I) | |
{ | |
Ind.Curves <- ((i - 1) * N.Each.Ind + 1):(i * N.Each.Ind) | |
Ind.Est[i,1] <- median(Curve.Est[Ind.Curves,1], na.rm = T) | |
Ind.Est[i,2] <- mean(Curve.Est[Ind.Curves,2], na.rm = T) | |
Ind.Est[i,3] <- mean(VP.FP[Ind.Curves,1], na.rm = T) | |
Ind.Est[i,4] <- mean(VP.FP[Ind.Curves,2], na.rm = T) | |
} | |
if ((Structure == 2) & (I > 3)) | |
{ | |
Row.Labels <- paste("Indicator",1:I) | |
Col.Labels <- c("Hitmax","P","VP","FP") | |
dimnames(Ind.Est) <- list(Row.Labels, Col.Labels) | |
cat("\nEstimated hitmax, base rate, VP, FP estimates for each indicator:\n\n") | |
print(round(Ind.Est,9)) | |
cat("\nSummary of base rate estimates across indicators:\n M =", | |
round(mean(Ind.Est[,2], na.rm = T),3), "\n SD =", | |
round(sd(Ind.Est[,2], na.rm = T),3),"\n\n") | |
} | |
if ((Structure == 1) | (I == 3)) | |
{ | |
Row.Labels <- paste("Indicator",1:I) | |
Col.Labels <- c("Hitmax","P","VP","FP") | |
dimnames(Ind.Est) <- list(Row.Labels, Col.Labels) | |
cat("\nEstimated VP, FP values at each indicator's hitmax cut:\n\n") | |
print(round(Ind.Est[,3:4],9)) | |
cat("\n\n") | |
} | |
} | |
# print base rate estimate for averaged curve(s) if appropriate | |
if (Gr.Avg) cat("\nBase rate estimate for averaged curve =",round(P.Avg, 3),"\n\n") | |
# assign cases to classes | |
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 1)) # use estimated base rate to classify | |
{ | |
for (i in 1:I) | |
Data.Matrix[, (I + 2)] <- Data.Matrix[, (I + 2)] + Data.Matrix[,i] | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 2)]),] | |
P.Est.Series <- c(rep(2, trunc(N * P.Est.M + .5))) | |
Q.Est.Series <- c(rep(1, N - length(P.Est.Series))) | |
Data.Matrix[,(I + 3)] <- c(Q.Est.Series, P.Est.Series) | |
} | |
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 2)) # use Bayes' Theorem to classify | |
{ | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 1)]),] | |
P.Term <- rep(P.Est.M,N) | |
Q.Term <- rep(1 - P.Est.M,N) | |
for (i in 1:I) | |
if ((Ind.Est[i,3] < .9999) & (Ind.Est[i,4] < .9999)) | |
{ | |
P.Term <- (Data.Matrix[,i] >= Ind.Est[i,1]) * P.Term * Ind.Est[i,3] + | |
(Data.Matrix[,i] < Ind.Est[i,1]) * P.Term * (1 - Ind.Est[i,3]) | |
Q.Term <- (Data.Matrix[,i] >= Ind.Est[i,1]) * Q.Term * Ind.Est[i,4] + | |
(Data.Matrix[,i] < Ind.Est[i,1]) * Q.Term * (1 - Ind.Est[i,4]) | |
} | |
Bayes.Prob <- P.Term / (P.Term + Q.Term) | |
Data.Matrix[,(I + 3)] <- (Bayes.Prob >= .50) + 1 | |
if (Gr.Bayes) | |
{ | |
if (Gr.Orient == 1) dev.new(width=8.5, height=11) | |
else dev.new(width=11, height=8.5) | |
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25)) | |
hist(Bayes.Prob, breaks = seq(from = 0, to = 1, by = .10), xlab = "Prob(Taxon Member)", | |
ylab = "Frequency", main="") | |
mtext("DISTRIBUTION OF BAYESIAN PROBABILITIES", outer=T) | |
} | |
} | |
if (Supplied.P > 0) # use supplied base rate to classify | |
{ | |
for (i in 1:I) | |
Data.Matrix[, (I + 2)] <- Data.Matrix[, (I + 2)] + Data.Matrix[,i] | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 2)]),] | |
P.Est.Series <- c(rep(2, trunc(N * Supplied.P + .5))) | |
Q.Est.Series <- c(rep(1, N - length(P.Est.Series))) | |
Data.Matrix[,(I + 3)] <- c(Q.Est.Series, P.Est.Series) | |
} | |
if (Supplied.Class) # use supplied classification | |
Data.Matrix <- cbind(Data[,1:I], (1:N), rep(0,N), Data[, (I + 1)]) | |
# Construct full-sample and within-groups data files | |
Data.f <- as.matrix(Data.Matrix[,(1:I)]) | |
Nt <- sum(Data.Matrix[,(I + 3)]) - N | |
Nc <- N - Nt | |
Data.t <- Data.Matrix[(Data.Matrix[,(I + 3)] == 2),1:I] | |
Data.c <- Data.Matrix[(Data.Matrix[,(I + 3)] == 1),1:I] | |
for (i in 1:I) | |
{ | |
if (min(Data.t[,i]) == max(Data.t[,i])) Data.t[,i] <- Data.t[,i] + rnorm(Nt, 0, .0001) | |
if (min(Data.c[,i]) == max(Data.c[,i])) Data.c[,i] <- Data.c[,i] + rnorm(Nc, 0, .0001) | |
} | |
# Calculate distributions for each indicator (full sample, taxon, complement) | |
Dist.Full <- matrix(nrow = I + 2, ncol = 4) | |
Dist.Tax <- matrix(nrow = I + 2, ncol = 4) | |
Dist.Comp <- matrix(nrow = I + 2, ncol = 4) | |
Rows <- c(paste("Ind",1:I), "M", "SD") | |
Cols <- c("M","SD","Skew","Kurtosis") | |
dimnames(Dist.Full) <- list(Rows, Cols) | |
dimnames(Dist.Tax) <- list(Rows, Cols) | |
dimnames(Dist.Comp) <- list(Rows, Cols) | |
for (i in 1:I) | |
{ | |
Dist.Full[i,1] <- mean(Data.f[,i]) | |
Dist.Tax[i,1] <- mean(Data.t[,i]) | |
Dist.Comp[i,1] <- mean(Data.c[,i]) | |
Dist.Full[i,2] <- sd(Data.f[,i]) | |
Dist.Tax[i,2] <- sd(Data.t[,i]) | |
Dist.Comp[i,2] <- sd(Data.c[,i]) | |
Dist.Full[i,3] <- Skew(Data.f[,i]) | |
Dist.Tax[i,3] <- Skew(Data.t[,i]) | |
Dist.Comp[i,3] <- Skew(Data.c[,i]) | |
Dist.Full[i,4] <- Kurtosis(Data.f[,i]) | |
Dist.Tax[i,4] <- Kurtosis(Data.t[,i]) | |
Dist.Comp[i,4] <- Kurtosis(Data.c[,i]) | |
} | |
for (i in 1:4) | |
{ | |
Dist.Full[I + 1,i] <- mean(Dist.Full[1:I,i]) | |
Dist.Tax[I + 1,i] <- mean(Dist.Tax[1:I,i]) | |
Dist.Comp[I + 1,i] <- mean(Dist.Comp[1:I,i]) | |
Dist.Full[I + 2,i] <- sd(Dist.Full[1:I,i]) | |
Dist.Tax[I + 2,i] <- sd(Dist.Tax[1:I,i]) | |
Dist.Comp[I + 2,i] <- sd(Dist.Comp[1:I,i]) | |
} | |
cat("\nIndicator distributions in the full sample ( N =",N,"):\n\n") | |
print(round(Dist.Full,3)) | |
cat("\nIndicator distributions in the taxon ( n =",Nt,"):\n\n") | |
print(round(Dist.Tax,3)) | |
cat("\nIndicator distributions in the complement ( n =",Nc,"):\n\n") | |
print(round(Dist.Comp,3)) | |
# calculate validity for each indicator | |
Ind.Val <- matrix(nrow = I + 2, ncol = 2) | |
Rows <- c(paste("Ind",1:I), "M", "SD") | |
Cols <- c("Raw Units","Cohen's d") | |
dimnames(Ind.Val) <- list(Rows, Cols) | |
for (i in 1:I) | |
{ | |
Ind.Val[i,1] <- Dist.Tax[i,1] - Dist.Comp[i,1] | |
Ind.Val[i,2] <- Ind.Val[i,1] / sqrt(((Nt - 1) * Dist.Tax[i,2]^2 + (Nc - 1) * Dist.Comp[i,2]^2) | |
/ (Nt + Nc - 2)) | |
} | |
Ind.Val[I + 1,1] <- mean(Ind.Val[1:I,1]) | |
Ind.Val[I + 1,2] <- mean(Ind.Val[1:I,2]) | |
Ind.Val[I + 2,1] <- sd(Ind.Val[1:I,1]) | |
Ind.Val[I + 2,2] <- sd(Ind.Val[1:I,2]) | |
cat("\nBetween-group validity:\n\n") | |
print(round(Ind.Val,3)) | |
# Calculate, print, and summarize full-sample and within-group correlations | |
Cor.Matrix <- cor(Data.f[,1:I]) | |
Rows <- paste("Ind",1:I) | |
Cols <- paste("Ind",1:I) | |
dimnames(Cor.Matrix) <- list(Rows, Cols) | |
cat("\nIndicator correlations:\n\nFull Sample ( N =",N,"):\n\n") | |
print(round(Cor.Matrix,3)) | |
Cor.Taxon <- cor(Data.t) | |
Cor.Comp <- cor(Data.c) | |
dimnames(Cor.Taxon) <- list(Rows, Cols) | |
dimnames(Cor.Comp) <- list(Rows, Cols) | |
cat("\nTaxon ( n =",Nt,"):\n\n") | |
print(round(Cor.Taxon,3)) | |
cat("\nComplement ( n =",Nc,"):\n\n") | |
print(round(Cor.Comp,3)) | |
Cor.Summary <- matrix(nrow = 3, ncol = 2) | |
dimnames(Cor.Summary) <- list(c("Full Sample","Taxon", "Complement"), c("M", "SD")) | |
Cor.Summary[1,1] <- mean(Cor.Matrix[lower.tri(Cor.Matrix)]) | |
Cor.Summary[2,1] <- mean(Cor.Taxon[lower.tri(Cor.Taxon)]) | |
Cor.Summary[3,1] <- mean(Cor.Comp[lower.tri(Cor.Comp)]) | |
Cor.Summary[1,2] <- sd(Cor.Matrix[lower.tri(Cor.Matrix)]) | |
Cor.Summary[2,2] <- sd(Cor.Taxon[lower.tri(Cor.Taxon)]) | |
Cor.Summary[3,2] <- sd(Cor.Comp[lower.tri(Cor.Comp)]) | |
cat("\nSummary of indicator correlations:\n\n") | |
print(round(Cor.Summary,3)) | |
cat("\n") | |
# return list of class assignments and mean base rate estimate (after resorting by original case numbers) | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 1)]),] | |
if (Save.Class < 2) return(list(Data.Matrix[,(I + 3)],P.Est.M,P.Est.SD)) | |
else return(list(round(Bayes.Prob,9),P.Est.M,P.Est.SD)) | |
} | |
########################################################################################################### | |
# | |
# MAXSLOPE | |
# | |
# Written by John Ruscio | |
# Last modified January 9, 2012 | |
# | |
########################################################################################################### | |
# | |
# Preliminary program to read data, call for comparison data if/as requested, submit each data set to | |
# main program along with an identifying label. | |
# | |
########################################################################################################### | |
MAXSLOPE <- function(Data.Set, Comp.Data = T, N.Pop = 100000, N.Samples = 100, Supplied.Class = F, | |
Supplied.P = 0, File.Output = F, File.Name = "Output.txt", Seed = 1) | |
{ | |
Data.Set <- Remove.Missing(Data.Set) # remove any missing data (listwise deletion) | |
N <- dim(Data.Set)[1] # determine sample size | |
I <- dim(Data.Set)[2] # determine number of indicators; adjust if one column contains class assignments | |
if (Supplied.Class) I <- I - 1 | |
if (I > 2) | |
{ | |
cat("*** More than two indicators were submitted for analysis; only the first two indicators will be used ***\n\n") | |
if (Supplied.Class) Data.Set <- Data.Set[,c(1,2,(I + 1))] | |
else Data.Set <- Data.Set[,1:2] | |
I <- 2 | |
} | |
Data.Set[,1:2] <- apply(Data.Set[,1:2], 2, scale) # standardize data | |
set.seed(Seed) # set random number seed to allow exact replications of results | |
if ((Comp.Data) & (N.Samples == 1)) N.Samples <- 100 # do not allow a single sample of comparison data | |
# if output to file is requested. redirect all text output | |
if (File.Output) sink(File.Name) | |
Research.Results <- MAXSLOPE.Main(Data.Set, Research.Data = T, Comp.Data, N.Pop, N.Samples, N, | |
Supplied.P, Supplied.Class) | |
Xs <- Research.Results[[1]] | |
Slopes <- Research.Results[[2]] | |
P.Est <- Research.Results[[3]] | |
# if comparison data were requested, then generate and submit data for analysis | |
if (Comp.Data) | |
{ | |
# create matrices to store results for comparison | |
Tax.Slopes <- Dim.Slopes <- matrix(0, nrow = N.Samples, ncol = N - 1) | |
# generate and analyze categorical data | |
set.seed(Seed) | |
if (!Supplied.Class) # if class assignments were not provided, generate using base rate | |
{ | |
Data.Set <- cbind(Data.Set, rep(0, N)) | |
if (Supplied.P == 0) P <- P.Est # use estimated P in research data if no P was provided | |
else P <- Supplied.P | |
if ((P.Est * N) < 20) P.Est <- 20 / N | |
if ((N - (P.Est * N)) < 20) P.Est <- N - (20 / N) | |
for (i in 1:I) | |
Data.Set[, (I + 1)] <- Data.Set[, (I + 1)] + Data.Set[,i] | |
Data.Set <- Data.Set[sort.list(Data.Set[,(I + 1)]),] | |
P.Est.Series <- c(rep(2, round(N * P))) | |
Q.Est.Series <- c(rep(1, N - length(P.Est.Series))) | |
Data.Set[,(I + 1)] <- c(Q.Est.Series, P.Est.Series) | |
} | |
Tax.Data <- Data.Set[(Data.Set[,(I + 1)] == 2), 1:I] | |
Com.Data <- Data.Set[(Data.Set[,(I + 1)] == 1), 1:I] | |
P.Data <- dim(Tax.Data)[1] / (dim(Tax.Data)[1] + dim(Com.Data)[1]) | |
cat("\nGenerating and analyzing comparison data\n") | |
Sim.Tax <- GenData(Tax.Data, round(N.Pop * P.Data)) | |
Sim.Com <- GenData(Com.Data, (N.Pop - round(N.Pop * P.Data))) | |
Sim.Pop <- rbind(Sim.Tax, Sim.Com) | |
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))] | |
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I))) | |
cat(" Categorical population generated, RMSR r =",round(RMSR,3),"\n") | |
for (Sims in 1:N.Samples) | |
{ | |
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),] | |
Tax.Slopes[Sims,] <- MAXSLOPE.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop, | |
N.Samples, N, Supplied.P, Supplied.Class) | |
} | |
cat(" Analysis of",N.Samples,"samples of categorical comparison data completed\n") | |
# generate and analyze dimensional samples | |
set.seed(Seed) | |
Sim.Pop <- GenData(Data.Set[, 1:I], N.Pop) | |
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))] | |
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I))) | |
cat("\n Dimensional population generated, RMSR r =",round(RMSR,3),"\n") | |
for (Sims in 1:N.Samples) | |
{ | |
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),] | |
Dim.Slopes[Sims,] <- MAXSLOPE.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop, | |
N.Samples, N, Supplied.P, Supplied.Class) | |
} | |
cat(" Analysis of",N.Samples,"samples of dimensional comparison data completed\n") | |
# 2-panel plot for comparison data | |
Tax.Min <- Tax.Max <- Tax.Q1 <- Tax.Q2 <- Tax.Q3 <- Dim.Min <- Dim.Max <- Dim.Q1 <- Dim.Q2 <- Dim.Q3 <- rep(0, length(Xs)) | |
for (i in 1:length(Xs)) | |
{ | |
Tax.Min[i] <- min(Tax.Slopes[(Tax.Slopes[,i] != -Inf),i]) | |
Tax.Max[i] <- max(Tax.Slopes[(Tax.Slopes[,i] != Inf),i]) | |
Tax.Q1[i] <- sort(Tax.Slopes[,i])[N.Samples * .25] | |
Tax.Q2[i] <- sort(Tax.Slopes[,i])[N.Samples * .50] | |
Tax.Q3[i] <- sort(Tax.Slopes[,i])[N.Samples * .75] | |
Dim.Min[i] <- min(Dim.Slopes[(Dim.Slopes[,i] != -Inf),i]) | |
Dim.Max[i] <- max(Dim.Slopes[(Dim.Slopes[,i] != Inf),i]) | |
Dim.Q1[i] <- sort(Dim.Slopes[,i])[N.Samples * .25] | |
Dim.Q2[i] <- sort(Dim.Slopes[,i])[N.Samples * .50] | |
Dim.Q3[i] <- sort(Dim.Slopes[,i])[N.Samples * .75] | |
} | |
Min.X <- min(Xs) | |
Max.X <- max(Xs) | |
Min.Y <- min(c(Slopes, Tax.Min[Tax.Min != -Inf], Dim.Min[Dim.Min != -Inf]), na.rm = T) | |
Max.Y <- max(c(Slopes, Tax.Max[Tax.Max != Inf], Dim.Max[Dim.Min != Inf]), na.rm = T) | |
dev.new(width=7, height=3.5) | |
par(mfrow=c(1,2), omi=c(.25,.25,.25,.25), cex=.75) | |
plot(x = Xs, y = Slopes, type = "o", pch = 16, lwd = 2, xlim = c(Min.X, Max.X), | |
ylim = c(Min.Y, Max.Y), xlab = "Indicator Score", ylab = "LOWESS Slope", main = "Categorical Comparison Data") | |
for (i in 1:(length(Xs)-1)) | |
polygon(x = c(Xs[i], Xs[i], Xs[i+1], Xs[i+1]), | |
y = c(Tax.Q1[i], Tax.Q3[i], Tax.Q3[i+1], Tax.Q1[i+1]), col = 8, lty = 0) | |
lines(Xs, Tax.Min) | |
lines(Xs, Tax.Max) | |
points(Xs, Slopes, type = "o", pch = 16) | |
lines(Xs, Slopes, lwd = 2) | |
plot(x = Xs, y = Slopes, type = "o", pch = 16, lwd = 2, xlim = c(Min.X, Max.X), | |
ylim = c(Min.Y, Max.Y), xlab = "Indicator Score", ylab = "LOWESS Slope", main = "Dimensional Comparison Data") | |
for (i in 1:(length(Xs)-1)) | |
polygon(x = c(Xs[i], Xs[i], Xs[i+1], Xs[i+1]), | |
y = c(Dim.Q1[i], Dim.Q3[i], Dim.Q3[i+1], Dim.Q1[i+1]), col = 8, lty = 0) | |
lines(Xs, Dim.Min) | |
lines(Xs, Dim.Max) | |
points(Xs, Slopes, type = "o", pch = 16) | |
lines(Xs, Slopes, lwd = 2) | |
# calculate and print comparison curve fit index (CCFI) | |
Fit.RMSR.t <- sqrt(sum((Slopes - Tax.Q2)^2)/(N - 1)) | |
Fit.RMSR.d <- sqrt(sum((Slopes - Dim.Q2)^2)/(N - 1)) | |
CCFI <- Fit.RMSR.d / (Fit.RMSR.d + Fit.RMSR.t) | |
cat("\n\nComparison curve fit index (CCFI) =",round(Fit.RMSR.d,4),"/ (",round(Fit.RMSR.d,4), | |
"+",round(Fit.RMSR.t,4),") =",round(CCFI,3),"\n") | |
cat(" Note: CCFI values can range from 0 (dimensional) to 1 (categorical). The more a CCFI\n") | |
cat(" value deviates from .50, the stronger the result. When .40 < CCFI < .60,\n") | |
cat(" this should be interpreted with caution.\n\n") | |
} | |
cat("\n") | |
if (File.Output) sink() | |
} | |
########################################################################################################### | |
# | |
# Main program -- calculate, plot, and summarize results -- no subsidiary programs are called | |
# | |
########################################################################################################### | |
MAXSLOPE.Main <- function(Data, Research.Data, Comp.Data, N.Pop, N.Samples, N, Supplied.P, Supplied.Class) | |
{ | |
# perform two MAXSLOPE analyses | |
x1 <- y2 <- Data[,1] | |
x2 <- y1 <- Data[,2] | |
low1 <- lowess(y1 ~ x1) | |
low2 <- lowess(y2 ~ x2) | |
slope1 <- (low1$y[2:N] - low1$y[1:(N - 1)]) / | |
(low1$x[2:N] - low1$x[1:(N - 1)]) | |
slope2 <- (low2$y[2:N] - low2$y[1:(N - 1)]) / | |
(low2$x[2:N] - low2$x[1:(N - 1)]) | |
# replace missing slopes with nearest neighbor | |
while(sum(is.nan(slope1)) > 0) | |
{ | |
for (i in 2:(N - 1)) | |
if (is.nan(slope1[i]) & !is.nan(slope1[i - 1])) slope1[i] <- slope1[i - 1] | |
for (i in 1:(N - 2)) | |
if (is.nan(slope1[i]) & !is.nan(slope1[i + 1])) slope1[i] <- slope1[i + 1] | |
} | |
while(sum(is.nan(slope2)) > 0) | |
{ | |
for (i in 2:(N - 1)) | |
if (is.nan(slope2[i]) & !is.nan(slope2[i - 1])) slope2[i] <- slope2[i - 1] | |
for (i in 1:(N - 2)) | |
if (is.nan(slope2[i]) & !is.nan(slope2[i + 1])) slope2[i] <- slope2[i + 1] | |
} | |
# average the curves, estimate the base rate, and return results | |
Y <- (slope1 + slope2) / 2 | |
X <- (low1$x[1:(N - 1)] + low2$x[1:(N - 1)]) / 2 | |
my <- max(Y) | |
my.x <- max(X[Y == my]) | |
my.n <- sum(X >= my.x) - .5 * (sum(Y = my) - 1) | |
P.Est <- my.n / N | |
if (Research.Data) | |
{ | |
# create and label new graph sheet | |
dev.new(width=3.7, height=3.5) | |
par(omi=c(.25,.25,.25,.25), cex = .75) | |
# plot results as slopes by x values | |
X.Range <- c(min(X), max(X)) | |
Y.Range <- c(min(0, min(Y)), max(((max(Y) / sd(Y)) | |
* (mean(Y) / 1.5)), max(Y) * 1.10)) | |
plot(X, Y, type = "o", pch = 16, xlim = X.Range, ylim = Y.Range, | |
main = "MAXSLOPE Curve", xlab = "Indicator Score", ylab = "LOWESS Slope") | |
# report relevant analytic information and program specifications | |
cat("\n\nSUMMARY OF MAXSLOPE ANALYTIC SPECIFICATIONS\n") | |
cat("\nSample size: ",N,"\n") | |
if ((Supplied.P == 0) & (!Supplied.Class)) | |
cat("Classification of cases: Cut total score at estimated base rate\n") | |
if (Supplied.P > 0) cat("Classification of cases: Cut total score at supplied base rate\n") | |
if (Supplied.Class) cat("Classification of cases: Supplied with data\n") | |
if (Comp.Data) | |
{ | |
cat("Size of finite populations of comparison data: ",N.Pop,"\n") | |
cat("Number of samples of comparison data drawn from each population: ",N.Samples,"\n") | |
} | |
cat("\n") | |
# report estimated taxon base rate | |
cat("\n\nSUMMARY OF MAXSLOPE PARAMETER ESTIMATES\n") | |
cat("\nEstimated taxon base rate =", round(P.Est, 3), "\n\n") | |
# assign cases to classes using estimated base rate | |
if ((Supplied.P == 0) & (!Supplied.Class)) | |
{ | |
Data.Matrix <- cbind(Data, 1:N) | |
Data.Matrix[,3] <- apply(Data.Matrix[,1:2], 1, sum) | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,3]),] | |
P.Est.Series <- c(rep(2, round(N * P.Est))) | |
Q.Est.Series <- c(rep(1, N - length(P.Est.Series))) | |
Data.Matrix[,3] <- c(Q.Est.Series, P.Est.Series) | |
} | |
# assign cases to classes using supplied base rate | |
if ((Supplied.P > 0) & (!Supplied.Class)) | |
{ | |
Data.Matrix <- cbind(Data, 1:N) | |
Data.Matrix[,3] <- apply(Data.Matrix[,1:2], 1, sum) | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,3]),] | |
P.Est.Series <- c(rep(2, round(N * Supplied.P))) | |
Q.Est.Series <- c(rep(1, N - length(P.Est.Series))) | |
Data.Matrix[,3] <- c(Q.Est.Series, P.Est.Series) | |
} | |
# assign cases to classes using supplied classification | |
if (Supplied.Class) | |
Data.Matrix <- Data | |
# Construct full-sample and within-groups data files | |
Data.f <- as.matrix(Data.Matrix[,1:2]) | |
Nt <- sum(Data.Matrix[,3]) - N | |
Nc <- N - Nt | |
Data.t <- Data.Matrix[(Data.Matrix[,3] == 2),1:2] | |
Data.c <- Data.Matrix[(Data.Matrix[,3] == 1),1:2] | |
for (i in 1:2) | |
{ | |
if (min(Data.t[,i]) == max(Data.t[,i])) Data.t[,i] <- Data.t[,i] + rnorm(Nt, 0, .0001) | |
if (min(Data.c[,i]) == max(Data.c[,i])) Data.c[,i] <- Data.c[,i] + rnorm(Nc, 0, .0001) | |
} | |
# Calculate distributions for each indicator (full sample, taxon, complement) | |
Dist.Full <- Dist.Tax <- Dist.Comp <- matrix(nrow = 4, ncol = 4) | |
Rows <- c(paste("Ind",1:2), "M", "SD") | |
Cols <- c("M","SD","Skew","Kurtosis") | |
dimnames(Dist.Full) <- list(Rows, Cols) | |
dimnames(Dist.Tax) <- list(Rows, Cols) | |
dimnames(Dist.Comp) <- list(Rows, Cols) | |
for (i in 1:2) | |
{ | |
Dist.Full[i,1] <- mean(Data.f[,i]) | |
Dist.Tax[i,1] <- mean(Data.t[,i]) | |
Dist.Comp[i,1] <- mean(Data.c[,i]) | |
Dist.Full[i,2] <- sd(Data.f[,i]) | |
Dist.Tax[i,2] <- sd(Data.t[,i]) | |
Dist.Comp[i,2] <- sd(Data.c[,i]) | |
Dist.Full[i,3] <- Skew(Data.f[,i]) | |
Dist.Tax[i,3] <- Skew(Data.t[,i]) | |
Dist.Comp[i,3] <- Skew(Data.c[,i]) | |
Dist.Full[i,4] <- Kurtosis(Data.f[,i]) | |
Dist.Tax[i,4] <- Kurtosis(Data.t[,i]) | |
Dist.Comp[i,4] <- Kurtosis(Data.c[,i]) | |
} | |
for (i in 1:4) | |
{ | |
Dist.Full[3,i] <- mean(Dist.Full[1:2,i]) | |
Dist.Tax[3,i] <- mean(Dist.Tax[1:2,i]) | |
Dist.Comp[3,i] <- mean(Dist.Comp[1:2,i]) | |
Dist.Full[4,i] <- sd(Dist.Full[1:2,i]) | |
Dist.Tax[4,i] <- sd(Dist.Tax[1:2,i]) | |
Dist.Comp[4,i] <- sd(Dist.Comp[1:2,i]) | |
} | |
cat("\nIndicator distributions in the full sample ( N =",N,"):\n\n") | |
print(round(Dist.Full,3)) | |
cat("\nIndicator distributions in the taxon ( n =",Nt,"):\n\n") | |
print(round(Dist.Tax,3)) | |
cat("\nIndicator distributions in the complement ( n =",Nc,"):\n\n") | |
print(round(Dist.Comp,3)) | |
# calculate validity for each indicator | |
Ind.Val <- matrix(nrow = 4, ncol = 2) | |
Rows <- c(paste("Ind",1:2), "M", "SD") | |
Cols <- c("Raw Units","Cohen's d") | |
dimnames(Ind.Val) <- list(Rows, Cols) | |
for (i in 1:2) | |
{ | |
Ind.Val[i,1] <- Dist.Tax[i,1] - Dist.Comp[i,1] | |
Ind.Val[i,2] <- Ind.Val[i,1] / sqrt(((Nt - 1) * Dist.Tax[i,2]^2 + (Nc - 1) * Dist.Comp[i,2]^2) | |
/ (Nt + Nc - 2)) | |
} | |
Ind.Val[3,1] <- mean(Ind.Val[1:2,1]) | |
Ind.Val[3,2] <- mean(Ind.Val[1:2,2]) | |
Ind.Val[4,1] <- sd(Ind.Val[1:2,1]) | |
Ind.Val[4,2] <- sd(Ind.Val[1:2,2]) | |
cat("\nBetween-group validity:\n\n") | |
print(round(Ind.Val,3)) | |
# Calculate, print, and summarize full-sample and within-group correlations | |
cat("\nIndicator correlations:\n\n") | |
cat("Full Sample r =",round(cor(Data.f)[1,2],3),"\n") | |
cat(" Taxon r =",round(cor(Data.t)[1,2],3),"\n") | |
cat(" Complement r =",round(cor(Data.c)[1,2],3),"\n\n") | |
} | |
# if research data, return x values, slopes, and base rate estimate; | |
# if comparison data, return only slopes | |
if (Research.Data) return(list(X, Y, P.Est)) | |
else return(Y) | |
} | |
########################################################################################################### | |
# | |
# L-Mode | |
# | |
# Written by John Ruscio | |
# Last modified January 9, 2012 | |
# | |
########################################################################################################### | |
# | |
# Preliminary program to read data, call for comparison data if/as requested, submit each data set to | |
# main program along with an identifying label. | |
# | |
########################################################################################################### | |
LMode <- function(Data.Set, Comp.Data = T, N.Pop = 100000, N.Samples = 100, Supplied.Class = F, | |
Supplied.P = 0, Mode.L = -.001, Mode.R = .001, St.Ind = T, Classify = 1, Save.Class = F, | |
File.Output = F, File.Name = "Output.txt", Seed = 1) | |
{ | |
Data.Set <- Remove.Missing(Data.Set) # remove any missing data (listwise deletion) | |
N <- dim(Data.Set)[1] # determine sample size | |
I <- dim(Data.Set)[2] # determine number of indicators; adjust if one column contains class assignments | |
if (Supplied.Class) I <- I - 1 | |
if (St.Ind) Data.Set[,1:I] <- apply(Data.Set[,1:I], 2, scale) # standardize data if requested | |
set.seed(Seed) # set random number seed to allow exact replications of results | |
if ((Comp.Data) & (N.Samples == 1)) N.Samples <- 100 # do not allow a single sample of comparison data | |
# if output to file is requested. redirect all text output | |
if (File.Output) sink(File.Name) | |
Research.Results <- LMode.Main(Data.Set, Research.Data = T, Comp.Data, N.Pop, N.Samples, N, I, | |
Mode.L, Mode.R, Classify, Save.Class, Supplied.P, Supplied.Class) | |
Scores <- Research.Results[[1]] | |
P.Est <- Research.Results[[2]] | |
Class.Assign <- Research.Results[[3]] | |
# if comparison data were requested, then generate and submit data for analysis | |
if (Comp.Data) | |
{ | |
# create matrices to store results for comparison | |
Tax.Scores <- Dim.Scores <- matrix(0, nrow = N.Samples, ncol = N) | |
# generate and analyze categorical data | |
set.seed(Seed) | |
if (!Supplied.Class) # if class assignments were not provided, generate using base rate | |
{ | |
Data.Set <- cbind(Data.Set, rep(0, N)) | |
if (Supplied.P == 0) P <- P.Est # use estimated P in research data if no P was provided | |
else P <- Supplied.P | |
if ((P.Est * N) < 20) P.Est <- 20 / N | |
if ((N - (P.Est * N)) < 20) P.Est <- N - (20 / N) | |
for (i in 1:I) | |
Data.Set[, (I + 1)] <- Data.Set[, (I + 1)] + Data.Set[,i] | |
Data.Set <- Data.Set[sort.list(Data.Set[,(I + 1)]),] | |
P.Est.Series <- c(rep(2, round(N * P))) | |
Q.Est.Series <- c(rep(1, N - length(P.Est.Series))) | |
Data.Set[,(I + 1)] <- c(Q.Est.Series, P.Est.Series) | |
} | |
Tax.Data <- Data.Set[(Data.Set[,(I + 1)] == 2), 1:I] | |
Com.Data <- Data.Set[(Data.Set[,(I + 1)] == 1), 1:I] | |
P.Data <- dim(Tax.Data)[1] / (dim(Tax.Data)[1] + dim(Com.Data)[1]) | |
cat("\nGenerating and analyzing comparison data\n") | |
Sim.Tax <- GenData(Tax.Data, round(N.Pop * P.Data)) | |
Sim.Com <- GenData(Com.Data, (N.Pop - round(N.Pop * P.Data))) | |
Sim.Pop <- rbind(Sim.Tax, Sim.Com) | |
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))] | |
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I))) | |
cat(" Categorical population generated, RMSR r =",round(RMSR,3),"\n") | |
for (Sims in 1:N.Samples) | |
{ | |
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),] | |
Tax.Scores[Sims,] <- LMode.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop, | |
N.Samples, N, I, Mode.L, Mode.R, Classify, Supplied.P, | |
Supplied.Class) | |
} | |
cat(" Analysis of",N.Samples,"samples of categorical comparison data completed\n") | |
# generate and analyze dimensional samples | |
set.seed(Seed) | |
Sim.Pop <- GenData(Data.Set[, 1:I], N.Pop) | |
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))] | |
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I))) | |
cat("\n Dimensional population generated, RMSR r =",round(RMSR,3),"\n") | |
for (Sims in 1:N.Samples) | |
{ | |
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),] | |
Dim.Scores[Sims,] <- LMode.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop, | |
N.Samples, N, I, Mode.L, Mode.R, Classify, Supplied.P, | |
Supplied.Class) | |
} | |
cat(" Analysis of",N.Samples,"samples of dimensional comparison data completed\n") | |
Res.Density <- density(Scores) | |
Tax.Density <- Dim.Density <- matrix(0, nrow = N.Samples, ncol = 512) | |
for (i in 1:N.Samples) | |
{ | |
Tax.Density[i, ] <- density(Tax.Scores[i,], from = min(Res.Density$x), to = max(Res.Density$x))$y | |
Dim.Density[i, ] <- density(Dim.Scores[i,], from = min(Res.Density$x), to = max(Res.Density$x))$y | |
} | |
Tax.Density.M <- apply(Tax.Density, 2, mean) | |
Dim.Density.M <- apply(Dim.Density, 2, mean) | |
# 2-panel plot for comparison data | |
Tax.Sum <- Dim.Sum <- matrix(0, nrow = 6, ncol = 512) | |
for (i in 1:512) | |
{ | |
Tax.Sum[,i] <- summary(Tax.Density[,i]) | |
Dim.Sum[,i] <- summary(Dim.Density[,i]) | |
} | |
dev.new(width=7, height=3.5) | |
par(mfrow=c(1,2), omi=c(.25,.25,.25,.25), cex=.75) | |
Max.Y <- max(c(Res.Density$y, Tax.Sum[6,], Dim.Sum[6,])) | |
Min.X <- min(Res.Density$x) | |
Max.X <- max(Res.Density$x) | |
plot(Res.Density, type = "l", xlim = c(Min.X, Max.X), ylim = c(0, Max.Y), xlab = "Factor Scores", | |
ylab = "Density", main = "Categorical Comparison Data", lwd = 2) | |
for (i in 1:511) | |
polygon(x = c(Res.Density$x[i], Res.Density$x[i], Res.Density$x[i+1], Res.Density$x[i+1]), | |
y = c(Tax.Sum[2,i], Tax.Sum[5,i], Tax.Sum[5,i+1], Tax.Sum[2,i+1]), col = 8, lty = 0) | |
lines(Res.Density$x, Tax.Sum[1,]) | |
lines(Res.Density$x, Tax.Sum[6,]) | |
lines(Res.Density, lwd = 2) | |
plot(Res.Density, type = "l", xlim = c(Min.X, Max.X), ylim = c(0, Max.Y), xlab = "Factor Scores", | |
ylab = "Density", main = "Dimensional Comparison Data", lwd = 2) | |
for (i in 1:511) | |
polygon(x = c(Res.Density$x[i], Res.Density$x[i], Res.Density$x[i+1], Res.Density$x[i+1]), | |
y = c(Dim.Sum[2,i], Dim.Sum[5,i], Dim.Sum[5,i+1], Dim.Sum[2,i+1]), col = 8, lty = 0) | |
lines(Res.Density$x, Dim.Sum[1,]) | |
lines(Res.Density$x, Dim.Sum[6,]) | |
lines(Res.Density, lwd = 2) | |
# calculate and print comparison curve fit index (CCFI) | |
data <- list(Res.Density, Dim.Density.M) | |
Fit.RMSR.d <- optimize(f = Fit.Densities, interval = c(-1,1), data)$objective | |
data <- list(Res.Density, Tax.Density.M) | |
Fit.RMSR.t <- optimize(f = Fit.Densities, interval = c(-1,1), data)$objective | |
CCFI <- Fit.RMSR.d / (Fit.RMSR.d + Fit.RMSR.t) | |
cat("\n\nComparison curve fit index (CCFI) =",round(Fit.RMSR.d,4),"/ (",round(Fit.RMSR.d,4), | |
"+",round(Fit.RMSR.t,4),") =",round(CCFI,3),"\n") | |
cat(" Note: CCFI values can range from 0 (dimensional) to 1 (categorical). The more a CCFI\n") | |
cat(" value deviates from .50, the stronger the result. When .40 < CCFI < .60,\n") | |
cat(" this should be interpreted with caution.\n\n") | |
} | |
cat("\n") | |
if (File.Output) sink() | |
if (Save.Class) return(Class.Assign) | |
} | |
########################################################################################################### | |
# | |
# Main program -- calculate, plot, and summarize results -- no subsidiary programs are called | |
# | |
########################################################################################################### | |
LMode.Main <- function(Data, Research.Data, Comp.Data, N.Pop, N.Samples, N, I, Mode.L, Mode.R, Classify, | |
Save.Class, Supplied.P, Supplied.Class) | |
{ | |
# load R code for running medians smoother | |
if ((version$major == 1) & (version$minor < 9)) library(eda) | |
else library(stats) # load R code for running medians smoother | |
# make room in data file for factor scores (col = I + 1) and class assignments (col = I + 2) | |
Data.Matrix <- cbind(Data[,1:I], rep(0, N), rep(0, N)) | |
# submit all indicators to factor analysis and store factor scores | |
Factor.Results <- Factor.Analysis.LMode(Data.Matrix[,1:I]) | |
Data.Matrix[,(I + 1)] <- Factor.Results$scores | |
Neg.Loadings <- (mean(Factor.Results$loadings) < 0) | |
if (Neg.Loadings) Data.Matrix[,(I + 1)] <- Data.Matrix[,(I + 1)] * -1 | |
Score.Density <- density(Data.Matrix[,(I + 1)]) | |
if (Research.Data) | |
{ | |
# compute location of estimated latent modes | |
Max.L <- max(smooth(Score.Density$y)[Score.Density$x <= Mode.L]) | |
Loc.L <- (1: length(Score.Density$x))[smooth(Score.Density$y) == Max.L] | |
if (length(Loc.L) > 1) Loc.L <- Loc.L[1] | |
Max.R <- max(smooth(Score.Density$y)[Score.Density$x >= Mode.R]) | |
Loc.R <- (1: length(Score.Density$x))[smooth(Score.Density$y) == Max.R] | |
if (length(Loc.R) > 1) Loc.R <- Loc.R[1] | |
# create and label new graph sheet | |
dev.new(width=3.7, height=3.5) | |
par(omi=c(.25,.25,.25,.25), cex = .75) | |
# plot results as frequency distribution of factor scores, with vertical lines at estimated latent modes | |
Max.Y <- max(smooth(Score.Density$y)) | |
Min.X <- min(smooth(Score.Density$x)) | |
Max.X <- max(smooth(Score.Density$x)) | |
plot(Score.Density, type = "l", xlim = c(Min.X, Max.X), ylim = c(0, Max.Y), | |
main = "L-Mode Curve", xlab = "Factor Scores", ylab = "Density") | |
abline(v = Score.Density$x[Loc.L]) | |
abline(v = Score.Density$x[Loc.R]) | |
# report relevant analytic information and program specifications | |
cat("\n\nSUMMARY OF L-MODE ANALYTIC SPECIFICATIONS\n") | |
cat("\nSample size: ",N,"\n") | |
cat("Number of indicator variables: ",I,"\n") | |
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 1)) | |
cat("Classification of cases: Cut total score at estimated base rate\n") | |
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 2)) | |
cat("Classification of cases: Nearest mode\n") | |
if (Supplied.P > 0) cat("Classification of cases: Cut total score at supplied base rate\n") | |
if (Supplied.Class) cat("Classification of cases: Supplied with data\n") | |
if (Comp.Data) | |
{ | |
cat("Size of finite populations of comparison data: ",N.Pop,"\n") | |
cat("Number of samples of comparison data drawn from each population: ",N.Samples,"\n") | |
} | |
cat("\n") | |
# estimate (and report) taxon base rate from location of latent modes | |
cat("\n\nSUMMARY OF L-MODE PARAMETER ESTIMATES\n") | |
P.R <- 1 / (1 + Score.Density$x[Loc.R] ^ 2) | |
P.L <- 1 - (1 / (1 + Score.Density$x[Loc.L] ^ 2)) | |
P.Est <- mean(c(P.L, P.R)) | |
cat("\nSummary of taxon base rate estimates:\n") | |
cat(" Based on location of left mode: ",round(P.L,3),"\n") | |
cat(" Based on location of right mode: ",round(P.R,3),"\n") | |
cat(" M =",round(P.Est,3),"\n\n") | |
# estimate (and report) latent class M, SD on each indicator (using factor loadings) | |
Ms <- apply(Data.Matrix[,1:I], 2, mean) | |
SDs <- apply(Data.Matrix[,1:I], 2, sd) | |
Q.Est <- 1 - P.Est | |
M.SD <- matrix(nrow = I, ncol = 2) | |
M.SD[,1] <- Ms + SDs * (Q.Est / sqrt(P.Est * Q.Est) * as.vector(Factor.Results$loadings)) | |
M.SD[,2] <- Ms + SDs * (-P.Est / sqrt(P.Est * Q.Est) * as.vector(Factor.Results$loadings)) | |
Row.Labels <- paste("Indicator",1:I) | |
Col.Labels <- c("Taxon","Complement") | |
dimnames(M.SD) <- list(Row.Labels, Col.Labels) | |
cat("\nEstimated latent group M on each indicator (via factor loadings):\n\n") | |
print(round(M.SD,3)) | |
cat("\n") | |
# assign cases to classes using estimated base rate | |
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 1)) | |
{ | |
Data.Matrix <- cbind(Data.Matrix, 1:N) | |
Data.Matrix[,(I + 2)] <- apply(Data.Matrix[,1:I], 1, sum) | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 2)]),] | |
P.Est.Series <- c(rep(2, round(N * P.Est))) | |
Q.Est.Series <- c(rep(1, N - length(P.Est.Series))) | |
Data.Matrix[,(I + 2)] <- c(Q.Est.Series, P.Est.Series) | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 3)]),] | |
Data.Matrix <- Data.Matrix[,1:(I + 2)] | |
} | |
# assign cases to classes and print base rate estimate from empirical classification | |
if ((Supplied.P == 0) & (!Supplied.Class) & (Classify == 2)) | |
{ | |
Mean.Diff.C <- abs(Data.Matrix[,(I + 1)] - Score.Density$x[Loc.L]) | |
Mean.Diff.T <- abs(Data.Matrix[,(I + 1)] - Score.Density$x[Loc.R]) | |
for (i in 1:N) | |
if (Mean.Diff.T[i] <= Mean.Diff.C[i]) Data.Matrix[i,(I + 2)] <- 2 | |
else Data.Matrix[i,(I + 2)] <- 1 | |
cat("\nBase rate estimated from classification of cases:\n") | |
cat(" P =",round((mean(Data.Matrix[,(I + 2)]) - 1),3),"\n\n") | |
} | |
# assign cases to classes using supplied base rate | |
if (Supplied.P > 0) | |
{ | |
Data.Matrix <- cbind(Data.Matrix, 1:N) | |
Data.Matrix[,(I + 2)] <- apply(Data.Matrix[,1:I], 1, sum) | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 2)]),] | |
P.Est.Series <- c(rep(2, round(N * Supplied.P))) | |
Q.Est.Series <- c(rep(1, N - length(P.Est.Series))) | |
Data.Matrix[,(I + 2)] <- c(Q.Est.Series, P.Est.Series) | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 3)]),] | |
Data.Matrix <- Data.Matrix[,1:(I + 2)] | |
} | |
# assign cases to classes using supplied classification | |
if (Supplied.Class) | |
Data.Matrix <- cbind(Data[,1:I],Data.Matrix[,(I + 1)],Data[,(I + 1)]) | |
# Construct full-sample and within-groups data files | |
Data.f <- as.matrix(Data.Matrix[,(1:I)]) | |
Nt <- sum(Data.Matrix[,(I + 2)]) - N | |
Nc <- N - Nt | |
Data.t <- Data.Matrix[(Data.Matrix[,(I + 2)] == 2),1:I] | |
Data.c <- Data.Matrix[(Data.Matrix[,(I + 2)] == 1),1:I] | |
for (i in 1:I) | |
{ | |
if (min(Data.t[,i]) == max(Data.t[,i])) Data.t[,i] <- Data.t[,i] + rnorm(Nt, 0, .0001) | |
if (min(Data.c[,i]) == max(Data.c[,i])) Data.c[,i] <- Data.c[,i] + rnorm(Nc, 0, .0001) | |
} | |
# Calculate distributions for each indicator (full sample, taxon, complement) | |
Dist.Full <- matrix(nrow = I + 2, ncol = 4) | |
Dist.Tax <- matrix(nrow = I + 2, ncol = 4) | |
Dist.Comp <- matrix(nrow = I + 2, ncol = 4) | |
Rows <- c(paste("Ind",1:I), "M", "SD") | |
Cols <- c("M","SD","Skew","Kurtosis") | |
dimnames(Dist.Full) <- list(Rows, Cols) | |
dimnames(Dist.Tax) <- list(Rows, Cols) | |
dimnames(Dist.Comp) <- list(Rows, Cols) | |
for (i in 1:I) | |
{ | |
Dist.Full[i,1] <- mean(Data.f[,i]) | |
Dist.Tax[i,1] <- mean(Data.t[,i]) | |
Dist.Comp[i,1] <- mean(Data.c[,i]) | |
Dist.Full[i,2] <- sd(Data.f[,i]) | |
Dist.Tax[i,2] <- sd(Data.t[,i]) | |
Dist.Comp[i,2] <- sd(Data.c[,i]) | |
Dist.Full[i,3] <- Skew(Data.f[,i]) | |
Dist.Tax[i,3] <- Skew(Data.t[,i]) | |
Dist.Comp[i,3] <- Skew(Data.c[,i]) | |
Dist.Full[i,4] <- Kurtosis(Data.f[,i]) | |
Dist.Tax[i,4] <- Kurtosis(Data.t[,i]) | |
Dist.Comp[i,4] <- Kurtosis(Data.c[,i]) | |
} | |
for (i in 1:4) | |
{ | |
Dist.Full[I + 1,i] <- mean(Dist.Full[1:I,i]) | |
Dist.Tax[I + 1,i] <- mean(Dist.Tax[1:I,i]) | |
Dist.Comp[I + 1,i] <- mean(Dist.Comp[1:I,i]) | |
Dist.Full[I + 2,i] <- sd(Dist.Full[1:I,i]) | |
Dist.Tax[I + 2,i] <- sd(Dist.Tax[1:I,i]) | |
Dist.Comp[I + 2,i] <- sd(Dist.Comp[1:I,i]) | |
} | |
cat("\nIndicator distributions in the full sample ( N =",N,"):\n\n") | |
print(round(Dist.Full,3)) | |
cat("\nIndicator distributions in the taxon ( n =",Nt,"):\n\n") | |
print(round(Dist.Tax,3)) | |
cat("\nIndicator distributions in the complement ( n =",Nc,"):\n\n") | |
print(round(Dist.Comp,3)) | |
# calculate validity for each indicator | |
Ind.Val <- matrix(nrow = I + 2, ncol = 2) | |
Rows <- c(paste("Ind",1:I), "M", "SD") | |
Cols <- c("Raw Units","Cohen's d") | |
dimnames(Ind.Val) <- list(Rows, Cols) | |
for (i in 1:I) | |
{ | |
Ind.Val[i,1] <- Dist.Tax[i,1] - Dist.Comp[i,1] | |
Ind.Val[i,2] <- Ind.Val[i,1] / sqrt(((Nt - 1) * Dist.Tax[i,2]^2 + (Nc - 1) * Dist.Comp[i,2]^2) | |
/ (Nt + Nc - 2)) | |
} | |
Ind.Val[I + 1,1] <- mean(Ind.Val[1:I,1]) | |
Ind.Val[I + 1,2] <- mean(Ind.Val[1:I,2]) | |
Ind.Val[I + 2,1] <- sd(Ind.Val[1:I,1]) | |
Ind.Val[I + 2,2] <- sd(Ind.Val[1:I,2]) | |
cat("\nBetween-group validity:\n\n") | |
print(round(Ind.Val,3)) | |
# Print factor score validity estimate | |
Factor.Val <- matrix(nrow = 1, ncol = 6) | |
Factor.Val[1,1] <- mean(Data.Matrix[(Data.Matrix[,(I + 2)] == 2),(I + 1)]) | |
Factor.Val[1,2] <- sd(Data.Matrix[(Data.Matrix[,(I + 2)] == 2),(I + 1)]) | |
Factor.Val[1,3] <- mean(Data.Matrix[(Data.Matrix[,(I + 2)] == 1),(I + 1)]) | |
Factor.Val[1,4] <- sd(Data.Matrix[(Data.Matrix[,(I + 2)] == 1),(I + 1)]) | |
Factor.Val[1,5] <- Factor.Val[1,1] - Factor.Val[1,3] | |
Factor.Val[1,6] <- Factor.Val[1,5] / sqrt(((Nt - 1) * Factor.Val[1,2]^2 + (Nc - 1) * Factor.Val[1,4]^2) | |
/ (Nt + Nc - 2)) | |
Row.Labels <- "Factor Scores" | |
Col.Labels <- c("Taxon M", "Taxon SD", "Comp M", "Comp SD", "Raw Units", "Cohen's d") | |
dimnames(Factor.Val) <- list(Row.Labels, Col.Labels) | |
cat("\nBetween-group validity on factor scores:\n\n") | |
print(round(Factor.Val[,5:6],3)) | |
# Calculate, print, and summarize full-sample and within-group correlations | |
Cor.Matrix <- cor(Data.f[,1:I]) | |
Rows <- paste("Ind",1:I) | |
Cols <- paste("Ind",1:I) | |
dimnames(Cor.Matrix) <- list(Rows, Cols) | |
cat("\nIndicator correlations:\n\nFull Sample ( N =",N,"):\n\n") | |
print(round(Cor.Matrix,3)) | |
Cor.Taxon <- cor(Data.t) | |
Cor.Comp <- cor(Data.c) | |
dimnames(Cor.Taxon) <- list(Rows, Cols) | |
dimnames(Cor.Comp) <- list(Rows, Cols) | |
cat("\nTaxon ( n =",Nt,"):\n\n") | |
print(round(Cor.Taxon,3)) | |
cat("\nComplement ( n =",Nc,"):\n\n") | |
print(round(Cor.Comp,3)) | |
Cor.Summary <- matrix(nrow = 3, ncol = 2) | |
dimnames(Cor.Summary) <- list(c("Full Sample","Taxon", "Complement"), c("M", "SD")) | |
Cor.Summary[1,1] <- mean(Cor.Matrix[lower.tri(Cor.Matrix)]) | |
Cor.Summary[2,1] <- mean(Cor.Taxon[lower.tri(Cor.Taxon)]) | |
Cor.Summary[3,1] <- mean(Cor.Comp[lower.tri(Cor.Comp)]) | |
Cor.Summary[1,2] <- sd(Cor.Matrix[lower.tri(Cor.Matrix)]) | |
Cor.Summary[2,2] <- sd(Cor.Taxon[lower.tri(Cor.Taxon)]) | |
Cor.Summary[3,2] <- sd(Cor.Comp[lower.tri(Cor.Comp)]) | |
cat("\nSummary of indicator correlations:\n\n") | |
print(round(Cor.Summary,3)) | |
cat("\n") | |
} | |
# if research data, return factor scores, base rate estimate, and class assignments; | |
# if comparison data, return only factor scores | |
if (Research.Data) return(list(Data.Matrix[,(I + 1)], P.Est, Data.Matrix[,(I + 2)])) | |
else return(Data.Matrix[,(I + 1)]) | |
} | |
########################################################################################################### | |
# | |
# MAMBAC | |
# | |
# Written by John Ruscio | |
# Last modified January 9, 2012 | |
# | |
########################################################################################################### | |
# | |
# Preliminary program to read and submit research data to analysis and then, if requested, perform analyses | |
# of comparison data to test fit of averaged curves. | |
# | |
########################################################################################################### | |
MAMBAC <- function(Data.Set, Comp.Data = T, N.Pop = 100000, N.Samples = 100, Supplied.Class = F, | |
Supplied.P = 0, All.Pairs = T, Ind.Comp = F, N.End = 25, N.Cuts = 50, St.Ind = F, | |
Replications = 0, Gr.Orient = 1, Gr.Rows = 3, Gr.Cols = 2, Gr.Smooth = F, | |
Gr.Common.Y = F, Gr.Avg = F, Gr.Ind = F, File.Output = F, File.Name = "Output.txt", Seed = 1) | |
{ | |
Data.Set <- Remove.Missing(Data.Set) # remove any missing data (listwise deletion) | |
N <- dim(Data.Set)[1] # determine sample size | |
I <- dim(Data.Set)[2] # determine number of indicators; adjust if one column contains class assignments | |
if (Supplied.Class) I <- I - 1 # subtract to allow for class assignment data, if provided | |
set.seed(Seed) # set random number seed to allow exact replications of results | |
# automatically fix some potential parameter inconsistencies or omissions | |
if (Comp.Data) Gr.Avg <- T # need averaged curve to compare to those for comparison data | |
if ((Comp.Data) & (N.Samples == 1)) N.Samples <- 100 # do not allow a single sample of comparison data | |
if (Ind.Comp) | |
{ | |
All.Pairs <- F # cannot use both methods | |
Gr.Ind <- F # cannot average by input indicator if composites are used | |
} | |
if (All.Pairs) Ind.Comp <- F # cannot use both methods | |
if (Ind.Comp & (I < 3)) | |
{ | |
Ind.Comp <- F # cannot form composites using individual variables | |
All.Pairs <- T | |
} | |
if (Gr.Avg) St.Ind <- T # always standardize indicators if graphs will be averaged | |
if (St.Ind) Data.Set[,1:I] <- apply(Data.Set[,1:I], 2, scale) # standardize data if requested | |
# check for tied scores and set number of replications | |
Ties <- F | |
for (i in 1:I) | |
if (length(unique(Data.Set[,I])) < N) Ties <- T | |
if ((Replications == 0) & (Ties)) Replications <- 10 # default is 10 replications if any tied scores | |
if (Replications == 0) Replications <- 1 | |
if ((Replications > 1) & (!Ties)) Replications <- 1 # eliminate unnecessary replications | |
# if output to file is requested, redirect all text output | |
if (File.Output) sink(File.Name) | |
# submit data to main program for analysis; taxon base rate estimate will be returned | |
MAMBAC.Results <- MAMBAC.Main(Data.Set, Research.Data = T, Comp.Data, N.Pop, N.Samples, N, I, | |
All.Pairs, Ind.Comp, N.End, N.Cuts, Replications, Gr.Orient, Gr.Rows, | |
Gr.Cols, Gr.Smooth, Gr.Common.Y, Gr.Avg, Gr.Ind, Supplied.P, | |
Supplied.Class) | |
P.Est.M <- MAMBAC.Results[[3]] | |
P.Est.SD <- MAMBAC.Results[[4]] | |
# if comparison data requested, then generate and submit samples for analysis | |
if (Comp.Data) | |
{ | |
# create matrices to store results for comparison data | |
Tax.Y.Values <- matrix(nrow = N.Samples, ncol = N.Cuts) | |
Dim.Y.Values <- matrix(nrow = N.Samples, ncol = N.Cuts) | |
if (Ind.Comp) N.Curves <- I | |
else N.Curves <- I * (I - 1) | |
Tax.P.Ests <- matrix(nrow = N.Samples, ncol = N.Curves) | |
Dim.P.Ests <- matrix(nrow = N.Samples, ncol = N.Curves) | |
# generate and analyze categorical data | |
set.seed(Seed) | |
if (!Supplied.Class) # if class assignments were not provided, generate using base rate | |
{ | |
Data.Set <- cbind(Data.Set, rep(0, N)) | |
if (Supplied.P == 0) P <- P.Est.M # use estimated P in research data if no P was provided | |
else P <- Supplied.P | |
if ((P.Est.M * N) < 20) P.Est.M <- 20 / N | |
if ((N - (P.Est.M * N)) < 20) P.Est.M <- N - (20 / N) | |
for (i in 1:I) | |
Data.Set[, (I + 1)] <- Data.Set[, (I + 1)] + Data.Set[,i] | |
Data.Set <- Data.Set[sort.list(Data.Set[,(I + 1)]),] | |
P.Est.Series <- c(rep(2, round(N * P))) | |
Q.Est.Series <- c(rep(1, N - length(P.Est.Series))) | |
Data.Set[,(I + 1)] <- c(Q.Est.Series, P.Est.Series) | |
} | |
Tax.Data <- Data.Set[(Data.Set[,(I + 1)] == 2), 1:I] | |
Com.Data <- Data.Set[(Data.Set[,(I + 1)] == 1), 1:I] | |
P.Data <- dim(Tax.Data)[1] / (dim(Tax.Data)[1] + dim(Com.Data)[1]) | |
cat("\nGenerating and analyzing comparison data\n") | |
Sim.Tax <- GenData(Tax.Data, round(N.Pop * P.Data)) | |
Sim.Com <- GenData(Com.Data, (N.Pop - round(N.Pop * P.Data))) | |
Sim.Pop <- rbind(Sim.Tax, Sim.Com) | |
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))] | |
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I))) | |
cat(" Categorical population generated, RMSR r =",round(RMSR,3),"\n") | |
for (Sims in 1:N.Samples) | |
{ | |
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),] | |
MAMBAC.Tax.Results <- MAMBAC.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop, N.Samples, | |
N, I, All.Pairs, Ind.Comp, N.End, N.Cuts, Replications, | |
Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y, | |
Gr.Avg, Gr.Ind, Supplied.P, Supplied.Class) | |
Tax.Y.Values[Sims,] <- MAMBAC.Tax.Results[[1]] | |
Tax.P.Ests[Sims,] <- MAMBAC.Tax.Results[[2]] | |
} | |
cat(" Analysis of",N.Samples,"samples of categorical comparison data completed\n") | |
# generate and analyze dimensional samples | |
set.seed(Seed) | |
Sim.Pop <- GenData(Data.Set[, 1:I], N.Pop) | |
Res.Corr <- cor(Data.Set[,1:I])[lower.tri(diag(I))] - cor(Sim.Pop)[lower.tri(diag(I))] | |
RMSR <- sqrt(sum(Res.Corr * Res.Corr) / (.5 * (I * I - I))) | |
cat("\n Dimensional population generated, RMSR r =",round(RMSR,3),"\n") | |
for (Sims in 1:N.Samples) | |
{ | |
Sim.Sample <- Sim.Pop[sample(1:N.Pop, N, replace = T),] | |
MAMBAC.Dim.Results <- MAMBAC.Main(Sim.Sample, Research.Data = F, Comp.Data, N.Pop, N.Samples, | |
N, I, All.Pairs, Ind.Comp, N.End, N.Cuts, Replications, | |
Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y, | |
Gr.Avg, Gr.Ind, Supplied.P, Supplied.Class) | |
Dim.Y.Values[Sims,] <- MAMBAC.Dim.Results[[1]] | |
Dim.P.Ests[Sims,] <- MAMBAC.Dim.Results[[2]] | |
} | |
cat(" Analysis of",N.Samples,"samples of dimensional comparison data completed\n") | |
# summarize base rate estimates for comparison data | |
P.Est.Summary <- matrix(nrow = (N.Samples + 2), ncol = 4) | |
for (Sims in 1:N.Samples) | |
{ | |
P.Est.Summary[Sims,1] <- mean(Tax.P.Ests[Sims,], na.rm = T) | |
P.Est.Summary[Sims,2] <- sd(Tax.P.Ests[Sims,], na.rm = T) | |
P.Est.Summary[Sims,3] <- mean(Dim.P.Ests[Sims,], na.rm = T) | |
P.Est.Summary[Sims,4] <- sd(Dim.P.Ests[Sims,], na.rm = T) | |
} | |
for (i in 1:4) | |
{ | |
P.Est.Summary[Sims + 1,i] <- mean(P.Est.Summary[1:Sims,i]) | |
P.Est.Summary[Sims + 2,i] <- sd(P.Est.Summary[1:Sims,i]) | |
} | |
Row.Labels <- c(1:Sims,paste("M for",N.Samples,"samples"),paste("SD for",N.Samples,"samples")) | |
Col.Labels <- c("Categorical M","SD","Dimensional M","SD") | |
dimnames(P.Est.Summary) <- list(Row.Labels,Col.Labels) | |
cat("\n\nBase rate estimates for comparison data:\n\n") | |
if (N.Samples == 1) | |
{ | |
cat(" Categorical sample: M =",round(P.Est.Summary[1,1],4),", SD =", | |
round(P.Est.Summary[1,2],4),"\n") | |
cat(" Dimensional sample: M =",round(P.Est.Summary[1,3],4),", SD =", | |
round(P.Est.Summary[1,4],4),"\n") | |
} | |
else | |
print(round(P.Est.Summary[((N.Samples+1):(N.Samples+2)),],4)) | |
# plot sampling distributions of Ms/SDs of base rate estimates | |
if (N.Samples > 1) | |
{ | |
dev.new(width=7, height=3.5) | |
par(mfrow=c(1,2), omi=c(.25,.25,.25,.25), cex=.75) | |
Y.M.Max <- max(c(density(P.Est.Summary[1:Sims,1])$y,density(P.Est.Summary[1:Sims,3])$y)) * 1.25 | |
Y.SD.Max <- max(c(density(P.Est.Summary[1:Sims,2])$y,density(P.Est.Summary[1:Sims,4])$y)) * 1.25 | |
plot(density(P.Est.Summary[1:Sims,1], adjust = .75), xlim = c(0,1), ylim = c(0, Y.M.Max), | |
xlab = "Ms of Base Rate Estimates", ylab = "", main = "", lwd = 2, axes = F) | |
axis(1) | |
lines(density(P.Est.Summary[1:Sims,3], adjust = .75), lwd = 2, lty = 2) | |
abline(v = P.Est.M, lwd = 2, lty = 3) | |
plot(density(P.Est.Summary[1:Sims,2], adjust = .75), xlim = c(0,.5), ylim = c(0, Y.SD.Max), | |
xlab = "SDs of Base Rate Estimates", ylab = "", main = "", lwd = 2, axes = F) | |
axis(1) | |
lines(density(P.Est.Summary[1:Sims,4], adjust = .75), lwd = 2, lty = 2) | |
abline(v = P.Est.SD, lwd = 2, lty = 3) | |
mtext("BASE RATE ESTIMATES FOR COMPARISON DATA",outer=T) | |
} | |
# plot panel of averaged curves | |
MAMBAC.Y <- MAMBAC.Results[[2]] | |
X.Values <- MAMBAC.Results[[1]] | |
X.Range <- c(min(X.Values), max(X.Values)) | |
Tax.Y <- vector("numeric", length(X.Values)) | |
for (i in 1:length(Tax.Y)) | |
Tax.Y[i] <- mean(Tax.Y.Values[,i]) | |
Dim.Y <- vector("numeric", length(X.Values)) | |
for (i in 1:length(Dim.Y)) | |
Dim.Y[i] <- mean(Dim.Y.Values[,i]) | |
Y.Range <- c(min(MAMBAC.Y,Tax.Y.Values,Dim.Y.Values),max(MAMBAC.Y,Tax.Y.Values,Dim.Y.Values)) | |
# label the x and y axes | |
X.Label <- paste(N.Cuts,"Cuts") | |
Y.Label <- "Mean Difference" | |
# 2-panel plot for comparison data | |
Tax.Sum <- Dim.Sum <- matrix(0, nrow = 6, ncol = length(X.Values)) | |
for (i in 1:length(X.Values)) | |
{ | |
Tax.Sum[,i] <- summary(Tax.Y.Values[,i]) | |
Dim.Sum[,i] <- summary(Dim.Y.Values[,i]) | |
} | |
dev.new(width=7, height=3.5) | |
par(mfrow=c(1,2), omi=c(.25,.25,.25,.25), cex=.75) | |
plot(X.Values, MAMBAC.Y, type = "o", pch = 16, lwd = 2, ylim = Y.Range, | |
xlim = X.Range, ylab = Y.Label, xlab = X.Label, main = "Categorical Comparison Data") | |
for (i in 1:(length(X.Values)-1)) | |
polygon(x = c(X.Values[i], X.Values[i], X.Values[i+1], X.Values[i+1]), | |
y = c(Tax.Sum[2,i], Tax.Sum[5,i], Tax.Sum[5,i+1], Tax.Sum[2,i+1]), col = 8, lty = 0) | |
lines(X.Values, Tax.Sum[1,]) | |
lines(X.Values, Tax.Sum[6,]) | |
points(X.Values, MAMBAC.Y, type = "o", pch = 16) | |
lines(X.Values, MAMBAC.Y, lwd = 2) | |
plot(X.Values, MAMBAC.Y, type = "o", pch = 16, lwd = 2, ylim = Y.Range, | |
xlim = X.Range, ylab = Y.Label, xlab = X.Label, main = "Dimensional Comparison Data") | |
for (i in 1:(length(X.Values)-1)) | |
polygon(x = c(X.Values[i], X.Values[i], X.Values[i+1], X.Values[i+1]), | |
y = c(Dim.Sum[2,i], Dim.Sum[5,i], Dim.Sum[5,i+1], Dim.Sum[2,i+1]), col = 8, lty = 0) | |
lines(X.Values, Dim.Sum[1,]) | |
lines(X.Values, Dim.Sum[6,]) | |
points(X.Values, MAMBAC.Y, type = "o", pch = 16) | |
lines(X.Values, MAMBAC.Y, lwd = 2) | |
# calculate and print comparison curve fit index (CCFI) | |
Fit.RMSR.t <- sqrt(sum((Tax.Y - MAMBAC.Y)^2)/N.Cuts) | |
Fit.RMSR.d <- sqrt(sum((Dim.Y - MAMBAC.Y)^2)/N.Cuts) | |
CCFI <- Fit.RMSR.d / (Fit.RMSR.d + Fit.RMSR.t) | |
cat("\n\nComparison curve fit index (CCFI) =",round(Fit.RMSR.d,4),"/ (",round(Fit.RMSR.d,4), | |
"+",round(Fit.RMSR.t,4),") =",round(CCFI,3),"\n") | |
cat(" Note: CCFI values can range from 0 (dimensional) to 1 (categorical). The more a CCFI\n") | |
cat(" value deviates from .50, the stronger the result. When .40 < CCFI < .60,\n") | |
cat(" this should be interpreted with caution.\n\n") | |
} | |
if (File.Output) sink() | |
} | |
########################################################################################################### | |
# | |
# Main program shows the overall structure through calls to four subsidiary programs | |
# | |
########################################################################################################### | |
MAMBAC.Main <- function(Data, Research.Data, Comp.Data, N.Pop, N.Samples, N, I, All.Pairs, Ind.Comp, N.End, | |
N.Cuts, Replications, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y, | |
Gr.Avg, Gr.Ind, Supplied.P, Supplied.Class) | |
{ | |
# set up matrix to store input/output configurations for each curve to be generated | |
Curve.Info <- MAMBAC.Structure(I, All.Pairs) | |
# carry out analyses by calling calculation procedure once per curve | |
Curves <- list(1) | |
for (Curve.Num in 1:length(Curve.Info[,1])) | |
Curves[[Curve.Num]] <- MAMBAC.Calculate(Data[,1:I], N, I, Curve.Info, Curve.Num, All.Pairs, Ind.Comp, | |
N.End, N.Cuts, Replications) | |
# plot the results (returns P.Avg for research data, X.Values & Y.Values for comparison data) | |
MAMBAC.Results <- MAMBAC.Plot(Research.Data, I, Curve.Info, Curves, All.Pairs, Ind.Comp, N.End, | |
N.Cuts, Replications, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y, | |
Gr.Avg, Gr.Ind) | |
if (Research.Data) P.Avg <- MAMBAC.Results[[1]] | |
# print summary of analytic specification (for research data only) | |
if (Research.Data) MAMBAC.Summarize(Data[,1:I], N, I, Curve.Info, Curves, All.Pairs, Ind.Comp, N.End, | |
N.Cuts, Replications, Gr.Smooth, Gr.Avg, Comp.Data, N.Pop, N.Samples, | |
Supplied.P, Supplied.Class) | |
# generate estimates of taxon base rate (for research data only) | |
if (Research.Data) P.Est <- MAMBAC.P.Est(Data, N, I, Curve.Info, Curves, All.Pairs, Ind.Comp, N.End, | |
N.Cuts, Replications, Gr.Smooth, Gr.Avg, P.Avg, Supplied.P, Supplied.Class) | |
if (!Research.Data) | |
{ | |
N.Curves <- length(Curve.Info[,1]) | |
P.Estimates <- vector("numeric", N.Curves) | |
for (i in 1:N.Curves) | |
{ | |
P.Estimates[i] <- 1/((Curves[[i]][length(Curves[[i]][,2]),2]/Curves[[i]][1, 2]) + 1) | |
if (P.Estimates[i] < 0) P.Estimates[i] <- 0 # correct for possibilities of | |
if (P.Estimates[i] > 1) P.Estimates[i] <- 1 # negative values at extremes | |
} | |
} | |
if (Research.Data) return(list(MAMBAC.Results[[2]], MAMBAC.Results[[3]], P.Est[[1]], P.Est[[2]])) | |
else return(list(MAMBAC.Results, P.Estimates)) | |
} | |
########################################################################################################### | |
# | |
# Program to determine structure of analysis and set up matrix to contain relevant curve information | |
# | |
########################################################################################################### | |
MAMBAC.Structure <- function(I, All.Pairs) | |
{ | |
# based on indicator configuration, create matrix listing input/output indicators to use for each curve | |
if (All.Pairs) | |
{ | |
N.Curves <- I * (I - 1) | |
Curve.Info <- matrix(nrow = N.Curves, ncol = 3) | |
Curve.Info[,1] <- 1:N.Curves | |
Counter <- 0 | |
for (i in 1:(I - 1)) | |
{ | |
for (j in (i + 1):I) | |
{ | |
Curve.Info[Counter + 1, 2:3] <- c(i, j) | |
Curve.Info[Counter + 2, 2:3] <- c(j, i) | |
Counter <- Counter + 2 | |
} | |
} | |
} | |
else | |
{ | |
N.Curves <- I | |
Curve.Info <- matrix(nrow = N.Curves, ncol = 2) | |
Curve.Info[,1] <- 1:N.Curves | |
Curve.Info[,2] <- 1:N.Curves | |
} | |
return(Curve.Info) | |
} | |
########################################################################################################### | |
# | |
# Program to locate cutting scores along the input and calculate mean differences | |
# | |
########################################################################################################### | |
MAMBAC.Calculate <- function(Data, N, I, Curve.Info, Curve.Num, All.Pairs, Ind.Comp, N.End, N.Cuts, | |
Replications) | |
{ | |
# make room in data for composite indicator plus random numbers for resorting at each replication | |
Data.Matrix <- cbind(Data, rep(0,N), rnorm(N, mean = 0, sd = 1)) | |
# determine input column, creating composite indicator if necessary, and sort by input | |
if (All.Pairs) | |
{ | |
Input <- Curve.Info[Curve.Num, 2] | |
Output <- Curve.Info[Curve.Num, 3] | |
} | |
else | |
{ | |
Output <- Curve.Info[Curve.Num,2] | |
Input <- I + 1 | |
for (i in 1:I) | |
Data.Matrix[,(I + 1)] <- Data.Matrix[,(I + 1)] + Data.Matrix[,i] | |
Data.Matrix[,(I + 1)] <- Data.Matrix[,(I + 1)] - Data.Matrix[,Output] | |
} | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,Input]),] | |
Cuts <- seq(from = N.End, to = (N - N.End), length = N.Cuts) | |
X.Labels <- Cuts | |
# create matrix to store curve results | |
Num.Cuts <- length(Cuts) | |
Curve.Results <- matrix(rep(0, Num.Cuts * 2), nrow = Num.Cuts, ncol = 2) | |
# having set up subsamples along the input and created a storage structure, perform the calculations | |
# replications loop; results are cumulated, subsequently divided by number of replications to yield average | |
for (Rep in 1:Replications) | |
{ | |
# shuffle cases, then sort by input | |
Data.Matrix <- Data.Matrix[sample(1:N, N, replace = F),] | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,Input]),] | |
# begin calculations, storing results in either matrix or array as follows: | |
# 1st column = labels for x axis | |
# 2nd column = MAMBAC values | |
Curve.Results[,1] <- X.Labels | |
for (i in 1:Num.Cuts) | |
Curve.Results[i,2] <- mean(Data.Matrix[(Cuts[i]+1):N,Output]) - | |
mean(Data.Matrix[1:Cuts[i],Output]) | |
} | |
# divide cumulated mean differences by number of replications to yield average values | |
Curve.Results[,2] <- Curve.Results[,2] / Replications | |
return(Curve.Results) | |
} | |
########################################################################################################### | |
# | |
# Program to plot and label a panel of curves | |
# | |
########################################################################################################### | |
MAMBAC.Plot <- function(Research.Data, I, Curve.Info, Curves, All.Pairs, Ind.Comp, N.End, N.Cuts, | |
Replications, Gr.Orient, Gr.Rows, Gr.Cols, Gr.Smooth, Gr.Common.Y, Gr.Avg, Gr.Ind) | |
{ | |
P.Avg <- 0 | |
N.Curves <- length(Curve.Info[,1]) | |
# create and label new graph sheet | |
if (Research.Data) | |
{ | |
if (Gr.Orient == 1) dev.new(width=8.5, height=11) | |
else dev.new(width=11, height=8.5) | |
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25)) | |
Page <- 1 | |
# if applying a common Y scale for all graphs, set the range here (algorithm accentuates peak/flatness) | |
if (Gr.Common.Y) | |
{ | |
Ys <- 0 | |
for (i in 1:N.Curves) | |
Ys <- c(Ys, Curves[[i]][,2]) | |
Y.Min <- min(Ys[2:length(Ys)]) | |
Y.Max <- max(Ys[2:length(Ys)]) | |
Y.Range <- c(Y.Min, Y.Max) | |
} | |
# plot each curve | |
for (i in 1:N.Curves) | |
{ | |
if (All.Pairs) | |
{ | |
Input <- Curve.Info[i,2] | |
Output <- Curve.Info[i,3] | |
} | |
else | |
{ | |
Input <- Curve.Info[i,1] | |
Output <- Curve.Info[i,2] | |
} | |
X.Values <- Curves[[i]][,1] | |
Y.Values <- Curves[[i]][,2] | |
X.Range <- c(min(X.Values), max(X.Values)) | |
if (!Gr.Common.Y) Y.Range <- c(min(Y.Values), max(Y.Values)) | |
# label the x axis in accord with input method and indicator | |
if (All.Pairs) X.Label <- paste(N.Cuts,"Cuts on Indicator",Input) | |
else X.Label <- paste(N.Cuts,"Cuts") | |
# label the y axis | |
Y.Label <- paste("Mean Difference, Indicator",Output) | |
# check to see whether new graph sheet needs to be created | |
if ((i > 1) & ((i - 1) %% (Gr.Rows * Gr.Cols) == 0)) | |
{ | |
mtext(paste("MAMBAC CURVES, page ",Page,sep=""),outer=T) | |
if (Gr.Orient == 1) dev.new(width=8.5, height=11) | |
else dev.new(width=11, height=8.5) | |
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25)) | |
Page <- Page + 1 | |
} | |
# plot the graph, plus smoothed curve if requested | |
plot(X.Values, Y.Values, type = "p", pch = 16, ylim = Y.Range, xlim = X.Range, | |
ylab = Y.Label, xlab = X.Label) | |
if (!Gr.Smooth) # no smoothing, connect "raw" data points | |
lines(X.Values, Y.Values) # solid line for raw data only | |
else | |
lines(lowess(X.Values, Y.Values)) # dotted line for LOWESS smoothed curve | |
} | |
mtext(paste("MAMBAC CURVES, page ",Page,sep=""),outer=T) | |
} | |
if (Gr.Ind & Research.Data) | |
{ | |
if (Gr.Orient == 1) dev.new(width=8.5, height=11) | |
else dev.new(width=11, height=8.5) | |
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25)) | |
# average y values for curves using the same input indicator | |
Ind.Graphs <- matrix(nrow = I, ncol = N.Cuts) | |
N.Each.Ind <- N.Curves / I | |
for (i in 1:I) | |
{ | |
Ind.Curves <- Curve.Info[,1][Curve.Info[,2] == i] | |
Ys <- vector("numeric",N.Cuts) | |
for (j in 1:N.Cuts) | |
for (k in Ind.Curves) | |
Ys[j] <- Ys[j] + Curves[[k]][j,2] | |
Ind.Graphs[i,] <- Ys / N.Each.Ind | |
} | |
# plot a curve for each indicator | |
Page <- 1 | |
for (i in 1:I) | |
{ | |
Input <- i | |
X.Values <- Curves[[i * N.Each.Ind]][,1] | |
X.Range <- c(min(X.Values), max(X.Values)) | |
Ind.Curves <- Curve.Info[,1][Curve.Info[,2] == i] | |
Y.Values <- Ind.Graphs[i,] | |
for (j in Ind.Curves) | |
Y.Values <- c(Y.Values,Curves[[j]][,2]) | |
Y.Range <- c(min(Y.Values), max(Y.Values)) | |
X.Label <- paste(N.Cuts,"Cuts on Indicator",i) | |
Y.Label <- "Mean Difference" | |
# check to see whether new graph sheet needs to be created | |
if ((i > 1) & ((i - 1) %% (Gr.Rows * Gr.Cols) == 0)) | |
{ | |
mtext(paste("MAMBAC INDICATOR CURVES, page ",Page,sep=""),outer=T) | |
if (Gr.Orient == 1) dev.new(width=8.5, height=11) | |
else dev.new(width=11, height=8.5) | |
par(mfrow=c(Gr.Rows, Gr.Cols), omi=c(.25,.25,.25,.25)) | |
Page <- Page + 1 | |
} | |
# plot the averaged graph, with dotted lines for each individual curve | |
plot(X.Values, Ind.Graphs[i,], type = "l", pch = 16, ylim = Y.Range, xlim = X.Range, | |
ylab = Y.Label, xlab = X.Label, lwd = 2) | |
for (j in Ind.Curves) | |
lines(Curves[[j]][,1],Curves[[j]][,2], lty = 3, lwd = 2) | |
} | |
mtext(paste("MAMBAC INDICATOR CURVES, page",Page),outer=T) | |
} | |
if (Gr.Avg) | |
{ | |
if (Research.Data) | |
{ | |
dev.new(width=3.7, height=3.5) | |
par(omi=c(.25,.25,.25,.25), cex = .75) | |
} | |
X.Values <- Curves[[1]][,1] | |
X.Range <- c(min(X.Values), max(X.Values)) | |
Y.Values <- rep(0,length(X.Values)) | |
for (i in 1:N.Curves) | |
Y.Values <- Y.Values + Curves[[i]][,2] | |
Y.Values <- Y.Values / N.Curves | |
Y.Vals <- Y.Values | |
for (j in 1:N.Curves) | |
Y.Vals <- c(Y.Vals,Curves[[j]][,2]) | |
Y.Range <- c(min(Y.Vals), max(Y.Vals)) | |
X.Label <- paste(N.Cuts,"Cuts") | |
Y.Label <- "Mean Difference" | |
P.Avg <- 1/((Y.Values[length(Y.Values)]/Y.Values[1]) + 1) | |
if (P.Avg < 0) P.Avg <- 0 # correct for possibilities of negative values at extremes | |
if (P.Avg > 1) P.Avg <- 1 | |
if (Research.Data) | |
{ | |
plot(X.Values, Y.Values, type = "l", pch = 16, ylim = Y.Range, xlim = X.Range, | |
ylab = Y.Label, xlab = X.Label, main = "Averaged MAMBAC Curve", lwd = 2) | |
for (j in 1:N.Curves) | |
lines(Curves[[j]][,1],Curves[[j]][,2], lty = 3, lwd = 2) | |
} | |
} | |
if (!Research.Data) return(Y.Values) | |
else return(list(P.Avg,X.Values,Y.Values)) | |
} | |
########################################################################################################### | |
# | |
# Function to produce text summary of analytic specifications | |
# | |
########################################################################################################### | |
MAMBAC.Summarize <- function(Data, N, I, Curve.Info, Curves, All.Pairs, Ind.Comp, N.End, N.Cuts, | |
Replications, Gr.Smooth, Gr.Avg, Comp.Data, N.Pop, N.Samples, Supplied.P, | |
Supplied.Class) | |
{ | |
# report relevant analytic information and program specifications | |
cat("\n\nSUMMARY OF MAMBAC ANALYTIC SPECIFICATIONS\n") | |
cat("\nSample size: ",N,"\n") | |
cat("Number of indicator variables: ",I,"\n") | |
cat("Replications: ",Replications,"\n") | |
cat("Cuts: ",N.Cuts,"evenly-spaced cuts beginning",N.End,"cases from either extreme\n") | |
if (All.Pairs) Ind.Label <- "Variables serve in all possible input-output pairs" | |
else Ind.Label <- paste("One variable serves as output, input = sum of remaining",(I-1),"variables") | |
cat("Indicators: ",Ind.Label,"\n") | |
cat("Total number of curves: ",length(Curve.Info[,1]),"\n") | |
if (Gr.Smooth) cat("Y values smoothed for graphing and P estimation: Yes, LOWESS method\n") | |
else cat("Y values smoothed for graphing and P estimation: No\n") | |
if ((Supplied.P == 0) & (!Supplied.Class)) | |
cat("Classification of cases: Cut total score at estimated base rate\n") | |
if (Supplied.P > 0) cat("Classification of cases: Cut total score at supplied base rate\n") | |
if (Supplied.Class) cat("Classification of cases: Supplied with data\n") | |
if (Comp.Data) | |
{ | |
cat("Size of finite populations of comparison data: ",N.Pop,"\n") | |
cat("Number of samples of comparison data drawn from each population: ",N.Samples,"\n") | |
} | |
cat("\n") | |
} | |
########################################################################################################### | |
# | |
# Function to generate and report estimates of the taxon base rate and other latent parameters | |
# | |
########################################################################################################### | |
MAMBAC.P.Est <- function(Data, N, I, Curve.Info, Curves, All.Pairs, Ind.Comp, N.End, N.Cuts, Replications, | |
Gr.Smooth, Gr.Avg, P.Avg, Supplied.P, Supplied.Class) | |
{ | |
cat("\n\nSUMMARY OF MAMBAC PARAMETER ESTIMATES\n") | |
# estimate base rate for each curve | |
N.Curves <- length(Curve.Info[,1]) | |
P.Estimates <- matrix(nrow = N.Curves, ncol = 1) | |
if (Gr.Smooth) | |
for (i in 1:N.Curves) | |
{ | |
Num.Y <- length(Curves[[i]][,1]) | |
Lowess.Y <- lowess(Curves[[i]][,1],Curves[[i]][,2])$y | |
for (j in 1:(Num.Y-1)) | |
if (Curves[[i]][j,1] == Curves[[i]][(j + 1),1]) | |
Lowess.Y <- c(Lowess.Y[1:j],Lowess.Y[j],Lowess.Y[j+1:length(Lowess.Y)]) | |
Lowess.Y <- Lowess.Y[1:Num.Y] | |
P.Estimates[i,1] <- 1/((Lowess.Y[Num.Y]/Lowess.Y[1]) + 1) | |
if (P.Estimates[i,1] < 0) P.Estimates[i,1] <- 0 # correct for possibilities of | |
if (P.Estimates[i,1] > 1) P.Estimates[i,1] <- 1 # negative values at extremes | |
} | |
else | |
for (i in 1:N.Curves) | |
{ | |
P.Estimates[i,1] <- 1/((Curves[[i]][length(Curves[[i]][,2]),2]/Curves[[i]][1, 2]) + 1) | |
if (P.Estimates[i,1] < 0) P.Estimates[i,1] <- 0 # correct for possibilities of | |
if (P.Estimates[i,1] > 1) P.Estimates[i,1] <- 1 # negative values at extremes | |
} | |
Row.Labels <- paste("Curve",1:N.Curves) | |
Col.Labels <- c("P") | |
dimnames(P.Estimates) <- list(Row.Labels, Col.Labels) | |
cat("\nEstimated taxon base rates for each curve:\n\n") | |
print(round(P.Estimates,3)) | |
P.Est.M <- mean(P.Estimates[,1]) | |
P.Est.SD <- sd(P.Estimates[,1]) | |
cat("\nSummary of base rate estimates across curves:\n M =",round(P.Est.M,3), | |
"\n SD =",round(P.Est.SD,3),"\n\n") | |
# if more than one curve per indicator, gather P estimates for each indicator | |
if (All.Pairs & (I > 2)) | |
{ | |
Ind.Est <- matrix(rep(0, I), nrow = I, ncol = 1) | |
N.Each.Ind <- N.Curves / I | |
for (i in 1:N.Curves) | |
Ind.Est[Curve.Info[i,2]] <- Ind.Est[Curve.Info[i,2]] + P.Estimates[i] | |
Ind.Est[,1] <- Ind.Est[,1] / N.Each.Ind | |
Row.Labels <- paste("Indicator",1:I) | |
Col.Labels <- c("P") | |
dimnames(Ind.Est) <- list(Row.Labels, Col.Labels) | |
cat("Estimated taxon base rates for each indicator:\n\n") | |
print(round(Ind.Est,3)) | |
cat("\n") | |
cat("Summary of base rate estimates across indicators:\n M =",round(mean(Ind.Est[,1]),3), | |
"\n SD =",round(sd(Ind.Est[,1]),3),"\n\n") | |
} | |
# print base rate estimate for averaged curve if appropriate | |
if (Gr.Avg) | |
cat("Base rate estimate for averaged curve =",round(P.Avg, 3),"\n\n") | |
# assign cases to classes using base-rate classification method with estimated base rate | |
if ((Supplied.P == 0) & (!Supplied.Class)) | |
{ | |
Data.Matrix <- cbind(Data[,1:I],rep(0,N),rep(0,N)) | |
for (i in 1:I) | |
Data.Matrix[, (I + 1)] <- Data.Matrix[, (I + 1)] + Data.Matrix[,i] | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 1)]),] | |
P.Est.Series <- c(rep(2, trunc(N * P.Est.M + .5))) | |
Q.Est.Series <- c(rep(1, N - length(P.Est.Series))) | |
Data.Matrix[,(I + 2)] <- c(Q.Est.Series, P.Est.Series) | |
} | |
# assign cases to classes using base-rate classification method with supplied base rate | |
if (Supplied.P > 0) | |
{ | |
Data.Matrix <- cbind(Data[,1:I],rep(0,N),rep(0,N)) | |
for (i in 1:I) | |
Data.Matrix[, (I + 1)] <- Data.Matrix[, (I + 1)] + Data.Matrix[,i] | |
Data.Matrix <- Data.Matrix[sort.list(Data.Matrix[,(I + 1)]),] | |
P.Est.Series <- c(rep(2, trunc(N * Supplied.P + .5))) | |
Q.Est.Series <- c(rep(1, N - length(P.Est.Series))) | |
Data.Matrix[,(I + 2)] <- c(Q.Est.Series, P.Est.Series) | |
} | |
# assign cases to classes using supplied classification | |
if (Supplied.Class) | |
Data.Matrix <- cbind(Data[,1:I], rep(0,N), Data[, (I + 1)]) | |
# Construct full-sample and within-groups data files | |
Data.f <- as.matrix(Data.Matrix[,(1:I)]) | |
Nt <- sum(Data.Matrix[,(I + 2)]) - N | |
Nc <- N - Nt | |
Data.t <- Data.Matrix[(Data.Matrix[,(I + 2)] == 2),1:I] | |
Data.c <- Data.Matrix[(Data.Matrix[,(I + 2)] == 1),1:I] | |
for (i in 1:I) | |
{ | |
if (min(Data.t[,i]) == max(Data.t[,i])) Data.t[,i] <- Data.t[,i] + rnorm(Nt, 0, .0001) | |
if (min(Data.c[,i]) == max(Data.c[,i])) Data.c[,i] <- Data.c[,i] + rnorm(Nc, 0, .0001) | |
} | |
# Calculate distributions for each indicator (full sample, taxon, complement) | |
Dist.Full <- matrix(nrow = I + 2, ncol = 4) | |
Dist.Tax <- matrix(nrow = I + 2, ncol = 4) | |
Dist.Comp <- matrix(nrow = I + 2, ncol = 4) | |
Rows <- c(paste("Ind",1:I), "M", "SD") | |
Cols <- c("M","SD","Skew","Kurtosis") | |
dimnames(Dist.Full) <- list(Rows, Cols) | |
dimnames(Dist.Tax) <- list(Rows, Cols) | |
dimnames(Dist.Comp) <- list(Rows, Cols) | |
for (i in 1:I) | |
{ | |
Dist.Full[i,1] <- mean(Data.f[,i]) | |
Dist.Tax[i,1] <- mean(Data.t[,i]) | |
Dist.Comp[i,1] <- mean(Data.c[,i]) | |
Dist.Full[i,2] <- sd(Data.f[,i]) | |
Dist.Tax[i,2] <- sd(Data.t[,i]) | |
Dist.Comp[i,2] <- sd(Data.c[,i]) | |
Dist.Full[i,3] <- Skew(Data.f[,i]) | |
Dist.Tax[i,3] <- Skew(Data.t[,i]) | |
Dist.Comp[i,3] <- Skew(Data.c[,i]) | |
Dist.Full[i,4] <- Kurtosis(Data.f[,i]) | |
Dist.Tax[i,4] <- Kurtosis(Data.t[,i]) | |
Dist.Comp[i,4] <- Kurtosis(Data.c[,i]) | |
} | |
for (i in 1:4) | |
{ | |
Dist.Full[I + 1,i] <- mean(Dist.Full[1:I,i]) | |
Dist.Tax[I + 1,i] <- mean(Dist.Tax[1:I,i]) | |
Dist.Comp[I + 1,i] <- mean(Dist.Comp[1:I,i]) | |
Dist.Full[I + 2,i] <- sd(Dist.Full[1:I,i]) | |
Dist.Tax[I + 2,i] <- sd(Dist.Tax[1:I,i]) | |
Dist.Comp[I + 2,i] <- sd(Dist.Comp[1:I,i]) | |
} | |
cat("\nIndicator distributions in the full sample ( N =",N,"):\n\n") | |
print(round(Dist.Full,3)) | |
cat("\nIndicator distributions in the taxon ( n =",Nt,"):\n\n") | |
print(round(Dist.Tax,3)) | |
cat("\nIndicator distributions in the complement ( n =",Nc,"):\n\n") | |
print(round(Dist.Comp,3)) | |
# calculate validity for each indicator | |
Ind.Val <- matrix(nrow = I + 2, ncol = 2) | |
Rows <- c(paste("Ind",1:I), "M", "SD") | |
Cols <- c("Raw Units","Cohen's d") | |
dimnames(Ind.Val) <- list(Rows, Cols) | |
for (i in 1:I) | |
{ | |
Ind.Val[i,1] <- Dist.Tax[i,1] - Dist.Comp[i,1] | |
Ind.Val[i,2] <- Ind.Val[i,1] / sqrt(((Nt - 1) * Dist.Tax[i,2]^2 + (Nc - 1) * Dist.Comp[i,2]^2) | |
/ (Nt + Nc - 2)) | |
} | |
Ind.Val[I + 1,1] <- mean(Ind.Val[1:I,1]) | |
Ind.Val[I + 1,2] <- mean(Ind.Val[1:I,2]) | |
Ind.Val[I + 2,1] <- sd(Ind.Val[1:I,1]) | |
Ind.Val[I + 2,2] <- sd(Ind.Val[1:I,2]) | |
cat("\nBetween-group validity:\n\n") | |
print(round(Ind.Val,3)) | |
# Calculate, print, and summarize full-sample and within-group correlations | |
Cor.Matrix <- cor(Data.f[,1:I]) | |
Rows <- paste("Ind",1:I) | |
Cols <- paste("Ind",1:I) | |
dimnames(Cor.Matrix) <- list(Rows, Cols) | |
cat("\nIndicator correlations:\n\nFull Sample ( N =",N,"):\n\n") | |
print(round(Cor.Matrix,3)) | |
Cor.Taxon <- cor(Data.t) | |
Cor.Comp <- cor(Data.c) | |
dimnames(Cor.Taxon) <- list(Rows, Cols) | |
dimnames(Cor.Comp) <- list(Rows, Cols) | |
cat("\nTaxon ( n =",Nt,"):\n\n") | |
print(round(Cor.Taxon,3)) | |
cat("\nComplement ( n =",Nc,"):\n\n") | |
print(round(Cor.Comp,3)) | |
Cor.Summary <- matrix(nrow = 3, ncol = 2) | |
dimnames(Cor.Summary) <- list(c("Full Sample","Taxon", "Complement"), c("M", "SD")) | |
Cor.Summary[1,1] <- mean(Cor.Matrix[lower.tri(Cor.Matrix)]) | |
Cor.Summary[2,1] <- mean(Cor.Taxon[lower.tri(Cor.Taxon)]) | |
Cor.Summary[3,1] <- mean(Cor.Comp[lower.tri(Cor.Comp)]) | |
Cor.Summary[1,2] <- sd(Cor.Matrix[lower.tri(Cor.Matrix)]) | |
Cor.Summary[2,2] <- sd(Cor.Taxon[lower.tri(Cor.Taxon)]) | |
Cor.Summary[3,2] <- sd(Cor.Comp[lower.tri(Cor.Comp)]) | |
cat("\nSummary of indicator correlations:\n\n") | |
print(round(Cor.Summary,3)) | |
cat("\n") | |
# return M and SD of base rate estimates | |
return(list(P.Est.M,P.Est.SD)) | |
} | |
################################################################################################################ | |
GenData <- function(Supplied.Data, N, N.Factors = 0, Max.Trials = 5, Initial.Multiplier = 1, seed = 0) | |
{ | |
# Initialize variables and (if applicable) set random number seed (step 1) ------------------------------------- | |
k <- dim(Supplied.Data)[2] # Number of variables | |
Data <- matrix(0, nrow = N, ncol = k) # Matrix to store the simulated data | |
Distributions <- matrix(0, nrow = N, ncol = k) # Matrix to store each variable's score distribution | |
Iteration <- 0 # Iteration counter | |
Best.RMSR <- 1 # Lowest RMSR correlation | |
Trials.Without.Improvement <- 0 # Trial counter | |
if (seed != 0) set.seed(seed) # If user specified a random seed, set it | |
# Generate distribution for each variable (step 2) ------------------------------------------------------------- | |
for (i in 1:k) | |
{ | |
if (min(Supplied.Data[,i]) == max(Supplied.Data[,i])) | |
Supplied.Data[,i] <- Supplied.Data[,i] + rnorm(dim(Supplied.Data)[1], 0, .0001) | |
V <- F | |
while (!V) | |
{ | |
Distributions[,i] <- sort(sample(Supplied.Data[,i], N, replace = T)) | |
V <- min(Distributions[,i]) != max(Distributions[,i]) | |
} | |
} | |
# Calculate and store a copy of the target correlation matrix (step 3) ----------------------------------------- | |
Target.Corr <- cor(Supplied.Data) | |
Intermediate.Corr <- Target.Corr | |
# If number of latent factors was not specified, determine it through parallel analysis (step 4) --------------- | |
if (N.Factors == 0) | |
{ | |
Eigenvalues.Observed <- eigen(Intermediate.Corr)$values | |
Eigenvalues.Random <- matrix(0, nrow = 100, ncol = k) | |
Random.Data <- matrix(0, nrow = N, ncol = k) | |
for (i in 1:100) | |
{ | |
for (j in 1:k) | |
{ | |
V <- F | |
while (!V) | |
{ | |
Random.Data[,j] <- sample(Distributions[,j], size = N, replace = TRUE) | |
V <- min(Random.Data[,j]) != max(Random.Data[,j]) | |
} | |
} | |
Eigenvalues.Random[i,] <- eigen(cor(Random.Data))$values | |
} | |
Eigenvalues.Random <- apply(Eigenvalues.Random, 2, mean) # calculate mean eigenvalue for each factor | |
N.Factors <- max(1, sum(Eigenvalues.Observed > Eigenvalues.Random)) | |
} | |
# Generate random normal data for shared and unique components, initialize factor loadings (steps 5, 6) -------- | |
Shared.Comp <- matrix(rnorm(N * N.Factors, 0, 1), nrow = N, ncol = N.Factors) | |
Unique.Comp <- matrix(rnorm(N * k, 0, 1), nrow = N, ncol = k) | |
Shared.Load <- matrix(0, nrow = k, ncol = N.Factors) | |
Unique.Load <- matrix(0, nrow = k, ncol = 1) | |
# Begin loop that ends when specified number of iterations pass without improvement in RMSR correlation -------- | |
while (Trials.Without.Improvement < Max.Trials) | |
{ | |
Iteration <- Iteration + 1 | |
# Calculate factor loadings and apply to reproduce desired correlations (steps 7, 8) --------------------------- | |
Fact.Anal <- Factor.Analysis(Intermediate.Corr, Corr.Matrix = TRUE, N.Factors = N.Factors) | |
if (N.Factors == 1) Shared.Load[,1] <- Fact.Anal$loadings | |
else | |
for (i in 1:N.Factors) | |
Shared.Load[,i] <- Fact.Anal$loadings[,i] | |
Shared.Load[Shared.Load > 1] <- 1 | |
Shared.Load[Shared.Load < -1] <- -1 | |
if (Shared.Load[1,1] < 0) Shared.Load <- Shared.Load * -1 | |
for (i in 1:k) | |
if (sum(Shared.Load[i,] * Shared.Load[i,]) < 1) Unique.Load[i,1] <- | |
(1 - sum(Shared.Load[i,] * Shared.Load[i,])) | |
else Unique.Load[i,1] <- 0 | |
Unique.Load <- sqrt(Unique.Load) | |
for (i in 1:k) | |
Data[,i] <- (Shared.Comp %*% t(Shared.Load))[,i] + Unique.Comp[,i] * Unique.Load[i,1] | |
# Replace normal with nonnormal distributions (step 9) --------------------------------------------------------- | |
for (i in 1:k) | |
{ | |
Data <- Data[sort.list(Data[,i]),] | |
Data[,i] <- Distributions[,i] | |
} | |
# Calculate RMSR correlation, compare to lowest value, take appropriate action (steps 10, 11, 12) -------------- | |
Reproduced.Corr <- cor(Data) | |
Residual.Corr <- Target.Corr - Reproduced.Corr | |
RMSR <- sqrt(sum(Residual.Corr[lower.tri(Residual.Corr)] * Residual.Corr[lower.tri(Residual.Corr)]) / | |
(.5 * (k * k - k))) | |
if (RMSR < Best.RMSR) | |
{ | |
Best.RMSR <- RMSR | |
Best.Corr <- Intermediate.Corr | |
Best.Res <- Residual.Corr | |
Intermediate.Corr <- Intermediate.Corr + Initial.Multiplier * Residual.Corr | |
Trials.Without.Improvement <- 0 | |
} | |
else | |
{ | |
Trials.Without.Improvement <- Trials.Without.Improvement + 1 | |
Current.Multiplier <- Initial.Multiplier * .5 ^ Trials.Without.Improvement | |
Intermediate.Corr <- Best.Corr + Current.Multiplier * Best.Res | |
} | |
} | |
# Construct the data set with the lowest RMSR correlation (step 13) -------------------------------------------- | |
Fact.Anal <- Factor.Analysis(Best.Corr, Corr.Matrix = TRUE, N.Factors = N.Factors) | |
if (N.Factors == 1) Shared.Load[,1] <- Fact.Anal$loadings | |
else | |
for (i in 1:N.Factors) | |
Shared.Load[,i] <- Fact.Anal$loadings[,i] | |
Shared.Load[Shared.Load > 1] <- 1 | |
Shared.Load[Shared.Load < -1] <- -1 | |
if (Shared.Load[1,1] < 0) Shared.Load <- Shared.Load * -1 | |
for (i in 1:k) | |
if (sum(Shared.Load[i,] * Shared.Load[i,]) < 1) Unique.Load[i,1] <- | |
(1 - sum(Shared.Load[i,] * Shared.Load[i,])) | |
else Unique.Load[i,1] <- 0 | |
Unique.Load <- sqrt(Unique.Load) | |
for (i in 1:k) | |
Data[,i] <- (Shared.Comp %*% t(Shared.Load))[,i] + Unique.Comp[,i] * Unique.Load[i,1] | |
Data <- apply(Data, 2, scale) # standardizes each variable in the matrix | |
for (i in 1:k) | |
{ | |
Data <- Data[sort.list(Data[,i]),] | |
Data[,i] <- Distributions[,i] | |
} | |
# Report the results and return the simulated data set (step 14) ----------------------------------------------- | |
return(Data) | |
} | |
########################################################################################################### | |
# Factor analysis programs for R | |
# Written by Niels Waller, adapted for use with John Ruscio's taxometric programs | |
########################################################################################################### | |
Factor.Analysis <- function(Data, Corr.Matrix = FALSE, Max.Iter = 50, N.Factors = 0) | |
{ | |
Data <- as.matrix(Data) | |
k <- dim(Data)[2] | |
if (N.Factors == 0) N.Factors <- k | |
if (!Corr.Matrix) Cor.Matrix <- cor(Data) | |
else Cor.Matrix <- Data | |
Criterion <- .001 | |
Old.H2 <- rep(99, k) | |
H2 <- rep(0, k) | |
Change <- 1 | |
Iter <- 0 | |
Factor.Loadings <- matrix(nrow = k, ncol = N.Factors) | |
while ((Change >= Criterion) & (Iter < Max.Iter)) | |
{ | |
Iter <- Iter + 1 | |
Eig <- eigen(Cor.Matrix) | |
L <- sqrt(Eig$values[1:N.Factors]) | |
for (i in 1:N.Factors) | |
Factor.Loadings[,i] <- Eig$vectors[,i] * L[i] | |
for (i in 1:k) | |
H2[i] <- sum(Factor.Loadings[i,] * Factor.Loadings[i,]) | |
Change <- max(abs(Old.H2 - H2)) | |
Old.H2 <- H2 | |
diag(Cor.Matrix) <- H2 | |
} | |
if (N.Factors == k) N.Factors <- sum(Eig$values > 1) | |
return(list(loadings = Factor.Loadings[,1:N.Factors], factors = N.Factors)) | |
} | |
Factor.Analysis.LMode <- function(Data, Max.Iter = 50) | |
{ | |
Data <- as.matrix(Data) | |
Cor.Matrix <- cor(Data) | |
Criterion <- .001 | |
Old.H2 <- rep(99,dim(Cor.Matrix)[2]) | |
Change <- 1 | |
Iter <- 0 | |
while ((Change >= Criterion) & (Iter < Max.Iter)) | |
{ | |
Iter <- Iter + 1 | |
Eig <- eigen(Cor.Matrix) | |
L <- sqrt(Eig$values[1]) | |
Factor.Loadings <- Eig$vectors[,1] * L | |
H2 <- Factor.Loadings * Factor.Loadings | |
Change <- max(abs(Old.H2 - H2)) | |
Old.h2 <- H2 | |
diag(Cor.Matrix) <- H2 | |
} | |
Factor.Scores <- apply(Data %*% Factor.Loadings %*% solve(t(Factor.Loadings) %*% Factor.Loadings), 2, scale) | |
return(list(loadings = Factor.Loadings, scores = Factor.Scores)) | |
} | |
########################################################################################################### | |
# Program used by L-Mode to calculate best RMSR fit for two densities, allowing horizontal shifts (a) | |
########################################################################################################### | |
Fit.Densities <- function(a, data) | |
{ | |
x1 <- x2 <- data[[1]]$x | |
y1 <- data[[1]]$y | |
y2 <- data[[2]] | |
Nx <- length(x1) | |
w <- (max(y1)-min(y1)) / (max(x1)-min(x1)) | |
x1 <- x1 * w | |
x2 <- x2 * w | |
dist <- rep(0, Nx) | |
for (i in 1:Nx) | |
dist[i] <- min(sqrt((a + x1[i] - x2)^2 + (y1[i] - y2)^2)) | |
fit <- sqrt(sum(dist^2) / Nx) | |
return(fit) | |
} | |
########################################################################################################### | |
# | |
# CreateData routine for simulating data using on a handful of basic parameters | |
# | |
# Written by John Ruscio | |
# Last modified April 5, 2008 | |
# | |
########################################################################################################### | |
CreateData <- function (Str, N = 600, k = 4, P = .50, d = 2.00, r = .00, Tax.r = .00, Comp.r = .00, | |
Skew = 0, Cuts = 0) | |
{ | |
if (Skew < 0) P <- 1 - P | |
if (Str == "Dim") | |
{ | |
newdata <- matrix(rep(0, N * k), ncol = k) | |
if (r > 0) | |
R.expected <- r | |
else | |
{ | |
r.wg <- mean(c(Tax.r, Comp.r)) | |
R.expected <- (P * (1 - P) * d ^ 2 + r.wg) / (1 + (P * (1 - P) * d ^ 2)) | |
} | |
Loading.Common <- sqrt(R.expected) | |
Loading.Error <- sqrt(1 - R.expected) | |
Random.Common <- rnorm(N, mean = 0, sd = 1) | |
for (i in 1:k) | |
newdata[,i] <- Loading.Common * Random.Common + Loading.Error * rnorm(N, mean = 0, sd = 1) | |
} | |
else # create categorical data | |
{ | |
newdata <- matrix(rep(0, N * (k + 1)), ncol = (k + 1)) | |
N.Tax <- round(N * P) | |
N.Comp <- N - N.Tax | |
newdata[,(k + 1)] <- c(rep(2, N.Tax),rep(1, N.Comp)) | |
# create taxon (with desired within-group correlation) | |
Loading.Common <- sqrt(Tax.r) | |
Loading.Error <- sqrt(1 - Tax.r) | |
Random.Common <- rnorm(N.Tax, mean = 0, sd = 1) | |
for (i in 1:k) | |
newdata[1:N.Tax,i] <- Loading.Common * Random.Common + Loading.Error * rnorm(N.Tax, mean = 0, sd = 1) | |
# create complement (with desired within-group correlation) | |
Loading.Common <- sqrt(Comp.r) | |
Loading.Error <- sqrt(1 - Comp.r) | |
Random.Common <- rnorm(N.Comp, mean = 0, sd = 1) | |
for (i in 1:k) | |
newdata[(N.Tax + 1):N,i] <- Loading.Common * Random.Common + Loading.Error * rnorm(N.Comp, mean = 0, sd = 1) | |
# add in categorical separation | |
if (Skew >= 0) | |
for (i in 1:k) | |
newdata[,i] <- newdata[,i] + (d * (newdata[, (k + 1)] - 1.5)) | |
else | |
for (i in 1:k) | |
newdata[,i] <- newdata[,i] - (d * (newdata[, (k + 1)] - 1.5)) | |
} | |
if (Skew > 0) | |
for (i in 1:k) | |
newdata[,i] <- exp(newdata[,i]/Skew) | |
if (Skew < 0) | |
{ | |
for (i in 1:k) | |
newdata[,i] <- -exp(newdata[,i]/Skew) | |
if (Str == "Tax") newdata[,(k + 1)] <- 3 - newdata[,(k + 1)] | |
} | |
newdata[,1:k] <- apply(newdata[,1:k], 2, scale) | |
if (Cuts > 0) | |
{ | |
for (i in 1:k) | |
{ | |
Sorted.X <- newdata[sort.list(newdata[,i]),i] | |
Trim <- Sorted.X[round(.01 * N):round(.99 * N)] | |
Dens <- density(Trim, from = min(Trim), to = max(Trim)) | |
Dens.N <- length(Dens$y) | |
Cut.Pts <- round(seq(from = 0, to = Dens.N, length = (Cuts + 1))) | |
Cut.Vals <- vector("numeric",Cuts) | |
for (j in 1:Cuts) | |
Cut.Vals[j] <- sum(Dens$y[(Cut.Pts[j] + 1):Cut.Pts[j + 1]]) | |
Vals <- round(Cut.Vals * (N / sum(Cut.Vals))) | |
if (sum(Vals) != N) Vals[round(Cuts/2)] <- Vals[round(Cuts/2)] + (N - sum(Vals)) | |
for (j in 1:Cuts) | |
Cut.Vals[j] <- Sorted.X[sum(Vals[1:j])] | |
Cut.Vals <- c(-100,Cut.Vals,100) | |
newdata[,i] <- cut(newdata[,i], breaks = Cut.Vals)[1:N] | |
} | |
} | |
as.data.frame(newdata) | |
return(newdata) | |
} | |
########################################################################################################### | |
# | |
# Indicator.Dist | |
# | |
# This replaces the older "Validity.Est" program | |
# | |
# Written by John Ruscio | |
# Last modified November 10, 2008 | |
# | |
########################################################################################################### | |
# | |
# Program to calculate indicator distributions and correlations in the full sample as well as within groups | |
# as well as between-group separation. Uses the final column of data as a classification variable (1 = | |
# complement, 2 = taxon). Prints raw and standardized (using pooled variances, weighted by df) validity | |
# values for each indicator, followed by distributional summary statistics for each indicator and the | |
# indicator correlation matrix (in the full sample and within groups). | |
# | |
########################################################################################################### | |
Indicator.Dist <- function(Data) | |
{ | |
# Read N (# cases), I (# items), Nt (number taxon members), Nc (number complement members) | |
Data <- as.matrix(Data) | |
N <- dim(Data)[1] | |
I <- dim(Data)[2] - 1 | |
Nt <- sum(Data[,(I + 1)]) - N | |
Nc <- N - Nt | |
Data.f <- Data[,1:I] | |
Data.t <- Data[(Data[,(I + 1)] == 2),1:I] | |
Data.c <- Data[(Data[,(I + 1)] == 1),1:I] | |
for (i in 1:I) | |
{ | |
if (min(Data.t[,i]) == max(Data.t[,i])) Data.t[,i] <- Data.t[,i] + rnorm(Nt, 0, .0001) | |
if (min(Data.c[,i]) == max(Data.c[,i])) Data.c[,i] <- Data.c[,i] + rnorm(Nc, 0, .0001) | |
} | |
# Calculate distributions for each indicator (full sample, taxon, complement) | |
Dist.Full <- matrix(nrow = I + 2, ncol = 4) | |
Dist.Tax <- matrix(nrow = I + 2, ncol = 4) | |
Dist.Comp <- matrix(nrow = I + 2, ncol = 4) | |
Rows <- c(paste("Ind",1:I), "M", "SD") | |
Cols <- c("M","SD","Skew","Kurtosis") | |
dimnames(Dist.Full) <- list(Rows, Cols) | |
dimnames(Dist.Tax) <- list(Rows, Cols) | |
dimnames(Dist.Comp) <- list(Rows, Cols) | |
for (i in 1:I) | |
{ | |
Dist.Full[i,1] <- mean(Data.f[,i]) | |
Dist.Tax[i,1] <- mean(Data.t[,i]) | |
Dist.Comp[i,1] <- mean(Data.c[,i]) | |
Dist.Full[i,2] <- sd(Data.f[,i]) | |
Dist.Tax[i,2] <- sd(Data.t[,i]) | |
Dist.Comp[i,2] <- sd(Data.c[,i]) | |
Dist.Full[i,3] <- Skew(Data.f[,i]) | |
Dist.Tax[i,3] <- Skew(Data.t[,i]) | |
Dist.Comp[i,3] <- Skew(Data.c[,i]) | |
Dist.Full[i,4] <- Kurtosis(Data.f[,i]) | |
Dist.Tax[i,4] <- Kurtosis(Data.t[,i]) | |
Dist.Comp[i,4] <- Kurtosis(Data.c[,i]) | |
} | |
for (i in 1:4) | |
{ | |
Dist.Full[I + 1,i] <- mean(Dist.Full[1:I,i]) | |
Dist.Tax[I + 1,i] <- mean(Dist.Tax[1:I,i]) | |
Dist.Comp[I + 1,i] <- mean(Dist.Comp[1:I,i]) | |
Dist.Full[I + 2,i] <- sd(Dist.Full[1:I,i]) | |
Dist.Tax[I + 2,i] <- sd(Dist.Tax[1:I,i]) | |
Dist.Comp[I + 2,i] <- sd(Dist.Comp[1:I,i]) | |
} | |
cat("\nIndicator distributions in the full sample ( N =",N,"):\n\n") | |
print(round(Dist.Full,3)) | |
cat("\nIndicator distributions in the taxon ( n =",Nt,"):\n\n") | |
print(round(Dist.Tax,3)) | |
cat("\nIndicator distributions in the complement ( n =",Nc,"):\n\n") | |
print(round(Dist.Comp,3)) | |
# calculate validity for each indicator | |
Ind.Val <- matrix(nrow = I + 2, ncol = 2) | |
Rows <- c(paste("Ind",1:I), "M", "SD") | |
Cols <- c("Raw Units","Cohen's d") | |
dimnames(Ind.Val) <- list(Rows, Cols) | |
for (i in 1:I) | |
{ | |
Ind.Val[i,1] <- Dist.Tax[i,1] - Dist.Comp[i,1] | |
Ind.Val[i,2] <- Ind.Val[i,1] / sqrt(((Nt - 1) * Dist.Tax[i,2]^2 + (Nc - 1) * Dist.Comp[i,2]^2) | |
/ (Nt + Nc - 2)) | |
} | |
Ind.Val[I + 1,1] <- mean(Ind.Val[1:I,1]) | |
Ind.Val[I + 1,2] <- mean(Ind.Val[1:I,2]) | |
Ind.Val[I + 2,1] <- sd(Ind.Val[1:I,1]) | |
Ind.Val[I + 2,2] <- sd(Ind.Val[1:I,2]) | |
cat("\nBetween-group validity:\n\n") | |
print(round(Ind.Val,3)) | |
# Calculate, print, and summarize full-sample and within-group correlations | |
Cor.Matrix <- cor(Data.f[,1:I]) | |
Rows <- paste("Ind",1:I) | |
Cols <- paste("Ind",1:I) | |
dimnames(Cor.Matrix) <- list(Rows, Cols) | |
cat("\nIndicator correlations:\n\nFull Sample ( N =",N,"):\n\n") | |
print(round(Cor.Matrix,3)) | |
Cor.Taxon <- cor(Data.t) | |
Cor.Comp <- cor(Data.c) | |
dimnames(Cor.Taxon) <- list(Rows, Cols) | |
dimnames(Cor.Comp) <- list(Rows, Cols) | |
cat("\nTaxon ( n =",Nt,"):\n\n") | |
print(round(Cor.Taxon,3)) | |
cat("\nComplement ( n =",Nc,"):\n\n") | |
print(round(Cor.Comp,3)) | |
Cor.Summary <- matrix(nrow = 3, ncol = 2) | |
dimnames(Cor.Summary) <- list(c("Full Sample","Taxon", "Complement"), c("M", "SD")) | |
Cor.Summary[1,1] <- mean(Cor.Matrix[lower.tri(Cor.Matrix)]) | |
Cor.Summary[2,1] <- mean(Cor.Taxon[lower.tri(Cor.Taxon)]) | |
Cor.Summary[3,1] <- mean(Cor.Comp[lower.tri(Cor.Comp)]) | |
Cor.Summary[1,2] <- sd(Cor.Matrix[lower.tri(Cor.Matrix)]) | |
Cor.Summary[2,2] <- sd(Cor.Taxon[lower.tri(Cor.Taxon)]) | |
Cor.Summary[3,2] <- sd(Cor.Comp[lower.tri(Cor.Comp)]) | |
cat("\nSummary of indicator correlations:\n\n") | |
print(round(Cor.Summary,3)) | |
cat("\n\n") | |
} | |
########################################################################################################### | |
# Skewness and kurtosis programs for R | |
# Written by John Ruscio | |
########################################################################################################### | |
Skew <- function(x) | |
{ | |
N <- length(x) | |
m1 <- mean(x) | |
m2 <- sum((x - m1) ^ 2) / N | |
m3 <- sum((x - m1) ^ 3) / N | |
g1 <- m3 / (m2 ^ (3/2)) | |
G1 <- (sqrt(N * (N - 1)) / (N - 2)) * g1 | |
return(G1) | |
} | |
Kurtosis <- function(x) | |
{ | |
N <- length(x) | |
m1 <- mean(x) | |
m2 <- sum((x - m1) ^ 2) / N | |
m3 <- sum((x - m1) ^ 3) / N | |
m4 <- sum((x - m1) ^ 4) / N | |
g2 <- (m4 / (m2 ^ 2)) - 3 | |
G2 <- ((N - 1) / ((N - 2) * (N - 3))) * ((N + 1) * g2 + 6) | |
return(G2) | |
} | |
########################################################################################################### | |
# | |
# P.Classify program to implement the base-rate classification technique | |
# | |
# Written by John Ruscio | |
# Last modified November 13, 2011 | |
# | |
########################################################################################################### | |
P.Classify <- function(x, P.est, cols = 0) | |
{ | |
N <- dim(x)[1] | |
if (cols[1] == 0) cols <- 1:dim(x)[2] | |
s <- apply(x[,cols], 1, sum) | |
y <- cbind(x, rep(0,N), 1:N, s) | |
N.Tax <- round(P.est * N) | |
N.Comp <- N - N.Tax | |
Classes <- c(rep(1,N.Comp), rep(2,N.Tax)) | |
last.col <- dim(y)[2] | |
y <- y[sort.list(y[,last.col]),] | |
y[,last.col - 2] <- Classes | |
if (max(y[(y[,last.col - 2] == 1), last.col]) == min(y[(y[,last.col - 2] == 2), last.col])) | |
{ | |
tied.score <- max(y[(y[,last.col - 2] == 1), last.col]) | |
tied.cases <- (y[,last.col] == tied.score) | |
n.tied1 <- sum((y[,last.col - 2] == 1) & (tied.cases)) | |
n.tied2 <- sum((y[,last.col - 2] == 2) & (tied.cases)) | |
tied.min <- tied.score == min(y[,last.col]) | |
tied.max <- tied.score == max(y[,last.col]) | |
if (n.tied1 > n.tied2) | |
if (!tied.max) y[tied.cases, last.col - 2] <- 1 | |
else y[tied.cases, last.col - 2] <- 2 | |
else | |
if (!tied.min) y[tied.cases, last.col - 2] <- 2 | |
else y[tied.cases, last.col - 2] <- 1 | |
} | |
y <- y[sort.list(y[,last.col - 1]),] | |
return(cbind(x, y[,last.col - 2])) | |
} | |
########################################################################################################### | |
# | |
# Remove.Missing program to remove missing data (listwise deletion) | |
# | |
# Written by John Ruscio | |
# Last modified May 12, 2010 | |
# | |
########################################################################################################### | |
Remove.Missing <- function(x) | |
{ | |
N <- dim(x)[1] | |
k <- dim(x)[2] | |
complete <- rep(T, N) | |
for (i in 1:N) | |
for (j in 1:k) | |
if (is.na(x[i,j])) complete[i] <- F | |
x.k <- x[complete,] | |
n <- dim(x.k)[1] | |
if (n < N) | |
cat("\n*** ",n," out of ",N," cases (",round(100*n/N,2),"%) retained for analysis; ", | |
N-n," cases (",round(100*(N-n)/N,2),"%) contained missing data.\n", sep = "") | |
return(x.k) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment