#Useful R functions for psychometrics and personality research, by William Revelle -- copied from http://personality-project.org/r/useful.r
#useful functions for psychometrics and personality research
#a growing compilation of simple functions that I have found useful for doing personality research
#last updated Feb 14, 2005
#by William Revelle
#includes
#alpha.scale #find coefficient alpha for a scale and a dataframe of items
#describe give means, sd, skew, n, and se
#summ.stats #basic summary statistics by a grouping variable
#error.crosses (error bars in two space)
#skew find skew
#panel.cor taken from the examples for pairs
#pairs.panels adapted from panel.cor -- gives a splom, histogram, and correlation matrix
#multi.hist #plot multiple histograms
#correct.cor #given a correlation matrix and a vector of reliabilities, correct for reliability
#fisherz #convert pearson r to fisher z
#paired.r #test for difference of dependent correlations
#count.pairwise #count the number of good cases when doing pairwise analysis
#eigen.loadings #convert eigen vector vectors to factor loadings by unnormalizing them
#principal #yet another way to do a principal components analysis -- brute force eignvalue decomp
#factor.congruence #find the factor congruence coeffiecints
#factor.model #given a factor model, find the correlation matrix
#factor.residuals #how well does it fit?
#factor.rotate # rotate two columns of a factor matrix by theta (in degrees)
#phi2poly #convert a matrix of phi coefficients to polychoric correlations
#define a function to calculate coefficient alpha
alpha.scale=function (x,y) #find coefficient alpha given a scale and a data.frame of the items in the scale
{
n=length(y) #number of variables
Vi=sum(diag(var(y,na.rm=TRUE))) #sum of item variance
Vt=var(x,na.rm=TRUE) #total test variance
((Vt-Vi)/Vt)*(n/(n-1))} #alpha
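#example (a sketch with made-up data): alpha for four simulated items
#items <- data.frame(matrix(rnorm(400),ncol=4)) #hypothetical, uncorrelated items
#alpha.scale(rowSums(items),items) #alpha should be near 0 for uncorrelated items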
#general descriptive statistics
describe <- function (x, digits = 2,na.rm=TRUE) #basic stats after dropping non-numeric data
{ #first, define a local function
valid <- function(x) {
return(sum(!is.na(x)))
}
if (is.vector(x) ) #do it for vectors or
{
stats = matrix(rep(NA,6),ncol=6) #create a temporary array
stats[1, 1] <- mean(x, na.rm=na.rm )
stats[1, 2] <- median(x,na.rm=na.rm )
stats[1, 3] <- min(x, na.rm=na.rm )
stats[1, 4] <- max(x, na.rm=na.rm )
stats[1, 5] <- skew(x,na.rm=na.rm )
stats[1, 6] <- valid(x )
len <- 1;
}
else {len = dim(x)[2] #do it for matrices or data.frames
stats = matrix(rep(NA,len*6),ncol=6) #create a temporary array
for (i in 1:len) {
if (is.numeric(x[,i])) { #just do this for numeric data
stats[i, 1] <- mean(x[,i], na.rm=na.rm )
stats[i, 2] <- median(x[,i],na.rm=na.rm )
stats[i, 3] <- min(x[,i], na.rm=na.rm )
stats[i, 4] <- max(x[,i], na.rm=na.rm )
stats[i, 5] <- skew(x[,i],na.rm=na.rm )
stats[i, 6] <- valid(x[,i] )
}
}
}
if (len==1) {sds <- sd(x,na.rm=TRUE)} else {sds <- sapply(as.data.frame(x),sd,na.rm=TRUE)} #sd() is no longer defined for whole data frames
temp <- data.frame(n = stats[,6],mean = stats[,1], sd = sds, median = stats[,2],min= stats[,3],max=stats[,4], range=stats[,4]-stats[,3],skew = stats[,5])
answer <- round(data.frame(V=seq_len(len),temp, se = temp$sd/sqrt(temp$n)), digits)
return(answer)
}
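#example (using the built-in attitude data set):
#describe(attitude) #n, mean, sd, median, min, max, range, skew, and se for all seven variables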
# find basic summary statistics by groups #adapted from "Kickstarting R"
summ.stats <- function (x,y) { #data are x, grouping variable is y
means <- tapply(x, y, mean, na.rm=TRUE)
sds <- tapply(x,y, sd, na.rm = TRUE)
valid <- function (x) {return(sum(!is.na(x)) )}
ns <- tapply(x,y, valid )
se=sds/sqrt(ns)
answer <-data.frame(mean=means,sd=sds,se=se,n=ns)
return(answer) #return the result explicitly so it is visible when not assigned
}
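#example (using the built-in mtcars data set):
#summ.stats(mtcars$mpg,mtcars$cyl) #mean, sd, se, and n of gas mileage by number of cylinders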
#error bars in a two dimensional plot
old.error.crosses <-function (x,y,z) # x and y are data frames with z points
{for (i in 1:z)
{xcen <- x$mean[i]
ycen <- y$mean[i]
xse <- x$se[i]
yse <- y$se[i]
arrows(xcen-xse,ycen,xcen+xse,ycen,length=.2, angle = 90, code=3,col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
arrows(xcen,ycen-yse,xcen,ycen+yse,length=.2, angle = 90, code=3,col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
}
}
#error bars in a two dimensional plot with labels
error.crosses <-function (x,y,labels=NULL,pos=NULL,arrow.len=.2,...) # x and y are data frames with mean and se for each point
{z <- dim(x)[1]
if (length(pos)==0) {locate<-rep(1,z)} else {locate <- pos}
if (length(labels)==0) lab<- rep("",z) else lab <-labels
for (i in 1:z)
{xcen <- x$mean[i]
ycen <- y$mean[i]
xse <- x$se[i]
yse <- y$se[i]
arrows(xcen-xse,ycen,xcen+xse,ycen,length=arrow.len, angle = 90, code=3,col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
arrows(xcen,ycen-yse,xcen,ycen+yse,length=arrow.len, angle = 90, code=3,col = par("fg"), lty = NULL, lwd = par("lwd"), xpd = NULL)
text(xcen,ycen,labels=lab[i],pos=locate[i],cex=1,offset=arrow.len+1) #puts in labels for all points
}
}
#examples
# plot(mPA,mNA,pch=symb[condit],cex=4.5,col=colors[condit],bg=colors[condit],main="Postive vs. Negative Affect",xlim=c(-1,1.5),ylim=c(-1,1.5),xlab="Positive Affect",ylab="Negative Affect")
#error.crosses(paf.stats,naf.stats) #put in x and y error bars!
#find a vector of skews for a data matrix
skew= function (x, na.rm = FALSE)
{
if (na.rm) x <- x[!is.na(x)] #remove missing values
sum((x - mean(x))^3)/(length(x) * sd(x)^3) #calculate skew
}
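#example: the exponential distribution has a theoretical skew of 2
#skew(rexp(10000)) #should be close to 2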
#some fancy graphics -- adapted from the examples in help(pairs)
panel.cor.scale <- function(x, y, digits=2, prefix="", cex.cor)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r = (cor(x, y,use="pairwise"))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt) #use the supplied cex.cor when given
text(0.5, 0.5, txt, cex = cex.cor * abs(r))
}
panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r = (cor(x, y,use="pairwise"))
txt <- format(c(round(r,digits), 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt) #use the supplied cex.cor when given
text(0.5, 0.5, txt, cex = cex.cor )
}
panel.hist <- function(x, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col="cyan", ...)
}
pairs.panels <- function (x,y,smooth=TRUE,scale=FALSE,digits=2) #combines a splom, histograms, and correlations
{if (smooth) {
if (scale) {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor.scale,lower.panel=panel.smooth)
} else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth)
}
} else { #smooth is not true
if (scale) {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor.scale)
} else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor) }
} #end of else (smooth)
} #end of function
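#example (using the built-in attitude data set):
#pairs.panels(attitude) #scatter plots below the diagonal, histograms on it, correlations above it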
multi.hist <- function(x,...) {nvar <- dim(x)[2] #number of variables
nsize=trunc(sqrt(nvar))+1 #size of graphic
old.par <- par(no.readonly = TRUE) # all par settings which can be changed
par(mfrow=c(nsize,nsize)) #set new graphic parameters
for (i in 1:nvar) {
name=names(x)[i] #get the names for the variables
hist(x[,i],main=name,xlab=name,...) } #draw the histograms for each variable
on.exit(par(old.par)) #set the graphic parameters back to the original
}
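#example (using the built-in attitude data set):
#multi.hist(attitude) #seven histograms in a 3 x 3 layout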
#function to do lowess fitting if we have missing data
lowess.na <- function(x, y = NULL, f = 2/3,...) { #do lowess with missing data
x1 <- subset(x,(!is.na(x)) &(!is.na(y)))
y1 <- subset(y, (!is.na(x)) &(!is.na(y)))
lowess(x1,y1,f, ...) #return the lowess fit based on complete cases only
}
#function to replace upper triangle of matrix with unattenuated correlations, and the diagonal with reliabilities
#input is a correlation matrix and a 1 x n matrix (row vector) of reliabilities
correct.cor <- function(x,y) { n=dim(x)[1]
{ x[1,1] <- y[1,1]
for (i in 2:n) {
x[i,i] <- y[1,i] #put reliabilities on the diagonal
k=i-1
for (j in 1:k) {
x[j,i] <- x[j,i]/sqrt(y[1,i]*y[1,j]) } #fix the upper triangular part of the matrix
}
return(x) }}
#difference of dependent (paired) correlations (following Steiger,J., 1980, Tests for comparing elements of a correlation matrix. Psychological Bulletin, 87, 245-251)
paired.r <- function(xy,xz,yz,n) {
diff <- xy-xz
determin=1-xy*xy - xz*xz - yz*yz + 2*xy*xz*yz
av=(xy+xz)/2
cube= (1-yz)*(1-yz)*(1-yz)
t2 = diff * sqrt((n-1)*(1+yz)/(((2*(n-1)/(n-3))*determin+av*av*cube)))
return(t2)
}
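#example: does y (r=.5) correlate more with x than z does (r=.3), given r(yz)=.4 and n=100?
#paired.r(.5,.3,.4,100) #returns Steiger's t statistic, with n-3 degrees of freedom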
fisherz <- function(rho) {0.5*log((1+rho)/(1-rho)) } #converts r to z
#a function to call John Fox's polychor function multiple times to form a matrix of polychoric correlations
#needs the polycor package: library(polycor)
polychor.matrix <-function(x,y=NULL) {
sizex <- dim(x)[2]
if (((is.data.frame(y))|(is.matrix(y)))) sizey<-dim(y)[2]
else sizey <- dim(x)[2]
result<-matrix(1,nrow=sizey,ncol=sizex) #create the output array
xnames<- names(x)
colnames(result)<- names(x)
if (((is.data.frame(y))|(is.matrix(y)))) rownames(result) <- names(y)
else rownames(result) <- names(x)
if (!((is.data.frame(y))|(is.matrix(y)))) { #default case returns a square matrix
for (i in 2: sizex ) {
for (j in 1:( i-1)) {
result[j,i]<-polychor(table(x[,j],x[,i]) )
result[i,j] <- result[j,i]
}
}
}
else { #if y is input, then return the rectangular array
for (i in 1: sizex ) {
for (j in 1:sizey) {
result[j,i]<-polychor(table(x[,i],y[,j]) )
}
} }
return (result) }
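#example (a sketch, needs library(polycor) and dichotomous data):
#d <- data.frame(a=rbinom(100,1,.5),b=rbinom(100,1,.5),c=rbinom(100,1,.5)) #hypothetical 0/1 items
#polychor.matrix(d) #3 x 3 matrix of polychoric correlations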
#count.pairwise -- count the number of good cases when doing pairwise analysis (e.g.,for correlations)
count.pairwise <- function(x,y=NULL) { #y defaults to NULL so a single data frame or matrix may be given
sizex <- dim(x)[2]
if ((is.data.frame(y))|(is.matrix(y))) sizey<-dim(y)[2]
else sizey <- dim(x)[2]
result<-matrix(1,nrow=sizey,ncol=sizex) #create the output array
xnames<- names(x)
colnames(result)<- names(x)
if (((is.data.frame(y))|(is.matrix(y)))) rownames(result) <- names(y)
else rownames(result) <- names(x)
if (!((is.data.frame(y))|(is.matrix(y)))) { #default case returns a square matrix
for (i in 2: sizex ) {
for (j in 1:( i-1)) {
result[j,i]<- sum((!is.na(x[,j]))&(!is.na(x[,i])))
result[i,j] <- result[j,i]
}
}
}
else { #if y is input, then return the rectangular array
for (i in 1: sizex ) {
for (j in 1:sizey) {
result[j,i]<-sum((!is.na(x[,i]))&(!is.na(y[,j])))
}
} }
return (result) }
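#example (using the built-in airquality data, which has missing values):
#count.pairwise(airquality) #pairwise n for every pair of variables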
#minor routines to manipulate matrices for factor analysis
#output from eigen is two lists: $values and $vectors
#
eigen.loadings <- function (x) { #convert eigen vectors to loadings by unnormalizing them
n <- length(x$values)
x$values[ x$values<0]<- 0
fix<-sqrt(x$values)
result<- x$vectors * rep(fix, each = n) #scale each eigenvector by the square root of its eigenvalue
return(result)}
#do a principal components analysis by eigenvalue decomposition and then showing the biggest nfactors
principal <-
function(r,nfactors=0) {
n <- dim(r)[1]
result<-list(values=c(rep(0,n)),loadings=matrix(rep(0,n*n),ncol=n),residual=matrix(rep(0,n*n),ncol=n),fit=0)
eigens <-eigen(r) #call the eigen value decomposition routine
result$values <-eigens$values
result$loadings <-eigen.loadings(eigens)
rownames(result$loadings) <- rownames(r)
if (nfactors>0) result$loadings<-result$loadings[,1:nfactors]
residual<- factor.residuals(r,result$loadings)
r2<- sum(r*r)
rstar2<- sum(residual*residual)
result$residual<-residual
result$fit<- 1-rstar2/r2
return(result)
}
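#example (using the built-in Harman74.cor correlation matrix of 24 ability tests):
#pc4 <- principal(Harman74.cor$cov,4) #values, loadings, residuals, and fit for four components
#pc4$fit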
#function to find factor congruence coefficients
factor.congruence <- function (x,y) {
nx<- dim(x)[2]
ny<- dim(y)[2]
cross<- t(y) %*% x #inner product will have dim of ny * nx
sumsx<- sqrt(1/diag(t(x)%*%x))
sumsy<- sqrt(1/diag(t(y)%*%y))
result<- matrix(rep(0,nx*ny),ncol=nx)
result<- sumsy * (cross * rep(sumsx, each = ny))
return(result)
}
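#example (a sketch): congruence of two- and four-component solutions of the same matrix
#f2 <- principal(Harman74.cor$cov,2)$loadings
#f4 <- principal(Harman74.cor$cov,4)$loadings
#factor.congruence(f2,f4) #4 x 2 matrix of congruence coefficients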
#estimate a correlation matrix given a particular factor/component model
factor.model <- function(x) {
result<- x%*%t(x)
return (result)}
#find residuals
factor.residuals <- function(r, f) {
rstar<- r- factor.model(f)
return(rstar)}
factor.fit <-function (r,f) {
r2 <-sum( r*r)
rstar <- factor.residuals(r,f)
rstar2 <- sum(rstar*rstar)
fit<- 1- rstar2/r2
return(fit) }
factor.rotate <-function(f,angle,col1,col2) {
nvar<- dim(f)[2]
rot<- matrix(rep(0,nvar*nvar),ncol=nvar)
rot[cbind(1:nvar, 1:nvar)] <- 1
theta<- 2*pi*angle/360
rot[col1,col1]<-cos(theta)
rot[col2,col2]<-cos(theta)
rot[col1,col2]<- -sin(theta)
rot[col2,col1]<- sin(theta)
result <- f %*% rot
return(result) }
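#example: rotate the first two components of a principal components solution by 30 degrees
#f <- principal(Harman74.cor$cov,2)$loadings
#factor.rotate(f,30,1,2)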
#phi coefficient from a 2 x 2 table (function probably already exists but I can't find it)
phi <- function(t) { #t is a 2 x 2 matrix
r.sum<-rowSums(t)
c.sum <-colSums(t)
total<-sum(r.sum)
r.sum<- r.sum/total
c.sum <-c.sum/total
v<- prod(r.sum,c.sum)
phi <- (t[1,1]/total- c.sum[1]*r.sum[1])/sqrt(v)
return(phi)
}
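#example: a 2 x 2 table with cell counts 30, 20, 20, 30
#phi(matrix(c(30,20,20,30),2,2)) #phi = .2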
#an alternative function to do the same, need to combine into one
phi1 <- function(x) { # returns the phi coefficient; x is a set of 4 numbers, e.g. c(1,2,3,4)
xm <- matrix(x,ncol=2)
total <- sum(xm)
xp <- xm/total
HR <- xp[1,1] + xp[1,2]
SR <- xp[1,1]+ xp[2,1]
VP <- xp[1,1]
phi <- (VP - HR*SR) /sqrt(HR*(1-HR)*(SR)*(1-SR))
return(phi)
}
#convert a phi coefficient to the corresponding polychoric correlation
#needs the polychor function from John Fox's polycor package
phi2poly <- function(ph,cp,cc) {
#ph is the phi coefficient
#cp is the selection ratio of the predictor
#cc is the success rate of the criterion
r.marg<-rep(0,2)
c.marg<- rep(0,2)
p<-array(rep(0,4),dim=c(2,2))
r.marg[1]<- cp
r.marg[2]<- 1 -cp
c.marg[1]<- cc
c.marg[2]<- 1-cc
p[1,1]<- r.marg[1]*c.marg[1]+ ph*sqrt(prod(r.marg,c.marg))
p[2,2]<- r.marg[2]*c.marg[2]+ ph*sqrt(prod(r.marg,c.marg))
p[1,2]<- r.marg[1]*c.marg[2]- ph*sqrt(prod(r.marg,c.marg))
p[2,1]<- r.marg[2]*c.marg[1]- ph*sqrt(prod(r.marg,c.marg))
result<-polychor(p )
return(result)}
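#example (needs library(polycor)): a phi of .3 with 50% cuts on both variables
#phi2poly(.3,.5,.5) #the polychoric estimate should be noticeably larger than the phi of .3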
psycho.demo <- function() {
#simulate correlation matrix with variable cut points -- psychometric demo
#make up some correlations with different cut points
cuts <-c(-2,-1,0,1,2)
nsub<-1000 #how many subjects
r<-.6 #population correlation of observed with theta
theta <-rnorm(nsub) #make up some random normal theta scores
err<- rnorm(nsub) #random normal error scores
obser<- theta*(r) + err*sqrt(1-r*r) #observed = T + E
#convert to 0/1 with different values of cuts
trunc<- matrix(rep(obser,length(cuts)),ncol=length(cuts))
for (i in 1:length(cuts)) {
trunc[obser>cuts[i],i]<- 1
trunc[obser< cuts[i],i]<- 0}
d.mat<- data.frame(theta,obser,trunc) #combine into a data frame
trunc.cor<- cor(d.mat) #find the correlations
freq <- colMeans(d.mat) #find the frequencies of scores (mean() of a data frame is defunct in current R)
#now, convert the upper diagonal to polychorics using John Fox's polychor and my phi2poly
for (i in 4:length(d.mat)) {
for (j in 3:(i-1)) { #only the off-diagonal elements
trunc.cor[j,i]<- phi2poly(trunc.cor[i,j],freq[i],freq[j])
}}
return(trunc.cor) #return the mixed phi/polychoric correlation matrix
}
#quick demonstration of a random walk process
#to show range of variability over trials and replications
#the "confidence lines" represent 2 sd of theoretical sum
random.walk <- function(length=200,n=20,effect=0) {
num <- seq(1:length)
colors <- rainbow(n) #select colors
x <- cumsum(rnorm(length,effect))/num
plot(x,ylim = c(-2.5,2.5),typ="b",col=colors[1],main="Sample means as function of sample size",ylab="sample mean",xlab="sample size")
for (i in 2:n) {
x <- cumsum(rnorm(length,effect))/num #find the next line
points(x,col=colors[i],typ="b") #draw it
}
curve(2/sqrt(x),1,length,101,add=TRUE) #upper confidence region
curve(-2/sqrt(x),1,length,101,add=TRUE) #lower confidence region
text(length/2,2,paste("effect size is",effect)) #label the plot with the effect size
}
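#example: random.walk() #20 running means of samples from N(0,1) with 2/sqrt(n) confidence curves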
#read from the clipboard for the Mac (thanks to Ken Knoblauch for this hint)
#for PCs, use read.table(file("clipboard"))
read.clipboard<-function(header=TRUE,...) {
MAC<-Sys.info()[1]=="Darwin" #are we on a Mac using the Darwin system?
if (!MAC ) {if (header) read.clipboard<-read.table(file("clipboard"),header=TRUE,...)
else read.clipboard<-read.table(file("clipboard"),...) }
else {
if (header) read.clipboard<- read.table(pipe("pbpaste"),header=TRUE,...)
else read.clipboard<- read.table(pipe("pbpaste") ,...)}
}
read.clipboard.csv<-function(header=TRUE,sep=',',...) { #read.clipboard for comma-separated clipboard data
MAC<-Sys.info()[1]=="Darwin" #are we on a Mac using the Darwin system?
if (!MAC ) {if (header) read.clipboard<-read.table(file("clipboard"),header=TRUE,sep=sep,...)
else read.clipboard<-read.table(file("clipboard"),sep=sep,...) }
else {
if (header) read.clipboard<- read.table(pipe("pbpaste"),header=TRUE,sep=sep,...)
else read.clipboard<- read.table(pipe("pbpaste"),sep=sep,...)}
}
#functions to help analyze data from correlation matrices
#used as part of the Synthetic Aperture Personality Assessment (SAPA) project
#
#extract cluster keys from a factor/principal components loadings matrix
factor2cluster <-
function(loads,cut=0) { #assign each variable to the factor on which it loads most highly
#(a sketch of the intended keying rule: +1 or -1 for the largest absolute loading above cut, 0 otherwise)
loads <- as.matrix(loads)
nvar <- dim(loads)[1]
nfact <- dim(loads)[2]
keys <- matrix(rep(0,nvar*nfact),ncol=nfact) #start with all zeros
biggest <- apply(abs(loads),1,which.max) #which factor has the largest absolute loading for each variable
for (i in 1:nvar) {
if (abs(loads[i,biggest[i]]) > cut) keys[i,biggest[i]] <- sign(loads[i,biggest[i]]) }
rownames(keys) <- rownames(loads)
colnames(keys) <- colnames(loads)
return(keys)
}
#find the correlation matrix of scales made up of items defined in a keys matrix (e.g., extracted by factor2cluster)
#takes as input the keys matrix as well as a correlation matrix of all the items
cluster.cor <-
function(keys,r.mat,correct=TRUE) { #function to extract clusters according to the key vector
#default is to correct for attenuation and show this above the diagonal
if(!is.matrix(keys)) keys <- as.matrix(keys) #keys are sometimes a data frame - must be a matrix
covar <- t(keys) %*% r.mat %*% keys #matrix algebra is our friend
var <- diag(covar)
sd.inv <- 1/sqrt(var)
ident.sd <- diag(sd.inv,ncol = length(sd.inv))
cluster.correl <- ident.sd %*% covar %*% ident.sd
key.var <- diag(t(keys) %*% keys)
key.alpha <- ((var-key.var)/var)*(key.var/(key.var-1))
key.alpha[is.nan(key.alpha)] <- 1 #if only 1 variable to the cluster, then alpha is undefined
key.alpha[!is.finite(key.alpha)] <- 1
colnames(cluster.correl) <- names(key.alpha)
rownames(cluster.correl) <- names(key.alpha)
if (correct) {cluster.corrected <- correct.cor(cluster.correl,t(key.alpha))} else {cluster.corrected <- NULL} #correct for attenuation only if requested
return(list(cor=cluster.correl,sd=sqrt(var),corrected= cluster.corrected))
}
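#example (a sketch): key the 24 ability tests by their largest component loading, then correlate the scales
#pc4 <- principal(Harman74.cor$cov,4)$loadings
#keys <- factor2cluster(pc4)
#cluster.cor(keys,Harman74.cor$cov) #scale intercorrelations, corrected for attenuation above the diagonal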
#a function to extract subsets of variables (a and b) from a correlation matrix m
#and find the multiple correlation beta weights + R2 of the a set predicting the b set
mat.regress <- function(m,a,b) {
#first reorder the matrix to select the right variables
nm <- dim(m)[1]
t.mat <- matrix(0,ncol=nm,nrow=nm)
ab <- c(a,b)
numa <- length(a)
numb <- length(b)
nab <- numa+numb
for (i in 1:nab) {
t.mat[i,ab[i]] <- 1 }
reorder <- t.mat %*% m %*% t(t.mat)
a.matrix <- reorder[1:numa,1:numa]
b.matrix <- reorder[1:numa,(numa+1):nab]
model.mat <- solve(a.matrix,b.matrix) #solve the equation bY~aX
if (length(b) >1 ) { rownames(model.mat) <- rownames(m)[a]
colnames(model.mat) <- colnames(m)[b]
r2 <- colSums(model.mat * b.matrix) }
else { r2 <- sum(model.mat * b.matrix)
names(model.mat) <- rownames(m)[a]
names(r2) <- colnames(m)[b]}
mat.regress <- list(beta=model.mat,r2=r2)
return(mat.regress)
}
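#example (using the built-in attitude data set): predict rating (variable 1) from variables 2:4
#mat.regress(cor(attitude),c(2,3,4),1) #standardized betas and R2 from the correlation matrix alone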