dgrapov/Andrews plots and encoding.r

## Andrews plots and encoding.r
#needed packages
library(andrews)
library(ggplot2)

#get some data here I use "mtcars"
data<-mtcars
fct<-as.factor(mtcars$cyl) # factor for intial grouping


#get andrews encoding (x and y coords)
make.andrews<-function (df, type = 1, clr = NULL, step = 100, ymax = 10,...)
{
    #add objects to store x and y
	storelist<-list()

	if (step < 1)
        step <- 100
    n <- dim(df)[1]
    m <- dim(df)[2]
	#plot.new()
    if ((type == 1) | (type == 2) | (type == 4)) {
        xmin <- (-pi)
        xmax <- pi
    }
    else if (type == 3) {
        xmin <- 0
        xmax <- 4 * pi
    }
    # plot.window(c(xmin, xmax), c(-ymax, ymax))
    # title(main = main, sub = sub)
    # axis(1)
    # axis(2)
    # box()
    # lines(c(xmin, xmax), c(0, 0))
    for (i in 1:m) df[, i] <- normalize(df[, i])
    clx <- rep(1, n)
    if (!is.null(clr)) {
        if (!is.numeric(df[, clr])) {
            for (i in 1:n) {
                for (a in 1:nlevels(df[, clr])) if (levels(df[,
                  clr])[a] == df[i, clr])
                  clx[i] <- rainbow(nlevels(df[, clr]))[a]
            }
        }
        else {
            for (i in 1:n) clx[i] <- hsv(0, 1, df[i, clr])
        }
    }
    dfm <- numarray(df)
    m <- dim(dfm)[2]
    coorx <- 0:step
    for (p in 0:step) coorx[p + 1] <- (xmin + (xmax - xmin)/step *
        p)
    coory <- 0:step
    for (i in 1:n) {
        for (p in 0:step) {
            coory[p + 1] <- 0
            tt <- (xmin + (xmax - xmin)/step * p)
            for (a in 1:m) {
                if (type == 1) {
                  if (a == 1) {
                    coory[p + 1] <- dfm[i, a]/(2^0.5)
                  }
                  else {
                    cnst <- (a - 2)%/%2 + 1
                    if ((a - 2)%%2 == 0)
                      coory[p + 1] <- coory[p + 1] + dfm[i, a] *
                        sin(cnst * tt)
                    else coory[p + 1] <- coory[p + 1] + dfm[i,
                      a] * cos(cnst * tt)
                  }
                }
                else if (type == 2) {
                  cnst <- (a - 1)%/%2 + 1
                  if ((a - 1)%%2 == 0)
                    coory[p + 1] <- coory[p + 1] + dfm[i, a] *
                      sin(cnst * tt)
                  else coory[p + 1] <- coory[p + 1] + dfm[i,
                    a] * cos(cnst * tt)
                }
                else if (type == 3) {
                  coory[p + 1] <- coory[p + 1] + dfm[i, a] *
                    cos((a * tt)^0.5)
                }
                else if (type == 4) {
                  if (a == 1) {
                    coory[p + 1] <- dfm[i, a]
                  }
                  else {
                    cnst <- (a - 2)%/%2 + 1
                    if ((a - 2)%%2 == 0)
                      coory[p + 1] <- coory[p + 1] + dfm[i, a] *
                        (sin(cnst * tt) + cos(cnst * tt))
                    else coory[p + 1] <- coory[p + 1] + dfm[i,
                      a] * (sin(cnst * tt) - cos(cnst * tt))
                  }
                  coory[p + 1] <- coory[p + 1]/(2^0.5)
                }
            }
        }
        #lines(coorx, coory, col = clx[i])
		storelist[[i]]<-data.frame(x=coorx, y=coory)
    }
	return(storelist)
}

#plot
plot.data<-function(data,meta.data = NULL, kind="andrews",...){
	#change plotting and returned object using kind
	#kind = c("andrews-polygon","andrews-line","PCA","andrews-PCA")
	# meta data will be used to coor groups
	# can be multi column data frame, in this case it is collapsed to single factor

	#plotting theme for convenience (base'ish)
	.theme2<- theme(
					axis.line = element_line(colour = 'gray', size = .75),
					panel.background = element_blank(),
					plot.background = element_blank()
				 )


	#meta data needs to be separated from numeric
	num.id<-c(1:ncol(data))[sapply(data,is.numeric)] # numeric columns
	df<-data[,num.id]

	if(is.null(meta.data)){
			meta<-data[,-(num.id),drop=FALSE] # look for factors
				if(length(meta)==0) {meta<-data.frame(rep(1,nrow(data)))} # fix emptyness
		} else {
			meta<-as.data.frame(meta.data)
		}

	#need to combine meta to 1 column
	if(ncol(meta)>1) {
		meta<-data.frame(sapply(1:nrow(meta),function(i) {
			paste(as.character(unlist(meta[i,,drop=FALSE])),collapse = "|")
			}))
	}


	if(kind == "pairs"){
		meta<-as.factor(as.character(unlist(meta)))
		.rainbow<-function(x,alpha=NULL,...){
				if(is.null(alpha)){alpha<-1}
				rainbow(nlevels(x),alpha=alpha)[x]
			}
		color<-.rainbow(meta,...)
		plot(df,bg=color,col=color,...)
	}

	if(kind == "andrews-line"){
		a.data<-make.andrews(df,...) # makes list of x and y for each row

		#need to add meta data cols to x and y to keep track
		tmp<-lapply(1:length(a.data),function(i){
			cbind(a.data[[i]],meta=factor(meta[i,],levels=levels(as.factor(meta[,1]))),row.id=i)
		})

		obj<-do.call("rbind",tmp)

		#plot


		#intersting to change line width

		p<-ggplot(data=obj, aes(x=x, y=y,group=row.id,color=meta)) + #change group = meta for an interesting effct
			geom_line(...) + #geom_point(,...) +
			.theme2#

		print(p)
		return(obj)
	}

	if(kind == "andrews-polygon"){
		a.data<-make.andrews(df,...) # makes list of x and y for each row

		#need to add meta data cols to x and y to keep track
		tmp<-lapply(1:length(a.data),function(i){
			cbind(a.data[[i]],meta=factor(meta[i,],levels=levels(as.factor(meta[,1]))),row.id=i)
		})

		obj<-do.call("rbind",tmp)

		#plot


		#intersting to change line width

		p<-ggplot(data=obj, aes(x=x, y=y,group=meta,color=meta)) + #change group = meta for an interesting effct
			geom_line(size=4,aes(color=meta),...) + geom_point(,...) +
			.theme2#

		print(p)
		return(obj)
	}

	#PCA scores of andrews encoded data?? does the encoding help? define help?
	# need to consider how to combine x an y
	# could be a combination or relationship between
	# try separeted
	if(kind == "andrews-PCA"){
		a.data<-make.andrews(df,...) # makes list of x and y for each row

		#need to make vector combine x and y
		tmp<-lapply(1:length(a.data),function(i){
			c(a.data[[i]][,1],a.data[[i]][,2])
		})

		obj<-do.call("rbind",tmp)

		#make PCA scores
		#scaling changes effects so don't
		full<-prcomp(obj)
		scores<-data.frame(full$x, meta=as.factor(meta[,1]))

		#plot
		p<-ggplot(data=scores, aes(x=PC1, y=PC2 ,group=meta,color=meta)) +
			geom_point(...)	+
			.theme2
		print(p)
		return(full)
	}

	if(kind == "PCA"){
		#plot PCA scores
		full<-prcomp(df)
		scores<-data.frame(full$x, meta=as.factor(meta[,1]))

		#intersting to change line width
		p<-ggplot(data=scores, aes(x=PC1, y=PC2 ,group=meta,color=meta)) +
			geom_point(...)	+
			.theme2

		print(p)
		return(full)
	}

}

#pairs plot for the first 5 columns coloring on number of cylinders
x11() # turn these off to prevent making new plotting device
plot.data(data,kind="pairs",meta.data=fct, cex=1, alpha=.5, pch=21)

#make classical andrews plot
x11()
andrews(cbind(fct,data),type=1,clr=1,ymax=4.25) # add cyl as factor to allow coloring
 # factor for legend (colors)
legend("topleft",legend=levels(fct), fill=rainbow(length(levels(as.factor(mtcars$cyl)))) )

#alteranative visualization of andrews plot
library(ggplot2)
x11()
plot.data(data,kind="andrews-polygon",meta.data=fct,type=1, size=1, alpha=.5) #andrews

#make PCA
x11()
plot.data(data,kind="PCA",meta.data=mtcars$cyl, size=5, alpha=.75)

#make pca from andrews encoded data
x11()
plot.data(data,kind="andrews-PCA",meta.data=fct, size=5, alpha=.75)

#note intersting groups structure not explained by cyl
# after some cheking find that this is perfectly explained by the mtcars$am
#very intersting that this not obvious untill after encoding

#Make combined factor
fct<-cbind(mtcars$cyl,mtcars$am)
x11()
plot.data(mtcars,kind="PCA",meta.data=fct, size=5, alpha=.75)
x11()
plot.data(mtcars,kind="andrews-PCA",meta.data=fct, size=5, alpha=.75)
# judging by the clustering patterns,
# it looks like with this encoding cyl and am can be modeled (classified) relatively well
# there also seems to be a nother hidden covariate based on the pattern for 8 cylinder and 0 am (8|0)
	#needed packages
	library(andrews)
	library(ggplot2)

	#get some data here I use "mtcars"
	data<-mtcars
	fct<-as.factor(mtcars$cyl) # factor for intial grouping


	#get andrews encoding (x and y coords)
	make.andrews<-function (df, type = 1, clr = NULL, step = 100, ymax = 10,...)
	{
	#add objects to store x and y
	storelist<-list()

	if (step < 1)
	step <- 100
	n <- dim(df)[1]
	m <- dim(df)[2]
	#plot.new()
	if ((type == 1) \| (type == 2) \| (type == 4)) {
	xmin <- (-pi)
	xmax <- pi
	}
	else if (type == 3) {
	xmin <- 0
	xmax <- 4 * pi
	}
	# plot.window(c(xmin, xmax), c(-ymax, ymax))
	# title(main = main, sub = sub)
	# axis(1)
	# axis(2)
	# box()
	# lines(c(xmin, xmax), c(0, 0))
	for (i in 1:m) df[, i] <- normalize(df[, i])
	clx <- rep(1, n)
	if (!is.null(clr)) {
	if (!is.numeric(df[, clr])) {
	for (i in 1:n) {
	for (a in 1:nlevels(df[, clr])) if (levels(df[,
	clr])[a] == df[i, clr])
	clx[i] <- rainbow(nlevels(df[, clr]))[a]
	}
	}
	else {
	for (i in 1:n) clx[i] <- hsv(0, 1, df[i, clr])
	}
	}
	dfm <- numarray(df)
	m <- dim(dfm)[2]
	coorx <- 0:step
	for (p in 0:step) coorx[p + 1] <- (xmin + (xmax - xmin)/step *
	p)
	coory <- 0:step
	for (i in 1:n) {
	for (p in 0:step) {
	coory[p + 1] <- 0
	tt <- (xmin + (xmax - xmin)/step * p)
	for (a in 1:m) {
	if (type == 1) {
	if (a == 1) {
	coory[p + 1] <- dfm[i, a]/(2^0.5)
	}
	else {
	cnst <- (a - 2)%/%2 + 1
	if ((a - 2)%%2 == 0)
	coory[p + 1] <- coory[p + 1] + dfm[i, a] *
	sin(cnst * tt)
	else coory[p + 1] <- coory[p + 1] + dfm[i,
	a] * cos(cnst * tt)
	}
	}
	else if (type == 2) {
	cnst <- (a - 1)%/%2 + 1
	if ((a - 1)%%2 == 0)
	coory[p + 1] <- coory[p + 1] + dfm[i, a] *
	sin(cnst * tt)
	else coory[p + 1] <- coory[p + 1] + dfm[i,
	a] * cos(cnst * tt)
	}
	else if (type == 3) {
	coory[p + 1] <- coory[p + 1] + dfm[i, a] *
	cos((a * tt)^0.5)
	}
	else if (type == 4) {
	if (a == 1) {
	coory[p + 1] <- dfm[i, a]
	}
	else {
	cnst <- (a - 2)%/%2 + 1
	if ((a - 2)%%2 == 0)
	coory[p + 1] <- coory[p + 1] + dfm[i, a] *
	(sin(cnst * tt) + cos(cnst * tt))
	else coory[p + 1] <- coory[p + 1] + dfm[i,
	a] * (sin(cnst * tt) - cos(cnst * tt))
	}
	coory[p + 1] <- coory[p + 1]/(2^0.5)
	}
	}
	}
	#lines(coorx, coory, col = clx[i])
	storelist[[i]]<-data.frame(x=coorx, y=coory)
	}
	return(storelist)
	}

	#plot
	plot.data<-function(data,meta.data = NULL, kind="andrews",...){
	#change plotting and returned object using kind
	#kind = c("andrews-polygon","andrews-line","PCA","andrews-PCA")
	# meta data will be used to coor groups
	# can be multi column data frame, in this case it is collapsed to single factor

	#plotting theme for convenience (base'ish)
	.theme2<- theme(
	axis.line = element_line(colour = 'gray', size = .75),
	panel.background = element_blank(),
	plot.background = element_blank()
	)


	#meta data needs to be separated from numeric
	num.id<-c(1:ncol(data))[sapply(data,is.numeric)] # numeric columns
	df<-data[,num.id]

	if(is.null(meta.data)){
	meta<-data[,-(num.id),drop=FALSE] # look for factors
	if(length(meta)==0) {meta<-data.frame(rep(1,nrow(data)))} # fix emptyness
	} else {
	meta<-as.data.frame(meta.data)
	}

	#need to combine meta to 1 column
	if(ncol(meta)>1) {
	meta<-data.frame(sapply(1:nrow(meta),function(i) {
	paste(as.character(unlist(meta[i,,drop=FALSE])),collapse = "\|")
	}))
	}


	if(kind == "pairs"){
	meta<-as.factor(as.character(unlist(meta)))
	.rainbow<-function(x,alpha=NULL,...){
	if(is.null(alpha)){alpha<-1}
	rainbow(nlevels(x),alpha=alpha)[x]
	}
	color<-.rainbow(meta,...)
	plot(df,bg=color,col=color,...)
	}

	if(kind == "andrews-line"){
	a.data<-make.andrews(df,...) # makes list of x and y for each row

	#need to add meta data cols to x and y to keep track
	tmp<-lapply(1:length(a.data),function(i){
	cbind(a.data[[i]],meta=factor(meta[i,],levels=levels(as.factor(meta[,1]))),row.id=i)
	})

	obj<-do.call("rbind",tmp)

	#plot


	#intersting to change line width

	p<-ggplot(data=obj, aes(x=x, y=y,group=row.id,color=meta)) + #change group = meta for an interesting effct
	geom_line(...) + #geom_point(,...) +
	.theme2#

	print(p)
	return(obj)
	}

	if(kind == "andrews-polygon"){
	a.data<-make.andrews(df,...) # makes list of x and y for each row

	#need to add meta data cols to x and y to keep track
	tmp<-lapply(1:length(a.data),function(i){
	cbind(a.data[[i]],meta=factor(meta[i,],levels=levels(as.factor(meta[,1]))),row.id=i)
	})

	obj<-do.call("rbind",tmp)

	#plot


	#intersting to change line width

	p<-ggplot(data=obj, aes(x=x, y=y,group=meta,color=meta)) + #change group = meta for an interesting effct
	geom_line(size=4,aes(color=meta),...) + geom_point(,...) +
	.theme2#

	print(p)
	return(obj)
	}

	#PCA scores of andrews encoded data?? does the encoding help? define help?
	# need to consider how to combine x an y
	# could be a combination or relationship between
	# try separeted
	if(kind == "andrews-PCA"){
	a.data<-make.andrews(df,...) # makes list of x and y for each row

	#need to make vector combine x and y
	tmp<-lapply(1:length(a.data),function(i){
	c(a.data[[i]][,1],a.data[[i]][,2])
	})

	obj<-do.call("rbind",tmp)

	#make PCA scores
	#scaling changes effects so don't
	full<-prcomp(obj)
	scores<-data.frame(full$x, meta=as.factor(meta[,1]))

	#plot
	p<-ggplot(data=scores, aes(x=PC1, y=PC2 ,group=meta,color=meta)) +
	geom_point(...) +
	.theme2
	print(p)
	return(full)
	}

	if(kind == "PCA"){
	#plot PCA scores
	full<-prcomp(df)
	scores<-data.frame(full$x, meta=as.factor(meta[,1]))

	#intersting to change line width
	p<-ggplot(data=scores, aes(x=PC1, y=PC2 ,group=meta,color=meta)) +
	geom_point(...) +
	.theme2

	print(p)
	return(full)
	}

	}

	#pairs plot for the first 5 columns coloring on number of cylinders
	x11() # turn these off to prevent making new plotting device
	plot.data(data,kind="pairs",meta.data=fct, cex=1, alpha=.5, pch=21)

	#make classical andrews plot
	x11()
	andrews(cbind(fct,data),type=1,clr=1,ymax=4.25) # add cyl as factor to allow coloring
	# factor for legend (colors)
	legend("topleft",legend=levels(fct), fill=rainbow(length(levels(as.factor(mtcars$cyl)))) )

	#alteranative visualization of andrews plot
	library(ggplot2)
	x11()
	plot.data(data,kind="andrews-polygon",meta.data=fct,type=1, size=1, alpha=.5) #andrews

	#make PCA
	x11()
	plot.data(data,kind="PCA",meta.data=mtcars$cyl, size=5, alpha=.75)

	#make pca from andrews encoded data
	x11()
	plot.data(data,kind="andrews-PCA",meta.data=fct, size=5, alpha=.75)

	#note intersting groups structure not explained by cyl
	# after some cheking find that this is perfectly explained by the mtcars$am
	#very intersting that this not obvious untill after encoding

	#Make combined factor
	fct<-cbind(mtcars$cyl,mtcars$am)
	x11()
	plot.data(mtcars,kind="PCA",meta.data=fct, size=5, alpha=.75)
	x11()
	plot.data(mtcars,kind="andrews-PCA",meta.data=fct, size=5, alpha=.75)
	# judging by the clustering patterns,
	# it looks like with this encoding cyl and am can be modeled (classified) relatively well
	# there also seems to be a nother hidden covariate based on the pattern for 8 cylinder and 0 am (8\|0)