Weighted PCA (in R)
Principal factor analysis with "natural" weights - a model of parliamentary voting.
(PCA is equivalent to multidimensional scaling.)
Cutting lines: perpendicular to the normal vectors of the divisions, minimizing the sum of "error distances" in the given hyperplane.
# Weighted PCA
# CUTTING LINES
# based on overall model
#loss function to minimize: points on the correct side of the cutting line contribute 0, misclassified points contribute their margin |y*(x%*%beta+beta0)| (x, beta, y, zeros are set in the loop below)
f_cutting_lines=function(beta0) -1*sum(apply(cbind(y*(x%*%beta+beta0),zeros),1,min))
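#A minimal self-contained sketch of this loss on toy data (the toy_* names are
#illustrative only, not part of the model): points on the correct side of the
#line x%*%beta + beta0 = 0 contribute nothing, misclassified points contribute
#their violation of the margin.
toy_x = matrix(c(-1,-1, -1,1, 1,-1, 1,1), ncol=2, byrow=TRUE) #4 persons in 2D
toy_y = c(-1,-1,1,1)                                          #their votes
toy_beta = c(1,0)                                             #normal vector: first axis
toy_loss = function(beta0) -1*sum(pmin(toy_y*(toy_x%*%toy_beta+beta0), 0))
optim(0, toy_loss, method="Brent", lower=-10, upper=10)$par   #any value in (-1,1) gives loss 0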
#how many dimensions?
n_first_dimensions = 2
#preparing variables
normals = Hy[,1:n_first_dimensions]
loss_f = data.frame(matrix(0,nrow=dim(H)[1],ncol=4))
colnames(loss_f)=c("Parameter1","Loss1","Parameter_1","Loss_1")
parameters = data.frame(matrix(0,nrow=dim(H)[1],ncol=3))
colnames(parameters)=c("Parameter","Loss","Direction")
xfull = t(t(He$vectors[,1:n_first_dimensions]) * sqrt(He$values[1:n_first_dimensions])) #persons' coordinates in the first dimensions
#calculating all cutting lines
for (i in 1:dim(H)[1]) {
beta = Hy[i,1:n_first_dimensions] #normal vector of division i
y = t(as.matrix(H[i,]))[,pI] #scaled votes in division i, persons in the calculation only
x = xfull[which(!is.na(y)),] #coordinates of persons with a recorded vote
y = y[which(!is.na(y))]
zeros = as.matrix(rep(0,length(y)))
#** place for improvement: the Brent bounds of +/-1000 are arbitrary
res1 = optim(c(1),f_cutting_lines,method="Brent",lower=-1000,upper=1000)
#the sign of the normal vector is arbitrary (the true result may be -1*), so we need to minimize in the other direction as well
y=-y
res2 = optim(c(1),f_cutting_lines,method="Brent",lower=-1000,upper=1000)
#the true parameter is the one with the lower loss
#theoretically the direction should be the same (+1 or -1) for all divisions (rows); due to missing data, however, a few divisions may end up with the opposite direction
loss_f[i,] = c(res1$par,res1$value,res2$par,res2$value)
if (res1$value<=res2$value) {
parameters[i,] = c(res1$par,res1$value,1)
} else {
parameters[i,] = c(res2$par,res2$value,-1)
}
}
CuttingLines = list(normals=normals,parameters=parameters,loss_function=loss_f,weights=cbind(w1,w2))
##
#plot the cutting lines (high-weight divisions only):
plot(Hproj[,1],Hproj[,2])
I = w1*w2 > .5
for (i in 1:dim(H)[1]) {
if (I[i] && ((i %% 10) == 0)) {
beta = CuttingLines$normals[i,]
beta0 = CuttingLines$parameters$Parameter[i]
abline(-beta0/beta[2],-beta[1]/beta[2])
break #note: draws only the first matching line; remove the break to draw every 10th
}
}
# only the second half of the divisions
plot(Hproj[,1],Hproj[,2])
for (j in 1:round(dim(H)[1]/2)) {
i = round(dim(H)[1]/2) + j
if (I[i] && ((i %% 10) == 0)) {
beta = CuttingLines$normals[i,]
beta0 = CuttingLines$parameters$Parameter[i]
abline(-beta0/beta[2],-beta[1]/beta[2])
#break
}
}
# x-intercepts of the cutting lines (at y=0, x = -beta0/beta[1]; plotted here without the sign flip)
plot(CuttingLines$parameters$Parameter/CuttingLines$normals[,1],ylim=c(-100,100))
# intercepts on the second axis (beta0/beta[2]; the slopes themselves are -beta[1]/beta[2])
plot(CuttingLines$parameters$Parameter/CuttingLines$normals[,2],ylim=c(-10,10))
# NEW MPs (or partial projection)
#new persons / partial results: e.g. projecting divisions from a shorter time interval onto the model estimated from all divisions; a minimal sketch follows
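#A hedged sketch of this partial projection on toy objects (the toy_* names
#are illustrative): person scores restricted to an interval of divisions are
#projected onto loadings estimated from all divisions.
toy_M = matrix(rnorm(20), nrow=5)                              #5 divisions x 4 persons, complete
toy_e = eigen(t(toy_M)%*%toy_M)
toy_U = toy_M %*% toy_e$vectors %*% diag(1/sqrt(toy_e$values)) #division loadings
toy_part = toy_M
toy_part[4:5,] = 0                                             #keep only divisions 1-3 ("the interval")
toy_proj = t(toy_part) %*% toy_U                               #persons x dimensions, partial positions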
analdb = dbConnect(SQLite(), dbname=paste(path,dataname,"/",analysisname,"/",analysisname,".sqlite3",sep=""))
datadb = dbConnect(SQLite(), dbname=paste(path,dataname,"/",dataname,".sqlite3",sep=""))
rotation = strsplit(dbGetQuery(analdb,"SELECT orientation FROM analysis_info")[1,1],",")[[1]]
#flip the sign of each dimension so that the reference MP (rotation[1]) has the prescribed orientation
rot_mp_rank = which(dimnames(Hcov)[[1]] == rotation[1])
for (j in 2:length(rotation)) {
if (Hproj[rot_mp_rank,j-1] * as.double(rotation[j]) < 0) {
U[,j-1] = -1*U[,j-1]
}
}
aU = abs(U)
mp_names = dbGetQuery(datadb,"SELECT name FROM mp ORDER BY CAST(code as INTEGER)")
colnames(G) = mp_names$name
#attr(Hw0c,'dimnames')[2][[1]] = mp_names$name[pI]
interval_names = dbGetQuery(analdb,"SELECT distinct(interval_name) FROM analysis_division_in_interval ORDER BY interval_name")
#lower limit to eliminate from projections
lo_limitT = lo_limit
for (i in 1:dim(interval_names)[1]) {
TIf = dbGetQuery(analdb,paste("SELECT division_code, CASE interval_name='",interval_names[i,],"' WHEN 1 THEN 'TRUE' ELSE 'FALSE' END as in_interval FROM analysis_division_in_interval ORDER BY division_code",sep=""))
TIf$in_interval = as.logical(TIf$in_interval)
max_division_code = TIf$division_code[max(which(as.logical(TIf$in_interval)))]
min_division_code = TIf$division_code[min(which(as.logical(TIf$in_interval)))]
TI = TIf$in_interval
GTc = G[,pI]
GTc[!TI,] = NA
HTc = (GTc - attr(Hs,"scaled:center"))/attr(Hs,"scaled:scale") #re-scale with the centers/scales of the full model (see the definition of Hs below)
HTIc = HTc
HTIc[!is.na(HTIc)] = 1
HTIc[is.na(HTIc)] = 0
HTw0c = HTc * w1 * w2
HTw0c[is.na(HTw0c)] = 0
#weights for non-missing data; divisions x persons
HTIcw = HTIc*w1*w2
#sum of division weights for each person
tmp = apply(HTIcw,2,sum)
pTw = tmp/max(tmp)
#index of persons included in the calculation
pTI = pTw > lo_limitT
#weighted, scaled, NA->0, with persons with too few votes cut; divisions x persons
HTw0cc = HTw0c[,pTI]
#missing-data index with persons with too few votes cut; divisions x persons
HTIcc = HTIc[,pTI]
dweights = t(t(aU)%*%HTIcc / apply(aU,2,sum)) #persons x dimensions
dweights[is.na(dweights)] = 0
HTw0ccproj = t(HTw0cc)%*%U / dweights
parties = as.matrix(dbGetQuery(datadb,paste('SELECT COALESCE(NULLIF(tmax.code,""),tmin.code) as code, COALESCE(NULLIF(tmax.color,""),tmin.color) as color FROM (SELECT mvf.mp_code, g.code, g.color FROM mp_vote_full as mvf LEFT JOIN "group" as g ON mvf.group_code=g.code WHERE division_code="',max_division_code,'") as tmax LEFT JOIN (SELECT mvf.mp_code, g.code, g.color FROM mp_vote_full as mvf LEFT JOIN "group" as g ON mvf.group_code=g.code WHERE division_code="',min_division_code,'") as tmin ON tmax.mp_code=tmin.mp_code ORDER BY CAST(tmax.mp_code as INTEGER)',sep="")))
partiescc = (parties[pI,])[pTI,]
#mp rank id
mp_rank_id_cc = seq(1,dim(HTw0c)[2],1)[pTI]
write.csv(format(cbind(HTw0ccproj[,1:2],partiescc,mp_rank_id_cc),digits=3),file=paste(path,dataname,"/",analysisname,"/",interval_names[i,],"_result.csv",sep=""))
#*************************
}
#matrix (operations):
http://en.wikipedia.org/wiki/Matrix_(mathematics)
#rotation matrix:
http://en.wikipedia.org/wiki/Rotation_matrix
#SVD:
http://en.wikipedia.org/wiki/Singular_value_decomposition
#Eigendecomposition
http://en.wikipedia.org/wiki/Eigendecomposition
#SVD vs. eigendecomposition
http://stats.stackexchange.com/questions/14002/whats-the-difference-between-principal-components-analysis-and-multidimensional
#SVD example:
http://polisci2.ucsd.edu/aruizeuler/SVD.html
#normals and point-to-line / point-to-plane distances
http://en.wikipedia.org/wiki/Normal_%28geometry%29
http://en.wikipedia.org/wiki/Distance_from_a_point_to_a_line
http://www.vitutor.com/geometry/distance/point_plane.html
# multiply rows of matrix by vector in R (speed)
http://stackoverflow.com/questions/3643555/multiply-rows-of-matrix-by-vector
#ideal points
http://voteview.com/ideal_point_history.htm
#wnominate in R
http://cran.r-project.org/web/packages/wnominate/vignettes/wnominate.pdf
http://cran.r-project.org/web/packages/wnominate/wnominate.pdf
http://rss.acs.unt.edu/Rdoc/library/pscl/html/rollcall.html
http://voteview.com/senate109.htm
#US roll-call downloads
http://voteview.com/downloads.asp#ROLLCALLDWNL
#linear discriminant analysis (not used in the end)
http://en.wikipedia.org/wiki/Linear_discriminant_analysis
#logistic regression (not used in the end)
http://en.wikipedia.org/wiki/Logistic_regression
#SVM and kernels (not used in the end)
http://en.wikipedia.org/wiki/Support_vector_machine
http://en.wikibooks.org/wiki/Data_Mining_Algorithms_In_R/Classification/SVM
http://stats.stackexchange.com/questions/26293/how-to-obtain-decision-boundaries-from-linear-svm-in-r
http://stats.stackexchange.com/questions/5056/computing-the-decision-boundary-of-a-linear-svm-model/5080#5080
http://stats.stackexchange.com/questions/25387/problem-with-e1071-libsvm
http://stats.stackexchange.com/questions/17955/svm-with-unequal-group-sizes-in-training-data?rq=1
http://cran.r-project.org/web/packages/kernlab/kernlab.pdf
http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/svmbasic/svmbasic_notes.pdf
http://stackoverflow.com/questions/3282916/class-weight-syntax-in-kernlab
http://www.robots.ox.ac.uk/~az/lectures/ml/lect2.pdf
http://stat.ethz.ch/R-manual/R-patched/library/stats/html/density.html
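#(the table below is presumably an example vote matrix: divisions in rows, persons p1-p4 in columns, values in {-1,1})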
p1 p2 p3 p4
1 1 1 -1
1 1 -1 -1
1 1 -1 -1
1 -1 1 -1
1 -1 1 -1
1 -1 -1 -1
-1 -1 -1 1
-1 -1 -1 1
1 1 -1 -1
1 -1 1 -1
#install.packages("reshape2")
library("reshape2")
#install.packages("sqldf")
library("sqldf")
path = "~/project/mpv_motion/dev/"
#dataname = "cz_psp_2006-2010_a1_1"
#analysisname = 'cz_psp_2006-2010_a1_1q'
dataname = "cz_kromeriz_2012"
analysisname = 'cz_kromeriz_2012_1h'
#raw data in 3 columns: person (V1), division (V2), vote value in [-1,1] (V3)
Graw = read.table(paste(path,dataname,"/",dataname,".csv",sep=""), header=FALSE, sep=",")
#data divisions x persons
G = acast(Graw,V2~V1,value.var='V3')
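#A quick toy illustration of the reshape above (toy_raw is illustrative only):
#three columns person (V1), division (V2), vote (V3) become a divisions x persons matrix.
toy_raw = data.frame(V1=c("p1","p2","p1","p2"), V2=c("d1","d1","d2","d2"), V3=c(1,-1,1,1))
acast(toy_raw, V2~V1, value.var='V3')
#   p1 p2
#d1  1 -1
#d2  1  1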
#scaled divisions x persons (mean=0 and sd=1 for each division)
#note: t() drops the "scaled:center"/"scaled:scale" attributes, so keep the scale() result as Hs for later re-use
Hs = scale(t(G),scale=TRUE)
H = t(Hs)
#weights
#weights 1, based on the number of persons voting in each division
w1 = apply(abs(G)==1,1,sum,na.rm=TRUE)/max(apply(abs(G)==1,1,sum,na.rm=TRUE))
w1[is.na(w1)] = 0
#weights 2, closeness of the division: a "100:100" split gets weight 1, a "195:5" split gets weight close to 0
w2 = 1 - abs(apply(G==1,1,sum,na.rm=TRUE) - apply(G==-1,1,sum,na.rm=TRUE))/apply(!is.na(G),1,sum)
w2[is.na(w2)] = 0
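#A quick check of w2 on a single toy division (toy_votes is illustrative only):
#a 100:100 split gives w2 = 1, a 195:5 split gives w2 close to 0.
toy_votes = c(rep(1,195), rep(-1,5))
1 - abs(sum(toy_votes==1) - sum(toy_votes==-1))/sum(!is.na(toy_votes)) #0.05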
#analytics
plot(w1)
plot(w2)
plot(w1*w2)
#weighted scaled divisions x persons
Hw = H * w1 * w2
#index of missing data, divisions x persons (1 = observed, 0 = missing)
HI = H
HI[!is.na(H)]=1
HI[is.na(H)] = 0
#weighted scaled with NA->0 division x persons
Hw0 = Hw
Hw0[is.na(Hw)]=0
#eliminate persons with too few votes (weighted)
#lower limit to eliminate from calculations
lo_limit = .1
#weights for non-missing data; divisions x persons
HIw = HI*w1*w2
#sum of division weights for each person
tmp = apply(HIw,2,sum)
pw = tmp/max(tmp)
#index of persons included in the calculation
pI = pw > lo_limit
#weighted, scaled, NA->0, with persons with too few votes cut; divisions x persons
Hw0c = Hw0[,pI]
#missing-data index with persons with too few votes cut; divisions x persons
HIc = HI[,pI]
#"covariance" matrix adjusted according to missing data
Hcov=t(Hw0c)%*%Hw0c * 1/(t(HIc)%*%HIc) * (dim(Hw0c)[1])
#Hcov=t(Hw0)%*%Hw0 * 1/(t(HI)%*%HI-1) * (dim(H)[1]-1)
#substitution of missing data in "covariance" matrix
Hcov0 = Hcov
Hcov0[is.na(Hcov)] = 0 #*********
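#Sanity sketch of the adjustment (the toy_* names are illustrative only): with
#complete data, t(HIc)%*%HIc equals the number of divisions everywhere, so the
#adjustment cancels and Hcov reduces to the plain cross-product.
toy_H = matrix(rnorm(12), nrow=3) #3 divisions x 4 persons, no NAs
toy_I = matrix(1, nrow=3, ncol=4)
toy_cov = t(toy_H)%*%toy_H * 1/(t(toy_I)%*%toy_I) * (dim(toy_H)[1])
all.equal(toy_cov, t(toy_H)%*%toy_H) #TRUE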
#eigendecomposition
He=eigen(Hcov0)
# V (rotation values of persons)
V = He$vectors
#divisions projected into the dimensions
Hy=Hw0c%*%V
#analytics
plot(Hy[,1],Hy[,2])
plot(sqrt(He$values[1:10]))
#sigma matrix
sigma = diag(sqrt(He$values))
sigma[is.na(sigma)] = 0 #numerically negative eigenvalues give NaN in sqrt()
#projection of persons into dimensions
Hproj = t(sigma%*%t(V))
#analytics
plot(Hproj[,1],Hproj[,2])
#sigma^-1 matrix
sigma_1 = diag(sqrt(1/He$values))
sigma_1[is.na(sigma_1)] = 0 #numerically negative eigenvalues give NaN in sqrt()
# U (rotation values of divisions)
U = Hw0c%*%V%*%sigma_1
#U%*%sigma%*%t(V) != Hw0c, because of the missing-data adjustment of the "covariance" matrix
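#Sketch of the identity referred to above (the toy_* names are illustrative
#only): without the missing-data adjustment, U %*% sigma %*% t(V) reconstructs
#the data matrix exactly.
toy_M = matrix(rnorm(12), nrow=4) #4 divisions x 3 persons, complete
toy_e = eigen(t(toy_M)%*%toy_M)
toy_V = toy_e$vectors
toy_sigma = diag(sqrt(toy_e$values))
toy_U = toy_M %*% toy_V %*% diag(1/sqrt(toy_e$values))
all.equal(toy_U %*% toy_sigma %*% t(toy_V), toy_M) #TRUE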
#THE PROBLEM: this plot should match the overall projection (Hproj); if it does not, see Hproj2
plot(He$vectors[,1]*sqrt(He$values[1]),He$vectors[,2]*sqrt(He$values[2]))
#second projection
Hproj2 = t(t(U) %*% Hw0c)
#without missing values, Hproj and Hproj2 would be equal
#analytics
plot(Hproj[,1],Hproj2[,1])
plot(Hproj[,2],Hproj2[,2])
#plot(HTw0ccproj[,1],HTw0ccproj[,2])
#plot(Hproj[,1],HTw0ccproj[,1])
#plot(Hproj[,2],HTw0ccproj[,2])