Weighted PCA (in R)
Principal factor analysis with "natural" weights - a model of parliamentary voting.
(PCA is equivalent to multidimensional scaling.)
Cutting lines: perpendicular to the normal vectors of the divisions, minimizing the sum of "error distances" in the given hyperplane.
# Weighted PCA
# CUTTING LINES
# based on overall model
#loss function to minimize: points on the correct side of the cutting line contribute 0, misclassified points contribute their margin |y*(x%*%beta+beta0)| (x, beta, y, zeros are set in the loop below)
f_cutting_lines=function(beta0) -1*sum(apply(cbind(y*(x%*%beta+beta0),zeros),1,min))
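#A minimal self-contained sketch of this loss on toy data (the toy_* names are
#illustrative only, not part of the model): points on the correct side of the
#line x%*%beta + beta0 = 0 contribute nothing, misclassified points contribute
#their violation of the margin.
toy_x = matrix(c(-1,-1, -1,1, 1,-1, 1,1), ncol=2, byrow=TRUE) #4 persons in 2D
toy_y = c(-1,-1,1,1)                                          #their votes
toy_beta = c(1,0)                                             #normal vector: first axis
toy_loss = function(beta0) -1*sum(pmin(toy_y*(toy_x%*%toy_beta+beta0), 0))
optim(0, toy_loss, method="Brent", lower=-10, upper=10)$par   #any value in (-1,1) gives loss 0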
#how many dimensions?
n_first_dimensions = 2
#preparing variables
normals = Hy[,1:n_first_dimensions]
loss_f = data.frame(matrix(0,nrow=dim(H)[1],ncol=4))
colnames(loss_f)=c("Parameter1","Loss1","Parameter_1","Loss_1")
parameters = data.frame(matrix(0,nrow=dim(H)[1],ncol=3))
colnames(parameters)=c("Parameter","Loss","Direction")
xfull = t(t(He$vectors[,1:n_first_dimensions]) * sqrt(He$values[1:n_first_dimensions])) #persons' coordinates in the first dimensions
#calculating all cutting lines
for (i in 1:dim(H)[1]) {
beta = Hy[i,1:n_first_dimensions] #normal vector of division i
y = t(as.matrix(H[i,]))[,pI] #scaled votes in division i, persons in the calculation only
x = xfull[which(!is.na(y)),] #coordinates of persons with a recorded vote
y = y[which(!is.na(y))]
zeros = as.matrix(rep(0,length(y)))
#** place for improvement: the Brent bounds of +/-1000 are arbitrary
res1 = optim(c(1),f_cutting_lines,method="Brent",lower=-1000,upper=1000)
#the sign of the normal vector is arbitrary (the true result may be -1*), so we need to minimize in the other direction as well
y=-y
res2 = optim(c(1),f_cutting_lines,method="Brent",lower=-1000,upper=1000)
#the true parameter is the one with the lower loss
#theoretically the direction should be the same (+1 or -1) for all divisions (rows); due to missing data, however, a few divisions may end up with the opposite direction
loss_f[i,] = c(res1$par,res1$value,res2$par,res2$value)
if (res1$value<=res2$value) {
parameters[i,] = c(res1$par,res1$value,1)
} else {
parameters[i,] = c(res2$par,res2$value,-1)
}
}
CuttingLines = list(normals=normals,parameters=parameters,loss_function=loss_f,weights=cbind(w1,w2))
##
#plot the cutting lines (high-weight divisions only):
plot(Hproj[,1],Hproj[,2])
I = w1*w2 > .5
for (i in 1:dim(H)[1]) {
if (I[i] && ((i %% 10) == 0)) {
beta = CuttingLines$normals[i,]
beta0 = CuttingLines$parameters$Parameter[i]
abline(-beta0/beta[2],-beta[1]/beta[2])
break #note: draws only the first matching line; remove the break to draw every 10th
}
}
# only the second half of the divisions
plot(Hproj[,1],Hproj[,2])
for (j in 1:round(dim(H)[1]/2)) {
i = round(dim(H)[1]/2) + j
if (I[i] && ((i %% 10) == 0)) {
beta = CuttingLines$normals[i,]
beta0 = CuttingLines$parameters$Parameter[i]
abline(-beta0/beta[2],-beta[1]/beta[2])
#break
}
}
# x-intercepts of the cutting lines (at y=0, x = -beta0/beta[1]; plotted here without the sign flip)
plot(CuttingLines$parameters$Parameter/CuttingLines$normals[,1],ylim=c(-100,100))
# intercepts on the second axis (beta0/beta[2]; the slopes themselves are -beta[1]/beta[2])
plot(CuttingLines$parameters$Parameter/CuttingLines$normals[,2],ylim=c(-10,10))
# NEW MPs (or partial projection)
#new persons / partial results: e.g. projecting divisions from a shorter time interval onto the model estimated from all divisions; a minimal sketch follows
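#A hedged sketch of this partial projection on toy objects (the toy_* names
#are illustrative): person scores restricted to an interval of divisions are
#projected onto loadings estimated from all divisions.
toy_M = matrix(rnorm(20), nrow=5)                              #5 divisions x 4 persons, complete
toy_e = eigen(t(toy_M)%*%toy_M)
toy_U = toy_M %*% toy_e$vectors %*% diag(1/sqrt(toy_e$values)) #division loadings
toy_part = toy_M
toy_part[4:5,] = 0                                             #keep only divisions 1-3 ("the interval")
toy_proj = t(toy_part) %*% toy_U                               #persons x dimensions, partial positions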
analdb = dbConnect(SQLite(), dbname=paste(path,dataname,"/",analysisname,"/",analysisname,".sqlite3",sep=""))
datadb = dbConnect(SQLite(), dbname=paste(path,dataname,"/",dataname,".sqlite3",sep=""))
rotation = strsplit(dbGetQuery(analdb,"SELECT orientation FROM analysis_info")[1,1],",")[[1]]
#flip the sign of each dimension so that the reference MP (rotation[1]) has the prescribed orientation
rot_mp_rank = which(dimnames(Hcov)[[1]] == rotation[1])
for (j in 2:length(rotation)) {
if (Hproj[rot_mp_rank,j-1] * as.double(rotation[j]) < 0) {
U[,j-1] = -1*U[,j-1]
}
}
aU = abs(U)
mp_names = dbGetQuery(datadb,"SELECT name FROM mp ORDER BY CAST(code as INTEGER)")
colnames(G) = mp_names$name
#attr(Hw0c,'dimnames')[2][[1]] = mp_names$name[pI]
interval_names = dbGetQuery(analdb,"SELECT distinct(interval_name) FROM analysis_division_in_interval ORDER BY interval_name")
#lower limit to eliminate from projections
lo_limitT = lo_limit
for (i in 1:dim(interval_names)[1]) {
TIf = dbGetQuery(analdb,paste("SELECT division_code, CASE interval_name='",interval_names[i,],"' WHEN 1 THEN 'TRUE' ELSE 'FALSE' END as in_interval FROM analysis_division_in_interval ORDER BY division_code",sep=""))
TIf$in_interval = as.logical(TIf$in_interval)
max_division_code = TIf$division_code[max(which(as.logical(TIf$in_interval)))]
min_division_code = TIf$division_code[min(which(as.logical(TIf$in_interval)))]
TI = TIf$in_interval
GTc = G[,pI]
GTc[!TI,] = NA
HTc = (GTc - attr(Hs,"scaled:center"))/attr(Hs,"scaled:scale") #re-scale with the centers/scales of the full model (see the definition of Hs below)
HTIc = HTc
HTIc[!is.na(HTIc)] = 1
HTIc[is.na(HTIc)] = 0
HTw0c = HTc * w1 * w2
HTw0c[is.na(HTw0c)] = 0
#weights for non-missing data; divisions x persons
HTIcw = HTIc*w1*w2
#sum of division weights for each person
tmp = apply(HTIcw,2,sum)
pTw = tmp/max(tmp)
#index of persons included in the calculation
pTI = pTw > lo_limitT
#weighted, scaled, NA->0, with persons with too few votes cut; divisions x persons
HTw0cc = HTw0c[,pTI]
#missing-data index with persons with too few votes cut; divisions x persons
HTIcc = HTIc[,pTI]
dweights = t(t(aU)%*%HTIcc / apply(aU,2,sum)) #persons x dimensions
dweights[is.na(dweights)] = 0
HTw0ccproj = t(HTw0cc)%*%U / dweights
parties = as.matrix(dbGetQuery(datadb,paste('SELECT COALESCE(NULLIF(tmax.code,""),tmin.code) as code, COALESCE(NULLIF(tmax.color,""),tmin.color) as color FROM (SELECT mvf.mp_code, g.code, g.color FROM mp_vote_full as mvf LEFT JOIN "group" as g ON mvf.group_code=g.code WHERE division_code="',max_division_code,'") as tmax LEFT JOIN (SELECT mvf.mp_code, g.code, g.color FROM mp_vote_full as mvf LEFT JOIN "group" as g ON mvf.group_code=g.code WHERE division_code="',min_division_code,'") as tmin ON tmax.mp_code=tmin.mp_code ORDER BY CAST(tmax.mp_code as INTEGER)',sep="")))
partiescc = (parties[pI,])[pTI,]
#mp rank id
mp_rank_id_cc = seq(1,dim(HTw0c)[2],1)[pTI]
write.csv(format(cbind(HTw0ccproj[,1:2],partiescc,mp_rank_id_cc),digits=3),file=paste(path,dataname,"/",analysisname,"/",interval_names[i,],"_result.csv",sep=""))
#*************************
}
#matrix (operations):
http://en.wikipedia.org/wiki/Matrix_(mathematics)
#rotation matrix:
http://en.wikipedia.org/wiki/Rotation_matrix
#SVD:
http://en.wikipedia.org/wiki/Singular_value_decomposition
#Eigendecomposition
http://en.wikipedia.org/wiki/Eigendecomposition
#SVD vs. eigendecomposition
http://stats.stackexchange.com/questions/14002/whats-the-difference-between-principal-components-analysis-and-multidimensional
#SVD example:
http://polisci2.ucsd.edu/aruizeuler/SVD.html
#normals and point-to-line / point-to-plane distances
http://en.wikipedia.org/wiki/Normal_%28geometry%29
http://en.wikipedia.org/wiki/Distance_from_a_point_to_a_line
http://www.vitutor.com/geometry/distance/point_plane.html
# multiply rows of matrix by vector in R (speed)
http://stackoverflow.com/questions/3643555/multiply-rows-of-matrix-by-vector
#ideal points
http://voteview.com/ideal_point_history.htm
#wnominate in R
http://cran.r-project.org/web/packages/wnominate/vignettes/wnominate.pdf
http://cran.r-project.org/web/packages/wnominate/wnominate.pdf
http://rss.acs.unt.edu/Rdoc/library/pscl/html/rollcall.html
http://voteview.com/senate109.htm
#US roll-call downloads
http://voteview.com/downloads.asp#ROLLCALLDWNL
#linear discriminant analysis (not used in the end)
http://en.wikipedia.org/wiki/Linear_discriminant_analysis
#logistic regression (not used in the end)
http://en.wikipedia.org/wiki/Logistic_regression
#SVM and kernels (not used in the end)
http://en.wikipedia.org/wiki/Support_vector_machine
http://en.wikibooks.org/wiki/Data_Mining_Algorithms_In_R/Classification/SVM
http://stats.stackexchange.com/questions/26293/how-to-obtain-decision-boundaries-from-linear-svm-in-r
http://stats.stackexchange.com/questions/5056/computing-the-decision-boundary-of-a-linear-svm-model/5080#5080
http://stats.stackexchange.com/questions/25387/problem-with-e1071-libsvm
http://stats.stackexchange.com/questions/17955/svm-with-unequal-group-sizes-in-training-data?rq=1
http://cran.r-project.org/web/packages/kernlab/kernlab.pdf
http://cbio.ensmp.fr/~jvert/svn/tutorials/practical/svmbasic/svmbasic_notes.pdf
http://stackoverflow.com/questions/3282916/class-weight-syntax-in-kernlab
http://www.robots.ox.ac.uk/~az/lectures/ml/lect2.pdf
http://stat.ethz.ch/R-manual/R-patched/library/stats/html/density.html
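#(the table below is presumably an example vote matrix: divisions in rows, persons p1-p4 in columns, values in {-1,1})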
p1 p2 p3 p4
1 1 1 -1
1 1 -1 -1
1 1 -1 -1
1 -1 1 -1
1 -1 1 -1
1 -1 -1 -1
-1 -1 -1 1
-1 -1 -1 1
1 1 -1 -1
1 -1 1 -1
#install.packages("reshape2")
library("reshape2")
#install.packages("sqldf")
library("sqldf")
path = "~/project/mpv_motion/dev/"
#dataname = "cz_psp_2006-2010_a1_1"
#analysisname = 'cz_psp_2006-2010_a1_1q'
dataname = "cz_kromeriz_2012"
analysisname = 'cz_kromeriz_2012_1h'
#raw data in 3 columns: person (V1), division (V2), vote value in [-1,1] (V3)
Graw = read.table(paste(path,dataname,"/",dataname,".csv",sep=""), header=FALSE, sep=",")
#data divisions x persons
G = acast(Graw,V2~V1,value.var='V3')
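#A quick toy illustration of the reshape above (toy_raw is illustrative only):
#three columns person (V1), division (V2), vote (V3) become a divisions x persons matrix.
toy_raw = data.frame(V1=c("p1","p2","p1","p2"), V2=c("d1","d1","d2","d2"), V3=c(1,-1,1,1))
acast(toy_raw, V2~V1, value.var='V3')
#   p1 p2
#d1  1 -1
#d2  1  1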
#scaled divisions x persons (mean=0 and sd=1 for each division)
#note: t() drops the "scaled:center"/"scaled:scale" attributes, so keep the scale() result as Hs for later re-use
Hs = scale(t(G),scale=TRUE)
H = t(Hs)
#weights
#weights 1, based on the number of persons voting in each division
w1 = apply(abs(G)==1,1,sum,na.rm=TRUE)/max(apply(abs(G)==1,1,sum,na.rm=TRUE))
w1[is.na(w1)] = 0
#weights 2, closeness of the division: a "100:100" split gets weight 1, a "195:5" split gets weight close to 0
w2 = 1 - abs(apply(G==1,1,sum,na.rm=TRUE) - apply(G==-1,1,sum,na.rm=TRUE))/apply(!is.na(G),1,sum)
w2[is.na(w2)] = 0
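#A quick check of w2 on a single toy division (toy_votes is illustrative only):
#a 100:100 split gives w2 = 1, a 195:5 split gives w2 close to 0.
toy_votes = c(rep(1,195), rep(-1,5))
1 - abs(sum(toy_votes==1) - sum(toy_votes==-1))/sum(!is.na(toy_votes)) #0.05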
#analytics
plot(w1)
plot(w2)
plot(w1*w2)
#weighted scaled divisions x persons
Hw = H * w1 * w2
#index of missing data, divisions x persons (1 = observed, 0 = missing)
HI = H
HI[!is.na(H)]=1
HI[is.na(H)] = 0
#weighted scaled with NA->0 division x persons
Hw0 = Hw
Hw0[is.na(Hw)]=0
#eliminate persons with too few votes (weighted)
#lower limit to eliminate from calculations
lo_limit = .1
#weights for non-missing data; divisions x persons
HIw = HI*w1*w2
#sum of division weights for each person
tmp = apply(HIw,2,sum)
pw = tmp/max(tmp)
#index of persons included in the calculation
pI = pw > lo_limit
#weighted, scaled, NA->0, with persons with too few votes cut; divisions x persons
Hw0c = Hw0[,pI]
#missing-data index with persons with too few votes cut; divisions x persons
HIc = HI[,pI]
#"covariance" matrix adjusted according to missing data
Hcov=t(Hw0c)%*%Hw0c * 1/(t(HIc)%*%HIc) * (dim(Hw0c)[1])
#Hcov=t(Hw0)%*%Hw0 * 1/(t(HI)%*%HI-1) * (dim(H)[1]-1)
#substitution of missing data in "covariance" matrix
Hcov0 = Hcov
Hcov0[is.na(Hcov)] = 0 #*********
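#Sanity sketch of the adjustment (the toy_* names are illustrative only): with
#complete data, t(HIc)%*%HIc equals the number of divisions everywhere, so the
#adjustment cancels and Hcov reduces to the plain cross-product.
toy_H = matrix(rnorm(12), nrow=3) #3 divisions x 4 persons, no NAs
toy_I = matrix(1, nrow=3, ncol=4)
toy_cov = t(toy_H)%*%toy_H * 1/(t(toy_I)%*%toy_I) * (dim(toy_H)[1])
all.equal(toy_cov, t(toy_H)%*%toy_H) #TRUE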
#eigendecomposition
He=eigen(Hcov0)
# V (rotation values of persons)
V = He$vectors
#divisions projected into the dimensions
Hy=Hw0c%*%V
#analytics
plot(Hy[,1],Hy[,2])
plot(sqrt(He$values[1:10]))
#sigma matrix
sigma = diag(sqrt(He$values))
sigma[is.na(sigma)] = 0 #numerically negative eigenvalues give NaN in sqrt()
#projection of persons into dimensions
Hproj = t(sigma%*%t(V))
#analytics
plot(Hproj[,1],Hproj[,2])
#sigma^-1 matrix
sigma_1 = diag(sqrt(1/He$values))
sigma_1[is.na(sigma_1)] = 0 #numerically negative eigenvalues give NaN in sqrt()
# U (rotation values of divisions)
U = Hw0c%*%V%*%sigma_1
#U%*%sigma%*%t(V) != Hw0c, because of the missing-data adjustment of the "covariance" matrix
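#Sketch of the identity referred to above (the toy_* names are illustrative
#only): without the missing-data adjustment, U %*% sigma %*% t(V) reconstructs
#the data matrix exactly.
toy_M = matrix(rnorm(12), nrow=4) #4 divisions x 3 persons, complete
toy_e = eigen(t(toy_M)%*%toy_M)
toy_V = toy_e$vectors
toy_sigma = diag(sqrt(toy_e$values))
toy_U = toy_M %*% toy_V %*% diag(1/sqrt(toy_e$values))
all.equal(toy_U %*% toy_sigma %*% t(toy_V), toy_M) #TRUE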
#THE PROBLEM: this plot should match the overall projection (Hproj); if it does not, see Hproj2
plot(He$vectors[,1]*sqrt(He$values[1]),He$vectors[,2]*sqrt(He$values[2]))
#second projection
Hproj2 = t(t(U) %*% Hw0c)
#without missing values, Hproj and Hproj2 would be equal
#analytics
plot(Hproj[,1],Hproj2[,1])
plot(Hproj[,2],Hproj2[,2])
#plot(HTw0ccproj[,1],HTw0ccproj[,2])
#plot(Hproj[,1],HTw0ccproj[,1])
#plot(Hproj[,2],HTw0ccproj[,2])