Created
September 11, 2013 13:04
-
-
Save andilabs/6523287 to your computer and use it in GitHub Desktop.
R reconcile wojtekw
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the page ratings submitted by the MTurk workers.
DANE_IBR <- "/Users/andi/Desktop/RECONCILE/IBR.csv"
DANE_BIG <- "/Users/andi/Desktop/RECONCILE/DANE_evaluations.csv"
# The evaluations file is read as semicolon-separated with a header row.
d_eval <- read.csv(file = DANE_BIG, sep = ";", header = TRUE)
library(matrixStats) | |
library(R.methodsS3) | |
# Prepare the input data for Troia: person, document and credibility rating.
# Variant 1: missing ratings are kept as NA.
troia_input <- d_eval[
  ,
  c("person_id", "document_id", "documentevaluation_credibility"),
  drop = FALSE
]
write.table(
  troia_input,
  file = "/Users/andi/Desktop/RECONCILE/troia_input.csv",
  sep = " ",
  row.names = FALSE,
  col.names = FALSE
)
# Variant 2: rows with a missing rating are dropped.
troia_input_without_na <- subset(
  troia_input,
  !is.na(documentevaluation_credibility)
)
write.table(
  troia_input_without_na,
  file = "/Users/andi/Desktop/RECONCILE/troia_input_without_na.csv",
  sep = " ",
  row.names = FALSE,
  col.names = FALSE
)
# Variant 3: every NA in the table is replaced with 0.
troia_input_with_zeros <- replace(troia_input, is.na(troia_input), 0)
write.table(
  troia_input_with_zeros,
  file = "/Users/andi/Desktop/RECONCILE/troia_input_with_zeros.csv",
  sep = " ",
  row.names = FALSE,
  col.names = FALSE
)
# Simple statistics and plots.
# plyr is used to generate the summaries quickly.
library(plyr)
# Each summary table lists, in its first column, how many times a page was
# rated (freq) and, in its second column, how many pages of the whole set
# received that many ratings (freq.1).
# Variant that includes NA ratings.
ratings_per_page_all <- count(count(d_eval, "document_id"), "freq")
ratings_per_page_all
# plot(ratings_per_page_all)
b <- barplot(
  ratings_per_page_all$freq.1,
  main = "Distribution of number of ratings per page (including 0 - \"don\'t know\")",
  names.arg = ratings_per_page_all$freq
)
# Write the page count underneath each bar.
mtext(
  text = paste("#:", ratings_per_page_all$freq.1),
  side = 1,
  at = b,
  line = 3
)
# Variant that skips NA ratings.
ratings_per_page_rated <- count(count(troia_input_without_na, "document_id"), "freq")
ratings_per_page_rated
# plot(ratings_per_page_rated)
b <- barplot(
  ratings_per_page_rated$freq.1,
  main = "Distribution of number of ratings per page(without 0 - \"don\'t know\")",
  names.arg = ratings_per_page_rated$freq
)
mtext(
  text = paste("#:", ratings_per_page_rated$freq.1),
  side = 1,
  at = b,
  line = 3
)
# Mean of the ratings that were actually given.
#
# lista: vector of ratings; NA marks a missing rating and is left out of both
#        the sum and the count.
# Returns a single number: the mean of the non-NA entries, or 0 when there is
# no usable rating at all (every entry is NA, or the input is empty).
srednia <- function(lista) {
  rated <- !is.na(lista)
  n_rated <- sum(rated)
  # Nothing to average: report 0 instead of dividing by zero.
  if (n_rated == 0) {
    return(as.numeric(0))
  }
  as.numeric(sum(lista[rated]) / n_rated)
}
# Population variance of the ratings that were actually given.
#
# lista: vector of ratings; NA marks a missing rating and is ignored.
# Returns a single number: the mean squared deviation of the non-NA entries
# from their mean (divisor = number of non-NA entries), or 0 when there is no
# usable rating at all (every entry is NA, or the input is empty).
wariancja <- function(lista) {
  ratings <- lista[!is.na(lista)]
  n_rated <- length(ratings)
  # Nothing to measure: report 0 instead of dividing by zero.
  if (n_rated == 0) {
    return(as.numeric(0))
  }
  # The mean is computed inline so no local variable shadows a function name.
  mean_rating <- sum(ratings) / n_rated
  as.numeric(sum((ratings - mean_rating)^2) / n_rated)
}
# Assign a credibility class to one page from its ratings.
#
# Rules, checked in this order (NA ratings are ignored):
#   at least 2 ratings equal to 5 -> "HC"  (Highly Credible)
#   at least 2 ratings equal to 4 -> "N"   (Neutral)
#   at least 2 ratings below 4    -> "HNC" (Highly Not Credible)
#   otherwise                     -> "UNDIFINED"
#
# lista: vector of numeric ratings for a single page.
# Returns the class label as a character string. The frequency table of the
# ratings and the assigned class are printed as a diagnostic trace.
adamw_classifier <- function(lista) {
  df <- data.frame(table(lista))
  print(df)
  ratings <- lista[!is.na(lista)]
  # A rule that is not satisfied falls through to the next one; the earlier
  # version used `break` here, which is an error outside of a loop.
  if (sum(ratings == 5) >= 2) {
    iclass <- "HC"
  } else if (sum(ratings == 4) >= 2) {
    iclass <- "N"
  } else if (sum(ratings < 4) >= 2) {
    # Count the ratings below 4 directly rather than summing the first three
    # rows of the frequency table, which need not be the values below 4.
    iclass <- "HNC"
  } else {
    # NOTE(review): the label spelling is kept as-is in case downstream code
    # matches on it.
    iclass <- "UNDIFINED"
  }
  print(paste(iclass, "was assigned"))
  as.character(iclass)
}
# Indicator of whether a rating was given.
#
# arg: a rating or a vector of ratings; NA marks a missing rating.
# Returns a numeric vector of the same length as arg holding 0 where the entry
# is NA and 1 elsewhere (a single 0 or 1 for scalar input).
reated <- function(arg) {
  as.numeric(!is.na(arg))
}
# Replace a missing rating with 0.
#
# arg: a single rating (scalar); NA marks a missing rating.
# Returns arg unchanged when it is not NA, otherwise 0.
notna <- function(arg) {
  if (!is.na(arg)) {
    return(arg)
  }
  0
}
# Per-document summary of the raw ratings (NA ratings kept in the input).
tmp <- troia_input
# Number of distinct documents in the input.
length(unique(tmp$document_id))
# One row per document with the mean (SREDNIA) and variance (WARIANCJA) of its
# credibility ratings.
output <- ddply(
  tmp,
  .(document_id),
  summarise,
  SREDNIA = srednia(documentevaluation_credibility),
  WARIANCJA = wariancja(documentevaluation_credibility)
)
# Disabled variant that also adds the adamw_classifier class:
#output<-ddply(tmp,.(document_id),summarise,SREDNIA=srednia(documentevaluation_credibility),WARIANCJA=wariancja(documentevaluation_credibility),ADAMW_CLASS=adamw_classifier(documentevaluation_credibility))
head(output)
# Sorted variances as red points with the sorted means overlaid as a green line.
plot(sort(output$WARIANCJA), col = "red")
lines(sort(output$SREDNIA), col = "green")
# Variance against mean per document, jittered on both axes.
plot(
  x = jitter(output$SREDNIA, amount = 0.2),
  y = jitter(output$WARIANCJA, amount = 0.2)
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment