Last active
December 22, 2015 23:29
-
-
Save andilabs/6547350 to your computer and use it in GitHub Desktop.
mit wojtekw @ RECONCILE --- !!! Z poważnymi błędami!!! ERROR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages attached by the script.
library(matrixStats)
library(R.methodsS3)

# Load the page evaluations submitted by the MTurk workers.
d_eval <- read.csv(
  "/Users/andi/Desktop/RECONCILE/DANE_evaluations.csv",
  header = TRUE,
  sep = ";"
)

# Prepare the input data for Troia ----

# Variant 1: keep rows whose rating is NA.
troia_columns <- c("person_id", "document_id", "documentevaluation_credibility")
troia_input <- d_eval[, troia_columns]
write.table(
  troia_input,
  file = "/Users/andi/Desktop/RECONCILE/troia_input.csv",
  row.names = FALSE,
  col.names = FALSE,
  sep = " "
)

# Variant 2: drop rows whose rating is NA.
has_rating <- !is.na(troia_input$documentevaluation_credibility)
troia_input_without_na <- troia_input[has_rating, ]
write.table(
  troia_input_without_na,
  file = "/Users/andi/Desktop/RECONCILE/troia_input_without_na.csv",
  row.names = FALSE,
  col.names = FALSE,
  sep = " "
)

# Variant 3: replace every NA cell (in any column) with 0.
troia_input_with_zeros <- replace(troia_input, is.na(troia_input), 0)
write.table(
  troia_input_with_zeros,
  file = "/Users/andi/Desktop/RECONCILE/troia_input_with_zeros.csv",
  row.names = FALSE,
  col.names = FALSE,
  sep = " "
)
# Simple statistics and plots ----

# plyr's count() is used for quick frequency summaries.
library(plyr)

# Each summary table below holds, in `freq`, how many times a page was rated
# and, in `freq.1`, how many pages of the whole set received that many ratings.
# The table is computed once per variant and reused for printing and plotting.

# Variant that includes NA ratings.
ratings_per_page_all <- count(count(d_eval, "document_id"), "freq")
ratings_per_page_all
# plot(ratings_per_page_all)
b <- barplot(
  ratings_per_page_all$freq.1,
  main = "Distribution of number of ratings per page (including 0 - \"don\'t know\")",
  names.arg = ratings_per_page_all$freq
)
mtext(
  side = 1,
  at = b,
  text = paste("#:", ratings_per_page_all$freq.1),
  line = 3
)

# Variant that skips NA ratings.
ratings_per_page_rated <- count(
  count(troia_input_without_na, "document_id"),
  "freq"
)
ratings_per_page_rated
# plot(ratings_per_page_rated)
b <- barplot(
  ratings_per_page_rated$freq.1,
  main = "Distribution of number of ratings per page(without 0 - \"don\'t know\")",
  names.arg = ratings_per_page_rated$freq
)
mtext(
  side = 1,
  at = b,
  text = paste("#:", ratings_per_page_rated$freq.1),
  line = 3
)

# Per-page mean rating, ignoring NA. The original statement aggregated an
# object named `temp` that is never defined in this script, so it failed with
# "object 'temp' not found"; the aggregation is now built from `troia_input`.
# NOTE(review): `temp` is not read again in the visible script - confirm that
# the rating column grouped by document_id is what was intended here.
temp <- aggregate(
  troia_input["documentevaluation_credibility"],
  by = list(document_id = troia_input$document_id),
  FUN = mean,
  na.rm = TRUE
)
# Mean of the non-missing ratings in `lista`.
#
# Args:
#   lista: vector of ratings; NA marks a missing rating and is ignored.
#
# Returns:
#   A single numeric value: the sum of the non-NA entries divided by their
#   count, or 0 when `lista` is empty or holds only NA values.
srednia <- function(lista) {
  rated <- lista[!is.na(lista)]
  # No usable rating at all: report 0 instead of dividing by zero. This also
  # covers empty input, which the former `1:length(lista)` loop could not
  # handle.
  if (length(rated) == 0) {
    return(0)
  }
  as.numeric(sum(rated) / length(rated))
}
# Population variance of the non-missing ratings in `lista`.
#
# Args:
#   lista: vector of ratings; NA marks a missing rating and is ignored.
#
# Returns:
#   A single numeric value: the mean squared deviation of the non-NA entries
#   from their mean (divisor is the number of non-NA entries, not n - 1), or 0
#   when `lista` is empty or holds only NA values.
wariancja <- function(lista) {
  rated <- lista[!is.na(lista)]
  # Nothing to measure: report 0. This also covers empty input, which the
  # former `1:length(lista)` loop could not handle.
  if (length(rated) == 0) {
    return(0)
  }
  # Same value srednia() yields for this input; computed locally so the
  # function name is not shadowed by a local variable.
  rated_mean <- sum(rated) / length(rated)
  as.numeric(sum((rated - rated_mean)^2) / length(rated))
}
# Indicator of whether a rating is present.
#
# Args:
#   arg: a rating, or a vector of ratings; NA marks a missing rating.
#
# Returns:
#   Numeric 1 for every non-NA element and 0 for every NA element. For a
#   single value this is the same 0/1 as before; the vectorized form also
#   accepts vectors of any length (including empty ones), where the previous
#   scalar `if` raised an error.
reated <- function(arg) {
  as.numeric(!is.na(arg))
}
# Substitute 0 for a missing rating.
#
# Args:
#   arg: a single rating, possibly NA.
#
# Returns:
#   `arg` unchanged when it is not NA, otherwise 0.
notna <- function(arg) {
  if (!is.na(arg)) {
    return(arg)
  }
  0
}
# Rule-based credibility class for one page, from all of its ratings.
#
# Rules, checked in this order:
#   at least 2 ratings equal to 5 -> class HC  (Highly Credible),     code 3
#   at least 2 ratings equal to 4 -> class N   (Neutral),             code 2
#   at least 2 ratings below 4    -> class HNC (Highly Not Credible), code 1
#   otherwise                     -> undefined,                       code 0
#
# Args:
#   lista: vector of ratings for a single page; NA marks a missing rating.
#
# Returns:
#   A single numeric class code (3, 2, 1 or 0) as listed above.
adamw_classifier <- function(lista) {
  # Replace NA with 0, as the original implementation did.
  # NOTE(review): this makes missing ratings count towards the "below 4"
  # rule, so two NA ratings alone put a page in class HNC - confirm that
  # this is intended.
  ratings <- lista
  ratings[is.na(ratings)] <- 0

  # Direct counts replace the former table -> data.frame -> factor-level
  # round-trip and also work for empty input (all counts are then 0).
  n_fives <- sum(ratings == 5)
  n_fours <- sum(ratings == 4)
  n_below_four <- sum(ratings < 4)

  if (n_fives >= 2) {
    3 # HC
  } else if (n_fours >= 2) {
    2 # N
  } else if (n_below_four >= 2) {
    1 # HNC
  } else {
    0 # undefined
  }
}
# Per-page summary, k-means clustering and scatter plots ----

tmp <- troia_input
# Number of distinct pages in the input.
length(unique(tmp$document_id))

# One row per page: mean rating, rating variance and rule-based class code.
dane <- ddply(
  tmp,
  .(document_id),
  summarise,
  SREDNIA = srednia(documentevaluation_credibility),
  WARIANCJA = wariancja(documentevaluation_credibility),
  CLASS_ADAMW = adamw_classifier(documentevaluation_credibility)
)

# Cluster the pages in (mean, variance) space with as many clusters as there
# are distinct rule-based classes. The original passed
# sum(unique(dane$CLASS_ADAMW)), i.e. the sum of the class codes (6 for codes
# 0..3), which is not a class count.
kmeans_data <- matrix(c(dane$SREDNIA, dane$WARIANCJA), ncol = 2)
n_clusters <- length(unique(dane$CLASS_ADAMW))
kmeans_result <- kmeans(kmeans_data, n_clusters)

# Jitter once so that each page sits at the same position in all three plots
# and the class colouring can be compared with the cluster colouring.
x_jittered <- jitter(dane$SREDNIA, amount = 0.2)
y_jittered <- jitter(dane$WARIANCJA, amount = 0.2)

plot(y = y_jittered, x = x_jittered)
# Class codes start at 0 and colour 0 is the background colour, so shift the
# codes by one to keep class-0 pages visible.
plot(y = y_jittered, x = x_jittered, col = dane$CLASS_ADAMW + 1)
plot(y = y_jittered, x = x_jittered, col = kmeans_result$cluster)
# Mark the (unjittered) cluster centres on the last plot.
points(kmeans_result$centers, col = 6, pch = 10, cex = 3)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment