Last active
July 25, 2016 14:46
-
-
Save shuozhang1985/3f01b9406a04555035f14d481ad22c4f to your computer and use it in GitHub Desktop.
Why did she get an A while I got a D?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Dataset description ----
# Read the math and Portuguese student tables and join them on the shared
# demographic attributes so that each row carries both sets of grades.
library(plyr)
library(dplyr)
library(ggplot2)

# The CSV paths below are relative, so the working directory has to be the
# folder that holds both files.
setwd("~/Desktop/student")

d1_mat <- read.csv(
  file = "student-mat.csv",
  header = TRUE,
  sep = ";",
  stringsAsFactors = FALSE
)
d2_por <- read.csv(
  file = "student-por.csv",
  header = TRUE,
  sep = ";",
  stringsAsFactors = FALSE
)

# merge() keeps only rows whose key columns match in both tables. Columns
# that exist in both inputs but are not keys get the default suffixes:
# `.x` for d1_mat (math) and `.y` for d2_por (Portuguese).
data <- merge(
  x = d1_mat,
  y = d2_por,
  by = c(
    "school", "sex", "age", "address", "famsize",
    "Pstatus", "Medu", "Fedu", "Mjob", "Fjob", "reason",
    "nursery", "internet"
  )
)
summary(data)
# Which school has better student performance? ----
library(lattice)
library(plyr)
library(Rmisc)

# Final math grade (G3.x) per school. The dashed lines at 9/11/13/15 and the
# red letters drawn at x = 2.5 mark the letter-grade bands.
p1 <- ggplot(data, aes(x = school, y = G3.x, colour = school)) +
  geom_point() +
  geom_boxplot() +
  geom_hline(
    yintercept = c(9, 11, 13, 15),
    colour = "grey20", size = 0.5, linetype = 2
  ) +
  annotate(
    "text",
    x = 2.5, y = c(8.5, 10.5, 12.5, 14.5, 17),
    label = c("F", "D", "C", "B", "A"), colour = "red"
  ) +
  labs(
    x = "School",
    y = "Final Grade of Math",
    title = "G3 in Math vs School"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Same layout for the final Portuguese grade (G3.y).
p2 <- ggplot(data, aes(x = school, y = G3.y, colour = school)) +
  geom_point() +
  geom_boxplot() +
  geom_hline(
    yintercept = c(9, 11, 13, 15),
    colour = "grey20", size = 0.5, linetype = 2
  ) +
  annotate(
    "text",
    x = 2.5, y = c(8.5, 10.5, 12.5, 14.5, 17),
    label = c("F", "D", "C", "B", "A"), colour = "red"
  ) +
  labs(
    x = "School",
    y = "Final Grade of Portuguese",
    title = "G3 in Portuguese vs School"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Stack the two plots in a single column.
multiplot(p1, p2, cols = 1)
# Does the current student performance correlate with past performance? ----

# Final math grade against the first-period math grade, coloured by school.
# Points are jittered so that overlapping integer grades stay visible.
p3 <- ggplot(data, aes(x = G1.x, y = G3.x, colour = school)) +
  geom_point(shape = 1, size = 1, position = "jitter") +
  labs(
    x = "First Period Grade of Math",
    y = "Final Grade of Math",
    title = "G3 vs G1 in Math"
  ) +
  scale_colour_manual(values = c("red", "blue")) +
  theme_bw()

# Final math grade against the second-period math grade.
p4 <- ggplot(data, aes(x = G2.x, y = G3.x, colour = school)) +
  geom_point(shape = 1, size = 1, position = "jitter") +
  labs(
    x = "Second Period Grade of Math",
    y = "Final Grade of Math",
    title = "G3 vs G2 in Math"
  ) +
  scale_colour_manual(values = c("red", "blue")) +
  theme_bw()

# Show both scatter plots side by side.
multiplot(p3, p4, cols = 2)
# Students who did not drop the class ----
# Keep only the rows with a positive final math grade.
data_G3xgreater0 <- data[data$G3.x > 0, ]

# Final vs second-period math grade. `group = 1` makes geom_smooth() fit a
# single linear trend over all points instead of one line per school.
p5 <- ggplot(data_G3xgreater0, aes(x = G2.x, y = G3.x, colour = school)) +
  geom_point(shape = 1, size = 1, position = "jitter") +
  labs(
    x = "Second Period Grade of Math",
    y = "Final Grade of Math",
    title = "G3 vs G2 in Math"
  ) +
  scale_colour_manual(values = c("red", "green")) +
  geom_smooth(
    mapping = aes(group = 1),
    method = "lm", se = FALSE, alpha = 0.8
  ) +
  theme_bw()

# Final vs first-period math grade, same construction.
p6 <- ggplot(data_G3xgreater0, aes(x = G1.x, y = G3.x, colour = school)) +
  geom_point(shape = 1, size = 1, position = "jitter") +
  labs(
    x = "First Period Grade of Math",
    y = "Final Grade of Math",
    title = "G3 vs G1 in Math"
  ) +
  scale_colour_manual(values = c("red", "green")) +
  geom_smooth(
    mapping = aes(group = 1),
    method = "lm", se = FALSE, alpha = 0.8
  ) +
  theme_bw()

multiplot(p5, p6, cols = 2)
# Students who dropped the course ----
# These are the rows whose final math grade (G3.x) is exactly 0.
data_G3x0 <- data[data$G3.x == 0, ]

# First-period math grades of those students, per school, with the same
# letter-grade guide lines used in the school comparison plots.
p7 <- ggplot(data_G3x0, aes(x = school, y = G1.x, colour = school)) +
  geom_point() +
  geom_boxplot() +
  geom_hline(
    yintercept = c(9, 11, 13, 15),
    colour = "grey20", size = 0.5, linetype = 2
  ) +
  annotate(
    "text",
    x = 2.5, y = c(8.5, 10.5, 12.5, 14.5, 17),
    label = c("F", "D", "C", "B", "A"), colour = "red"
  ) +
  labs(
    x = "School",
    y = "First Grade of Math",
    title = "students who drop Grade 3"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Second-period math grades of the same students.
p8 <- ggplot(data_G3x0, aes(x = school, y = G2.x, colour = school)) +
  geom_point() +
  geom_boxplot() +
  geom_hline(
    yintercept = c(9, 11, 13, 15),
    colour = "grey20", size = 0.5, linetype = 2
  ) +
  annotate(
    "text",
    x = 2.5, y = c(8.5, 10.5, 12.5, 14.5, 17),
    label = c("F", "D", "C", "B", "A"), colour = "red"
  ) +
  labs(
    x = "School",
    y = "Second Grade of Math",
    title = "students who drop Grade 3"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Rows with a second-period math grade of 0.
data_G2x0 <- data[data$G2.x == 0, ]
multiplot(p7, p8, cols = 2)

# How many students have a zero grade in G2, in G3, and in both?
nrow(data_G2x0)
# 13 dropped G2
nrow(data_G3x0)
# 39 dropped G3
data_Gx00 <- data[data$G2.x == 0 & data$G3.x == 0, ]
nrow(data_Gx00)
# 13 dropped both G2, G3
# Is student performance affected by past class failures? ----

# Final math grade by number of past failures, one panel per school.
# failures.x is numeric, so the boxplot needs an explicit group to get one
# box per failure count.
p9 <- ggplot(data, aes(x = failures.x, y = G3.x, colour = failures.x)) +
  geom_point() +
  geom_boxplot(aes(group = failures.x)) +
  labs(
    x = "Past Class Failures",
    y = "Final Grade of Math",
    title = "G3 vs Failures"
  ) +
  theme_bw() +
  facet_wrap(~school) +
  theme(legend.position = "none")

# Same view for the final Portuguese grade (no title on this one).
p10 <- ggplot(data, aes(x = failures.y, y = G3.y, colour = failures.y)) +
  geom_point() +
  geom_boxplot(aes(group = failures.y)) +
  labs(x = "Past Class Failures", y = "Final Grade of Portuguese") +
  theme_bw() +
  facet_wrap(~school) +
  theme(legend.position = "none")

multiplot(p9, p10, cols = 1)
# Does student performance change with age? ----

# Final math grade by age, one panel per school; boxes are grouped by age
# because age is a numeric column.
p11 <- ggplot(data, aes(x = age, y = G3.x, colour = age)) +
  geom_point() +
  geom_boxplot(aes(group = age)) +
  labs(x = "Age", y = "Final Grade of Math", title = "G3 vs Age") +
  theme_bw() +
  facet_wrap(~school) +
  theme(legend.position = "none")

# Final Portuguese grade by age (no title on this one).
p12 <- ggplot(data, aes(x = age, y = G3.y, colour = age)) +
  geom_point() +
  geom_boxplot(aes(group = age)) +
  labs(x = "Age", y = "Final Grade of Portuguese") +
  theme_bw() +
  facet_wrap(~school) +
  theme(legend.position = "none")

multiplot(p11, p12, cols = 1)
# Do students who want to pursue higher education do better at school? ----

# Final math grade split by the `higher` answer, one facet row per school.
p13 <- ggplot(data, aes(x = higher.x, y = G3.x, colour = higher.x)) +
  geom_point() +
  geom_boxplot() +
  labs(
    x = "Wants to take higher education",
    y = "Final Grade of Math",
    title = "G3 vs Higher"
  ) +
  theme_bw() +
  facet_grid(school ~ .) +
  theme(legend.position = "none")

# Final Portuguese grade split by the `higher` answer.
p14 <- ggplot(data, aes(x = higher.y, y = G3.y, colour = higher.y)) +
  geom_point() +
  geom_boxplot() +
  labs(
    x = "Wants to take higher education",
    y = "Final Grade of Portuguese",
    title = "G3 vs Higher"
  ) +
  theme_bw() +
  facet_grid(school ~ .) +
  theme(legend.position = "none")

multiplot(p13, p14, cols = 2)
# Is it true that the more time a student spends studying, the higher the
# grade? ----

# Final math grade by weekly study time, one panel per school. studytime.x is
# numeric, so boxes are grouped explicitly by its value.
p15 <- ggplot(data, aes(x = studytime.x, y = G3.x, colour = studytime.x)) +
  geom_point() +
  geom_boxplot(aes(group = studytime.x)) +
  labs(
    x = "Weekly Study Time",
    y = "Final Grade of Math",
    title = "G3 vs Study Time"
  ) +
  theme_bw() +
  facet_wrap(~school) +
  theme(legend.position = "none")

# Final Portuguese grade by weekly study time (no title on this one).
p16 <- ggplot(data, aes(x = studytime.y, y = G3.y, colour = studytime.y)) +
  geom_point() +
  geom_boxplot(aes(group = studytime.y)) +
  labs(x = "Weekly Study Time", y = "Final Grade of Portuguese") +
  theme_bw() +
  facet_wrap(~school) +
  theme(legend.position = "none")

multiplot(p15, p16, cols = 1)
# Do absences relate to student performance? ----

# Final math grade by raw absence count, one facet row per school. The view
# is zoomed to 0-30 absences with coord_cartesian(), which does not drop the
# points outside that window from the boxplot statistics.
p17 <- ggplot(data, aes(x = absences.x, y = G3.x, colour = absences.x)) +
  geom_point() +
  geom_boxplot(aes(group = absences.x)) +
  labs(
    x = "Absences",
    y = "Final Grade of Math",
    title = "G3 vs Absences"
  ) +
  facet_grid(school ~ .) +
  theme_bw() +
  scale_colour_gradient(low = "red", high = "yellow", name = "absences") +
  coord_cartesian(xlim = c(0, 30)) +
  theme(legend.position = "none")

# Final Portuguese grade by raw absence count (no title on this one).
p18 <- ggplot(data, aes(x = absences.y, y = G3.y, colour = absences.y)) +
  geom_point() +
  geom_boxplot(aes(group = absences.y)) +
  labs(x = "Absences", y = "Final Grade of Portuguese") +
  facet_grid(school ~ .) +
  theme_bw() +
  scale_colour_gradient(low = "red", high = "yellow", name = "absences") +
  coord_cartesian(xlim = c(0, 30)) +
  theme(legend.position = "none")

multiplot(p17, p18, cols = 1)
# To make the absences plots easier to read, the number of school absences
# is bucketed into four categories: 0-9, 10-19, 20-29 and 30+.

# Map a numeric vector of absence counts to its category label.
#
# absences: numeric vector of absence counts (expected to be >= 0).
# Returns a character vector of the same length holding "0-9", "10-19",
# "20-29" or "30+". NA inputs (and negative values, which are not valid
# counts) yield NA instead of aborting or being filed under "30+".
#
# This replaces two copy-pasted row-by-row loops that pre-allocated the
# column as numeric, iterated over 1:nrow(data) (which misbehaves on an empty
# data frame) and carried an unreachable "15-19" branch that contradicted the
# four categories described above.
group_absences <- function(absences) {
  buckets <- cut(
    absences,
    breaks = c(0, 10, 20, 30, Inf),
    labels = c("0-9", "10-19", "20-29", "30+"),
    right = FALSE # intervals are [0, 10), [10, 20), [20, 30), [30, Inf)
  )
  # Keep the columns as plain character vectors, as the loops produced.
  as.character(buckets)
}

data$absences.xgroup <- group_absences(data$absences.x)
data$absences.ygroup <- group_absences(data$absences.y)
# Final math grade by absence category, one facet row per school.
p19 <- ggplot(data, aes(x = absences.xgroup, y = G3.x)) +
  geom_point(aes(colour = absences.xgroup)) +
  geom_boxplot(aes(group = absences.xgroup, colour = absences.xgroup)) +
  labs(
    x = "Absences Group",
    y = "Final Grade of Math",
    title = "G3 vs Absences Group"
  ) +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

# Final Portuguese grade by absence category. This plot draws G3.y, so the
# y axis is labelled as Portuguese; it was previously mislabelled as Math,
# a copy-paste slip from p19.
p20 <- ggplot(data, aes(x = absences.ygroup, y = G3.y)) +
  geom_point(aes(colour = absences.ygroup)) +
  geom_boxplot(aes(group = absences.ygroup, colour = absences.ygroup)) +
  labs(
    x = "Absences Group",
    y = "Final Grade of Portuguese",
    title = "G3 vs Absences Group"
  ) +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

multiplot(p19, p20, cols = 2)
# Do the parents' education and job influence student performance? ----

# Final math grade by mother's job, one facet row per school.
p21 <- ggplot(data, aes(x = Mjob, y = G3.x, colour = Mjob)) +
  geom_point() +
  geom_boxplot(aes(group = Mjob)) +
  labs(
    x = "Mother Job",
    y = "Final Grade of Math",
    title = "G3 vs Mother Job"
  ) +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

# Final Portuguese grade by mother's job (no title on this one).
p22 <- ggplot(data, aes(x = Mjob, y = G3.y, colour = Mjob)) +
  geom_point() +
  geom_boxplot(aes(group = Mjob)) +
  labs(x = "Mother Job", y = "Final Grade of Portuguese") +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

# Final math grade by mother's education level. Medu is mapped as-is, so the
# boxes are grouped explicitly by its value.
p23 <- ggplot(data, aes(x = Medu, y = G3.x, colour = Medu)) +
  geom_point() +
  geom_boxplot(aes(group = Medu)) +
  labs(
    x = "Mother Education",
    y = "Final Grade of Math",
    title = "G3 vs Mother Education"
  ) +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

# Final Portuguese grade by mother's education level (no title on this one).
p24 <- ggplot(data, aes(x = Medu, y = G3.y, colour = Medu)) +
  geom_point() +
  geom_boxplot(aes(group = Medu)) +
  labs(x = "Mother Education", y = "Final Grade of Portuguese") +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

# Character copy of Medu so that it can be used as a discrete fill.
data$Meducha <- as.character(data$Medu)

# Share of each education level within every mother-job category
# (position = "fill" scales every bar to the same height).
p25 <- ggplot(data, aes(x = Mjob)) +
  geom_bar(aes(fill = Meducha), position = "fill") +
  labs(x = "Mother Job", title = "Mjob vs Medu") +
  theme_bw()

multiplot(p21, p22, p25, p23, p24, cols = 2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment