Skip to content

Instantly share code, notes, and snippets.

@shuozhang1985
Last active July 25, 2016 14:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shuozhang1985/3f01b9406a04555035f14d481ad22c4f to your computer and use it in GitHub Desktop.
Save shuozhang1985/3f01b9406a04555035f14d481ad22c4f to your computer and use it in GitHub Desktop.
Why did she get an A while I got a D?
#Dataset description
library(plyr); library(dplyr)
library(ggplot2)
# Load the math and Portuguese student files (semicolon-separated, with a
# header row) and join them on the shared background columns.
# NOTE(review): setwd() ties the script to one machine; it is kept because
# the relative file names below resolve against it.
setwd("~/Desktop/student")
d1_mat <- read.csv("student-mat.csv", sep = ";", header = TRUE,
                   stringsAsFactors = FALSE)
d2_por <- read.csv("student-por.csv", sep = ";", header = TRUE,
                   stringsAsFactors = FALSE)
# Columns present in both files that are not join keys receive merge()'s
# default suffixes: ".x" (from d1_mat) and ".y" (from d2_por).
data <- merge(
  d1_mat, d2_por,
  by = c("school", "sex", "age", "address", "famsize",
         "Pstatus", "Medu", "Fedu", "Mjob", "Fjob", "reason",
         "nursery", "internet")
)
summary(data)
#Which school has better student performance?
library(lattice)
library(plyr)
library(Rmisc)
# Final math grade (G3.x) per school. Points are drawn first, the boxplot on
# top; dashed lines at 9, 11, 13 and 15 separate the red letter labels F-A.
p1 <- ggplot(data, aes(x = school, y = G3.x)) +
  geom_point(aes(colour = school)) +
  geom_boxplot(aes(colour = school)) +
  labs(x = "School", y = "Final Grade of Math",
       title = "G3 in Math vs School") +
  geom_hline(yintercept = c(9, 11, 13, 15), colour = "grey20",
             size = 0.5, linetype = 2) +
  annotate("text", x = 2.5, y = c(8.5, 10.5, 12.5, 14.5, 17),
           label = c("F", "D", "C", "B", "A"), colour = "red") +
  theme_bw() +
  theme(legend.position = "none")

# Same layout for the final Portuguese grade (G3.y).
p2 <- ggplot(data, aes(x = school, y = G3.y)) +
  geom_point(aes(colour = school)) +
  geom_boxplot(aes(colour = school)) +
  scale_x_discrete(name = "School") +
  scale_y_continuous(name = "Final Grade of Portuguese") +
  geom_hline(yintercept = c(9, 11, 13, 15), colour = "grey20",
             size = 0.5, linetype = 2) +
  annotate("text", x = 2.5, y = c(8.5, 10.5, 12.5, 14.5, 17),
           label = c("F", "D", "C", "B", "A"), colour = "red") +
  labs(title = "G3 in Portuguese vs School") +
  theme_bw() +
  theme(legend.position = "none")

# Stack the two plots in a single column.
multiplot(p1, p2, cols = 1)
# Does the current student performance correlate with past performance?
# Jittered scatter plots of the final math grade against the first-period
# (p3) and second-period (p4) math grades, coloured by school.
p3 <- ggplot(data, aes(x = G1.x, y = G3.x, colour = school)) +
  geom_point(position = "jitter", shape = 1, size = 1) +
  scale_x_continuous(name = "First Period Grade of Math") +
  scale_y_continuous(name = "Final Grade of Math") +
  scale_colour_manual(values = c("red", "blue")) +
  labs(title = "G3 vs G1 in Math") +
  theme_bw()

p4 <- ggplot(data, aes(x = G2.x, y = G3.x, colour = school)) +
  geom_point(position = "jitter", shape = 1, size = 1) +
  scale_x_continuous(name = "Second Period Grade of Math") +
  scale_y_continuous(name = "Final Grade of Math") +
  scale_colour_manual(values = c("red", "blue")) +
  labs(title = "G3 vs G2 in Math") +
  theme_bw()

# Show both plots side by side.
multiplot(p3, p4, cols = 2)
# Students who did not drop the class: keep rows with a final math grade
# above 0 and repeat the scatter plots with one overall linear fit.
data_G3xgreater0 <- data[data$G3.x > 0, ]

p5 <- ggplot(data_G3xgreater0, aes(x = G2.x, y = G3.x, colour = school)) +
  geom_point(position = "jitter", shape = 1, size = 1) +
  scale_x_continuous(name = "Second Period Grade of Math") +
  scale_y_continuous(name = "Final Grade of Math") +
  scale_colour_manual(values = c("red", "green")) +
  # group = 1 fits a single line across both schools.
  geom_smooth(aes(group = 1), method = "lm", se = FALSE, alpha = 0.8) +
  labs(title = "G3 vs G2 in Math") +
  theme_bw()

p6 <- ggplot(data_G3xgreater0, aes(x = G1.x, y = G3.x, colour = school)) +
  geom_point(position = "jitter", shape = 1, size = 1) +
  scale_x_continuous(name = "First Period Grade of Math") +
  scale_y_continuous(name = "Final Grade of Math") +
  scale_colour_manual(values = c("red", "green")) +
  geom_smooth(aes(group = 1), method = "lm", se = FALSE, alpha = 0.8) +
  labs(title = "G3 vs G1 in Math") +
  theme_bw()

multiplot(p5, p6, cols = 2)
# Students whose final math grade is 0 (treated in this script as having
# dropped the course): how did they score in the first two periods?
data_G3x0 <- data[data$G3.x == 0, ]

# First-period math grade per school, with the grade-band guides.
p7 <- ggplot(data_G3x0, aes(x = school, y = G1.x)) +
  geom_point(aes(colour = school)) +
  geom_boxplot(aes(colour = school)) +
  scale_x_discrete(name = "School") +
  scale_y_continuous(name = "First Grade of Math") +
  geom_hline(yintercept = c(9, 11, 13, 15), colour = "grey20",
             size = 0.5, linetype = 2) +
  annotate("text", x = 2.5, y = c(8.5, 10.5, 12.5, 14.5, 17),
           label = c("F", "D", "C", "B", "A"), colour = "red") +
  labs(title = "students who drop Grade 3") +
  theme_bw() +
  theme(legend.position = "none")

# Second-period math grade per school, same layout.
p8 <- ggplot(data_G3x0, aes(x = school, y = G2.x)) +
  geom_point(aes(colour = school)) +
  geom_boxplot(aes(colour = school)) +
  scale_x_discrete(name = "School") +
  scale_y_continuous(name = "Second Grade of Math") +
  geom_hline(yintercept = c(9, 11, 13, 15), colour = "grey20",
             size = 0.5, linetype = 2) +
  annotate("text", x = 2.5, y = c(8.5, 10.5, 12.5, 14.5, 17),
           label = c("F", "D", "C", "B", "A"), colour = "red") +
  labs(title = "students who drop Grade 3") +
  theme_bw() +
  theme(legend.position = "none")

multiplot(p7, p8, cols = 2)

# Row counts for a grade of 0 in period 2, in the final period, and in both.
# The figures in the comments are the results recorded by the author.
data_G2x0 <- data[data$G2.x == 0, ]
nrow(data_G2x0)
# 13 dropped G2
nrow(data_G3x0)
# 39 dropped G3
data_Gx00 <- data[data$G2.x == 0 & data$G3.x == 0, ]
nrow(data_Gx00)
# 13 dropped both G2, G3
# Is student performance affected by past class failures?
# One boxplot per failure count, faceted by school; math on top and
# Portuguese below.
p9 <- ggplot(data, aes(x = failures.x, y = G3.x)) +
  geom_point(aes(colour = failures.x)) +
  geom_boxplot(aes(group = failures.x, colour = failures.x)) +
  scale_y_continuous(name = "Final Grade of Math") +
  labs(x = "Past Class Failures", title = "G3 vs Failures") +
  facet_wrap(~school) +
  theme_bw() +
  theme(legend.position = "none")

p10 <- ggplot(data, aes(x = failures.y, y = G3.y)) +
  geom_point(aes(colour = failures.y)) +
  geom_boxplot(aes(group = failures.y, colour = failures.y)) +
  scale_y_continuous(name = "Final Grade of Portuguese") +
  labs(x = "Past Class Failures") +
  facet_wrap(~school) +
  theme_bw() +
  theme(legend.position = "none")

multiplot(p9, p10, cols = 1)
# Does student performance change with age?
# One boxplot per age value, faceted by school; math on top and Portuguese
# below.
p11 <- ggplot(data, aes(x = age, y = G3.x)) +
  geom_point(aes(colour = age)) +
  geom_boxplot(aes(group = age, colour = age)) +
  labs(x = "Age", y = "Final Grade of Math", title = "G3 vs Age") +
  facet_wrap(~school) +
  theme_bw() +
  theme(legend.position = "none")

p12 <- ggplot(data, aes(x = age, y = G3.y)) +
  geom_point(aes(colour = age)) +
  geom_boxplot(aes(group = age, colour = age)) +
  labs(x = "Age", y = "Final Grade of Portuguese") +
  facet_wrap(~school) +
  theme_bw() +
  theme(legend.position = "none")

multiplot(p11, p12, cols = 1)
# Do students who want to pursue higher education do better at school?
# Final grade split by the `higher` answer, one facet row per school;
# math on the left and Portuguese on the right.
p13 <- ggplot(data, aes(x = higher.x, y = G3.x)) +
  geom_point(aes(colour = higher.x)) +
  geom_boxplot(aes(colour = higher.x)) +
  scale_x_discrete(name = "Wants to take higher education") +
  scale_y_continuous(name = "Final Grade of Math") +
  labs(title = "G3 vs Higher") +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

p14 <- ggplot(data, aes(x = higher.y, y = G3.y)) +
  geom_point(aes(colour = higher.y)) +
  geom_boxplot(aes(colour = higher.y)) +
  scale_x_discrete(name = "Wants to take higher education") +
  scale_y_continuous(name = "Final Grade of Portuguese") +
  labs(title = "G3 vs Higher") +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

multiplot(p13, p14, cols = 2)
# Is it true that the more time a student spends studying, the higher the
# grade? One boxplot per weekly study-time value, faceted by school.
p15 <- ggplot(data, aes(x = studytime.x, y = G3.x)) +
  geom_point(aes(colour = studytime.x)) +
  geom_boxplot(aes(group = studytime.x, colour = studytime.x)) +
  scale_y_continuous(name = "Final Grade of Math") +
  labs(x = "Weekly Study Time", title = "G3 vs Study Time") +
  facet_wrap(~school) +
  theme_bw() +
  theme(legend.position = "none")

p16 <- ggplot(data, aes(x = studytime.y, y = G3.y)) +
  geom_point(aes(colour = studytime.y)) +
  geom_boxplot(aes(group = studytime.y, colour = studytime.y)) +
  scale_y_continuous(name = "Final Grade of Portuguese") +
  labs(x = "Weekly Study Time") +
  facet_wrap(~school) +
  theme_bw() +
  theme(legend.position = "none")

multiplot(p15, p16, cols = 1)
# Are absences related to student performance?
# One boxplot per absence count, coloured on a red-to-yellow gradient and
# zoomed to 0-30 absences with coord_cartesian().
p17 <- ggplot(data, aes(x = absences.x, y = G3.x)) +
  geom_point(aes(colour = absences.x)) +
  geom_boxplot(aes(group = absences.x, colour = absences.x)) +
  labs(x = "Absences", y = "Final Grade of Math",
       title = "G3 vs Absences") +
  scale_colour_gradient(low = "red", high = "yellow", name = "absences") +
  coord_cartesian(xlim = c(0, 30)) +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

p18 <- ggplot(data, aes(x = absences.y, y = G3.y)) +
  geom_point(aes(colour = absences.y)) +
  geom_boxplot(aes(group = absences.y, colour = absences.y)) +
  labs(x = "Absences", y = "Final Grade of Portuguese") +
  scale_colour_gradient(low = "red", high = "yellow", name = "absences") +
  coord_cartesian(xlim = c(0, 30)) +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

multiplot(p17, p18, cols = 1)
# To make the absences plots easier to read, bucket the number of school
# absences into four categories: 0-9, 10-19, 20-29 and 30+.
#
# cut() classifies every row at once. Its intervals are right-closed, so for
# whole-number counts the buckets are exactly 0-9, 10-19, 20-29 and 30 or
# more. A missing count yields NA instead of stopping the script.
absence_breaks <- c(-Inf, 9, 19, 29, Inf)
absence_labels <- c("0-9", "10-19", "20-29", "30+")

# as.character() stores the group as plain strings rather than a factor.
data$absences.xgroup <- as.character(
  cut(data$absences.x, breaks = absence_breaks, labels = absence_labels)
)
data$absences.ygroup <- as.character(
  cut(data$absences.y, breaks = absence_breaks, labels = absence_labels)
)
# Final grade per absence group, one facet row per school; math (p19) on the
# left and Portuguese (p20) on the right.
p19 <- ggplot(data, aes(x = absences.xgroup, y = G3.x)) +
  geom_point(aes(colour = absences.xgroup)) +
  geom_boxplot(aes(group = absences.xgroup, colour = absences.xgroup)) +
  labs(x = "Absences Group", y = "Final Grade of Math",
       title = "G3 vs Absences Group") +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

# p20 plots G3.y, the Portuguese grade, so its y axis is labelled
# accordingly.
p20 <- ggplot(data, aes(x = absences.ygroup, y = G3.y)) +
  geom_point(aes(colour = absences.ygroup)) +
  geom_boxplot(aes(group = absences.ygroup, colour = absences.ygroup)) +
  labs(x = "Absences Group", y = "Final Grade of Portuguese",
       title = "G3 vs Absences Group") +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

multiplot(p19, p20, cols = 2)
# Do the parents' education and job influence student performance?
# The plots below look at the mother's job (Mjob) and education (Medu),
# with one facet row per school.

# Final grade by mother's job: math (p21) and Portuguese (p22).
p21 <- ggplot(data, aes(x = Mjob, y = G3.x)) +
  geom_point(aes(colour = Mjob)) +
  geom_boxplot(aes(group = Mjob, colour = Mjob)) +
  labs(x = "Mother Job", y = "Final Grade of Math",
       title = "G3 vs Mother Job") +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

p22 <- ggplot(data, aes(x = Mjob, y = G3.y)) +
  geom_point(aes(colour = Mjob)) +
  geom_boxplot(aes(group = Mjob, colour = Mjob)) +
  labs(x = "Mother Job", y = "Final Grade of Portuguese") +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

# Final grade by mother's education: math (p23) and Portuguese (p24).
p23 <- ggplot(data, aes(x = Medu, y = G3.x)) +
  geom_point(aes(colour = Medu)) +
  geom_boxplot(aes(group = Medu, colour = Medu)) +
  labs(x = "Mother Education", y = "Final Grade of Math",
       title = "G3 vs Mother Education") +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

p24 <- ggplot(data, aes(x = Medu, y = G3.y)) +
  geom_point(aes(colour = Medu)) +
  geom_boxplot(aes(group = Medu, colour = Medu)) +
  labs(x = "Mother Education", y = "Final Grade of Portuguese") +
  facet_grid(school ~ .) +
  theme_bw() +
  theme(legend.position = "none")

# Share of each education level within every job; Medu is converted to
# character so it is used as a discrete fill.
data$Meducha <- as.character(data$Medu)
p25 <- ggplot(data, aes(x = Mjob)) +
  geom_bar(aes(fill = Meducha), position = "fill") +
  labs(x = "Mother Job", title = "Mjob vs Medu") +
  theme_bw()

multiplot(p21, p22, p25, p23, p24, cols = 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment