Skip to content

Instantly share code, notes, and snippets.

@yunho0130
Last active June 12, 2017 08:11
Show Gist options
  • Save yunho0130/e2e034e6cf0698576ebc to your computer and use it in GitHub Desktop.
Save yunho0130/e2e034e6cf0698576ebc to your computer and use it in GitHub Desktop.
# 2016-03-28 Yunho Maeng
# Assignment 1 : Wine Quality
# If the packages below are not installed yet, uncomment and run these lines:
# install.packages("ggplot2");
# install.packages("dplyr");
# install.packages("gridExtra")
# install.packages("GGally")
# install.packages("reshape2")
# install.packages("doBy")
# graph
library(ggplot2);
library(dplyr);
library (gridExtra);
library(reshape2);
library(doBy);
# 1. Load the red-wine and white-wine data into R.
#    Both files have a header row and use ";" as the field separator.
redWineData <- read.csv(
  file = "/Users/Yunho/VM Ware/R/winequality/winequality-red.csv",
  header = TRUE,
  sep = ";"
)
whiteWineData <- read.csv(
  file = "/Users/Yunho/VM Ware/R/winequality/winequality-white.csv",
  header = TRUE,
  sep = ";"
)

# Add a column named "type" that marks each row as "Red" or "White", then
# stack both tables into one data frame called totalWineData.
# The column is created by name rather than renamed by position, so the
# result does not depend on how many columns the input files contain.
redTemp2 <- mutate(redWineData, type = "Red")
whiteTemp2 <- mutate(whiteWineData, type = "White")
totalWineData <- rbind(redTemp2, whiteTemp2)
# 2. Leaving quality aside, look for the variables that best show how red
#    wine and white wine differ.

# Two-sample t-test (t.test() defaults), red vs. white, for every variable.
t.test(x = redWineData$fixed.acidity, y = whiteWineData$fixed.acidity)
t.test(x = redWineData$volatile.acidity, y = whiteWineData$volatile.acidity)
t.test(x = redWineData$citric.acid, y = whiteWineData$citric.acid)
t.test(x = redWineData$residual.sugar, y = whiteWineData$residual.sugar)
t.test(x = redWineData$chlorides, y = whiteWineData$chlorides)
t.test(x = redWineData$free.sulfur.dioxide, y = whiteWineData$free.sulfur.dioxide)
t.test(x = redWineData$total.sulfur.dioxide, y = whiteWineData$total.sulfur.dioxide)
t.test(x = redWineData$density, y = whiteWineData$density)
t.test(x = redWineData$pH, y = whiteWineData$pH)
t.test(x = redWineData$sulphates, y = whiteWineData$sulphates)
t.test(x = redWineData$alcohol, y = whiteWineData$alcohol)

# The t-tests report a difference for every variable, which makes it hard to
# pick the ones that show the difference "well", so start over from the
# descriptive statistics of each group.
summaryOfRedWine <- summary(object = redWineData)
summaryOfWhiteWine <- summary(object = whiteWineData)
summaryOfRedWine
summaryOfWhiteWine

# The "conspicuous" differences were identified by comparing the mean values
# in the two summaries.
# Answer: volatile.acidity, residual.sugar, free.sulfur.dioxide,
#         total.sulfur.dioxide, sulphates
# 3. The quality variable of the red wines takes values from 3 to 8.
#    Add a qualityGroup variable whose category is Low for 3-4, Mid for 5-6
#    and High for 7-8. Then look for the variables that best show how the
#    wine composition differs between the quality groups.

# Grade the red wines: append the qualityGroup column.
# cut() bins quality in a single pass:
#   (-Inf, 4] -> "Low", (4, 6] -> "Mid", (6, Inf) -> "High".
# as.character() keeps qualityGroup a plain character column instead of a
# factor.
redWineWithGrade <- redWineData
redWineWithGrade$qualityGroup <- as.character(
  cut(
    redWineWithGrade$quality,
    breaks = c(-Inf, 4, 6, Inf),
    labels = c("Low", "Mid", "High")
  )
)

# Composition per quality group: one summary row per qualityGroup value,
# computed with summaryBy()'s default statistic.
groupByRedWineSummary <- summaryBy(
  fixed.acidity + volatile.acidity + citric.acid + residual.sugar +
    chlorides + free.sulfur.dioxide + total.sulfur.dioxide + density +
    pH + sulphates + alcohol ~ qualityGroup,
  data = redWineWithGrade
)

# The summary is small, so export it as CSV for inspection in Excel.
write.csv(
  groupByRedWineSummary,
  "/Users/Yunho/VM Ware/R/winequality/groupByQualitySummary-red.csv",
  row.names = TRUE
)
# Answer: the lower total.sulfur.dioxide, alcohol and fixed.acidity are, the
#         higher the wine quality.
# 4. The quality variable of the white wines takes values from 3 to 9.
#    Add a qualityGroup variable whose category is Low for 3-4, Mid for 5-6
#    and High for 7-9. Then look for the variables that best show how the
#    composition of High red wines differs from that of High white wines.

# Grade the white wines: append the qualityGroup column.
# cut() bins quality in a single pass:
#   (-Inf, 4] -> "Low", (4, 6] -> "Mid", (6, Inf) -> "High".
# as.character() keeps qualityGroup a plain character column instead of a
# factor.
whiteWineWithGrade <- whiteWineData
whiteWineWithGrade$qualityGroup <- as.character(
  cut(
    whiteWineWithGrade$quality,
    breaks = c(-Inf, 4, 6, Inf),
    labels = c("Low", "Mid", "High")
  )
)
whiteWineWithGrade

# Composition per quality group: one summary row per qualityGroup value,
# computed with summaryBy()'s default statistic.
groupByWhiteWineSummary <- summaryBy(
  fixed.acidity + volatile.acidity + citric.acid + residual.sugar +
    chlorides + free.sulfur.dioxide + total.sulfur.dioxide + density +
    pH + sulphates + alcohol ~ qualityGroup,
  data = whiteWineWithGrade
)
groupByWhiteWineSummary

# The summary is small, so export it as CSV for inspection in Excel.
write.csv(
  groupByWhiteWineSummary,
  "/Users/Yunho/VM Ware/R/winequality/groupByQualitySummary-white.csv",
  row.names = TRUE
)
# Answer: the components that best show the difference between high-grade
#         red wine and high-grade white wine are
#         total.sulfur.dioxide.mean and free.sulfur.dioxide.mean.
## 망한 참조 코드들
# ,pH,density
#
# ggplot(
# aes(x=alcohol, y=mean(alcohol)),
# data = totalWineData)+
# geom_bar(aes(color=type),stat='summary',fun.y=mean)+
# ggtitle('Redwine & Whitewine comparison')
# ggplot(
# aes(x=density, sulphates),
# data = totalWineData)
# +
# geom_bar(aes(color=type),stat='summary',fun.y=mean)+
# ggtitle('Redwine & Whitewine comparison2')
#
# q1<-ggplot(aes(x=pH),
# data = subset(totalWineData,type %in% c("White")))+
# geom_histogram(color =I('black'),fill = I('#999999'))+
# ggtitle('pH distribution for White wine')+
# data = subset(totalWineData,type %in% c("Red"))+
# geom_histogram(color =I('black'),fill = I('#999999'))+
# ggtitle('pH distribution for White wine')
#
# 각각을 빼서 차이를 보려고 했지만 데이터 사이즈가 달라서 실패
# betweenRedandWihteWine = redWineData - whiteWineData
# q2<-ggplot(aes(x=free.sulfur.dioxide),
# data = subset(totalWineData,type %in% c("White")))+
# geom_histogram(color =I('black'),fill = I('#099009'))+
# ggtitle('Free SO2 distribution for White wine')
# q3<-ggplot(aes(x=total.sulfur.dioxide),
# data = subset(totalWineData,type %in% c("White")))+
# geom_histogram(color =I('black'),fill = I('#099009'))+
# ggtitle('Total SO2 distribution for White wine')
# q4<-ggplot(aes(x=alcohol),
# data = subset(totalWineData,type %in% c("White")))+
# geom_histogram(color =I('black'),fill = I('#099009'))+
# ggtitle('Alcohol distribution for White wine')
#
# # # grid.arrange(q1,q2,q3,q4,ncol=2)
# export PDF
# install.packages("knitr");
# install.packages("framed")
# library(framed)
# knitr::stitch('/Users/Yunho/VM Ware/R/160328_HW1_wineQuality_yunhomaeng.R')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment