# yunho0130/160418_HW2_wineQuality_yunhomaeng.R

## 160418_HW2_wineQuality_yunhomaeng.R
# 2016-04-18 Yunho Maeng
# Assignment 2 : Wine Quality

# if you didn't install package, you can use below code
# install.packages("ggplot2");
# install.packages("dplyr");
# install.packages("gridExtra")
# install.packages("GGally")
# install.packages("reshape2")
# install.packages("doBy")

# graph
library(ggplot2);
library(dplyr);
library (gridExtra);
library(reshape2);
library(doBy);
library(GGally);
# 1. Download data

# 2. Load Red wine & White wine data to R

# Both files are semicolon-delimited; the header row is read by default
# (header = TRUE is the read.csv default).
redWineData <- read.csv(
  "/Users/Yunho/VM Ware/R/[03-28] HW1. WineQuality/winequality/winequality-red.csv",
  sep = ";"
)
whiteWineData <- read.csv(
  "/Users/Yunho/VM Ware/R/[03-28] HW1. WineQuality/winequality/winequality-white.csv",
  sep = ";"
)

# Tag every row with its wine type ("R" for red, "W" for white) in a new
# "type" column, then stack both data sets into totalWineData.
# The column is named inside mutate() instead of being renamed afterwards by
# the hard-coded position 13, which would hit the wrong column if the CSV
# layout ever changed.
redTemp2 <- mutate(redWineData, type = "R")
whiteTemp2 <- mutate(whiteWineData, type = "W")
totalWineData <- rbind(redTemp2, whiteTemp2)

# 3. quality를 제외한 다른 모든 변수들에 관해서 red wine과 white wine이 어떤 차이가 나는지를 보여줄 수 있는 수치를 보이고, 이를 잘 나타내 줄 수 있는 그래프로 표현하라. (수치와 그래프는 여러가지 이어야 하며, 왜 이런 수치와 그래프를 그렸는지를 설명하여야 함)

### HW 1-3 code start

summaryOfRedWine <- summary(redWineData)
summaryOfWhiteWine <- summary(whiteWineData)

# Two-sample t-test (red vs. white) for every component except quality.
# One loop replaces eleven copy-pasted calls; print() is explicit so the
# results also show up when the script is run through source().
componentNames <- setdiff(names(redWineData), "quality")
for (componentName in componentNames) {
  testResult <- t.test(
    redWineData[[componentName]],
    whiteWineData[[componentName]]
  )
  # Keep the "redWineData$x and whiteWineData$x" label in the printed output.
  testResult$data.name <- sprintf(
    "redWineData$%s and whiteWineData$%s", componentName, componentName
  )
  print(testResult)
}

# The t-tests report a difference for every variable, so they do not help to
# pick the variables that show the difference "well"; start over from the
# descriptive statistics.

print(summaryOfRedWine)
print(summaryOfWhiteWine)

# Checked the summary statistics of each group and picked the "noticeable"
# differences by comparing the mean values.
# Answer: volatile.acidity, residual.sugar,  free.sulfur.dioxide, total.sulfur.dioxide, sulphates

### HW 1-3 code end

### HW 2-3 code start

head(totalWineData)

## pH and alcohol have short names, so use them to try out several plot types
## and pick a suitable one.
# Columns are referenced by bare name inside aes(): writing `df$col` there
# bypasses the `data` argument and yields axis labels like "redWineData$pH".
# Constant colours are passed directly; I() is only needed inside qplot().

# Red wine histograms
p1 <- ggplot(redWineData, aes(x = pH)) +
  geom_histogram(color = "black", fill = "#099009") +
  ggtitle("pH distribution for Red wine")
p2 <- ggplot(redWineData, aes(x = alcohol)) +
  geom_histogram(color = "black", fill = "#099009") +
  ggtitle("Alcohol distribution for Red wine")

grid.arrange(p1, p2)

# White wine histograms
p3 <- ggplot(whiteWineData, aes(x = pH)) +
  geom_histogram(color = "black", fill = "#099009") +
  ggtitle("pH distribution for White wine")
p4 <- ggplot(whiteWineData, aes(x = alcohol)) +
  geom_histogram(color = "black", fill = "#099009") +
  ggtitle("Alcohol distribution for White wine")

grid.arrange(p3, p4)
grid.arrange(p1, p2, p3, p4, ncol = 2)

# ggplot experiments: one red-wine pH base plot rendered with several geoms

gg1 <- ggplot(redWineData, mapping = aes(x = pH))
gg1 + geom_area(fill = "#E64C66", stat = "bin")
gg1 + geom_density(kernel = "gaussian")
gg1 + geom_dotplot(fill = "#E64C66")
gg1 + geom_freqpoly()
gg1 + geom_histogram(fill = "#E64C66", color = "black", binwidth = 0.05)
gg1 + geom_bar(fill = "#E64C66")

# Red-wine histogram with the white-wine frequency polygon drawn on top
gg2 <- gg1 +
  geom_histogram(fill = "#E64C66", color = "black", binwidth = 0.05) +
  geom_freqpoly(mapping = aes(x = pH), data = whiteWineData, color = "#1BBC9B")
print(gg2)
# gg2 is reused as the white-wine base plot; gg4 is its binned area chart
gg2 <- ggplot(whiteWineData, mapping = aes(x = pH))
gg4 <- gg2 + geom_area(fill = "#1BBC9B", stat = "bin")

head(whiteWineData)
head(whiteWineData[1])

# Comparison plots for the 11 components: red wine as a histogram with the
# white-wine frequency polygon overlaid.
# Columns are looked up by name through the .data pronoun; mapping a
# one-column data frame (redWineData[i]) or passing `1` as the mapping of
# geom_histogram() makes ggplot2 fail. The axis label is set with labs()
# because ggplot() silently ignores an `xlab` argument.
componentNames <- head(names(redWineData), 11)
for (i in seq_along(componentNames)) {
  print(i)
  componentName <- componentNames[i]
  gg1 <- ggplot(redWineData, aes(x = .data[[componentName]])) +
    labs(x = componentName)
  # The freqpoly layer inherits the x mapping and evaluates it on whiteWineData.
  gg2 <- gg1 +
    geom_histogram(color = "black") +
    geom_freqpoly(data = whiteWineData, color = "#1BBC9B")
  print(gg2)
}
# Preview the first rows of each component column
for (i in seq_along(componentNames)) {
  print(i)
  print(head(redWineData[i]))
}


# Scatter plots (a single vector is plotted against its index)
plot(whiteWineData$pH)
plot(redWineData$pH)
# The two samples have different sizes, so these are hard to compare at a
# glance.

# Box plot of pH split by wine type
boxplot(pH ~ type, data = totalWineData)

### HW 2-3 code end

# 4. red wine의 quality 변수의 값은 3부터 8까지의 값을 갖는다. 3,4는 Low, 5,6은 Mid, 7,8은 High의 범주 값을 갖는 qualityGroup 변수를 추가하라. 그리고, qualityGroup에 따른 wine 성분의 차이를 잘 나타낼 수 있는 그래프를 그려라. (왜 이런 그래프를 그렸는지를 설명하여야 함)


# Grade the red wines.
# qualityGroup is derived directly from the numeric quality score:
#   3-4 -> "1.Low", 5-6 -> "2.Mid", 7-8 -> "3.High"
# The previous code filled the new column with the literal string "Grade" and
# then ran gsub() for the digits 3..8 on it, so nothing ever matched and no
# wine received a grade. cut() bins the real scores in one step; intervals
# are right-closed: (2,4], (4,6], (6,8].
redWineWithGrade <- mutate(
  redWineData,
  qualityGroup = as.character(
    cut(quality, breaks = c(2, 4, 6, 8), labels = c("1.Low", "2.Mid", "3.High"))
  )
)
head(redWineWithGrade)


### HW 2-4 code start

# Try several geoms for pH by quality group.
# alpha must be numeric (0.4), not the string "0.4".
gg3 <- ggplot(data = redWineWithGrade, aes(x = qualityGroup, y = pH))
gg3 + geom_jitter(color = "#E64C66", alpha = 0.4)
gg3 + geom_boxplot()
gg3 + geom_violin()
gg3 + geom_dotplot(binaxis = "y", stackdir = "center")
# group = 1 lets one regression line span the discrete x categories
gg3 + geom_smooth(aes(group = 1), method = lm)

## One jitter plot per component, generated instead of hand-copied.
## The final grid used to reference k1..k10, which are never defined here.
componentNames <- head(names(redWineData), 11)
qualityJitterPlots <- lapply(componentNames, function(componentName) {
  ggplot(redWineWithGrade, aes(x = qualityGroup, y = .data[[componentName]])) +
    geom_jitter(color = "#E64C66", alpha = 0.4) +
    labs(y = componentName)
})
names(qualityJitterPlots) <- componentNames

# Kept for compatibility: the alcohol plot under its original names
gg3 <- ggplot(data = redWineWithGrade, aes(x = qualityGroup, y = alcohol))
k11 <- gg3 + geom_jitter(color = "#E64C66", alpha = 0.4)
print(k11)

head(redWineWithGrade)

grid.arrange(grobs = qualityJitterPlots, ncol = 4)


### HW 2-4 code end

# 5. white wine은 quality 변수가 3부터 9까지의 값을 갖는다. 3,4는 Low, 5,6은 Mid, 7,8,9는 High의 범주 값을 갖는 qualityGroup 변수를 추가하라. 그리고 red wine의 High 와인과 white wine의 High 와인의 성분의 차이를 잘 나타낼 수 있는 그래프를 그려라. (왜 이런 그래프를 그렸는지를 설명하여야 함)


# Grade the white wines.
# qualityGroup is derived directly from the numeric quality score:
#   3-4 -> "1.Low", 5-6 -> "2.Mid", 7-9 -> "3.High"
# The previous code filled the column with the literal string "Grade", so the
# gsub() chain never matched anything. cut() bins the real scores; intervals
# are right-closed: (2,4], (4,6], (6,9].
whiteWineWithGrade <- mutate(
  whiteWineData,
  qualityGroup = as.character(
    cut(quality, breaks = c(2, 4, 6, 9), labels = c("1.Low", "2.Mid", "3.High"))
  )
)

# Preview only; printing the whole data frame floods the console
head(whiteWineWithGrade)

# Keep only the high-grade wines of each type
rh.sub <- subset(redWineWithGrade, qualityGroup == "3.High")
head(rh.sub)
wh.sub <- subset(whiteWineWithGrade, qualityGroup == "3.High")
head(wh.sub)

## One overlay per component: high-grade red wines (#E64C66) against
## high-grade white wines (#1BBC9B). The plots are generated instead of
## hand-copied; the final grid used to reference r1..r10, which are never
## defined here. alpha must be numeric (0.4), not the string "0.4".
componentNames <- head(names(rh.sub), 11)
highGradePlots <- lapply(componentNames, function(componentName) {
  ggplot(rh.sub, aes(x = qualityGroup, y = .data[[componentName]])) +
    geom_jitter(color = "#E64C66", alpha = 0.4) +
    # Inherits the x/y mapping and evaluates it on the white-wine subset
    geom_jitter(data = wh.sub, color = "#1BBC9B", alpha = 0.4) +
    labs(y = componentName)
})
names(highGradePlots) <- componentNames

# Kept for compatibility: the alcohol overlay under its original names
gg4 <- ggplot(data = rh.sub, aes(x = qualityGroup, y = alcohol))
r11 <- gg4 +
  geom_jitter(color = "#E64C66", alpha = 0.4) +
  geom_jitter(data = wh.sub, aes(x = qualityGroup, y = alcohol), color = "#1BBC9B", alpha = 0.4)
print(r11)

grid.arrange(grobs = highGradePlots, ncol = 4)


## 2016-04-18 망한 참조 코드들

# sqldf test

# Install sqldf only when it is missing, not on every run of the script
if (!requireNamespace("sqldf", quietly = TRUE)) {
  install.packages("sqldf")
}
library(sqldf)

s1 <- sqldf::sqldf("SELECT * FROM totalWineData WHERE type='R'", stringsAsFactors = FALSE)
head(s1)
s2 <- sqldf::sqldf("SELECT * FROM totalWineData WHERE type='W'", stringsAsFactors = FALSE)
head(s2)
# Author's notes: subqueries seem to work only in a limited way (not the way
# I tried to use them), and s1.pH-style access to the results above does not
# work either.
# The original statement was a syntax error (trailing comma before FROM and
# undefined R./W. table aliases) and misspelled stringsAsFactors. A subquery
# in the SELECT list must yield a single value, so each type's pH is
# aggregated with AVG() and returned as its own column.
s3 <- sqldf::sqldf(
  "SELECT (SELECT AVG(pH) FROM totalWineData WHERE type='R') AS red_pH, (SELECT AVG(pH) FROM totalWineData WHERE type='W') AS white_pH",
  stringsAsFactors = FALSE
)

# sqldf 테스트 결과 : 서브쿼리 부분 지원 및 조인 명령어 제약으로 사용 보류
	# 2016-04-18 Yunho Maeng
	# Assignment 2 : Wine Quality

	# if you didn't install package, you can use below code
	# install.packages("ggplot2");
	# install.packages("dplyr");
	# install.packages("gridExtra")
	# install.packages("GGally")
	# install.packages("reshape2")
	# install.packages("doBy")

	# graph
	library(ggplot2);
	library(dplyr);
	library (gridExtra);
	library(reshape2);
	library(doBy);
	library(GGally);
	# 1. Download data

	# 2. Load Red wine & White wine data to R

	# Both files are semicolon-delimited; the header row is read by default
	# (header = TRUE is the read.csv default).
	redWineData <- read.csv(
	  "/Users/Yunho/VM Ware/R/[03-28] HW1. WineQuality/winequality/winequality-red.csv",
	  sep = ";"
	)
	whiteWineData <- read.csv(
	  "/Users/Yunho/VM Ware/R/[03-28] HW1. WineQuality/winequality/winequality-white.csv",
	  sep = ";"
	)

	# Tag every row with its wine type ("R" for red, "W" for white) in a new
	# "type" column, then stack both data sets into totalWineData.
	# The column is named inside mutate() instead of being renamed afterwards by
	# the hard-coded position 13, which would hit the wrong column if the CSV
	# layout ever changed.
	redTemp2 <- mutate(redWineData, type = "R")
	whiteTemp2 <- mutate(whiteWineData, type = "W")
	totalWineData <- rbind(redTemp2, whiteTemp2)

	# 3. quality를 제외한 다른 모든 변수들에 관해서 red wine과 white wine이 어떤 차이가 나는지를 보여줄 수 있는 수치를 보이고, 이를 잘 나타내 줄 수 있는 그래프로 표현하라. (수치와 그래프는 여러가지 이어야 하며, 왜 이런 수치와 그래프를 그렸는지를 설명하여야 함)

	### HW 1-3 code start

	summaryOfRedWine <- summary(redWineData)
	summaryOfWhiteWine <- summary(whiteWineData)

	# Two-sample t-test (red vs. white) for every component except quality.
	# One loop replaces eleven copy-pasted calls; print() is explicit so the
	# results also show up when the script is run through source().
	componentNames <- setdiff(names(redWineData), "quality")
	for (componentName in componentNames) {
	  testResult <- t.test(
	    redWineData[[componentName]],
	    whiteWineData[[componentName]]
	  )
	  # Keep the "redWineData$x and whiteWineData$x" label in the printed output.
	  testResult$data.name <- sprintf(
	    "redWineData$%s and whiteWineData$%s", componentName, componentName
	  )
	  print(testResult)
	}

	# The t-tests report a difference for every variable, so they do not help to
	# pick the variables that show the difference "well"; start over from the
	# descriptive statistics.

	print(summaryOfRedWine)
	print(summaryOfWhiteWine)

	# Checked the summary statistics of each group and picked the "noticeable"
	# differences by comparing the mean values.
	# Answer: volatile.acidity, residual.sugar, free.sulfur.dioxide, total.sulfur.dioxide, sulphates

	### HW 1-3 code end

	### HW 2-3 code start

	head(totalWineData)

	## pH and alcohol have short names, so use them to try out several plot types
	## and pick a suitable one.
	# Columns are referenced by bare name inside aes(): writing `df$col` there
	# bypasses the `data` argument and yields axis labels like "redWineData$pH".
	# Constant colours are passed directly; I() is only needed inside qplot().

	# Red wine histograms
	p1 <- ggplot(redWineData, aes(x = pH)) +
	  geom_histogram(color = "black", fill = "#099009") +
	  ggtitle("pH distribution for Red wine")
	p2 <- ggplot(redWineData, aes(x = alcohol)) +
	  geom_histogram(color = "black", fill = "#099009") +
	  ggtitle("Alcohol distribution for Red wine")

	grid.arrange(p1, p2)

	# White wine histograms
	p3 <- ggplot(whiteWineData, aes(x = pH)) +
	  geom_histogram(color = "black", fill = "#099009") +
	  ggtitle("pH distribution for White wine")
	p4 <- ggplot(whiteWineData, aes(x = alcohol)) +
	  geom_histogram(color = "black", fill = "#099009") +
	  ggtitle("Alcohol distribution for White wine")

	grid.arrange(p3, p4)
	grid.arrange(p1, p2, p3, p4, ncol = 2)

	# ggplot experiments: one red-wine pH base plot rendered with several geoms

	gg1 <- ggplot(redWineData, mapping = aes(x = pH))
	gg1 + geom_area(fill = "#E64C66", stat = "bin")
	gg1 + geom_density(kernel = "gaussian")
	gg1 + geom_dotplot(fill = "#E64C66")
	gg1 + geom_freqpoly()
	gg1 + geom_histogram(fill = "#E64C66", color = "black", binwidth = 0.05)
	gg1 + geom_bar(fill = "#E64C66")

	# Red-wine histogram with the white-wine frequency polygon drawn on top
	gg2 <- gg1 +
	  geom_histogram(fill = "#E64C66", color = "black", binwidth = 0.05) +
	  geom_freqpoly(mapping = aes(x = pH), data = whiteWineData, color = "#1BBC9B")
	print(gg2)
	# gg2 is reused as the white-wine base plot; gg4 is its binned area chart
	gg2 <- ggplot(whiteWineData, mapping = aes(x = pH))
	gg4 <- gg2 + geom_area(fill = "#1BBC9B", stat = "bin")

	head(whiteWineData)
	head(whiteWineData[1])

	# Comparison plots for the 11 components: red wine as a histogram with the
	# white-wine frequency polygon overlaid.
	# Columns are looked up by name through the .data pronoun; mapping a
	# one-column data frame (redWineData[i]) or passing `1` as the mapping of
	# geom_histogram() makes ggplot2 fail. The axis label is set with labs()
	# because ggplot() silently ignores an `xlab` argument.
	componentNames <- head(names(redWineData), 11)
	for (i in seq_along(componentNames)) {
	  print(i)
	  componentName <- componentNames[i]
	  gg1 <- ggplot(redWineData, aes(x = .data[[componentName]])) +
	    labs(x = componentName)
	  # The freqpoly layer inherits the x mapping and evaluates it on whiteWineData.
	  gg2 <- gg1 +
	    geom_histogram(color = "black") +
	    geom_freqpoly(data = whiteWineData, color = "#1BBC9B")
	  print(gg2)
	}
	# Preview the first rows of each component column
	for (i in seq_along(componentNames)) {
	  print(i)
	  print(head(redWineData[i]))
	}



	# Scatter plots (a single vector is plotted against its index)
	plot(whiteWineData$pH)
	plot(redWineData$pH)
	# The two samples have different sizes, so these are hard to compare at a
	# glance.

	# Box plot of pH split by wine type
	boxplot(pH ~ type, data = totalWineData)

	### HW 2-3 code end

	# 4. red wine의 quality 변수의 값은 3부터 8까지의 값을 갖는다. 3,4는 Low, 5,6은 Mid, 7,8은 High의 범주 값을 갖는 qualityGroup 변수를 추가하라. 그리고, qualityGroup에 따른 wine 성분의 차이를 잘 나타낼 수 있는 그래프를 그려라. (왜 이런 그래프를 그렸는지를 설명하여야 함)



	# Grade the red wines.
	# qualityGroup is derived directly from the numeric quality score:
	#   3-4 -> "1.Low", 5-6 -> "2.Mid", 7-8 -> "3.High"
	# The previous code filled the new column with the literal string "Grade" and
	# then ran gsub() for the digits 3..8 on it, so nothing ever matched and no
	# wine received a grade. cut() bins the real scores in one step; intervals
	# are right-closed: (2,4], (4,6], (6,8].
	redWineWithGrade <- mutate(
	  redWineData,
	  qualityGroup = as.character(
	    cut(quality, breaks = c(2, 4, 6, 8), labels = c("1.Low", "2.Mid", "3.High"))
	  )
	)
	head(redWineWithGrade)


	### HW 2-4 code start

	# Try several geoms for pH by quality group.
	# alpha must be numeric (0.4), not the string "0.4".
	gg3 <- ggplot(data = redWineWithGrade, aes(x = qualityGroup, y = pH))
	gg3 + geom_jitter(color = "#E64C66", alpha = 0.4)
	gg3 + geom_boxplot()
	gg3 + geom_violin()
	gg3 + geom_dotplot(binaxis = "y", stackdir = "center")
	# group = 1 lets one regression line span the discrete x categories
	gg3 + geom_smooth(aes(group = 1), method = lm)

	## One jitter plot per component, generated instead of hand-copied.
	## The final grid used to reference k1..k10, which are never defined here.
	componentNames <- head(names(redWineData), 11)
	qualityJitterPlots <- lapply(componentNames, function(componentName) {
	  ggplot(redWineWithGrade, aes(x = qualityGroup, y = .data[[componentName]])) +
	    geom_jitter(color = "#E64C66", alpha = 0.4) +
	    labs(y = componentName)
	})
	names(qualityJitterPlots) <- componentNames

	# Kept for compatibility: the alcohol plot under its original names
	gg3 <- ggplot(data = redWineWithGrade, aes(x = qualityGroup, y = alcohol))
	k11 <- gg3 + geom_jitter(color = "#E64C66", alpha = 0.4)
	print(k11)

	head(redWineWithGrade)

	grid.arrange(grobs = qualityJitterPlots, ncol = 4)


	### HW 2-4 code end

	# 5. white wine은 quality 변수가 3부터 9까지의 값을 갖는다. 3,4는 Low, 5,6은 Mid, 7,8,9는 High의 범주 값을 갖는 qualityGroup 변수를 추가하라. 그리고 red wine의 High 와인과 white wine의 High 와인의 성분의 차이를 잘 나타낼 수 있는 그래프를 그려라. (왜 이런 그래프를 그렸는지를 설명하여야 함)


	# Grade the white wines.
	# qualityGroup is derived directly from the numeric quality score:
	#   3-4 -> "1.Low", 5-6 -> "2.Mid", 7-9 -> "3.High"
	# The previous code filled the column with the literal string "Grade", so the
	# gsub() chain never matched anything. cut() bins the real scores; intervals
	# are right-closed: (2,4], (4,6], (6,9].
	whiteWineWithGrade <- mutate(
	  whiteWineData,
	  qualityGroup = as.character(
	    cut(quality, breaks = c(2, 4, 6, 9), labels = c("1.Low", "2.Mid", "3.High"))
	  )
	)

	# Preview only; printing the whole data frame floods the console
	head(whiteWineWithGrade)

	# Keep only the high-grade wines of each type
	rh.sub <- subset(redWineWithGrade, qualityGroup == "3.High")
	head(rh.sub)
	wh.sub <- subset(whiteWineWithGrade, qualityGroup == "3.High")
	head(wh.sub)

	## One overlay per component: high-grade red wines (#E64C66) against
	## high-grade white wines (#1BBC9B). The plots are generated instead of
	## hand-copied; the final grid used to reference r1..r10, which are never
	## defined here. alpha must be numeric (0.4), not the string "0.4".
	componentNames <- head(names(rh.sub), 11)
	highGradePlots <- lapply(componentNames, function(componentName) {
	  ggplot(rh.sub, aes(x = qualityGroup, y = .data[[componentName]])) +
	    geom_jitter(color = "#E64C66", alpha = 0.4) +
	    # Inherits the x/y mapping and evaluates it on the white-wine subset
	    geom_jitter(data = wh.sub, color = "#1BBC9B", alpha = 0.4) +
	    labs(y = componentName)
	})
	names(highGradePlots) <- componentNames

	# Kept for compatibility: the alcohol overlay under its original names
	gg4 <- ggplot(data = rh.sub, aes(x = qualityGroup, y = alcohol))
	r11 <- gg4 +
	  geom_jitter(color = "#E64C66", alpha = 0.4) +
	  geom_jitter(data = wh.sub, aes(x = qualityGroup, y = alcohol), color = "#1BBC9B", alpha = 0.4)
	print(r11)

	grid.arrange(grobs = highGradePlots, ncol = 4)



	## 2016-04-18 망한 참조 코드들

	# sqldf test

	# Install sqldf only when it is missing, not on every run of the script
	if (!requireNamespace("sqldf", quietly = TRUE)) {
	  install.packages("sqldf")
	}
	library(sqldf)

	s1 <- sqldf::sqldf("SELECT * FROM totalWineData WHERE type='R'", stringsAsFactors = FALSE)
	head(s1)
	s2 <- sqldf::sqldf("SELECT * FROM totalWineData WHERE type='W'", stringsAsFactors = FALSE)
	head(s2)
	# Author's notes: subqueries seem to work only in a limited way (not the way
	# I tried to use them), and s1.pH-style access to the results above does not
	# work either.
	# The original statement was a syntax error (trailing comma before FROM and
	# undefined R./W. table aliases) and misspelled stringsAsFactors. A subquery
	# in the SELECT list must yield a single value, so each type's pH is
	# aggregated with AVG() and returned as its own column.
	s3 <- sqldf::sqldf(
	  "SELECT (SELECT AVG(pH) FROM totalWineData WHERE type='R') AS red_pH, (SELECT AVG(pH) FROM totalWineData WHERE type='W') AS white_pH",
	  stringsAsFactors = FALSE
	)

	# sqldf 테스트 결과 : 서브쿼리 부분 지원 및 조인 명령어 제약으로 사용 보류