jurgen-dejager/Rcode.Rmd

## Rcode.Rmd
---
title: "Project3"
author: "Jurgen de Jager"
date: "August 12, 2016"
output: html_document
---

```{r, echo = FALSE}
#loading packages
library(randomForest)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(rattle)
library(party)
library(RColorBrewer)
library(MASS)
library(GGally)
library(dplyr)
library(d3heatmap)
library(doBy)
library(stargazer)
```

```{r, echo = FALSE}
# Data prep: load the player table, trim it to the usable rows, and split it
# at random into a training and a test set.
nba <- read.csv("nba.csv")
nba$Position <- as.character(nba$Position)
# Diagnostic: indices of rows whose Position label is longer than 3 characters.
# NOTE(review): presumably these are the rows dropped below -- confirm.
which(nchar(nba$Position) > 3)
nba <- nba[1:342, ]
nba <- nba[-c(159, 260, 324, 327), ]
# Fixed seed so the train/test split, and every result derived from it, is
# the same on each knit.
set.seed(2016)
# One uniform draw per remaining row. Sizing by nrow() instead of a literal
# 338 keeps the assignment valid if the row filters above ever change.
nba$random <- runif(nrow(nba), 0, 1)
# Rows with a draw below 0.4 train the models; the rest are held out.
nba.train <- nba[nba$random < .4, ]
nba.test <- nba[nba$random >= .4, ]
```

# VIZ
```{r}
# Pairwise plot matrix of the selected stat columns, coloured by position.
nba.temp <- nba[, c("Position", "TRB", "AST", "STL", "BLK", "PER", "TOV",
                    "USG", "TS")]
ggpairs(nba.temp, aes(col = Position))

# Per-position density of individual stats. alpha is a constant, so it is set
# outside aes(); inside aes() it is treated as a mapped variable and adds a
# meaningless "alpha" legend to every plot.
# PER (the original note called this "points")
ggplot(nba, aes(x = PER)) + geom_density(aes(fill = Position), alpha = 0.2)
# TS (true shooting score)
ggplot(nba, aes(x = TS)) + geom_density(aes(fill = Position), alpha = 0.2)
# TRB (the original note called this "Offensive Rebounds")
ggplot(nba, aes(x = TRB)) + geom_density(aes(fill = Position), alpha = 0.2)
# DRB (defensive rebounds)
ggplot(nba, aes(x = DRB)) + geom_density(aes(fill = Position), alpha = 0.2)
# STL (steals)
ggplot(nba, aes(x = STL)) + geom_density(aes(fill = Position), alpha = 0.2)


# Heatmap by position: column means within each Position group.
viz.data <- nba[, c(1:2, 6:21)]
vd <- viz.data %>%
  group_by(Position) %>%
  summarise_each(funs(mean))
name <- vd$Position
# NOTE(review): after summarising, column 1 is Position, so 2:17 keeps the
# mean of source column 1 and drops the last stat column -- confirm intended.
# Converted to a plain data frame because setting row names on a tibble is
# deprecated; the row names become the heatmap's row labels.
vd <- as.data.frame(vd[, 2:17])
row.names(vd) <- name
d3heatmap(vd, scale = "column", dendrogram = "none")


# Heatmap of every player, labelled by player name and clustered by row.
nba3 <- nba[, c("Player", "TRB", "AST", "STL", "BLK", "PER", "TOV", "USG",
                "TS")]
name2 <- nba3$Player
row.names(nba3) <- name2
d3heatmap(nba3[, c(2:9)], scale = "column", yaxis_font_size = "4pt",
          k_row = 5, dendrogram = "row")

```

# Decision Tree - Good Fit
```{r}
# Classification tree with rpart's default controls, fitted on columns 2 and
# 6:21 of the training split.
tree.cols <- c(2, 6:21)
fit <- rpart(Position ~ ., data = nba.train[, tree.cols], method = "class")
# fancyRpartPlot(fit)
# Predicted class for every held-out row.
nba.test$Prediction <- predict(fit, newdata = nba.test, type = "class")
# Actual (rows) against predicted (columns): counts, then row proportions.
confusion <- table(nba.test$Position, nba.test$Prediction)
confusion
prop.table(confusion, 1)
```

# Decision Tree - Over Fit
```{r}
# Same tree as the default fit but with loosened controls (minsplit = 3,
# cp = 0.001), so it grows much deeper.
loose.control <- rpart.control(minsplit = 3, cp = 0.001)
fit <- rpart(
  Position ~ .,
  data = nba.train[, c(2, 6:21)],
  method = "class",
  control = loose.control
)
# Importance score of each variable used by the tree.
plot(fit$variable.importance)
# fancyRpartPlot(fit)
# Predicted class for every held-out row.
nba.test$Prediction <- predict(fit, newdata = nba.test, type = "class")
# Actual (rows) against predicted (columns): counts, then row proportions.
confusion <- table(nba.test$Position, nba.test$Prediction)
confusion
prop.table(confusion, 1)
```

# Random Forest
```{r}
# Random forest classifier for Position, fitted on columns 2 and 6:21 of the
# training split.
rf.train <- nba.train[, c(2, 6:21)]
# Make the response a factor in the data and name it plainly in the formula.
# With `as.factor(Position) ~ .` the dot is expanded against the column names
# and does not recognise Position as the response, so the label itself would
# stay among the predictors.
rf.train$Position <- as.factor(rf.train$Position)
# mtry is the number of predictors sampled at each split, so it is sized from
# the predictors actually in the model rather than from every column of nba.
n.predictors <- ncol(rf.train) - 1
rf.model <- randomForest(Position ~ ., data = rf.train, ntree = 500,
                         mtry = round(sqrt(n.predictors)))
# Mean over the whole err.rate matrix (all tree counts, all of its columns).
mean(rf.model$err.rate)
plot(rf.model)
# Predicting on the test set.
nba.test$pred.pos.rf <- predict(rf.model, nba.test, type = "response")
# Actual (rows) against predicted (columns): counts, then row proportions.
rf.confusion <- table(nba.test$Position, nba.test$pred.pos.rf)
rf.confusion
prop.table(rf.confusion, 1)

# Error curves again, this time with a legend in a narrow panel on the right.
old.mar <- par("mar")
layout(matrix(c(1, 2), nrow = 1), widths = c(4, 1))
par(mar = c(5, 4, 4, 0)) # no margin on the right side
plot(rf.model)
par(mar = c(5, 0, 4, 2)) # no margin on the left side
# Empty plot that only serves as a canvas for the legend.
plot(c(0, 1), type = "n", axes = FALSE, xlab = "", ylab = "")
legend("top", colnames(rf.model$err.rate), col = 1:6, cex = 0.8, fill = 1:6)
# Put the graphics device back so later chunks draw single, full-size plots.
par(mar = old.mar)
layout(1)
```

# Table
```{r}
# Mean of each selected column within every Position group.
ave.table <- summaryBy(. ~ factor(Position), data = nba[, c(2, 6:21)],
                       FUN = mean)
# Labels for the columns of the printed table.
col.labels <- c("ID", "Position", "PER", "TS", "ORB", "DRB", "TRB", "AST",
                "BLK", "TOV", "USG", "ORtg", "DRtg", "STL")
# Print the group means as-is (no summary statistics), minus four columns.
stargazer(
  ave.table[, -c(2, 13, 14, 15)],
  summary = FALSE,
  font.size = "small",
  column.sep.width = "1pt",
  covariate.labels = col.labels
)
# Default stargazer summary table of the full data set.
stargazer(nba)
```

# Finding the best parameters (mtry, with ntree fixed at 500)
```{r}
# Search for the mtry value with the lowest out-of-bag error, using columns
# 6:21 as predictors and Position as the class label.
# The tree count is passed as ntreeTry, tuneRF's own argument name; the
# previous `ntree = 500` only reached it through partial argument matching.
bestmtry <- tuneRF(nba[, c(6:21)], factor(nba$Position), stepFactor = 1.5,
                   improve = 1e-5, ntreeTry = 500)
print(bestmtry)
# Error curves of the forest fitted in the random forest chunk.
plot(rf.model)
```


## statcrunchscraped.py
#!/usr/bin/env python3

import requests
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd
import sys # used to dump the whole dataframe

# Download the StatCrunch data set as plain text, one record per line.
r = requests.get('https://www.statcrunch.com/grabdata.php?dataid=1096769&_=1470977876486') # making GET request
r.raise_for_status() # stop here rather than writing an error page out as CSV

data = r.text.splitlines()

headers = data.pop(0).split() # extracting headers


# Text mode with newline='' is what the csv module expects on Python 3; the
# previous binary mode plus per-field .encode() only worked on Python 2.
with open('statcrunch_dump.csv', 'w', newline='', encoding='utf-8') as write_file: # file to which data will be written
	writer = csv.writer(write_file)
	# Write the column names first so pandas does not take the first record
	# as the header row.
	# NOTE(review): assumes every header name is a single whitespace-free
	# token, so the header has as many fields as each record -- confirm.
	writer.writerow(headers)
	for line in data:
		if not line.strip():
			continue # nothing to parse on a blank line

		# extracting values: a double-quoted name, then whitespace-separated fields
		parts = re.split("\" ", line)
		name = parts[0].strip('"')
		values = parts[1].split() # separate name so the list being iterated is not rebound

		writer.writerow([name] + values) # writing to file


df = pd.read_csv('statcrunch_dump.csv') # inserting in pandas
df.to_csv(sys.stdout) # printing from pandas; to_csv writes to stdout itself
	---
	title: "Project3"
	author: "Jurgen de Jager"
	date: "August 12, 2016"
	output: html_document
	---

	```{r, echo = FALSE}
	#loading packages
	library(randomForest)
	library(ggplot2)
	library(rpart)
	library(rpart.plot)
	library(rattle)
	library(party)
	library(RColorBrewer)
	library(MASS)
	library(GGally)
	library(dplyr)
	library(d3heatmap)
	library(doBy)
	library(stargazer)
	```

	```{r, echo = FALSE}
	# Data prep: load the player table, trim it to the usable rows, and split it
	# at random into a training and a test set.
	nba <- read.csv("nba.csv")
	nba$Position <- as.character(nba$Position)
	# Diagnostic: indices of rows whose Position label is longer than 3 characters.
	# NOTE(review): presumably these are the rows dropped below -- confirm.
	which(nchar(nba$Position) > 3)
	nba <- nba[1:342, ]
	nba <- nba[-c(159, 260, 324, 327), ]
	# Fixed seed so the train/test split, and every result derived from it, is
	# the same on each knit.
	set.seed(2016)
	# One uniform draw per remaining row. Sizing by nrow() instead of a literal
	# 338 keeps the assignment valid if the row filters above ever change.
	nba$random <- runif(nrow(nba), 0, 1)
	# Rows with a draw below 0.4 train the models; the rest are held out.
	nba.train <- nba[nba$random < .4, ]
	nba.test <- nba[nba$random >= .4, ]
	```

	# VIZ
	```{r}
	# Pairwise plot matrix of the selected stat columns, coloured by position.
	nba.temp <- nba[, c("Position", "TRB", "AST", "STL", "BLK", "PER", "TOV",
	                    "USG", "TS")]
	ggpairs(nba.temp, aes(col = Position))

	# Per-position density of individual stats. alpha is a constant, so it is set
	# outside aes(); inside aes() it is treated as a mapped variable and adds a
	# meaningless "alpha" legend to every plot.
	# PER (the original note called this "points")
	ggplot(nba, aes(x = PER)) + geom_density(aes(fill = Position), alpha = 0.2)
	# TS (true shooting score)
	ggplot(nba, aes(x = TS)) + geom_density(aes(fill = Position), alpha = 0.2)
	# TRB (the original note called this "Offensive Rebounds")
	ggplot(nba, aes(x = TRB)) + geom_density(aes(fill = Position), alpha = 0.2)
	# DRB (defensive rebounds)
	ggplot(nba, aes(x = DRB)) + geom_density(aes(fill = Position), alpha = 0.2)
	# STL (steals)
	ggplot(nba, aes(x = STL)) + geom_density(aes(fill = Position), alpha = 0.2)


	# Heatmap by position: column means within each Position group.
	viz.data <- nba[, c(1:2, 6:21)]
	vd <- viz.data %>%
	  group_by(Position) %>%
	  summarise_each(funs(mean))
	name <- vd$Position
	# NOTE(review): after summarising, column 1 is Position, so 2:17 keeps the
	# mean of source column 1 and drops the last stat column -- confirm intended.
	# Converted to a plain data frame because setting row names on a tibble is
	# deprecated; the row names become the heatmap's row labels.
	vd <- as.data.frame(vd[, 2:17])
	row.names(vd) <- name
	d3heatmap(vd, scale = "column", dendrogram = "none")


	# Heatmap of every player, labelled by player name and clustered by row.
	nba3 <- nba[, c("Player", "TRB", "AST", "STL", "BLK", "PER", "TOV", "USG",
	                "TS")]
	name2 <- nba3$Player
	row.names(nba3) <- name2
	d3heatmap(nba3[, c(2:9)], scale = "column", yaxis_font_size = "4pt",
	          k_row = 5, dendrogram = "row")

	```

	#Decision Tree - Good Fit
	```{r}
	# Classification tree with rpart's default controls, fitted on columns 2 and
	# 6:21 of the training split.
	tree.cols <- c(2, 6:21)
	fit <- rpart(Position ~ ., data = nba.train[, tree.cols], method = "class")
	# fancyRpartPlot(fit)
	# Predicted class for every held-out row.
	nba.test$Prediction <- predict(fit, newdata = nba.test, type = "class")
	# Actual (rows) against predicted (columns): counts, then row proportions.
	confusion <- table(nba.test$Position, nba.test$Prediction)
	confusion
	prop.table(confusion, 1)
	```

	#Decision Tree - Over Fit
	```{r}
	# Same tree as the default fit but with loosened controls (minsplit = 3,
	# cp = 0.001), so it grows much deeper.
	loose.control <- rpart.control(minsplit = 3, cp = 0.001)
	fit <- rpart(
	  Position ~ .,
	  data = nba.train[, c(2, 6:21)],
	  method = "class",
	  control = loose.control
	)
	# Importance score of each variable used by the tree.
	plot(fit$variable.importance)
	# fancyRpartPlot(fit)
	# Predicted class for every held-out row.
	nba.test$Prediction <- predict(fit, newdata = nba.test, type = "class")
	# Actual (rows) against predicted (columns): counts, then row proportions.
	confusion <- table(nba.test$Position, nba.test$Prediction)
	confusion
	prop.table(confusion, 1)
	```

	#RF
	```{r}
	# Random forest classifier for Position, fitted on columns 2 and 6:21 of the
	# training split.
	rf.train <- nba.train[, c(2, 6:21)]
	# Make the response a factor in the data and name it plainly in the formula.
	# With `as.factor(Position) ~ .` the dot is expanded against the column names
	# and does not recognise Position as the response, so the label itself would
	# stay among the predictors.
	rf.train$Position <- as.factor(rf.train$Position)
	# mtry is the number of predictors sampled at each split, so it is sized from
	# the predictors actually in the model rather than from every column of nba.
	n.predictors <- ncol(rf.train) - 1
	rf.model <- randomForest(Position ~ ., data = rf.train, ntree = 500,
	                         mtry = round(sqrt(n.predictors)))
	# Mean over the whole err.rate matrix (all tree counts, all of its columns).
	mean(rf.model$err.rate)
	plot(rf.model)
	# Predicting on the test set.
	nba.test$pred.pos.rf <- predict(rf.model, nba.test, type = "response")
	# Actual (rows) against predicted (columns): counts, then row proportions.
	rf.confusion <- table(nba.test$Position, nba.test$pred.pos.rf)
	rf.confusion
	prop.table(rf.confusion, 1)

	# Error curves again, this time with a legend in a narrow panel on the right.
	old.mar <- par("mar")
	layout(matrix(c(1, 2), nrow = 1), widths = c(4, 1))
	par(mar = c(5, 4, 4, 0)) # no margin on the right side
	plot(rf.model)
	par(mar = c(5, 0, 4, 2)) # no margin on the left side
	# Empty plot that only serves as a canvas for the legend.
	plot(c(0, 1), type = "n", axes = FALSE, xlab = "", ylab = "")
	legend("top", colnames(rf.model$err.rate), col = 1:6, cex = 0.8, fill = 1:6)
	# Put the graphics device back so later chunks draw single, full-size plots.
	par(mar = old.mar)
	layout(1)
	```

	#table
	```{r}
	# Mean of each selected column within every Position group.
	ave.table <- summaryBy(. ~ factor(Position), data = nba[, c(2, 6:21)],
	                       FUN = mean)
	# Labels for the columns of the printed table.
	col.labels <- c("ID", "Position", "PER", "TS", "ORB", "DRB", "TRB", "AST",
	                "BLK", "TOV", "USG", "ORtg", "DRtg", "STL")
	# Print the group means as-is (no summary statistics), minus four columns.
	stargazer(
	  ave.table[, -c(2, 13, 14, 15)],
	  summary = FALSE,
	  font.size = "small",
	  column.sep.width = "1pt",
	  covariate.labels = col.labels
	)
	# Default stargazer summary table of the full data set.
	stargazer(nba)
	```

	#finding best parameters(ntree and mtry)
	```{r}
	# Search for the mtry value with the lowest out-of-bag error, using columns
	# 6:21 as predictors and Position as the class label.
	# The tree count is passed as ntreeTry, tuneRF's own argument name; the
	# previous `ntree = 500` only reached it through partial argument matching.
	bestmtry <- tuneRF(nba[, c(6:21)], factor(nba$Position), stepFactor = 1.5,
	                   improve = 1e-5, ntreeTry = 500)
	print(bestmtry)
	# Error curves of the forest fitted in the random forest chunk.
	plot(rf.model)
	```
	#!/usr/bin/env python3

	import requests
	from bs4 import BeautifulSoup
	import csv
	import re
	import pandas as pd
	import sys # used to dump the whole dataframe

	# Download the StatCrunch data set as plain text, one record per line.
	r = requests.get('https://www.statcrunch.com/grabdata.php?dataid=1096769&_=1470977876486') # making GET request
	r.raise_for_status() # stop here rather than writing an error page out as CSV

	data = r.text.splitlines()

	headers = data.pop(0).split() # extracting headers


	# Text mode with newline='' is what the csv module expects on Python 3; the
	# previous binary mode plus per-field .encode() only worked on Python 2.
	with open('statcrunch_dump.csv', 'w', newline='', encoding='utf-8') as write_file: # file to which data will be written
		writer = csv.writer(write_file)
		# Write the column names first so pandas does not take the first record
		# as the header row.
		# NOTE(review): assumes every header name is a single whitespace-free
		# token, so the header has as many fields as each record -- confirm.
		writer.writerow(headers)
		for line in data:
			if not line.strip():
				continue # nothing to parse on a blank line

			# extracting values: a double-quoted name, then whitespace-separated fields
			parts = re.split("\" ", line)
			name = parts[0].strip('"')
			values = parts[1].split() # separate name so the list being iterated is not rebound

			writer.writerow([name] + values) # writing to file


	df = pd.read_csv('statcrunch_dump.csv') # inserting in pandas
	df.to_csv(sys.stdout) # printing from pandas; to_csv writes to stdout itself