Skip to content

Instantly share code, notes, and snippets.

@jurgen-dejager jurgen-dejager/Rcode.Rmd

Last active Aug 16, 2016
Embed
What would you like to do?
---
title: "Project3"
author: "Jurgen de Jager"
date: "August 12, 2016"
output: html_document
---
```{r, echo = FALSE}
#loading packages
library(randomForest)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(rattle)
library(party)
library(RColorBrewer)
library(MASS)
library(GGally)
library(dplyr)
library(d3heatmap)
library(doBy)
library(stargazer)
```
```{r, echo = FALSE}
# Data prep: load the player table, drop unwanted rows, and split the rest
# into a training set and a test set.
nba <- read.csv("nba.csv")
nba$Position <- as.character(nba$Position)
# List the rows whose position label is longer than three characters
which(nchar(nba$Position) > 3)
# Keep only the first 342 rows of the file
nba <- nba[1:342, ]
# NOTE(review): these indices are presumably the rows reported by which()
# above -- confirm against the current nba.csv before relying on them
nba <- nba[-c(159, 260, 324, 327), ]
# Seed the generator so the train/test split is the same on every knit
set.seed(2016)
# One uniform draw per remaining row; sized from the data rather than a
# hard-coded count so the split still works if the row filter changes
nba$random <- runif(nrow(nba), 0, 1)
# Draws below 0.4 go to training, the rest to test
nba.train <- nba[nba$random < .4, ]
nba.test <- nba[nba$random >= .4, ]
```
# VIZ
```{r}
# Pairwise plots of the selected metrics, coloured by position
nba.temp <- nba[, c("Position", "TRB", "AST", "STL", "BLK", "PER", "TOV", "USG", "TS")]
ggpairs(nba.temp, aes(col = Position))

# Density of individual metrics, one filled curve per position.
# alpha is a constant, so it is set outside aes(); inside aes() it is treated
# as a mapped variable and adds a meaningless "alpha" legend entry.
# PER
ggplot(nba, aes(x = PER)) + geom_density(aes(fill = Position), alpha = 0.2)
# TS
ggplot(nba, aes(x = TS)) + geom_density(aes(fill = Position), alpha = 0.2)
# TRB
ggplot(nba, aes(x = TRB)) + geom_density(aes(fill = Position), alpha = 0.2)
# DRB
ggplot(nba, aes(x = DRB)) + geom_density(aes(fill = Position), alpha = 0.2)
# STL
ggplot(nba, aes(x = STL)) + geom_density(aes(fill = Position), alpha = 0.2)

# Heatmap of per-position column means
# NOTE(review): column 1 is part of this selection and is kept by the 2:17
# slice below, while the last averaged column is dropped by it -- confirm
# that this is the intended set of columns
viz.data <- nba[, c(1:2, 6:21)]
# NOTE(review): summarise_each()/funs() are deprecated in current dplyr;
# kept here because the installed dplyr version is not visible from this file
vd <- viz.data %>%
  group_by(Position) %>%
  summarise_each(funs(mean))
name <- vd$Position
# Convert to a plain data frame first: setting row names on a tibble is
# deprecated, and d3heatmap takes its row labels from the row names
vd <- as.data.frame(vd[, 2:17])
row.names(vd) <- name
d3heatmap(vd, scale = "column", dendrogram = "none")

# Heatmap of every player, rows clustered into five groups
nba3 <- nba[, c("Player", "TRB", "AST", "STL", "BLK", "PER", "TOV", "USG", "TS")]
name2 <- nba3$Player
row.names(nba3) <- name2
d3heatmap(nba3[, 2:9], scale = "column", yaxis_font_size = "4pt", k_row = 5, dendrogram = "row")
```
# Decision Tree - Good Fit
```{r}
# Classification tree with rpart's default control settings, fitted on the
# training rows (column 2 plus columns 6 to 21)
fit <- rpart(
  Position ~ .,
  data = nba.train[, c(2, 6:21)],
  method = "class"
)
# fancyRpartPlot(fit)

# Predicted class for every test row
nba.test$Prediction <- predict(fit, newdata = nba.test, type = "class")

# Cross-tabulate the Position column against the predictions once, then show
# the counts followed by the row-wise proportions
tree.confusion <- table(nba.test$Position, nba.test$Prediction)
tree.confusion
prop.table(tree.confusion, 1)
```
# Decision Tree - Overfit
```{r}
# Same tree as before, but with a very small minimum split size and
# complexity parameter so that it grows much deeper
deep.control <- rpart.control(minsplit = 3, cp = 0.001)
fit <- rpart(
  Position ~ .,
  data = nba.train[, c(2, 6:21)],
  method = "class",
  control = deep.control
)
# Variable importance scores stored on the fitted tree
plot(fit$variable.importance)
# fancyRpartPlot(fit)

# Predicted class for every test row
nba.test$Prediction <- predict(fit, newdata = nba.test, type = "class")

# Counts first, then row-wise proportions, from a single cross-tabulation
deep.confusion <- table(nba.test$Position, nba.test$Prediction)
deep.confusion
prop.table(deep.confusion, 1)
```
# Random Forest
```{r}
# Random forest on the same columns used for the decision trees
rf.cols <- c(2, 6:21)
# mtry follows the sqrt(p) rule with p = number of predictors actually passed
# to the model (the response column is excluded), not the width of the whole
# nba table
n.predictors <- length(rf.cols) - 1
rf.model <- randomForest(
  as.factor(Position) ~ .,
  data = nba.train[, rf.cols],
  ntree = 500,
  mtry = round(sqrt(n.predictors))
)
# Out-of-bag error of the complete forest (last row of the error matrix);
# averaging the whole matrix would mix every tree count and class column
rf.model$err.rate[nrow(rf.model$err.rate), "OOB"]
plot(rf.model)

# Predicting on the test set
nba.test$pred.pos.rf <- predict(rf.model, nba.test, type = "response")
# Counts, then row-wise proportions, of the Position column vs. predictions
rf.confusion <- table(nba.test$Position, nba.test$pred.pos.rf)
rf.confusion
prop.table(rf.confusion, 1)

# Error curves again, this time with a legend in a narrow panel on the right
old.mar <- par("mar")
layout(matrix(c(1, 2), nrow = 1), widths = c(4, 1))
par(mar = c(5, 4, 4, 0)) # no margin on the right side
plot(rf.model)
par(mar = c(5, 0, 4, 2)) # no margin on the left side
# Empty plot that only serves as a canvas for the legend
plot(c(0, 1), type = "n", axes = FALSE, xlab = "", ylab = "")
# One legend entry per column of the error matrix, numbered the same way the
# curves are drawn
series <- seq_len(ncol(rf.model$err.rate))
legend("top", colnames(rf.model$err.rate), col = series, cex = 0.8, fill = series)
# Put the device back to a single panel with the previous margins
layout(1)
par(mar = old.mar)
```
# Summary Tables
```{r}
# Column means per position for column 2 plus columns 6 to 21
ave.table <- summaryBy(
  . ~ factor(Position),
  data = nba[, c(2, 6:21)],
  FUN = mean
)
# Column headings used when the table of means is rendered
mean.labels <- c(
  "ID", "Position", "PER", "TS", "ORB", "DRB", "TRB", "AST", "BLK", "TOV",
  "USG", "ORtg", "DRtg", "STL"
)
# Print the means as-is (no summary statistics), leaving out four columns
stargazer(
  ave.table[, -c(2, 13, 14, 15)],
  summary = FALSE,
  font.size = "small",
  column.sep.width = "1pt",
  covariate.labels = mean.labels
)
# Default stargazer output for the full data set
stargazer(nba)
```
# Finding the Best mtry (Number of Trees Fixed at 500)
```{r}
# Search for the mtry value with the lowest out-of-bag error.
# The tree count is passed as ntreeTry, tuneRF's own argument name; the
# previous `ntree = 500` only worked through partial argument matching.
bestmtry <- tuneRF(
  x = nba[, 6:21],
  y = factor(nba$Position),
  ntreeTry = 500,
  stepFactor = 1.5,
  improve = 1e-5
)
print(bestmtry)
# Error curves of the forest fitted in the RF section (not of the search)
plot(rf.model)
```
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd
import sys # used to dump the whole dataframe
# Download the StatCrunch data set, rewrite it as a CSV file, then load the
# file with pandas and echo it to stdout.
r = requests.get('https://www.statcrunch.com/grabdata.php?dataid=1096769&_=1470977876486')  # making GET request
r.raise_for_status()  # stop here instead of parsing an HTTP error page
data = r.text.splitlines()
headers = data.pop(0).split()  # extracting headers

# Text mode with newline='' is what the csv module expects on Python 3
with open('statcrunch_dump.csv', 'w', newline='', encoding='utf-8') as write_file:  # file to which data will be written
    writer = csv.writer(write_file)
    # Write the header row so pandas does not consume the first data row as
    # column names.
    # NOTE(review): assumes the header line is whitespace-separated with one
    # token per column -- confirm against the actual response
    writer.writerow(headers)
    for line in data:
        if not line.strip():
            continue  # skip blank lines
        # Each row starts with a double-quoted name followed by
        # whitespace-separated values
        parts = re.split("\" ", line)
        name = parts[0].strip('"')
        values = parts[1].split()
        # combining data: name first, then the remaining values
        writer.writerow([name] + values)  # writing to file

df = pd.read_csv('statcrunch_dump.csv')  # inserting in pandas
df.to_csv(sys.stdout)  # printing from pandas
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.