Skip to content

Instantly share code, notes, and snippets.

@yabyzq
yabyzq / qplot.R
Last active September 16, 2016 13:11
R Tutorial - qplot
library(ggplot2)
# Plot only a random subset of 500 diamonds rather than the full data set
dsample <- diamonds[sample(nrow(diamonds), size = 500), ]

# Basic form: qplot(x, y, data)
qplot(x = carat, y = price, data = dsample)

# Add more measures: map the color column to point colour and cut to shape
qplot(
  x = carat,
  y = price,
  data = dsample,
  color = color,
  shape = cut
)
@yabyzq
yabyzq / abs.txt
Created September 16, 2016 14:13
Tableau - using ABS data
1. Search "ABS Housing finance"
2. Download data. TABLE 1. HOUSING FINANCE COMMITMENTS (Owner Occupation), By Purpose: Australia (Number, $000)
3. Clean up unused rows.
4. Pivoting
5. Switch it to month, Adjust Axis range, Add Color, Adjust line size
@yabyzq
yabyzq / R - algorithms.R
Created September 18, 2016 12:43
simple R algorithms
# Prepare training and test data
# Every 5th row of iris is held out as the test set; the rest is training data.
# seq_len(nrow(iris)) counts rows directly and, unlike 1:n, yields an empty
# sequence when there are no rows.
test_index <- which(seq_len(nrow(iris)) %% 5 == 0)
# Select the training rows positively: iris[-test_index, ] would return zero
# rows if test_index were empty, because -integer(0) selects nothing.
iris_train <- iris[setdiff(seq_len(nrow(iris)), test_index), ]
iris_test <- iris[test_index, ]
library(car)
# Every 4th row of Prestige is held out as the test set; the rest is training
# data. seq_len() is used instead of 1:nrow() so an empty data frame gives an
# empty sequence rather than c(1, 0).
test_index <- which(seq_len(nrow(Prestige)) %% 4 == 0)
# Select the training rows positively: Prestige[-test_index, ] would return
# zero rows if test_index were empty.
prestige_train <- Prestige[setdiff(seq_len(nrow(Prestige)), test_index), ]
prestige_test <- Prestige[test_index, ]
#Linear Regression --log(income) + education
library(mice)
library(missForest)
library(VIM)
library(Amelia)
library(Hmisc)
library(mi)
#Generate missing value using prodNA from missForest
# NOTE(review): no prodNA() call is visible in this excerpt; the lines below
# only copy and summarise the data.
# Copy the built-in iris data set into a variable named `data`
data <- iris
# Summary of the original iris data set (not of the `data` copy)
summary(iris)
@yabyzq
yabyzq / R - Unbalanced dataset.R
Created October 11, 2016 12:46
R - Unbalanced dataset
#Sampling method
#1. Undersampling
#2. Oversampling
#3. Synthetic - SMOTE
#Ensembling method
#1. BalanceCascade, keep removing majority class examples until none is misclassified
#2. EasyEnsemble, ensemble different balanced model
#Cost-based method - Cost FN >> cost FP
@yabyzq
yabyzq / R - Basic Description.R
Created October 24, 2016 13:14
numeric and factor basic script
#1 Numeric Variable
# Summarise a vector with five descriptive statistics.
#   x       : vector passed to mean(), median(), sd(), min() and max()
#   na.omit : if TRUE, NA elements are removed from x first (default FALSE)
# Returns a named vector with the elements min, mean, avg, max and sd.
# NOTE(review): the closing brace of this function is not visible in this excerpt.
numeric_stats <- function (x, na.omit =FALSE){
# Drop missing values only when the caller asks for it
if(na.omit)
x <-x[!is.na(x)]
m <- mean(x)
# NOTE(review): `a` holds the median, yet it is returned under the name `avg`
a <- median(x)
s <- sd(x)
# Local variables named min and max are created here; the calls on the right
# still resolve to the functions min() and max()
min <- min(x)
max <- max(x)
return(c(min = min, mean = m, avg = a, max = max, sd = s))
@yabyzq
yabyzq / R - dplyr.R
Created October 24, 2016 13:15
Basic dplyr
library(dplyr)
library(nycflights13)
# Look at the data as a tibble.
# as_tibble() replaces tbl_df(), which dplyr deprecated in version 1.0.0.
as_tibble(flights)
# Filter: keep the rows where month is 1 and day is 1. `month == 1` already
# drops rows whose month is NA, so !is.na(month) does not change the result.
filter(flights, month == 1, day == 1, !is.na(month))
# Arrange: sort the rows by year in descending order
arrange(flights, desc(year))
@yabyzq
yabyzq / R - ggplot.R
Created October 24, 2016 13:15
basic ggplot
# Scatter plot of vocabulary (y) versus education (x), drawn with geom_point()
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_point()

# Same axes, with geom_jitter() in place of geom_point()
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_jitter()

# Jittered plot again, with alpha set to a very low 0.2
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_jitter(alpha = 0.2)
library(rpart)
# Generate binary target: "Y" for setosa rows, "N" for every other species,
# stored as a factor whose levels are ordered N, Y
iris$isSetosa <- factor(
  ifelse(iris$Species == "setosa", "Y", "N"),
  levels = c("N", "Y")
)
# Show the first rows, including the new column
head(iris)
#Create function
@yabyzq
yabyzq / caret.R
Created November 14, 2016 13:59
R - caret data handling
library(caret)
# Looking at missing values
# Print numbers with 2 significant digits (changes the session-wide option)
options(digits = 2)
# Build one row per column of iris:
#   missing - number of NA entries in the column
#   mean    - mean of a numeric column (NAs ignored), otherwise the most
#             frequent value; kept as text because both kinds share a column
# vapply() pins the result type of each column, and TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
stats <- data.frame(
  missing = vapply(iris, function(x) sum(is.na(x)), integer(1)),
  mean = vapply(
    iris,
    function(x) {
      if (is.numeric(x)) {
        as.character(mean(x, na.rm = TRUE))
      } else {
        # which.max() picks the first of any tied counts
        names(which.max(table(x)))
      }
    },
    character(1)
  )
)