Skip to content

Instantly share code, notes, and snippets.

@fredrick fredrick/cheatsheet.R
Last active Aug 29, 2015

Embed
What would you like to do?
R Statistics Cheatsheet
library(ggplot2)
library(plyr)
## Graphing
# Bar chart of how often each columnName value occurs in someDataFrame.
# The levels are reordered by negated group size, so the most frequent value
# sits on the left and frequencies fall off towards the right.
ggplot(
  someDataFrame,
  aes(x = reorder(columnName, columnName, FUN = function(x) -length(x)))
) +
  geom_bar() +
  labs(x = "X Label", y = "Y Label")
# Punchcard of how often each (columnX, columnY) pair occurs in someDataFrame.
# Step 1: count the rows per pair; the count lands in a column named `nrow`.
# .drop = FALSE keeps pairs that never occur in the data.
dfc <- ddply(
  .data = someDataFrame,
  .variables = c("columnX", "columnY"),
  .fun = "nrow",
  .drop = FALSE
)
# Step 2: one point per pair, with the count (as a factor) driving both the
# point size and the point color.
ggplot(
  data = dfc,
  aes(
    x = columnX,
    y = columnY,
    size = factor(nrow),
    color = factor(nrow)
  )
) +
  geom_point() +
  scale_size_discrete(range = c(1, 10)) +
  labs(size = "Frequency", color = "Frequency")
## Subsets
# Omit NA values and boxplot outliers from a data frame column.
# boxplot.stats(...)$out holds the values lying beyond the boxplot whiskers;
# those are filtered out first, then na.omit() drops the remaining NAs.
na.omit(
  someDataFrame$columnName[
    !someDataFrame$columnName %in%
      boxplot.stats(someDataFrame$columnName)$out
  ]
)
# Group dates into year-month factors.
# as.yearmon() and the as.Date() method for "yearmon" objects are provided by
# the zoo package, so it has to be attached before this snippet can run.
library(zoo)
# `date` is a placeholder for the vector of dates to group. Each value is
# truncated to its year and month, converted back to a Date, and the result is
# turned into a factor with one level per distinct year-month.
factor(as.Date(as.yearmon(as.Date(date))))
# Stack two data frames on top of each other, keeping only the columns whose
# names appear in both of them.
common.names <- intersect(
  colnames(database.one),
  colnames(database.two)
)
# Restrict each input to the shared columns, then row-bind the pieces.
combined.database <- do.call(
  rbind,
  lapply(
    list(database.one, database.two),
    function(db) db[, common.names]
  )
)
## Factor analysis
# Principal component analysis of someDataFrame with FactoMineR. The fitted
# object is kept in `result`; the original note describes the output of
# interest as the PCA variable factor map.
library(FactoMineR)
result <- PCA(X = someDataFrame)
## Data mining
# Association rule learning
library(arules)
library(arulesViz)
# Mine rules with at least two items whose right-hand side is fixed to
# "dependent_variable=1"; every other item may only appear on the left-hand
# side. Progress output from apriori() is switched off.
rules <- apriori(
  factorDataFrame,
  parameter = list(minlen = 2, support = 0.005, confidence = 0.8),
  appearance = list(rhs = c("dependent_variable=1"), default = "lhs"),
  control = list(verbose = FALSE)
)
# Rank the rules by lift.
rules.sorted <- sort(rules, by = "lift")
# Prune redundant rules. Entry [i, j] of the matrix is TRUE when rule i is a
# subset of rule j. A dense matrix is requested explicitly (sparse = FALSE)
# because the lower.tri() masking below needs an ordinary logical matrix that
# can hold NA.
subset.matrix <- is.subset(rules.sorted, rules.sorted, sparse = FALSE)
# Keep only the pairs with i < j, i.e. compare each rule solely against the
# rules ranked before it.
subset.matrix[lower.tri(subset.matrix, diag = TRUE)] <- NA
# A rule is redundant when at least one earlier-ranked rule is a subset of it.
redundant <- colSums(subset.matrix, na.rm = TRUE) >= 1
rules.pruned <- rules.sorted[!redundant]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.