# benfb/Rnotes.r

## Rnotes.r
# Basic data setup: load the raw datasets used throughout these notes.
library(ggplot2)
d <- read.csv(
  "/Users/ben/Documents/hampshire/S2015/collapse_phenomena/resistance.csv",
  header = TRUE,
  sep = ","
)
# Presumably this .rda is what defines `da34933.0001`, used just below.
load("/Users/ben/Downloads/drugs/DS0001/34933-0001-Data.rda")
# NOTE: this second read overwrites the `d` loaded from resistance.csv.
d <- read.table(
  "/Users/ben/Downloads/adolescenthealth/DS0001/21600-0001-Data.tsv",
  header = TRUE,
  sep = "\t"
)
data <- da34933.0001 # shorter handle for the loaded data frame

# Convert the PSILCY factor to real numeric codes, keeping the text labels.
library(prettyR)
# Sorted level strings with the leading "(NN) " prefix stripped off.
lbls <- sub("^\\([0-9]+\\) +(.+$)", "\\1", sort(levels(data$PSILCY)))
# Keep only the number inside the parentheses (leading zeros dropped).
data$PSILCY <- as.numeric(
  sub("^\\(0*([0-9]+)\\).+$", "\\1", data$PSILCY)
)
data$PSILCY <- add.value.labels(data$PSILCY, lbls)

# Bar chart: number of responses in each KETAMINE category.
p <- qplot(
  KETAMINE,
  data = data,
  geom = "bar",
  xlab = "Ketamine",
  ylab = "Responses"
)
# Scatterplot of two drugs (replaces the bar chart held in `p`).
p <- qplot(
  KETAMINE, PSILCY,
  data = data,
  xlab = "Ketamine",
  ylab = "Psilocybin"
)

# Hexagonal binning of a scatterplot.
ggplot(Cigarette, aes(x = income, y = packpc)) +
  stat_binhex()

# Correlations; "complete.obs" skips pairs with a missing value.
with(data, cor(KETAMINE, PSILCY, use = "complete.obs"))
with(m, cor(grc, mean))

# Write the plot held in `p` to disk.
ggsave(plot = p, filename = "file.png")

# Frequency of each PSILCY value.
library(plyr)
count(data, vars = "PSILCY")

# p-value test: the coefficient table of the lm summary lists the p-values.
summary(lm(RowMean ~ ID, data = check5))
summary(lm(mean ~ grc, data = m))

# Average of every 5 rows: fill a 5-row matrix column by column, so each
# column holds one run of 5 consecutive values, then average the columns.
colMeans(matrix(d$steps, nrow = 5))

# standard deviation graph
# Each run of 5 consecutive rows of `d` becomes one column of a 5-row
# matrix, so the column statistics below are per-run summaries.
library(matrixStats)
m <- data.frame(
  mean = colMeans(matrix(d$steps, nrow = 5)),
  sd = colSds(matrix(d$steps, nrow = 5)),
  grc = colMeans(matrix(d$grc, nrow = 5))
)
# Map aesthetics by bare column name: writing `m$...` inside aes() bypasses
# the data attached to the plot and is discouraged by ggplot2.
ggplot(m, aes(x = grc, y = mean, colour = grc)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = 0.2) +
  geom_line() +
  geom_point()

# Simpler route to the per-group mean and SD: let plyr summarise `steps`
# within each distinct `grc` value.
library(plyr)
ds <- ddply(
  d, .(grc), summarise,
  mean = mean(steps),
  sd = sd(steps)
)

# Raw observations, overlaid with group means and +/- 1 SD bars in red.
ggplot() +
  geom_point(data = d, aes(x = grc, y = steps)) +
  geom_point(data = ds, aes(x = grc, y = mean), colour = "red", size = 3) +
  geom_errorbar(
    data = ds,
    aes(x = grc, y = mean, ymin = mean - sd, ymax = mean + sd),
    colour = "red",
    width = 0.4
  )

# Group means joined by a line, with +/- 1 SD error bars.
ggplot(ds, aes(x = grc, y = mean)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = 0.2) +
  geom_line() +
  geom_point()

# Same idea coloured by the mean, with axis labels and a "p < .01" note;
# kept in `p` for later reuse.
p <- ggplot(
  ds,
  aes(x = grc, y = mean, ymin = mean - sd, ymax = mean + sd, colour = mean)
) +
  geom_errorbar(width = 1.5) +
  geom_line() +
  geom_point() +
  labs(x = "gain-reduction-chance", y = "mean steps") +
  annotate(
    "text",
    label = "p < .01", x = 60, y = 250,
    size = 8, colour = "red"
  )

# line of best fit
# The `ds` plots put grc on x and mean on y, so regress mean on grc; the
# intercept and slope are then expressed in the plot's own coordinates.
# The coefficients get their own name so that base::c() is not masked.
fit_coef <- coef(lm(mean ~ grc, data = ds))
# A layer only shows up once it is added to a plot; `p` is the ds plot.
p + geom_abline(
  intercept = fit_coef[[1]],
  slope = fit_coef[[2]],
  linetype = 2
)

# Scatterplot with both axes limited to 0-100.
p <- qplot(H1DA9, H1DA10, data = d, xlab = "Videos", ylab = "Games") +
  xlim(0, 100) +
  ylim(0, 100)

# Scatterplot with an extra layer whose point size is the number of rows
# at each position.
p <- qplot(H1DA9, H1DA10, data = d, xlab = "Videos", ylab = "Games") +
  stat_sum(aes(size = ..n..))

# Ecdat: plots of the Cigarette dataset.
library(ggplot2)
library(Ecdat)
data(Cigarette)
p <- qplot(
  income, packpc,
  data = Cigarette,
  xlab = "State Personal Income",
  ylab = "Packs Per Capita"
)
# Replace it with a version coloured by state.
p <- ggplot(Cigarette, aes(x = income, y = packpc, colour = state)) +
  geom_point() +
  labs(
    x = "income",
    y = "mean packs per capita",
    title = "Cigarette Use Related To Income"
  )
# Limit the legend to 12 rows so that it wraps into several columns.
p + guides(col = guide_legend(nrow = 12))

# make pretty interactive graphs with plot.ly
# The API key is read from the environment rather than written into these
# notes. Set it before running, e.g. with a `plotly_api_key=...` line in
# ~/.Renviron or Sys.setenv("plotly_api_key" = "...").
library(plotly)
plotly_key <- Sys.getenv("plotly_api_key")
if (!nzchar(plotly_key)) {
  stop(
    "Set the 'plotly_api_key' environment variable before using plotly.",
    call. = FALSE
  )
}
py <- plotly(user = "bfb", key = plotly_key)
# NOTE(review): ggplotly() is called without a plot argument; presumably it
# picks up the most recent ggplot -- confirm against the installed plotly.
response <- py$ggplotly()

# Austin crime mapping: black-and-white base map with one point per row
# of `d`, coloured by crime type.
library(ggmap)
AustinMap <- qmap("austin", zoom = 12, color = "bw", legend = "topleft")
AustinMap +
  geom_point(
    data = d,
    aes(x = LONGITUDE, y = LATITUDE, colour = Crime.Type)
  )

# Keep only the rows whose Crime.Type contains the string "ASSAULT".
d[grepl("ASSAULT", d$Crime.Type), , drop = FALSE]

# Add a ready-made theme to a graph.
library(ggthemes)
p + theme_economist()

# Half-width of the 95% confidence interval for a mean (t quantile times
# the standard error); fill in SAMPLESIZE and STDEV for the sample.
error <- qt(0.975, df = SAMPLESIZE - 1) * STDEV / sqrt(SAMPLESIZE)
	# NOTE(review): from here on the notes repeat the section above, indented
	# with a tab -- presumably an accidental paste; consider dropping one copy.
	# Basic data setup: load the raw datasets used throughout these notes.
	library(ggplot2)
	d <- read.csv(
		"/Users/ben/Documents/hampshire/S2015/collapse_phenomena/resistance.csv",
		header = TRUE,
		sep = ","
	)
	# Presumably this .rda is what defines `da34933.0001`, used just below.
	load("/Users/ben/Downloads/drugs/DS0001/34933-0001-Data.rda")
	# NOTE: this second read overwrites the `d` loaded from resistance.csv.
	d <- read.table(
		"/Users/ben/Downloads/adolescenthealth/DS0001/21600-0001-Data.tsv",
		header = TRUE,
		sep = "\t"
	)
	data <- da34933.0001 # shorter handle for the loaded data frame

	# Convert the PSILCY factor to real numeric codes, keeping the text labels.
	library(prettyR)
	# Sorted level strings with the leading "(NN) " prefix stripped off.
	lbls <- sub("^\\([0-9]+\\) +(.+$)", "\\1", sort(levels(data$PSILCY)))
	# Keep only the number inside the parentheses (leading zeros dropped).
	data$PSILCY <- as.numeric(
		sub("^\\(0*([0-9]+)\\).+$", "\\1", data$PSILCY)
	)
	data$PSILCY <- add.value.labels(data$PSILCY, lbls)

	# Bar chart: number of responses in each KETAMINE category.
	p <- qplot(
		KETAMINE,
		data = data,
		geom = "bar",
		xlab = "Ketamine",
		ylab = "Responses"
	)
	# Scatterplot of two drugs (replaces the bar chart held in `p`).
	p <- qplot(
		KETAMINE, PSILCY,
		data = data,
		xlab = "Ketamine",
		ylab = "Psilocybin"
	)

	# Hexagonal binning of a scatterplot.
	ggplot(Cigarette, aes(x = income, y = packpc)) +
		stat_binhex()

	# Correlations; "complete.obs" skips pairs with a missing value.
	with(data, cor(KETAMINE, PSILCY, use = "complete.obs"))
	with(m, cor(grc, mean))

	# Write the plot held in `p` to disk.
	ggsave(plot = p, filename = "file.png")

	# Frequency of each PSILCY value.
	library(plyr)
	count(data, vars = "PSILCY")

	# p-value test: the coefficient table of the lm summary lists the p-values.
	summary(lm(RowMean ~ ID, data = check5))
	summary(lm(mean ~ grc, data = m))

	# Average of every 5 rows: fill a 5-row matrix column by column, so each
	# column holds one run of 5 consecutive values, then average the columns.
	colMeans(matrix(d$steps, nrow = 5))

	# standard deviation graph
	# Each run of 5 consecutive rows of `d` becomes one column of a 5-row
	# matrix, so the column statistics below are per-run summaries.
	library(matrixStats)
	m <- data.frame(
		mean = colMeans(matrix(d$steps, nrow = 5)),
		sd = colSds(matrix(d$steps, nrow = 5)),
		grc = colMeans(matrix(d$grc, nrow = 5))
	)
	# Map aesthetics by bare column name: writing `m$...` inside aes() bypasses
	# the data attached to the plot and is discouraged by ggplot2.
	ggplot(m, aes(x = grc, y = mean, colour = grc)) +
		geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = 0.2) +
		geom_line() +
		geom_point()

	# Simpler route to the per-group mean and SD: let plyr summarise `steps`
	# within each distinct `grc` value.
	library(plyr)
	ds <- ddply(
		d, .(grc), summarise,
		mean = mean(steps),
		sd = sd(steps)
	)

	# Raw observations, overlaid with group means and +/- 1 SD bars in red.
	ggplot() +
		geom_point(data = d, aes(x = grc, y = steps)) +
		geom_point(data = ds, aes(x = grc, y = mean), colour = "red", size = 3) +
		geom_errorbar(
			data = ds,
			aes(x = grc, y = mean, ymin = mean - sd, ymax = mean + sd),
			colour = "red",
			width = 0.4
		)

	# Group means joined by a line, with +/- 1 SD error bars.
	ggplot(ds, aes(x = grc, y = mean)) +
		geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = 0.2) +
		geom_line() +
		geom_point()

	# Same idea coloured by the mean, with axis labels and a "p < .01" note;
	# kept in `p` for later reuse.
	p <- ggplot(
		ds,
		aes(x = grc, y = mean, ymin = mean - sd, ymax = mean + sd, colour = mean)
	) +
		geom_errorbar(width = 1.5) +
		geom_line() +
		geom_point() +
		labs(x = "gain-reduction-chance", y = "mean steps") +
		annotate(
			"text",
			label = "p < .01", x = 60, y = 250,
			size = 8, colour = "red"
		)

	# line of best fit
	# The `ds` plots put grc on x and mean on y, so regress mean on grc; the
	# intercept and slope are then expressed in the plot's own coordinates.
	# The coefficients get their own name so that base::c() is not masked.
	fit_coef <- coef(lm(mean ~ grc, data = ds))
	# A layer only shows up once it is added to a plot; `p` is the ds plot.
	p + geom_abline(
		intercept = fit_coef[[1]],
		slope = fit_coef[[2]],
		linetype = 2
	)

	# Scatterplot with both axes limited to 0-100.
	p <- qplot(H1DA9, H1DA10, data = d, xlab = "Videos", ylab = "Games") +
		xlim(0, 100) +
		ylim(0, 100)

	# Scatterplot with an extra layer whose point size is the number of rows
	# at each position.
	p <- qplot(H1DA9, H1DA10, data = d, xlab = "Videos", ylab = "Games") +
		stat_sum(aes(size = ..n..))

	# Ecdat: plots of the Cigarette dataset.
	library(ggplot2)
	library(Ecdat)
	data(Cigarette)
	p <- qplot(
		income, packpc,
		data = Cigarette,
		xlab = "State Personal Income",
		ylab = "Packs Per Capita"
	)
	# Replace it with a version coloured by state.
	p <- ggplot(Cigarette, aes(x = income, y = packpc, colour = state)) +
		geom_point() +
		labs(
			x = "income",
			y = "mean packs per capita",
			title = "Cigarette Use Related To Income"
		)
	# Limit the legend to 12 rows so that it wraps into several columns.
	p + guides(col = guide_legend(nrow = 12))

	# make pretty interactive graphs with plot.ly
	# The API key is read from the environment rather than written into these
	# notes. Set it before running, e.g. with a `plotly_api_key=...` line in
	# ~/.Renviron or Sys.setenv("plotly_api_key" = "...").
	library(plotly)
	plotly_key <- Sys.getenv("plotly_api_key")
	if (!nzchar(plotly_key)) {
		stop(
			"Set the 'plotly_api_key' environment variable before using plotly.",
			call. = FALSE
		)
	}
	py <- plotly(user = "bfb", key = plotly_key)
	# NOTE(review): ggplotly() is called without a plot argument; presumably it
	# picks up the most recent ggplot -- confirm against the installed plotly.
	response <- py$ggplotly()

	# Austin crime mapping: black-and-white base map with one point per row
	# of `d`, coloured by crime type.
	library(ggmap)
	AustinMap <- qmap("austin", zoom = 12, color = "bw", legend = "topleft")
	AustinMap +
		geom_point(
			data = d,
			aes(x = LONGITUDE, y = LATITUDE, colour = Crime.Type)
		)

	# Keep only the rows whose Crime.Type contains the string "ASSAULT".
	d[grepl("ASSAULT", d$Crime.Type), , drop = FALSE]

	# Add a ready-made theme to a graph.
	library(ggthemes)
	p + theme_economist()

	# Half-width of the 95% confidence interval for a mean (t quantile times
	# the standard error); fill in SAMPLESIZE and STDEV for the sample.
	error <- qt(0.975, df = SAMPLESIZE - 1) * STDEV / sqrt(SAMPLESIZE)