@fwhigh
Last active June 16, 2017 21:01
The Streaming Distributed Bootstrap
library(data.table)
library(ggplot2)
thm <- theme_bw()
thm <- thm + theme(axis.line = element_line(colour = "black"),
                   panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(),
                   panel.border = element_blank(),
                   panel.background = element_blank())
# make fake uniformly distributed data
df <- data.table(index=as.factor(1:30),count=1,value=runif(30))
print(df)
# plot the number of times each data point appears
p <- ggplot(df, aes(index, count)) + geom_bar(stat="identity")
p <- p + scale_y_continuous(limits = c(0, 8))
p <- p + thm
print(p)
ggsave(p, file="example_raw_count_hist.png", bg = "transparent")
# bootstrap the mean of the data
qoi <- function(dat) {
  return(mean(dat))
}
se.estimator <- function(dat) {
  return(sd(dat)/sqrt(length(dat)))
}
m <- 1e4 # number of bootstraps
# Do the bootstrap
result <- sapply(1:m, function (i) {df[sample(1:nrow(df), replace = TRUE),qoi(value)]})
cat(paste0("compare bootstrap mean ",mean(result)," to sample quantity of interest ",qoi(df$value),"\n"))
cat(paste0("compare bootstrap standard deviation ",sd(result)," to the sample standard error of the quantity of interest ",se.estimator(df$value),"\n"))
# perform a single bootstrap resampling using multinomial random numbers
df$resample=c(rmultinom(1,nrow(df),rep(1/nrow(df),nrow(df))))
# plot the number of times each data point appears in this resample
p <- ggplot(df, aes(index, resample)) + geom_bar(stat="identity")
p <- p + scale_y_continuous(limits = c(0, 8))
p <- p + thm
print(p)
ggsave(p, file="example_resample_hist.png", bg = "transparent")
# simulate 10000 bootstrap iterations and look at the frequency of the first data point per iteration
cnts <- data.table(frequency=rmultinom(10000,nrow(df),rep(1/nrow(df),nrow(df)))[1,])
# count the number of times each frequency comes up
prob <- data.table(unique(cnts[,count:=.N,by=frequency]))
# compare to the exact binomial: each resample count is Binomial(nrow(df), 1/nrow(df)), scaled to 10000 iterations
prob[,binomial:=dbinom(frequency,nrow(df),1/nrow(df))*10000]
print(prob[order(frequency),])
# compute the Poisson(1) approximation and plot it against the simulated and binomial counts
prob[,poisson:=dpois(frequency,1)*10000]
prob.melt <- melt(prob,id.vars=c('frequency'))
p <- ggplot(prob.melt,aes(frequency,value,color=variable,shape=variable)) + geom_point(size=5)
p <- p + scale_shape_discrete(solid=F)
p <- p + thm
print(p)
ggsave(p, file="first_index_freq_pois.png", bg = "transparent")
write.table(subset(df,select=c('value','count')),file='small_data.txt',quote=F,sep=" ",row.names=F,col.names=F)
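# A minimal sketch of the streaming Poisson bootstrap that sdbootstrap_inner.py's
# WeightedMeanUpdater is assumed to implement: each incoming (value, count) pair
# gets one independent Poisson(count) weight per bootstrap replicate, and each
# replicate keeps only running weighted sums. The function name below is
# illustrative, not part of the Python package.
streaming.poisson.boot <- function(values, counts, n.boot = 1e4) {
  w.sum <- rep(0, n.boot)   # running total weight per replicate
  wx.sum <- rep(0, n.boot)  # running total of weight * value per replicate
  for (i in seq_along(values)) {
    w <- rpois(n.boot, counts[i])
    w.sum <- w.sum + w
    wx.sum <- wx.sum + w * values[i]
  }
  wx.sum / w.sum  # one bootstrap replicate of the mean per element
}
boot.means <- streaming.poisson.boot(df$value, df$count)
cat(paste0("streaming Poisson bootstrap mean ", mean(boot.means),
           " and standard deviation ", sd(boot.means), "\n"))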
#!/usr/bin/env bash
# Kill any leftover `t stream search` processes from a previous run
PIDS=$(ps ax | grep "t stream search" | grep -v grep | sed 's/^[ \t]*//g' | cut -d' ' -f1 | paste -s -d' ' -)
if [ "$PIDS" != "" ]; then
  kill $PIDS
fi
# Remove everything in the working directory except the PNG plots
\ls | grep -ve "png$" | xargs -I {} rm -v {}
#!/usr/bin/env bash
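# Stream the 30-point data set through the inner bootstrap worker, keeping
# 10,000 streaming bootstrap replicates of the weighted mean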
cat small_data.txt | \
sdbootstrap_inner.py \
--online_update WeightedMeanUpdater \
--n_boot 10000 > pyboot_out.txt
head pyboot_out.txt
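# Compare the mean and spread of the streaming bootstrap replicates to the
# in-memory bootstrap above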
/usr/bin/env Rscript -<<EOF
library(data.table)
df<-fread('pyboot_out.txt')
df[,mean(V3)]
df[,sd(V3)]
EOF
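# Generate a larger data set: 10,000 uniform random values, each with count 1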
awk -v n_data=10000 '
BEGIN {
  for (i=1;i<=n_data;i++) {
    print rand(),1
  }
}' > data.txt
head data.txt
/usr/bin/env Rscript -<<EOF
library(data.table)
df <- fread('data.txt', col.names=c("value","count"))
qoi <- function(dat) {
  return(mean(dat))
}
se.estimator <- function(dat) {
  return(sd(dat)/sqrt(length(dat)))
}
m <- 1e4
result <- sapply(1:m, function (i) {df[sample(1:nrow(df), replace = TRUE),qoi(value)]})
cat(paste0("compare bootstrap mean ",mean(result)," to sample quantity of interest ",qoi(df$value),"\n"))
cat(paste0("compare bootstrap standard deviation ",sd(result)," to the sample standard error of the quantity of interest ",se.estimator(df$value),"\n"))
EOF
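# Distributed run: GNU parallel splits data.txt into ~18 KB chunks, streams each
# chunk through the inner bootstrap worker (6 jobs at a time), and the outer
# script merges the per-chunk output into overall bootstrap replicates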
parallel --pipepart --jobs 6 --block 18k -a data.txt \
sdbootstrap_inner.py --online_update WeightedMeanUpdater --n_boot 10000 | \
sdbootstrap_outer.py > pyboot_out.txt
head pyboot_out.txt
/usr/bin/env Rscript -<<EOF
library(data.table)
df <- fread('pyboot_out.txt', col.names=c("master_index","index","value","count"))
df[,mean(value)]
df[,sd(value)]
EOF
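# A minimal sketch of the merge that sdbootstrap_outer.py is assumed to perform:
# for each bootstrap replicate, pool the per-chunk weighted means by their total
# Poisson weights, which is equivalent to having streamed all the data through a
# single worker. The combine.chunks name and the toy inputs are illustrative.
combine.chunks <- function(chunk.means, chunk.weights) {
  # one row per chunk, one column per bootstrap replicate
  colSums(chunk.means * chunk.weights) / colSums(chunk.weights)
}
# toy example: 2 chunks, 3 bootstrap replicates
chunk.means <- rbind(c(0.48, 0.51, 0.50), c(0.52, 0.49, 0.50))
chunk.weights <- rbind(c(14, 16, 15), c(15, 13, 16))
print(combine.chunks(chunk.means, chunk.weights))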
# twitter_example.R: combine the per-term streaming bootstrap output files and
# plot the running mean time between tweets with a +/- 1 standard deviation band
library(data.table)
library(ggplot2)
args <- commandArgs(trailingOnly = TRUE)
thm <- theme_bw()
thm <- thm + theme(axis.line = element_line(colour = "black"),
                   panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(),
                   panel.border = element_blank(),
                   panel.background = element_blank())
i = 0
for (term in args[1:3]) {
  df.raw = fread(paste0('pyboot_out_',term,'.txt'))
  df.raw$term = term
  if (i == 0) {
    df = df.raw
  } else {
    df = rbind(df,df.raw)
  }
  i = i + 1
}
df[,mean:=mean(V3),by=c('V1','term')]
df[,sd:=sd(V3),by=c('V1','term')]
df.u=unique(subset(df,select=c('V1','term','mean','sd')))
df.u[,lower:=mean-sd]
df.u[,upper:=mean+sd]
p <- ggplot(df.u,aes(as.POSIXct(V1, origin="1970-01-01"),mean,color=term)) + geom_line()
p <- p + geom_ribbon(data=df.u,aes(ymin=lower,ymax=upper),alpha=0.3)
p <- p + xlab("time") + ylab("mean seconds between tweets, cumulative")
p <- p + thm
ggsave(p, file=args[4])
#!/usr/bin/env bash
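# Use the top three currently trending topics as search terms, lowercased with
# '#' and whitespace stripped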
SEARCH1=$(t trends | awk 'NR==1' | tr -d '#' | tr '[:upper:]' '[:lower:]' | tr -d '[:blank:]')
SEARCH2=$(t trends | awk 'NR==2' | tr -d '#' | tr '[:upper:]' '[:lower:]' | tr -d '[:blank:]')
SEARCH3=$(t trends | awk 'NR==3' | tr -d '#' | tr '[:upper:]' '[:lower:]' | tr -d '[:blank:]')
function t_stream() {
  echo $1;
  # Stream tweets matching the search term, compute the seconds elapsed between
  # consecutive tweets from the timestamp field, and feed each (delta, 1) pair
  # to the streaming bootstrap worker with all buffering disabled
  t stream search -c -l $1 | \
  gstdbuf -i0 -o0 -e0 awk -F, '$2 ~ /^2017/ {b=$2;gsub(/[:-]/," ",b);if (a) {print mktime(b)-mktime(a),1}; a=b}' | \
  python -u $(which sdbootstrap_inner.py) \
    --online_update WeightedMeanUpdater \
    --n_boot 200 \
    --update_every 10 \
    --no_flush_on_update > "pyboot_out_${1}.txt";
}
t_stream $SEARCH1 &
t_stream $SEARCH2 &
t_stream $SEARCH3 &
# find process IDs to kill using
#ps ax | grep "t stream search"
Rscript twitter_example.R $SEARCH1 $SEARCH2 $SEARCH3 tweet_rate.png
open tweet_rate.png