@fwhigh
Last active June 16, 2017 21:01
The Streaming Distributed Bootstrap
library(data.table)
library(ggplot2)
thm <- theme_bw()
thm <- thm + theme(axis.line = element_line(colour = "black"),
                   panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(),
                   panel.border = element_blank(),
                   panel.background = element_blank())
# make fake uniformly distributed data
df <- data.table(index=as.factor(1:30),count=1,value=runif(30))
print(df)
# plot the number of times each data point appears
p <- ggplot(df, aes(index, count)) + geom_bar(stat="identity")
p <- p + scale_y_continuous(limits = c(0, 8))
p <- p + thm
print(p)
ggsave(p, file="example_raw_count_hist.png", bg = "transparent")
# bootstrap the mean of the data
qoi <- function(dat) {
  return(mean(dat))
}
se.estimator <- function(dat) {
  return(sd(dat)/sqrt(length(dat)))
}
m <- 1e4 # number of bootstraps
# Do the bootstrap
result <- sapply(1:m, function (i) {df[sample(1:nrow(df), replace = TRUE),qoi(value)]})
cat(paste0("compare bootstrap mean ",mean(result)," to sample quantity of interest ",qoi(df$value),"\n"))
cat(paste0("compare bootstrap standard deviation ",sd(result)," to the sample standard error of the quantity of interest ",se.estimator(df$value),"\n"))
# perform a single bootstrap resampling using multinomial random numbers
df$resample=c(rmultinom(1,nrow(df),rep(1/nrow(df),nrow(df))))
# plot the number of times each data point appears in this resample
p <- ggplot(df, aes(index, resample)) + geom_bar(stat="identity")
p <- p + scale_y_continuous(limits = c(0, 8))
p <- p + thm
print(p)
ggsave(p, file="example_resample_hist.png", bg = "transparent")
# simulate 10000 bootstrap iterations and look at the frequency of the first data point per iteration
cnts <- data.table(frequency=rmultinom(10000,nrow(df),rep(1/nrow(df),nrow(df)))[1,])
# count the number of times each frequency comes up
prob <- data.table(unique(cnts[,count:=.N,by=frequency]))
# compare to the exact binomial: each resample count is Binomial(nrow(df), 1/nrow(df)), scaled to 10000 iterations
prob[,binomial:=dbinom(frequency,nrow(df),1/nrow(df))*10000]
print(prob[order(frequency),])
# compute the Poisson(1) approximation and plot it against the simulated and binomial counts
prob[,poisson:=dpois(frequency,1)*10000]
prob.melt <- melt(prob,id.vars=c('frequency'))
p <- ggplot(prob.melt,aes(frequency,value,color=variable,shape=variable)) + geom_point(size=5)
p <- p + scale_shape_discrete(solid=F)
p <- p + thm
print(p)
ggsave(p, file="first_index_freq_pois.png", bg = "transparent")
write.table(subset(df,select=c('value','count')),file='small_data.txt',quote=F,sep=" ",row.names=F,col.names=F)
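# A minimal sketch of the streaming Poisson bootstrap that sdbootstrap_inner.py's
# WeightedMeanUpdater is assumed to implement: each incoming (value, count) pair
# gets one independent Poisson(count) weight per bootstrap replicate, and each
# replicate keeps only running weighted sums. The function name below is
# illustrative, not part of the Python package.
streaming.poisson.boot <- function(values, counts, n.boot = 1e4) {
  w.sum <- rep(0, n.boot)   # running total weight per replicate
  wx.sum <- rep(0, n.boot)  # running total of weight * value per replicate
  for (i in seq_along(values)) {
    w <- rpois(n.boot, counts[i])
    w.sum <- w.sum + w
    wx.sum <- wx.sum + w * values[i]
  }
  wx.sum / w.sum  # one bootstrap replicate of the mean per element
}
boot.means <- streaming.poisson.boot(df$value, df$count)
cat(paste0("streaming Poisson bootstrap mean ", mean(boot.means),
           " and standard deviation ", sd(boot.means), "\n"))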
#!/usr/bin/env bash
# Kill any leftover `t stream search` processes from a previous run
PIDS=$(ps ax | grep "t stream search" | grep -v grep | sed 's/^[ \t]*//g' | cut -d' ' -f1 | paste -s -d' ' -)
if [ "$PIDS" != "" ]; then
  kill $PIDS
fi
# Remove everything in the working directory except the PNG plots
\ls | grep -ve "png$" | xargs -I {} rm -v {}
#!/usr/bin/env bash
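# Stream the 30-point data set through the inner bootstrap worker, keeping
# 10,000 streaming bootstrap replicates of the weighted mean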
cat small_data.txt | \
sdbootstrap_inner.py \
--online_update WeightedMeanUpdater \
--n_boot 10000 > pyboot_out.txt
head pyboot_out.txt
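# Compare the mean and spread of the streaming bootstrap replicates to the
# in-memory bootstrap above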
/usr/bin/env Rscript -<<EOF
library(data.table)
df<-fread('pyboot_out.txt')
df[,mean(V3)]
df[,sd(V3)]
EOF
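# Generate a larger data set: 10,000 uniform random values, each with count 1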
awk -v n_data=10000 '
BEGIN {
  for (i=1;i<=n_data;i++) {
    print rand(),1
  }
}' > data.txt
head data.txt
/usr/bin/env Rscript -<<EOF
library(data.table)
df <- fread('data.txt', col.names=c("value","count"))
qoi <- function(dat) {
  return(mean(dat))
}
se.estimator <- function(dat) {
  return(sd(dat)/sqrt(length(dat)))
}
m <- 1e4
result <- sapply(1:m, function (i) {df[sample(1:nrow(df), replace = TRUE),qoi(value)]})
cat(paste0("compare bootstrap mean ",mean(result)," to sample quantity of interest ",qoi(df$value),"\n"))
cat(paste0("compare bootstrap standard deviation ",sd(result)," to the sample standard error of the quantity of interest ",se.estimator(df$value),"\n"))
EOF
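# Distributed run: GNU parallel splits data.txt into ~18 KB chunks, streams each
# chunk through the inner bootstrap worker (6 jobs at a time), and the outer
# script merges the per-chunk output into overall bootstrap replicates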
parallel --pipepart --jobs 6 --block 18k -a data.txt \
sdbootstrap_inner.py --online_update WeightedMeanUpdater --n_boot 10000 | \
sdbootstrap_outer.py > pyboot_out.txt
head pyboot_out.txt
/usr/bin/env Rscript -<<EOF
library(data.table)
df <- fread('pyboot_out.txt', col.names=c("master_index","index","value","count"))
df[,mean(value)]
df[,sd(value)]
EOF
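# A minimal sketch of the merge that sdbootstrap_outer.py is assumed to perform:
# for each bootstrap replicate, pool the per-chunk weighted means by their total
# Poisson weights, which is equivalent to having streamed all the data through a
# single worker. The combine.chunks name and the toy inputs are illustrative.
combine.chunks <- function(chunk.means, chunk.weights) {
  # one row per chunk, one column per bootstrap replicate
  colSums(chunk.means * chunk.weights) / colSums(chunk.weights)
}
# toy example: 2 chunks, 3 bootstrap replicates
chunk.means <- rbind(c(0.48, 0.51, 0.50), c(0.52, 0.49, 0.50))
chunk.weights <- rbind(c(14, 16, 15), c(15, 13, 16))
print(combine.chunks(chunk.means, chunk.weights))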
# twitter_example.R: combine the per-term streaming bootstrap output files and
# plot the running mean time between tweets with a +/- 1 standard deviation band
library(data.table)
library(ggplot2)
args <- commandArgs(trailingOnly = TRUE)
thm <- theme_bw()
thm <- thm + theme(axis.line = element_line(colour = "black"),
                   panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(),
                   panel.border = element_blank(),
                   panel.background = element_blank())
i = 0
for (term in args[1:3]) {
  df.raw = fread(paste0('pyboot_out_',term,'.txt'))
  df.raw$term = term
  if (i == 0) {
    df = df.raw
  } else {
    df = rbind(df,df.raw)
  }
  i = i + 1
}
df[,mean:=mean(V3),by=c('V1','term')]
df[,sd:=sd(V3),by=c('V1','term')]
df.u=unique(subset(df,select=c('V1','term','mean','sd')))
df.u[,lower:=mean-sd]
df.u[,upper:=mean+sd]
p <- ggplot(df.u,aes(as.POSIXct(V1, origin="1970-01-01"),mean,color=term)) + geom_line()
p <- p + geom_ribbon(data=df.u,aes(ymin=lower,ymax=upper),alpha=0.3)
p <- p + xlab("time") + ylab("mean seconds between tweets, cumulative")
p <- p + thm
ggsave(p, file=args[4])
#!/usr/bin/env bash
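# Use the top three currently trending topics as search terms, lowercased with
# '#' and whitespace stripped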
SEARCH1=$(t trends | awk 'NR==1' | tr -d '#' | tr '[:upper:]' '[:lower:]' | tr -d '[:blank:]')
SEARCH2=$(t trends | awk 'NR==2' | tr -d '#' | tr '[:upper:]' '[:lower:]' | tr -d '[:blank:]')
SEARCH3=$(t trends | awk 'NR==3' | tr -d '#' | tr '[:upper:]' '[:lower:]' | tr -d '[:blank:]')
function t_stream() {
  echo $1;
  # Stream tweets matching the search term, compute the seconds elapsed between
  # consecutive tweets from the timestamp field, and feed each (delta, 1) pair
  # to the streaming bootstrap worker with all buffering disabled
  t stream search -c -l $1 | \
  gstdbuf -i0 -o0 -e0 awk -F, '$2 ~ /^2017/ {b=$2;gsub(/[:-]/," ",b);if (a) {print mktime(b)-mktime(a),1}; a=b}' | \
  python -u $(which sdbootstrap_inner.py) \
    --online_update WeightedMeanUpdater \
    --n_boot 200 \
    --update_every 10 \
    --no_flush_on_update > "pyboot_out_${1}.txt";
}
t_stream $SEARCH1 &
t_stream $SEARCH2 &
t_stream $SEARCH3 &
# find process IDs to kill using
#ps ax | grep "t stream search"
Rscript twitter_example.R $SEARCH1 $SEARCH2 $SEARCH3 tweet_rate.png
open tweet_rate.png