grosscol/stratified.r

## stratified.r
require('dplyr')
require('ggplot2')

# Build a synthetic query log with `num_samples` rows: a random day of the
# month (1-30) and a Poisson-distributed query time for each query.
# The rate vector c(25, 35, 100) is shorter than the number of draws, so
# rpois() recycles it: two of every three queries use a "fast" rate (25 or 35)
# and one uses the "slow" rate (100), i.e. fast queries are the majority.
num_samples <- 10000
days <- sample(seq_len(30), size = num_samples, replace = TRUE)
qtimes <- rpois(n = num_samples, lambda = c(25, 35, 100))
qlog <- data.frame(day = days, qtime = qtimes)

# Take a quick look at the raw log: one dot per (day, qtime) pair, with the dot
# area scaled by how many queries share that pair.
# print() is explicit because a ggplot object is only drawn when it is printed,
# and top-level auto-printing does not happen when this file is run through
# source().  This also matches the explicit print(p) used further down.
print(
  ggplot(qlog, aes(x = day, y = qtime)) +
    geom_count() +
    scale_size_area()
)

# Per-day descriptive statistics of query time: quartiles, median, mean, upper
# tail quantiles (90th / 99th), maximum, and the number of queries that day.
# Every dplyr verb is namespace-qualified so that another attached package that
# also exports summarise() (e.g. plyr) cannot mask it and ignore the grouping.
# No arrange() is needed first: summarise() returns one row per group, ordered
# by the grouping key.  names = FALSE stops quantile() from attaching
# "25%"-style element names to the values stored in the columns.
summary_of_qtime_day <- qlog %>%
  dplyr::group_by(day) %>%
  dplyr::summarise(
    first_Q = quantile(qtime, 0.25, na.rm = TRUE, names = FALSE),
    med = median(qtime, na.rm = TRUE),
    mean = mean(qtime, na.rm = TRUE),
    third_Q = quantile(qtime, 0.75, na.rm = TRUE, names = FALSE),
    ninty_Q = quantile(qtime, 0.9, na.rm = TRUE, names = FALSE),
    nintynine_Q = quantile(qtime, 0.99, na.rm = TRUE, names = FALSE),
    max = max(qtime, na.rm = TRUE),
    q_per_day = dplyr::n()
  )

# Print the per-day means followed by the per-day medians so the two vectors
# can be compared by eye for how closely they agree.
print(summary_of_qtime_day[["mean"]])
print(summary_of_qtime_day[["med"]])

# Plot the daily mean (red) and daily median (blue) query time against the day.
# Point size is a fixed geom parameter rather than aes(size = 3): inside aes()
# the constant is treated as data and run through the size scale, so the points
# are not drawn at size 3 and the resulting legend had to be hidden with
# guides(size = FALSE), a form ggplot2 has deprecated in favour of "none".
# The colour values are named so each series keeps its colour regardless of
# the order in which the scale sorts the labels.
p <- ggplot(summary_of_qtime_day, aes(x = day)) +
  geom_point(aes(y = mean, color = "mean"), size = 3) +
  geom_point(aes(y = med, color = "med"), size = 3) +
  scale_color_manual(values = c(mean = "red", med = "blue")) +
  labs(x = "day", y = "value")
print(p)

## Select the day whose statistics are closest to the across-day average.
# Base R is used here rather than a dplyr pipeline.
# NOTE(review): the statistics are compared on their raw scales, so the
# large-valued columns (e.g. q_per_day, max) weigh most in the distance --
# confirm this is intended.

# Every column except the day key is a statistic to compare.  `res` is also
# excluded so that re-running this section in the same session does not feed
# the previous residuals back in as if they were a statistic.
column_mask <- !(colnames(summary_of_qtime_day) %in% c("day", "res"))

# Average of each descriptive statistic over all days.  na.rm = TRUE keeps a
# single incomplete day from turning every average into NA.
ave_stats <- colMeans(summary_of_qtime_day[column_mask], na.rm = TRUE)

# Per-day sum of absolute deviations from those averages (one value per row),
# computed on the whole matrix at once instead of row-by-row with apply().
residuals <- rowSums(
  abs(sweep(as.matrix(summary_of_qtime_day[column_mask]), 2, ave_stats))
)

# Attach the residuals as column `res`, replacing any `res` left by an earlier
# run instead of adding a duplicate column.
summary_of_qtime_day <- cbind(
  summary_of_qtime_day[colnames(summary_of_qtime_day) != "res"],
  res = residuals
)

# Choose the day(s) with the lowest sum of residuals; ties are all kept.
# Comparing against min() of the same vector is an exact match, and
# na.rm = TRUE ignores days whose residual could not be computed.
min_res_idx <- which(
  summary_of_qtime_day$res == min(summary_of_qtime_day$res, na.rm = TRUE)
)

matching_ave_day <- summary_of_qtime_day[min_res_idx, ]
print("Closest day log to average")
print(matching_ave_day)


#summary(by_day)
#day         first_Q           med              mean            third_Q
#Min.   :  1   Min.   : 0.00   Min.   :  0.00   Min.   :111.1   Min.   : 34.0
#1st Qu.: 55   1st Qu.:30.00   1st Qu.: 90.00   1st Qu.:221.6   1st Qu.:269.0
#Median :109   Median :31.00   Median : 99.00   Median :241.8   Median :286.0
#Mean   :109   Mean   :30.44   Mean   : 97.96   Mean   :253.0   Mean   :289.6
#3rd Qu.:163   3rd Qu.:31.00   3rd Qu.:105.00   3rd Qu.:263.1   3rd Qu.:303.0
#Max.   :217   Max.   :34.00   Max.   :140.00   Max.   :920.4   Max.   :910.2
#NA's   :1     NA's   :1       NA's   :1        NA's   :1       NA's   :1
#ninty_Q        nintynine_Q         max          q_per_day
#Min.   : 194.1   Min.   : 1166   Min.   : 6100   Min.   :    8
#1st Qu.: 506.0   1st Qu.: 1518   1st Qu.:14887   1st Qu.:12908
#Median : 552.7   Median : 1712   Median :21474   Median :17272
#Mean   : 584.9   Mean   : 1934   Mean   :23514   Mean   :18161
#3rd Qu.: 616.0   3rd Qu.: 1959   3rd Qu.:31507   3rd Qu.:20602
#Max.   :2438.0   Max.   :13521   Max.   :44937   Max.   :45972
#NA's   :1        NA's   :1       NA's   :1
	require('dplyr')
	require('ggplot2')

	# Build a synthetic query log with `num_samples` rows: a random day of the
	# month (1-30) and a Poisson-distributed query time for each query.
	# The rate vector c(25, 35, 100) is shorter than the number of draws, so
	# rpois() recycles it: two of every three queries use a "fast" rate (25 or
	# 35) and one uses the "slow" rate (100), i.e. fast queries are the majority.
	num_samples <- 10000
	days <- sample(seq_len(30), size = num_samples, replace = TRUE)
	qtimes <- rpois(n = num_samples, lambda = c(25, 35, 100))
	qlog <- data.frame(day = days, qtime = qtimes)

	# Take a quick look at the raw log: one dot per (day, qtime) pair, with the
	# dot area scaled by how many queries share that pair.
	# print() is explicit because a ggplot object is only drawn when it is
	# printed, and top-level auto-printing does not happen when this file is run
	# through source().  This also matches the explicit print(p) used further down.
	print(
	  ggplot(qlog, aes(x = day, y = qtime)) +
	    geom_count() +
	    scale_size_area()
	)

	# Per-day descriptive statistics of query time: quartiles, median, mean,
	# upper tail quantiles (90th / 99th), maximum, and the number of queries that
	# day.  Every dplyr verb is namespace-qualified so that another attached
	# package that also exports summarise() (e.g. plyr) cannot mask it and ignore
	# the grouping.  No arrange() is needed first: summarise() returns one row per
	# group, ordered by the grouping key.  names = FALSE stops quantile() from
	# attaching "25%"-style element names to the values stored in the columns.
	summary_of_qtime_day <- qlog %>%
	  dplyr::group_by(day) %>%
	  dplyr::summarise(
	    first_Q = quantile(qtime, 0.25, na.rm = TRUE, names = FALSE),
	    med = median(qtime, na.rm = TRUE),
	    mean = mean(qtime, na.rm = TRUE),
	    third_Q = quantile(qtime, 0.75, na.rm = TRUE, names = FALSE),
	    ninty_Q = quantile(qtime, 0.9, na.rm = TRUE, names = FALSE),
	    nintynine_Q = quantile(qtime, 0.99, na.rm = TRUE, names = FALSE),
	    max = max(qtime, na.rm = TRUE),
	    q_per_day = dplyr::n()
	  )

	# Print the per-day means followed by the per-day medians so the two vectors
	# can be compared by eye for how closely they agree.
	print(summary_of_qtime_day[["mean"]])
	print(summary_of_qtime_day[["med"]])

	# Plot the daily mean (red) and daily median (blue) query time against the
	# day.  Point size is a fixed geom parameter rather than aes(size = 3): inside
	# aes() the constant is treated as data and run through the size scale, so
	# the points are not drawn at size 3 and the resulting legend had to be hidden
	# with guides(size = FALSE), a form ggplot2 has deprecated in favour of "none".
	# The colour values are named so each series keeps its colour regardless of
	# the order in which the scale sorts the labels.
	p <- ggplot(summary_of_qtime_day, aes(x = day)) +
	  geom_point(aes(y = mean, color = "mean"), size = 3) +
	  geom_point(aes(y = med, color = "med"), size = 3) +
	  scale_color_manual(values = c(mean = "red", med = "blue")) +
	  labs(x = "day", y = "value")
	print(p)

	## Select the day whose statistics are closest to the across-day average.
	# Base R is used here rather than a dplyr pipeline.
	# NOTE(review): the statistics are compared on their raw scales, so the
	# large-valued columns (e.g. q_per_day, max) weigh most in the distance --
	# confirm this is intended.

	# Every column except the day key is a statistic to compare.  `res` is also
	# excluded so that re-running this section in the same session does not feed
	# the previous residuals back in as if they were a statistic.
	column_mask <- !(colnames(summary_of_qtime_day) %in% c("day", "res"))

	# Average of each descriptive statistic over all days.  na.rm = TRUE keeps a
	# single incomplete day from turning every average into NA.
	ave_stats <- colMeans(summary_of_qtime_day[column_mask], na.rm = TRUE)

	# Per-day sum of absolute deviations from those averages (one value per row),
	# computed on the whole matrix at once instead of row-by-row with apply().
	residuals <- rowSums(
	  abs(sweep(as.matrix(summary_of_qtime_day[column_mask]), 2, ave_stats))
	)

	# Attach the residuals as column `res`, replacing any `res` left by an
	# earlier run instead of adding a duplicate column.
	summary_of_qtime_day <- cbind(
	  summary_of_qtime_day[colnames(summary_of_qtime_day) != "res"],
	  res = residuals
	)

	# Choose the day(s) with the lowest sum of residuals; ties are all kept.
	# Comparing against min() of the same vector is an exact match, and
	# na.rm = TRUE ignores days whose residual could not be computed.
	min_res_idx <- which(
	  summary_of_qtime_day$res == min(summary_of_qtime_day$res, na.rm = TRUE)
	)

	matching_ave_day <- summary_of_qtime_day[min_res_idx, ]
	print("Closest day log to average")
	print(matching_ave_day)


	#summary(by_day)
	#day first_Q med mean third_Q
	#Min. : 1 Min. : 0.00 Min. : 0.00 Min. :111.1 Min. : 34.0
	#1st Qu.: 55 1st Qu.:30.00 1st Qu.: 90.00 1st Qu.:221.6 1st Qu.:269.0
	#Median :109 Median :31.00 Median : 99.00 Median :241.8 Median :286.0
	#Mean :109 Mean :30.44 Mean : 97.96 Mean :253.0 Mean :289.6
	#3rd Qu.:163 3rd Qu.:31.00 3rd Qu.:105.00 3rd Qu.:263.1 3rd Qu.:303.0
	#Max. :217 Max. :34.00 Max. :140.00 Max. :920.4 Max. :910.2
	#NA's :1 NA's :1 NA's :1 NA's :1 NA's :1
	#ninty_Q nintynine_Q max q_per_day
	#Min. : 194.1 Min. : 1166 Min. : 6100 Min. : 8
	#1st Qu.: 506.0 1st Qu.: 1518 1st Qu.:14887 1st Qu.:12908
	#Median : 552.7 Median : 1712 Median :21474 Median :17272
	#Mean : 584.9 Mean : 1934 Mean :23514 Mean :18161
	#3rd Qu.: 616.0 3rd Qu.: 1959 3rd Qu.:31507 3rd Qu.:20602
	#Max. :2438.0 Max. :13521 Max. :44937 Max. :45972
	#NA's :1 NA's :1 NA's :1