Skip to content

Instantly share code, notes, and snippets.

@grosscol
Created November 10, 2016 21:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save grosscol/387a3962891952a5f22df9807e2e2221 to your computer and use it in GitHub Desktop.
Save grosscol/387a3962891952a5f22df9807e2e2221 to your computer and use it in GitHub Desktop.
Selecting a similar distribution
require('dplyr')
require('ggplot2')
# Simulate a query log: every query gets a random day (1-30) and a
# Poisson-distributed duration.
# The three rates are recycled across the samples, so about two thirds of the
# queries are fast (mean 25 or 35) and about one third are slow (mean 100).
# No seed is set, so each run produces a different log.
num_samples <- 10000
days <- sample(seq_len(30), num_samples, replace = TRUE)
qtimes <- rpois(num_samples, lambda = c(25, 35, 100))
qlog <- data.frame(day = days, qtime = qtimes)
# Quick visual check of the simulated log: geom_count() draws one point per
# distinct (day, qtime) pair and scale_size_area() scales the point area with
# the number of queries at that pair.
ggplot(data = qlog, mapping = aes(x = day, y = qtime)) +
  geom_count() +
  scale_size_area()
# Per-day descriptive statistics of query time: quartiles, median, mean,
# upper-tail quantiles, maximum, and the number of queries logged that day.
# group_by() + summarise() already yields one row per day ordered by day, so
# no explicit sort is needed beforehand.
summary_of_qtime_day <- qlog %>%
  dplyr::group_by(day) %>%
  dplyr::summarise(
    first_Q = quantile(qtime, 0.25, na.rm = TRUE),
    med = median(qtime, na.rm = TRUE),
    mean = mean(qtime, na.rm = TRUE),
    third_Q = quantile(qtime, 0.75, na.rm = TRUE),
    ninty_Q = quantile(qtime, 0.9, na.rm = TRUE),
    nintynine_Q = quantile(qtime, 0.99, na.rm = TRUE),
    max = max(qtime, na.rm = TRUE),
    q_per_day = dplyr::n()
  )
# Take a look at how close or divergent the daily medians and means are.
print(summary_of_qtime_day$mean)
print(summary_of_qtime_day$med)
# Plot the daily mean and median against day, coloured by statistic.
# The point size is a fixed setting rather than a mapped aesthetic, so no size
# legend is generated and none has to be suppressed with guides().
# The manual colours are named so each statistic keeps its colour regardless
# of level ordering.
p <- ggplot(summary_of_qtime_day, aes(x = day)) +
  labs(x = "day", y = "value") +
  scale_color_manual(values = c(mean = "red", med = "blue")) +
  geom_point(aes(y = mean, color = "mean"), size = 3) +
  geom_point(aes(y = med, color = "med"), size = 3)
print(p)
# Base R is used from here on instead of dplyr, just to get things to work.
# There is likely a more elegant dplyr approach to the following.
## Select the day whose statistics are the closest to the average statistics.
# Every summary column except the day identifier takes part in the comparison.
column_mask <- colnames(summary_of_qtime_day) != "day"
# Work on a numeric matrix so the row-wise arithmetic below is vectorised.
stat_matrix <- as.matrix(summary_of_qtime_day[column_mask])
# Average of each descriptive statistic over all the days.
ave_stats <- colMeans(stat_matrix)
# Per-day sum of absolute deviations from those averages.
# NOTE: deviations are summed in raw units, so statistics with larger
# magnitudes contribute more to the total.
residuals <- rowSums(abs(sweep(stat_matrix, 2, ave_stats)))
summary_of_qtime_day <- cbind(summary_of_qtime_day, res = residuals)
# Choose the day(s) with the lowest sum of residuals; tied rows are all kept.
# The equality test is exact because min() returns one of the compared values.
min_res_idx <- which(summary_of_qtime_day$res == min(summary_of_qtime_day$res))
matching_ave_day <- summary_of_qtime_day[min_res_idx, ]
print("Closest day log to average")
print(matching_ave_day)
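# Sample output of summary(by_day). `by_day` is not defined in this script and
# the values below (e.g. 217 days) do not come from the simulated data above.
#summary(by_day)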
#day first_Q med mean third_Q
#Min. : 1 Min. : 0.00 Min. : 0.00 Min. :111.1 Min. : 34.0
#1st Qu.: 55 1st Qu.:30.00 1st Qu.: 90.00 1st Qu.:221.6 1st Qu.:269.0
#Median :109 Median :31.00 Median : 99.00 Median :241.8 Median :286.0
#Mean :109 Mean :30.44 Mean : 97.96 Mean :253.0 Mean :289.6
#3rd Qu.:163 3rd Qu.:31.00 3rd Qu.:105.00 3rd Qu.:263.1 3rd Qu.:303.0
#Max. :217 Max. :34.00 Max. :140.00 Max. :920.4 Max. :910.2
#NA's :1 NA's :1 NA's :1 NA's :1 NA's :1
#ninty_Q nintynine_Q max q_per_day
#Min. : 194.1 Min. : 1166 Min. : 6100 Min. : 8
#1st Qu.: 506.0 1st Qu.: 1518 1st Qu.:14887 1st Qu.:12908
#Median : 552.7 Median : 1712 Median :21474 Median :17272
#Mean : 584.9 Mean : 1934 Mean :23514 Mean :18161
#3rd Qu.: 616.0 3rd Qu.: 1959 3rd Qu.:31507 3rd Qu.:20602
#Max. :2438.0 Max. :13521 Max. :44937 Max. :45972
#NA's :1 NA's :1 NA's :1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment