# Analysis Task
library(tidyverse)
library(lubridate)
library(data.table)
library(magrittr)  # needed for the %T>% tee operator used below
getwd()
dir = '~/Documents/open_signal/'
setwd(dir)
file = 'speed_data.csv'
df <- read.csv(file,stringsAsFactors = F)
glimpse(df)
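# Columns used below: hour_read_at, test_type, download_speed,
# network_name_mapped, country, device_id_time.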
df = df %>%
  mutate(hour_of_day = hour(hour_read_at),
         day_of_week = wday(hour_read_at))

# Relabel the 'speeds' test type as 'automatic'
df[df$test_type == 'speeds', 'test_type'] <- 'automatic'
df$date = as.Date(df$hour_read_at)
glimpse(df)
df %>%
  group_by(network_name_mapped, country, date) %>%
  summarise(mn = mean(download_speed)) %>%
  group_by(network_name_mapped, country) %>%
  summarise(mn_mn = mean(mn),
            low_mn = quantile(mn, 0.05),
            upp_mn = quantile(mn, 0.95)) %T>%
  print %>%
  qplot(data = .,
        x = network_name_mapped,
        y = mn_mn,
        ymin = low_mn,
        ymax = upp_mn,
        geom = 'errorbar') +
  geom_point(size = 4, colour = 'blue') +
  expand_limits(y = 0)
df %>%
  group_by(network_name_mapped, country, test_type, date) %>%
  summarise(mn = mean(download_speed)) %>%
  group_by(network_name_mapped, country, test_type) %>%
  summarise(mn_mn = mean(mn),
            low_mn = quantile(mn, 0.05),
            upp_mn = quantile(mn, 0.95)) %T>%
  print %>%
  qplot(data = .,
        x = network_name_mapped,
        y = mn_mn,
        ymin = low_mn,
        ymax = upp_mn,
        geom = 'errorbar') +
  geom_point(size = 4, colour = 'blue') +
  facet_wrap(~ test_type) +
  expand_limits(y = 0)
df %>%
  group_by(network_name_mapped, country, test_type) %>%
  summarise(cnt = n()) %>%
  spread(test_type, cnt)
df %>%
  group_by(network_name_mapped, country, date) %>%
  summarise(mn = mean(download_speed),
            max = max(download_speed),
            min = min(download_speed),
            stddv = sd(download_speed),
            med = median(download_speed),
            cnt = n()) %T>%
  print %>%
  gather(variable, value, -network_name_mapped, -country, -date) %>%
  qplot(data = .,
        x = interaction(network_name_mapped, country),
        y = value,
        geom = 'boxplot') +
  facet_wrap(~ variable, scales = 'free_y') +
  expand_limits(y = 0) +
  theme(axis.text.x = element_text(hjust = 0, angle = -20))
df %>%
  group_by(network_name_mapped, country, date, test_type) %>%
  summarise(mn = mean(download_speed),
            max = max(download_speed),
            min = min(download_speed),
            stddv = sd(download_speed),
            med = median(download_speed),
            cnt = n()) %T>%
  print %>%
  gather(variable, value, -network_name_mapped, -country, -test_type, -date) %>%
  qplot(data = .,
        x = interaction(network_name_mapped, country),
        y = value,
        group = network_name_mapped,
        colour = test_type,
        geom = 'boxplot') +
  facet_wrap(test_type ~ variable, scales = 'free_y', nrow = 2) +
  expand_limits(y = 0) +
  theme(axis.text.x = element_text(hjust = 0, angle = -20))
# Reasons speeds might vary:
# - Geography
# - Frequency
# - Time of day / day of week (seasonality)
#############################
# Initial Approach
# Can we derive a benchmark for the time of day?
summary.1 =
  df %>%
  group_by(network_name_mapped, country, test_type, hour_of_day) %>%
  summarise(stddev = sd(download_speed))
filter(summary.1, country == 'USA') %>%
  qplot(data = .,
        x = factor(hour_of_day),
        y = stddev,
        colour = test_type) +
  # a second facet_wrap() would replace the first, so facet on both at once
  facet_wrap(test_type ~ network_name_mapped, scales = 'free_y')
glimpse(df)
# Note: hour_of_day and day_of_week enter as numeric covariates here;
# wrapping them in factor() would instead fit one effect per hour/day.
model.1 = lm(download_speed ~ hour_of_day + day_of_week, data = df)
summary(model.1)

model.2 = lm(download_speed ~ hour_of_day +
               day_of_week +
               test_type +
               network_name_mapped, data = df)
summary(model.2)
# Are some data sources obviously generating 'wild' data?
df %>%
  group_by(network_name_mapped, country, test_type, device_id_time) %>%
  summarise(stddev = sd(log(download_speed))) %>%
  qplot(data = .,
        x = stddev) +
  facet_wrap(test_type ~ network_name_mapped, scales = 'free_y')
# Could we just compare the mean for a 90% quantile
# 'band' against the overall mean?
# Is the data sufficiently well behaved?
df =
  df %>%
  group_by(date, network_name_mapped) %>%
  mutate(upp = quantile(download_speed, 0.95),
         low = quantile(download_speed, 0.05)) %>%
  mutate(in_band = ifelse(download_speed > low & download_speed < upp, 'yes', 'no'))

df %>%
  group_by(in_band) %>%
  summarise(n())
left_join({
  df %>%
    group_by(date, network_name_mapped) %>%
    summarise(mn = mean(download_speed), vr = var(download_speed), cnt = n())
}, {
  df %>%
    filter(in_band == 'yes') %>%
    group_by(date, network_name_mapped) %>%
    summarise(in_band_mn = mean(download_speed), in_band_vr = var(download_speed))
}) %>%
  filter(!is.na(in_band_mn)) %>%
  mutate(is_anomalous = (abs(in_band_mn - mn) / abs(mn)) > 0.2) %>%
  qplot(data = .,
        x = date,
        y = mn,
        group = factor(1),
        colour = is_anomalous,
        geom = 'point') +
  facet_grid(is_anomalous ~ network_name_mapped)
# Answer: NO !
# attempt two
df %>% glimpse

df %>%
  group_by(device_id_time, test_type, network_name_mapped) %>%
  summarise(cnt = n()) %T>%
  {print(summary(.))} %>%  # %T>% summary alone would compute but never print
  qplot(data = .,
        x = log(cnt),
        fill = network_name_mapped) +
  facet_wrap(test_type ~ network_name_mapped,
             scales = 'free_y',
             ncol = 4)
df %>%
  group_by(date, network_name_mapped) %>%
  summarise(mn = mean(download_speed), vr = var(download_speed), cnt = n()) %>%
  gather(variable, value, -network_name_mapped, -date) %>%
  qplot(data = .,
        x = date,
        y = value,
        group = variable,
        colour = variable,
        geom = 'point') +
  facet_wrap(variable ~ network_name_mapped,
             scales = 'free_y',
             ncol = 4) +
  expand_limits(y = 0)
# Network 1:
#   Stable.
# Networks 2 & 4:
#   cnt, mean & variance go nuts at a certain point in time.
# Network 3:
#   cnt shoots up at a certain point in time (promotional activity?).
#   This clearly has an impact on mean & var, but it's not clear that the
#   change is unexplained. I'm guessing that perhaps new geographies are
#   being monitored, or perhaps certain existing ones more intensively.
#   Since the change is gradual rather than immediate, it's not obvious
#   how to treat it.
#
# What we really want is a measure of 'speed' which is clearly understood
# in terms of devices & geography. This would then highlight new 'fast'
# areas, or existing 'fast' areas which are being reported more frequently.
# (A rough sketch of one such measure follows the next plot.)
df %>%
  group_by(date, network_name_mapped, device_id_time) %>%
  summarise(cnt = n()) %>%
  group_by(date, network_name_mapped) %>%
  summarise(mn_cnt = mean(cnt), max_cnt = max(cnt), min_cnt = min(cnt)) %>%
  gather(variable, value, -network_name_mapped, -date) %>%
  qplot(data = .,
        x = date,
        y = value,
        group = variable,
        colour = variable,
        geom = 'point') +
  facet_wrap(variable ~ network_name_mapped,
             scales = 'free_y',
             ncol = 4) +
  expand_limits(y = 0)
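# A rough sketch (not part of the original analysis) of a device-adjusted
# speed measure, using only columns already in df: express each test's speed
# relative to its device's long-run mean within its country, then average
# per day. A stable mn_rel alongside a rising cnt would suggest a change in
# which devices/areas report, rather than a genuine speed shift.
df %>%
  group_by(network_name_mapped, country, device_id_time) %>%
  mutate(rel_speed = download_speed / mean(download_speed)) %>%
  group_by(date, network_name_mapped) %>%
  summarise(mn_rel = mean(rel_speed), cnt = n())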
library(broom)

df.1 =
  df %>%
  group_by(date, network_name_mapped) %>%
  summarise(vr = var(download_speed),
            mn = mean(download_speed),
            cnt = n(),
            man_cnt = sum(ifelse(test_type == 'automatic', 1, 0))) %>%
  gather(variable, value, -date, -network_name_mapped) %>%
  ungroup
df.1 %>%
  mutate(date_num = as.numeric(date)) %>%
  # filter(date == max(date))
  group_by(network_name_mapped, variable) %>%
  mutate(thresh = ifelse(date_num > (max(date_num) - 50), 1, 0)) %>%
  # (span is a loess() argument, not an lm() one, so it is dropped here)
  do(fit = lm(value ~ date_num + thresh, data = .), data = (.)) %>%
  augment(fit, data) %>%
  select(date, network_name_mapped, variable, value, fitted = .fitted, thresh) %>%
  qplot(data = .,
        x = date,
        y = value,
        group = network_name_mapped,
        geom = 'blank') +
  geom_line(aes(x = date, y = fitted, group = network_name_mapped, colour = thresh)) +
  facet_wrap(variable ~ network_name_mapped, scales = 'free_y') +
  expand_limits(y = 0)
dt =
  df %>%
  arrange(date) %>%
  group_by(date, network_name_mapped) %>%
  summarise(vr = var(download_speed),
            mn = mean(download_speed),
            cnt = n(),
            man_vr = var(ifelse(test_type == 'automatic', download_speed, NA), na.rm = T),
            man_mn = mean(ifelse(test_type == 'automatic', download_speed, NA), na.rm = T),
            man_cnt = sum(ifelse(test_type == 'automatic', 1, 0))) %>%
  data.table
dt.2 = dt[network_name_mapped == 'Network2']

# Choose 'today' = day 195.
# It should be easy to spot problems now.
(days = dim(dt.2)[1])
day_ = 195

# Split the data:
(prev = dt.2[1:(day_ - 1)])
(curr = dt.2[day_])
all = rbind(prev, curr)
samples = dim(all)[1]
all_mn = mean(all$mn)
prev_mn = mean(prev$mn)
curr_mn = mean(curr$mn)
# Total sum of squares
(sst = sum((all$mn - all_mn)^2))
# Residual sum of squares
(ssr = sum((prev$mn - prev_mn)^2) + sum((curr$mn - curr_mn)^2))
# Model sum of squares
(ssm = sst - ssr)
(r_squared = ssm / sst)  # R-squared is the model share of the *total* SS
dgf = 1  # <- degrees of freedom used by the prev/curr split
mn_ssm = ssm / dgf
mn_ssr = ssr / (samples - dgf - 1)
(F_ratio = mn_ssm / mn_ssr)
# F test (assumes the daily means are roughly normally distributed):
p = pf(F_ratio, df1 = dgf, df2 = samples - dgf - 1, lower.tail = FALSE)
p
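# The same test via lm(): the model F statistic below should match F_ratio,
# and its p-value should match p above.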
mod.2 =
  all %>%
  mutate(is_new_unexplained_mean = ifelse(date == max(all$date), 1, 0)) %>%
  lm(mn ~ is_new_unexplained_mean, data = .)
mod.2.summary = summary(mod.2)
mod.2.summary$fstatistic
mod.2.summary
# https://stackoverflow.com/questions/5587676/pull-out-p-values-and-r-squared-from-a-linear-regression
get_f_ratio_p <- function(modelobject) {
  if (class(modelobject) != "lm") stop("Not an object of class 'lm'")
  f <- summary(modelobject)$fstatistic
  p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
  attributes(p) <- NULL
  return(p)
}
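# Example: this should agree with the hand-computed p above.
get_f_ratio_p(mod.2)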
# Set 'today'
day_indx = 195
prev_days_to_check = 120
r_squared_threshold = .25
f_ratio_threshold = 0.001

dt.filtered = dt.2[1:day_indx]
out_list = list()

# For each candidate start day i, flag days i..today as the 'new' regime and
# record the F-test p-value and R-squared of the corresponding one-factor fit.
for (i in 1:day_indx) {
  dt.filtered[, is_new_unexplained_mean := 0]
  dt.filtered[i:day_indx, is_new_unexplained_mean := 1]
  # print(dt.filtered)  # debug output
  mod = lm(mn ~ is_new_unexplained_mean, data = dt.filtered)
  frp = 0; try({frp = get_f_ratio_p(mod)})
  rsqr = summary(mod)$r.squared
  out_list[[i]] = data.frame(date = dt.filtered[i]$date,
                             f.ratio.p = frp,
                             r.squared = rsqr)
}
wanted_measures = c('f.ratio.p',
                    'mn',
                    # 'r.squared',
                    'r.squared.loess',
                    'r.squared.slope',
                    'r.squared.accel')
res =
  bind_rows(out_list) %>%
  inner_join(dt.filtered) %>%
  mutate(log.f.ratio.p = log(f.ratio.p))

# Smooth r.squared over time, then take its first and second differences
# (slope and acceleration).
res$r.squared.loess = predict(loess(r.squared ~ as.numeric(date), span = .15, data = res))
res$r.squared.slope = c(0, diff(res$r.squared.loess, differences = 1))
res$r.squared.accel = c(0, 0, diff(res$r.squared.loess, differences = 2))
anomaly_date =
  res %>%
  slice(max(1, day_indx - prev_days_to_check):day_indx) %>%
  filter(f.ratio.p < f_ratio_threshold) %>%
  filter(r.squared.accel == max(r.squared.accel)) %>%
  select(date)
res %>%
  slice(2:n()) %>%
  gather(variable, value, -date, -network_name_mapped) %>%
  filter(variable %in% wanted_measures) %>%
  mutate(post_anomaly = date >= unlist(anomaly_date)) %>%
  qplot(data = .,
        x = date,
        y = value,
        colour = post_anomaly,
        group = variable) +
  facet_wrap(~ variable,
             scales = 'free_y') +
  scale_color_brewer(palette = 'Set2') +
  theme_linedraw()
print(anomaly_date)
# TODO:
# Choosing the date where the rate of change in r.squared is at a maximum is
# not the only option. We could instead choose the first inflection point in
# r.squared where the slope is positive; a rough sketch of that follows.
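# Sketch (assumes the smoothed columns computed above): take the first date
# where the smoothed slope is positive and the acceleration changes sign,
# i.e. an inflection point. 'alt_anomaly_date' is illustrative, not part of
# the original analysis.
alt_anomaly_date =
  res %>%
  mutate(sign_flip = sign(r.squared.accel) != sign(lag(r.squared.accel))) %>%
  filter(r.squared.slope > 0, sign_flip) %>%
  slice(1) %>%
  select(date)
print(alt_anomaly_date)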
# Brainstorm / work in progress.