# Analysis Task
library(tidyverse)
library(lubridate)
library(data.table)
library(magrittr)  # needed for the %T>% tee operator used below
getwd()
dir = '~/Documents/open_signal/'
setwd(dir)
file = 'speed_data.csv'
df <- read.csv(file,stringsAsFactors = F)
glimpse(df)
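# Columns used below: hour_read_at, test_type, download_speed,
# network_name_mapped, country, device_id_time.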
df = df %>%
  mutate(hour_of_day = hour(hour_read_at),
         day_of_week = wday(hour_read_at))

# Relabel the 'speeds' test type as 'automatic'
df[df$test_type == 'speeds', 'test_type'] <- 'automatic'
df$date = as.Date(df$hour_read_at)
glimpse(df)
df %>%
  group_by(network_name_mapped, country, date) %>%
  summarise(mn = mean(download_speed)) %>%
  group_by(network_name_mapped, country) %>%
  summarise(mn_mn = mean(mn),
            low_mn = quantile(mn, 0.05),
            upp_mn = quantile(mn, 0.95)) %T>%
  print %>%
  qplot(data = .,
        x = network_name_mapped,
        y = mn_mn,
        ymin = low_mn,
        ymax = upp_mn,
        geom = 'errorbar') +
  geom_point(size = 4, colour = 'blue') +
  expand_limits(y = 0)
df %>%
  group_by(network_name_mapped, country, test_type, date) %>%
  summarise(mn = mean(download_speed)) %>%
  group_by(network_name_mapped, country, test_type) %>%
  summarise(mn_mn = mean(mn),
            low_mn = quantile(mn, 0.05),
            upp_mn = quantile(mn, 0.95)) %T>%
  print %>%
  qplot(data = .,
        x = network_name_mapped,
        y = mn_mn,
        ymin = low_mn,
        ymax = upp_mn,
        geom = 'errorbar') +
  geom_point(size = 4, colour = 'blue') +
  facet_wrap(~ test_type) +
  expand_limits(y = 0)
df %>%
  group_by(network_name_mapped, country, test_type) %>%
  summarise(cnt = n()) %>%
  spread(test_type, cnt)
df %>%
  group_by(network_name_mapped, country, date) %>%
  summarise(mn = mean(download_speed),
            max = max(download_speed),
            min = min(download_speed),
            stddv = sd(download_speed),
            med = median(download_speed),
            cnt = n()) %T>%
  print %>%
  gather(variable, value, -network_name_mapped, -country, -date) %>%
  qplot(data = .,
        x = interaction(network_name_mapped, country),
        y = value,
        geom = 'boxplot') +
  facet_wrap(~ variable, scales = 'free_y') +
  expand_limits(y = 0) +
  theme(axis.text.x = element_text(hjust = 0, angle = -20))
df %>%
  group_by(network_name_mapped, country, date, test_type) %>%
  summarise(mn = mean(download_speed),
            max = max(download_speed),
            min = min(download_speed),
            stddv = sd(download_speed),
            med = median(download_speed),
            cnt = n()) %T>%
  print %>%
  gather(variable, value, -network_name_mapped, -country, -test_type, -date) %>%
  qplot(data = .,
        x = interaction(network_name_mapped, country),
        y = value,
        group = network_name_mapped,
        colour = test_type,
        geom = 'boxplot') +
  facet_wrap(test_type ~ variable, scales = 'free_y', nrow = 2) +
  expand_limits(y = 0) +
  theme(axis.text.x = element_text(hjust = 0, angle = -20))
# Reasons speeds might vary:
# - Geography
# - Frequency
# - Time of day / day of week (seasonality)
#############################
# Initial Approach
# Can we derive a benchmark for the time of day?
summary.1 =
  df %>%
  group_by(network_name_mapped, country, test_type, hour_of_day) %>%
  summarise(stddev = sd(download_speed))
filter(summary.1, country == 'USA') %>%
  qplot(data = .,
        x = factor(hour_of_day),
        y = stddev,
        colour = test_type) +
  # a second facet_wrap() would replace the first, so facet on both at once
  facet_wrap(test_type ~ network_name_mapped, scales = 'free_y')
glimpse(df)
# Note: hour_of_day and day_of_week enter as numeric covariates here;
# wrapping them in factor() would instead fit one effect per hour/day.
model.1 = lm(download_speed ~ hour_of_day + day_of_week, data = df)
summary(model.1)

model.2 = lm(download_speed ~ hour_of_day +
               day_of_week +
               test_type +
               network_name_mapped, data = df)
summary(model.2)
# Are some data sources obviously generating 'wild' data?
df %>%
  group_by(network_name_mapped, country, test_type, device_id_time) %>%
  summarise(stddev = sd(log(download_speed))) %>%
  qplot(data = .,
        x = stddev) +
  facet_wrap(test_type ~ network_name_mapped, scales = 'free_y')
# Could we just compare the mean for a 90% quantile
# 'band' against the overall mean?
# Is the data sufficiently well behaved?
df =
  df %>%
  group_by(date, network_name_mapped) %>%
  mutate(upp = quantile(download_speed, 0.95),
         low = quantile(download_speed, 0.05)) %>%
  mutate(in_band = ifelse(download_speed > low & download_speed < upp, 'yes', 'no'))

df %>%
  group_by(in_band) %>%
  summarise(n())
left_join({
  df %>%
    group_by(date, network_name_mapped) %>%
    summarise(mn = mean(download_speed), vr = var(download_speed), cnt = n())
}, {
  df %>%
    filter(in_band == 'yes') %>%
    group_by(date, network_name_mapped) %>%
    summarise(in_band_mn = mean(download_speed), in_band_vr = var(download_speed))
}) %>%
  filter(!is.na(in_band_mn)) %>%
  mutate(is_anomalous = (abs(in_band_mn - mn) / abs(mn)) > 0.2) %>%
  qplot(data = .,
        x = date,
        y = mn,
        group = factor(1),
        colour = is_anomalous,
        geom = 'point') +
  facet_grid(is_anomalous ~ network_name_mapped)
# Answer: NO !
# attempt two
df %>% glimpse

df %>%
  group_by(device_id_time, test_type, network_name_mapped) %>%
  summarise(cnt = n()) %T>%
  {print(summary(.))} %>%  # %T>% summary alone would compute but never print
  qplot(data = .,
        x = log(cnt),
        fill = network_name_mapped) +
  facet_wrap(test_type ~ network_name_mapped,
             scales = 'free_y',
             ncol = 4)
df %>%
  group_by(date, network_name_mapped) %>%
  summarise(mn = mean(download_speed), vr = var(download_speed), cnt = n()) %>%
  gather(variable, value, -network_name_mapped, -date) %>%
  qplot(data = .,
        x = date,
        y = value,
        group = variable,
        colour = variable,
        geom = 'point') +
  facet_wrap(variable ~ network_name_mapped,
             scales = 'free_y',
             ncol = 4) +
  expand_limits(y = 0)
# Network 1:
#   Stable.
# Networks 2 & 4:
#   cnt, mean & variance go nuts at a certain point in time.
# Network 3:
#   cnt shoots up at a certain point in time (promotional activity?).
#   This clearly has an impact on mean & var, but it's not clear that the
#   change is unexplained. I'm guessing that perhaps new geographies are
#   being monitored, or perhaps certain existing ones more intensively.
#   Since the change is gradual rather than immediate, it's not obvious
#   how to treat it.
#
# What we really want is a measure of 'speed' which is clearly understood
# in terms of devices & geography. This would then highlight new 'fast'
# areas, or existing 'fast' areas which are being reported more frequently.
# (A rough sketch of one such measure follows the next plot.)
df %>%
  group_by(date, network_name_mapped, device_id_time) %>%
  summarise(cnt = n()) %>%
  group_by(date, network_name_mapped) %>%
  summarise(mn_cnt = mean(cnt), max_cnt = max(cnt), min_cnt = min(cnt)) %>%
  gather(variable, value, -network_name_mapped, -date) %>%
  qplot(data = .,
        x = date,
        y = value,
        group = variable,
        colour = variable,
        geom = 'point') +
  facet_wrap(variable ~ network_name_mapped,
             scales = 'free_y',
             ncol = 4) +
  expand_limits(y = 0)
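# A rough sketch (not part of the original analysis) of a device-adjusted
# speed measure, using only columns already in df: express each test's speed
# relative to its device's long-run mean within its country, then average
# per day. A stable mn_rel alongside a rising cnt would suggest a change in
# which devices/areas report, rather than a genuine speed shift.
df %>%
  group_by(network_name_mapped, country, device_id_time) %>%
  mutate(rel_speed = download_speed / mean(download_speed)) %>%
  group_by(date, network_name_mapped) %>%
  summarise(mn_rel = mean(rel_speed), cnt = n())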
library(broom)

df.1 =
  df %>%
  group_by(date, network_name_mapped) %>%
  summarise(vr = var(download_speed),
            mn = mean(download_speed),
            cnt = n(),
            man_cnt = sum(ifelse(test_type == 'automatic', 1, 0))) %>%
  gather(variable, value, -date, -network_name_mapped) %>%
  ungroup
df.1 %>%
  mutate(date_num = as.numeric(date)) %>%
  # filter(date == max(date))
  group_by(network_name_mapped, variable) %>%
  mutate(thresh = ifelse(date_num > (max(date_num) - 50), 1, 0)) %>%
  # (span is a loess() argument, not an lm() one, so it is dropped here)
  do(fit = lm(value ~ date_num + thresh, data = .), data = (.)) %>%
  augment(fit, data) %>%
  select(date, network_name_mapped, variable, value, fitted = .fitted, thresh) %>%
  qplot(data = .,
        x = date,
        y = value,
        group = network_name_mapped,
        geom = 'blank') +
  geom_line(aes(x = date, y = fitted, group = network_name_mapped, colour = thresh)) +
  facet_wrap(variable ~ network_name_mapped, scales = 'free_y') +
  expand_limits(y = 0)
dt =
  df %>%
  arrange(date) %>%
  group_by(date, network_name_mapped) %>%
  summarise(vr = var(download_speed),
            mn = mean(download_speed),
            cnt = n(),
            man_vr = var(ifelse(test_type == 'automatic', download_speed, NA), na.rm = T),
            man_mn = mean(ifelse(test_type == 'automatic', download_speed, NA), na.rm = T),
            man_cnt = sum(ifelse(test_type == 'automatic', 1, 0))) %>%
  data.table
dt.2 = dt[network_name_mapped == 'Network2']

# Choose 'today' = day 195.
# It should be easy to spot problems now.
(days = dim(dt.2)[1])
day_ = 195

# Split the data:
(prev = dt.2[1:(day_ - 1)])
(curr = dt.2[day_])
all = rbind(prev, curr)
samples = dim(all)[1]
all_mn = mean(all$mn)
prev_mn = mean(prev$mn)
curr_mn = mean(curr$mn)
# Total sum of squares
(sst = sum((all$mn - all_mn)^2))
# Residual sum of squares
(ssr = sum((prev$mn - prev_mn)^2) + sum((curr$mn - curr_mn)^2))
# Model sum of squares
(ssm = sst - ssr)
(r_squared = ssm / sst)  # R-squared is the model share of the *total* SS
dgf = 1  # <- degrees of freedom used by the prev/curr split
mn_ssm = ssm / dgf
mn_ssr = ssr / (samples - dgf - 1)
(F_ratio = mn_ssm / mn_ssr)
# F test (assumes the daily means are roughly normally distributed):
p = pf(F_ratio, df1 = dgf, df2 = samples - dgf - 1, lower.tail = FALSE)
p
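# The same test via lm(): the model F statistic below should match F_ratio,
# and its p-value should match p above.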
mod.2 =
  all %>%
  mutate(is_new_unexplained_mean = ifelse(date == max(all$date), 1, 0)) %>%
  lm(mn ~ is_new_unexplained_mean, data = .)
mod.2.summary = summary(mod.2)
mod.2.summary$fstatistic
mod.2.summary
# https://stackoverflow.com/questions/5587676/pull-out-p-values-and-r-squared-from-a-linear-regression
get_f_ratio_p <- function(modelobject) {
  if (class(modelobject) != "lm") stop("Not an object of class 'lm'")
  f <- summary(modelobject)$fstatistic
  p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
  attributes(p) <- NULL
  return(p)
}
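# Example: this should agree with the hand-computed p above.
get_f_ratio_p(mod.2)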
# Set 'today'
day_indx = 195
prev_days_to_check = 120
r_squared_threshold = .25
f_ratio_threshold = 0.001

dt.filtered = dt.2[1:day_indx]
out_list = list()

# For each candidate start day i, flag days i..today as the 'new' regime and
# record the F-test p-value and R-squared of the corresponding one-factor fit.
for (i in 1:day_indx) {
  dt.filtered[, is_new_unexplained_mean := 0]
  dt.filtered[i:day_indx, is_new_unexplained_mean := 1]
  # print(dt.filtered)  # debug output
  mod = lm(mn ~ is_new_unexplained_mean, data = dt.filtered)
  frp = 0; try({frp = get_f_ratio_p(mod)})
  rsqr = summary(mod)$r.squared
  out_list[[i]] = data.frame(date = dt.filtered[i]$date,
                             f.ratio.p = frp,
                             r.squared = rsqr)
}
wanted_measures = c('f.ratio.p',
                    'mn',
                    # 'r.squared',
                    'r.squared.loess',
                    'r.squared.slope',
                    'r.squared.accel')
res =
  bind_rows(out_list) %>%
  inner_join(dt.filtered) %>%
  mutate(log.f.ratio.p = log(f.ratio.p))

# Smooth r.squared over time, then take its first and second differences
# (slope and acceleration).
res$r.squared.loess = predict(loess(r.squared ~ as.numeric(date), span = .15, data = res))
res$r.squared.slope = c(0, diff(res$r.squared.loess, differences = 1))
res$r.squared.accel = c(0, 0, diff(res$r.squared.loess, differences = 2))
anomaly_date =
  res %>%
  slice(max(1, day_indx - prev_days_to_check):day_indx) %>%
  filter(f.ratio.p < f_ratio_threshold) %>%
  filter(r.squared.accel == max(r.squared.accel)) %>%
  select(date)
res %>%
  slice(2:n()) %>%
  gather(variable, value, -date, -network_name_mapped) %>%
  filter(variable %in% wanted_measures) %>%
  mutate(post_anomaly = date >= unlist(anomaly_date)) %>%
  qplot(data = .,
        x = date,
        y = value,
        colour = post_anomaly,
        group = variable) +
  facet_wrap(~ variable,
             scales = 'free_y') +
  scale_color_brewer(palette = 'Set2') +
  theme_linedraw()
print(anomaly_date)
# TODO:
# Choosing the date where the rate of change in r.squared is at a maximum is
# not the only option. We could instead choose the first inflection point in
# r.squared where the slope is positive; a rough sketch of that follows.
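# Sketch (assumes the smoothed columns computed above): take the first date
# where the smoothed slope is positive and the acceleration changes sign,
# i.e. an inflection point. 'alt_anomaly_date' is illustrative, not part of
# the original analysis.
alt_anomaly_date =
  res %>%
  mutate(sign_flip = sign(r.squared.accel) != sign(lag(r.squared.accel))) %>%
  filter(r.squared.slope > 0, sign_flip) %>%
  slice(1) %>%
  select(date)
print(alt_anomaly_date)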
# Brainstorm / work in progress.