hibernado/.gitignore

## .gitignore
TES Data Science technical screen.pdf
TESDataScienceData.csv
*.RData
*.Rhistory

## analysis_2.txt
# Analysis Task:

# Are things popular because they are popular ?

We could look at the conditional popularity as a plot.
So we could look at what was popular last week / month
and compare it to what is popular to this week / month.
We could bin the 'prior' popularity and compare it to
this period's binned popularity.

To really understand this we might want to test.
If we say that x resource has 10000 downloads in one group
versus 4 in another group. What is the eventual download rate?
-> Probably ok to do this for things that are free (ethics!).


# Are thing popular because the author is popular ?
Here I suggest taking 'prolific' authors.
Does the 'histogram' of downloads by author follow a similar shape?
Does it differ only by scale (mean & var) ?
We would probably need to account for different subjects and other factors.


# Are things popular because of the price ?
I suspect that the interaction between price and popularity isn't simple / linear.
I'd imagine that free things generally have a lower download rate than things which cost a little.
Then I'd expect popularity to rapidly decrease for things which cost more.
-> a scatter plot of price to popularity

At the same time I'd expect that things which cost more would have greater 'longevity' ? In
other words I'd expect that more expensive things are less popular but provide a more reliable stream
of downloads/income.


# Are things popular because of the rating ?

I suspect that a resource's rating has an impact on the number of downloads.
-> a scatter plot of rating to number of downloads.
I suspect that the first 3/4/5 ratings have a big impact on the eventual number of downloads.
-> a scatter plot of the average ratings after 3/4/5 ratings v total downloads.

## sql_1_1.sql
-- 1) Write a SQL query that will return the top downloaded 5 resources by subject for each month
-- since 1st September 2015. Only include Mathematics, English and History.


with filter_raw_events as (

	select
		date_trunc('month', e.eventTime) eventMonth
		,r.id
		,r.title
		,r.subject
		,count(1) event_count

	from events.eventStream e
		join content.resourceDetails r on e.assetId = r.id
		where 1=1
			and e.assetType = 'resource'
			and e.event = 'download'
			and e.eventTime >= '2015-09-01'::timestamp
			and r.subject in ('Mathematics','English','History')

	group by 1,2,3,4
), summarise_events as (

	select
		 eventMonth
		,r.title
		,r.subject
		,row_number() over (partition by eventMonth,subject order by event_count desc) rank_per_subject_per_month

	from filter_raw_events
)

select *
from summarise_events
where rank_per_subject_per_month <= 5


## sql_1_2.sql
-- 2) We consider active users to be users that have been active in the last 28 days.
-- Please write a SQL query to show the percentage of users in the dataset considered active, split by week of their first visit.

with munge_raw_events as (

	select
		date_trunc('week', min(e.eventTime)) firstVisitWeek
		,max(case when trunc(e.eventTime) >= current_date - 28 then 1 end) isActive
		,userId
	from events.eventStream e
	group by userId

)

select
	firstVisitWeek
	,100.0 * count(isActive) / sum(count(userId)) over () as percentActive

from munge_raw_events
group by firstVisitWeek


## tes_interview.R
library(tidyverse)
library(data.table)

dir = '~/Documents/data_science/TES_interview/48ed0f570043a81687fa0317f8776df1/'

setwd(dir)
getwd()

TesData = 'TESDataScienceData.csv'
df <- read.csv(TesData)

summary(df)
nrow(df)


#########################
# DATA EXPLORATION
#########################

# Approach: look at every column for evidence of a link to the total number of views
# the goal is to gain an understanding of how different the total views are when
# split by a particular field (mean and var).

# info_gender

qplot(data = df, x = info_gender, fill = info_gender)
qplot(data = df, x = info_gender, y = total_views, geom = 'blank') + geom_boxplot()

df %>%
  filter(total_views < 4000) %>%
  qplot(data = ., x = total_views, fill = info_gender, geom = 'blank') + geom_histogram(alpha = 0.6)

df %>%
  filter(total_views < 4000) %>%
  qplot(data = ., x = total_views, fill = info_gender, geom = 'blank') + geom_histogram(alpha = 0.6) +
  facet_wrap(~info_gender, scales = 'free')

# -> info_gender does not appear to have much influence on the total number of views


# advert_start_date / advert_end_date / advert_duration
df$advert_start_date <- as.Date(df$advert_start_date, '%d/%m/%Y')
df$advert_end_date <- as.Date(df$advert_end_date, '%d/%m/%Y')
df$advert_duration <- as.double(df$advert_end_date - df$advert_start_date)

qplot(data = df, x = advert_start_date, geom = 'blank') + geom_bar()
qplot(data = df, x = advert_end_date, geom = 'blank') + geom_bar()
qplot(data = df, x = advert_duration, geom = 'blank') + geom_bar()

df %>%
  group_by(advert_duration) %>%
  summarise( mn_views = mean(total_views), stdDev_views = sqrt(var(total_views))) %>%
  select(advert_duration, mn_views,stdDev_views) %>%
  # glimpse
  # filter( advert_duration < 40) %>%
  gather(key = variable, value = value, mn_views,stdDev_views) %>%
  qplot( data = ., x = advert_duration, y = value, colour = variable, geom = 'line')
# clear link between advert duration and total_views

df %>%
  # filter(advert_duration < 40) %>%
  qplot(data = ., x = advert_duration, y = total_views) +
  geom_boxplot(aes(group = advert_duration)) +
  geom_smooth(method = 'lm', formula = y~ poly(x,2)) +
  geom_smooth(method = 'lm', formula = y~ exp(x)) +
  geom_smooth(method = 'lm', formula = y~ x)

# advert duration has some impact on total views but less than I expected


# job_category

qplot(data = df, x = job_category, y = total_views, geom = 'blank') +
  geom_boxplot() +
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
# head of department & teacher have a big 'tail' in total views (not surprising).
# the rest have reasonably tight boxplots -> job_category might give reasonable prediction


# workplace

qplot(data = df, x = workplace, y = total_views, geom = 'blank') +
  geom_boxplot() +
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
# apparently useless


# subject

qplot(data = df, x = subject, y = total_views, geom = 'blank') +
  geom_boxplot() +
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
# some variability -> subject might give reasonable prediction
# a lot less difference observed than expected.

# country_group
qplot(data = df, x = country_group, y = total_views, geom = 'blank') +
  geom_boxplot() +
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
# big difference between internation / uk. Expected but probably not that useful ?
# still can provide good prediction


# admin_level_2
data.table(df) %>%
  .[, .N, .(country_group, admin_level_2)]

df %>%
  filter(country_group != 'International') %>%
  qplot(data = ., x = admin_level_2, y = total_views, geom = 'blank') +
  geom_boxplot() +
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
# area does influence the number of views.
# london etc have long 'tails'. difficult

# package
qplot(data = df, x = package, y = total_views, geom = 'blank') +
  geom_boxplot() +
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
# small difference between packages


# salary_displayed
qplot(data = df, x = salary_displayed, y = total_views, group = salary_displayed, geom = 'blank') +
  geom_boxplot() +
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
data.table(df) %>%
  .[, mean(total_views), .(salary_displayed)]
# definite difference.
# salary displayed --> lower views !

# we don't have the school names ... but we do have their ids
DT = data.table(df)
DT[, cnt := length(unique(job_id)), school_id]
length(unique(DT$job_id))
length(unique(DT$school_id))

DT %>%
  group_by(school_id) %>%
  mutate(cnt = n()) %>%
  filter( cnt > 20 ) %>%
  qplot( data = ., x = school_id, y = total_views, geom = 'blank') + geom_boxplot()

DT %>%
  group_by(school_id) %>%
  mutate(cnt = n()) %>%
  filter(cnt < 20 & cnt > 15) %>%
  qplot( data = ., x = school_id, y = total_views, geom = 'blank') + geom_boxplot()

DT %>%
  group_by(school_id) %>%
  mutate(cnt = n()) %>%
  filter(cnt < 15 & cnt > 9) %>%
  qplot( data = ., x = school_id, y = total_views, geom = 'blank') + geom_boxplot()

DT %>%
  group_by(school_id) %>%
  mutate(cnt = n()) %>%
  filter(cnt < 9) %>%
  # qplot( data = ., x = factor(1), y = total_views, geom = 'blank') + geom_boxplot()
  qplot( data = ., x = total_views, geom = 'blank') + geom_histogram()

# predictable_schools <- factor(DT[, .N, school_id][N >= 10]$school_id)


set.seed(100)
rowsForTraining = 0.5
index <- sample(x = seq(nrow(df)), size = floor(nrow(df) * rowsForTraining))

train <- df[index,]
test  <- df[-index,]

train <- df[index,] %>%
    filter(school_id %in% predictable_schools)
test  <- df[-index,] %>%
    filter(school_id %in% predictable_schools)

linReg <- lm(total_views ~ school_id,data = train)
summary(linReg) # big impact. big proportion of the variation is explained

# To make the model somewhat sane / understandable let's not use the school ids directly
# let's classify the schools by mean view and bucket these into x bands

# an alternative would be to do a regression per school.


#########################
# MODEL
#########################


set.seed(100)

qplot(data= df,x = advert_start_date)
train <- df %>% filter(advert_start_date <  as.Date('2015-03-01'))
test  <- df %>% filter(advert_start_date >= as.Date('2015-03-01'))


DT <- data.table(train)
school_jobs = DT[, .(job_cnt = .N), school_id] %>% filter(job_cnt > 3)

train <- merge(train,school_jobs, by = 'school_id', all.x = T) %>%
  data.table
train[is.na(job_cnt), school_for_reg := 'few_jobs_logged']
train[!is.na(job_cnt), school_for_reg := school_id]
train[,.N, school_for_reg]

linReg <- lm( total_views ~ school_for_reg + job_category + advert_duration + subject + country_group + admin_level_2 + salary_displayed
             ,data = train)
summary(linReg)


#########################
#
#########################


test <- merge(test,school_jobs, by = 'school_id', all.x = T)
test <- data.table(test)
test[is.na(job_cnt), school_for_reg := 'few_jobs_logged']
test[!is.na(job_cnt), school_for_reg := school_id]
test <- data.frame(test)

pred <- predict.lm(linReg,test)
err <- abs(test['total_views'] - pred)$total_views
mean(err)
	TES Data Science technical screen.pdf
	TESDataScienceData.csv
	*.RData
	*.Rhistory
	# Analysis Task:

	# Are things popular because they are popular ?

	We could look at the conditional popularity as a plot.
	So we could look at what was popular last week / month
	and compare it to what is popular to this week / month.
	We could bin the 'prior' popularity and compare it to
	this period's binned popularity.

	To really understand this we might want to test.
	If we say that x resource has 10000 downloads in one group
	versus 4 in another group. What is the eventual download rate?
	-> Probably ok to do this for things that are free (ethics!).


	# Are thing popular because the author is popular ?
	Here I suggest taking 'prolific' authors.
	Does the 'histogram' of downloads by author follow a similar shape?
	Does it differ only by scale (mean & var) ?
	We would probably need to account for different subjects and other factors.



	# Are things popular because of the price ?
	I suspect that the interaction between price and popularity isn't simple / linear.
	I'd imagine that free things generally have a lower download rate than things which cost a little.
	Then I'd expect popularity to rapidly decrease for things which cost more.
	-> a scatter plot of price to popularity

	At the same time I'd expect that things which cost more would have greater 'longevity' ? In
	other words I'd expect that more expensive things are less popular but provide a more reliable stream
	of downloads/income.


	# Are things popular because of the rating ?

	I suspect that a resource's rating has an impact on the number of downloads.
	-> a scatter plot of rating to number of downloads.
	I suspect that the first 3/4/5 ratings have a big impact on the eventual number of downloads.
	-> a scatter plot of the average ratings after 3/4/5 ratings v total downloads.
	-- 1) Write a SQL query that will return the top downloaded 5 resources by subject for each month
	-- since 1st September 2015. Only include Mathematics, English and History.


	with filter_raw_events as (

	select
	date_trunc('month', e.eventTime) eventMonth
	,r.id
	,r.title
	,r.subject
	,count(1) event_count

	from events.eventStream e
	join content.resourceDetails r on e.assetId = r.id
	where 1=1
	and e.assetType = 'resource'
	and e.event = 'download'
	and e.eventTime >= '2015-09-01'::timestamp
	and r.subject in ('Mathematics','English','History')

	group by 1,2,3,4
	), summarise_events as (

	select
	eventMonth
	,r.title
	,r.subject
	,row_number() over (partition by eventMonth,subject order by event_count desc) rank_per_subject_per_month

	from filter_raw_events
	)

	select *
	from summarise_events
	where rank_per_subject_per_month <= 5
	-- 2) We consider active users to be users that have been active in the last 28 days.
	-- Please write a SQL query to show the percentage of users in the dataset considered active, split by week of their first visit.

	with munge_raw_events as (

	select
	date_trunc('week', min(e.eventTime)) firstVisitWeek
	,max(case when trunc(e.eventTime) >= current_date - 28 then 1 end) isActive
	,userId
	from events.eventStream e
	group by userId

	)

	select
	firstVisitWeek
	,100.0 * count(isActive) / sum(count(userId)) over () as percentActive

	from munge_raw_events
	group by firstVisitWeek
	library(tidyverse)
	library(data.table)

	dir = '~/Documents/data_science/TES_interview/48ed0f570043a81687fa0317f8776df1/'

	setwd(dir)
	getwd()

	TesData = 'TESDataScienceData.csv'
	df <- read.csv(TesData)

	summary(df)
	nrow(df)


	#########################
	# DATA EXPLORATION
	#########################

	# Approach: look at every column for evidence of a link to the total number of views
	# the goal is to gain an understanding of how different the total views are when
	# split by a particular field (mean and var).

	# info_gender

	qplot(data = df, x = info_gender, fill = info_gender)
	qplot(data = df, x = info_gender, y = total_views, geom = 'blank') + geom_boxplot()

	df %>%
	filter(total_views < 4000) %>%
	qplot(data = ., x = total_views, fill = info_gender, geom = 'blank') + geom_histogram(alpha = 0.6)

	df %>%
	filter(total_views < 4000) %>%
	qplot(data = ., x = total_views, fill = info_gender, geom = 'blank') + geom_histogram(alpha = 0.6) +
	facet_wrap(~info_gender, scales = 'free')

	# -> info_gender does not appear to have much influence on the total number of views




	# advert_start_date / advert_end_date / advert_duration
	df$advert_start_date <- as.Date(df$advert_start_date, '%d/%m/%Y')
	df$advert_end_date <- as.Date(df$advert_end_date, '%d/%m/%Y')
	df$advert_duration <- as.double(df$advert_end_date - df$advert_start_date)

	qplot(data = df, x = advert_start_date, geom = 'blank') + geom_bar()
	qplot(data = df, x = advert_end_date, geom = 'blank') + geom_bar()
	qplot(data = df, x = advert_duration, geom = 'blank') + geom_bar()

	df %>%
	group_by(advert_duration) %>%
	summarise( mn_views = mean(total_views), stdDev_views = sqrt(var(total_views))) %>%
	select(advert_duration, mn_views,stdDev_views) %>%
	# glimpse
	# filter( advert_duration < 40) %>%
	gather(key = variable, value = value, mn_views,stdDev_views) %>%
	qplot( data = ., x = advert_duration, y = value, colour = variable, geom = 'line')
	# clear link between advert duration and total_views

	df %>%
	# filter(advert_duration < 40) %>%
	qplot(data = ., x = advert_duration, y = total_views) +
	geom_boxplot(aes(group = advert_duration)) +
	geom_smooth(method = 'lm', formula = y~ poly(x,2)) +
	geom_smooth(method = 'lm', formula = y~ exp(x)) +
	geom_smooth(method = 'lm', formula = y~ x)

	# advert duration has some impact on total views but less than I expected


	# job_category

	qplot(data = df, x = job_category, y = total_views, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	# head of department & teacher have a big 'tail' in total views (not surprising).
	# the rest have reasonably tight boxplots -> job_category might give reasonable prediction


	# workplace

	qplot(data = df, x = workplace, y = total_views, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	# apparently useless


	# subject

	qplot(data = df, x = subject, y = total_views, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	# some variability -> subject might give reasonable prediction
	# a lot less difference observed than expected.

	# country_group
	qplot(data = df, x = country_group, y = total_views, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	# big difference between internation / uk. Expected but probably not that useful ?
	# still can provide good prediction



	# admin_level_2
	data.table(df) %>%
	.[, .N, .(country_group, admin_level_2)]

	df %>%
	filter(country_group != 'International') %>%
	qplot(data = ., x = admin_level_2, y = total_views, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	# area does influence the number of views.
	# london etc have long 'tails'. difficult

	# package
	qplot(data = df, x = package, y = total_views, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	# small difference between packages


	# salary_displayed
	qplot(data = df, x = salary_displayed, y = total_views, group = salary_displayed, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	data.table(df) %>%
	.[, mean(total_views), .(salary_displayed)]
	# definite difference.
	# salary displayed --> lower views !

	# we don't have the school names ... but we do have their ids
	DT = data.table(df)
	DT[, cnt := length(unique(job_id)), school_id]
	length(unique(DT$job_id))
	length(unique(DT$school_id))

	DT %>%
	group_by(school_id) %>%
	mutate(cnt = n()) %>%
	filter( cnt > 20 ) %>%
	qplot( data = ., x = school_id, y = total_views, geom = 'blank') + geom_boxplot()

	DT %>%
	group_by(school_id) %>%
	mutate(cnt = n()) %>%
	filter(cnt < 20 & cnt > 15) %>%
	qplot( data = ., x = school_id, y = total_views, geom = 'blank') + geom_boxplot()

	DT %>%
	group_by(school_id) %>%
	mutate(cnt = n()) %>%
	filter(cnt < 15 & cnt > 9) %>%
	qplot( data = ., x = school_id, y = total_views, geom = 'blank') + geom_boxplot()

	DT %>%
	group_by(school_id) %>%
	mutate(cnt = n()) %>%
	filter(cnt < 9) %>%
	# qplot( data = ., x = factor(1), y = total_views, geom = 'blank') + geom_boxplot()
	qplot( data = ., x = total_views, geom = 'blank') + geom_histogram()

	# predictable_schools <- factor(DT[, .N, school_id][N >= 10]$school_id)





	set.seed(100)
	rowsForTraining = 0.5
	index <- sample(x = seq(nrow(df)), size = floor(nrow(df) * rowsForTraining))

	train <- df[index,]
	test <- df[-index,]

	train <- df[index,] %>%
	filter(school_id %in% predictable_schools)
	test <- df[-index,] %>%
	filter(school_id %in% predictable_schools)

	linReg <- lm(total_views ~ school_id,data = train)
	summary(linReg) # big impact. big proportion of the variation is explained

	# To make the model somewhat sane / understandable let's not use the school ids directly
	# let's classify the schools by mean view and bucket these into x bands

	# an alternative would be to do a regression per school.




	#########################
	# MODEL
	#########################


	set.seed(100)

	qplot(data= df,x = advert_start_date)
	train <- df %>% filter(advert_start_date < as.Date('2015-03-01'))
	test <- df %>% filter(advert_start_date >= as.Date('2015-03-01'))


	DT <- data.table(train)
	school_jobs = DT[, .(job_cnt = .N), school_id] %>% filter(job_cnt > 3)

	train <- merge(train,school_jobs, by = 'school_id', all.x = T) %>%
	data.table
	train[is.na(job_cnt), school_for_reg := 'few_jobs_logged']
	train[!is.na(job_cnt), school_for_reg := school_id]
	train[,.N, school_for_reg]

	linReg <- lm( total_views ~ school_for_reg + job_category + advert_duration + subject + country_group + admin_level_2 + salary_displayed
	,data = train)
	summary(linReg)


	#########################
	#
	#########################


	test <- merge(test,school_jobs, by = 'school_id', all.x = T)
	test <- data.table(test)
	test[is.na(job_cnt), school_for_reg := 'few_jobs_logged']
	test[!is.na(job_cnt), school_for_reg := school_id]
	test <- data.frame(test)

	pred <- predict.lm(linReg,test)
	err <- abs(test['total_views'] - pred)$total_views
	mean(err)