Created
March 1, 2018 06:02
-
-
Save theianchan/54eb882c6f5ce359552c3ebdcd6b2507 to your computer and use it in GitHub Desktop.
Exploration of the Firefox product data hosted here: https://web.archive.org/web/20160304073326/https://testpilot.mozillalabs.com/testcases/a-week-life-2/aggregated-data.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- How many rows do we have in each table? | |
select | |
count(u.id) as users, | |
count(distinct u.id) as distinct_users, | |
count(s.user_id) as survey_responses, | |
count(distinct s.user_id) as distinct_survey_responses, | |
count(e.*) as events | |
from users u, survey s, events e; | |
-- 27,267 unique users | |
-- 2,400,491 events | |
-- 4,081 unique survey responses | |
-- Why does the combined query take over 30 minutes to return when we can get the results of `select count(*)` on individual tables in like 2 seconds? | |
-- Even without the events table included | |
-------------------------------------------------- | |
-------------------------------------------------- | |
-- Can we use the survey data to make reliable assumptions about our population? How representative are the users in the survey? | |
-------------------- | |
-- fx_version -- | |
-------------------- | |
with | |
population as ( | |
with total as | |
(select count(*) as "total" from users) | |
select | |
fx_version, | |
count(*) as "count_population", | |
round((count(*) / avg(total.total) * 100), 2) as "percent_population" | |
from users, total | |
group by 1 | |
order by 1 | |
), | |
sample as ( | |
with total as | |
(select count(*) as "total" from survey) | |
select | |
fx_version, | |
count(*) as "count_sample", | |
round((count(*) / avg(total.total) * 100), 2) as "percent_sample" | |
from users, total | |
where users.id in | |
(select user_id from survey) | |
group by 1 | |
order by 1 | |
) | |
select | |
* | |
from population | |
left join sample | |
using (fx_version) | |
order by 1; | |
-- 27 FX versions starting with 3.5.11 and ending with 4.0b8pre | |
-- Sample/population distribution is pretty comparable w/89% of users on 4.0b6 in both cases | |
-------------------- | |
-- os -- | |
-------------------- | |
with | |
population as ( | |
with total as | |
(select count(*) as "total" from users) | |
select | |
os, | |
count(*) as "count_population", | |
round((count(*) / avg(total.total) * 100), 2) as "percent_population" | |
from users, total | |
group by 1 | |
order by 1 | |
), | |
sample as ( | |
with total as | |
(select count(*) as "total" from survey) | |
select | |
os, | |
count(*) as "count_sample", | |
round((count(*) / avg(total.total) * 100), 2) as "percent_sample" | |
from users, total | |
where users.id in | |
(select user_id from survey) | |
group by 1 | |
order by 1 | |
) | |
select | |
* | |
from population | |
left join sample | |
using (os) | |
order by 1; | |
-- 18 OS types (3 Mac, 4 Linux, 10 Windows, 1 Sun) | |
with | |
population as ( | |
with total as | |
(select count(*) as "total" from users) | |
select | |
os, | |
count(*) as "count_population", | |
round((count(*) / avg(total.total) * 100), 2) as "percent_population" | |
from users, total | |
group by 1 | |
order by 1 | |
), | |
sample as ( | |
with total as | |
(select count(*) as "total" from survey) | |
select | |
os, | |
count(*) as "count_sample", | |
round((count(*) / avg(total.total) * 100), 2) as "percent_sample" | |
from users, total | |
where users.id in | |
(select user_id from survey) | |
group by 1 | |
order by 1 | |
) | |
select | |
case | |
when os ilike '%mac%' | |
then 'Mac' | |
when os ilike '%linux%' | |
then 'Linux' | |
when os ilike '%windows%' | |
then 'Windows' | |
when os ilike '%sunos%' | |
then 'Sun' | |
else 'Other' | |
end as "os_group", | |
sum(count_population) as count_population, | |
sum(percent_population) as percent_population, | |
sum(count_sample) as count_sample, | |
sum(percent_sample) as percent_sample | |
from ( | |
select * from population left join sample using (os) | |
) as discrete | |
group by 1; | |
-- Grouping them we can see that Mac OS users are slightly overrepresented in the sample (10% vs 5% in the population) | |
-- Not a big problem but we can just keep that in mind | |
-------------------- | |
-- version -- | |
-------------------- | |
with | |
population as ( | |
with total as | |
(select count(*) as "total" from users) | |
select | |
version, | |
count(*) as "count_population", | |
round((count(*) / avg(total.total) * 100), 2) as "percent_population" | |
from users, total | |
group by 1 | |
order by 1 | |
), | |
sample as ( | |
with total as | |
(select count(*) as "total" from survey) | |
select | |
version, | |
count(*) as "count_sample", | |
round((count(*) / avg(total.total) * 100), 2) as "percent_sample" | |
from users, total | |
where users.id in | |
(select user_id from survey) | |
group by 1 | |
order by 1 | |
) | |
select | |
* | |
from population | |
left join sample | |
using (version) | |
order by 1; | |
-- 4 different test pilot extension versions | |
-- 93-94% of users (sample and pop) are on the most recent version, 1.0.3 | |
-------------------- | |
-- extensions -- | |
-------------------- | |
with | |
population as ( | |
with total as | |
(select count(*) as "total" from users) | |
select | |
number_extensions, | |
count(*) as "count_population", | |
round((count(*) / avg(total.total) * 100), 2) as "percent_population" | |
from users, total | |
group by 1 | |
order by 1 | |
), | |
sample as ( | |
with total as | |
(select count(*) as "total" from survey) | |
select | |
number_extensions, | |
count(*) as "count_sample", | |
round((count(*) / avg(total.total) * 100), 2) as "percent_sample" | |
from users, total | |
where users.id in | |
(select user_id from survey) | |
group by 1 | |
order by 1 | |
) | |
select | |
* | |
from population | |
left join sample | |
using (number_extensions) | |
order by 1; | |
-- The user with the most extensions has 185 extensions | |
with | |
population as ( | |
with total as | |
(select count(*) as "total" from users) | |
select | |
number_extensions, | |
count(*) as "count_population", | |
round((count(*) / avg(total.total) * 100), 2) as "percent_population" | |
from users, total | |
group by 1 | |
order by 1 | |
), | |
sample as ( | |
with total as | |
(select count(*) as "total" from survey) | |
select | |
number_extensions, | |
count(*) as "count_sample", | |
round((count(*) / avg(total.total) * 100), 2) as "percent_sample" | |
from users, total | |
where users.id in | |
(select user_id from survey) | |
group by 1 | |
order by 1 | |
) | |
select | |
case | |
when number_extensions = 1 | |
then '1. 1' | |
when number_extensions <= 5 | |
then '2. 2-5' | |
when number_extensions <= 15 | |
then '3. 6-15' | |
when number_extensions <= 25 | |
then '4. 16-25' | |
else '5. 25+' | |
end as "number_extensions", | |
sum(count_population) as count_population, | |
sum(percent_population) as percent_population, | |
sum(count_sample) as count_sample, | |
sum(percent_sample) as percent_sample | |
from ( | |
select * from population left join sample using (number_extensions) | |
) as discrete | |
group by 1 | |
order by 1; | |
-- This is where we have the biggest delta between sample/pop - users with more extensions were more likely to be represented in the survey | |
-- Average number of extensions installed per user in sample vs population? | |
select | |
avg(p.number_extensions) as population_avg_ext, | |
avg(s.number_extensions) as sample_avg_ext | |
from users p, | |
( | |
select number_extensions from users | |
where users.id in (select user_id from survey) | |
) as s; | |
-- Population: 6 | |
-- Sample: 9 | |
-------------------------------------------------- | |
-------------------------------------------------- | |
-- Summary statistics based on survey data | |
-- "How long have you used Firefox?" | |
select q1, count(*), round(count(*)::numeric/(select count(*) from survey) * 100, 2) from survey group by 1 order by 1; | |
-- 9% < 1 year, 91% > 1 year, 69% > 3 years (!) | |
-- "Do you use more than one browser in your daily life?" | |
-- 65% use more than one browser | |
-- "If you use other browsers besides Firefox, what are they?" | |
select | |
round((select count(*) from survey where q3 ilike '%0%')::numeric / sample.total * 100, 2) as "chrome", | |
round((select count(*) from survey where q3 ilike '%1%')::numeric / sample.total * 100, 2) as "safari", | |
round((select count(*) from survey where q3 ilike '%2%')::numeric / sample.total * 100, 2) as "opera", | |
round((select count(*) from survey where q3 ilike '%3%')::numeric / sample.total * 100, 2) as "ie", | |
round((select count(*) from survey where q3 ilike '%<%')::numeric / sample.total * 100, 2) as "other" | |
from (select count(*) as "total" from survey) as sample; | |
-- 46% IE, 42% Chrome, 17% Safari, 16% Opera, 5% Other (non-exclusive) | |
-- "If you use multiple browsers, what do you consider to be your primary browser?" | |
-- 17% only use Firefox, 65% Firefox, 6% Chrome, 1% Safari, 1% Opera, 3% IE | |
-- "What is your gender?" | |
-- 91% male lol | |
-- "How old are you?" | |
-- 63% between 18-35 | |
-- "How much time do you spend on the Web each day?" | |
-- 89% > 2 hours, 20% > 10 hours | |
-------------------------------------------------- | |
-------------------------------------------------- | |
-- Let's create our user segments | |
-- New/casual users | |
select count(*) from survey | |
where ( | |
(q1 = '0' or q1 = '1') | |
and | |
(q7 = '0' or q7 = '1' or q7 = '2') | |
) | |
-- Loyal/casual users | |
select count(*) from survey | |
where ( | |
(q1 <> '0' and q1 <> '1') | |
and | |
(q7 = '0' or q7 = '1' or q7 = '2') | |
) | |
-- New/super users | |
select count(*) from survey | |
where ( | |
(q1 = '0' or q1 = '1') | |
and | |
(q7 <> '0' and q7 <> '1' and q7 <> '2') | |
) | |
-- Loyal/super users | |
select count(*) from survey | |
where ( | |
(q1 <> '0' and q1 <> '1') | |
and | |
(q7 <> '0' and q7 <> '1' and q7 <> '2') | |
) | |
-- Let's save the IDs of our loyal/super users | |
drop table if exists super_users; | |
select user_id as id | |
into temp table super_users | |
from survey | |
where ( | |
(q1 <> '0' and q1 <> '1') | |
and | |
(q7 <> '0' and q7 <> '1' and q7 <> '2') | |
) | |
-- 2,429 | |
-- And let's create a list of IDs for everyone else | |
drop table if exists other_users; | |
select id as id | |
into temp table other_users | |
from users | |
where id not in (select id from super_users); | |
-- 24,838 | |
-------------------------------------------------- | |
-------------------------------------------------- | |
-- Now let's examine our events data each in segment | |
select * from events limit 10; | |
-------------------- | |
-- event code ref -- | |
-------------------- | |
-- bookmark_status: 8 | |
-- bookmark_create: 9 | |
-- bookmark_choose: 10 | |
-- bookmark_modify: 11 | |
-- num_tabs: 26 | |
-- Put them in their own table | |
drop table if exists event_codes; | |
select distinct event_code as code | |
into temp table event_codes | |
from events | |
where (event_code = 8 or event_code = 9 or event_code = 10 or event_code = 11 or event_code = 26); | |
select * from event_codes; | |
-- number of bookmarks created: bookmark_status.data1 | |
-- number of bookmarks chosen: bookmark_choose | |
-- number of tabs: num_tabs.data2 | |
-- Compare event/user counts across the two groups | |
with super as ( | |
with su as (select count(*) as "total" from super_users) | |
select | |
event_code, | |
count(*) as "su_total", | |
round(count(*) / avg(su.total), 2) as "su_capita" | |
from events, su | |
where | |
event_code in (select code from event_codes) | |
and | |
user_id in (select id from super_users) | |
group by 1 | |
), other as ( | |
with ot as (select count(*) as "total" from other_users) | |
select | |
event_code, | |
count(*) as "ot_total", | |
round(count(*) / avg(ot.total), 2) as "ot_capita" | |
from events, ot | |
where | |
event_code in (select code from event_codes) | |
and | |
user_id in (select id from other_users) | |
group by 1 | |
) | |
select | |
* | |
from super left join other using (event_code) | |
order by 1; | |
-------------------- | |
-- sanity checks -- | |
-------------------- | |
-- Super users: 2,429 | |
-- Other users: 24,838 | |
-- How many unique users in the events table? | |
select count(distinct user_id) from events; | |
-- 14,718 | |
-- How many users have at least 1 event recorded? | |
select count(distinct user_id) from events where user_id in | |
(select id from super_users); | |
-- Super: 1,430 (59%) | |
-- Other: 13,288 (53%) | |
-- At least 1 bookmark_status event? | |
select count(distinct user_id) | |
from events | |
where | |
user_id in (select id from super_users) | |
and event_code = 8; | |
-- Super: 874 (36%) | |
-- Other: 9,721 (39%) | |
-- At least 1 num_tabs event? | |
select count(distinct user_id) | |
from events | |
where | |
user_id in (select id from super_users) | |
and event_code = 26; | |
-- 1,412 (58%) | |
-- 13,186 (53%) | |
-------------------- | |
-- bookmarks -- | |
-------------------- | |
-- Number of bookmark_status events? | |
select count(*) from events where event_code = 8; | |
-- 23,758 | |
-- Show distribution of bookmarks across both segments | |
select | |
user_id, | |
round(avg(split_part(data1, ' ', 1)::int)) | |
from events | |
where | |
event_code = 8 | |
and user_id in (select id from super_users) | |
group by 1 | |
order by 2 desc; | |
-- Summary stats on num_bookmarks for each segment? | |
with super as ( | |
select | |
user_id, | |
round(avg(split_part(data1, ' ', 1)::int)) as "bookmarks" | |
from events | |
where | |
event_code = 8 | |
and user_id in (select id from super_users) | |
group by 1 | |
) | |
select | |
count(bookmarks) as "users", | |
round(avg(bookmarks)) as "mean_bk", | |
round(median(bookmarks)) as "median_bk", | |
max(bookmarks) as "max_bk", | |
min(bookmarks) as "min_bk", | |
round(stddev(bookmarks)) as "stdev_bk", | |
round(variance(bookmarks)) as "var_bk" | |
from super; | |
-------------------- | |
-- tabs -- | |
-------------------- | |
select count(*) from events where event_code = 26; | |
-- 452,715 events | |
-- Show distribution | |
select | |
user_id, | |
round(avg(split_part(data2, ' ', 1)::int)) | |
from events | |
where | |
event_code = 26 | |
and user_id in (select id from super_users) | |
group by 1 | |
order by 2 desc; | |
-- Summary stats (using max) | |
with super as ( | |
select | |
user_id, | |
max(split_part(data2, ' ', 1)::int) as "tabs" | |
from events | |
where | |
event_code = 26 | |
and user_id in (select id from other_users) | |
group by 1 | |
) | |
select | |
count(user_id) as "users", | |
(select count(user_id) from super where tabs >= 10) as "max_tabs_10+", | |
round((select count(user_id) from super where tabs >= 10)::numeric / count(user_id) * 100, 2) as "%max_tabs_10+", | |
(select count(user_id) from super where tabs >= 8) as "max_tabs_8+", | |
round((select count(user_id) from super where tabs >= 8)::numeric / count(user_id) * 100, 2) as "%max_tabs_8+", | |
(select count(user_id) from super where tabs >= 6) as "max_tabs_6+", | |
round((select count(user_id) from super where tabs >= 6)::numeric / count(user_id) * 100, 2) as "%max_tabs_6+", | |
(select count(user_id) from super where tabs >= 4) as "max_tabs_4+", | |
round((select count(user_id) from super where tabs >= 4)::numeric / count(user_id) * 100, 2) as "%max_tabs_4+", | |
round(avg(tabs)) as "mean_tab", | |
round(median(tabs)) as "median_tab", | |
max(tabs) as "max_tab", | |
min(tabs) as "min_tab", | |
round(stddev(tabs)) as "stdev_tab", | |
round(variance(tabs)) as "var_tab" | |
from super; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment