Skip to content

Instantly share code, notes, and snippets.

@theianchan
Created March 1, 2018 06:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save theianchan/54eb882c6f5ce359552c3ebdcd6b2507 to your computer and use it in GitHub Desktop.
Save theianchan/54eb882c6f5ce359552c3ebdcd6b2507 to your computer and use it in GitHub Desktop.
-- How many rows do we have in each table?
select
count(u.id) as users,
count(distinct u.id) as distinct_users,
count(s.user_id) as survey_responses,
count(distinct s.user_id) as distinct_survey_responses,
count(e.*) as events
from users u, survey s, events e;
-- 27,267 unique users
-- 2,400,491 events
-- 4,081 unique survey responses
-- Why does the combined query take over 30 minutes to return when we can get the results of `select count(*)` on individual tables in like 2 seconds?
-- Even without the events table included
--------------------------------------------------
--------------------------------------------------
-- Can we use the survey data to make reliable assumptions about our population? How representative are the users in the survey?
--------------------
-- fx_version --
--------------------
with
population as (
with total as
(select count(*) as "total" from users)
select
fx_version,
count(*) as "count_population",
round((count(*) / avg(total.total) * 100), 2) as "percent_population"
from users, total
group by 1
order by 1
),
sample as (
with total as
(select count(*) as "total" from survey)
select
fx_version,
count(*) as "count_sample",
round((count(*) / avg(total.total) * 100), 2) as "percent_sample"
from users, total
where users.id in
(select user_id from survey)
group by 1
order by 1
)
select
*
from population
left join sample
using (fx_version)
order by 1;
-- 27 FX versions starting with 3.5.11 and ending with 4.0b8pre
-- Sample/population distribution is pretty comparable w/89% of users on 4.0b6 in both cases
--------------------
-- os --
--------------------
with
population as (
with total as
(select count(*) as "total" from users)
select
os,
count(*) as "count_population",
round((count(*) / avg(total.total) * 100), 2) as "percent_population"
from users, total
group by 1
order by 1
),
sample as (
with total as
(select count(*) as "total" from survey)
select
os,
count(*) as "count_sample",
round((count(*) / avg(total.total) * 100), 2) as "percent_sample"
from users, total
where users.id in
(select user_id from survey)
group by 1
order by 1
)
select
*
from population
left join sample
using (os)
order by 1;
-- 18 OS types (3 Mac, 4 Linux, 10 Windows, 1 Sun)
with
population as (
with total as
(select count(*) as "total" from users)
select
os,
count(*) as "count_population",
round((count(*) / avg(total.total) * 100), 2) as "percent_population"
from users, total
group by 1
order by 1
),
sample as (
with total as
(select count(*) as "total" from survey)
select
os,
count(*) as "count_sample",
round((count(*) / avg(total.total) * 100), 2) as "percent_sample"
from users, total
where users.id in
(select user_id from survey)
group by 1
order by 1
)
select
case
when os ilike '%mac%'
then 'Mac'
when os ilike '%linux%'
then 'Linux'
when os ilike '%windows%'
then 'Windows'
when os ilike '%sunos%'
then 'Sun'
else 'Other'
end as "os_group",
sum(count_population) as count_population,
sum(percent_population) as percent_population,
sum(count_sample) as count_sample,
sum(percent_sample) as percent_sample
from (
select * from population left join sample using (os)
) as discrete
group by 1;
-- Grouping them we can see that Mac OS users are slightly overrepresented in the sample (10% vs 5% in the population)
-- Not a big problem but we can just keep that in mind
--------------------
-- version --
--------------------
with
population as (
with total as
(select count(*) as "total" from users)
select
version,
count(*) as "count_population",
round((count(*) / avg(total.total) * 100), 2) as "percent_population"
from users, total
group by 1
order by 1
),
sample as (
with total as
(select count(*) as "total" from survey)
select
version,
count(*) as "count_sample",
round((count(*) / avg(total.total) * 100), 2) as "percent_sample"
from users, total
where users.id in
(select user_id from survey)
group by 1
order by 1
)
select
*
from population
left join sample
using (version)
order by 1;
-- 4 different test pilot extension versions
-- 93-94% of users (sample and pop) are on the most recent version, 1.0.3
--------------------
-- extensions --
--------------------
with
population as (
with total as
(select count(*) as "total" from users)
select
number_extensions,
count(*) as "count_population",
round((count(*) / avg(total.total) * 100), 2) as "percent_population"
from users, total
group by 1
order by 1
),
sample as (
with total as
(select count(*) as "total" from survey)
select
number_extensions,
count(*) as "count_sample",
round((count(*) / avg(total.total) * 100), 2) as "percent_sample"
from users, total
where users.id in
(select user_id from survey)
group by 1
order by 1
)
select
*
from population
left join sample
using (number_extensions)
order by 1;
-- The user with the most extensions has 185 extensions
with
population as (
with total as
(select count(*) as "total" from users)
select
number_extensions,
count(*) as "count_population",
round((count(*) / avg(total.total) * 100), 2) as "percent_population"
from users, total
group by 1
order by 1
),
sample as (
with total as
(select count(*) as "total" from survey)
select
number_extensions,
count(*) as "count_sample",
round((count(*) / avg(total.total) * 100), 2) as "percent_sample"
from users, total
where users.id in
(select user_id from survey)
group by 1
order by 1
)
select
case
when number_extensions = 1
then '1. 1'
when number_extensions <= 5
then '2. 2-5'
when number_extensions <= 15
then '3. 6-15'
when number_extensions <= 25
then '4. 16-25'
else '5. 25+'
end as "number_extensions",
sum(count_population) as count_population,
sum(percent_population) as percent_population,
sum(count_sample) as count_sample,
sum(percent_sample) as percent_sample
from (
select * from population left join sample using (number_extensions)
) as discrete
group by 1
order by 1;
-- This is where we have the biggest delta between sample/pop - users with more extensions were more likely to be represented in the survey
-- Average number of extensions installed per user in sample vs population?
select
avg(p.number_extensions) as population_avg_ext,
avg(s.number_extensions) as sample_avg_ext
from users p,
(
select number_extensions from users
where users.id in (select user_id from survey)
) as s;
-- Population: 6
-- Sample: 9
--------------------------------------------------
--------------------------------------------------
-- Summary statistics based on survey data
-- "How long have you used Firefox?"
select q1, count(*), round(count(*)::numeric/(select count(*) from survey) * 100, 2) from survey group by 1 order by 1;
-- 9% < 1 year, 91% > 1 year, 69% > 3 years (!)
-- "Do you use more than one browser in your daily life?"
-- 65% use more than one browser
-- "If you use other browsers besides Firefox, what are they?"
select
round((select count(*) from survey where q3 ilike '%0%')::numeric / sample.total * 100, 2) as "chrome",
round((select count(*) from survey where q3 ilike '%1%')::numeric / sample.total * 100, 2) as "safari",
round((select count(*) from survey where q3 ilike '%2%')::numeric / sample.total * 100, 2) as "opera",
round((select count(*) from survey where q3 ilike '%3%')::numeric / sample.total * 100, 2) as "ie",
round((select count(*) from survey where q3 ilike '%<%')::numeric / sample.total * 100, 2) as "other"
from (select count(*) as "total" from survey) as sample;
-- 46% IE, 42% Chrome, 17% Safari, 16% Opera, 5% Other (non-exclusive)
-- "If you use multiple browsers, what do you consider to be your primary browser?"
-- 17% only use Firefox, 65% Firefox, 6% Chrome, 1% Safari, 1% Opera, 3% IE
-- "What is your gender?"
-- 91% male lol
-- "How old are you?"
-- 63% between 18-35
-- "How much time do you spend on the Web each day?"
-- 89% > 2 hours, 20% > 10 hours
--------------------------------------------------
--------------------------------------------------
-- Let's create our user segments
-- New/casual users
select count(*) from survey
where (
(q1 = '0' or q1 = '1')
and
(q7 = '0' or q7 = '1' or q7 = '2')
)
-- Loyal/casual users
select count(*) from survey
where (
(q1 <> '0' and q1 <> '1')
and
(q7 = '0' or q7 = '1' or q7 = '2')
)
-- New/super users
select count(*) from survey
where (
(q1 = '0' or q1 = '1')
and
(q7 <> '0' and q7 <> '1' and q7 <> '2')
)
-- Loyal/super users
select count(*) from survey
where (
(q1 <> '0' and q1 <> '1')
and
(q7 <> '0' and q7 <> '1' and q7 <> '2')
)
-- Let's save the IDs of our loyal/super users
drop table if exists super_users;
select user_id as id
into temp table super_users
from survey
where (
(q1 <> '0' and q1 <> '1')
and
(q7 <> '0' and q7 <> '1' and q7 <> '2')
)
-- 2,429
-- And let's create a list of IDs for everyone else
drop table if exists other_users;
select id as id
into temp table other_users
from users
where id not in (select id from super_users);
-- 24,838
--------------------------------------------------
--------------------------------------------------
-- Now let's examine our events data each in segment
select * from events limit 10;
--------------------
-- event code ref --
--------------------
-- bookmark_status: 8
-- bookmark_create: 9
-- bookmark_choose: 10
-- bookmark_modify: 11
-- num_tabs: 26
-- Put them in their own table
drop table if exists event_codes;
select distinct event_code as code
into temp table event_codes
from events
where (event_code = 8 or event_code = 9 or event_code = 10 or event_code = 11 or event_code = 26);
select * from event_codes;
-- number of bookmarks created: bookmark_status.data1
-- number of bookmarks chosen: bookmark_choose
-- number of tabs: num_tabs.data2
-- Compare event/user counts across the two groups
with super as (
with su as (select count(*) as "total" from super_users)
select
event_code,
count(*) as "su_total",
round(count(*) / avg(su.total), 2) as "su_capita"
from events, su
where
event_code in (select code from event_codes)
and
user_id in (select id from super_users)
group by 1
), other as (
with ot as (select count(*) as "total" from other_users)
select
event_code,
count(*) as "ot_total",
round(count(*) / avg(ot.total), 2) as "ot_capita"
from events, ot
where
event_code in (select code from event_codes)
and
user_id in (select id from other_users)
group by 1
)
select
*
from super left join other using (event_code)
order by 1;
--------------------
-- sanity checks --
--------------------
-- Super users: 2,429
-- Other users: 24,838
-- How many unique users in the events table?
select count(distinct user_id) from events;
-- 14,718
-- How many users have at least 1 event recorded?
select count(distinct user_id) from events where user_id in
(select id from super_users);
-- Super: 1,430 (59%)
-- Other: 13,288 (53%)
-- At least 1 bookmark_status event?
select count(distinct user_id)
from events
where
user_id in (select id from super_users)
and event_code = 8;
-- Super: 874 (36%)
-- Other: 9,721 (39%)
-- At least 1 num_tabs event?
select count(distinct user_id)
from events
where
user_id in (select id from super_users)
and event_code = 26;
-- 1,412 (58%)
-- 13,186 (53%)
--------------------
-- bookmarks --
--------------------
-- Number of bookmark_status events?
select count(*) from events where event_code = 8;
-- 23,758
-- Show distribution of bookmarks across both segments
select
user_id,
round(avg(split_part(data1, ' ', 1)::int))
from events
where
event_code = 8
and user_id in (select id from super_users)
group by 1
order by 2 desc;
-- Summary stats on num_bookmarks for each segment?
with super as (
select
user_id,
round(avg(split_part(data1, ' ', 1)::int)) as "bookmarks"
from events
where
event_code = 8
and user_id in (select id from super_users)
group by 1
)
select
count(bookmarks) as "users",
round(avg(bookmarks)) as "mean_bk",
round(median(bookmarks)) as "median_bk",
max(bookmarks) as "max_bk",
min(bookmarks) as "min_bk",
round(stddev(bookmarks)) as "stdev_bk",
round(variance(bookmarks)) as "var_bk"
from super;
--------------------
-- tabs --
--------------------
select count(*) from events where event_code = 26;
-- 452,715 events
-- Show distribution
select
user_id,
round(avg(split_part(data2, ' ', 1)::int))
from events
where
event_code = 26
and user_id in (select id from super_users)
group by 1
order by 2 desc;
-- Summary stats (using max)
with super as (
select
user_id,
max(split_part(data2, ' ', 1)::int) as "tabs"
from events
where
event_code = 26
and user_id in (select id from other_users)
group by 1
)
select
count(user_id) as "users",
(select count(user_id) from super where tabs >= 10) as "max_tabs_10+",
round((select count(user_id) from super where tabs >= 10)::numeric / count(user_id) * 100, 2) as "%max_tabs_10+",
(select count(user_id) from super where tabs >= 8) as "max_tabs_8+",
round((select count(user_id) from super where tabs >= 8)::numeric / count(user_id) * 100, 2) as "%max_tabs_8+",
(select count(user_id) from super where tabs >= 6) as "max_tabs_6+",
round((select count(user_id) from super where tabs >= 6)::numeric / count(user_id) * 100, 2) as "%max_tabs_6+",
(select count(user_id) from super where tabs >= 4) as "max_tabs_4+",
round((select count(user_id) from super where tabs >= 4)::numeric / count(user_id) * 100, 2) as "%max_tabs_4+",
round(avg(tabs)) as "mean_tab",
round(median(tabs)) as "median_tab",
max(tabs) as "max_tab",
min(tabs) as "min_tab",
round(stddev(tabs)) as "stdev_tab",
round(variance(tabs)) as "var_tab"
from super;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment