Created
April 27, 2019 08:44
-
-
Save lhk/87cb98c0ec7b1657a3d6c1a4afa2d9dc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- a bit of subsampling, to make the size more manageable | |
-- I'm using the first N events, ordered by timestamp | |
-- this way, the learning traces should not be interrupted. | |
drop table if exists smaller; | |
create table smaller | |
as | |
select * | |
from log | |
order by timestamp | |
limit 20000000; | |
-- card reviews | |
drop table if exists repetitions; | |
create table repetitions | |
as | |
select * | |
from smaller | |
where event == 9 | |
order by timestamp; | |
select count(*) | |
from repetitions; | |
-- card additions | |
drop table if exists additions; | |
create table additions | |
as | |
select * | |
from smaller | |
where event == 6 | |
order by timestamp; | |
select count(*) | |
from additions; | |
-- the goal is to observe "learning traces", starting from an initial object review | |
-- for many objects, this initial review seems to not have been observed | |
-- this table contains user/object pairs where the initial review has been recorded | |
drop table if exists initial_repetition_observed; | |
create table initial_repetition_observed as | |
select object_id, user_id | |
from repetitions | |
where event == 9 | |
group by object_id, user_id | |
having min(acq_reps) == 0 | |
and min(ret_reps) == 0; | |
select count(*) | |
from initial_repetition_observed; | |
select count(distinct object_id) | |
from initial_repetition_observed; | |
-- build the table of traces | |
-- for every user/object where the initial repetition is recorded, | |
-- select the first N repetitions, here N is 50 | |
drop table if exists traces; | |
create table traces as | |
select * | |
from ( | |
select *, | |
ROW_NUMBER() over (partition by user_id,object_id order by timestamp) as RowCount | |
from repetitions | |
where (object_id, user_id) in initial_repetition_observed | |
) as data | |
where data.RowCount <= 50 | |
order by timestamp; | |
select count(*) | |
from traces; | |
select count(distinct object_id) | |
from traces; | |
select count(distinct user_id) | |
from traces; | |
-- look at individual traces | |
-- basically all users show weird inconsistencies | |
-- timestamps seem wrong, | |
-- acq_reps or ret_reps jump up and down | |
-- entries seem to be missing | |
-- for example, in offset 9, many repetitions seem to have not been observed | |
-- for example, in offset 17, the dynamics of grade / acq_reps / ret_reps seem broken | |
-- after a cursory inspection, the following users are broken: | |
-- 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 | |
-- (the numbers are the corresponding offset value) | |
-- that's 23 out of 26 | |
select t.* | |
from traces as t | |
where t.user_id in (select distinct user_id from traces limit 1 OFFSET 9) | |
and t.object_id in (select distinct object_id from traces where user_id == t.user_id limit 1) | |
order by timestamp; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment