Skip to content

Instantly share code, notes, and snippets.

@lhk
Created April 27, 2019 08:44
Show Gist options
  • Save lhk/87cb98c0ec7b1657a3d6c1a4afa2d9dc to your computer and use it in GitHub Desktop.
Save lhk/87cb98c0ec7b1657a3d6c1a4afa2d9dc to your computer and use it in GitHub Desktop.
-- a bit of subsampling, to make the size more manageable
-- I'm using the first N events, ordered by timestamp
-- this way, the learning traces should not be interrupted.
drop table if exists smaller;
create table smaller
as
select *
from log
order by timestamp
limit 20000000;
-- card reviews
drop table if exists repetitions;
create table repetitions
as
select *
from smaller
where event == 9
order by timestamp;
select count(*)
from repetitions;
-- card additions
drop table if exists additions;
create table additions
as
select *
from smaller
where event == 6
order by timestamp;
select count(*)
from additions;
-- the goal is to observe "learning traces", starting from an initial object review
-- for many objects, this initial review seems to not have been observed
-- this table contains user/object pairs where the initial review has been recorded
drop table if exists initial_repetition_observed;
create table initial_repetition_observed as
select object_id, user_id
from repetitions
where event == 9
group by object_id, user_id
having min(acq_reps) == 0
and min(ret_reps) == 0;
select count(*)
from initial_repetition_observed;
select count(distinct object_id)
from initial_repetition_observed;
-- build the table of traces
-- for every user/object where the initial repetition is recorded,
-- select the first N repetitions, here N is 50
drop table if exists traces;
create table traces as
select *
from (
select *,
ROW_NUMBER() over (partition by user_id,object_id order by timestamp) as RowCount
from repetitions
where (object_id, user_id) in initial_repetition_observed
) as data
where data.RowCount <= 50
order by timestamp;
select count(*)
from traces;
select count(distinct object_id)
from traces;
select count(distinct user_id)
from traces;
-- look at individual traces
-- basically all users show weird inconsistencies
-- timestamps seem wrong,
-- acq_reps or ret_reps jump up and down
-- entries seem to be missing
-- for example, in offset 9, many repetitions seem to have not been observed
-- for example, in offset 17, the dynamics of grade / acq_reps / ret_reps seem broken
-- after a cursory inspection, the following users are broken:
-- 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
-- (the numbers are the corresponding offset value)
-- that's 23 out of 26
select t.*
from traces as t
where t.user_id in (select distinct user_id from traces limit 1 OFFSET 9)
and t.object_id in (select distinct object_id from traces where user_id == t.user_id limit 1)
order by timestamp;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment