Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save mshakhomirov/ea9458252e50d5281c09237bd02613b4 to your computer and use it in GitHub Desktop.
Save mshakhomirov/ea9458252e50d5281c09237bd02613b4 to your computer and use it in GitHub Desktop.

Suggested tweaks for churn model / dataset churn_predictions:

  • Change last_seen_d_u (based on device) to last_seen (subscriber_id) > churned_u
  • Use d1, d2, d0 for one model and week_1 activity for another for model training and prediction.
  • Change MIN_ACTIVITY_DATE = '20220501'
  • Remove user_pseudo_id and use user_id
  • added features from daily_session table
  • added credit spend
  • excluded bots
  • excluded outliers

Recommendations:

change IsPremium to completed Premium group / account purchase

Exclude staff tag and change to categrical feature hasTag.

DECLARE MIN_ACTIVITY_DATE STRING;
SET MIN_ACTIVITY_DATE = '20220101'; -- Cut off date for subscriber activity.

create or replace table `palringo-data-staging.analytics.churn_prediction_w1` (

   user_id	        STRING
  ,first_seen 	    DATE
  ,last_seen	    DATE
  ,days_inactive	INT64
  ,churned_u	    INT64
  ,w1_gr_join   	INT64
  ,day123_gr_join	INT64
  ,w1_chat  	    INT64
  ,day123_chat	    INT64
  ,w1_avatar	    INT64
  ,day123_avatar    INT64
  ,tag	            STRING
  ,IsPremium	    STRING
  ,platform	        STRING
  ,day123_messages  INT64
  ,w1_messages      INT64
  ,day123_events_watched   INT64
  ,w1_events_watched   INT64
  ,day123_credit_purchase_cnt   INT64
  ,w1_credit_purchase_cnt   INT64

)
PARTITION BY DATE_TRUNC(first_seen, month) 
CLUSTER BY user_id
;

truncate table `palringo-data-staging.analytics.churn_prediction_w1`
;

insert `palringo-data-staging.analytics.churn_prediction_w1`
with fs as (
  SELECT 
      c.first_seen
    , c.user_id
    , c.last_seen
   FROM
    `palringo-data.analytics.churn` c 
   WHERE 
    first_seen >= PARSE_DATE('%Y%m%d', MIN_ACTIVITY_DATE) -- pick only users for this year and remove users with multiple accounts and devices
   and 
   multiple_accounts = false -- this can be another model feature.
--    and
--    multiple_devices = false -- multiple devices is fine as often it would be a re-install.
   and user_id is not null
)
-- user daily session data
, s as (
select
      connection_dt                     as dt
    , cast(subscriberId as string)      as user_id
    , sum(messagesTotal)                as messages
    , sum(array_length(eventsWatched))  as events_watched
from `palringo-data.analytics.user_daily_session`
where

  connection_dt >= PARSE_DATE('%Y%m%d', MIN_ACTIVITY_DATE)
group by 1,2

)
-- credit spend 
, c as (
select 
    date(ts)                        as dt 
    ,cast(subscriberid as string)   as user_id
    ,count(creditPurchaseItemId)    as items
from `palringo-data.production.credit_purchase_ro`
where date(_partitiontime) >= PARSE_DATE('%Y%m%d', MIN_ACTIVITY_DATE)
group by 1,2
)
-- add user activity data
, ua as (
select
      ua.dt
    , ua.user_id  
    , ua.platform
    , ua.tag
    , ua.IsPremium
    , ua.event_name
    , ua.eventCount
    , s.messages
    , s.events_watched
    , c.items as credit_purchase_cnt
from
  `palringo-data.analytics.fb_event_aggregates` ua
left join s on 
    s.dt = ua.dt
    and ua.user_id = s.user_id
left join c on 
    c.dt = ua.dt
    and ua.user_id = c.user_id
where
  ua.dt >= PARSE_DATE('%Y%m%d', MIN_ACTIVITY_DATE)
  and ua.user_id is not null
)

, d as (
select
    fs.user_id
  , fs.first_seen
  , fs.last_seen
  , IF( fs.last_seen < DATE_SUB(current_date(),INTERVAL 30 DAY) , 1, 0)     AS churned_u        
  , DATE_DIFF(current_date(),  fs.last_seen, day)                           AS days_inactive    
  , ua.dt                                                                                       
  , DATE_DIFF(ua.dt, fs.first_seen, DAY)                                    AS day_number       
  , ua.event_name
  , ua.eventCount
  , ua.tag
  , ua.IsPremium
  , ua.platform
  , ua.messages
  , ua.events_watched
  , ua.credit_purchase_cnt
from fs
left join ua on ua.user_id = fs.user_id

order by
1,2,4
)

select 
  user_id
, first_seen
, last_seen
, days_inactive -- wasn't active N days -> Churn?
, churned_u

-- event counts:
    -- join group. 
,	sum(IF(day_number in (0,1,2,3,4,5,6) and upper(event_name) in ('JOIN_GROUP','GROUP_JOIN'), eventCount,0))	 as w1_gr_join
,	sum(IF(day_number in (0,1,2) and upper(event_name) in ('JOIN_GROUP','GROUP_JOIN'), eventCount,0))	 as day123_gr_join
    --Open_Chat:
,	sum(IF(day_number in (0,1,2,3,4,5,6) and upper(event_name) in ('OPEN_CHAT'), eventCount,0))	                 as w1_chat
,	sum(IF(day_number in (0,1,2) and upper(event_name) in ('OPEN_CHAT'), eventCount,0))	                 as d123_chat
,	sum(IF(day_number in (0,1,2,3,4,5,6) and upper(event_name) in ('SET_AVATAR'), eventCount,0))	                     as w1_set_avatar
,	sum(IF(day_number in (0,1,2) and upper(event_name) in ('SET_AVATAR'), eventCount,0))	                     as d123_set_avatar

, max(IFNULL(tag, 'None'))                  tag
, max(IFNULL(IsPremium, 'False'))           IsPremium
, max(IFNULL(platform, 'unknown'))          platform

-- messages ssent
,	sum(IF(day_number in (0,1,2) , messages,0))	                             as d123_messages
,	sum(IF(day_number in (0,1,2,3,4,5,6) , messages,0))	                     as w1_messages
-- events watched
,	sum(IF(day_number in (0,1,2) , events_watched,0))	                     as d123_events_watched
,	sum(IF(day_number in (0,1,2,3,4,5,6) , events_watched,0))	             as w1_events_watched
-- credit purchase count
,	sum(IF(day_number in (0,1,2) , credit_purchase_cnt,0))	                     as d123_credit_purchase_cnt
,	sum(IF(day_number in (0,1,2,3,4,5,6) , credit_purchase_cnt,0))	             as w1_credit_purchase_cnt


from d
where 
    first_seen >= PARSE_DATE('%Y%m%d', MIN_ACTIVITY_DATE)
    and user_id not in (select  cast(subscriber_id as string) FROM `palringo-data.production.bot_ro_bot_v`)
group by
  user_id
, first_seen
, last_seen
, days_inactive -- wasn't active N days -> Churn?
, churned_u
order by
  user_id
, days_inactive desc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment