Create a gist now

Instantly share code, notes, and snippets.

/*
 * Estimate appropriate weights for each (user, item, signal) triple
 * based on three factors:
 * 1) the relative frequencies of the signals across all users
 * 2) the relative frequencies of the signals for each specific user
 * 3) the proportion of users who have at least one instance of a given signal
 * (1) is the basic principle: rare events are usually more important than frequent ones
 * (2) accounts for the fact that the same signal can be more or less significant
 * depending on the particular user's habits
 * (3) accounts for the fact that some signals may be rare not because they require
 * real user engagement, but rather because they are unappealing to users
 * for some other reason, such as having a bad user interface, or being
 * a generally unsuitable feature for the product space.
 * ui_signals: { user, item, signal }
 * prior_quantile: String in the form 'qNN', where NN is a multiple of 5,
 * e.g. q50 for the median, q25 for the 25th percentile.
 * This is a Bayesian prior which allows better estimation of signal weights
 * for users with small sample sizes of events (not much engagement).
 * A higher percentile means a more "aggressive"/"opinionated" prior.
 * -->
 * final_sig_weights: { user, signal, weight }
 * This macro requires the DataFu library of UDFs to have been registered.
 */
-- Recsys__WeightSignals: compute a weight for every (user, signal) pair.
--   ui_signals:        relation with (at least) the fields user, item, signal
--   prior_quantile:    alias of the quantile used for the prior, one of
--                      q0, q5, q10, ..., q95, q100 (e.g. q50 for the median)
--   final_sig_weights: { user, signal, weight }
-- Requires the DataFu jar to be registered by the calling script.
DEFINE Recsys__WeightSignals(ui_signals, prior_quantile)
RETURNS final_sig_weights {
    -- Find the "sample size" (total number of signal events) for each user.
    user_totals = FOREACH (GROUP $ui_signals BY user) GENERATE
                      group AS user, COUNT($1) AS total;

    -- Count the number of signals for each (user, signal type) pair.
    user_sig_counts = FOREACH (GROUP $ui_signals BY (user, signal)) GENERATE
                          FLATTEN(group) AS (user, signal),
                          COUNT($1) AS count;

    -- Join the signal counts with the sample sizes and disambiguate aliases.
    -- Positional layout of the join output:
    --   $0, $1     = user_totals     (user, total)
    --   $2, $3, $4 = user_sig_counts (user, signal, count)
    user_sig_counts = FOREACH (JOIN user_totals BY user, user_sig_counts BY user) GENERATE
                          $2 AS user, $3 AS signal,
                          $4 AS count, $1 AS total;

    -- The number of signals that will be observed for a given (user, signal type) pair
    -- after an arbitrary number of events is modelled as a binomial random variable.
    -- To estimate the probability parameter of this binomial, we use a beta-distributed prior,
    -- where the hyperparameters alpha and beta are empirically derived.
    --   * alpha is the median (or some other quantile) number of signals across all (user, signal type) pairs
    --   * beta is the median (or some other quantile) total number of signals across all users
    -- StreamingQuantile('21') yields 21 values, named q0 .. q100 in steps of 5 below.
    DEFINE Recsys__WeightSignals_Quantiles datafu.pig.stats.StreamingQuantile('21');

    prior_alpha = FOREACH (GROUP user_sig_counts ALL) GENERATE
                      FLATTEN(Recsys__WeightSignals_Quantiles($1.count)) AS (
                          q0,  q5,  q10, q15, q20, q25, q30, q35, q40, q45,
                          q50, q55, q60, q65, q70, q75, q80, q85, q90, q95, q100
                      );
    prior_alpha = FOREACH prior_alpha GENERATE $prior_quantile AS alpha;

    -- The quantile aliases must be spelled exactly as in prior_alpha, because the same
    -- $prior_quantile name is substituted into both projections.
    prior_beta = FOREACH (GROUP user_totals ALL) GENERATE
                     FLATTEN(Recsys__WeightSignals_Quantiles($1.total)) AS (
                         q0,  q5,  q10, q15, q20, q25, q30, q35, q40, q45,
                         q50, q55, q60, q65, q70, q75, q80, q85, q90, q95, q100
                     );
    -- beta is what remains of the chosen total-count quantile after removing alpha.
    prior_beta = FOREACH prior_beta GENERATE $prior_quantile - prior_alpha.alpha AS beta;

    -- weight(user, signal) = 1 / P(arbitrary_event == signal | user)
    -- NOTE(review): the usual Beta-posterior mean is (count + alpha) / (total + alpha + beta),
    -- whose reciprocal would be (total + alpha + beta) / (count + alpha). The expression
    -- below is kept exactly as originally written; confirm which form is intended.
    bayes_sig_weights = FOREACH user_sig_counts GENERATE
                            user, signal,
                            (float) (total + prior_alpha.alpha) /
                            (float) (count + prior_alpha.alpha + prior_beta.beta) AS weight;

    -- Multiply each signal weight by a "coverage score":
    -- the fraction of users who have at least one instance of this signal.
    -- user_totals has exactly one row per user, so counting it gives the number of users
    -- (user_sig_counts has one row per (user, signal) pair and would over-count).
    num_users = FOREACH (GROUP user_totals ALL) GENERATE COUNT($1) AS count;

    sig_coverage = FOREACH (GROUP user_sig_counts BY signal) GENERATE
                       group AS signal,
                       (float) COUNT($1) / (float) num_users.count AS coverage;

    $final_sig_weights = FOREACH (JOIN sig_coverage BY signal, bayes_sig_weights BY signal) GENERATE
                             bayes_sig_weights::user AS user,
                             bayes_sig_weights::signal AS signal,
                             weight * coverage AS weight;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment