Last active
December 20, 2015 02:49
-
-
Save jspacker/6058763 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Estimate appropriate weights for each (user, signal) pair
 * based on three factors:
 *    1) the relative frequencies of the signals across all users
 *    2) the relative frequencies of the signals for each specific user
 *    3) the proportion of users who have at least one instance of a given signal
 *
 * (1) is the basic principle: rare events are usually more important than frequent ones
 * (2) accounts for the fact that the same signal can be more or less significant
 *     depending on the particular user's habits
 * (3) accounts for the fact that some signals may be rare not because they require
 *     real user engagement, but rather because they are unappealing to users
 *     for some other reason, such as having a bad user interface, or being
 *     a generally unsuitable feature for the product space.
 *
 * ui_signals:     { user, item, signal }
 * prior_quantile: String of the form 'qN', where N is a multiple of 5 between 0 and 100
 *                 written without zero padding (q0, q5, q10, ..., q95, q100),
 *                 e.g. q50 for the median, q25 for the 25th percentile.
 *                 This is a Bayesian prior which allows better estimation of signal weights
 *                 for users with small sample sizes of events (not much engagement).
 *                 A higher percentile means a more "aggressive"/"opinionated" prior.
 * -->
 * final_sig_weights: { user, signal, weight }
 *
 * This macro requires the DataFu library of UDFs to have been registered.
 */
DEFINE Recsys__WeightSignals(ui_signals, prior_quantile)
RETURNS final_sig_weights {

    -- Find the "sample size" for each user: the total number of signal events they produced.
    -- This relation has exactly one row per user.
    user_totals         =   FOREACH (GROUP $ui_signals BY user) GENERATE
                                group AS user, COUNT($1) AS total;

    -- Count the number of signals for each (user, signal type) pair
    user_sig_counts     =   FOREACH (GROUP $ui_signals BY (user, signal)) GENERATE
                                FLATTEN(group) AS (user, signal),
                                COUNT($1) AS count;

    -- Join the signal counts with the sample sizes and disambiguate aliases.
    -- Positional fields after the join: $0-$1 come from user_totals (user, total),
    -- $2-$4 come from user_sig_counts (user, signal, count).
    user_sig_counts     =   FOREACH (JOIN user_totals BY user, user_sig_counts BY user) GENERATE
                                $2 AS user,  $3 AS signal,
                                $4 AS count, $1 AS total;

    -- The number of signals that will be observed for a given (user, signal type) pair
    -- after an arbitrary number of events is modelled as a binomial random variable.
    -- To estimate the probability parameter of this binomial, we use a beta-distributed prior,
    -- where the hyperparameters alpha and beta are empirically derived.
    --
    --    * alpha        is the median (or some other quantile) number of signals
    --                   across all (user, signal type) pairs
    --    * alpha + beta is the median (or some other quantile) total number of signals
    --                   across all users

    -- '21' requests 21 evenly spaced quantiles, i.e. 0%, 5%, ..., 100%.
    DEFINE Recsys__WeightSignals_Quantiles datafu.pig.stats.StreamingQuantile('21');

    -- The field names below must be identical for both priors, since the same
    -- $prior_quantile string is substituted as a field reference into each of them.
    prior_alpha         =   FOREACH (GROUP user_sig_counts ALL) GENERATE
                                FLATTEN(Recsys__WeightSignals_Quantiles($1.count)) AS (
                                    q0,  q5,  q10, q15, q20, q25, q30, q35, q40, q45,
                                    q50, q55, q60, q65, q70, q75, q80, q85, q90, q95, q100
                                );
    prior_alpha         =   FOREACH prior_alpha GENERATE $prior_quantile AS alpha;

    prior_beta          =   FOREACH (GROUP user_totals ALL) GENERATE
                                FLATTEN(Recsys__WeightSignals_Quantiles($1.total)) AS (
                                    q0,  q5,  q10, q15, q20, q25, q30, q35, q40, q45,
                                    q50, q55, q60, q65, q70, q75, q80, q85, q90, q95, q100
                                );
    prior_beta          =   FOREACH prior_beta GENERATE $prior_quantile - prior_alpha.alpha AS beta;

    -- weight(user, signal) = 1 / P(arbitrary_event == signal | user)
    --
    -- With a Beta(alpha, beta) prior, the posterior mean of P is
    --     (count + alpha) / (total + alpha + beta)
    -- so the weight is its reciprocal.
    bayes_sig_weights   =   FOREACH user_sig_counts GENERATE
                                user, signal,
                                (float) (total + prior_alpha.alpha + prior_beta.beta) /
                                (float) (count + prior_alpha.alpha) AS weight;

    -- Multiply each signal weight by a "coverage score":
    -- the fraction of users who have at least one instance of this signal.
    -- The number of users is taken from user_totals (one row per user);
    -- user_sig_counts has one row per (user, signal) pair, so each signal's
    -- row count there is the number of users who produced that signal.
    num_users           =   FOREACH (GROUP user_totals ALL) GENERATE COUNT($1) AS count;
    sig_coverage        =   FOREACH (GROUP user_sig_counts BY signal) GENERATE
                                group AS signal,
                                (float) COUNT($1) / (float) num_users.count AS coverage;

    $final_sig_weights  =   FOREACH (JOIN sig_coverage BY signal, bayes_sig_weights BY signal) GENERATE
                                bayes_sig_weights::user   AS user,
                                bayes_sig_weights::signal AS signal,
                                weight * coverage         AS weight;
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment