jspacker/signal_weighting.pig

## signal_weighting.pig
/*
 * Estimate a weight for each (user, signal) pair based on three factors:
 *     1) the relative frequencies of the signals across all users
 *     2) the relative frequencies of the signals for each specific user
 *     3) the proportion of users who have at least one instance of a given signal
 *
 * (1) is the basic principle: rare events are usually more important than frequent ones
 * (2) accounts for the fact that the same signal can be more or less significant
 *     depending on the particular user's habits
 * (3) accounts for the fact that some signals may be rare not because they require
 *     real user engagement, but rather because they are unappealing to users
 *     for some other reason, such as having a bad user interface, or being
 *     a generally unsuitable feature for the product space.
 *
 * ui_signals: { user, item, signal }
 * prior_quantile: String in the form 'qNN', where NN is a multiple of 5
 *                 ('q00', 'q05', 'q10', ..., 'q95', 'q100'),
 *                 e.g. q50 for the median, q25 for the 25th percentile.
 *                 This selects the Bayesian prior which allows better estimation of
 *                 signal weights for users with small sample sizes of events
 *                 (not much engagement).
 *                 A higher percentile means a more "aggressive"/"opinionated" prior.
 * -->
 * final_sig_weights: { user, signal, weight }
 *
 * This macro requires the DataFu library of UDFs to have been registered.
 */
DEFINE Recsys__WeightSignals(ui_signals, prior_quantile)
RETURNS final_sig_weights {
    -- Find the "sample size" for each user: the total number of signal rows of any type.
    -- This relation has exactly one row per user.
    user_totals         =   FOREACH (GROUP $ui_signals BY user) GENERATE
                                group AS user, COUNT($1) AS total;

    -- Count the number of signals for each (user, signal type) pair
    user_sig_counts     =   FOREACH (GROUP $ui_signals BY (user, signal)) GENERATE
                                FLATTEN(group) AS (user, signal),
                                COUNT($1) AS count;

    -- Join the signal counts with the sample sizes and disambiguate aliases.
    -- Positional layout of the join output:
    --     $0 = user_totals::user,     $1 = user_totals::total,
    --     $2 = user_sig_counts::user, $3 = signal, $4 = count
    user_sig_counts     =   FOREACH (JOIN user_totals BY user, user_sig_counts BY user) GENERATE
                                $2 AS user,  $3 AS signal,
                                $4 AS count, $1 AS total;

    -- The number of signals that will be observed for a given (user, signal type) pair
    -- after an arbitrary number of events is modelled as a binomial random variable.
    -- To estimate the probability parameter of this binomial, we use a beta-distributed prior,
    -- where the hyperparameters alpha and beta are empirically derived.
    --
    -- * alpha is the median (or some other quantile) number of signals
    --   across all (user, signal type) pairs
    -- * alpha + beta is the median (or some other quantile) total number of signals
    --   across all users
    DEFINE Recsys__WeightSignals_Quantiles datafu.pig.stats.StreamingQuantile('21');

    -- NOTE: the quantile field names below must be identical in prior_alpha and prior_beta,
    -- because the same $prior_quantile is substituted into both projections.
    prior_alpha         =   FOREACH (GROUP user_sig_counts ALL) GENERATE
                                FLATTEN(Recsys__WeightSignals_Quantiles($1.count)) AS (
                                    q00, q05, q10, q15, q20, q25, q30, q35, q40, q45,
                                    q50, q55, q60, q65, q70, q75, q80, q85, q90, q95, q100
                                );
    prior_alpha         =   FOREACH prior_alpha GENERATE $prior_quantile AS alpha;

    prior_beta          =   FOREACH (GROUP user_totals ALL) GENERATE
                                FLATTEN(Recsys__WeightSignals_Quantiles($1.total)) AS (
                                    q00, q05, q10, q15, q20, q25, q30, q35, q40, q45,
                                    q50, q55, q60, q65, q70, q75, q80, q85, q90, q95, q100
                                );
    -- beta is what remains of the prior "total" once the prior "successes" (alpha) are removed
    prior_beta          =   FOREACH prior_beta GENERATE $prior_quantile - prior_alpha.alpha AS beta;

    -- weight(user, signal) = 1 / P(arbitrary_event == signal | user)
    --
    -- With a Beta(alpha, beta) prior, the posterior mean of P is
    --     (count + alpha) / (total + alpha + beta)
    -- so the weight is its reciprocal. prior_alpha and prior_beta are single-row
    -- relations and are used here as scalars.
    bayes_sig_weights   =   FOREACH user_sig_counts GENERATE
                                user, signal,
                                (float) (total + prior_alpha.alpha + prior_beta.beta) /
                                (float) (count + prior_alpha.alpha) AS weight;

    -- Multiply each signal weight by a "coverage score":
    -- the fraction of users who have at least one instance of this signal.
    -- The denominator is taken from user_totals (one row per user); user_sig_counts
    -- has one row per (user, signal) pair and would overcount users.
    num_users           =   FOREACH (GROUP user_totals ALL) GENERATE COUNT($1) AS count;
    sig_coverage        =   FOREACH (GROUP user_sig_counts BY signal) GENERATE
                                group AS signal,
                                (float) COUNT($1) / (float) num_users.count AS coverage;
    $final_sig_weights  =   FOREACH (JOIN sig_coverage BY signal, bayes_sig_weights BY signal) GENERATE
                                bayes_sig_weights::user AS user,
                                bayes_sig_weights::signal AS signal,
                                weight * coverage AS weight;
};
	/*
	 * Estimate a weight for each (user, signal) pair based on three factors:
	 *     1) the relative frequencies of the signals across all users
	 *     2) the relative frequencies of the signals for each specific user
	 *     3) the proportion of users who have at least one instance of a given signal
	 *
	 * (1) is the basic principle: rare events are usually more important than frequent ones
	 * (2) accounts for the fact that the same signal can be more or less significant
	 *     depending on the particular user's habits
	 * (3) accounts for the fact that some signals may be rare not because they require
	 *     real user engagement, but rather because they are unappealing to users
	 *     for some other reason, such as having a bad user interface, or being
	 *     a generally unsuitable feature for the product space.
	 *
	 * ui_signals: { user, item, signal }
	 * prior_quantile: String in the form 'qNN', where NN is a multiple of 5
	 *                 ('q00', 'q05', 'q10', ..., 'q95', 'q100'),
	 *                 e.g. q50 for the median, q25 for the 25th percentile.
	 *                 This selects the Bayesian prior which allows better estimation of
	 *                 signal weights for users with small sample sizes of events
	 *                 (not much engagement).
	 *                 A higher percentile means a more "aggressive"/"opinionated" prior.
	 * -->
	 * final_sig_weights: { user, signal, weight }
	 *
	 * This macro requires the DataFu library of UDFs to have been registered.
	 */
	DEFINE Recsys__WeightSignals(ui_signals, prior_quantile)
	RETURNS final_sig_weights {
		-- Find the "sample size" for each user: the total number of signal rows of any type.
		-- This relation has exactly one row per user.
		user_totals = FOREACH (GROUP $ui_signals BY user) GENERATE
			group AS user, COUNT($1) AS total;

		-- Count the number of signals for each (user, signal type) pair
		user_sig_counts = FOREACH (GROUP $ui_signals BY (user, signal)) GENERATE
			FLATTEN(group) AS (user, signal),
			COUNT($1) AS count;

		-- Join the signal counts with the sample sizes and disambiguate aliases.
		-- Positional layout of the join output:
		--     $0 = user_totals::user,     $1 = user_totals::total,
		--     $2 = user_sig_counts::user, $3 = signal, $4 = count
		user_sig_counts = FOREACH (JOIN user_totals BY user, user_sig_counts BY user) GENERATE
			$2 AS user, $3 AS signal,
			$4 AS count, $1 AS total;

		-- The number of signals that will be observed for a given (user, signal type) pair
		-- after an arbitrary number of events is modelled as a binomial random variable.
		-- To estimate the probability parameter of this binomial, we use a beta-distributed prior,
		-- where the hyperparameters alpha and beta are empirically derived.
		--
		-- * alpha is the median (or some other quantile) number of signals
		--   across all (user, signal type) pairs
		-- * alpha + beta is the median (or some other quantile) total number of signals
		--   across all users
		DEFINE Recsys__WeightSignals_Quantiles datafu.pig.stats.StreamingQuantile('21');

		-- NOTE: the quantile field names below must be identical in prior_alpha and prior_beta,
		-- because the same $prior_quantile is substituted into both projections.
		prior_alpha = FOREACH (GROUP user_sig_counts ALL) GENERATE
			FLATTEN(Recsys__WeightSignals_Quantiles($1.count)) AS (
				q00, q05, q10, q15, q20, q25, q30, q35, q40, q45,
				q50, q55, q60, q65, q70, q75, q80, q85, q90, q95, q100
			);
		prior_alpha = FOREACH prior_alpha GENERATE $prior_quantile AS alpha;

		prior_beta = FOREACH (GROUP user_totals ALL) GENERATE
			FLATTEN(Recsys__WeightSignals_Quantiles($1.total)) AS (
				q00, q05, q10, q15, q20, q25, q30, q35, q40, q45,
				q50, q55, q60, q65, q70, q75, q80, q85, q90, q95, q100
			);
		-- beta is what remains of the prior "total" once the prior "successes" (alpha) are removed
		prior_beta = FOREACH prior_beta GENERATE $prior_quantile - prior_alpha.alpha AS beta;

		-- weight(user, signal) = 1 / P(arbitrary_event == signal | user)
		--
		-- With a Beta(alpha, beta) prior, the posterior mean of P is
		--     (count + alpha) / (total + alpha + beta)
		-- so the weight is its reciprocal. prior_alpha and prior_beta are single-row
		-- relations and are used here as scalars.
		bayes_sig_weights = FOREACH user_sig_counts GENERATE
			user, signal,
			(float) (total + prior_alpha.alpha + prior_beta.beta) /
			(float) (count + prior_alpha.alpha) AS weight;

		-- Multiply each signal weight by a "coverage score":
		-- the fraction of users who have at least one instance of this signal.
		-- The denominator is taken from user_totals (one row per user); user_sig_counts
		-- has one row per (user, signal) pair and would overcount users.
		num_users = FOREACH (GROUP user_totals ALL) GENERATE COUNT($1) AS count;
		sig_coverage = FOREACH (GROUP user_sig_counts BY signal) GENERATE
			group AS signal,
			(float) COUNT($1) / (float) num_users.count AS coverage;
		$final_sig_weights = FOREACH (JOIN sig_coverage BY signal, bayes_sig_weights BY signal) GENERATE
			bayes_sig_weights::user AS user,
			bayes_sig_weights::signal AS signal,
			weight * coverage AS weight;
	};