rjurney/happiness.pig

## happiness.pig
-- Bucket the link records by source, destination and the filter-parameter column.
grouped = GROUP links BY (sourceNameOrIp, destinationNameOrIp, $filter);

-- Within each bucket, order the records by date_time, then project the
-- date_time and log_hash columns of that ordered bag into two output bags.
sorted_grouped = FOREACH grouped {
    links_by_time = ORDER links BY date_time;
    GENERATE
        FLATTEN(group) AS (sourceNameOrIp, destinationNameOrIp, $filter),
        links_by_time.(date_time) AS sorted_datetimes,
        links_by_time.(log_hash) AS sorted_loghashes;
};

-- Re-project the same five columns in the same order.
sorted_grouped = FOREACH sorted_grouped GENERATE
    sourceNameOrIp,
    destinationNameOrIp,
    $filter,
    sorted_datetimes,
    sorted_loghashes;

-- Keep buckets whose timestamp bag has more than one entry, then reduce each
-- surviving row to that entry count.
-- NOTE(review): after these statements has_beacons carries only the total
-- column, and the STREAM statement in this file reads sorted_grouped rather
-- than has_beacons - confirm that the filter is not meant to gate the stream.
has_beacons = FILTER sorted_grouped BY SIZE(sorted_datetimes) > 1;
has_beacons = FOREACH has_beacons GENERATE SIZE(sorted_datetimes) AS total;
has_beacons = FILTER has_beacons BY total > 1;

-- Declare the streaming command alias used by the STREAM statement below.
-- The command invoked is beacon_features.py, and the SHIP clause names the
-- local file src/beacons/beacon_features.py to send along with the job.
-- NOTE(review): presumably the script must be executable on the task nodes - confirm.
DEFINE beacon_features_command `beacon_features.py`
    SHIP('src/beacons/beacon_features.py');

/* How can I make this next operation more parallel?
   NOTE(review): sorted_grouped is derived from a GROUP, so this STREAM
   presumably runs in that reduce phase and its concurrency is bounded by the
   number of reducers. Consider a PARALLEL clause on the GROUP or
   SET default_parallel - confirm against the job plan (EXPLAIN). */
-- Pipe every sorted_grouped row through beacon_features_command and read the
-- command output back using the schema declared in the AS clause.
happiness =
    STREAM sorted_grouped
    THROUGH beacon_features_command
    AS (sourceNameOrIp:chararray,
        destinationNameOrIp:chararray,
        $filter:chararray,
        SLD:chararray,
        interval:double,
        riqr:double,
        loghash_list:tuple(sample1:chararray,
                           sample2:chararray,
                           sample3:chararray),
        sparse_histogram:map[]);
	-- NOTE(review): this statement sequence repeats the grouping pipeline that
	-- appears earlier in this file - confirm whether the repetition is intended.
	-- Bucket the link records by source, destination and the filter-parameter column.
	grouped = GROUP links BY (sourceNameOrIp, destinationNameOrIp, $filter);

	-- Within each bucket, order the records by date_time, then project the
	-- date_time and log_hash columns of that ordered bag into two output bags.
	sorted_grouped = FOREACH grouped {
		links_by_time = ORDER links BY date_time;
		GENERATE
			FLATTEN(group) AS (sourceNameOrIp, destinationNameOrIp, $filter),
			links_by_time.(date_time) AS sorted_datetimes,
			links_by_time.(log_hash) AS sorted_loghashes;
	};

	-- Re-project the same five columns in the same order.
	sorted_grouped = FOREACH sorted_grouped GENERATE
		sourceNameOrIp,
		destinationNameOrIp,
		$filter,
		sorted_datetimes,
		sorted_loghashes;

	-- Keep buckets whose timestamp bag has more than one entry, then reduce each
	-- surviving row to that entry count.
	-- NOTE(review): has_beacons ends up carrying only the total column, and the
	-- STREAM statement that follows reads sorted_grouped rather than has_beacons -
	-- confirm that the filter is not meant to gate the stream.
	has_beacons = FILTER sorted_grouped BY SIZE(sorted_datetimes) > 1;
	has_beacons = FOREACH has_beacons GENERATE SIZE(sorted_datetimes) AS total;
	has_beacons = FILTER has_beacons BY total > 1;

	-- Declare the streaming command alias used by the STREAM statement that follows.
	-- The command invoked is beacon_features.py, and the SHIP clause names the
	-- local file src/beacons/beacon_features.py to send along with the job.
	-- NOTE(review): the same alias is also defined earlier in this file - confirm
	-- whether this second definition is intended.
	DEFINE beacon_features_command `beacon_features.py`
	SHIP('src/beacons/beacon_features.py');

	/* How can I make this next operation more parallel?
	   NOTE(review): sorted_grouped is derived from a GROUP, so this STREAM
	   presumably runs in that reduce phase and its concurrency is bounded by the
	   number of reducers. Consider a PARALLEL clause on the GROUP or
	   SET default_parallel - confirm against the job plan (EXPLAIN). */
	-- Pipe every sorted_grouped row through beacon_features_command and read the
	-- command output back using the schema declared in the AS clause.
	happiness =
	STREAM sorted_grouped
	THROUGH beacon_features_command
	AS (sourceNameOrIp:chararray,
	destinationNameOrIp:chararray,
	$filter:chararray,
	SLD:chararray,
	interval:double,
	riqr:double,
	loghash_list:tuple(sample1:chararray,
	sample2:chararray,
	sample3:chararray),
	sparse_histogram:map[]);