This gist contains code samples for the blog post "Connected data: Using BigQuery to analyse user behaviour in response to webhooks".
Connected data: Using BigQuery to analyse user behaviour in response to webhooks
/* | |
extractIDs takes the webhook request payload, which is structured like so: | |
{ | |
"events": [ | |
{ "links": { "<resource-name>": "ID", ... } }, | |
..., | |
] | |
} | |
And extracts the unique set of resource IDs referenced by the webhook. | |
*/ | |
CREATE TEMP FUNCTION | |
extractIDs(json STRING) | |
RETURNS ARRAY<string> | |
LANGUAGE js AS """ | |
return [ | |
... new Set( | |
JSON.parse(json)["events"] | |
.map(e=>Object.values(e["links"])) | |
.flat() | |
) | |
]; | |
"""; | |
WITH | |
-- Expand each webhook into a row per resource ID from the request_body | |
-- payload. Use the Javascript extractIDs function to parse all the events, | |
-- then UNNEST the parsed array into separate rows. | |
webhook_resources AS ( | |
SELECT | |
id, | |
created_at, | |
resource_id | |
FROM ( | |
SELECT | |
id, | |
created_at, | |
extractIDs(request_body) AS resource_ids | |
FROM | |
webhooks ) AS webhook_resources | |
-- UNNEST transforms the array of resources IDs into a row per resource ID, | |
-- while the CROSS JOIN finds every pairing from the original webhook (1) to | |
-- every resource ID (n). | |
CROSS JOIN | |
UNNEST(resource_ids) AS resource_id | |
), | |
-- Join each HTTP request log onto webhook_resources. Set the | |
-- webhook_work_duration field to be the duration of the request, if we can | |
-- find a webhook that was sent within 1m either side of the request. | |
requests AS ( | |
SELECT | |
merchant_activity._log_timestamp AS timestamp, | |
merchant_activity.handler, | |
merchant_activity.resource_id, | |
merchant_activity.duration, | |
( | |
CASE webhook_resources.resource_id IS NULL | |
WHEN TRUE THEN 0.0 | |
ELSE merchant_activity.duration | |
END | |
) AS webhook_work_duration | |
FROM | |
merchant_activity | |
-- OUTER JOIN to ensure we produce a row for every merchant_activity entry, as | |
-- we'll want to understand the ratio of webhook work to other API activity, | |
-- which is best measured in duration spent serving each request. | |
LEFT OUTER JOIN | |
webhook_resources | |
-- Allow for matching against a request either side by 1m of our API request. | |
-- This permits for slippage, as the log has a timestamp set at the completion | |
-- of a request and some integrators may respond to webhooks inline, rather | |
-- than asynchronously. | |
ON | |
webhook_resources.resource_id=merchant_activity.resource_id | |
AND ABS(UNIX_SECONDS(_log_timestamp) - UNIX_SECONDS(created_at)) < 60 | |
AND merchant_activity.resource_id IS NOT NULL | |
AND merchant_activity._log_timestamp > '2020-10-29' | |
) | |
-- Present work ratio rolled-up to each minute of our results. Provide total | |
-- durations so we can perform additional roll-ups later. | |
SELECT | |
TIMESTAMP_TRUNC(timestamp, MINUTE) AS timestamp, | |
COUNT(*) AS count, | |
SUM(duration) AS duration, | |
SUM(webhook_work_duration) AS webhook_work_duration, | |
SUM(webhook_work_duration) / SUM(duration) AS work_ratio | |
FROM | |
requests | |
GROUP BY | |
1 | |
ORDER BY | |
1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment