Skip to content

Instantly share code, notes, and snippets.

@lawrencejones
Last active December 31, 2020 14:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lawrencejones/5850c75ecdcbb77492c9e37d11076643 to your computer and use it in GitHub Desktop.
Save lawrencejones/5850c75ecdcbb77492c9e37d11076643 to your computer and use it in GitHub Desktop.
Connected data: Using BigQuery to analyse user behaviour in response to webhooks
/*
extractIDs takes the webhook request payload, which is structured like so:
{
"events": [
{ "links": { "<resource-name>": "ID", ... } },
...,
]
}
And extracts the unique set of resource IDs referenced by the webhook.
*/
CREATE TEMP FUNCTION
extractIDs(json STRING)
RETURNS ARRAY<string>
LANGUAGE js AS """
return [
... new Set(
JSON.parse(json)["events"]
.map(e=>Object.values(e["links"]))
.flat()
)
];
""";
WITH
-- Expand each webhook into a row per resource ID from the request_body
-- payload. Use the Javascript extractIDs function to parse all the events,
-- then UNNEST the parsed array into separate rows.
webhook_resources AS (
SELECT
id,
created_at,
resource_id
FROM (
SELECT
id,
created_at,
extractIDs(request_body) AS resource_ids
FROM
webhooks ) AS webhook_resources
-- UNNEST transforms the array of resources IDs into a row per resource ID,
-- while the CROSS JOIN finds every pairing from the original webhook (1) to
-- every resource ID (n).
CROSS JOIN
UNNEST(resource_ids) AS resource_id
),
-- Join each HTTP request log onto webhook_resources. Set the
-- webhook_work_duration field to be the duration of the request, if we can
-- find a webhook that was sent within 1m either side of the request.
requests AS (
SELECT
merchant_activity._log_timestamp AS timestamp,
merchant_activity.handler,
merchant_activity.resource_id,
merchant_activity.duration,
(
CASE webhook_resources.resource_id IS NULL
WHEN TRUE THEN 0.0
ELSE merchant_activity.duration
END
) AS webhook_work_duration
FROM
merchant_activity
-- OUTER JOIN to ensure we produce a row for every merchant_activity entry, as
-- we'll want to understand the ratio of webhook work to other API activity,
-- which is best measured in duration spent serving each request.
LEFT OUTER JOIN
webhook_resources
-- Allow for matching against a request either side by 1m of our API request.
-- This permits for slippage, as the log has a timestamp set at the completion
-- of a request and some integrators may respond to webhooks inline, rather
-- than asynchronously.
ON
webhook_resources.resource_id=merchant_activity.resource_id
AND ABS(UNIX_SECONDS(_log_timestamp) - UNIX_SECONDS(created_at)) < 60
AND merchant_activity.resource_id IS NOT NULL
AND merchant_activity._log_timestamp > '2020-10-29'
)
-- Present work ratio rolled-up to each minute of our results. Provide total
-- durations so we can perform additional roll-ups later.
SELECT
TIMESTAMP_TRUNC(timestamp, MINUTE) AS timestamp,
COUNT(*) AS count,
SUM(duration) AS duration,
SUM(webhook_work_duration) AS webhook_work_duration,
SUM(webhook_work_duration) / SUM(duration) AS work_ratio
FROM
requests
GROUP BY
1
ORDER BY
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment