Skip to content

Instantly share code, notes, and snippets.

@EcZachly
Created November 6, 2023 21:11
Show Gist options
  • Save EcZachly/3a9ce861fe553db3e89cf9c6248be088 to your computer and use it in GitHub Desktop.
Save EcZachly/3a9ce861fe553db3e89cf9c6248be088 to your computer and use it in GitHub Desktop.
How to write an algorithm to group people in optimized groups based on timezone and track
-- first query all the users
WITH offsets AS (SELECT a.*,
EXTRACT(hour FROM ptn.utc_offset) AS utc_offset
FROM bootcamp.attendees a
JOIN pg_timezone_names ptn ON a.timezone = ptn.name
WHERE a.bootcamp_version = 3
AND a.timezone IS NOT NULL
AND a.content_delivery = 'Live'::text
),
-- then aggregate the users by track and offset, we want matching timezones to fill up first
-- Label incomplete groups as "needs merging"
offsets_aggregated AS (SELECT offsets.utc_offset,
offsets.track,
array_agg(offsets.user_id) AS users,
count(1) AS cnt,
CASE
WHEN count(1) < 5 THEN 'needs merging'::text
ELSE 'good'::text
END AS status
FROM offsets
GROUP BY offsets.utc_offset, offsets.track),
-- For the groups that need merging, round their offset to the nearest 5 so widen up their limits
windowed_v2 AS (SELECT CASE
WHEN offsets_aggregated.status = 'needs merging'::text
THEN round(offsets_aggregated.utc_offset / 5::numeric) * 5::numeric
ELSE offsets_aggregated.utc_offset
END AS utc_offset,
offsets_aggregated.track,
array_agg(offsets_aggregated.users::text) AS users,
CASE
WHEN cardinality(string_to_array(array_agg(offsets_aggregated.users::text)::text,
','::text)) < 5 THEN 'needs_merging'::text
ELSE 'good'::text
END AS status
FROM offsets_aggregated
GROUP BY (
CASE
WHEN offsets_aggregated.status = 'needs merging'::text
THEN round(offsets_aggregated.utc_offset / 5::numeric) * 5::numeric
ELSE offsets_aggregated.utc_offset
END), offsets_aggregated.track),
-- even after widening timezones, you'll have a few "stragglers" in remote places that are far away from everybody else
-- For these users, split the world at London and assign them into a "good" group that is either east or west of London
-- Depending on their utc offset
windowed_v3 AS (SELECT CASE
WHEN windowed_v2.status = 'needs_merging'::text THEN
CASE
WHEN windowed_v2.utc_offset < 0::numeric THEN min(
CASE
WHEN windowed_v2.status = 'good'::text
THEN windowed_v2.utc_offset
ELSE NULL::numeric
END)
OVER (PARTITION BY windowed_v2.track ORDER BY windowed_v2.utc_offset, (windowed_v2.status = 'good'::text) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
ELSE max(
CASE
WHEN windowed_v2.status = 'good'::text THEN windowed_v2.utc_offset
ELSE NULL::numeric
END)
OVER (PARTITION BY windowed_v2.track ORDER BY windowed_v2.utc_offset, (windowed_v2.status = 'good'::text) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
END
ELSE windowed_v2.utc_offset
END AS utc_offset,
windowed_v2.utc_offset AS prev_offset,
windowed_v2.status,
windowed_v2.track,
windowed_v2.users
FROM windowed_v2),
-- This query gets up back to utc offsets and the array of users in each bucket
final_merge AS (SELECT windowed_v3.utc_offset,
windowed_v3.track,
cardinality(string_to_array(
regexp_replace(array_agg(windowed_v3.users::text)::text, '[^0-9,]'::text, ''::text,
'g'::text), ','::text)) AS num_users,
string_to_array(
regexp_replace(array_agg(windowed_v3.users::text)::text, '[^0-9,]'::text, ''::text,
'g'::text), ','::text) AS users
FROM windowed_v3
GROUP BY windowed_v3.utc_offset, windowed_v3.track),
-- let's unnest now so we can see how big each group is
unnested AS (SELECT unnest(final_merge.users) AS u,
cardinality(final_merge.users) AS group_size,
final_merge.track,
final_merge.utc_offset
FROM final_merge),
-- let's assign group numbers by dividing by 5
numbered AS (SELECT unnested.u,
unnested.group_size,
unnested.track,
unnested.utc_offset,
row_number() OVER (PARTITION BY unnested.track, unnested.utc_offset) / 5 AS group_num
FROM unnested),
-- now we have the groups ready to go
matched_groups AS (SELECT numbered.track,
numbered.group_num + 1 AS group_number,
numbered.utc_offset,
array_agg(numbered.u) AS users
FROM numbered
GROUP BY numbered.track, (numbered.group_num + 1), numbered.utc_offset),
-- But the issue is there are some utc offsets with like.. 7 people
-- We should combine the small straggler groups of 2 or 3
combo_groups AS (SELECT mg.track,
mg.group_number AS new_group_number,
mg2.group_number AS old_group_number,
mg.utc_offset,
mg2.users || mg.users AS users
FROM matched_groups mg
JOIN matched_groups mg2 ON mg.track = mg2.track AND mg.utc_offset = mg2.utc_offset AND
mg.group_number <> mg2.group_number AND
mg.group_number = 1 AND cardinality(mg2.users) < 4
WHERE cardinality(mg2.users || mg.users) < 7),
-- bring back the nice name of the utc offset
offset_names AS (SELECT offsets.utc_offset,
min(offsets.timezone) AS timezone
FROM offsets
GROUP BY offsets.utc_offset),
-- build the group names
-- do a FULL OUTER JOIN with combo_groups
-- If there's a hit with combo groups, pick that group, otherwise filter it out
final_query AS (SELECT mg.track,
mg.utc_offset,
o.timezone AS group_timezone,
((((('V3'::text || ' '::text) || mg.track) || ' '::text) || o.timezone) || ' '::text) ||
mg.group_number AS group_name,
COALESCE(cg.new_group_number, mg.group_number) AS group_number,
COALESCE(cg.users, mg.users) AS users,
cardinality(COALESCE(cg.users, mg.users)) AS num_users
FROM matched_groups mg
FULL JOIN combo_groups cg ON mg.track = cg.track AND
(mg.group_number = cg.new_group_number OR
mg.group_number = cg.old_group_number) AND
mg.utc_offset = cg.utc_offset
LEFT JOIN offset_names o ON mg.utc_offset = o.utc_offset
WHERE cg.new_group_number = mg.group_number
OR cg.new_group_number IS NULL
ORDER BY mg.track, o.timezone, (COALESCE(cg.new_group_number, mg.group_number)))
SELECT final_query.track,
final_query.utc_offset,
final_query.group_timezone,
final_query.group_name,
final_query.group_number,
final_query.users,
final_query.num_users
FROM final_query;
@SpikeyCoder
Copy link

Really cool stuff. Still working through the different parts, but how did you decide on 5 as the count to increment to and merge? Was it based on the number of time zones in the group?

@EcZachly
Copy link
Author

EcZachly commented Nov 7, 2023

@SpikeyCoder I tried 3-hour blocks too but didn't get the distribution I was looking for. That's a very easy thing to edit and try differently though!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment