EcZachly/groups.sql

## groups.sql
-- first query all the users
WITH offsets AS (SELECT a.*,
                        EXTRACT(hour FROM ptn.utc_offset) AS utc_offset
                 FROM bootcamp.attendees a
                          JOIN pg_timezone_names ptn ON a.timezone = ptn.name
                 WHERE a.bootcamp_version = 3
                   AND a.timezone IS NOT NULL
                   AND a.content_delivery = 'Live'::text
                  ),
-- then aggregate the users by track and offset, we want matching timezones to fill up first
-- Label incomplete groups as "needs merging"
     offsets_aggregated AS (SELECT offsets.utc_offset,
                                   offsets.track,
                                   array_agg(offsets.user_id) AS users,
                                   count(1)                   AS cnt,
                                   CASE
                                       WHEN count(1) < 5 THEN 'needs merging'::text
                                       ELSE 'good'::text
                                       END                    AS status
                            FROM offsets
                            GROUP BY offsets.utc_offset, offsets.track),
-- For the groups that need merging, round their offset to the nearest 5 so widen up their limits
     windowed_v2 AS (SELECT CASE
                                WHEN offsets_aggregated.status = 'needs merging'::text
                                    THEN round(offsets_aggregated.utc_offset / 5::numeric) * 5::numeric
                                ELSE offsets_aggregated.utc_offset
                                END                                   AS utc_offset,
                            offsets_aggregated.track,
                            array_agg(offsets_aggregated.users::text) AS users,
                            CASE
                                WHEN cardinality(string_to_array(array_agg(offsets_aggregated.users::text)::text,
                                                                 ','::text)) < 5 THEN 'needs_merging'::text
                                ELSE 'good'::text
                                END                                   AS status
                     FROM offsets_aggregated
                     GROUP BY (
                                  CASE
                                      WHEN offsets_aggregated.status = 'needs merging'::text
                                          THEN round(offsets_aggregated.utc_offset / 5::numeric) * 5::numeric
                                      ELSE offsets_aggregated.utc_offset
                                      END), offsets_aggregated.track),
		-- even after widening timezones, you'll have a few "stragglers" in remote places that are far away from everybody else
		-- For these users, split the world at London and assign them into a "good" group that is either east or west of London
		-- Depending on their utc offset
     windowed_v3 AS (SELECT CASE
                                WHEN windowed_v2.status = 'needs_merging'::text THEN
                                    CASE
                                        WHEN windowed_v2.utc_offset < 0::numeric THEN min(
                                                                                      CASE
                                                                                          WHEN windowed_v2.status = 'good'::text
                                                                                              THEN windowed_v2.utc_offset
                                                                                          ELSE NULL::numeric
                                                                                          END)
                                                                                      OVER (PARTITION BY windowed_v2.track ORDER BY windowed_v2.utc_offset, (windowed_v2.status = 'good'::text) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
                                        ELSE max(
                                             CASE
                                                 WHEN windowed_v2.status = 'good'::text THEN windowed_v2.utc_offset
                                                 ELSE NULL::numeric
                                                 END)
                                             OVER (PARTITION BY windowed_v2.track ORDER BY windowed_v2.utc_offset, (windowed_v2.status = 'good'::text) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
                                        END
                                ELSE windowed_v2.utc_offset
                                END                AS utc_offset,
                            windowed_v2.utc_offset AS prev_offset,
                            windowed_v2.status,
                            windowed_v2.track,
                            windowed_v2.users
                     FROM windowed_v2),
			-- This query gets up back to utc offsets and the array of users in each bucket
     final_merge AS (SELECT windowed_v3.utc_offset,
                            windowed_v3.track,
                            cardinality(string_to_array(
                                    regexp_replace(array_agg(windowed_v3.users::text)::text, '[^0-9,]'::text, ''::text,
                                                   'g'::text), ','::text)) AS num_users,
                            string_to_array(
                                    regexp_replace(array_agg(windowed_v3.users::text)::text, '[^0-9,]'::text, ''::text,
                                                   'g'::text), ','::text)  AS users
                     FROM windowed_v3
                     GROUP BY windowed_v3.utc_offset, windowed_v3.track),
     -- let's unnest now so we can see how big each group is
		 unnested AS (SELECT unnest(final_merge.users)      AS u,
                         cardinality(final_merge.users) AS group_size,
                         final_merge.track,
                         final_merge.utc_offset
                  FROM final_merge),

		 -- let's assign group numbers by dividing by 5
     numbered AS (SELECT unnested.u,
                         unnested.group_size,
                         unnested.track,
                         unnested.utc_offset,
                         row_number() OVER (PARTITION BY unnested.track, unnested.utc_offset) / 5 AS group_num
                  FROM unnested),
     -- now we have the groups ready to go
		 matched_groups AS (SELECT numbered.track,
                               numbered.group_num + 1 AS group_number,
                               numbered.utc_offset,
                               array_agg(numbered.u)  AS users
                        FROM numbered
                        GROUP BY numbered.track, (numbered.group_num + 1), numbered.utc_offset),
     -- But the issue is there are some utc offsets with like.. 7 people
		 -- We should combine the small straggler groups of 2 or 3
		 combo_groups AS (SELECT mg.track,
                             mg.group_number       AS new_group_number,
                             mg2.group_number      AS old_group_number,
                             mg.utc_offset,
                             mg2.users || mg.users AS users
                      FROM matched_groups mg
                               JOIN matched_groups mg2 ON mg.track = mg2.track AND mg.utc_offset = mg2.utc_offset AND
                                                          mg.group_number <> mg2.group_number AND
                                                          mg.group_number = 1 AND cardinality(mg2.users) < 4
                      WHERE cardinality(mg2.users || mg.users) < 7),
     -- bring back the nice name of the utc offset
		 offset_names AS (SELECT offsets.utc_offset,
                             min(offsets.timezone) AS timezone
                      FROM offsets
                      GROUP BY offsets.utc_offset),
     -- build the group names
		 -- do a FULL OUTER JOIN with combo_groups
		 -- If there's a hit with combo groups, pick that group, otherwise filter it out
		 final_query AS (SELECT mg.track,
                            mg.utc_offset,
                            o.timezone                                     AS group_timezone,
                            ((((('V3'::text || ' '::text) || mg.track) || ' '::text) || o.timezone) || ' '::text) ||
                            mg.group_number                                AS group_name,
                            COALESCE(cg.new_group_number, mg.group_number) AS group_number,
                            COALESCE(cg.users, mg.users)                   AS users,
                            cardinality(COALESCE(cg.users, mg.users))      AS num_users
                     FROM matched_groups mg
                              FULL JOIN combo_groups cg ON mg.track = cg.track AND
                                                           (mg.group_number = cg.new_group_number OR
                                                            mg.group_number = cg.old_group_number) AND
                                                           mg.utc_offset = cg.utc_offset
                              LEFT JOIN offset_names o ON mg.utc_offset = o.utc_offset
                     WHERE cg.new_group_number = mg.group_number
                        OR cg.new_group_number IS NULL
                     ORDER BY mg.track, o.timezone, (COALESCE(cg.new_group_number, mg.group_number)))
SELECT final_query.track,
       final_query.utc_offset,
       final_query.group_timezone,
       final_query.group_name,
       final_query.group_number,
       final_query.users,
       final_query.num_users
FROM final_query;
	-- first query all the users
	WITH offsets AS (SELECT a.*,
	EXTRACT(hour FROM ptn.utc_offset) AS utc_offset
	FROM bootcamp.attendees a
	JOIN pg_timezone_names ptn ON a.timezone = ptn.name
	WHERE a.bootcamp_version = 3
	AND a.timezone IS NOT NULL
	AND a.content_delivery = 'Live'::text
	),
	-- then aggregate the users by track and offset, we want matching timezones to fill up first
	-- Label incomplete groups as "needs merging"
	offsets_aggregated AS (SELECT offsets.utc_offset,
	offsets.track,
	array_agg(offsets.user_id) AS users,
	count(1) AS cnt,
	CASE
	WHEN count(1) < 5 THEN 'needs merging'::text
	ELSE 'good'::text
	END AS status
	FROM offsets
	GROUP BY offsets.utc_offset, offsets.track),
	-- For the groups that need merging, round their offset to the nearest 5 so widen up their limits
	windowed_v2 AS (SELECT CASE
	WHEN offsets_aggregated.status = 'needs merging'::text
	THEN round(offsets_aggregated.utc_offset / 5::numeric) * 5::numeric
	ELSE offsets_aggregated.utc_offset
	END AS utc_offset,
	offsets_aggregated.track,
	array_agg(offsets_aggregated.users::text) AS users,
	CASE
	WHEN cardinality(string_to_array(array_agg(offsets_aggregated.users::text)::text,
	','::text)) < 5 THEN 'needs_merging'::text
	ELSE 'good'::text
	END AS status
	FROM offsets_aggregated
	GROUP BY (
	CASE
	WHEN offsets_aggregated.status = 'needs merging'::text
	THEN round(offsets_aggregated.utc_offset / 5::numeric) * 5::numeric
	ELSE offsets_aggregated.utc_offset
	END), offsets_aggregated.track),
	-- even after widening timezones, you'll have a few "stragglers" in remote places that are far away from everybody else
	-- For these users, split the world at London and assign them into a "good" group that is either east or west of London
	-- Depending on their utc offset
	windowed_v3 AS (SELECT CASE
	WHEN windowed_v2.status = 'needs_merging'::text THEN
	CASE
	WHEN windowed_v2.utc_offset < 0::numeric THEN min(
	CASE
	WHEN windowed_v2.status = 'good'::text
	THEN windowed_v2.utc_offset
	ELSE NULL::numeric
	END)
	OVER (PARTITION BY windowed_v2.track ORDER BY windowed_v2.utc_offset, (windowed_v2.status = 'good'::text) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
	ELSE max(
	CASE
	WHEN windowed_v2.status = 'good'::text THEN windowed_v2.utc_offset
	ELSE NULL::numeric
	END)
	OVER (PARTITION BY windowed_v2.track ORDER BY windowed_v2.utc_offset, (windowed_v2.status = 'good'::text) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
	END
	ELSE windowed_v2.utc_offset
	END AS utc_offset,
	windowed_v2.utc_offset AS prev_offset,
	windowed_v2.status,
	windowed_v2.track,
	windowed_v2.users
	FROM windowed_v2),
	-- This query gets up back to utc offsets and the array of users in each bucket
	final_merge AS (SELECT windowed_v3.utc_offset,
	windowed_v3.track,
	cardinality(string_to_array(
	regexp_replace(array_agg(windowed_v3.users::text)::text, '[^0-9,]'::text, ''::text,
	'g'::text), ','::text)) AS num_users,
	string_to_array(
	regexp_replace(array_agg(windowed_v3.users::text)::text, '[^0-9,]'::text, ''::text,
	'g'::text), ','::text) AS users
	FROM windowed_v3
	GROUP BY windowed_v3.utc_offset, windowed_v3.track),
	-- let's unnest now so we can see how big each group is
	unnested AS (SELECT unnest(final_merge.users) AS u,
	cardinality(final_merge.users) AS group_size,
	final_merge.track,
	final_merge.utc_offset
	FROM final_merge),

	-- let's assign group numbers by dividing by 5
	numbered AS (SELECT unnested.u,
	unnested.group_size,
	unnested.track,
	unnested.utc_offset,
	row_number() OVER (PARTITION BY unnested.track, unnested.utc_offset) / 5 AS group_num
	FROM unnested),
	-- now we have the groups ready to go
	matched_groups AS (SELECT numbered.track,
	numbered.group_num + 1 AS group_number,
	numbered.utc_offset,
	array_agg(numbered.u) AS users
	FROM numbered
	GROUP BY numbered.track, (numbered.group_num + 1), numbered.utc_offset),
	-- But the issue is there are some utc offsets with like.. 7 people
	-- We should combine the small straggler groups of 2 or 3
	combo_groups AS (SELECT mg.track,
	mg.group_number AS new_group_number,
	mg2.group_number AS old_group_number,
	mg.utc_offset,
	mg2.users \|\| mg.users AS users
	FROM matched_groups mg
	JOIN matched_groups mg2 ON mg.track = mg2.track AND mg.utc_offset = mg2.utc_offset AND
	mg.group_number <> mg2.group_number AND
	mg.group_number = 1 AND cardinality(mg2.users) < 4
	WHERE cardinality(mg2.users \|\| mg.users) < 7),
	-- bring back the nice name of the utc offset
	offset_names AS (SELECT offsets.utc_offset,
	min(offsets.timezone) AS timezone
	FROM offsets
	GROUP BY offsets.utc_offset),
	-- build the group names
	-- do a FULL OUTER JOIN with combo_groups
	-- If there's a hit with combo groups, pick that group, otherwise filter it out
	final_query AS (SELECT mg.track,
	mg.utc_offset,
	o.timezone AS group_timezone,
	((((('V3'::text \|\| ' '::text) \|\| mg.track) \|\| ' '::text) \|\| o.timezone) \|\| ' '::text) \|\|
	mg.group_number AS group_name,
	COALESCE(cg.new_group_number, mg.group_number) AS group_number,
	COALESCE(cg.users, mg.users) AS users,
	cardinality(COALESCE(cg.users, mg.users)) AS num_users
	FROM matched_groups mg
	FULL JOIN combo_groups cg ON mg.track = cg.track AND
	(mg.group_number = cg.new_group_number OR
	mg.group_number = cg.old_group_number) AND
	mg.utc_offset = cg.utc_offset
	LEFT JOIN offset_names o ON mg.utc_offset = o.utc_offset
	WHERE cg.new_group_number = mg.group_number
	OR cg.new_group_number IS NULL
	ORDER BY mg.track, o.timezone, (COALESCE(cg.new_group_number, mg.group_number)))
	SELECT final_query.track,
	final_query.utc_offset,
	final_query.group_timezone,
	final_query.group_name,
	final_query.group_number,
	final_query.users,
	final_query.num_users
	FROM final_query;