Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save barrettclark/a9cf7e2f6b69efbdc8de to your computer and use it in GitHub Desktop.
Save barrettclark/a9cf7e2f6b69efbdc8de to your computer and use it in GitHub Desktop.
Box Plot Quartile Exploration
WITH raw_data AS (
SELECT jurisdictions AS county, median_value AS value
FROM maryland_residential_sales_figures
WHERE median_value > 999
), ntiles AS (
SELECT county,
value,
ROW_NUMBER() OVER (PARTITION BY county ORDER BY value) AS row_number,
COUNT(*) OVER (PARTITION BY county ) AS total,
NTILE(2) OVER (PARTITION BY county ORDER BY value) AS bitile
FROM raw_data
), medians AS (
SELECT county, total,
AVG(CASE WHEN total % 2 > 0
THEN MAX(value) FILTER(WHERE bitile=1)
ELSE (MAX(value) FILTER(WHERE bitile=1) + MIN(value) FILTER(WHERE bitile=2)) / 2.0
END
) OVER (PARTITION BY county) as median
FROM ntiles
GROUP BY 1, 2
), quartiles AS (
SELECT ntiles.county,
MEDIAN(value) FILTER(WHERE value < median) as q1,
AVG(median) AS median,
MEDIAN(value) FILTER(WHERE value > median) as q3
FROM ntiles
JOIN medians on medians.county = ntiles.county
GROUP BY 1
ORDER BY 1
)
SELECT quartiles.county,
ARRAY_TO_STRING(
ARRAY_AGG(
CASE WHEN value < q1 - ((q3-q1) * 1.5)
THEN value::VARCHAR ELSE NULL END
), ',') AS lower_outliers,
MIN(CASE WHEN value >= q1 - ((q3-q1) * 1.5)
THEN value ELSE NULL END
) AS minimum,
quartiles.q1,
quartiles.median,
quartiles.q3,
MAX(CASE WHEN value <= q3 + ((q3-q1) * 1.5)
THEN value ELSE NULL END
) AS maximum,
ARRAY_TO_STRING(
ARRAY_AGG(
CASE WHEN value > q3 + ((q3-q1) * 1.5)
THEN value::VARCHAR ELSE NULL END
), ',') AS upper_outliers
FROM quartiles
JOIN raw_data on quartiles.county = raw_data.county
GROUP BY quartiles.county, quartiles.q1, quartiles.median, quartiles.q3
ORDER BY quartiles.county
CREATE OR REPLACE FUNCTION _final_median(anyarray) RETURNS float8 AS $$
WITH q AS
(
SELECT val
FROM unnest($1) val
WHERE VAL IS NOT NULL
ORDER BY 1
),
cnt AS
(
SELECT COUNT(*) AS c FROM q
)
SELECT AVG(val)::float8
FROM
(
SELECT val FROM q
LIMIT 2 - MOD((SELECT c FROM cnt), 2)
OFFSET GREATEST(CEIL((SELECT c FROM cnt) / 2.0) - 1,0)
) q2;
$$ LANGUAGE SQL IMMUTABLE;
CREATE AGGREGATE median(anyelement) (
SFUNC=array_append,
STYPE=anyarray,
FINALFUNC=_final_median,
INITCOND='{}'
);
-- https://blog.modeanalytics.com/how-to-make-box-and-whisker-plot-sql/
WITH raw_data AS (
SELECT jurisdictions AS county, median_value AS value
FROM maryland_residential_sales_figures
WHERE median_value > 999
), details AS (
SELECT county,
value,
ROW_NUMBER() OVER (PARTITION BY county ORDER BY value) AS row_number,
COUNT(*) OVER (PARTITION BY county) AS total
FROM raw_data
), quartiles AS (
-- NOTE: q1 and q3 could potentially include the median if it's repeated in the data
SELECT county,
value,
AVG(CASE WHEN row_number BETWEEN FLOOR(total/2.0)/2.0 AND FLOOR(total/2.0)/2.0 + 1
THEN value ELSE NULL END
) OVER (PARTITION BY county) as q1,
AVG(CASE WHEN row_number BETWEEN total/2.0 AND total/2.0 + 1
THEN value ELSE NULL END
) OVER (PARTITION BY county) AS median,
AVG(CASE WHEN row_number BETWEEN CEIL(total/2.0) + FLOOR(total/2.0)/2.0 AND CEIL(total/2.0) + FLOOR(total/2.0)/2.0 + 1
THEN value/1.0 ELSE NULL END
) OVER (PARTITION BY county) AS q3
FROM details
)
SELECT county,
ARRAY_TO_STRING(
ARRAY_AGG(
CASE WHEN value < q1 - ((q3-q1) * 1.5)
THEN value::VARCHAR ELSE NULL END
), ',') AS lower_outliers,
MIN(CASE WHEN value >= q1 - ((q3-q1) * 1.5)
THEN value ELSE NULL END
) AS minimum,
AVG(q1) AS q1,
AVG(median) AS median,
AVG(q3) AS q3,
MAX(CASE WHEN value <= q3 + ((q3-q1) * 1.5)
THEN value ELSE NULL END
) AS maximum,
ARRAY_TO_STRING(
ARRAY_AGG(
CASE WHEN value > q3 + ((q3-q1) * 1.5)
THEN value::VARCHAR ELSE NULL END
), ',') AS upper_outliers
FROM quartiles
GROUP BY 1
ORDER BY 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment