Created
December 29, 2015 19:28
-
-
Save barrettclark/a9cf7e2f6b69efbdc8de to your computer and use it in GitHub Desktop.
Box Plot Quartile Exploration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH raw_data AS ( | |
SELECT jurisdictions AS county, median_value AS value | |
FROM maryland_residential_sales_figures | |
WHERE median_value > 999 | |
), ntiles AS ( | |
SELECT county, | |
value, | |
ROW_NUMBER() OVER (PARTITION BY county ORDER BY value) AS row_number, | |
COUNT(*) OVER (PARTITION BY county ) AS total, | |
NTILE(2) OVER (PARTITION BY county ORDER BY value) AS bitile | |
FROM raw_data | |
), medians AS ( | |
SELECT county, total, | |
AVG(CASE WHEN total % 2 > 0 | |
THEN MAX(value) FILTER(WHERE bitile=1) | |
ELSE (MAX(value) FILTER(WHERE bitile=1) + MIN(value) FILTER(WHERE bitile=2)) / 2.0 | |
END | |
) OVER (PARTITION BY county) as median | |
FROM ntiles | |
GROUP BY 1, 2 | |
), quartiles AS ( | |
SELECT ntiles.county, | |
MEDIAN(value) FILTER(WHERE value < median) as q1, | |
AVG(median) AS median, | |
MEDIAN(value) FILTER(WHERE value > median) as q3 | |
FROM ntiles | |
JOIN medians on medians.county = ntiles.county | |
GROUP BY 1 | |
ORDER BY 1 | |
) | |
SELECT quartiles.county, | |
ARRAY_TO_STRING( | |
ARRAY_AGG( | |
CASE WHEN value < q1 - ((q3-q1) * 1.5) | |
THEN value::VARCHAR ELSE NULL END | |
), ',') AS lower_outliers, | |
MIN(CASE WHEN value >= q1 - ((q3-q1) * 1.5) | |
THEN value ELSE NULL END | |
) AS minimum, | |
quartiles.q1, | |
quartiles.median, | |
quartiles.q3, | |
MAX(CASE WHEN value <= q3 + ((q3-q1) * 1.5) | |
THEN value ELSE NULL END | |
) AS maximum, | |
ARRAY_TO_STRING( | |
ARRAY_AGG( | |
CASE WHEN value > q3 + ((q3-q1) * 1.5) | |
THEN value::VARCHAR ELSE NULL END | |
), ',') AS upper_outliers | |
FROM quartiles | |
JOIN raw_data on quartiles.county = raw_data.county | |
GROUP BY quartiles.county, quartiles.q1, quartiles.median, quartiles.q3 | |
ORDER BY quartiles.county |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE OR REPLACE FUNCTION _final_median(anyarray) RETURNS float8 AS $$ | |
WITH q AS | |
( | |
SELECT val | |
FROM unnest($1) val | |
WHERE VAL IS NOT NULL | |
ORDER BY 1 | |
), | |
cnt AS | |
( | |
SELECT COUNT(*) AS c FROM q | |
) | |
SELECT AVG(val)::float8 | |
FROM | |
( | |
SELECT val FROM q | |
LIMIT 2 - MOD((SELECT c FROM cnt), 2) | |
OFFSET GREATEST(CEIL((SELECT c FROM cnt) / 2.0) - 1,0) | |
) q2; | |
$$ LANGUAGE SQL IMMUTABLE; | |
CREATE AGGREGATE median(anyelement) ( | |
SFUNC=array_append, | |
STYPE=anyarray, | |
FINALFUNC=_final_median, | |
INITCOND='{}' | |
); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- https://blog.modeanalytics.com/how-to-make-box-and-whisker-plot-sql/ | |
WITH raw_data AS ( | |
SELECT jurisdictions AS county, median_value AS value | |
FROM maryland_residential_sales_figures | |
WHERE median_value > 999 | |
), details AS ( | |
SELECT county, | |
value, | |
ROW_NUMBER() OVER (PARTITION BY county ORDER BY value) AS row_number, | |
COUNT(*) OVER (PARTITION BY county) AS total | |
FROM raw_data | |
), quartiles AS ( | |
-- NOTE: q1 and q3 could potentially include the median if it's repeated in the data | |
SELECT county, | |
value, | |
AVG(CASE WHEN row_number BETWEEN FLOOR(total/2.0)/2.0 AND FLOOR(total/2.0)/2.0 + 1 | |
THEN value ELSE NULL END | |
) OVER (PARTITION BY county) as q1, | |
AVG(CASE WHEN row_number BETWEEN total/2.0 AND total/2.0 + 1 | |
THEN value ELSE NULL END | |
) OVER (PARTITION BY county) AS median, | |
AVG(CASE WHEN row_number BETWEEN CEIL(total/2.0) + FLOOR(total/2.0)/2.0 AND CEIL(total/2.0) + FLOOR(total/2.0)/2.0 + 1 | |
THEN value/1.0 ELSE NULL END | |
) OVER (PARTITION BY county) AS q3 | |
FROM details | |
) | |
SELECT county, | |
ARRAY_TO_STRING( | |
ARRAY_AGG( | |
CASE WHEN value < q1 - ((q3-q1) * 1.5) | |
THEN value::VARCHAR ELSE NULL END | |
), ',') AS lower_outliers, | |
MIN(CASE WHEN value >= q1 - ((q3-q1) * 1.5) | |
THEN value ELSE NULL END | |
) AS minimum, | |
AVG(q1) AS q1, | |
AVG(median) AS median, | |
AVG(q3) AS q3, | |
MAX(CASE WHEN value <= q3 + ((q3-q1) * 1.5) | |
THEN value ELSE NULL END | |
) AS maximum, | |
ARRAY_TO_STRING( | |
ARRAY_AGG( | |
CASE WHEN value > q3 + ((q3-q1) * 1.5) | |
THEN value::VARCHAR ELSE NULL END | |
), ',') AS upper_outliers | |
FROM quartiles | |
GROUP BY 1 | |
ORDER BY 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment