Created
February 20, 2023 12:58
-
-
Save capnfabs/631d4365c01664193d22b61ac73efa5f to your computer and use it in GitHub Desktop.
SQL for exploring musicbrainz data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- grabbed this from a migration somewhere; it's populated by db triggers AFAICT | |
INSERT INTO release_first_release_date ( | |
SELECT DISTINCT ON (release) | |
release, date_year, date_month, date_day | |
FROM ( | |
SELECT release, date_year, date_month, date_day | |
FROM release_country | |
UNION ALL | |
SELECT release, date_year, date_month, date_day | |
FROM release_unknown_country | |
) all_dates | |
ORDER BY | |
release, | |
date_year NULLS LAST, | |
date_month NULLS LAST, | |
date_day NULLS LAST | |
); | |
INSERT INTO recording_first_release_date ( | |
SELECT DISTINCT ON (track.recording) | |
track.recording, | |
rd.year, | |
rd.month, | |
rd.day | |
FROM track | |
JOIN medium ON medium.id = track.medium | |
JOIN release_first_release_date rd ON rd.release = medium.release | |
ORDER BY | |
track.recording, | |
rd.year NULLS LAST, | |
rd.month NULLS LAST, | |
rd.day NULLS LAST | |
); | |
--- END MIGRATIONS | |
-- CATEGORISATION | |
DROP TABLE IF EXISTS recordings_with_categorized_titles_and_first_release_dates; | |
CREATE TABLE IF NOT EXISTS recordings_with_categorized_titles_and_first_release_dates AS | |
(SELECT CASE | |
WHEN recording.name IN ('[untitled]', '(no audio)', '[unknown]') | |
THEN | |
'no_name' | |
-- if it's got something that's not letters, not special characters | |
WHEN recording.name ~ '[^a-zA-Z\s!-/:-@\[-`{-~0-9]' | |
THEN 'something_else' | |
-- if it doesn't have a single letter, it's not cased | |
WHEN recording.name ~ '^[^a-zA-Z]+$' | |
THEN 'something_else' | |
WHEN recording.name ~ '^[a-z_0-9\s\W]+$' | |
THEN 'all_lowercase' | |
WHEN recording.name ~ '^[A-Z_0-9\s\W]+$' | |
THEN 'all_uppercase' | |
ELSE 'mixed_case' | |
END AS category, | |
* | |
FROM recording | |
JOIN recording_first_release_date dates | |
ON recording.id = dates.recording); | |
-- I was exploring by month for a while, but not anymore | |
WITH aggregated AS ( | |
SELECT year, month, category, COUNT(*) AS num_recordings | |
FROM recordings_with_categorized_titles_and_first_release_dates | |
WHERE year >= 1990 AND (year < 2023 OR (year=2023 AND month = 1)) | |
AND month IS NOT NULL | |
GROUP BY 1, 2, 3 | |
), | |
with_total AS ( | |
SELECT year, month, SUM(num_recordings) AS total_recordings_for_month | |
FROM aggregated | |
GROUP BY 1,2 | |
) | |
SELECT year, month, category, num_recordings | |
FROM aggregated JOIN with_total USING (year, month); | |
--- main query | |
WITH aggregated_by_year AS ( | |
SELECT year, category, COUNT(*) AS num_recordings | |
FROM recordings_with_categorized_titles_and_first_release_dates | |
WHERE year >= 1990 AND (year < 2023) | |
AND category NOT IN ('no_name', 'something_else') | |
GROUP BY 1, 2 | |
), | |
with_total AS ( | |
SELECT year, SUM(num_recordings) AS total_recordings | |
FROM aggregated_by_year | |
GROUP BY 1 | |
) | |
, tabulated AS ( | |
SELECT year, category, num_recordings, num_recordings / total_recordings AS num_recordings_frac | |
FROM aggregated_by_year JOIN with_total USING (year)) | |
SELECT year, | |
MAX(num_recordings) FILTER (WHERE category = 'all_lowercase') AS all_lower, | |
MAX(num_recordings_frac) FILTER (WHERE category = 'all_lowercase') AS all_lower_frac, | |
MAX(num_recordings) FILTER (WHERE category = 'all_uppercase') AS all_upper, | |
MAX(num_recordings_frac) FILTER (WHERE category = 'all_uppercase') AS all_upper_frac, | |
MAX(num_recordings) FILTER (WHERE category = 'mixed_case') AS mixed, | |
MAX(num_recordings_frac) FILTER (WHERE category = 'mixed_case') AS mixed_frac | |
FROM tabulated | |
GROUP BY year ORDER BY year | |
; | |
--- count by year | |
SELECT year, COUNT(*) AS num_recordings FROM recordings_with_categorized_titles_and_first_release_dates | |
WHERE year >= 1990 AND (year < 2023) | |
AND category NOT IN ('no_name', 'something_else') | |
GROUP BY 1; | |
--- use this for homing in on specific categories / years | |
SELECT rec.*, artist.name FROM | |
recordings_with_categorized_titles_and_first_release_dates rec | |
JOIN artist ON rec.artist_credit = artist.id | |
WHERE category = 'all_lowercase' | |
AND year = 1990 LIMIT 1000; | |
SELECT COUNT(*) FROM recording; | |
--- UPCASE | |
SELECT rec.*, artist.name FROM | |
recordings_with_categorized_titles_and_first_release_dates rec | |
JOIN artist ON rec.artist_credit = artist.id | |
WHERE category = 'all_uppercase' | |
AND rec.name IN ('I', 'II', 'III', 'IV') LIMIT 1000; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment