Skip to content

Instantly share code, notes, and snippets.

@capnfabs
Created February 20, 2023 12:58
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save capnfabs/631d4365c01664193d22b61ac73efa5f to your computer and use it in GitHub Desktop.
SQL for exploring musicbrainz data
-- grabbed this from a migration somewhere; it's populated by db triggers AFAICT
INSERT INTO release_first_release_date (
SELECT DISTINCT ON (release)
release, date_year, date_month, date_day
FROM (
SELECT release, date_year, date_month, date_day
FROM release_country
UNION ALL
SELECT release, date_year, date_month, date_day
FROM release_unknown_country
) all_dates
ORDER BY
release,
date_year NULLS LAST,
date_month NULLS LAST,
date_day NULLS LAST
);
INSERT INTO recording_first_release_date (
SELECT DISTINCT ON (track.recording)
track.recording,
rd.year,
rd.month,
rd.day
FROM track
JOIN medium ON medium.id = track.medium
JOIN release_first_release_date rd ON rd.release = medium.release
ORDER BY
track.recording,
rd.year NULLS LAST,
rd.month NULLS LAST,
rd.day NULLS LAST
);
--- END MIGRATIONS
-- CATEGORISATION
DROP TABLE IF EXISTS recordings_with_categorized_titles_and_first_release_dates;
CREATE TABLE IF NOT EXISTS recordings_with_categorized_titles_and_first_release_dates AS
(SELECT CASE
WHEN recording.name IN ('[untitled]', '(no audio)', '[unknown]')
THEN
'no_name'
-- if it's got something that's not letters, not special characters
WHEN recording.name ~ '[^a-zA-Z\s!-/:-@\[-`{-~0-9]'
THEN 'something_else'
-- if it doesn't have a single letter, it's not cased
WHEN recording.name ~ '^[^a-zA-Z]+$'
THEN 'something_else'
WHEN recording.name ~ '^[a-z_0-9\s\W]+$'
THEN 'all_lowercase'
WHEN recording.name ~ '^[A-Z_0-9\s\W]+$'
THEN 'all_uppercase'
ELSE 'mixed_case'
END AS category,
*
FROM recording
JOIN recording_first_release_date dates
ON recording.id = dates.recording);
-- I was exploring by month for a while, but not anymore
WITH aggregated AS (
SELECT year, month, category, COUNT(*) AS num_recordings
FROM recordings_with_categorized_titles_and_first_release_dates
WHERE year >= 1990 AND (year < 2023 OR (year=2023 AND month = 1))
AND month IS NOT NULL
GROUP BY 1, 2, 3
),
with_total AS (
SELECT year, month, SUM(num_recordings) AS total_recordings_for_month
FROM aggregated
GROUP BY 1,2
)
SELECT year, month, category, num_recordings
FROM aggregated JOIN with_total USING (year, month);
--- main query
WITH aggregated_by_year AS (
SELECT year, category, COUNT(*) AS num_recordings
FROM recordings_with_categorized_titles_and_first_release_dates
WHERE year >= 1990 AND (year < 2023)
AND category NOT IN ('no_name', 'something_else')
GROUP BY 1, 2
),
with_total AS (
SELECT year, SUM(num_recordings) AS total_recordings
FROM aggregated_by_year
GROUP BY 1
)
, tabulated AS (
SELECT year, category, num_recordings, num_recordings / total_recordings AS num_recordings_frac
FROM aggregated_by_year JOIN with_total USING (year))
SELECT year,
MAX(num_recordings) FILTER (WHERE category = 'all_lowercase') AS all_lower,
MAX(num_recordings_frac) FILTER (WHERE category = 'all_lowercase') AS all_lower_frac,
MAX(num_recordings) FILTER (WHERE category = 'all_uppercase') AS all_upper,
MAX(num_recordings_frac) FILTER (WHERE category = 'all_uppercase') AS all_upper_frac,
MAX(num_recordings) FILTER (WHERE category = 'mixed_case') AS mixed,
MAX(num_recordings_frac) FILTER (WHERE category = 'mixed_case') AS mixed_frac
FROM tabulated
GROUP BY year ORDER BY year
;
--- count by year
SELECT year, COUNT(*) AS num_recordings FROM recordings_with_categorized_titles_and_first_release_dates
WHERE year >= 1990 AND (year < 2023)
AND category NOT IN ('no_name', 'something_else')
GROUP BY 1;
--- use this for homing in on specific categories / years
SELECT rec.*, artist.name FROM
recordings_with_categorized_titles_and_first_release_dates rec
JOIN artist ON rec.artist_credit = artist.id
WHERE category = 'all_lowercase'
AND year = 1990 LIMIT 1000;
SELECT COUNT(*) FROM recording;
--- UPCASE
SELECT rec.*, artist.name FROM
recordings_with_categorized_titles_and_first_release_dates rec
JOIN artist ON rec.artist_credit = artist.id
WHERE category = 'all_uppercase'
AND rec.name IN ('I', 'II', 'III', 'IV') LIMIT 1000;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment