msd song length analysis
DROP TABLE IF EXISTS msd_data; | |
CREATE EXTERNAL TABLE msd_data | |
( | |
ref string, | |
analysis_sample_rate float , | |
artist_7digitalid int , | |
artist_familiarity float , | |
artist_hotttnesss float , | |
artist_id string , | |
artist_latitude float , | |
artist_location string , | |
artist_longitude float , | |
artist_mbid string , | |
artist_mbtags string , | |
artist_mbtags_count int , | |
artist_name string , | |
artist_playmeid int , | |
artist_terms string , | |
artist_terms_freq string , | |
artist_terms_weight string , | |
audio_md5 string , | |
bars_confidence string , | |
bars_start string , | |
beats_confidence string , | |
beats_start string , | |
danceability float , | |
duration float , | |
end_of_fade_in float , | |
energy float , | |
key int , | |
key_confidence float , | |
loudness float , | |
mode int , | |
mode_confidence float , | |
release string , | |
release_7digitalid int , | |
sections_confidence string , | |
sections_start string , | |
segments_confidence string , | |
segments_loudness_max string , | |
segments_loudness_max_time string , | |
segments_loudness_max_start string , | |
segments_pitches string , | |
segments_start string , | |
segments_timbre string , | |
similar_artists string , | |
song_hotttnesss string , | |
song_id string , | |
start_of_fade_out float , | |
tatums_confidence string , | |
tatums_start string , | |
tempo float , | |
time_signature int , | |
time_signature_confidence float , | |
title string , | |
track_id string , | |
year int | |
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' | |
STORED AS TEXTFILE | |
LOCATION '/data/files' | |
; | |
/* | |
select * from msd_data limit 100; | |
select count(1) from msd_data; | |
select artist_name, title from msd_data limit 100; | |
*/ | |
DROP TABLE IF EXISTS msd_song_length_breakdown; | |
CREATE EXTERNAL TABLE msd_song_length_breakdown | |
( | |
DURATION_GROUP STRING, | |
COUNT BIGINT | |
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' | |
STORED AS TEXTFILE | |
LOCATION 's3n://folder/project/file/filename/msd_song_length_breakdown' | |
; | |
INSERT OVERWRITE TABLE msd_song_length_breakdown | |
select | |
case | |
when duration<=60 then '01DUR_0to1' | |
when duration >60 and duration<=120 then '02DUR_1to2' | |
when duration >120 and duration<=150 then '03DUR_2to2.5' | |
when duration >150 and duration<=180 then '04DUR_2.5to3' | |
when duration >180 and duration<=210 then '05DUR_3to3.5' | |
when duration >210 and duration<=240 then '06DUR_3.5to4' | |
when duration >240 and duration<=270 then '07DUR_4to4.5' | |
when duration >270 and duration<=300 then '08DUR_4.5to5' | |
when duration >270 and duration<=330 then '09DUR_5to6' | |
when duration >330 and duration<=390 then '10DUR_6to7' | |
when duration >390 then '11DUR_7PLUS' | |
else NULL | |
end as DURATION_GROUP, | |
count(1) as count | |
from msd_data | |
group by | |
case | |
when duration<=60 then '01DUR_0to1' | |
when duration >60 and duration<=120 then '02DUR_1to2' | |
when duration >120 and duration<=150 then '03DUR_2to2.5' | |
when duration >150 and duration<=180 then '04DUR_2.5to3' | |
when duration >180 and duration<=210 then '05DUR_3to3.5' | |
when duration >210 and duration<=240 then '06DUR_3.5to4' | |
when duration >240 and duration<=270 then '07DUR_4to4.5' | |
when duration >270 and duration<=300 then '08DUR_4.5to5' | |
when duration >270 and duration<=330 then '09DUR_5to6' | |
when duration >330 and duration<=390 then '10DUR_6to7' | |
when duration >390 then '11DUR_7PLUS' | |
else NULL | |
end; | |
select * from msd_song_length_breakdown order by DURATION_GROUP; | |
DROP TABLE IF EXISTS msd_avg_song_length_by_year; | |
CREATE EXTERNAL TABLE msd_avg_song_length_by_year | |
( | |
year int, | |
avg_duration float, | |
total_songs bigint, | |
DUR_0_1 bigint, | |
DUR_1_2 bigint, | |
DUR_2_2P5 bigint, | |
DUR_2P5_3 bigint, | |
DUR_3_3P5 bigint, | |
DUR_3P5_4 bigint, | |
DUR_4_4P5 bigint, | |
DUR_4P5_5 bigint, | |
DUR_5_6 bigint, | |
DUR_6_7 bigint, | |
DUR_7PLUS bigint | |
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' | |
STORED AS TEXTFILE | |
LOCATION 's3n://folder/project/file/filename/msd_avg_song_length_by_year' | |
; | |
INSERT OVERWRITE TABLE msd_avg_song_length_by_year | |
select | |
year, | |
avg(duration), | |
count(1), | |
sum(case when duration<=60 then 1 else 0 end) as DUR_0_1, | |
sum(case when duration >60 and duration<=120 then 1 else 0 end) as DUR_1_2, | |
sum(case when duration >120 and duration<=150 then 1 else 0 end) as DUR_2_2P5, | |
sum(case when duration >150 and duration<=180 then 1 else 0 end) as DUR_2P5_3, | |
sum(case when duration >180 and duration<=210 then 1 else 0 end) as DUR_3_3P5, | |
sum(case when duration >210 and duration<=240 then 1 else 0 end) as DUR_3P5_4, | |
sum(case when duration >240 and duration<=270 then 1 else 0 end) as DUR_4_4P5, | |
sum(case when duration >270 and duration<=300 then 1 else 0 end) as DUR_4P5_5, | |
sum(case when duration >300 and duration<=360 then 1 else 0 end) as DUR_5_6, | |
sum(case when duration >360 and duration<=420 then 1 else 0 end) as DUR_6_7, | |
sum(case when duration >420 then 1 else 0 end) as DUR_7PLUS | |
from msd_data | |
group by year | |
order by year; | |
select * from msd_avg_song_length_by_year; | |
--Genres | |
/* | |
select | |
substr(artist_terms, 1, instr(artist_terms, ",")-1) as first_term, | |
count(1) as freq | |
from msd_sample | |
group by substr(artist_terms, 1, instr(artist_terms, ",")-1) | |
order by freq desc | |
select | |
artist_name, | |
artist_terms, | |
title, | |
genres | |
from msd_sample LATERAL VIEW explode(split(artist_terms, ",")) genretable as genres | |
limit 100; | |
*/ | |
DROP TABLE IF EXISTS msd_avg_song_length_by_genre; | |
CREATE EXTERNAL TABLE msd_avg_song_length_by_genre | |
( | |
genre string, | |
avg_duration float, | |
total_songs bigint, | |
DUR_0_1 bigint, | |
DUR_1_2 bigint, | |
DUR_2_2P5 bigint, | |
DUR_2P5_3 bigint, | |
DUR_3_3P5 bigint, | |
DUR_3P5_4 bigint, | |
DUR_4_4P5 bigint, | |
DUR_4P5_5 bigint, | |
DUR_5_6 bigint, | |
DUR_6_7 bigint, | |
DUR_7PLUS bigint | |
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' | |
STORED AS TEXTFILE | |
LOCATION 's3n://folder/project/file/filename/msd_avg_song_length_by_genre' | |
; | |
INSERT OVERWRITE TABLE msd_avg_song_length_by_genre | |
select | |
genres, | |
avg(duration) as duration, | |
count(1) as freq, | |
sum(case when duration<=60 then 1 else 0 end) as DUR_0_1, | |
sum(case when duration >60 and duration<=120 then 1 else 0 end) as DUR_1_2, | |
sum(case when duration >120 and duration<=150 then 1 else 0 end) as DUR_2_2P5, | |
sum(case when duration >150 and duration<=180 then 1 else 0 end) as DUR_2P5_3, | |
sum(case when duration >180 and duration<=210 then 1 else 0 end) as DUR_3_3P5, | |
sum(case when duration >210 and duration<=240 then 1 else 0 end) as DUR_3P5_4, | |
sum(case when duration >240 and duration<=270 then 1 else 0 end) as DUR_4_4P5, | |
sum(case when duration >270 and duration<=300 then 1 else 0 end) as DUR_4P5_5, | |
sum(case when duration >300 and duration<=360 then 1 else 0 end) as DUR_5_6, | |
sum(case when duration >360 and duration<=420 then 1 else 0 end) as DUR_6_7, | |
sum(case when duration >420 then 1 else 0 end) as DUR_7PLUS | |
from msd_data LATERAL VIEW explode(split(artist_terms, ",")) genretable as genres | |
group by genres | |
--order by duration desc | |
; | |
select * from msd_avg_song_length_by_genre limit 100; | |
--artist lifestage | |
DROP TABLE IF EXISTS msd_artist_career_stage_7min; | |
CREATE EXTERNAL TABLE msd_artist_career_stage_7min | |
( | |
artist_name string, | |
songs bigint, | |
first_song_year bigint, | |
last_song_year bigint, | |
years_active bigint, | |
songs_over_7mins bigint, | |
prop_songs_over_7mins float, | |
first_song_year_over_7mins bigint, | |
last_song_year_over_7mins bigint, | |
perc_into_career_first_record_7min_song float | |
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' | |
STORED AS TEXTFILE | |
LOCATION 's3n://folder/project/file/filename/msd_artist_career_stage_7min' | |
; | |
INSERT OVERWRITE TABLE msd_artist_career_stage_7min | |
select artist_name, | |
count(1) as songs, | |
min(year) as first_song_year, | |
max(year) as last_song_year, | |
max(year)-min(year) as years_active, | |
sum(case when duration >420 then 1 else 0 end) as songs_over_7mins, | |
sum(case when duration >420 then 1 else 0 end)/count(1) as prop_songs_over_7mins, | |
min(case when duration >420 then year end) as first_song_year_over_7mins, | |
max(case when duration >420 then year end) as last_song_year_over_7mins, | |
case | |
when min(case when duration >420 then year end)=min(year) then 0.0 | |
else (min(case when duration >420 then year end)-min(year))/(max(year)-min(year)) | |
end as perc_into_career_first_record_7min_song | |
from msd_data | |
where year<>0 | |
group by artist_name | |
; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment