Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
msd song length analysis
DROP TABLE IF EXISTS msd_data;
CREATE EXTERNAL TABLE msd_data
(
ref string,
analysis_sample_rate float ,
artist_7digitalid int ,
artist_familiarity float ,
artist_hotttnesss float ,
artist_id string ,
artist_latitude float ,
artist_location string ,
artist_longitude float ,
artist_mbid string ,
artist_mbtags string ,
artist_mbtags_count int ,
artist_name string ,
artist_playmeid int ,
artist_terms string ,
artist_terms_freq string ,
artist_terms_weight string ,
audio_md5 string ,
bars_confidence string ,
bars_start string ,
beats_confidence string ,
beats_start string ,
danceability float ,
duration float ,
end_of_fade_in float ,
energy float ,
key int ,
key_confidence float ,
loudness float ,
mode int ,
mode_confidence float ,
release string ,
release_7digitalid int ,
sections_confidence string ,
sections_start string ,
segments_confidence string ,
segments_loudness_max string ,
segments_loudness_max_time string ,
segments_loudness_max_start string ,
segments_pitches string ,
segments_start string ,
segments_timbre string ,
similar_artists string ,
song_hotttnesss string ,
song_id string ,
start_of_fade_out float ,
tatums_confidence string ,
tatums_start string ,
tempo float ,
time_signature int ,
time_signature_confidence float ,
title string ,
track_id string ,
year int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/data/files'
;
/*
select * from msd_data limit 100;
select count(1) from msd_data;
select artist_name, title from msd_data limit 100;
*/
DROP TABLE IF EXISTS msd_song_length_breakdown;
CREATE EXTERNAL TABLE msd_song_length_breakdown
(
DURATION_GROUP STRING,
COUNT BIGINT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION 's3n://folder/project/file/filename/msd_song_length_breakdown'
;
INSERT OVERWRITE TABLE msd_song_length_breakdown
select
case
when duration<=60 then '01DUR_0to1'
when duration >60 and duration<=120 then '02DUR_1to2'
when duration >120 and duration<=150 then '03DUR_2to2.5'
when duration >150 and duration<=180 then '04DUR_2.5to3'
when duration >180 and duration<=210 then '05DUR_3to3.5'
when duration >210 and duration<=240 then '06DUR_3.5to4'
when duration >240 and duration<=270 then '07DUR_4to4.5'
when duration >270 and duration<=300 then '08DUR_4.5to5'
when duration >270 and duration<=330 then '09DUR_5to6'
when duration >330 and duration<=390 then '10DUR_6to7'
when duration >390 then '11DUR_7PLUS'
else NULL
end as DURATION_GROUP,
count(1) as count
from msd_data
group by
case
when duration<=60 then '01DUR_0to1'
when duration >60 and duration<=120 then '02DUR_1to2'
when duration >120 and duration<=150 then '03DUR_2to2.5'
when duration >150 and duration<=180 then '04DUR_2.5to3'
when duration >180 and duration<=210 then '05DUR_3to3.5'
when duration >210 and duration<=240 then '06DUR_3.5to4'
when duration >240 and duration<=270 then '07DUR_4to4.5'
when duration >270 and duration<=300 then '08DUR_4.5to5'
when duration >270 and duration<=330 then '09DUR_5to6'
when duration >330 and duration<=390 then '10DUR_6to7'
when duration >390 then '11DUR_7PLUS'
else NULL
end;
select * from msd_song_length_breakdown order by DURATION_GROUP;
DROP TABLE IF EXISTS msd_avg_song_length_by_year;
CREATE EXTERNAL TABLE msd_avg_song_length_by_year
(
year int,
avg_duration float,
total_songs bigint,
DUR_0_1 bigint,
DUR_1_2 bigint,
DUR_2_2P5 bigint,
DUR_2P5_3 bigint,
DUR_3_3P5 bigint,
DUR_3P5_4 bigint,
DUR_4_4P5 bigint,
DUR_4P5_5 bigint,
DUR_5_6 bigint,
DUR_6_7 bigint,
DUR_7PLUS bigint
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION 's3n://folder/project/file/filename/msd_avg_song_length_by_year'
;
INSERT OVERWRITE TABLE msd_avg_song_length_by_year
select
year,
avg(duration),
count(1),
sum(case when duration<=60 then 1 else 0 end) as DUR_0_1,
sum(case when duration >60 and duration<=120 then 1 else 0 end) as DUR_1_2,
sum(case when duration >120 and duration<=150 then 1 else 0 end) as DUR_2_2P5,
sum(case when duration >150 and duration<=180 then 1 else 0 end) as DUR_2P5_3,
sum(case when duration >180 and duration<=210 then 1 else 0 end) as DUR_3_3P5,
sum(case when duration >210 and duration<=240 then 1 else 0 end) as DUR_3P5_4,
sum(case when duration >240 and duration<=270 then 1 else 0 end) as DUR_4_4P5,
sum(case when duration >270 and duration<=300 then 1 else 0 end) as DUR_4P5_5,
sum(case when duration >300 and duration<=360 then 1 else 0 end) as DUR_5_6,
sum(case when duration >360 and duration<=420 then 1 else 0 end) as DUR_6_7,
sum(case when duration >420 then 1 else 0 end) as DUR_7PLUS
from msd_data
group by year
order by year;
select * from msd_avg_song_length_by_year;
--Genres
/*
select
substr(artist_terms, 1, instr(artist_terms, ",")-1) as first_term,
count(1) as freq
from msd_sample
group by substr(artist_terms, 1, instr(artist_terms, ",")-1)
order by freq desc
select
artist_name,
artist_terms,
title,
genres
from msd_sample LATERAL VIEW explode(split(artist_terms, ",")) genretable as genres
limit 100;
*/
DROP TABLE IF EXISTS msd_avg_song_length_by_genre;
CREATE EXTERNAL TABLE msd_avg_song_length_by_genre
(
genre string,
avg_duration float,
total_songs bigint,
DUR_0_1 bigint,
DUR_1_2 bigint,
DUR_2_2P5 bigint,
DUR_2P5_3 bigint,
DUR_3_3P5 bigint,
DUR_3P5_4 bigint,
DUR_4_4P5 bigint,
DUR_4P5_5 bigint,
DUR_5_6 bigint,
DUR_6_7 bigint,
DUR_7PLUS bigint
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION 's3n://folder/project/file/filename/msd_avg_song_length_by_genre'
;
INSERT OVERWRITE TABLE msd_avg_song_length_by_genre
select
genres,
avg(duration) as duration,
count(1) as freq,
sum(case when duration<=60 then 1 else 0 end) as DUR_0_1,
sum(case when duration >60 and duration<=120 then 1 else 0 end) as DUR_1_2,
sum(case when duration >120 and duration<=150 then 1 else 0 end) as DUR_2_2P5,
sum(case when duration >150 and duration<=180 then 1 else 0 end) as DUR_2P5_3,
sum(case when duration >180 and duration<=210 then 1 else 0 end) as DUR_3_3P5,
sum(case when duration >210 and duration<=240 then 1 else 0 end) as DUR_3P5_4,
sum(case when duration >240 and duration<=270 then 1 else 0 end) as DUR_4_4P5,
sum(case when duration >270 and duration<=300 then 1 else 0 end) as DUR_4P5_5,
sum(case when duration >300 and duration<=360 then 1 else 0 end) as DUR_5_6,
sum(case when duration >360 and duration<=420 then 1 else 0 end) as DUR_6_7,
sum(case when duration >420 then 1 else 0 end) as DUR_7PLUS
from msd_data LATERAL VIEW explode(split(artist_terms, ",")) genretable as genres
group by genres
--order by duration desc
;
select * from msd_avg_song_length_by_genre limit 100;
--artist lifestage
DROP TABLE IF EXISTS msd_artist_career_stage_7min;
CREATE EXTERNAL TABLE msd_artist_career_stage_7min
(
artist_name string,
songs bigint,
first_song_year bigint,
last_song_year bigint,
years_active bigint,
songs_over_7mins bigint,
prop_songs_over_7mins float,
first_song_year_over_7mins bigint,
last_song_year_over_7mins bigint,
perc_into_career_first_record_7min_song float
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION 's3n://folder/project/file/filename/msd_artist_career_stage_7min'
;
INSERT OVERWRITE TABLE msd_artist_career_stage_7min
select artist_name,
count(1) as songs,
min(year) as first_song_year,
max(year) as last_song_year,
max(year)-min(year) as years_active,
sum(case when duration >420 then 1 else 0 end) as songs_over_7mins,
sum(case when duration >420 then 1 else 0 end)/count(1) as prop_songs_over_7mins,
min(case when duration >420 then year end) as first_song_year_over_7mins,
max(case when duration >420 then year end) as last_song_year_over_7mins,
case
when min(case when duration >420 then year end)=min(year) then 0.0
else (min(case when duration >420 then year end)-min(year))/(max(year)-min(year))
end as perc_into_career_first_record_7min_song
from msd_data
where year<>0
group by artist_name
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment