# Import all data, add column headers and run checks ----

# Every input file lives in the same folder, so the location is stated once.
data_dir <- "~/Documents/my blog/million song database/7plus songs"

# Measure columns shared by the per-year and per-genre tables: the average
# duration, the row's song count, then one column per song-length bucket.
measure_cols <- c(
  "avg_duration", "freq",
  "DUR_0to1", "DUR_1to2", "DUR_2to2.5", "DUR_2.5to3", "DUR_3to3.5",
  "DUR_3.5to4", "DUR_4to4.5", "DUR_4.5to5", "DUR_5to6", "DUR_6to7",
  "DUR_7PLUS"
)

# Overall distribution: one row per length bucket with its song count.
# NOTE(review): `dist` masks stats::dist(); the name is kept because the
# plotting code further down refers to it.
dist <- read.delim(file.path(data_dir, "output1.txt"), header = FALSE)
colnames(dist) <- c("length", "freq")
dist

# Per-year table. This is the only input read with read.csv() (comma
# separated); the others are read with read.delim() (tab separated).
dist_time <- read.csv(file.path(data_dir, "output2.txt"), header = FALSE)
colnames(dist_time) <- c("year", measure_cols)
head(dist_time)

# Per-genre tables. Going by the object names these are all genres, the top
# genres and the longest genres -- TODO confirm against the export job.
dist_genre_all <- read.delim(
  file.path(data_dir, "output3a.txt"),
  header = FALSE
)
dist_genre_top <- read.delim(
  file.path(data_dir, "output3b.txt"),
  header = FALSE
)
dist_genre_longest <- read.delim(
  file.path(data_dir, "output3c.txt"),
  header = FALSE
)
colnames(dist_genre_all) <- c("genre", measure_cols)
colnames(dist_genre_top) <- c("genre", measure_cols)
colnames(dist_genre_longest) <- c("genre", measure_cols)
head(dist_genre_all)
head(dist_genre_top)
head(dist_genre_longest)

# Sanity check: average duration over all years, weighting each year's
# average by its song count, i.e. sum(avg_duration * freq) / sum(freq).
weighted.mean(dist_time$avg_duration, dist_time$freq)
# Graphs ----
library(ggplot2)

# Bar chart of the number of songs in each length bucket. Each row of `dist`
# is one bucket, so the bar height is the row's `freq` (via the `weight`
# aesthetic) rather than a count of rows.
# ggplot() + geom_bar() replaces qplot(), which ggplot2 has deprecated; the
# mapping and labels are the same.
ggplot(dist, aes(x = length, weight = freq)) +
  geom_bar(fill = "orange") +
  labs(x = "Song Length", y = "Songs") +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(size = 15, color = "grey40"),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  ) +
  # Human-readable bucket names, in the order the buckets appear on the axis.
  # NOTE(review): assumes `dist$length` has exactly these 11 buckets in this
  # order -- confirm against output1.txt.
  scale_x_discrete(
    labels = c(
      "0 to 1", "1 to 2", "2 to 2.5", "2.5 to 3", "3 to 3.5", "3.5 to 4",
      "4 to 4.5", "4.5 to 5", "5 to 6", "6 to 7", "7+"
    )
  )
# Check the table size before dropping rows by position below.
dim(dist_time)

# Both yearly charts leave out rows 1 and 90.
# NOTE(review): presumably a placeholder year in row 1 and an incomplete
# final year in row 90 -- confirm against output2.txt.
yearly <- dist_time[-c(1, 90), ]

# Axis styling shared by the two line charts in this section.
axis_theme <- theme(
  panel.background = element_blank(),
  axis.text.x = element_text(size = 15, color = "grey40"),
  axis.text.y = element_text(size = 15, color = "grey40"),
  axis.title.x = element_text(size = 18, color = "grey40"),
  axis.title.y = element_text(size = 18, color = "grey40")
)

# Average song length per year. avg_duration is divided by 60, presumably
# converting seconds to minutes.
# `size` is kept (rather than the newer `linewidth`) so the line width is
# honoured on older ggplot2 releases as well.
ggplot(yearly, aes(x = year, y = avg_duration / 60)) +
  ylab("Song Length") +
  xlab("Year") +
  scale_x_continuous(breaks = seq(1920, 2010, 10)) +
  geom_line(color = "orange", size = 1) +
  axis_theme

# Number of songs per year.
ggplot(yearly, aes(x = year, y = freq)) +
  ylab("Number of songs") +
  xlab("Year") +
  scale_x_continuous(breaks = seq(1920, 2010, 10)) +
  geom_line(color = "orange", size = 1) +
  axis_theme
head(dist_time)

# Turn each year's per-bucket values into proportions of that year's `freq`.
# Dividing the data frame of bucket columns by the `freq` vector divides
# every column element-wise, row by row, so one expression covers all of the
# DUR_* columns and keeps their names and order.
bucket_cols <- grep("^DUR_", names(dist_time), value = TRUE)
prop_time <- data.frame(
  year = dist_time$year,
  dist_time[bucket_cols] / dist_time$freq
)
# Line charts of the share of songs in selected length buckets, by year.

# Rows 1:48 and 90 are left out of all four charts.
# NOTE(review): presumably the years before 1970 plus an incomplete final
# year, given the 1970-2010 axis breaks -- confirm against output2.txt.
recent_props <- prop_time[-c(1:48, 90), ]

# Layers shared by every chart in this section; adding a list to a ggplot
# adds each element in turn.
# `size` is kept (rather than the newer `linewidth`) so the line width is
# honoured on older ggplot2 releases as well.
proportion_layers <- list(
  xlab("Year"),
  scale_x_continuous(breaks = seq(1970, 2010, 10)),
  geom_line(color = "orange", size = 1),
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(size = 15, color = "grey40"),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  )
)

# To put the charts below on a common y scale, append `+ ylim(0.0, 0.2)`.

ggplot(recent_props, aes(x = year, y = DUR_7PLUS)) +
  ylab("Proportion of songs over 7 mins long") +
  proportion_layers

# The y-axis labels below now name the bucket that is actually plotted; all
# three previously repeated the "over 7 mins" label from the first chart.
ggplot(recent_props, aes(x = year, y = DUR_3to3.5)) +
  ylab("Proportion of songs 3 to 3.5 mins long") +
  proportion_layers

ggplot(recent_props, aes(x = year, y = DUR_2.5to3)) +
  ylab("Proportion of songs 2.5 to 3 mins long") +
  proportion_layers

ggplot(recent_props, aes(x = year, y = DUR_4to4.5)) +
  ylab("Proportion of songs 4 to 4.5 mins long") +
  proportion_layers
library(reshape2)

# Reshape to long form: one row per (year, bucket) with the proportion in
# `value` and the bucket column name in `variable`. The same rows are dropped
# as in the other proportion charts (1:48 and 90).
melted <- melt(prop_time[-c(1:48, 90), ], id.vars = "year")

# Stacked bar per year, one segment per length bucket. The `weight`
# aesthetic makes each segment's height the proportion itself.
# ggplot() + geom_bar() replaces qplot(), which ggplot2 has deprecated; the
# mapping and labels are the same.
ggplot(melted, aes(x = factor(year), fill = variable, weight = value)) +
  geom_bar() +
  labs(x = "Year", y = "Proportion") +
  theme(
    panel.background = element_blank(),
    # Year labels are rotated so that they do not overlap.
    axis.text.x = element_text(
      size = 15, angle = 90, vjust = 0.5, color = "grey40"
    ),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  ) +
  # One label and one colour per bucket, light (short) to dark (long), in
  # the order of the DUR_* columns of `prop_time`.
  scale_fill_manual(
    labels = c(
      "0 to 1", "1 to 2", "2 to 2.5", "2.5 to 3", "3 to 3.5", "3.5 to 4",
      "4 to 4.5", "4.5 to 5", "5 to 6", "6 to 7", "7+"
    ),
    values = c(
      "#ffffeb", "#ffffcc", "#ffeda0", "#fed976", "#feb24c", "#fd8d3c",
      "#fc4e2a", "#e31a1c", "#bd0026", "#800026", "#400013"
    )
  )
### Visualise the top genres ----
head(dist_genre_all)
head(dist_genre_top)
head(dist_genre_longest)

# Genres with more than 10,000 songs, longest average duration first.
biggenres <- dist_genre_all[dist_genre_all$freq > 10000, ]
ordered <- biggenres[order(-biggenres$avg_duration), ]
# Show up to the first 150 genre names. head() stops at the last row, where
# indexing with 1:150 would pad a shorter table with NA entries.
head(ordered[, 1], 150)
str(dist_genre_longest)

# Bar chart of average song length per genre for the rows passed in.
# `genres` needs the columns `genre2` (a factor whose level order sets the
# bar order) and `avg_duration` (divided by 60 for the y axis, presumably
# seconds to minutes).
# NOTE(review): ylim(0, 8) drops any bar taller than 8 instead of clipping
# it -- confirm that no plotted genre exceeds that.
plot_genre_lengths <- function(genres) {
  ggplot(genres, aes(x = genre2, y = avg_duration / 60)) +
    geom_bar(stat = "identity", fill = "orange") +
    labs(y = "Average Length", x = "Genre") +
    ylim(0, 8) +
    theme(
      panel.background = element_blank(),
      axis.text.x = element_text(
        size = 15, angle = 90, vjust = 0, color = "grey40"
      ),
      axis.text.y = element_text(size = 15, color = "grey40"),
      axis.title.x = element_text(size = 18, color = "grey40"),
      axis.title.y = element_text(size = 18, color = "grey40")
    )
}

# Keep the bars in file order rather than alphabetical order by making the
# genre a factor whose levels follow the rows. unique() guards against a
# repeated genre name, which factor() would reject as a duplicated level.
dist_genre_longest$genre2 <- factor(
  dist_genre_longest$genre,
  levels = unique(as.character(dist_genre_longest$genre))
)
plot_genre_lengths(dist_genre_longest[1:20, ])

dist_genre_top$genre2 <- factor(
  dist_genre_top$genre,
  levels = unique(as.character(dist_genre_top$genre))
)
# Rows 2:21, i.e. the first row is skipped.
# NOTE(review): presumably a blank or catch-all genre -- confirm against
# output3b.txt.
plot_genre_lengths(dist_genre_top[2:21, ])
# (End of script. The GitHub "sign in to comment" page footer that was
# scraped in after the code is not part of the analysis.)