Skip to content

Instantly share code, notes, and snippets.

@jamesthomson
Created January 11, 2015 10:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jamesthomson/112ae4fc61b03add6c4f to your computer and use it in GitHub Desktop.
Save jamesthomson/112ae4fc61b03add6c4f to your computer and use it in GitHub Desktop.
# Import all data, add column headers and run checks ----

# Overall song-length distribution: read as tab-delimited text with no header
# row, so the two columns are named by hand before the table is echoed.
dist <- read.delim(
  "~/Documents/my blog/million song database/7plus songs/output1.txt",
  header = FALSE
)
names(dist) <- c("length", "freq")
dist
# Per-year table: comma-separated with no header row. The columns are labelled
# as the year, the average duration, the song count and eleven duration
# buckets (DUR_*).
dist_time <- read.csv(
  "~/Documents/my blog/million song database/7plus songs/output2.txt",
  header = FALSE
)
names(dist_time) <- c(
  "year", "avg_duration", "freq",
  "DUR_0to1", "DUR_1to2", "DUR_2to2.5", "DUR_2.5to3",
  "DUR_3to3.5", "DUR_3.5to4", "DUR_4to4.5", "DUR_4.5to5",
  "DUR_5to6", "DUR_6to7", "DUR_7PLUS"
)
head(dist_time)
# Genre-level tables: three tab-delimited exports (3a, 3b, 3c) without header
# rows that all receive the same column layout.
dist_genre_all <- read.delim(
  "~/Documents/my blog/million song database/7plus songs/output3a.txt",
  header = FALSE
)
dist_genre_top <- read.delim(
  "~/Documents/my blog/million song database/7plus songs/output3b.txt",
  header = FALSE
)
dist_genre_longest <- read.delim(
  "~/Documents/my blog/million song database/7plus songs/output3c.txt",
  header = FALSE
)

# The header vector is written once and applied to all three tables through a
# chained assignment (evaluated right to left).
names(dist_genre_all) <- names(dist_genre_top) <- names(dist_genre_longest) <- c(
  "genre", "avg_duration", "freq",
  "DUR_0to1", "DUR_1to2", "DUR_2to2.5", "DUR_2.5to3",
  "DUR_3to3.5", "DUR_3.5to4", "DUR_4to4.5", "DUR_4.5to5",
  "DUR_5to6", "DUR_6to7", "DUR_7PLUS"
)

# Eyeball the first rows of each table.
head(dist_genre_all)
head(dist_genre_top)
head(dist_genre_longest)
# Overall mean duration: each year's average weighted by that year's song
# count (result is in the same units as avg_duration).
with(dist_time, sum(avg_duration * freq) / sum(freq))
#graphs
library(ggplot2)
# Bar chart of the number of songs in each length bucket.
# Uses ggplot() + geom_bar() because qplot() is deprecated in current ggplot2;
# the weight aesthetic makes each row of `dist` add its `freq` to the bar.
ggplot(dist, aes(x = length, weight = freq)) +
  geom_bar(fill = "orange") +
  labs(x = "Song Length", y = "Songs") +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(size = 15, color = "grey40"),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  ) +
  # Human-readable tick labels, one per bucket in order.
  scale_x_discrete(
    labels = c(
      "0 to 1", "1 to 2", "2 to 2.5", "2.5 to 3", "3 to 3.5", "3.5 to 4",
      "4 to 4.5", "4.5 to 5", "5 to 6", "6 to 7", "7+"
    )
  )
# Check the table size; the plot below drops rows by hard-coded position.
dim(dist_time)

# Average song length per year. Rows 1 and 90 are excluded, and avg_duration
# is divided by 60 (presumably seconds -> minutes; confirm against the data).
ggplot(dist_time[-c(1, 90), ], aes(x = year, y = avg_duration / 60)) +
  geom_line(color = "orange", size = 1) +
  scale_x_continuous(breaks = seq(1920, 2010, 10)) +
  labs(x = "Year", y = "Song Length") +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(size = 15, color = "grey40"),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  )
# Number of songs per year, with rows 1 and 90 of dist_time excluded.
ggplot(dist_time[-c(1, 90), ], aes(x = year, y = freq)) +
  geom_line(color = "orange", size = 1) +
  scale_x_continuous(breaks = seq(1920, 2010, 10)) +
  labs(x = "Year", y = "Number of songs") +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(size = 15, color = "grey40"),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  )
head(dist_time)

# Turn the per-year bucket counts into proportions of that year's song count.
# Dividing the block of DUR_* columns by the freq vector divides every column
# element-wise by freq (the vector has one value per row), so the result has
# the same eleven columns, in the same order, next to `year`.
prop_time <- data.frame(
  year = dist_time$year,
  dist_time[grep("^DUR_", names(dist_time))] / dist_time$freq
)
# Yearly share of songs in the 7+ minute bucket. Rows 1-48 and 90 of
# prop_time are dropped before plotting.
ggplot(prop_time[-c(1:48, 90), ], aes(x = year, y = DUR_7PLUS)) +
  geom_line(color = "orange", size = 1) +
  scale_x_continuous(breaks = seq(1970, 2010, 10)) +
  labs(x = "Year", y = "Proportion of songs over 7 mins long") +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(size = 15, color = "grey40"),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  )
# Yearly share of songs in the 3 to 3.5 minute bucket. Rows 1-48 and 90 of
# prop_time are dropped before plotting. The y-axis label names the bucket
# that is actually plotted (DUR_3to3.5).
ggplot(prop_time[-c(1:48, 90), ], aes(x = year, y = DUR_3to3.5)) +
  ylab("Proportion of songs 3 to 3.5 mins long") + xlab("Year") +
  scale_x_continuous(breaks = seq(1970, 2010, 10)) +
  # Optional fixed y-range, currently disabled:
  # ylim(0.0, 0.2) +
  geom_line(color = "orange", size = 1) +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(size = 15, color = "grey40"),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  )
# Yearly share of songs in the 2.5 to 3 minute bucket. Rows 1-48 and 90 of
# prop_time are dropped before plotting. The y-axis label names the bucket
# that is actually plotted (DUR_2.5to3).
ggplot(prop_time[-c(1:48, 90), ], aes(x = year, y = DUR_2.5to3)) +
  ylab("Proportion of songs 2.5 to 3 mins long") + xlab("Year") +
  # Optional fixed y-range, currently disabled:
  # ylim(0.0, 0.2) +
  scale_x_continuous(breaks = seq(1970, 2010, 10)) +
  geom_line(color = "orange", size = 1) +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(size = 15, color = "grey40"),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  )
# Yearly share of songs in the 4 to 4.5 minute bucket. Rows 1-48 and 90 of
# prop_time are dropped before plotting. The y-axis label names the bucket
# that is actually plotted (DUR_4to4.5).
ggplot(prop_time[-c(1:48, 90), ], aes(x = year, y = DUR_4to4.5)) +
  ylab("Proportion of songs 4 to 4.5 mins long") + xlab("Year") +
  # Optional fixed y-range, currently disabled:
  # ylim(0.0, 0.2) +
  scale_x_continuous(breaks = seq(1970, 2010, 10)) +
  geom_line(color = "orange", size = 1) +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(size = 15, color = "grey40"),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  )
library(reshape2)
# Reshape the yearly proportions (rows 1-48 and 90 dropped) to long form:
# one row per year/bucket pair, with the bucket name in `variable` and the
# proportion in `value`.
melted <- melt(prop_time[-c(1:48, 90), ], id.vars = "year")

# Stacked bar per year, one coloured segment per duration bucket.
# Uses ggplot() + geom_bar() because qplot() is deprecated in current ggplot2;
# the weight aesthetic makes each row add its `value` to the bar height.
ggplot(melted, aes(x = factor(year), fill = variable, weight = value)) +
  geom_bar() +
  labs(x = "Year", y = "Proportion") +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(size = 15, angle = 90, vjust = 0.5, color = "grey40"),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  ) +
  # Eleven legend labels and eleven colours, matched by position.
  scale_fill_manual(
    labels = c(
      "0 to 1", "1 to 2", "2 to 2.5", "2.5 to 3", "3 to 3.5", "3.5 to 4",
      "4 to 4.5", "4.5 to 5", "5 to 6", "6 to 7", "7+"
    ),
    values = c(
      "#ffffeb", "#ffffcc", "#ffeda0", "#fed976", "#feb24c", "#fd8d3c",
      "#fc4e2a", "#e31a1c", "#bd0026", "#800026", "#400013"
    )
  )
### Visualise the top genres

# Quick look at the three genre tables.
head(dist_genre_all)
head(dist_genre_top)
head(dist_genre_longest)

# Keep genres with more than 10000 songs and sort them by average duration,
# longest first.
biggenres <- dist_genre_all[dist_genre_all$freq > 10000, ]
ordered <- biggenres[order(-biggenres$avg_duration), ]

# Show up to 150 entries of the first column. head() stops at the last
# available row, whereas indexing with 1:150 pads the output with NA when
# fewer than 150 genres pass the filter.
head(ordered[, 1], 150)

str(dist_genre_longest)
# Copy `genre` into a factor whose levels follow the table's row order, so the
# bars are drawn in that order rather than alphabetically.
dist_genre_longest$genre2 <- factor(
  dist_genre_longest$genre,
  levels = as.character(dist_genre_longest$genre)
)

# Average length for rows 1-20 of the table; avg_duration is divided by 60
# (presumably seconds -> minutes; confirm against the data).
ggplot(dist_genre_longest[seq_len(20), ], aes(x = genre2, y = avg_duration / 60)) +
  geom_bar(stat = "identity", fill = "orange") +
  xlab("Genre") +
  ylab("Average Length") +
  ylim(0, 8) +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(size = 15, angle = 90, vjust = 0, color = "grey40"),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  )
# Copy `genre` into a factor whose levels follow the table's row order, so the
# bars are drawn in that order rather than alphabetically.
dist_genre_top$genre2 <- factor(
  dist_genre_top$genre,
  levels = as.character(dist_genre_top$genre)
)

# Average length for rows 2-21 of the table (the first row is skipped);
# avg_duration is divided by 60 (presumably seconds -> minutes; confirm).
ggplot(dist_genre_top[2:21, ], aes(x = genre2, y = avg_duration / 60)) +
  geom_bar(stat = "identity", fill = "orange") +
  xlab("Genre") +
  ylab("Average Length") +
  ylim(0, 8) +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(size = 15, angle = 90, vjust = 0, color = "grey40"),
    axis.text.y = element_text(size = 15, color = "grey40"),
    axis.title.x = element_text(size = 18, color = "grey40"),
    axis.title.y = element_text(size = 18, color = "grey40")
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment