Skip to content

Instantly share code, notes, and snippets.

@swans-one
Created August 31, 2013 16:58
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save swans-one/6399433 to your computer and use it in GitHub Desktop.
Save swans-one/6399433 to your computer and use it in GitHub Desktop.
Analysis of The Current's playlist data.
# Filename: playlist-analysis.R
library(data.table)
library(ggplot2)
library(xtable)
df <- read.csv("current_playlist_2006-2012.csv", stringsAsFactors = FALSE)
df.unclean <- df # so we can revert without re-reading during development
# What form the data is in:
# > str(df)
# 'data.frame': 805604 obs. of 5 variables:
# $ id : chr "song133409" "song141983" "song133931" "song133031" ...
# $ date : chr "2006-01-01 02:59:00" "2006-01-01 02:55:00" "2006-01-01 02:48:00" "2006-01-01 02:43:00" ...
# $ title : chr "Anything (feat. Bent Van Looy + Miki)" "Rock On" "On And On" "Just 1 Kiss" ...
# $ artist : chr "Styrofoam" "Jackson and His Computer Band" "Roni Size" "Basement Jaxx" ...
# $ starrating: int 4 4 NA 4 5 4 3 NA 3 5 ...
########################################
# Data Cleaning and Organizations #
########################################
# There are a bunch of songs without titles or artists in this data
# set. We're going to get rid of them.
df <- df[df$title != "" & df$artist != "",]
# There are also a bunch of special programs on The Current which are
# not songs in their own right. We'd like to delete them. However,
# sometimes they enter into the data set under slightly different
# names, or with the name of the host attached. For this reason We're
# going to do logical string matching with `grepl` to get rid of them.
df <- df[!grepl("local show", df$title, ignore.case = TRUE)
& !grepl("musicheads", df$title, ignore.case = TRUE)
& !grepl("american routes", df$title, ignore.case = TRUE)
& !grepl("sound opinions", df$title, ignore.case = TRUE)
& !grepl("passport approved", df$title, ignore.case = TRUE)
& !grepl("redefinition radio", df$title, ignore.case = TRUE)
& !(grepl("transmission", df$title, ignore.case = TRUE)
& grepl("jake rudh", df$artist, ignore.case = TRUE))
& !grepl("in the studio", df$title, ignore.case = TRUE)
,]
# Year and weekday factors
df$year <- factor(strftime(df$date, "%Y"))
df$weekday <- factor(strftime(df$date, "%A"))
# Finding the ordinal week number within the dataset
df$dayofyear <- as.numeric(strftime(df$date, "%j"))
df$yearnum <- as.numeric(strftime(df$date, "%Y"))
df$leapdays <- 0
df[df$year %in% c(2009, 2010, 2011, 2012),]$leapdays <- 1
df$daynum <- df$dayofyear + 365 * (df$yearnum - 2006) + df$leapdays
# FINDING WEEKNUM:
# - first day of the data set is a sunday
# - Want the first full week to be week 1,
df$weeknum <- floor((df$daynum - 2) / 7) + 1
# Create a data.table
playlist <- data.table(df[,c("id", "year", "artist", "title", "weeknum", "date")])
setkey(playlist, id, year, artist)
# Remove trailing whitespace (extremely common, good for normalization)
playlist[,artist := gsub("[[:space:]]*$","",artist)]
playlist[,title := gsub("[[:space:]]*$","",title)]
# There are a number of songs where the song is in The Current's
# database, but still (occasionally) appears with just the song and
# artist and an id of "song". Additionally, there are songs in the
# database which have multiple versions. We would like to keep the
# multiple versions of a song separate, but roll the non-id'd songs in
# with the lowest id'd song with the same title/artist combo.
setkey(playlist, id, year, artist)
noid <- playlist["song"]
hasid <- playlist[!"song"]
setkey(hasid, title, artist, id)
min.id <- hasid[,list(minid = min(id)),by="title,artist"]
setkey(min.id, title, artist)
setkey(noid, title, artist, year, weeknum, date)
fixed.id <- noid[min.id, nomatch=0][, list(id=minid,title,artist,year,weeknum,date)]
playlist <- rbind(hasid[,list(id,title,artist,year,weeknum,date)],
fixed.id)
setkey(playlist, id, year)
########################################
# Whole Dataset Summary Stats #
########################################
# Total Number of songs played:
nrow(df)
# Number of unique songs played (that have ids in the current's database):
length(unique(df$id)) - 1
# Number of songs without an id number in the database
nrow(df[df$id == "song",])
# Number of unique song title/artist combinations for unidentified songs
length(unique(with(df[df$id == "song",], paste(title, artist))))
########################################
# Aggregation #
########################################
# Song playcount by year:
########################################
# We're creating a new data table, `total.number.of.plays` which
# collapses all identical songs played in a year into a single row,
# adding a `plays` column with the count in that year.
total.number.of.plays <- playlist[,list(plays = length(date)), by="id,title,artist,year"][order(-plays)]
# Add a column that represents the song's "rank" in a given year (most
# plays -> rank 1).
setkey(total.number.of.plays, year, plays)
total.number.of.plays[,year.rank:=seq(from = .N, to = 1), by = "year"]
setkey(total.number.of.plays, year, year.rank)
# We update the `total.number.of.plays` table to include a cumulative
# sum of the proportion of total plays are the top `n` plays.
total.number.of.plays[,total.year.plays:=sum(plays), by=year]
total.number.of.plays[,norm.year.plays:=plays/total.year.plays]
total.number.of.plays[,cumulative.plays:=cumsum(norm.year.plays), by=year]
# Top Songs By Week
########################################
top.songs.by.week <- playlist[,list(plays = length(date)),
by="id,title,artist,weeknum"][order(-plays)]
setkey(top.songs.by.week, weeknum, plays)
top.songs.by.week[,week.rank:=seq(from = .N, to = 1), by = "weeknum"]
setkey(top.songs.by.week, weeknum, week.rank)
########################################
# Tables #
########################################
sink("analysisOutput/tables/totalAndUniqueCounts.html")
t <- playlist[,list(total = length(id),
unique = length(unique(id))),
by = "year"]
print(xtable(t), type="html")
sink()
# Playcounts of top ten songs in each year
#
sink("analysisOutput/tables/topSongsByYear.html", append=TRUE)
for (yr in c("2006","2007","2008","2009", "2010","2011", "2012")) {
top.tbl <- total.number.of.plays[year==yr,list(title,artist,plays)][1:10]
cat(sprintf("\n\n%s\n\n", yr))
print(xtable(top.tbl), type = "html")
}
sink()
sink("analysisOutput/tables/topSongEachWeek.html")
print(xtable(top.songs.by.week[week.rank==1, list(weeknum, title, artist)][order(-as.numeric(weeknum))]), type = "html")
sink()
########################################
# Plotting #
########################################
# The following section has a number of interesting charts. Only the
# ones surrounded with `png(...)` and `dev.off()` are used in the
# post.
# Plays vs Most Played rank
# By: Year
g <- ggplot(total.number.of.plays, aes(x=year.rank, y=plays, color=year))
g <- g + geom_line(size = 2)
g <- g + scale_x_continuous(limits=c(0,500))
g <- g + guides(col = guide_legend(reverse = TRUE))
# Cumulative plays % -- Top 1000 songs
# By: year
# Normalized: % plays/year
png(filename = "analysisOutput/images/cumulativeTopProportion.png",
height = 600, width = 800)
g <- ggplot(total.number.of.plays,
aes(x=year.rank, y=cumulative.plays, color=year))
g <- g + geom_line(size=1.2)
g <- g + scale_x_continuous(limits = c(0,1000))
g <- g + scale_y_continuous(limits = c(0, .65))
g <- g + guides(col = guide_legend(reverse = TRUE))
g <- g + xlab("Song Rank (by playcount and year)")
g <- g + ylab("Cumulative Proportion of Airtime")
g <- g + ggtitle("Cumulative Play Proportion of Top 1,000 Songs in Each Year")
g <- g + theme_set(theme_gray(base_size = 18))
g
dev.off()
# Cumulative plays % -- Top 100 songs
# By: year
# Normalized: % plays/year
g <- ggplot(total.number.of.plays,
aes(x=year.rank, y=cumulative.plays, color=year))
g <- g + geom_line(size=1.2)
g <- g + scale_x_continuous(limits = c(0,100))
g <- g + scale_y_continuous(limits = c(0, .20))
g <- g + guides(col = guide_legend(reverse = TRUE))
g <- g + xlab("Song Rank (by playcount and year)")
g <- g + ylab("Cumulative Proportion of Airtime")
# Weekly, # of plays of top played song
png(filename = "analysisOutput/images/weeklyTopPlays.png",
height = 400, width = 800)
g <- ggplot(top.songs.by.week[week.rank == 1
& as.numeric(weeknum) < 366
& as.numeric(weeknum) > 0],
aes(x=as.numeric(weeknum),
y=plays)
)
g <- g + geom_line()
g <- g + xlab("Weeks after January 1st 2006")
g <- g + ylab("Number of Plays")
g <- g + ggtitle("Plays of Weekly Top Song")
g <- g + scale_x_continuous(breaks = c(0, 52, 104, 156, 208, 260, 312, 364))
g <- g + theme_set(theme_gray(base_size = 18))
g
dev.off()
g <- ggplot(top.songs.by.week[week.rank == 5
& as.numeric(weeknum) < 366
& as.numeric(weeknum) > 0],
aes(x=as.numeric(weeknum),
y=plays)
)
g <- g + geom_line()
g <- g + xlab("Weeks after January 1st 2006")
g <- g + ylab("Number of Plays")
g <- g + ggtitle("Plays of Weekly Top Song")
g <- g + scale_x_continuous(breaks = c(0, 52, 104, 156, 208, 260, 312, 364))
g <- g + theme_set(theme_gray(base_size = 18))
g
# Weekly Top 5 songs. Barchart.
g <- ggplot(top.songs.by.week[week.rank <= 5],
aes(x=as.numeric(weeknum),
y=plays,
group=week.rank,
fill = week.rank,
order = -week.rank)
)
g <- g + geom_bar(stat="identity", width=1)
g <- g + guides(fill = guide_legend(reverse = FALSE))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment