Created
August 31, 2013 16:58
-
-
Save swans-one/6399433 to your computer and use it in GitHub Desktop.
Analysis of The Current's playlist data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Filename: playlist-analysis.R | |
library(data.table) | |
library(ggplot2) | |
library(xtable) | |
df <- read.csv("current_playlist_2006-2012.csv", stringsAsFactors = FALSE) | |
df.unclean <- df # so we can revert without re-reading during development | |
# What form the data is in: | |
# > str(df) | |
# 'data.frame': 805604 obs. of 5 variables: | |
# $ id : chr "song133409" "song141983" "song133931" "song133031" ... | |
# $ date : chr "2006-01-01 02:59:00" "2006-01-01 02:55:00" "2006-01-01 02:48:00" "2006-01-01 02:43:00" ... | |
# $ title : chr "Anything (feat. Bent Van Looy + Miki)" "Rock On" "On And On" "Just 1 Kiss" ... | |
# $ artist : chr "Styrofoam" "Jackson and His Computer Band" "Roni Size" "Basement Jaxx" ... | |
# $ starrating: int 4 4 NA 4 5 4 3 NA 3 5 ... | |
######################################## | |
# Data Cleaning and Organizations # | |
######################################## | |
# There are a bunch of songs without titles or artists in this data | |
# set. We're going to get rid of them. | |
df <- df[df$title != "" & df$artist != "",] | |
# There are also a bunch of special programs on The Current which are | |
# not songs in their own right. We'd like to delete them. However, | |
# sometimes they enter into the data set under slightly different | |
# names, or with the name of the host attached. For this reason We're | |
# going to do logical string matching with `grepl` to get rid of them. | |
df <- df[!grepl("local show", df$title, ignore.case = TRUE) | |
& !grepl("musicheads", df$title, ignore.case = TRUE) | |
& !grepl("american routes", df$title, ignore.case = TRUE) | |
& !grepl("sound opinions", df$title, ignore.case = TRUE) | |
& !grepl("passport approved", df$title, ignore.case = TRUE) | |
& !grepl("redefinition radio", df$title, ignore.case = TRUE) | |
& !(grepl("transmission", df$title, ignore.case = TRUE) | |
& grepl("jake rudh", df$artist, ignore.case = TRUE)) | |
& !grepl("in the studio", df$title, ignore.case = TRUE) | |
,] | |
# Year and weekday factors | |
df$year <- factor(strftime(df$date, "%Y")) | |
df$weekday <- factor(strftime(df$date, "%A")) | |
# Finding the ordinal week number within the dataset | |
df$dayofyear <- as.numeric(strftime(df$date, "%j")) | |
df$yearnum <- as.numeric(strftime(df$date, "%Y")) | |
df$leapdays <- 0 | |
df[df$year %in% c(2009, 2010, 2011, 2012),]$leapdays <- 1 | |
df$daynum <- df$dayofyear + 365 * (df$yearnum - 2006) + df$leapdays | |
# FINDING WEEKNUM: | |
# - first day of the data set is a sunday | |
# - Want the first full week to be week 1, | |
df$weeknum <- floor((df$daynum - 2) / 7) + 1 | |
# Create a data.table | |
playlist <- data.table(df[,c("id", "year", "artist", "title", "weeknum", "date")]) | |
setkey(playlist, id, year, artist) | |
# Remove trailing whitespace (extremely common, good for normalization) | |
playlist[,artist := gsub("[[:space:]]*$","",artist)] | |
playlist[,title := gsub("[[:space:]]*$","",title)] | |
# There are a number of songs where the song is in The Current's | |
# database, but still (occasionally) appears with just the song and | |
# artist and an id of "song". Additionally, there are songs in the | |
# database which have multiple versions. We would like to keep the | |
# multiple versions of a song separate, but roll the non-id'd songs in | |
# with the lowest id'd song with the same title/artist combo. | |
setkey(playlist, id, year, artist) | |
noid <- playlist["song"] | |
hasid <- playlist[!"song"] | |
setkey(hasid, title, artist, id) | |
min.id <- hasid[,list(minid = min(id)),by="title,artist"] | |
setkey(min.id, title, artist) | |
setkey(noid, title, artist, year, weeknum, date) | |
fixed.id <- noid[min.id, nomatch=0][, list(id=minid,title,artist,year,weeknum,date)] | |
playlist <- rbind(hasid[,list(id,title,artist,year,weeknum,date)], | |
fixed.id) | |
setkey(playlist, id, year) | |
######################################## | |
# Whole Dataset Summary Stats # | |
######################################## | |
# Total Number of songs played: | |
nrow(df) | |
# Number of unique songs played (that have ids in the current's database): | |
length(unique(df$id)) - 1 | |
# Number of songs without an id number in the database | |
nrow(df[df$id == "song",]) | |
# Number of unique song title/artist combinations for unidentified songs | |
length(unique(with(df[df$id == "song",], paste(title, artist)))) | |
######################################## | |
# Aggregation # | |
######################################## | |
# Song playcount by year: | |
######################################## | |
# We're creating a new data table, `total.number.of.plays` which | |
# collapses all identical songs played in a year into a single row, | |
# adding a `plays` column with the count in that year. | |
total.number.of.plays <- playlist[,list(plays = length(date)), by="id,title,artist,year"][order(-plays)] | |
# Add a column that represents the song's "rank" in a given year (most | |
# plays -> rank 1). | |
setkey(total.number.of.plays, year, plays) | |
total.number.of.plays[,year.rank:=seq(from = .N, to = 1), by = "year"] | |
setkey(total.number.of.plays, year, year.rank) | |
# We update the `total.number.of.plays` table to include a cumulative | |
# sum of the proportion of total plays are the top `n` plays. | |
total.number.of.plays[,total.year.plays:=sum(plays), by=year] | |
total.number.of.plays[,norm.year.plays:=plays/total.year.plays] | |
total.number.of.plays[,cumulative.plays:=cumsum(norm.year.plays), by=year] | |
# Top Songs By Week | |
######################################## | |
top.songs.by.week <- playlist[,list(plays = length(date)), | |
by="id,title,artist,weeknum"][order(-plays)] | |
setkey(top.songs.by.week, weeknum, plays) | |
top.songs.by.week[,week.rank:=seq(from = .N, to = 1), by = "weeknum"] | |
setkey(top.songs.by.week, weeknum, week.rank) | |
######################################## | |
# Tables # | |
######################################## | |
sink("analysisOutput/tables/totalAndUniqueCounts.html") | |
t <- playlist[,list(total = length(id), | |
unique = length(unique(id))), | |
by = "year"] | |
print(xtable(t), type="html") | |
sink() | |
# Playcounts of top ten songs in each year | |
# | |
sink("analysisOutput/tables/topSongsByYear.html", append=TRUE) | |
for (yr in c("2006","2007","2008","2009", "2010","2011", "2012")) { | |
top.tbl <- total.number.of.plays[year==yr,list(title,artist,plays)][1:10] | |
cat(sprintf("\n\n%s\n\n", yr)) | |
print(xtable(top.tbl), type = "html") | |
} | |
sink() | |
sink("analysisOutput/tables/topSongEachWeek.html") | |
print(xtable(top.songs.by.week[week.rank==1, list(weeknum, title, artist)][order(-as.numeric(weeknum))]), type = "html") | |
sink() | |
######################################## | |
# Plotting # | |
######################################## | |
# The following section has a number of interesting charts. Only the | |
# ones surrounded with `png(...)` and `dev.off()` are used in the | |
# post. | |
# Plays vs Most Played rank | |
# By: Year | |
g <- ggplot(total.number.of.plays, aes(x=year.rank, y=plays, color=year)) | |
g <- g + geom_line(size = 2) | |
g <- g + scale_x_continuous(limits=c(0,500)) | |
g <- g + guides(col = guide_legend(reverse = TRUE)) | |
# Cumulative plays % -- Top 1000 songs | |
# By: year | |
# Normalized: % plays/year | |
png(filename = "analysisOutput/images/cumulativeTopProportion.png", | |
height = 600, width = 800) | |
g <- ggplot(total.number.of.plays, | |
aes(x=year.rank, y=cumulative.plays, color=year)) | |
g <- g + geom_line(size=1.2) | |
g <- g + scale_x_continuous(limits = c(0,1000)) | |
g <- g + scale_y_continuous(limits = c(0, .65)) | |
g <- g + guides(col = guide_legend(reverse = TRUE)) | |
g <- g + xlab("Song Rank (by playcount and year)") | |
g <- g + ylab("Cumulative Proportion of Airtime") | |
g <- g + ggtitle("Cumulative Play Proportion of Top 1,000 Songs in Each Year") | |
g <- g + theme_set(theme_gray(base_size = 18)) | |
g | |
dev.off() | |
# Cumulative plays % -- Top 100 songs | |
# By: year | |
# Normalized: % plays/year | |
g <- ggplot(total.number.of.plays, | |
aes(x=year.rank, y=cumulative.plays, color=year)) | |
g <- g + geom_line(size=1.2) | |
g <- g + scale_x_continuous(limits = c(0,100)) | |
g <- g + scale_y_continuous(limits = c(0, .20)) | |
g <- g + guides(col = guide_legend(reverse = TRUE)) | |
g <- g + xlab("Song Rank (by playcount and year)") | |
g <- g + ylab("Cumulative Proportion of Airtime") | |
# Weekly, # of plays of top played song | |
png(filename = "analysisOutput/images/weeklyTopPlays.png", | |
height = 400, width = 800) | |
g <- ggplot(top.songs.by.week[week.rank == 1 | |
& as.numeric(weeknum) < 366 | |
& as.numeric(weeknum) > 0], | |
aes(x=as.numeric(weeknum), | |
y=plays) | |
) | |
g <- g + geom_line() | |
g <- g + xlab("Weeks after January 1st 2006") | |
g <- g + ylab("Number of Plays") | |
g <- g + ggtitle("Plays of Weekly Top Song") | |
g <- g + scale_x_continuous(breaks = c(0, 52, 104, 156, 208, 260, 312, 364)) | |
g <- g + theme_set(theme_gray(base_size = 18)) | |
g | |
dev.off() | |
g <- ggplot(top.songs.by.week[week.rank == 5 | |
& as.numeric(weeknum) < 366 | |
& as.numeric(weeknum) > 0], | |
aes(x=as.numeric(weeknum), | |
y=plays) | |
) | |
g <- g + geom_line() | |
g <- g + xlab("Weeks after January 1st 2006") | |
g <- g + ylab("Number of Plays") | |
g <- g + ggtitle("Plays of Weekly Top Song") | |
g <- g + scale_x_continuous(breaks = c(0, 52, 104, 156, 208, 260, 312, 364)) | |
g <- g + theme_set(theme_gray(base_size = 18)) | |
g | |
# Weekly Top 5 songs. Barchart. | |
g <- ggplot(top.songs.by.week[week.rank <= 5], | |
aes(x=as.numeric(weeknum), | |
y=plays, | |
group=week.rank, | |
fill = week.rank, | |
order = -week.rank) | |
) | |
g <- g + geom_bar(stat="identity", width=1) | |
g <- g + guides(fill = guide_legend(reverse = FALSE)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment