swans-one/playlist-analysis.R

## playlist-analysis.R
# Filename: playlist-analysis.R

library(data.table)
library(ggplot2)
library(xtable)

df <- read.csv("current_playlist_2006-2012.csv", stringsAsFactors = FALSE)
df.unclean <- df # so we can revert without re-reading during development

# What form the data is in:
# > str(df)
# 'data.frame':  805604 obs. of  5 variables:
#  $ id        : chr  "song133409" "song141983" "song133931" "song133031" ...
#  $ date      : chr  "2006-01-01 02:59:00" "2006-01-01 02:55:00" "2006-01-01 02:48:00" "2006-01-01 02:43:00" ...
#  $ title     : chr  "Anything (feat. Bent Van Looy + Miki)" "Rock On" "On And On" "Just 1 Kiss" ...
#  $ artist    : chr  "Styrofoam" "Jackson and His Computer Band" "Roni Size" "Basement Jaxx" ...
#  $ starrating: int  4 4 NA 4 5 4 3 NA 3 5 ...

########################################
#   Data Cleaning and Organizations    #
########################################

# There are a bunch of songs without titles or artists in this data
# set. We're going to get rid of them.
df <- df[df$title != "" & df$artist != "",]

# There are also a bunch of special programs on The Current which are
# not songs in their own right. We'd like to delete them. However,
# sometimes they enter into the data set under slightly different
# names, or with the name of the host attached. For this reason We're
# going to do logical string matching with `grepl` to get rid of them.
df <- df[!grepl("local show", df$title, ignore.case = TRUE)
         & !grepl("musicheads", df$title, ignore.case = TRUE)
         & !grepl("american routes", df$title, ignore.case = TRUE)
         & !grepl("sound opinions", df$title, ignore.case = TRUE)
         & !grepl("passport approved", df$title, ignore.case = TRUE)
         & !grepl("redefinition radio", df$title, ignore.case = TRUE)
         & !(grepl("transmission", df$title, ignore.case = TRUE)
             & grepl("jake rudh", df$artist, ignore.case = TRUE))
         & !grepl("in the studio", df$title, ignore.case = TRUE)
         ,]

# Year and weekday factors
df$year <- factor(strftime(df$date, "%Y"))
df$weekday <- factor(strftime(df$date, "%A"))

# Finding the ordinal week number within the dataset
df$dayofyear <- as.numeric(strftime(df$date, "%j"))
df$yearnum <- as.numeric(strftime(df$date, "%Y"))
df$leapdays <- 0
df[df$year %in% c(2009, 2010, 2011, 2012),]$leapdays <- 1
df$daynum <- df$dayofyear + 365 * (df$yearnum - 2006) + df$leapdays
# FINDING WEEKNUM:
#   - first day of the data set is a sunday
#   - Want the first full week to be week 1,
df$weeknum <- floor((df$daynum - 2) / 7) + 1

# Create a data.table
playlist <- data.table(df[,c("id", "year", "artist", "title", "weeknum", "date")])
setkey(playlist, id, year, artist)

# Remove trailing whitespace (extremely common, good for normalization)
playlist[,artist := gsub("[[:space:]]*$","",artist)]
playlist[,title := gsub("[[:space:]]*$","",title)]

# There are a number of songs where the song is in The Current's
# database, but still (occasionally) appears with just the song and
# artist and an id of "song". Additionally, there are songs in the
# database which have multiple versions. We would like to keep the
# multiple versions of a song separate, but roll the non-id'd songs in
# with the lowest id'd song with the same title/artist combo.
setkey(playlist, id, year, artist)
noid <- playlist["song"]
hasid <- playlist[!"song"]

setkey(hasid, title, artist, id)
min.id <- hasid[,list(minid = min(id)),by="title,artist"]
setkey(min.id, title, artist)
setkey(noid, title, artist, year, weeknum, date)

fixed.id <- noid[min.id, nomatch=0][, list(id=minid,title,artist,year,weeknum,date)]

playlist <- rbind(hasid[,list(id,title,artist,year,weeknum,date)],
                  fixed.id)
setkey(playlist, id, year)

########################################
#     Whole Dataset Summary Stats      #
########################################

# Total Number of songs played:
nrow(df)

# Number of unique songs played (that have ids in the current's database):
length(unique(df$id)) - 1

# Number of songs without an id number in the database
nrow(df[df$id == "song",])

# Number of unique song title/artist combinations for unidentified songs
length(unique(with(df[df$id == "song",], paste(title, artist))))

########################################
#             Aggregation              #
########################################

# Song playcount by year:
########################################

# We're creating a new data table, `total.number.of.plays` which
# collapses all identical songs played in a year into a single row,
# adding a `plays` column with the count in that year.
total.number.of.plays <- playlist[,list(plays = length(date)), by="id,title,artist,year"][order(-plays)]

# Add a column that represents the song's "rank" in a given year (most
# plays -> rank 1).
setkey(total.number.of.plays, year, plays)
total.number.of.plays[,year.rank:=seq(from = .N, to = 1), by = "year"]
setkey(total.number.of.plays, year, year.rank)

# We update the `total.number.of.plays` table to include a cumulative
# sum of the proportion of total plays are the top `n` plays.
total.number.of.plays[,total.year.plays:=sum(plays), by=year]
total.number.of.plays[,norm.year.plays:=plays/total.year.plays]
total.number.of.plays[,cumulative.plays:=cumsum(norm.year.plays), by=year]


# Top Songs By Week
########################################
top.songs.by.week <- playlist[,list(plays = length(date)),
                              by="id,title,artist,weeknum"][order(-plays)]
setkey(top.songs.by.week, weeknum, plays)
top.songs.by.week[,week.rank:=seq(from = .N, to = 1), by = "weeknum"]
setkey(top.songs.by.week, weeknum, week.rank)

########################################
#                Tables                #
########################################

sink("analysisOutput/tables/totalAndUniqueCounts.html")
t <- playlist[,list(total = length(id),
                    unique = length(unique(id))),
              by = "year"]
print(xtable(t), type="html")
sink()

# Playcounts of top ten songs in each year
#
sink("analysisOutput/tables/topSongsByYear.html", append=TRUE)
for (yr in c("2006","2007","2008","2009", "2010","2011", "2012")) {
  top.tbl <- total.number.of.plays[year==yr,list(title,artist,plays)][1:10]
  cat(sprintf("\n\n%s\n\n", yr))
  print(xtable(top.tbl), type = "html")
}
sink()

sink("analysisOutput/tables/topSongEachWeek.html")
print(xtable(top.songs.by.week[week.rank==1, list(weeknum, title, artist)][order(-as.numeric(weeknum))]), type = "html")
sink()


########################################
#               Plotting               #
########################################

# The following section has a number of interesting charts. Only the
# ones surrounded with `png(...)` and `dev.off()` are used in the
# post.


# Plays vs Most Played rank
#    By: Year
g <- ggplot(total.number.of.plays, aes(x=year.rank, y=plays, color=year))
g <- g + geom_line(size = 2)
g <- g + scale_x_continuous(limits=c(0,500))
g <- g + guides(col = guide_legend(reverse = TRUE))

# Cumulative plays % -- Top 1000 songs
#    By: year
#    Normalized: % plays/year
png(filename = "analysisOutput/images/cumulativeTopProportion.png",
    height = 600, width = 800)
g <- ggplot(total.number.of.plays,
            aes(x=year.rank, y=cumulative.plays, color=year))
g <- g + geom_line(size=1.2)
g <- g + scale_x_continuous(limits = c(0,1000))
g <- g + scale_y_continuous(limits = c(0, .65))
g <- g + guides(col = guide_legend(reverse = TRUE))
g <- g + xlab("Song Rank (by playcount and year)")
g <- g + ylab("Cumulative Proportion of Airtime")
g <- g + ggtitle("Cumulative Play Proportion of Top 1,000 Songs in Each Year")
g <- g + theme_set(theme_gray(base_size = 18))
g
dev.off()

# Cumulative plays % -- Top 100 songs
#    By: year
#    Normalized: % plays/year
g <- ggplot(total.number.of.plays,
            aes(x=year.rank, y=cumulative.plays, color=year))
g <- g + geom_line(size=1.2)
g <- g + scale_x_continuous(limits = c(0,100))
g <- g + scale_y_continuous(limits = c(0, .20))
g <- g + guides(col = guide_legend(reverse = TRUE))
g <- g + xlab("Song Rank (by playcount and year)")
g <- g + ylab("Cumulative Proportion of Airtime")

# Weekly, # of plays of top played song
png(filename = "analysisOutput/images/weeklyTopPlays.png",
    height = 400, width = 800)
g <- ggplot(top.songs.by.week[week.rank == 1
                    & as.numeric(weeknum) < 366
                    & as.numeric(weeknum) > 0],
            aes(x=as.numeric(weeknum),
                y=plays)
            )
g <- g + geom_line()
g <- g + xlab("Weeks after January 1st 2006")
g <- g + ylab("Number of Plays")
g <- g + ggtitle("Plays of Weekly Top Song")
g <- g + scale_x_continuous(breaks = c(0, 52, 104, 156, 208, 260, 312, 364))
g <- g + theme_set(theme_gray(base_size = 18))
g
dev.off()

g <- ggplot(top.songs.by.week[week.rank == 5
                    & as.numeric(weeknum) < 366
                    & as.numeric(weeknum) > 0],
            aes(x=as.numeric(weeknum),
                y=plays)
            )
g <- g + geom_line()
g <- g + xlab("Weeks after January 1st 2006")
g <- g + ylab("Number of Plays")
g <- g + ggtitle("Plays of Weekly Top Song")
g <- g + scale_x_continuous(breaks = c(0, 52, 104, 156, 208, 260, 312, 364))
g <- g + theme_set(theme_gray(base_size = 18))
g


# Weekly Top 5 songs. Barchart.
g <- ggplot(top.songs.by.week[week.rank <= 5],
            aes(x=as.numeric(weeknum),
                y=plays,
                group=week.rank,
                fill = week.rank,
                order = -week.rank)
            )
g <- g + geom_bar(stat="identity", width=1)
g <- g + guides(fill = guide_legend(reverse = FALSE))
	# Filename: playlist-analysis.R

	library(data.table)
	library(ggplot2)
	library(xtable)

	df <- read.csv("current_playlist_2006-2012.csv", stringsAsFactors = FALSE)
	df.unclean <- df # so we can revert without re-reading during development

	# What form the data is in:
	# > str(df)
	# 'data.frame': 805604 obs. of 5 variables:
	# $ id : chr "song133409" "song141983" "song133931" "song133031" ...
	# $ date : chr "2006-01-01 02:59:00" "2006-01-01 02:55:00" "2006-01-01 02:48:00" "2006-01-01 02:43:00" ...
	# $ title : chr "Anything (feat. Bent Van Looy + Miki)" "Rock On" "On And On" "Just 1 Kiss" ...
	# $ artist : chr "Styrofoam" "Jackson and His Computer Band" "Roni Size" "Basement Jaxx" ...
	# $ starrating: int 4 4 NA 4 5 4 3 NA 3 5 ...

	########################################
	# Data Cleaning and Organizations #
	########################################

	# There are a bunch of songs without titles or artists in this data
	# set. We're going to get rid of them.
	df <- df[df$title != "" & df$artist != "",]

	# There are also a bunch of special programs on The Current which are
	# not songs in their own right. We'd like to delete them. However,
	# sometimes they enter into the data set under slightly different
	# names, or with the name of the host attached. For this reason We're
	# going to do logical string matching with `grepl` to get rid of them.
	df <- df[!grepl("local show", df$title, ignore.case = TRUE)
	& !grepl("musicheads", df$title, ignore.case = TRUE)
	& !grepl("american routes", df$title, ignore.case = TRUE)
	& !grepl("sound opinions", df$title, ignore.case = TRUE)
	& !grepl("passport approved", df$title, ignore.case = TRUE)
	& !grepl("redefinition radio", df$title, ignore.case = TRUE)
	& !(grepl("transmission", df$title, ignore.case = TRUE)
	& grepl("jake rudh", df$artist, ignore.case = TRUE))
	& !grepl("in the studio", df$title, ignore.case = TRUE)
	,]

	# Year and weekday factors
	df$year <- factor(strftime(df$date, "%Y"))
	df$weekday <- factor(strftime(df$date, "%A"))

	# Finding the ordinal week number within the dataset
	df$dayofyear <- as.numeric(strftime(df$date, "%j"))
	df$yearnum <- as.numeric(strftime(df$date, "%Y"))
	df$leapdays <- 0
	df[df$year %in% c(2009, 2010, 2011, 2012),]$leapdays <- 1
	df$daynum <- df$dayofyear + 365 * (df$yearnum - 2006) + df$leapdays
	# FINDING WEEKNUM:
	# - first day of the data set is a sunday
	# - Want the first full week to be week 1,
	df$weeknum <- floor((df$daynum - 2) / 7) + 1

	# Create a data.table
	playlist <- data.table(df[,c("id", "year", "artist", "title", "weeknum", "date")])
	setkey(playlist, id, year, artist)

	# Remove trailing whitespace (extremely common, good for normalization)
	playlist[,artist := gsub("[[:space:]]*$","",artist)]
	playlist[,title := gsub("[[:space:]]*$","",title)]

	# There are a number of songs where the song is in The Current's
	# database, but still (occasionally) appears with just the song and
	# artist and an id of "song". Additionally, there are songs in the
	# database which have multiple versions. We would like to keep the
	# multiple versions of a song separate, but roll the non-id'd songs in
	# with the lowest id'd song with the same title/artist combo.
	setkey(playlist, id, year, artist)
	noid <- playlist["song"]
	hasid <- playlist[!"song"]

	setkey(hasid, title, artist, id)
	min.id <- hasid[,list(minid = min(id)),by="title,artist"]
	setkey(min.id, title, artist)
	setkey(noid, title, artist, year, weeknum, date)

	fixed.id <- noid[min.id, nomatch=0][, list(id=minid,title,artist,year,weeknum,date)]

	playlist <- rbind(hasid[,list(id,title,artist,year,weeknum,date)],
	fixed.id)
	setkey(playlist, id, year)

	########################################
	# Whole Dataset Summary Stats #
	########################################

	# Total Number of songs played:
	nrow(df)

	# Number of unique songs played (that have ids in the current's database):
	length(unique(df$id)) - 1

	# Number of songs without an id number in the database
	nrow(df[df$id == "song",])

	# Number of unique song title/artist combinations for unidentified songs
	length(unique(with(df[df$id == "song",], paste(title, artist))))

	########################################
	# Aggregation #
	########################################

	# Song playcount by year:
	########################################

	# We're creating a new data table, `total.number.of.plays` which
	# collapses all identical songs played in a year into a single row,
	# adding a `plays` column with the count in that year.
	total.number.of.plays <- playlist[,list(plays = length(date)), by="id,title,artist,year"][order(-plays)]

	# Add a column that represents the song's "rank" in a given year (most
	# plays -> rank 1).
	setkey(total.number.of.plays, year, plays)
	total.number.of.plays[,year.rank:=seq(from = .N, to = 1), by = "year"]
	setkey(total.number.of.plays, year, year.rank)

	# We update the `total.number.of.plays` table to include a cumulative
	# sum of the proportion of total plays are the top `n` plays.
	total.number.of.plays[,total.year.plays:=sum(plays), by=year]
	total.number.of.plays[,norm.year.plays:=plays/total.year.plays]
	total.number.of.plays[,cumulative.plays:=cumsum(norm.year.plays), by=year]


	# Top Songs By Week
	########################################
	top.songs.by.week <- playlist[,list(plays = length(date)),
	by="id,title,artist,weeknum"][order(-plays)]
	setkey(top.songs.by.week, weeknum, plays)
	top.songs.by.week[,week.rank:=seq(from = .N, to = 1), by = "weeknum"]
	setkey(top.songs.by.week, weeknum, week.rank)

	########################################
	# Tables #
	########################################

	sink("analysisOutput/tables/totalAndUniqueCounts.html")
	t <- playlist[,list(total = length(id),
	unique = length(unique(id))),
	by = "year"]
	print(xtable(t), type="html")
	sink()

	# Playcounts of top ten songs in each year
	#
	sink("analysisOutput/tables/topSongsByYear.html", append=TRUE)
	for (yr in c("2006","2007","2008","2009", "2010","2011", "2012")) {
	top.tbl <- total.number.of.plays[year==yr,list(title,artist,plays)][1:10]
	cat(sprintf("\n\n%s\n\n", yr))
	print(xtable(top.tbl), type = "html")
	}
	sink()

	sink("analysisOutput/tables/topSongEachWeek.html")
	print(xtable(top.songs.by.week[week.rank==1, list(weeknum, title, artist)][order(-as.numeric(weeknum))]), type = "html")
	sink()



	########################################
	# Plotting #
	########################################

	# The following section has a number of interesting charts. Only the
	# ones surrounded with `png(...)` and `dev.off()` are used in the
	# post.


	# Plays vs Most Played rank
	# By: Year
	g <- ggplot(total.number.of.plays, aes(x=year.rank, y=plays, color=year))
	g <- g + geom_line(size = 2)
	g <- g + scale_x_continuous(limits=c(0,500))
	g <- g + guides(col = guide_legend(reverse = TRUE))

	# Cumulative plays % -- Top 1000 songs
	# By: year
	# Normalized: % plays/year
	png(filename = "analysisOutput/images/cumulativeTopProportion.png",
	height = 600, width = 800)
	g <- ggplot(total.number.of.plays,
	aes(x=year.rank, y=cumulative.plays, color=year))
	g <- g + geom_line(size=1.2)
	g <- g + scale_x_continuous(limits = c(0,1000))
	g <- g + scale_y_continuous(limits = c(0, .65))
	g <- g + guides(col = guide_legend(reverse = TRUE))
	g <- g + xlab("Song Rank (by playcount and year)")
	g <- g + ylab("Cumulative Proportion of Airtime")
	g <- g + ggtitle("Cumulative Play Proportion of Top 1,000 Songs in Each Year")
	g <- g + theme_set(theme_gray(base_size = 18))
	g
	dev.off()

	# Cumulative plays % -- Top 100 songs
	# By: year
	# Normalized: % plays/year
	g <- ggplot(total.number.of.plays,
	aes(x=year.rank, y=cumulative.plays, color=year))
	g <- g + geom_line(size=1.2)
	g <- g + scale_x_continuous(limits = c(0,100))
	g <- g + scale_y_continuous(limits = c(0, .20))
	g <- g + guides(col = guide_legend(reverse = TRUE))
	g <- g + xlab("Song Rank (by playcount and year)")
	g <- g + ylab("Cumulative Proportion of Airtime")

	# Weekly, # of plays of top played song
	png(filename = "analysisOutput/images/weeklyTopPlays.png",
	height = 400, width = 800)
	g <- ggplot(top.songs.by.week[week.rank == 1
	& as.numeric(weeknum) < 366
	& as.numeric(weeknum) > 0],
	aes(x=as.numeric(weeknum),
	y=plays)
	)
	g <- g + geom_line()
	g <- g + xlab("Weeks after January 1st 2006")
	g <- g + ylab("Number of Plays")
	g <- g + ggtitle("Plays of Weekly Top Song")
	g <- g + scale_x_continuous(breaks = c(0, 52, 104, 156, 208, 260, 312, 364))
	g <- g + theme_set(theme_gray(base_size = 18))
	g
	dev.off()

	g <- ggplot(top.songs.by.week[week.rank == 5
	& as.numeric(weeknum) < 366
	& as.numeric(weeknum) > 0],
	aes(x=as.numeric(weeknum),
	y=plays)
	)
	g <- g + geom_line()
	g <- g + xlab("Weeks after January 1st 2006")
	g <- g + ylab("Number of Plays")
	g <- g + ggtitle("Plays of Weekly Top Song")
	g <- g + scale_x_continuous(breaks = c(0, 52, 104, 156, 208, 260, 312, 364))
	g <- g + theme_set(theme_gray(base_size = 18))
	g


	# Weekly Top 5 songs. Barchart.
	g <- ggplot(top.songs.by.week[week.rank <= 5],
	aes(x=as.numeric(weeknum),
	y=plays,
	group=week.rank,
	fill = week.rank,
	order = -week.rank)
	)
	g <- g + geom_bar(stat="identity", width=1)
	g <- g + guides(fill = guide_legend(reverse = FALSE))