slhck/analyze.r

## analyze.r
#!/usr/bin/env Rscript
#
# Synopsis:     Converts raw NappingPlayer output to a summary data frame for
#               input in FactoMineR or SensoMineR. This also contains a few examples.
#               Comment out the lines you don't need, or browse through the source.
# Author:       Werner Robitza, <werner.robitza@univie.ac.at>
# Known issues: At the moment there might be a problem with non-alphanumeric names being used.
#               Report bugs here, please: https://github.com/slhck/napping-player/issues

# Change the working directory that contains your raw tablet data files
setwd("~/data")

# =============================================================================
# YOU DO NOT NEED TO CHANGE ANYTHING BELOW THIS LINE

# gotta love Hadley's libraries
library(plyr)
library(stringr)
library(ggplot2)
library(reshape2)
library(FactoMineR)
library(SensoMineR)

# cleanup workspace
rm(list=ls())

# read list of files
videoList   = list.files(pattern="*-videos.csv")
groupList   = list.files(pattern="*-groups.csv")
keywordList = list.files(pattern="*-keywords.csv")

# function to extract the user IDs from the dataset
getUserId = function(x) str_match(x, ".*-([0-9a-zA-Z]+)-videos.csv")[,2]
userIds = sapply(videoList, getUserId, USE.NAMES=F)
noUsers = length(userIds)

# read CSV data to data frame
videoData   = lapply(videoList,   read.csv, stringsAsFactors=FALSE)
groupData   = lapply(groupList,   read.csv, stringsAsFactors=FALSE)
keywordData = lapply(keywordList, read.csv, stringsAsFactors=FALSE)

# merge tables
mergeData      = function(x, y) merge(x, y, all=TRUE)
videoGroupData = mapply(mergeData, videoData, groupData, SIMPLIFY=FALSE)
data.all       = mapply(mergeData, videoGroupData, keywordData, SIMPLIFY=FALSE)

# add user ID
addUserId = function(x, y) data.frame(x, user_id = y)
data.all  = mapply(addUserId, data.all, userIds, SIMPLIFY = FALSE)
data.all  = rbind.fill(data.all)

# convert to factors
data.all$user_id  = as.factor(data.all$user_id)
data.all$video_id = as.factor(data.all$video_id)
data.all$file     = as.factor(data.all$file)
data.all$group_id = as.factor(data.all$group_id)

# get all file names
data.filenames = unique(data.all[c("video_id","file")])

# output summary file
# write.csv(data.all, "data-all.csv")

# =============================================================================
# only get x and y data for each user, for each video
# http://stackoverflow.com/q/13953443
summaryList.xy = dlply(
  data.all,
  .(user_id), # divide data by user id
  function(x) unique(x[c("video_id", "x", "y")])  # get unique combination for X/Y values
  )
# merge individual users together to wide table
nappingData.MFA = Reduce(function(x, y) merge(x, y, by="video_id"), summaryList.xy)
# fix the names for the wide table
names(nappingData.MFA)[-1] = do.call(
  function(...) paste(..., sep = "."),
  expand.grid(c("x", "y"), names(summaryList.xy))
  )

# get unique combination of user, video and keyword
# and generate frequency table
videosKeywords = unique(data.all[c("user_id","keyword","video_id")])
videosKeywords.table = as.data.frame.matrix(
  table(videosKeywords[c("video_id","keyword")])
  )
videosKeywords.table = cbind(
  data.frame(video_id=rownames(videosKeywords.table)),
  videosKeywords.table
  )

# merge positions and keywords into final data frame
nappingData.MFA = merge(nappingData.MFA, videosKeywords.table, by="video_id")
row.names(nappingData.MFA) = data.filenames[match(nappingData.MFA[,1], data.filenames$video_id), ]$file

# output final file
write.csv(nappingData.MFA, "data-all-napping-mfa.csv")

# generate MFA for X/Y values
res.MFA = MFA(
  nappingData.MFA[,2:(noUsers*2 + 1)],
  group = c(rep(2, noUsers)),
  type  = c(rep("s", noUsers)),
  axes  = c(1,2)
  )

# # hierarchy for the HMFA
noKeywords = length(unique(na.omit(data.all$keyword)))
hierar = list(c(rep(2,noUsers), noKeywords), c(noUsers, 1))
res.HMFA = HMFA(
  nappingData.MFA[, -1],
  hierar,
  type = c(rep("c",noUsers), rep("s",1))
  )

# generate INDSCAL model
res.INDSCAL = indscal(nappingData.MFA[,2:(noUsers*2 + 1)])

# =============================================================================
# another form of representation, where the keywords appear next to the user

# get all keywords and paste them into one string
keywordConcat = data.all[c("video_id", "user_id", "keyword")]
keywordConcat = ddply(keywordConcat, .(video_id, user_id), function(x) { paste(unique(x$keyword), collapse=";") })

# fix the names, then cast into wide format
names(keywordConcat) = c("video_id", "user_id", "keyword")
# keywordConcat.Wide = dcast(keywordConcat, video_id ~ user_id)

# merge with X/Y data
summaryList.keywords = dlply(data.all, .(user_id),
  function(x) {
    ret = merge(
      unique(x[c("video_id", "user_id", "x", "y")]),
      keywordConcat,
      by=c("user_id", "video_id")
      )[,-1]  # drop user_id column again
    ret$keyword = as.factor(ret$keyword)
    ret$x = as.numeric(ret$x)
    ret$y = as.numeric(ret$y)
    return(ret)
  }
)

# prepare data for FASNT method from SensoMineR
# http://sensominer.free.fr/fasnt.htm
nappingData.FASNT = Reduce(function(x, y) merge(x, y, by="video_id"), summaryList.keywords)
names(nappingData.FASNT)[-1] = do.call(
  function(...) paste(..., sep = "."),
  expand.grid(c("x", "y", "cat"), names(summaryList.keywords))
)

# update row names from files
row.names(nappingData.FASNT) = data.filenames[match(nappingData.FASNT[,1], data.filenames$video_id), ]$file

res.FASNT = fasnt(nappingData.FASNT[,-1],first="nappe",sep.word=";")
	#!/usr/bin/env Rscript
	#
	# Synopsis: Converts raw NappingPlayer output to a summary data frame for
	# input in FactoMineR or SensoMineR. This also contains a few examples.
	# Comment out the lines you don't need, or browse through the source.
	# Author: Werner Robitza, <werner.robitza@univie.ac.at>
	# Known issues: At the moment there might be a problem with non-alphanumeric names being used.
	# Report bugs here, please: https://github.com/slhck/napping-player/issues

	# Change the working directory that contains your raw tablet data files
	setwd("~/data")

	# =============================================================================
	# YOU DO NOT NEED TO CHANGE ANYTHING BELOW THIS LINE

	# gotta love Hadley's libraries
	library(plyr)
	library(stringr)
	library(ggplot2)
	library(reshape2)
	library(FactoMineR)
	library(SensoMineR)

	# cleanup workspace
	rm(list=ls())

	# read list of files
	videoList = list.files(pattern="*-videos.csv")
	groupList = list.files(pattern="*-groups.csv")
	keywordList = list.files(pattern="*-keywords.csv")

	# function to extract the user IDs from the dataset
	getUserId = function(x) str_match(x, ".*-([0-9a-zA-Z]+)-videos.csv")[,2]
	userIds = sapply(videoList, getUserId, USE.NAMES=F)
	noUsers = length(userIds)

	# read CSV data to data frame
	videoData = lapply(videoList, read.csv, stringsAsFactors=FALSE)
	groupData = lapply(groupList, read.csv, stringsAsFactors=FALSE)
	keywordData = lapply(keywordList, read.csv, stringsAsFactors=FALSE)

	# merge tables
	mergeData = function(x, y) merge(x, y, all=TRUE)
	videoGroupData = mapply(mergeData, videoData, groupData, SIMPLIFY=FALSE)
	data.all = mapply(mergeData, videoGroupData, keywordData, SIMPLIFY=FALSE)

	# add user ID
	addUserId = function(x, y) data.frame(x, user_id = y)
	data.all = mapply(addUserId, data.all, userIds, SIMPLIFY = FALSE)
	data.all = rbind.fill(data.all)

	# convert to factors
	data.all$user_id = as.factor(data.all$user_id)
	data.all$video_id = as.factor(data.all$video_id)
	data.all$file = as.factor(data.all$file)
	data.all$group_id = as.factor(data.all$group_id)

	# get all file names
	data.filenames = unique(data.all[c("video_id","file")])

	# output summary file
	# write.csv(data.all, "data-all.csv")

	# =============================================================================
	# only get x and y data for each user, for each video
	# http://stackoverflow.com/q/13953443
	summaryList.xy = dlply(
	data.all,
	.(user_id), # divide data by user id
	function(x) unique(x[c("video_id", "x", "y")]) # get unique combination for X/Y values
	)
	# merge individual users together to wide table
	nappingData.MFA = Reduce(function(x, y) merge(x, y, by="video_id"), summaryList.xy)
	# fix the names for the wide table
	names(nappingData.MFA)[-1] = do.call(
	function(...) paste(..., sep = "."),
	expand.grid(c("x", "y"), names(summaryList.xy))
	)

	# get unique combination of user, video and keyword
	# and generate frequency table
	videosKeywords = unique(data.all[c("user_id","keyword","video_id")])
	videosKeywords.table = as.data.frame.matrix(
	table(videosKeywords[c("video_id","keyword")])
	)
	videosKeywords.table = cbind(
	data.frame(video_id=rownames(videosKeywords.table)),
	videosKeywords.table
	)

	# merge positions and keywords into final data frame
	nappingData.MFA = merge(nappingData.MFA, videosKeywords.table, by="video_id")
	row.names(nappingData.MFA) = data.filenames[match(nappingData.MFA[,1], data.filenames$video_id), ]$file

	# output final file
	write.csv(nappingData.MFA, "data-all-napping-mfa.csv")

	# generate MFA for X/Y values
	res.MFA = MFA(
	nappingData.MFA[,2:(noUsers*2 + 1)],
	group = c(rep(2, noUsers)),
	type = c(rep("s", noUsers)),
	axes = c(1,2)
	)

	# # hierarchy for the HMFA
	noKeywords = length(unique(na.omit(data.all$keyword)))
	hierar = list(c(rep(2,noUsers), noKeywords), c(noUsers, 1))
	res.HMFA = HMFA(
	nappingData.MFA[, -1],
	hierar,
	type = c(rep("c",noUsers), rep("s",1))
	)

	# generate INDSCAL model
	res.INDSCAL = indscal(nappingData.MFA[,2:(noUsers*2 + 1)])

	# =============================================================================
	# another form of representation, where the keywords appear next to the user

	# get all keywords and paste them into one string
	keywordConcat = data.all[c("video_id", "user_id", "keyword")]
	keywordConcat = ddply(keywordConcat, .(video_id, user_id), function(x) { paste(unique(x$keyword), collapse=";") })

	# fix the names, then cast into wide format
	names(keywordConcat) = c("video_id", "user_id", "keyword")
	# keywordConcat.Wide = dcast(keywordConcat, video_id ~ user_id)

	# merge with X/Y data
	summaryList.keywords = dlply(data.all, .(user_id),
	function(x) {
	ret = merge(
	unique(x[c("video_id", "user_id", "x", "y")]),
	keywordConcat,
	by=c("user_id", "video_id")
	)[,-1] # drop user_id column again
	ret$keyword = as.factor(ret$keyword)
	ret$x = as.numeric(ret$x)
	ret$y = as.numeric(ret$y)
	return(ret)
	}
	)

	# prepare data for FASNT method from SensoMineR
	# http://sensominer.free.fr/fasnt.htm
	nappingData.FASNT = Reduce(function(x, y) merge(x, y, by="video_id"), summaryList.keywords)
	names(nappingData.FASNT)[-1] = do.call(
	function(...) paste(..., sep = "."),
	expand.grid(c("x", "y", "cat"), names(summaryList.keywords))
	)

	# update row names from files
	row.names(nappingData.FASNT) = data.filenames[match(nappingData.FASNT[,1], data.filenames$video_id), ]$file

	res.FASNT = fasnt(nappingData.FASNT[,-1],first="nappe",sep.word=";")