Skip to content

Instantly share code, notes, and snippets.

@slhck
Last active December 19, 2015 09:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save slhck/5934779 to your computer and use it in GitHub Desktop.
Save slhck/5934779 to your computer and use it in GitHub Desktop.
Converts raw NappingPlayer output to a summary data frame.
#!/usr/bin/env Rscript
#
# Synopsis: Converts raw NappingPlayer output to a summary data frame for
# input in FactoMineR or SensoMineR. This also contains a few examples.
# Comment out the lines you don't need, or browse through the source.
# Author: Werner Robitza, <werner.robitza@univie.ac.at>
# Known issues: At the moment there might be a problem with non-alphanumeric names being used.
# Report bugs here, please: https://github.com/slhck/napping-player/issues
# Change the working directory that contains your raw tablet data files
setwd("~/data")
# =============================================================================
# YOU DO NOT NEED TO CHANGE ANYTHING BELOW THIS LINE
# gotta love Hadley's libraries
library(plyr)
library(stringr)
library(ggplot2)
library(reshape2)
library(FactoMineR)
library(SensoMineR)
# cleanup workspace
rm(list=ls())
# read list of files
videoList = list.files(pattern="*-videos.csv")
groupList = list.files(pattern="*-groups.csv")
keywordList = list.files(pattern="*-keywords.csv")
# function to extract the user IDs from the dataset
getUserId = function(x) str_match(x, ".*-([0-9a-zA-Z]+)-videos.csv")[,2]
userIds = sapply(videoList, getUserId, USE.NAMES=F)
noUsers = length(userIds)
# read CSV data to data frame
videoData = lapply(videoList, read.csv, stringsAsFactors=FALSE)
groupData = lapply(groupList, read.csv, stringsAsFactors=FALSE)
keywordData = lapply(keywordList, read.csv, stringsAsFactors=FALSE)
# merge tables
mergeData = function(x, y) merge(x, y, all=TRUE)
videoGroupData = mapply(mergeData, videoData, groupData, SIMPLIFY=FALSE)
data.all = mapply(mergeData, videoGroupData, keywordData, SIMPLIFY=FALSE)
# add user ID
addUserId = function(x, y) data.frame(x, user_id = y)
data.all = mapply(addUserId, data.all, userIds, SIMPLIFY = FALSE)
data.all = rbind.fill(data.all)
# convert to factors
data.all$user_id = as.factor(data.all$user_id)
data.all$video_id = as.factor(data.all$video_id)
data.all$file = as.factor(data.all$file)
data.all$group_id = as.factor(data.all$group_id)
# get all file names
data.filenames = unique(data.all[c("video_id","file")])
# output summary file
# write.csv(data.all, "data-all.csv")
# =============================================================================
# only get x and y data for each user, for each video
# http://stackoverflow.com/q/13953443
summaryList.xy = dlply(
data.all,
.(user_id), # divide data by user id
function(x) unique(x[c("video_id", "x", "y")]) # get unique combination for X/Y values
)
# merge individual users together to wide table
nappingData.MFA = Reduce(function(x, y) merge(x, y, by="video_id"), summaryList.xy)
# fix the names for the wide table
names(nappingData.MFA)[-1] = do.call(
function(...) paste(..., sep = "."),
expand.grid(c("x", "y"), names(summaryList.xy))
)
# get unique combination of user, video and keyword
# and generate frequency table
videosKeywords = unique(data.all[c("user_id","keyword","video_id")])
videosKeywords.table = as.data.frame.matrix(
table(videosKeywords[c("video_id","keyword")])
)
videosKeywords.table = cbind(
data.frame(video_id=rownames(videosKeywords.table)),
videosKeywords.table
)
# merge positions and keywords into final data frame
nappingData.MFA = merge(nappingData.MFA, videosKeywords.table, by="video_id")
row.names(nappingData.MFA) = data.filenames[match(nappingData.MFA[,1], data.filenames$video_id), ]$file
# output final file
write.csv(nappingData.MFA, "data-all-napping-mfa.csv")
# generate MFA for X/Y values
res.MFA = MFA(
nappingData.MFA[,2:(noUsers*2 + 1)],
group = c(rep(2, noUsers)),
type = c(rep("s", noUsers)),
axes = c(1,2)
)
# # hierarchy for the HMFA
noKeywords = length(unique(na.omit(data.all$keyword)))
hierar = list(c(rep(2,noUsers), noKeywords), c(noUsers, 1))
res.HMFA = HMFA(
nappingData.MFA[, -1],
hierar,
type = c(rep("c",noUsers), rep("s",1))
)
# generate INDSCAL model
res.INDSCAL = indscal(nappingData.MFA[,2:(noUsers*2 + 1)])
# =============================================================================
# another form of representation, where the keywords appear next to the user
# get all keywords and paste them into one string
keywordConcat = data.all[c("video_id", "user_id", "keyword")]
keywordConcat = ddply(keywordConcat, .(video_id, user_id), function(x) { paste(unique(x$keyword), collapse=";") })
# fix the names, then cast into wide format
names(keywordConcat) = c("video_id", "user_id", "keyword")
# keywordConcat.Wide = dcast(keywordConcat, video_id ~ user_id)
# merge with X/Y data
summaryList.keywords = dlply(data.all, .(user_id),
function(x) {
ret = merge(
unique(x[c("video_id", "user_id", "x", "y")]),
keywordConcat,
by=c("user_id", "video_id")
)[,-1] # drop user_id column again
ret$keyword = as.factor(ret$keyword)
ret$x = as.numeric(ret$x)
ret$y = as.numeric(ret$y)
return(ret)
}
)
# prepare data for FASNT method from SensoMineR
# http://sensominer.free.fr/fasnt.htm
nappingData.FASNT = Reduce(function(x, y) merge(x, y, by="video_id"), summaryList.keywords)
names(nappingData.FASNT)[-1] = do.call(
function(...) paste(..., sep = "."),
expand.grid(c("x", "y", "cat"), names(summaryList.keywords))
)
# update row names from files
row.names(nappingData.FASNT) = data.filenames[match(nappingData.FASNT[,1], data.filenames$video_id), ]$file
res.FASNT = fasnt(nappingData.FASNT[,-1],first="nappe",sep.word=";")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment