Last active
December 19, 2015 09:39
-
-
Save slhck/5934779 to your computer and use it in GitHub Desktop.
Converts raw NappingPlayer output to a summary data frame.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
# | |
# Synopsis: Converts raw NappingPlayer output to a summary data frame for | |
# input in FactoMineR or SensoMineR. This also contains a few examples. | |
# Comment out the lines you don't need, or browse through the source. | |
# Author: Werner Robitza, <werner.robitza@univie.ac.at> | |
# Known issues: At the moment there might be a problem with non-alphanumeric names being used. | |
# Report bugs here, please: https://github.com/slhck/napping-player/issues | |
# Change the working directory that contains your raw tablet data files | |
setwd("~/data") | |
# ============================================================================= | |
# YOU DO NOT NEED TO CHANGE ANYTHING BELOW THIS LINE | |
# gotta love Hadley's libraries | |
library(plyr) | |
library(stringr) | |
library(ggplot2) | |
library(reshape2) | |
library(FactoMineR) | |
library(SensoMineR) | |
# cleanup workspace | |
rm(list=ls()) | |
# read list of files | |
videoList = list.files(pattern="*-videos.csv") | |
groupList = list.files(pattern="*-groups.csv") | |
keywordList = list.files(pattern="*-keywords.csv") | |
# function to extract the user IDs from the dataset | |
getUserId = function(x) str_match(x, ".*-([0-9a-zA-Z]+)-videos.csv")[,2] | |
userIds = sapply(videoList, getUserId, USE.NAMES=F) | |
noUsers = length(userIds) | |
# read CSV data to data frame | |
videoData = lapply(videoList, read.csv, stringsAsFactors=FALSE) | |
groupData = lapply(groupList, read.csv, stringsAsFactors=FALSE) | |
keywordData = lapply(keywordList, read.csv, stringsAsFactors=FALSE) | |
# merge tables | |
mergeData = function(x, y) merge(x, y, all=TRUE) | |
videoGroupData = mapply(mergeData, videoData, groupData, SIMPLIFY=FALSE) | |
data.all = mapply(mergeData, videoGroupData, keywordData, SIMPLIFY=FALSE) | |
# add user ID | |
addUserId = function(x, y) data.frame(x, user_id = y) | |
data.all = mapply(addUserId, data.all, userIds, SIMPLIFY = FALSE) | |
data.all = rbind.fill(data.all) | |
# convert to factors | |
data.all$user_id = as.factor(data.all$user_id) | |
data.all$video_id = as.factor(data.all$video_id) | |
data.all$file = as.factor(data.all$file) | |
data.all$group_id = as.factor(data.all$group_id) | |
# get all file names | |
data.filenames = unique(data.all[c("video_id","file")]) | |
# output summary file | |
# write.csv(data.all, "data-all.csv") | |
# ============================================================================= | |
# only get x and y data for each user, for each video | |
# http://stackoverflow.com/q/13953443 | |
summaryList.xy = dlply( | |
data.all, | |
.(user_id), # divide data by user id | |
function(x) unique(x[c("video_id", "x", "y")]) # get unique combination for X/Y values | |
) | |
# merge individual users together to wide table | |
nappingData.MFA = Reduce(function(x, y) merge(x, y, by="video_id"), summaryList.xy) | |
# fix the names for the wide table | |
names(nappingData.MFA)[-1] = do.call( | |
function(...) paste(..., sep = "."), | |
expand.grid(c("x", "y"), names(summaryList.xy)) | |
) | |
# get unique combination of user, video and keyword | |
# and generate frequency table | |
videosKeywords = unique(data.all[c("user_id","keyword","video_id")]) | |
videosKeywords.table = as.data.frame.matrix( | |
table(videosKeywords[c("video_id","keyword")]) | |
) | |
videosKeywords.table = cbind( | |
data.frame(video_id=rownames(videosKeywords.table)), | |
videosKeywords.table | |
) | |
# merge positions and keywords into final data frame | |
nappingData.MFA = merge(nappingData.MFA, videosKeywords.table, by="video_id") | |
row.names(nappingData.MFA) = data.filenames[match(nappingData.MFA[,1], data.filenames$video_id), ]$file | |
# output final file | |
write.csv(nappingData.MFA, "data-all-napping-mfa.csv") | |
# generate MFA for X/Y values | |
res.MFA = MFA( | |
nappingData.MFA[,2:(noUsers*2 + 1)], | |
group = c(rep(2, noUsers)), | |
type = c(rep("s", noUsers)), | |
axes = c(1,2) | |
) | |
# # hierarchy for the HMFA | |
noKeywords = length(unique(na.omit(data.all$keyword))) | |
hierar = list(c(rep(2,noUsers), noKeywords), c(noUsers, 1)) | |
res.HMFA = HMFA( | |
nappingData.MFA[, -1], | |
hierar, | |
type = c(rep("c",noUsers), rep("s",1)) | |
) | |
# generate INDSCAL model | |
res.INDSCAL = indscal(nappingData.MFA[,2:(noUsers*2 + 1)]) | |
# ============================================================================= | |
# another form of representation, where the keywords appear next to the user | |
# get all keywords and paste them into one string | |
keywordConcat = data.all[c("video_id", "user_id", "keyword")] | |
keywordConcat = ddply(keywordConcat, .(video_id, user_id), function(x) { paste(unique(x$keyword), collapse=";") }) | |
# fix the names, then cast into wide format | |
names(keywordConcat) = c("video_id", "user_id", "keyword") | |
# keywordConcat.Wide = dcast(keywordConcat, video_id ~ user_id) | |
# merge with X/Y data | |
summaryList.keywords = dlply(data.all, .(user_id), | |
function(x) { | |
ret = merge( | |
unique(x[c("video_id", "user_id", "x", "y")]), | |
keywordConcat, | |
by=c("user_id", "video_id") | |
)[,-1] # drop user_id column again | |
ret$keyword = as.factor(ret$keyword) | |
ret$x = as.numeric(ret$x) | |
ret$y = as.numeric(ret$y) | |
return(ret) | |
} | |
) | |
# prepare data for FASNT method from SensoMineR | |
# http://sensominer.free.fr/fasnt.htm | |
nappingData.FASNT = Reduce(function(x, y) merge(x, y, by="video_id"), summaryList.keywords) | |
names(nappingData.FASNT)[-1] = do.call( | |
function(...) paste(..., sep = "."), | |
expand.grid(c("x", "y", "cat"), names(summaryList.keywords)) | |
) | |
# update row names from files | |
row.names(nappingData.FASNT) = data.filenames[match(nappingData.FASNT[,1], data.filenames$video_id), ]$file | |
res.FASNT = fasnt(nappingData.FASNT[,-1],first="nappe",sep.word=";") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment