Skip to content

Instantly share code, notes, and snippets.

@thiemehennis
Last active December 31, 2015 12:49
Show Gist options
  • Save thiemehennis/7988746 to your computer and use it in GitHub Desktop.
Save thiemehennis/7988746 to your computer and use it in GitHub Desktop.
This R code cleans up raw sql edX dataset into a new dataset with columns "Student", "Question", "Answer". The previous version contained a Credits: Written by @cbdvs for beautiful karma!
# Credits for @cbdvs (Christopher Davis) - check out his amazing work here: http://enipedia.tudelft.nl or email him at: c.b.davis@tudelft.nl
#never ever convert strings to factors
options(stringsAsFactors = FALSE)
###### TODO set this, the data will be written out there
setwd("/home/username/Desktop/R data/")
file.remove("allData.csv") ## removes the old datafile if there is one (so the data is not appended to the file, but a new file is created)
#library(sqldf)
library(rjson)
df = read.table("/directory/one/two/folder/rawdataset.csv", sep="\t", header=FALSE, quote = "")
colnames(df) = c("id", "module_type", "module_id", "student_id", "state", "grade", "created", "modified", "max_grade", "done", "course_id")
# go over each row in the data
for (i in c(1:nrow(df))){
print(i)
# check for valid json with jsonlint.com
test = df$state[i]
# quotes inside quoted values
test = gsub('\\\\\\\"', '\"', test)
something = fromJSON(test)
# make sure we have answers
if (!is.null(something$student_answers)){
# check that there are more than 0 answers
if (length(something$student_answers) > 0){
student_id = df$student_id[i]
questionNames = names(something$student_answers)
numQuestions = length(something$student_answers)
if (numQuestions > 0){
for (i in c(1:numQuestions)){
answers = something$student_answers[[i]]
questionName = questionNames[i]
for (answer in answers){
#print(paste(student_id, questionName, answer), sep="\t")
write.table(t(c(student_id, questionName, answer)), file="allData.csv", sep="\t", append=TRUE, row.names=FALSE, col.names=FALSE)
}
}
}
}
}
}
#TODO check working directory
allData = read.table("allData.csv")
row.names(allData) = NULL
colnames(allData) = c("student", "question", "answer")
# Some question statistics: how many answers per question
questionStats = sqldf("SELECT question, count(*) as numAnswered FROM allData GROUP BY question ORDER BY numAnswered DESC")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment