Last active
December 31, 2015 12:49
-
-
Save thiemehennis/7988746 to your computer and use it in GitHub Desktop.
This R code cleans up raw sql edX dataset into a new dataset with columns "Student", "Question", "Answer". The previous version contained a Credits: Written by @cbdvs for beautiful karma!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Credits for @cbdvs (Christopher Davis) - check out his amazing work here: http://enipedia.tudelft.nl or email him at: c.b.davis@tudelft.nl | |
#never ever convert strings to factors | |
options(stringsAsFactors = FALSE) | |
###### TODO set this, the data will be written out there | |
setwd("/home/username/Desktop/R data/") | |
file.remove("allData.csv") ## removes the old datafile if there is one (so the data is not appended to the file, but a new file is created) | |
#library(sqldf) | |
library(rjson) | |
df = read.table("/directory/one/two/folder/rawdataset.csv", sep="\t", header=FALSE, quote = "") | |
colnames(df) = c("id", "module_type", "module_id", "student_id", "state", "grade", "created", "modified", "max_grade", "done", "course_id") | |
# go over each row in the data | |
for (i in c(1:nrow(df))){ | |
print(i) | |
# check for valid json with jsonlint.com | |
test = df$state[i] | |
# quotes inside quoted values | |
test = gsub('\\\\\\\"', '\"', test) | |
something = fromJSON(test) | |
# make sure we have answers | |
if (!is.null(something$student_answers)){ | |
# check that there are more than 0 answers | |
if (length(something$student_answers) > 0){ | |
student_id = df$student_id[i] | |
questionNames = names(something$student_answers) | |
numQuestions = length(something$student_answers) | |
if (numQuestions > 0){ | |
for (i in c(1:numQuestions)){ | |
answers = something$student_answers[[i]] | |
questionName = questionNames[i] | |
for (answer in answers){ | |
#print(paste(student_id, questionName, answer), sep="\t") | |
write.table(t(c(student_id, questionName, answer)), file="allData.csv", sep="\t", append=TRUE, row.names=FALSE, col.names=FALSE) | |
} | |
} | |
} | |
} | |
} | |
} | |
#TODO check working directory | |
allData = read.table("allData.csv") | |
row.names(allData) = NULL | |
colnames(allData) = c("student", "question", "answer") | |
# Some question statistics: how many answers per question | |
questionStats = sqldf("SELECT question, count(*) as numAnswered FROM allData GROUP BY question ORDER BY numAnswered DESC") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment