thiemehennis/edX student module analytics to R

## edX student module analytics to R
# Credits for @cbdvs (Christopher Davis) - check out his amazing work here: http://enipedia.tudelft.nl or email him at: c.b.davis@tudelft.nl

#never ever convert strings to factors
options(stringsAsFactors = FALSE)

###### TODO set this, the data will be written out there
setwd("/home/username/Desktop/R data/")

file.remove("allData.csv") ## removes the old datafile if there is one (so the data is not appended to the file, but a new file is created)

#library(sqldf)
library(rjson)

df = read.table("/directory/one/two/folder/rawdataset.csv", sep="\t", header=FALSE, quote = "")
colnames(df) = c("id", "module_type", "module_id", "student_id", "state", "grade", "created", "modified", "max_grade", "done", "course_id")

# go over each row in the data
for (i in c(1:nrow(df))){
  print(i)
  # check for valid json with jsonlint.com
  test = df$state[i]
  # quotes inside quoted values
  test = gsub('\\\\\\\"', '\"', test)

  something = fromJSON(test)
  # make sure we have answers
  if (!is.null(something$student_answers)){
    # check that there are more than 0 answers
    if (length(something$student_answers) > 0){

      student_id = df$student_id[i]
      questionNames = names(something$student_answers)
      numQuestions = length(something$student_answers)
      if (numQuestions > 0){
        for (i in c(1:numQuestions)){
          answers = something$student_answers[[i]]
          questionName = questionNames[i]
          for (answer in answers){
            #print(paste(student_id, questionName, answer), sep="\t")
            write.table(t(c(student_id, questionName, answer)), file="allData.csv", sep="\t", append=TRUE, row.names=FALSE, col.names=FALSE)
          }
        }
      }
    }
  }
}

#TODO check working directory
allData = read.table("allData.csv")
row.names(allData) = NULL
colnames(allData) = c("student", "question", "answer")

# Some question statistics: how many answers per question
questionStats = sqldf("SELECT question, count(*) as numAnswered FROM allData GROUP BY question ORDER BY numAnswered DESC")
	# Credits for @cbdvs (Christopher Davis) - check out his amazing work here: http://enipedia.tudelft.nl or email him at: c.b.davis@tudelft.nl

	#never ever convert strings to factors
	options(stringsAsFactors = FALSE)

	###### TODO set this, the data will be written out there
	setwd("/home/username/Desktop/R data/")

	file.remove("allData.csv") ## removes the old datafile if there is one (so the data is not appended to the file, but a new file is created)

	#library(sqldf)
	library(rjson)

	df = read.table("/directory/one/two/folder/rawdataset.csv", sep="\t", header=FALSE, quote = "")
	colnames(df) = c("id", "module_type", "module_id", "student_id", "state", "grade", "created", "modified", "max_grade", "done", "course_id")

	# go over each row in the data
	for (i in c(1:nrow(df))){
	print(i)
	# check for valid json with jsonlint.com
	test = df$state[i]
	# quotes inside quoted values
	test = gsub('\\\\\\\"', '\"', test)

	something = fromJSON(test)
	# make sure we have answers
	if (!is.null(something$student_answers)){
	# check that there are more than 0 answers
	if (length(something$student_answers) > 0){

	student_id = df$student_id[i]
	questionNames = names(something$student_answers)
	numQuestions = length(something$student_answers)
	if (numQuestions > 0){
	for (i in c(1:numQuestions)){
	answers = something$student_answers[[i]]
	questionName = questionNames[i]
	for (answer in answers){
	#print(paste(student_id, questionName, answer), sep="\t")
	write.table(t(c(student_id, questionName, answer)), file="allData.csv", sep="\t", append=TRUE, row.names=FALSE, col.names=FALSE)
	}
	}
	}
	}
	}
	}

	#TODO check working directory
	allData = read.table("allData.csv")
	row.names(allData) = NULL
	colnames(allData) = c("student", "question", "answer")

	# Some question statistics: how many answers per question
	questionStats = sqldf("SELECT question, count(*) as numAnswered FROM allData GROUP BY question ORDER BY numAnswered DESC")