Last active
August 29, 2015 14:00
-
-
Save thiemehennis/4d3d179a6d44e2d23cfe to your computer and use it in GitHub Desktop.
Turn edX .mongo forum data into csv files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#### Entire workflow:
# Checked some of the data in JSONLint and corrected the errors: "},{" instead of "}{" between consecutive lines, plus "[" and "]" at the beginning and end of the file.
# Made a smaller file to play with, containing about 11 JSON lines.
# Used the code below to parse the data file, first checking that the individual listItems are not lists themselves (that causes problems). As you will see,
# I also removed things like \n because they caused errors, and added an empty value for parent_id when there is none in the data (otherwise the columns would get mixed up).
# The code to import the .mongo file into R and then parse it into CSV:
# Work from the directory that holds temp.mongo; temp.csv is written here too.
# Replace the placeholder path with your own before running.
setwd("/your/favourite/dir/json to csv/")
library(rjson)
# Never ever convert strings to factors.
options(stringsAsFactors = FALSE)
# Import the .mongo file (already repaired into one valid JSON array) into R.
# unexpected.escape = "error" makes the parser stop on an unknown escape
# sequence instead of guessing.
temp.data <- fromJSON(
  file = "temp.mongo",
  method = "C",
  unexpected.escape = "error"
)
# Remove the old output file if there is one: the export step appends, so a
# leftover file would end up holding the rows of both runs. The existence
# check avoids the warning file.remove() raises when nothing is there yet.
if (file.exists("temp.csv")) {
  file.remove("temp.csv")
}
# Debugging aid: if a specific record gives trouble, pull it out by index and
# inspect it, e.g.
#   listItem <- temp.data[[3]]
#   forum_row(listItem)

# Flatten one JSON field to a single string.
# A missing or empty field becomes "" so that every record yields the same
# number of columns (c() silently drops NULLs, which would shift the columns
# that follow). A field holding several values is joined with ",".
scalar_field <- function(value) {
  flat <- unlist(value, use.names = FALSE)
  if (length(flat) == 0L) {
    return("")
  }
  paste(as.character(flat), collapse = ",")
}

# Turn a created_at field into a "YYYY-MM-DD HH:MM:SS" string in UTC.
# The value is divided by 1000, i.e. it is treated as milliseconds since
# 1970-01-01. Formatting happens here because a POSIXct placed inside c()
# next to other types loses its class and would be written as raw seconds.
# A missing field becomes "".
format_created_at <- function(value) {
  millis <- unlist(value, use.names = FALSE)
  if (length(millis) == 0L) {
    return("")
  }
  stamp <- as.POSIXct(
    as.numeric(millis[[1L]]) / 1000,
    origin = "1970-01-01",
    tz = "UTC"
  )
  format(stamp, "%Y-%m-%d %H:%M:%S", tz = "UTC")
}

# Build the 11 output columns for one forum record, in this order:
# up votes, visible, parent_id, body, course_id, _type, endorsed, author_id,
# comment_thread_id, author_username, created_at.
# Fields are read with [[ ]] so that names must match exactly; `$` on a list
# does partial matching and could pick up a differently named key.
forum_row <- function(listItem) {
  c(
    scalar_field(listItem[["votes"]][["up_count"]]),
    scalar_field(listItem[["visible"]]),
    scalar_field(listItem[["parent_id"]]),
    # Newlines inside the body would break the one-record-per-line layout.
    gsub("\n", "", scalar_field(listItem[["body"]]), fixed = TRUE),
    scalar_field(listItem[["course_id"]]),
    scalar_field(listItem[["_type"]]),
    scalar_field(listItem[["endorsed"]]),
    scalar_field(listItem[["author_id"]]),
    scalar_field(listItem[["comment_thread_id"]]),
    scalar_field(listItem[["author_username"]]),
    format_created_at(listItem[["created_at"]])
  )
}

# vapply() returns an 11 x n character matrix (one column per record);
# transpose it so each record is one row, then write everything in one call
# instead of reopening the file for every record.
forum_rows <- t(vapply(temp.data, forum_row, character(11), USE.NAMES = FALSE))
write.table(
  forum_rows,
  file = "temp.csv",
  sep = "\t",
  append = TRUE,
  row.names = FALSE,
  col.names = FALSE
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment