Skip to content

Instantly share code, notes, and snippets.

@SamuelMarks
Last active November 15, 2017 10:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SamuelMarks/b284c5fc4c6699cdfd603f6b9065c513 to your computer and use it in GitHub Desktop.
Save SamuelMarks/b284c5fc4c6699cdfd603f6b9065c513 to your computer and use it in GitHub Desktop.
OpenEdX datasets attempts to: SELECT grade, student_id, minutes_taken FROM questions_answered GROUP BY exam
// Now ignore everyone else; mongodump to CSV; `LOAD DATA FROM CSV` in MySQL
db.modulestore.structures.aggregate([
{ $unwind: '$blocks' },
{ $replaceRoot: { newRoot: "$blocks" } },
{ $out: 'modulestore.blocks' }
]);
// Resolve all children so each block document contains the entire hierarchy of blocks
// (not just one level deep of descendent IDs)
db.modulestore.structures.aggregate([
{
$unwind: '$blocks' // flatten "blocks" array
},
{
$replaceRoot: { // move "blocks" field to top level
newRoot: "$blocks"
}
},
{
$unwind: { // flatten "fields.children" array
path: "$fields.children",
preserveNullAndEmptyArrays: true
}
},
{
// this step is technically not needed but it might speed up things - try running with and without that
$addFields: { // we only keep the second (last, really) entry of all your arrays since this is the only valid join key for the graphLookup
"fields.children": {
$slice: [ "$fields.children", -1 ]
}
}
},
{
$unwind: { // flatten "fields.children" array one more time because it was nested before
path: "$fields.children",
preserveNullAndEmptyArrays: true
}
}, {
$group: { // reduce the number of lookups required later by eliminating duplicate parent-child paths
"_id": "$block_id",
"block_type": { $first: "$block_type" },
"definition": { $first: "$definition" },
"fieldsFormat": { $first: "$fields.format" },
"fieldsChildren": { $addToSet: "$fields.children" }
}
}, {
$project: { // restore original structure
"block_id": "$_id",
"block_type": "$block_type",
"definition": "$definition",
"fields": {
"format": "$fieldsFormat",
"children": "$fieldsChildren"
}
}
}, { // spit out the result into "modulestore.mapped0" collection, overwriting all existing content
$out: 'modulestore.mapped0'
}]);
db.modulestore.mapped0.aggregate([
{
$graphLookup: {
from: 'modulestore.mapped0',
startWith: '$block_id',
connectToField: 'fields.children',
connectFromField: 'block_id',
as: 'block_ids',
maxDepth: 0
}
},
{
$lookup: {
from: 'modulestore.mapped0',
localField: 'block_ids.fields.children',
foreignField: '_id',
as: 'block_ids.fields.children'
}
}
]);
mongoexport --fields blocks --collection 'modulestore.structures' -d edxapp --assertExists --query '{"blocks.fields.format": {$in: ["Midterm Exam", "Final Exam"]}}' | node -e 'require("readline").createInterface({input: process.stdin,output: process.stdout,terminal: false}).on("line", l => console.info(JSON.parse(l).blocks.filter(e => ["Midterm Exam", "Final Exam"].indexOf(e.fields.format) > -1).map(e => `${e.block_id},${e.fields.format},${e.fields.display_name}`).join("\n")))' | { echo 'block_id,exam,display_name'; sort -u; } > /tmp/blockid_exam_displayname.csv
-- Deps: mysql_data_load0.sql
WITH
Q0 AS (
SELECT T0.id, T0.module_type, T0.course_id, T0.module_id, SUBSTRING_INDEX(T0.module_id, '@', -1) block_id, TIMESTAMPDIFF(MINUTE, T0.created, T0.modified) minutes_taken
FROM edxapp.courseware_studentmodule T0
WHERE T0.course_id = '<my_course_id>' /* AND T0.module_type = 'problem' */
),
Q1 AS (
SELECT T1.block_id, T1.exam, T1.children
FROM blockid_exam_displayname T1
)
SELECT Q0.*, Q1.block_id q1_block_id, Q1.exam, Q1.children
FROM Q0
LEFT JOIN Q1 ON (
LOCATE(Q0.block_id, Q1.children)
);
-- Also tried:
WITH big AS
(SELECT
T0.module_type,
T0.course_id,
T0.module_id,
SUBSTRING_INDEX(T0.module_id, '@', -1) block_id,
T2.display_name,
T2.parent_display_name,
T2.exam,
T2.children,
TIMESTAMPDIFF(MINUTE, T0.created, T0.modified) minutes_taken
FROM edxapp.courseware_studentmodule T0
JOIN blockid_exam_displayname T2
ON SUBSTRING_INDEX(T0.module_id, '@', -1) = T2.block_id
) SELECT * FROM big
WHERE course_id = '<course_id>' AND module_type = 'course';
-- Deps: mysql_data_load1.sql
-- Attempts to associate exam with all rows
WITH RECURSIVE exam_paths (block_id, display_name, `type`, parent, exam) AS (
SELECT block_id, display_name, `type`, parent, exam
FROM blockid_exam_displayname
WHERE parent IS NULL
UNION ALL
SELECT e.block_id, e.display_name, e.`type`, e.parent, ep.exam
FROM exam_paths AS ep
JOIN blockid_exam_displayname AS e
ON ep.block_id = e.parent
) SELECT distinct(block_id), display_name, `type`, parent, exam
FROM exam_paths
ORDER BY exam;
-- Deps: mysql_data_load1.sql
WITH RECURSIVE Q0 AS (
SELECT T0.block_id, T0.display_name,
T0.child, T0.parent,
IFNULL(T0.exam, '') AS exam
FROM blockid_exam_displayname T0
-- WHERE exam IS NULL
UNION ALL
SELECT T1.block_id, T1.display_name,
T1.child, T1.parent,
-- T1.exam
CONCAT(
IFNULL(Q0.exam, ''),
':',
IFNULL(T1.exam, '')
) AS exam0
-- FROM blockid_exam_displayname T1
FROM Q0
JOIN blockid_exam_displayname T1
ON T1.block_id=Q0.parent
) SELECT distinct(Q0.block_id), Q0.child, Q0.parent, Q0.exam
FROM Q0
WHERE Q0.exam IS NOT NULL
ORDER BY WEIGHT_STRING(Q0.exam) DESC;
-- Results in all having 'Midterm Exam'; none have 'Final Exam' :(
-- Deps: mysql_data_load2.sql
WITH
Q0 AS
(SELECT *, SUBSTRING_INDEX(T0.module_id, '@', -1) block_id
FROM edxapp.courseware_studentmodule T0)
SELECT Q0.*
FROM Q0
INNER JOIN blockid_exam_displayname_child T1
ON Q0.block_id = T1.block_id AND Q0.module_type = T1.block_type;
-- Deps: mysql_data_load2.sql
WITH RECURSIVE Q0 AS (
SELECT T0.block_id, T0.block_type, T0.display_name,
T0.child_id, T0.child_type, T0.parent,
IFNULL(T0.exam, '') AS exam
FROM blockid_exam_displayname_child T0
-- WHERE exam IS NULL
UNION ALL
SELECT T1.block_id, T1.block_type, T1.display_name,
T1.child_id, T1.child_type, T1.parent,
-- T1.exam
CONCAT(
IFNULL(Q0.exam, ''),
':',
IFNULL(T1.exam, '')
) AS exam0
-- FROM blockid_exam_displayname T1
FROM Q0
JOIN blockid_exam_displayname_child T1
ON T1.block_id=Q0.parent
) SELECT distinct(Q0.block_id), Q0.block_type, Q0.child_id, Q0.child_type, Q0.parent, Q0.exam
FROM Q0
WHERE Q0.exam IS NOT NULL AND Q0.exam LIKE '%Exam%'
ORDER BY WEIGHT_STRING(Q0.exam) DESC;
-- ^This one almost works; only get 239 rows back though; none with the block_type of 'problem'.
-- Only block_types were: course; chapter; sequential
-- Deps: mongo_export0.bash
CREATE TEMPORARY TABLE blockid_exam_displayname (
block_id VARCHAR(33) PRIMARY KEY,
exam VARCHAR(33),
display_name VARCHAR(33),
parent_display_name VARCHAR(33),
children TEXT
);
LOAD DATA LOCAL INFILE '/tmp/blockid_exam_displayname_children.csv'
INTO TABLE blockid_exam_displayname
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
IGNORE 1 LINES;
-- Deps: mysql_load_data1.bash
CREATE TEMPORARY TABLE blockid_exam_displayname (
block_id VARCHAR(33) NOT NULL,
exam VARCHAR(33),
display_name VARCHAR(33),
child VARCHAR(33),
parent VARCHAR(33) NULL,
INDEX(parent),
FOREIGN KEY (parent) REFERENCES blockid_exam_displayname (block_id),
PRIMARY KEY (block_id, child)
);
LOAD DATA LOCAL INFILE '/tmp/blockid_exam_displayname_child.csv'
INTO TABLE blockid_exam_displayname
FIELDS TERMINATED BY ','
ENCLOSED BY '"';
-- Set `parent` field
UPDATE blockid_exam_displayname T0
INNER JOIN blockid_exam_displayname T1
ON T1.child = T0.block_id
SET T0.parent = T1.block_id
WHERE T0.parent IS NULL;
-- Deps: mysql_load_data2.bash
DROP TABLE IF EXISTS blockid_exam_displayname_child;
CREATE TEMPORARY TABLE blockid_exam_displayname_child (
id MEDIUMINT NOT NULL AUTO_INCREMENT,
block_id VARCHAR(33) NOT NULL,
block_type VARCHAR(33) NOT NULL,
exam VARCHAR(33),
display_name VARCHAR(33),
child_id VARCHAR(33),
child_type VARCHAR(33),
parent VARCHAR(33) NULL,
INDEX(parent),
INDEX(exam),
PRIMARY KEY(id)
-- FOREIGN KEY (parent) REFERENCES blockid_exam_displayname_child (block_id),
-- PRIMARY KEY (block_id, block_type, child)
);
LOAD DATA LOCAL INFILE '/tmp/blockid_exam_displayname_child.csv'
INTO TABLE blockid_exam_displayname_child
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
IGNORE 1 LINES
(`block_id`, `block_type`, `exam`, `child_id`, `child_type`);
-- Set `parent` field
UPDATE blockid_exam_displayname_child T0
INNER JOIN blockid_exam_displayname_child T1
ON T1.child_id = T0.block_id AND T1.child_type = T0.block_type
SET T0.parent = T1.block_id
WHERE T0.parent IS NULL;
# Deps: post_mongo0.js in directory above
mongoexport --fields blocks --collection 'modulestore.structures' -d edxapp --assertExists --query '{"blocks.fields.format": {$in: ["Midterm Exam", "Final Exam"]}}' | node ../post_mongo0.js | { echo 'block_id,exam,display_name,parent_display_name,children'; sort -u; } > blockid_exam_displayname.csv
# Deps: post_mongo1.js in directory above
mongoexport --fields block_id,block_type,fields --collection 'modulestore.blocks' -d edxapp --assertExists | node post_mongo.js | { echo 'id,block_id,block_type,exam,child_id,child_type'; sort -u; } > blockid_exam_displayname_child.csv
#!/usr/bin/env node
// Parses such that one child per row; rather than an array of children
require("readline").createInterface({input: process.stdin,output: process.stdout,terminal: false}).on("line", l => {
l = JSON.parse(l);
process.stdout.write(l.blocks.map(e => {
const r = `"${e.block_id}",${e.fields.hasOwnProperty("format") ? "\"" + e.fields.format + "\"" : "\\N"},${e.fields.hasOwnProperty("display_name") ? "\"" + e.fields.display_name + "\"" : "\\N"},${l.blocks[0].fields.hasOwnProperty("display_name")? "\""+l.blocks[0].fields.display_name+"\"" : "\\N"}`;
return e.fields.children && e.fields.children[0] ? e.fields.children.map(child => `${r},"${child[0]}","${child[1]}"`).join("\n"): `${r},\\N`;
}).join("\n"))
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment