Skip to content

Instantly share code, notes, and snippets.

@timkendall
Last active August 29, 2015 14:10
Show Gist options
  • Save timkendall/97b701c2602a17bf1420 to your computer and use it in GitHub Desktop.
Save timkendall/97b701c2602a17bf1420 to your computer and use it in GitHub Desktop.
var sqlite3 = require('sqlite3').verbose(),
fs = require('fs'),
Q = require('q');
/*
* Grab data synchronously (this loads the entire raw data into memory, SLOW!)
*/
//var obj = JSON.parse(fs.readFileSync('/Users/Me/Google Drive/CPSC-408/TimKendall-Assignment5/data/sections.json', 'utf8'));
//console.log(obj)
/*
* Grab data iteratively through a stream, extremely meory efficeint and fast
* Credit: http://stackoverflow.com/questions/11874096/parse-large-json-file-in-nodejs
*/
// Set some globals for tracking
var count = 0;
console.time('dbsave'); // start counting
// Setup database connection
var db = new sqlite3.cached.Database('/Users/Me/Google Drive/CPSC-408/TimKendall-Assignment5/database/school.db', function(error) {
if (error) throw new Error(error);
});
db.run("PRAGMA synchronous = OFF")
// Setup file stream
var jsonDataPath = '/Users/Me/Google Drive/CPSC-408/TimKendall-Assignment5/data/sections.json',
stream = fs.createReadStream(jsonDataPath, {
flags: 'r',
encoding: 'utf-8'
}),
buffer = '';
// Setup prepared statment
var prepared = db.prepare("INSERT INTO sections VALUES (?, ?, ?, ?, ?, ?)");
// Do something on each new chunk
stream.on('data', function(data) {
buffer += data.toString(); // when data is read, stash it in a string buffer
pump(process); // then process the buffer
});
// Do something when done reading file
stream.on('end', function(data) {
prepared.finalize();
console.log(count + ' record(s) inserted');
console.timeEnd('dbsave'); // Stop counter and output
db.close();
});
// Process the buffer (chunks are determined by positions of commas, assuming each comma denotes a new object in the raw data)
function pump(process) {
var pos;
while ((pos = buffer.indexOf('},')) >= 0) { // keep going while there's a ',' somewhere in the buffer
if (pos == 0) { // if there's more than one newline in a row, the buffer will now start with a newline
buffer = buffer.slice(1); // discard it
continue; // so that the next iteration will start with data
}
process(buffer.slice(0, pos + 1)); // hand off the line
buffer = buffer.slice(pos + 2); // and slice the processed data off the buffer
}
}
// Function to call on each data chunk (in this case each JSON object)
function process(line) { // here's where we do something with a line
if (line[line.length - 1] == ',') line = line.substr(0, line.length - 1); // discard ','
if (line.length > 0) { // ignore empty lines
var section = JSON.parse(line); // parse the JSON
/*
* Call our promise chain to progressively normalize the data
*/
normalizeRecord(section, count);
}
}
function normalizeRecord (record, count) {
findOrCreateSchool(record)
.then(findOrCreateMajor)
.then(findOrCreateCourse)
.then(findOrCreateProfessor)
.then(function (record) {
var deferred = Q.defer();
/*
* By this point all raw data in the record has been replaced by simple row ID's for easy joining later
*/
setTimeout(function () {
prepared.run([null, record.start, record.end, record.meets, record.professor, record.course ], function (error) {
if (error) deferred.reject(error);
else {
++count;
deferred.resolve(record);
}
});
}, 10);
return deferred.promise;
})
.catch(function (error) {
console.log(error)
})
.done();
}
function findOrCreateSchool(record, callback) {
var deferred = Q.defer();
db.run('INSERT INTO schools(name) SELECT $name WHERE NOT EXISTS(SELECT 1 FROM schools WHERE name = $name);', {
$name: record.school
}, function(error) {
if (error) deferred.reject(error);
/*
* If we just created the school use it's returned ID, otherwise run query to find correct ID
*/
if (this.changes > 0) {
record.school = this.lastID; // ID of created or found record (hopefully)
deferred.resolve(record);
} else {
db.get('SELECT * FROM schools WHERE name = $name', { $name: record.school }, function (error, row) {
if (error || !row) return deferred.reject(error || 'Row missing :(');
record.school = row.id;
deferred.resolve(record);
});
}
});
// Allow for promises or callbacks
deferred.promise.nodeify(callback); // callback is passes results of deferred.reject and deferred.resolve in respectively
return deferred.promise;
}
function findOrCreateMajor(record, callback) {
var deferred = Q.defer();
db.run('INSERT INTO majors(abbreviation, _school) SELECT $abbreviation, $_school WHERE NOT EXISTS(SELECT 1 FROM majors WHERE abbreviation = $abbreviation AND _school = $_school)', {
$abbreviation: record.major,
$_school: record.school
}, function(error) {
if (error) deferred.reject(error);
/*
* If we just created the major use it's returned ID, otherwise run query to find correct ID
*/
if (this.changes > 0) {
record.major = this.lastID; // ID of created or found record (hopefully)
deferred.resolve(record);
} else {
db.get('SELECT * FROM majors WHERE abbreviation = $abbreviation AND _school = $school', { $abbreviation: record.major, $school: record.school }, function (error, row) {
if (error || !row) return deferred.reject(error || 'Row missing :(');
record.major = row.id;
deferred.resolve(record);
});
}
});
// Allow for promises or callbacks
deferred.promise.nodeify(callback); // callback is passes results of deferred.reject and deferred.resolve in respectively
return deferred.promise;
}
function findOrCreateCourse(record, callback) {
var deferred = Q.defer();
db.run('INSERT INTO courses(id, name, courseID, level, credits, averageGrade, _school, _major) SELECT $id, $courseID, $name, $level, $credits, $averageGrade, $_school, $_major WHERE NOT EXISTS(SELECT 1 FROM courses WHERE courseID = $courseID AND _school = $_school)', {
$id: null,
$courseID: record.course, // Have way to many unique
$name: record.name,
$level: record.level,
$credits: record.credits,
$averageGrade: record.averageGrade,
$_school: record.school,
$_major: record.major
}, function(error) {
if (error) deferred.reject(error);
/*
* If we just created the course use it's returned ID, otherwise run query to find correct ID
*/
if (this.changes > 0) {
record.course = this.lastID; // ID of created or found record (hopefully)
deferred.resolve(record);
} else {
db.get('SELECT * FROM courses WHERE name = $name AND _school = $school', { $name: record.course, $school: record.school }, function (error, row) {
if (error || !row) {
console.log('-----------------------------')
console.log(record.school)
console.log(record.course)
return deferred.reject(error || 'Row missing :(');
}
record.course = row.id;
deferred.resolve(record);
});
}
});
// Allow for promises or callbacks
deferred.promise.nodeify(callback); // callback is passes results of deferred.reject and deferred.resolve in respectively
return deferred.promise;
}
function findOrCreateProfessor(record, callback) {
var deferred = Q.defer();
db.run('INSERT INTO professors(name, _school) SELECT $name, $_school WHERE NOT EXISTS(SELECT 1 FROM professors WHERE name = $name AND _school = $_school);', {
$name: record.professor,
$_school: record.school
}, function(error) {
if (error) deferred.reject(error);
/*
* If we just created the professor use it's returned ID, otherwise run query to find correct ID
*/
if (this.changes > 0) {
record.professor = this.lastID; // ID of created or found record (hopefully)
deferred.resolve(record);
} else {
db.get('SELECT * FROM professors WHERE name = $name AND _school = $school', { $name: record.professor, $school: record.school }, function (error, row) {
if (error || !row) deferred.reject(error || 'Row missing :(');
record.professor = row.id;
deferred.resolve(record);
});
}
});
// Allow for promises or callbacks
deferred.promise.nodeify(callback); // callback is passes results of deferred.reject and deferred.resolve in respectively
return deferred.promise;
}
@timkendall
Copy link
Author

Don't know why syntax highlighting won't work with a file name...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment