Last active
August 29, 2015 14:23
-
-
Save thisismattmiller/f842a25cde30cc01c701 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var csv = require("fast-csv"), | |
fs = require("fs"), | |
readable = require('stream').Readable, | |
jsonStream = require('JSONStream'), | |
viaf = require("viaf-wrapper"); | |
// Input: the CSV file of person/corporate names to reconcile against VIAF.
var stream = fs.createReadStream("perscorp-collection.csv");
// Count of CSV rows seen so far; reported by the csv "error" handler.
var line = 0

/**
 * Create an object-mode readable that serialises every pushed object into a
 * JSON array written to `filename`.
 *
 * Objects are fed in by calling `.push(obj)` on the returned stream and the
 * array is closed by calling `.push(null)`.
 *
 * NOTE(review): the file is opened with the append flag while each run writes
 * a complete "[ ... ]" wrapper, so running the script twice appears to leave
 * two arrays back to back in the same file - confirm this is intended.
 *
 * @param {string} filename - path of the JSON file to append to
 * @returns {Object} the object-mode Readable to push result records into
 */
var openJsonOutput = function(filename){
  var source = new readable({objectMode: true})
  var outfile = fs.createWriteStream(filename,{ 'flags': 'a'})
  var stringify = jsonStream.stringify("[\n",",\n","\n]\n")
  // Records are pushed in by hand, so there is nothing to do when a read is requested.
  source._read = function () {}
  source.pipe(stringify).pipe(outfile)
  return source
}

//setup our output files, one per match category
var outStreamExactDateMatch = openJsonOutput("exact_dates_match.json")
var outStreamExactNoDateMatch = openJsonOutput("exact_no_dates_match.json")
var outStreamPrefDateMatch = openJsonOutput("pref_dates_match.json")
var outStreamPrefNoDateMatch = openJsonOutput("pref_no_dates_match.json")
var outStreamNoMatch = openJsonOutput("no_match.json")
/**
 * Compare the life dates of a VIAF record with the date range of the
 * archival reference described by a CSV row.
 *
 * The first run of four digits found in `record.birthDate` / `record.deathDate`
 * is taken as the year. A missing or non-string date is treated as "no date"
 * instead of throwing.
 *
 * @param {Object} record - VIAF result; `birthDate` and `deathDate` are read
 * @param {Array} data - CSV row; data[3] is read as the start year and
 *   data[4] as the end year of the reference
 * @returns {boolean[]} `[bDateOkay, dDateOkay]` - bDateOkay is true when the
 *   birth year is before the start year; dDateOkay is true when the death
 *   year is after the end year. Each is false when either side is missing.
 */
var checkDates = function(record,data){
  // Pull the first four-digit year out of a date value, or null when there is none.
  var firstYear = function(value){
    if (value === null || value === undefined) return null
    var found = String(value).match(/[0-9]{4}/)
    return found ? parseInt(found[0], 10) : null
  }

  //are there dates, at least birth or death
  var b = firstYear(record.birthDate)
  var d = firstYear(record.deathDate)

  var start = parseInt(data[3], 10)
  var end = parseInt(data[4], 10)

  var bDateOkay = false
  var dDateOkay = false

  //do we have a birth year and a start date (NaN or 0 counts as no start date)
  if (b !== null && start){
    if (b < start){
      console.log("\t\t","Was born before this reference.",b, "<",start)
      bDateOkay = true
    }
  }

  //do we have a death year and an end date (NaN or 0 counts as no end date)
  if (d !== null && end){
    if (d > end){
      console.log("\t\t","Was dead after this reference.",d, ">",end)
      dDateOkay = true
    }
  }

  return [bDateOkay, dDateOkay]
}
// Build the output record for a CSV row and the VIAF hit chosen for it.
var buildMatchRecord = function(data, hit){
  return {
    archivesId : data[0],
    archivesName : data[1],
    viafName : hit.heading,
    viafId : hit.viafId,
    viafURI : hit.primaryTopic
  }
}

/**
 * CSV pipeline: for every row, look the name (data[1]) up in VIAF and push a
 * record onto one of the five output streams.
 *
 *  - exact name hit     -> exact_dates_match / exact_no_dates_match
 *  - preferred name hit -> pref_dates_match / pref_no_dates_match
 *  - no hit at all      -> no_match
 *
 * The stream is paused while a row is being looked up, so rows are handled
 * one at a time, and it is resumed once the lookup settles - whether it
 * succeeded or failed. Without that a rejected lookup would leave the stream
 * paused and the script would never finish.
 */
var csvStream = csv()
  .on("data", function(data){
    csvStream.pause()
    console.log(data[1]);
    line++

    //check if the exact name is in there
    viaf.searchNames(data[1], {operator:viaf.EXACT})
      .then(function(results){
        if (results.length === 0){
          //there were no hits, lets try preferred headings instead
          return viaf.searchPreferredName(data[1])
            .then(function(prefResults){
              //no match
              if (prefResults.length === 0){
                outStreamNoMatch.push({ archivesId : data[0], archivesName : data[1] })
                return
              }

              //match, assume the first one is the one we want to look at for now
              console.log("\tLooking at:",prefResults[0].heading)
              var prefRecord = buildMatchRecord(data, prefResults[0])
              var prefCheck = checkDates(prefResults[0],data)

              //a preferred-name hit needs BOTH dates to agree
              if (prefCheck[0] && prefCheck[1]){
                outStreamPrefDateMatch.push(prefRecord)
              }else{
                outStreamPrefNoDateMatch.push(prefRecord)
              }
            })
        }

        //there is an exact match, assume the first one is the one we want to check
        console.log("\tLooking at:",results[0].heading)
        var r = buildMatchRecord(data, results[0])
        var check = checkDates(results[0],data)

        //an exact-name hit only needs ONE of the dates to agree
        //NOTE(review): the preferred-name branch uses && here - confirm the difference is intended
        if (check[0] || check[1]){
          outStreamExactDateMatch.push(r)
        }else{
          outStreamExactNoDateMatch.push(r)
        }
      })
      .catch(function (error) {
        //covers both lookups and anything thrown while handling their results
        console.log("ERROR:",error)
      })
      .then(function(){
        //start the stream again, whatever happened to this row
        csvStream.resume()
      })
      .done()
  })
  .on("error", function(data){
    console.log(data,line);
  })
  .on("end", function(){
    //pushing null ends each readable so its JSON array gets closed
    outStreamExactDateMatch.push(null)
    outStreamExactNoDateMatch.push(null)
    outStreamPrefDateMatch.push(null)
    outStreamPrefNoDateMatch.push(null)
    outStreamNoMatch.push(null)
    console.log("Done")
  });

stream.pipe(csvStream);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment