Skip to content

Instantly share code, notes, and snippets.

@thisismattmiller
Last active August 29, 2015 14:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thisismattmiller/f842a25cde30cc01c701 to your computer and use it in GitHub Desktop.
Save thisismattmiller/f842a25cde30cc01c701 to your computer and use it in GitHub Desktop.
var csv = require("fast-csv"),
fs = require("fs"),
readable = require('stream').Readable,
jsonStream = require('JSONStream'),
viaf = require("viaf-wrapper");
// Input: the CSV of archives names to look up.
var stream = fs.createReadStream("perscorp-collection.csv");
// Count of CSV rows seen so far; printed by the CSV error handler.
var line = 0

// Build one output pipeline for the given file path:
//   object-mode Readable -> JSONStream stringifier -> file opened with the 'a' flag.
// The Readable's _read is a no-op because records are supplied from outside
// with push(); push(null) ends the pipeline.
// Returns all three stream objects so each can keep its own top-level name.
// NOTE(review): the file is opened with 'a' while the stringifier is given
// "[" / "]" wrappers, so a second run would add another bracketed array after
// the existing content -- confirm that is intended.
var openJsonSink = function (path) {
  var source = new readable({objectMode: true})
  var outfile = fs.createWriteStream(path,{ 'flags': 'a'})
  var stringify = jsonStream.stringify("[\n",",\n","\n]\n")
  source._read = function () {}
  source.pipe(stringify).pipe(outfile)
  return { source: source, outfile: outfile, stringify: stringify }
}

// exact name hit, dates agree
var exactDateSink = openJsonSink("exact_dates_match.json")
var outStreamExactDateMatch = exactDateSink.source
var outStreamExactDateMatchOutfile = exactDateSink.outfile
var outStreamExactDateMatchStringify = exactDateSink.stringify

// exact name hit, dates do not agree
var exactNoDateSink = openJsonSink("exact_no_dates_match.json")
var outStreamExactNoDateMatch = exactNoDateSink.source
var outStreamExactNoDateMatchOutfile = exactNoDateSink.outfile
var outStreamExactNoDateMatchStringify = exactNoDateSink.stringify

// preferred-name hit, dates agree
var prefDateSink = openJsonSink("pref_dates_match.json")
var outStreamPrefDateMatch = prefDateSink.source
var outStreamPrefDateMatchOutfile = prefDateSink.outfile
var outStreamPrefDateMatchStringify = prefDateSink.stringify

// preferred-name hit, dates do not agree
var prefNoDateSink = openJsonSink("pref_no_dates_match.json")
var outStreamPrefNoDateMatch = prefNoDateSink.source
var outStreamPrefNoDateMatchOutfile = prefNoDateSink.outfile
var outStreamPrefNoDateMatchStringify = prefNoDateSink.stringify

// no hit from either search
var noMatchSink = openJsonSink("no_match.json")
var outStreamNoMatch = noMatchSink.source
var outStreamNoMatchOutfile = noMatchSink.outfile
var outStreamNoMatchStringify = noMatchSink.stringify
/**
 * Compare the years found in a VIAF record's birth/death dates with the
 * start/end years of a CSV row.
 *
 * @param {Object} record - VIAF result; `birthDate` and `deathDate` are read.
 *   Either may be missing or non-string, in which case that side is treated
 *   as "no date" instead of throwing.
 * @param {Array} data - CSV row; index 3 is parsed as the start year and
 *   index 4 as the end year.
 * @returns {boolean[]} `[bDateOkay, dDateOkay]`: bDateOkay is true when a
 *   birth year was found, a start year is present and birth < start;
 *   dDateOkay is true when a death year was found, an end year is present
 *   and death > end.
 */
var checkDates = function(record,data){
  // First run of four digits in the value as a base-10 integer, or null when
  // the value is absent or contains no such run.
  var firstYear = function(value){
    if (value === undefined || value === null) return null
    var found = String(value).match(/[0-9]{4}/)
    return found ? parseInt(found[0], 10) : null
  }

  // are there dates, at least birth or death
  var b = firstYear(record.birthDate)
  var d = firstYear(record.deathDate)
  var start = parseInt(data[3], 10)
  var end = parseInt(data[4], 10)
  var bDateOkay = false
  var dDateOkay = false

  // do we have a birth year and a start date (NaN and 0 both count as "no date")
  if (b !== null && start && b < start){
    console.log("\t\t","Was born before this reference.",b, "<",start)
    bDateOkay = true
  }

  // do we have a death year and an end date
  if (d !== null && end && d > end){
    console.log("\t\t","Was dead after this reference.",d, ">",end)
    dDateOkay = true
  }

  return [bDateOkay, dDateOkay]
}
// Build the output record for a CSV row paired with the VIAF hit under review.
var buildMatchRecord = function(data, hit){
  return {
    archivesId : data[0],
    archivesName : data[1],
    viafName : hit.heading,
    viafId : hit.viafId,
    viafURI : hit.primaryTopic
  }
}

// CSV parser: each row is looked up in VIAF and routed to one of the five
// output streams. The stream is paused while a row's lookups are in flight
// and resumed once they settle, so rows are handled one at a time.
var csvStream = csv()
.on("data", function(data){
  csvStream.pause()
  console.log(data[1]);
  line++
  //check if the exact name is in there
  viaf.searchNames(data[1], {operator:viaf.EXACT})
  .then(function(results){
    if (results.length !== 0){
      //there is an exact match, assume the first one is the one we want to check
      console.log("\tLooking at:",results[0].heading)
      var exactRecord = buildMatchRecord(data, results[0])
      var exactCheck = checkDates(results[0],data)
      //an exact name hit counts as a date match when either date agrees
      if (exactCheck[0] || exactCheck[1]){
        outStreamExactDateMatch.push(exactRecord)
      }else{
        outStreamExactNoDateMatch.push(exactRecord)
      }
      return
    }
    //there were no hits, let's try preferred headings instead; returning the
    //promise lets the handlers below cover this lookup too
    return viaf.searchPreferredName(data[1])
    .then(function(prefResults){
      //no match
      if (prefResults.length === 0){
        outStreamNoMatch.push({ archivesId : data[0], archivesName : data[1] })
        return
      }
      //match: assume the first one is the one we want to look at for now
      console.log("\tLooking at:",prefResults[0].heading)
      var prefRecord = buildMatchRecord(data, prefResults[0])
      var prefCheck = checkDates(prefResults[0],data)
      //NOTE(review): this branch requires BOTH dates to agree while the exact
      //branch accepts either one -- confirm the difference is intended
      if (prefCheck[0] && prefCheck[1]){
        outStreamPrefDateMatch.push(prefRecord)
      }else{
        outStreamPrefNoDateMatch.push(prefRecord)
      }
    })
  })
  .catch(function (error) {
    //covers a failure of either lookup and anything thrown while handling a
    //result; without this the stream would stay paused forever
    console.log("ERROR:",error)
  })
  .then(function(){
    //start the stream again, whether the row succeeded or failed
    csvStream.resume()
  })
})
.on("error", function(data){
  //report the parser error together with the number of rows read so far
  console.log(data,line);
})
.on("end", function(){
  //input exhausted: end every output stream
  outStreamExactDateMatch.push(null)
  outStreamExactNoDateMatch.push(null)
  outStreamPrefDateMatch.push(null)
  outStreamPrefNoDateMatch.push(null)
  outStreamNoMatch.push(null)
  console.log("Done")
});
stream.pipe(csvStream);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment