Skip to content

Instantly share code, notes, and snippets.

@calvinmetcalf
Last active January 3, 2016 19:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save calvinmetcalf/8509609 to your computer and use it in GitHub Desktop.
Save calvinmetcalf/8509609 to your computer and use it in GitHub Desktop.
"use strict";
var PouchDB = require('pouchdb');
var sax = require('sax');
var fs = require('fs');
var Promise = require('lie');
var co = require('co');
// Court collections to import. Each entry doubles as the directory name that
// is read from ('./<name>/documents') and, with underscores turned into
// spaces, as the `court` label stored on every imported document.
var names = [
  'department_of_industrial_accidents',
  'appellate_court_opinions',
  'district_court_appellate_division_opinions',
  'superior_court_opinions'
];
/**
 * Extract the text of every depth-2 element (the children of the outermost
 * element) from an XML string.
 *
 * The parser is only used to learn where each depth-2 element starts and
 * ends; the text itself is sliced straight out of `xml`, so nested markup is
 * kept verbatim except for the inline formatting tags stripped below.
 *
 * @param {string} xml - Complete XML document text.
 * @returns {Promise<Object>} Resolves with an object keyed by lower-cased tag
 *   name. A value is a trimmed string, or, when the cleaned text still
 *   contains a newline, an array of its trimmed lines. If a tag name occurs
 *   more than once at depth 2, the last occurrence wins. Rejects with
 *   whatever the parser hands to its `onerror` hook.
 */
function parseXML(xml) {
  var inlineTags = /<\/?(?:center|italic|citation|bold|underline)>/g;
  var hasNewline = /\n/;

  // Turn one recorded {start, end} span into its cleaned-up value.
  function extract(span) {
    var raw = xml.slice(span.start, span.end);
    var text = raw.replace(inlineTags, "").trim().replace(/\n\n/g, "\n");
    if (!hasNewline.exec(text)) {
      return text;
    }
    return text.split('\n').map(function (line) {
      return line.trim();
    });
  }

  return new Promise(function (resolve, reject) {
    var parser = sax.parser();
    var depth = 0;
    var spans = {};

    parser.onopentag = function (node) {
      depth += 1;
      if (depth !== 2) {
        return;
      }
      // Assumes parser.position is the offset just past the opening tag.
      spans[node.name.toLowerCase()] = { start: parser.position };
    };

    parser.onclosetag = function () {
      if (depth === 2) {
        var closing = parser.tag.name;
        // Step back over "</" + name + ">" to land on the end of the content.
        spans[closing.toLowerCase()].end = parser.position - (closing.length + 3);
      }
      depth -= 1;
    };

    parser.onend = function () {
      var fields = Object.keys(spans).reduce(function (acc, key) {
        acc[key] = extract(spans[key]);
        return acc;
      }, {});
      resolve(fields);
    };

    parser.onerror = reject;
    parser.write(xml).close();
  });
}
/**
 * Import driver.
 *
 * Opens the local 'names' PouchDB database, starts a continuous replication
 * to http://127.0.0.1:5984/cases, then walks every directory listed in
 * `names`, parses each file under './<name>/documents' with parseXML and
 * stores the result as one document.
 *
 * Document ids come from the `reporter_caption` field when present, otherwise
 * from the file name minus its last four characters. When a put fails, the
 * document is retried once under the same id with a " dupN" suffix.
 *
 * The final callback logs the error (if any) and the per-court summary
 * strings.
 */
co(function* () {
  // Callback-style open: co supplies the callback to the bound function and
  // resumes with the database handle.
  var db = yield PouchDB.bind(null, 'names');
  // Started but not awaited; the value returned by replicate.to is discarded.
  db.replicate.to('http://127.0.0.1:5984/cases', {continuous: true});
  var out = [];
  // Shared across all courts so every " dupN" suffix is distinct.
  var dupNum = 0;
  for (var i2 = 0, len2 = names.length; i2 < len2; i2++) {
    let which = names[i2];
    let whichName = which.split("_").join(" ");
    console.log(whichName);
    out[i2] = yield co(function* () {
      var baseDir = './' + which;
      var inDir = baseDir + "/documents";
      var files = yield fs.readdir.bind(fs, inDir);
      var len = files.length;
      for (var i = 0; i < len; i++) {
        let file = yield fs.readFile.bind(fs, inDir + '/' + files[i], {encoding: 'utf8'});
        let json = yield parseXML(file);
        let caption = json.reporter_caption;
        if (caption) {
          // parseXML returns multi-line values as arrays, but a document id
          // has to be a string. Join the lines instead of letting the first
          // put fail and the fallback coerce the array into "a,b dupN".
          json._id = Array.isArray(caption) ? caption.join(' ') : caption;
        } else {
          // Assumes a three-letter extension such as ".xml".
          json._id = files[i].slice(0, -4);
        }
        json.court = whichName;
        try {
          yield db.put.bind(db, json);
        } catch (e) {
          // A 409 conflict is the expected "id already taken" case. Any other
          // failure is still retried (best effort, as before) but is reported
          // so that it is no longer mistaken for a duplicate.
          if (!e || e.status !== 409) {
            console.error('put failed', json._id, e);
          }
          try {
            json._id = json._id + " dup" + dupNum++;
            yield db.put.bind(db, json);
            console.log('ok', json._id);
          } catch (e2) {
            console.error('not ok', json._id, e2);
          }
        }
      }
      return 'done ' + len + " for " + whichName;
    });
  }
  return out;
})(function (err, done) {
  console.log(err, done);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment