// Last active: December 19, 2017 23:08
// Save thisismattmiller/3b594b9e462c285bb38d54e4907e406d to your computer and use it in GitHub Desktop.
// This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
// Learn more about bidirectional Unicode characters
const highland = require("highland");
const fs = require("fs");

// Path of the DPLA extract that the script reads line by line.
const path_to_dpla_extract = "all.dec.2017.json";
// Path of the file that receives one JSON document per line.
const path_to_dpla_output = "all_docs_simple";
// Names of the `sourceResource` fields whose text is copied into each output
// document. Commented-out entries are fields that are currently skipped; they
// are kept here so they can be re-enabled by uncommenting.
const extractFrom = [
  'title',
  // 'extent',
  'language',
  'format',
  // '@id',
  'date',
  'type',
  // 'identifier',
  // 'publisher',
  'specType',
  'spatial',
  'subject',
  'relation',
  'creator',
  'contributor',
  'description',
  'temporal',
  // 'rights',
  // 'collection',
  // 'stateLocatedIn',
  // 'hasType',
  'alternative',
  'genre',
  // 'isPartOf'
];
// Write stream that receives the newline-delimited JSON documents.
const out = fs.createWriteStream(path_to_dpla_output);
// Number of parsed records seen so far; printed in the progress line.
let counter = 0;
// The former `allKeys=[]` assignment was removed: it created an implicit
// global that nothing in this script reads.
/**
 * Prepares one raw line of the extract for `JSON.parse`.
 *
 * Trims the line, rewrites `:null,` / `:null}` to empty strings and drops a
 * single leading `,`, `[` or `]` (presumably the extract is one JSON array
 * with one record per line — confirm against the input file).
 *
 * NOTE(review): the null rewrite works on the raw text, so it would also
 * alter a string value that happens to contain `:null,` or `:null}`.
 *
 * @param {string} rawLine - One line of the input file.
 * @returns {string} The cleaned line; may be empty, in which case the
 *   following `compact()` drops it.
 */
function normalizeLine(rawLine) {
  const line = rawLine
    .trim()
    .replace(/:null,/g, ':"",')
    .replace(/:null}/g, ':""}');
  const first = line.charAt(0);
  if (first === ',' || first === '[' || first === ']') {
    return line.substring(1).trim();
  }
  return line;
}

/**
 * Flattens a value into space-terminated text.
 *
 * Only primitive leaves are emitted. Arrays and objects are walked but never
 * stringified themselves, so nested arrays are no longer emitted twice and no
 * "[object Object]" placeholder is produced. `null`/`undefined` yield nothing.
 *
 * @param {*} value - A parsed JSON value (string, number, boolean, array, object, null).
 * @returns {string} Every primitive leaf, each followed by one space.
 */
function collectText(value) {
  if (value === null || value === undefined) {
    return '';
  }
  if (typeof value !== 'object') {
    return `${value} `;
  }
  let text = '';
  for (const key of Object.keys(value)) {
    text += collectText(value[key]);
  }
  return text;
}

/**
 * Builds the output line for one parsed record.
 *
 * @param {object} record - A parsed record; the fields listed in
 *   `extractFrom` are read from `record._source.sourceResource`.
 * @returns {string|undefined} `{"id":…,"doc":…}` followed by a newline, or
 *   `undefined` when the record lacks `_source.originalRecord` or
 *   `_source.sourceResource` (the following `compact()` drops it).
 */
function buildDocument(record) {
  const source = record && record._source;
  // sourceResource is dereferenced below, so it must be checked as well;
  // checking only originalRecord let such records throw a TypeError.
  if (!source || !source.originalRecord || !source.sourceResource) {
    return undefined;
  }

  let content = '';
  extractFrom.forEach((key) => {
    const value = source.sourceResource[key];
    // Falsy values (missing field, empty string, 0) contribute nothing.
    if (value) {
      content += collectText(value);
    }
  });

  // collectText never emits "[object Object]"; this only strips that literal
  // if it is already present in the input text.
  content = content.replace(/\[object Object\]/g, '').replace(/\n\n/g, '\n');
  // Blank out text from "http" (any case) through the next whitespace; the
  // comma/period alternatives only apply when no whitespace follows.
  content = content.replace(/http.*?\s|http.*?,|http.*?\./ig, ' ');

  return JSON.stringify({ id: source.id, doc: content }) + '\n';
}

// Read the extract line by line, parse each record and write one JSON
// document per line to `out`.
highland(fs.createReadStream(path_to_dpla_extract))
  .split()
  .compact()
  .map((line) => normalizeLine(line))
  .compact()
  .map(JSON.parse)
  .map((record) => {
    // Counts every parsed record, including ones that produce no document.
    process.stdout.write(`Progress: ${counter++}\r`);
    return buildDocument(record);
  })
  .compact()
  .pipe(out);
// Sign up for free to join this conversation on GitHub.
// Already have an account? Sign in to comment.