Skip to content

Instantly share code, notes, and snippets.

@thisismattmiller
Last active December 19, 2017 23:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thisismattmiller/3b594b9e462c285bb38d54e4907e406d to your computer and use it in GitHub Desktop.
Save thisismattmiller/3b594b9e462c285bb38d54e4907e406d to your computer and use it in GitHub Desktop.
const highland = require("highland")
const fs = require("fs")
// Input file: the DPLA extract that the pipeline below streams line by line.
const path_to_dpla_extract = "all.dec.2017.json"

// Output file: receives one JSON document ({id, doc}) per line.
const path_to_dpla_output = "all_docs_simple"

// Keys of `_source.sourceResource` whose values are folded into the `doc`
// text of each output record, in the order they are concatenated.
//
// Keys that are deliberately left out (kept here for reference):
//   extent, @id, identifier, publisher, rights, collection,
//   stateLocatedIn, hasType, isPartOf
const extractFrom = [
  'title',
  'language',
  'format',
  'date',
  'type',
  'specType',
  'spatial',
  'subject',
  'relation',
  'creator',
  'contributor',
  'description',
  'temporal',
  'alternative',
  'genre'
]
// Stream the extract line by line, reduce every record to {id, doc} and write
// the results to the output file as one JSON document per line.
const out = fs.createWriteStream(path_to_dpla_output)

// Number of parsed records seen so far; only used for the progress display.
let counter = 0

/**
 * Turn one raw line of the extract into a string that JSON.parse accepts.
 *
 * `:null` property values are rewritten to empty strings so that the text
 * "null" never ends up in the extracted content, and a single leading
 * ',', '[' or ']' (array punctuation around the per-line records) is removed.
 *
 * @param {string} line - One line of the input file.
 * @returns {string} The cleaned line; empty when the line held only punctuation.
 */
function cleanLine(line) {
  const text = line.trim()
    .replace(/:null,/g, ':"",')
    .replace(/:null}/g, ':""}')
  const first = text.charAt(0)
  if (first === ',' || first === '[' || first === ']') {
    return text.substring(1).trim()
  }
  return text
}

/**
 * Depth-first walk over `value`, appending the string form of every property
 * (followed by a space) to `pieces`.
 *
 * Containers are appended too: plain objects show up as "[object Object]"
 * (removed later by the caller) and arrays as their comma-joined string.
 * NOTE(review): because of that, array elements appear twice in the output,
 * once inside the joined string and once on their own. This is kept as-is so
 * the generated documents do not change; confirm whether it is intended.
 *
 * @param {*} value - Array or object to walk; primitives and null yield nothing.
 * @param {string[]} pieces - Accumulator that receives the text fragments.
 * @returns {string[]} The same `pieces` array.
 */
function collectText(value, pieces) {
  // `const` keeps the loop variable local; it must not be shared between
  // the recursive invocations.
  for (const key in value) {
    const child = value[key]
    pieces.push(child + " ")
    if (typeof child === 'object') {
      collectText(child, pieces)
    }
  }
  return pieces
}

/**
 * Build the output line for one parsed record.
 *
 * @param {*} record - Parsed JSON value of one input line.
 * @returns {string|undefined} `{"id":..., "doc":...}` followed by a newline,
 *   or undefined when the record has no `_source.originalRecord` or no
 *   `_source.sourceResource` (such records are dropped by the caller).
 */
function simplifyRecord(record) {
  const source = record && record._source
  // sourceResource is what is actually read below, so it has to be checked as
  // well; otherwise a record without it would throw and abort the whole run.
  if (!source || !source.originalRecord || !source.sourceResource) {
    return undefined
  }

  const pieces = []
  extractFrom.forEach((key) => {
    const value = source.sourceResource[key]
    if (!value) {
      return
    }
    if (typeof value === 'string') {
      pieces.push(value + " ")
    } else {
      collectText(value, pieces)
    }
  })

  const content = pieces.join('')
    // Left behind by stringifying plain objects in collectText.
    .replace(/\[object Object\]/g, '')
    .replace(/\n\n/g, '\n')
    // Blank out anything starting with "http" up to the next whitespace,
    // comma or period.
    .replace(/http.*?\s|http.*?,|http.*?\./ig, ' ')

  return JSON.stringify({id: source.id, doc: content}) + "\n"
}

highland(fs.createReadStream(path_to_dpla_extract))
  .split()
  .compact()
  .map((line) => cleanLine(line))
  // Lines that consisted only of array punctuation are now empty.
  .compact()
  .map(JSON.parse)
  .map((record) => {
    process.stdout.write('Progress: '+ counter++ +'\r');
    return simplifyRecord(record)
  })
  // Drop the records simplifyRecord skipped.
  .compact()
  .pipe(out)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment