Skip to content

Instantly share code, notes, and snippets.

@blahah
Created June 30, 2016 11:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save blahah/bb8ee2c7695180f425f443f88fa16b79 to your computer and use it in GitHub Desktop.
Save blahah/bb8ee2c7695180f425f443f88fa16b79 to your computer and use it in GitHub Desktop.
Use the unified / rehype / retext ecosystem to extract enriched keywords from each article in a corpus of JATS XML articles.
var fs = require('fs')
var unified = require('unified')
var parseHTML = require('rehype-parse')
var visit = require('unist-util-visit')
var findAfter = require('unist-util-find-after')
// Path of the article XML file to process: the first command-line argument
// when one is given, otherwise the original hard-coded article path.
var file = process.argv[2] || '/Users/rds45/.sciencefair/data/elife_dws2/articles/01096/elife-01096-v1.xml'
/**
 * unified plugin (attacher) that narrows the syntax tree to the article body.
 *
 * The returned transformer finds the first `article` element in the tree and,
 * inside it, the first `body` element, and returns that `body` node so that it
 * becomes the tree handed to the rest of the pipeline.
 *
 * Both lookups walk the tree in document order and include first children and
 * nested descendants, so the match is found even when it is the first child of
 * its parent or is wrapped in other elements.
 *
 * @returns {Function} transformer `(ast, file)` returning the `body` node
 * @throws {Error} (from the transformer) when the tree has no `article`
 *   element, or the `article` has no `body` element
 */
var getBody = function () {
  // Depth-first, document-order search for the first element named `tagName`
  // at or below `node`; returns null when there is none.
  function findElement (node, tagName) {
    if (!node) return null
    if (node.type === 'element' && node.tagName === tagName) return node
    var children = node.children || []
    for (var i = 0; i < children.length; i++) {
      var match = findElement(children[i], tagName)
      if (match) return match
    }
    return null
  }

  return function (ast, file) {
    var article = findElement(ast, 'article')
    if (!article) {
      throw new Error('No <article> element found in the document')
    }
    var body = findElement(article, 'body')
    if (!body) {
      throw new Error('No <body> element found inside <article>')
    }
    return body
  }
}
/**
 * unified plugin (attacher) that installs a plain-text compiler on the
 * processor it is attached to.
 *
 * The installed compiler concatenates the `value` of every `text` node in the
 * tree, in the order `visit` reports them, with no separator.
 *
 * @param {Object} processor - processor whose `Compiler` property is set
 * @param {Object} [config] - accepted for the plugin signature; not read here
 */
var toPlaintext = function (processor, config) {
  /* Constructor: remembers the file, the options and the processor's data. */
  function Compiler (file, options, processor) {
    this.options = options
    this.data = processor.data
    this.file = file
  }

  /* Compile a tree by gathering all of its text-node values into one string. */
  Compiler.prototype.compile = function compile (root) {
    var chunks = []
    var collect = function (textNode) {
      chunks.push(textNode.value)
    }
    visit(root, 'text', collect)
    return chunks.join('')
  }

  /* Expose the compiler on the processor. */
  processor.Compiler = Compiler
}
// Stream the article through a unified processor: parse the markup with
// rehype-parse, narrow the tree to the article body (getBody), compile it to
// plain text (toPlaintext) and write the result to standard output.
// NOTE(review): this relies on the unified processor being usable as a stream
// (`.pipe()` in and out) — confirm for the installed unified version.
fs.createReadStream(file, 'utf8')
  .pipe(unified())
  .use(parseHTML)
  .use(getBody)
  .use(toPlaintext)
  // process.stdout is a stream object, not a function: pass it, do not call it
  .pipe(process.stdout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment