Created
June 30, 2016 11:16
-
-
Save blahah/bb8ee2c7695180f425f443f88fa16b79 to your computer and use it in GitHub Desktop.
Use the unified / rehype / retext ecosystem to extract enriched keywords from each article in a corpus of JATS XML articles.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fs = require('fs')
var unified = require('unified')
var parseHTML = require('rehype-parse')
var visit = require('unist-util-visit')
var findAfter = require('unist-util-find-after')

// Article to process: the first command-line argument when given, otherwise
// the original sample path.
var file = process.argv[2] || '/Users/rds45/.sciencefair/data/elife_dws2/articles/01096/elife-01096-v1.xml'
/**
 * Plugin factory: narrows the syntax tree to the `body` element of an article.
 *
 * The returned transformer looks for an `article` element among the direct
 * children of the tree it is given, then for a `body` element among the
 * direct children of that article.
 *
 * @returns {Function} transformer `(ast, file)` that returns the `body`
 *   element node, or `null` when the article has no `body` child.
 * @throws {Error} (from the transformer) when the tree has no `article`
 *   element among its direct children.
 */
var getBody = function () {
  // Return the first direct child of `parent` that is an element with the
  // given tag name, or null. Every child is considered, including the first.
  function findChildElement (parent, tagName) {
    var children = (parent && parent.children) || []
    var match = children.find(function (node) {
      return node.type === 'element' && node.tagName === tagName
    })
    return match || null
  }

  return function (ast, file) {
    var article = findChildElement(ast, 'article')
    if (!article) {
      // Fail with a clear message instead of a TypeError on `article.children`.
      throw new Error('getBody: no <article> element found among the root children')
    }
    return findChildElement(article, 'body')
  }
}
/**
 * Plugin: installs a compiler that serialises a syntax tree to plain text.
 *
 * Assigns a `Compiler` constructor to `processor.Compiler`. Its `compile`
 * method concatenates, in visiting order, the `value` of every `text` node
 * in the tree, with no separator.
 *
 * @param {Object} processor - processor that receives the `Compiler`.
 * @param {Object} [config] - accepted for the plugin signature; not used.
 */
var toPlaintext = function (processor, config) {
  /**
   * Holds the context a compile run is constructed with.
   *
   * @param {*} file - file being compiled.
   * @param {Object} options - compiler options, stored as given.
   * @param {Object} processor - processor whose `data` is kept on the instance.
   */
  function Compiler (file, options, processor) {
    this.options = options
    this.data = processor.data
    this.file = file
  }

  /**
   * Collect the text of `tree`.
   *
   * @param {Object} tree - node to serialise.
   * @returns {string} concatenated values of all visited `text` nodes.
   */
  Compiler.prototype.compile = function compile (tree) {
    var parts = []
    visit(tree, 'text', function (node) {
      parts.push(node.value)
    })
    return parts.join('')
  }

  processor.Compiler = Compiler
}
// Build the processor first so every plugin is registered before any data
// is piped through it.
var processor = unified()
  .use(parseHTML)
  .use(getBody)
  .use(toPlaintext)

// Stream the article through the processor and print the resulting text.
// `process.stdout` is a writable stream object, not a function, so it is
// passed to `pipe` as-is rather than being called.
// NOTE(review): like the original pipeline, this assumes the installed
// unified version exposes a stream interface on the processor — confirm.
fs.createReadStream(file, 'utf8')
  .pipe(processor)
  .pipe(process.stdout)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment