Skip to content

Instantly share code, notes, and snippets.

@derms
Last active February 27, 2020 15:25
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save derms/fb66072747ef5f744d685029bac90804 to your computer and use it in GitHub Desktop.
Save derms/fb66072747ef5f744d685029bac90804 to your computer and use it in GitHub Desktop.
Example of using ONNX Machine Learning Models in MarkLogic 10 (10.0-4 and above). BiDAF model: https://github.com/onnx/models/tree/master/text/machine_comprehension/bidirectional_attention_flow.
'use strict';
// One-off setup script: fetch the BiDAF ONNX model over HTTP and store it in
// the database. A loader such as mlcp would normally be used instead; this
// only has to run once.
declareUpdate();

// Where the model is fetched from and the URI it is stored under.
const MODEL_SOURCE_URL = "https://onnxzoo.blob.core.windows.net/models/opset_9/bidaf/bidaf.onnx";
const MODEL_URI = '/onnx/model/bidaf.onnx';

// Only the second item of the sequence returned by xdmp.httpGet is kept
// (presumably the response body, with response metadata as the first item).
const modelResponse = xdmp.httpGet(MODEL_SOURCE_URL);
const model = fn.subsequence(modelResponse, 2, 1);

// Insert the model document and place it in the /onnx/model collection.
xdmp.documentInsert(MODEL_URI, model, {collections:['/onnx/model']});
'use strict';
// One-off setup script: fetch the numjs helper library and store it in the
// modules database. A deployment tool such as gradle/ml-gradle would normally
// be used instead; this only has to run once.

// Where the library is fetched from and the module URI it is stored under.
const LIBRARY_SOURCE_URL = "https://cdn.jsdelivr.net/gh/nicolaspanel/numjs@0.15.1/dist/numjs.js";
const LIBRARY_URI = "/lib/numjs.sjs";

// NOTE(review): verifyCert:false switches off TLS certificate verification for
// this download, and the downloaded text is installed as module code —
// confirm this is acceptable before using it outside a demo environment.
const libraryResponse = xdmp.httpGet(LIBRARY_SOURCE_URL, {verifyCert:false});

// Only the second item of the returned sequence is kept
// (presumably the response body, with response metadata as the first item).
const lib = fn.subsequence(libraryResponse, 2, 1);

// The insert is invoked against the modules database rather than the
// database this script itself runs against.
const insertLibrary = () => {
  declareUpdate();
  xdmp.documentInsert(LIBRARY_URI, lib);
};
xdmp.invokeFunction(insertLibrary, {database:xdmp.modulesDatabase()});
'use strict';
//numpy for javascript
const nj = require("/lib/numjs.sjs")
/**
 * Splits a string into lower-cased word tokens using cts.tokenize.
 * Only tokens that are instances of cts.word are kept; every other
 * token returned by cts.tokenize is discarded.
 *
 * @param {string} str the string to tokenize
 * @return {Array} the lower-cased word tokens, in their original order
 */
function tokenize(str) {
  const wordTokens = [];
  for (const token of cts.tokenize(str).toArray()) {
    if (token instanceof cts.word) {
      wordTokens.push(String(token).toLowerCase());
    }
  }
  return wordTokens;
}
/**
 * Flattens an Array by exactly one level: elements that are arrays are
 * expanded in place, everything else is carried over unchanged.
 *
 * @param {Array} arr the array to flatten
 * @return {Array} a new, one-level-flattened array
 */
function flattenArray(arr) {
  let flattened = [];
  arr.forEach((item) => {
    // concat expands an array argument by one level and appends anything else as-is
    flattened = flattened.concat(item);
  });
  return flattened;
}
/**
 * Pads an Array up to a given size. The result starts with the values of
 * the source array and is filled out to the requested size with padValue.
 * A source array that already has at least `size` elements is returned
 * as-is (the same array object; it is not copied or shortened).
 *
 * @param {Array} arr the source array to use for the initial values
 * @param {number} size the size of the desired array
 * @param {string} padValue value to use to pad the array
 * @return {Array} the padded array
 */
function padArray(arr,size,padValue) {
  const shortfall = size - arr.length;
  if (shortfall <= 0) {
    return arr;
  }
  const padding = Array(shortfall).fill(padValue);
  return [...arr].concat(padding);
}
/**
 * Transforms text into the two inputs used by the model: a word array and
 * a character array.
 *
 * The text is tokenized with tokenize(). The word input is the token list
 * reshaped to [-1,1]. The character input holds, for every token, exactly
 * 16 character slots: tokens longer than 16 characters are truncated and
 * shorter ones are padded with '' so that the flattened character list can
 * always be reshaped to [-1,1,1,16].
 *
 * @param {string} text the text to transform
 * @return {Array} two-element Array: [word NdArray, character NdArray]
 */
function preprocess(text) {
  // Number of character slots per token; must match the last dimension of the reshape below.
  const CHARS_PER_TOKEN = 16;
  const tokens = tokenize(text);
  const words = nj.reshape(tokens, [-1,1]); // 2d NdArray, one token per row
  const charRows = tokens
    // cut over-long tokens down first, otherwise the flattened length is no
    // longer a multiple of CHARS_PER_TOKEN and the reshape below breaks
    .map(token=>token.split('').slice(0, CHARS_PER_TOKEN))
    // pad short tokens with '' up to CHARS_PER_TOKEN entries
    .map(arr=>padArray(arr,CHARS_PER_TOKEN,''));
  const flatChars = flattenArray(charRows); // 1d array of tokens.length * CHARS_PER_TOKEN entries
  const chars = nj.reshape(flatChars, [-1,1,1,CHARS_PER_TOKEN]); // 4d NdArray from the 1d array
  return [words,chars];
}
/**
 * Executes the model for one context/query pair and returns the answer text.
 *
 * The word and character arrays of both inputs are passed to ort.run as
 * string tensors. The model's "start_pos" and "end_pos" outputs are then
 * used as an inclusive range of positions into the context word data, and
 * the words in that range are joined with single spaces.
 *
 * @param {Object} session the ONNX Run Time (ort) session to execute against
 * @param {Object} contextInput the context inputs into the model ([words, chars])
 * @param {Object} queryInput the query inputs into the model ([words, chars])
 * @return {string} the response to the query
 */
function executeModel(session,contextInput,queryInput) {
  const [cw,cc] = contextInput;
  const [qw,qc] = queryInput;
  const answer = ort.run(session,{
    "context_word":ort.string(cw.selection.data, cw.selection.shape),
    "context_char":ort.string(cc.selection.data, cc.selection.shape),
    "query_word":ort.string(qw.selection.data, qw.selection.shape),
    "query_char":ort.string(qc.selection.data, qc.selection.shape),
  });
  // Convert an output value to a plain number. When ort.valueGetArray yields
  // an array, its first element is the position. Comparing the arrays
  // directly would compare their string forms ("9" <= "10" is false) and
  // silently produce an empty answer.
  const toPosition = (value) => {
    const extracted = ort.valueGetArray(value);
    return Number(Array.isArray(extracted) ? extracted[0] : extracted);
  };
  const startPos = toPosition(answer["start_pos"]);
  const endPos = toPosition(answer["end_pos"]);
  const result = [];
  for (let i=startPos;i<=endPos;i++) {
    result.push(cw.selection.data[i]);
  }
  return result.join(" ");
}
/**
 * Asks a list of questions about a piece of text using the BiDAF
 * (bidirectional attention flow) model stored at /onnx/model/bidaf.onnx.
 *
 * https://github.com/onnx/models/tree/master/text/machine_comprehension/bidirectional_attention_flow.
 *
 * The context is preprocessed once and a single ort session is created;
 * both are reused for every question.
 *
 * @param {string} context the text that provides the context to the questions
 * @param {Array} questions the string array of questions to ask
 * @return {Array} one {question, answer} object per question, in input order
 */
function askQuestions(context,questions) {
  const contextInput = preprocess(context);
  const queryInputs = questions.map((question) => preprocess(question));
  const session = ort.session(cts.doc("/onnx/model/bidaf.onnx"));
  const answers = queryInputs.map((queryInput) => executeModel(session, contextInput, queryInput));
  // Pair every question with the answer computed at the same position.
  return Array.from({ length: questions.length }, (_, idx) => ({
    question: questions[idx],
    answer: answers[idx],
  }));
}
// Example run: ask a handful of questions about a short passage.
// The passage is taken from the MarkLogic Wikipedia page https://en.wikipedia.org/wiki/MarkLogic
// Other sample datasets (context and questions) - https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev
const context = `MarkLogic was first named Cerisent and was founded in 2001 by Christopher Lindblad, who was the Chief Architect of the Ultraseek search engine at Infoseek, and Paul Pedersen, a professor of computer science at Cornell University and UCLA, and Frank R. Caufield, Founder of Darwin Ventures,[8] to address shortcomings with existing search and data products. The product first focused on using XML document markup standard and XQuery as the query standard for accessing collections of documents up to hundreds of terabytes in size.
In 2009 IDC mentioned MarkLogic in a report as one of the top Innovative Information Access Companies with under $100 million in revenue.
In May 2012, Gary Bloom was appointed as Chief Executive Officer. He held senior positions at Symantec Corporation, Veritas Software, and Oracle.`;

// Questions whose answers can be found in the passage above.
const questions = [
  'When was MarkLogic founded?',
  'Who founded MarkLogic?',
  'What did MarkLogic first focus on?',
  'Who is the Chief Executive Officer?',
  'When was the Chief Executive Officer appointed?',
  'Which report mentioned MarkLogic?',
];

// The value of this last expression is the result of the script.
askQuestions(context, questions);
// sample results
/*
[
{
"question":"When was MarkLogic founded?",
"answer":"2001"
},
{
"question":"Who founded MarkLogic?",
"answer":"christopher lindblad"
},
{
"question":"What did MarkLogic first focus on?",
"answer":"xml document markup standard and xquery"
},
{
"question":"Who is the Chief Executive Officer?",
"answer":"gary bloom"
},
{
"question":"When was the Chief Executive Officer appointed?",
"answer":"2012"
},
{
"question":"Which report mentioned MarkLogic?",
"answer":"idc"
}
]
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment