Skip to content

Instantly share code, notes, and snippets.

@o0101
Last active November 1, 2017 07:55
Show Gist options
  • Save o0101/67c34151467bf34f111e571e62432ad8 to your computer and use it in GitHub Desktop.
Save o0101/67c34151467bf34f111e571e62432ad8 to your computer and use it in GitHub Desktop.
requirebin sketch
"use strict";
{
// Levenshtein edit distance from the fast-levenshtein package, under a local alias.
const edit_distance = require('fast-levenshtein').get;

// Sentence-delimiting marks, keyed by language code. Multi-character entries are
// split into their individual characters when SEN_BREAK is built below.
const SEN_MARK = {
  en: [ ".", "!", "?", "()", "[]" ],
  zh: [ "。", "「」", "!", "?", "()", "[]", "【】", "《》", "〈〉"],
  es: [ ".", "'", '"', ";", "¡!", "¿?", "“”", "‘’" ],
  hi: [ "|", ";", "?", "!", "”" ],
  ar: [ ".", "؟", ":", "“”" ]
};

// Whitespace characters and ranges, written as fragments of a regex character class.
const SPACE_MARK = [
  "\u0009-\u000d",
  "\u0020",
  "\u0085",
  "\u00a0",
  "\u2000-\u200b",
  "\u2028",
  "\u2029",
  "\u202f",
  "\u205f",
  "\u3000"
];

// One character anywhere in U+2E80..U+D7AF.
const ZH_PATTERNS = [
  "[\u2e80-\ud7af]"
];

// Two consecutive characters in U+0021..U+0D7F.
const NONZH_PATTERNS = [
  "[\u0021-\u0d7f][\u0021-\u0d7f]"
];

// Weights of the three components summed by similarity_score:
// I = relative position, T = text edit distance, W = shared emes.
const WEIGHT = {
  I: 0.333,
  T: 0.333,
  W: 0.333
};

const ZH_COMMON = new RegExp( ZH_PATTERNS.join(''), "g" );
const NONZH_COMMON = new RegExp( NONZH_PATTERNS.join(''), "g" );

// One or more whitespace characters in a row.
const WORD_BREAK = new RegExp( "[" + SPACE_MARK.join('') + "]+", "g" );

// One or more sentence marks in a row: every character of every mark, for every
// language, backslash-escaped and collected into a single character class.
const SEN_BREAK = new RegExp(
  "[" +
  Array.from( Object.values(SEN_MARK).map( marks => marks.join('') ).join('') )
    .map( char => "\\" + char )
    .join('') +
  "]+",
  "g"
);
console.log(SEN_BREAK);

// Public surface of the sketch.
const lib = { get_sentences, similarity_score, find_closest };

// Export through `module.exports` when that assignment works; when it throws,
// attach `lib` to `self` instead.
try {
  module.exports = lib;
}
catch (e) {
  Object.assign( self, { lib } );
}
/**
 * Cut a text into Sentence objects at every run of sentence marks.
 *
 * @param text String the text to split.
 * @return Array one Sentence per fragment, each carrying its position and the
 *         total number of fragments.
 */
function get_sentences(text) {
  const fragments = text.split( SEN_BREAK );
  const total = fragments.length;
  return fragments.map( (fragment, position) => new Sentence(fragment, position, total) );
}
/**
 * Find the sentence of `text` that scores highest against `sen`.
 *
 * The query is wrapped as a Sentence at index 0 with the same count as the
 * text's sentences. Intermediate results are written to the console.
 *
 * @param sen String the sentence to look for.
 * @param text String the text to search in.
 * @return Sentence the best-scoring sentence of `text`.
 */
function find_closest( sen, text ) {
  const sens = get_sentences( text );
  const query = new Sentence( sen, 0, sens.length );
  const scores = [];
  for ( const tsen of sens ) {
    scores.push({ score: similarity_score( query, tsen ), tsen });
  }
  console.log( sens, scores );
  // Highest score first.
  scores.sort( (a,b) => b.score - a.score );
  console.log( "top matches", scores.slice(0,5) );
  return scores[0].tsen;
}
/**
 * Score how alike two sentences are as a weighted sum of three components:
 *
 *  - position: 1 minus the gap between the sentences' relative positions
 *    (index / count);
 *  - text: 1 minus the edit distance divided by the combined text length;
 *  - emes: size of the intersection of the two eme sets over the size of their union.
 *
 * A component whose denominator is zero (two empty texts, or two empty eme sets)
 * is scored as a perfect match instead of evaluating 0/0. A NaN component would
 * make the whole score NaN and break ordering by score.
 *
 * @param sen1 Object with `text`, `emes` (Set), `index` and `count`.
 * @param sen2 Object with the same fields.
 * @return Number the weighted sum of the three components.
 */
function similarity_score(sen1,sen2) {
  const idiff = 1.0 - Math.abs(sen1.index/sen1.count - sen2.index/sen2.count);

  const combined_length = sen1.text.length + sen2.text.length;
  const tdiff = combined_length === 0
    ? 1.0
    : 1.0 - edit_distance(sen1.text,sen2.text)/combined_length;

  const union = union_size(sen1.emes,sen2.emes);
  const wdiff = union === 0
    ? 1.0
    : intersection_size(sen1.emes,sen2.emes)/union;

  return WEIGHT.I*idiff + WEIGHT.T*tdiff + WEIGHT.W*wdiff;
}
/**
 * A fragment of text together with its set of emes and its position among
 * the fragments of the text it came from.
 */
class Sentence {
  /**
   * @param s String the fragment's text.
   * @param index Integer position of the fragment.
   * @param count Integer total number of fragments.
   */
  constructor(s,index,count) {
    const tokens = get_emes(s);
    this.text = s;
    // A Set, so repeated emes count once.
    this.emes = new Set(tokens);
    this.index = index;
    this.count = count;
  }
}
/**
 * Number of members two sets have in common, by inclusion-exclusion:
 * |s| + |t| - |s ∪ t|.
 *
 * @param s Set the first set.
 * @param t Set the second set.
 * @return Integer size of the intersection.
 */
function intersection_size(s,t) {
  const combined = s.size + t.size;
  const distinct = union_size(s,t);
  return combined - distinct;
}
/**
 * Number of distinct members across two collections.
 *
 * @param s Iterable the first collection.
 * @param t Iterable the second collection.
 * @return Integer size of the union.
 */
function union_size(s,t) {
  const merged = new Set();
  for ( const member of s ) merged.add(member);
  for ( const member of t ) merged.add(member);
  return merged.size;
}
/**
 * Break a sentence into the units ("emes") that sentences are compared by:
 * single characters when lang() reports 'zh', whitespace-separated words otherwise.
 *
 * @param s String the sentence text.
 * @return Array the emes, in order of appearance.
 */
function get_emes(s) {
  if ( lang(s) === 'zh' ) {
    // Array.from walks the string by code point, so a character outside the BMP
    // stays in one piece; split('') would cut it into two lone surrogates.
    return Array.from(s);
  }
  return s.split(WORD_BREAK);
}
/**
 * Classify a string as 'zh' or 'nonzh'.
 *
 * A string is 'zh' when it matches ZH_COMMON and does not match NONZH_COMMON;
 * anything else is 'nonzh'.
 *
 * @param s String the text to classify.
 * @return String either 'zh' or 'nonzh'.
 */
function lang(s) {
  const has_zh = s.match( ZH_COMMON ) !== null;
  if ( !has_zh ) return 'nonzh';
  const has_nonzh = s.match( NONZH_COMMON ) !== null;
  return has_nonzh ? 'nonzh' : 'zh';
}
}
// Bundled build of the sketch: everything up to the closing `}, 0)` runs inside a
// zero-delay timer callback.
setTimeout(function(){
// Minified module-loader prelude. e(t, n, r) receives:
//   t - module table: name -> [factory, map of that module's dependency names]
//   n - cache of modules that have already been instantiated
//   r - list of entry modules to run immediately (an empty list in this bundle)
// The inner s(o, u) returns n[o].exports, instantiating module o on first use by
// calling its factory with a local require, the module object and its exports.
// A name missing from t is handed to a previously defined `require` function when
// there is one; otherwise an Error with code "MODULE_NOT_FOUND" is thrown.
// The loader s is what gets assigned to `require`; the table passed in here
// registers a single module, "fast-levenshtein".
;require=(function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s})({"fast-levenshtein":[function(require,module,exports){
// Body of the bundled "fast-levenshtein" module: builds a `Levenshtein` object with a
// single `get` method and publishes it through whichever module system is detected.
(function() {
'use strict';
// Shared Intl.Collator used for the optional locale-sensitive comparison.
// It is null when Intl / Intl.Collator are missing, and stays undefined when
// constructing it throws; both values make `get` use plain char-code comparison.
var collator;
try {
collator = (typeof Intl !== "undefined" && typeof Intl.Collator !== "undefined") ? Intl.Collator("generic", { sensitivity: "base" }) : null;
} catch (err){
console.log("Collator could not be initialized and wouldn't be used");
}
// Scratch arrays shared by every call to `get`. They are never cleared; each call
// overwrites the entries it is going to read (indices 0..str2Len) before using them.
var prevRow = [],
str2Char = [];
/**
* Based on the algorithm at http://en.wikipedia.org/wiki/Levenshtein_distance.
*/
var Levenshtein = {
/**
* Calculate the Levenshtein distance between two strings.
*
* @param str1 String the first string.
* @param str2 String the second string.
* @param [options] Additional options.
* @param [options.useCollator] Use `Intl.Collator` for locale-sensitive string comparison.
*        Only takes effect when the shared collator was created successfully.
* @return Integer the Levenshtein distance (0 and above).
*/
get: function(str1, str2, options) {
var useCollator = (options && collator && options.useCollator);
var str1Len = str1.length,
str2Len = str2.length;
// base cases: when one string is empty the distance is the other string's length
if (str1Len === 0) return str2Len;
if (str2Len === 0) return str1Len;
// Only one row array (prevRow) is kept. curCol and nextCol carry the current row's
// values for the column just finished and the column being computed.
var curCol, nextCol, i, j, tmp;
// initialise previous row to 0..str2Len and cache the char codes of str2
for (i=0; i<str2Len; ++i) {
prevRow[i] = i;
str2Char[i] = str2.charCodeAt(i);
}
prevRow[str2Len] = str2Len;
var strCmp;
if (useCollator) {
// calculate current row distance from previous row using collator
for (i = 0; i < str1Len; ++i) {
// first column of the current row
nextCol = i + 1;
for (j = 0; j < str2Len; ++j) {
curCol = nextCol;
// substitution: diagonal value plus 0 when the collator reports the characters equal, else 1
strCmp = 0 === collator.compare(str1.charAt(i), String.fromCharCode(str2Char[j]));
nextCol = prevRow[j] + (strCmp ? 0 : 1);
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration);
// prevRow[j] is not read again for this row, so it can be overwritten now
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
}
else {
// calculate current row distance from previous row without collator
// (same recurrence as above, comparing UTF-16 char codes directly)
for (i = 0; i < str1Len; ++i) {
nextCol = i + 1;
for (j = 0; j < str2Len; ++j) {
curCol = nextCol;
// substitution
strCmp = str1.charCodeAt(i) === str2Char[j];
nextCol = prevRow[j] + (strCmp ? 0 : 1);
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}
// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}
}
// last column of the last row
return nextCol;
}
};
// Publish through the first environment that matches, checked in this order.
// amd
if (typeof define !== "undefined" && define !== null && define.amd) {
define(function() {
return Levenshtein;
});
}
// commonjs (only when `module.exports` is still the `exports` object that was passed in)
else if (typeof module !== "undefined" && module !== null && typeof exports !== "undefined" && module.exports === exports) {
module.exports = Levenshtein;
}
// web worker
else if (typeof self !== "undefined" && typeof self.postMessage === 'function' && typeof self.importScripts === 'function') {
self.Levenshtein = Levenshtein;
}
// browser main thread
else if (typeof window !== "undefined" && window !== null) {
window.Levenshtein = Levenshtein;
}
}());
},{}]},{},[])
//# sourceMappingURL=data:application/json;charset=utf-8;base64,eyJ2ZXJzaW9uIjozLCJzb3VyY2VzIjpbIi4uLy4uLy4uLy4uL2hvbWUvYWRtaW4vYnJvd3NlcmlmeS1jZG4vbm9kZV9tb2R1bGVzL2Jyb3dzZXJpZnkvbm9kZV9tb2R1bGVzL2Jyb3dzZXItcGFjay9fcHJlbHVkZS5qcyIsImZhc3QtbGV2ZW5zaHRlaW4iXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQUE7QUNBQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBIiwiZmlsZSI6ImdlbmVyYXRlZC5qcyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzQ29udGVudCI6WyIoZnVuY3Rpb24gZSh0LG4scil7ZnVuY3Rpb24gcyhvLHUpe2lmKCFuW29dKXtpZighdFtvXSl7dmFyIGE9dHlwZW9mIHJlcXVpcmU9PVwiZnVuY3Rpb25cIiYmcmVxdWlyZTtpZighdSYmYSlyZXR1cm4gYShvLCEwKTtpZihpKXJldHVybiBpKG8sITApO3ZhciBmPW5ldyBFcnJvcihcIkNhbm5vdCBmaW5kIG1vZHVsZSAnXCIrbytcIidcIik7dGhyb3cgZi5jb2RlPVwiTU9EVUxFX05PVF9GT1VORFwiLGZ9dmFyIGw9bltvXT17ZXhwb3J0czp7fX07dFtvXVswXS5jYWxsKGwuZXhwb3J0cyxmdW5jdGlvbihlKXt2YXIgbj10W29dWzFdW2VdO3JldHVybiBzKG4/bjplKX0sbCxsLmV4cG9ydHMsZSx0LG4scil9cmV0dXJuIG5bb10uZXhwb3J0c312YXIgaT10eXBlb2YgcmVxdWlyZT09XCJmdW5jdGlvblwiJiZyZXF1aXJlO2Zvcih2YXIgbz0wO288ci5sZW5ndGg7bysrKXMocltvXSk7cmV0dXJuIHN9KSIsIihmdW5jdGlvbigpIHtcbiAgJ3VzZSBzdHJpY3QnO1xuICBcbiAgdmFyIGNvbGxhdG9yO1xuICB0cnkge1xuICAgIGNvbGxhdG9yID0gKHR5
cGVvZiBJbnRsICE9PSBcInVuZGVmaW5lZFwiICYmIHR5cGVvZiBJbnRsLkNvbGxhdG9yICE9PSBcInVuZGVmaW5lZFwiKSA/IEludGwuQ29sbGF0b3IoXCJnZW5lcmljXCIsIHsgc2Vuc2l0aXZpdHk6IFwiYmFzZVwiIH0pIDogbnVsbDtcbiAgfSBjYXRjaCAoZXJyKXtcbiAgICBjb25zb2xlLmxvZyhcIkNvbGxhdG9yIGNvdWxkIG5vdCBiZSBpbml0aWFsaXplZCBhbmQgd291bGRuJ3QgYmUgdXNlZFwiKTtcbiAgfVxuICAvLyBhcnJheXMgdG8gcmUtdXNlXG4gIHZhciBwcmV2Um93ID0gW10sXG4gICAgc3RyMkNoYXIgPSBbXTtcbiAgXG4gIC8qKlxuICAgKiBCYXNlZCBvbiB0aGUgYWxnb3JpdGhtIGF0IGh0dHA6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvTGV2ZW5zaHRlaW5fZGlzdGFuY2UuXG4gICAqL1xuICB2YXIgTGV2ZW5zaHRlaW4gPSB7XG4gICAgLyoqXG4gICAgICogQ2FsY3VsYXRlIGxldmVuc2h0ZWluIGRpc3RhbmNlIG9mIHRoZSB0d28gc3RyaW5ncy5cbiAgICAgKlxuICAgICAqIEBwYXJhbSBzdHIxIFN0cmluZyB0aGUgZmlyc3Qgc3RyaW5nLlxuICAgICAqIEBwYXJhbSBzdHIyIFN0cmluZyB0aGUgc2Vjb25kIHN0cmluZy5cbiAgICAgKiBAcGFyYW0gW29wdGlvbnNdIEFkZGl0aW9uYWwgb3B0aW9ucy5cbiAgICAgKiBAcGFyYW0gW29wdGlvbnMudXNlQ29sbGF0b3JdIFVzZSBgSW50bC5Db2xsYXRvcmAgZm9yIGxvY2FsZS1zZW5zaXRpdmUgc3RyaW5nIGNvbXBhcmlzb24uXG4gICAgICogQHJldHVybiBJbnRlZ2VyIHRoZSBsZXZlbnNodGVpbiBkaXN0YW5jZSAoMCBhbmQgYWJvdmUpLlxuICAgICAqL1xuICAgIGdldDogZnVuY3Rpb24oc3RyMSwgc3RyMiwgb3B0aW9ucykge1xuICAgICAgdmFyIHVzZUNvbGxhdG9yID0gKG9wdGlvbnMgJiYgY29sbGF0b3IgJiYgb3B0aW9ucy51c2VDb2xsYXRvcik7XG4gICAgICBcbiAgICAgIHZhciBzdHIxTGVuID0gc3RyMS5sZW5ndGgsXG4gICAgICAgIHN0cjJMZW4gPSBzdHIyLmxlbmd0aDtcbiAgICAgIFxuICAgICAgLy8gYmFzZSBjYXNlc1xuICAgICAgaWYgKHN0cjFMZW4gPT09IDApIHJldHVybiBzdHIyTGVuO1xuICAgICAgaWYgKHN0cjJMZW4gPT09IDApIHJldHVybiBzdHIxTGVuO1xuXG4gICAgICAvLyB0d28gcm93c1xuICAgICAgdmFyIGN1ckNvbCwgbmV4dENvbCwgaSwgaiwgdG1wO1xuXG4gICAgICAvLyBpbml0aWFsaXNlIHByZXZpb3VzIHJvd1xuICAgICAgZm9yIChpPTA7IGk8c3RyMkxlbjsgKytpKSB7XG4gICAgICAgIHByZXZSb3dbaV0gPSBpO1xuICAgICAgICBzdHIyQ2hhcltpXSA9IHN0cjIuY2hhckNvZGVBdChpKTtcbiAgICAgIH1cbiAgICAgIHByZXZSb3dbc3RyMkxlbl0gPSBzdHIyTGVuO1xuXG4gICAgICB2YXIgc3RyQ21wO1xuICAgICAgaWYgKHVzZUNvbGxhdG9yKSB7XG4gICAgICAgIC8vIGNhbGN1bGF0ZSBjdXJyZW50IHJvdyBkaXN0YW5jZSBmcm9tIHByZXZpb3VzIHJvdyB1c2luZyBjb2xsYXRvclxuICAgICAgICBmb3IgKGkgPSAwOyBpIDwgc3Ry
MUxlbjsgKytpKSB7XG4gICAgICAgICAgbmV4dENvbCA9IGkgKyAxO1xuXG4gICAgICAgICAgZm9yIChqID0gMDsgaiA8IHN0cjJMZW47ICsraikge1xuICAgICAgICAgICAgY3VyQ29sID0gbmV4dENvbDtcblxuICAgICAgICAgICAgLy8gc3Vic3R1dGlvblxuICAgICAgICAgICAgc3RyQ21wID0gMCA9PT0gY29sbGF0b3IuY29tcGFyZShzdHIxLmNoYXJBdChpKSwgU3RyaW5nLmZyb21DaGFyQ29kZShzdHIyQ2hhcltqXSkpO1xuXG4gICAgICAgICAgICBuZXh0Q29sID0gcHJldlJvd1tqXSArIChzdHJDbXAgPyAwIDogMSk7XG5cbiAgICAgICAgICAgIC8vIGluc2VydGlvblxuICAgICAgICAgICAgdG1wID0gY3VyQ29sICsgMTtcbiAgICAgICAgICAgIGlmIChuZXh0Q29sID4gdG1wKSB7XG4gICAgICAgICAgICAgIG5leHRDb2wgPSB0bXA7XG4gICAgICAgICAgICB9XG4gICAgICAgICAgICAvLyBkZWxldGlvblxuICAgICAgICAgICAgdG1wID0gcHJldlJvd1tqICsgMV0gKyAxO1xuICAgICAgICAgICAgaWYgKG5leHRDb2wgPiB0bXApIHtcbiAgICAgICAgICAgICAgbmV4dENvbCA9IHRtcDtcbiAgICAgICAgICAgIH1cblxuICAgICAgICAgICAgLy8gY29weSBjdXJyZW50IGNvbCB2YWx1ZSBpbnRvIHByZXZpb3VzIChpbiBwcmVwYXJhdGlvbiBmb3IgbmV4dCBpdGVyYXRpb24pXG4gICAgICAgICAgICBwcmV2Um93W2pdID0gY3VyQ29sO1xuICAgICAgICAgIH1cblxuICAgICAgICAgIC8vIGNvcHkgbGFzdCBjb2wgdmFsdWUgaW50byBwcmV2aW91cyAoaW4gcHJlcGFyYXRpb24gZm9yIG5leHQgaXRlcmF0aW9uKVxuICAgICAgICAgIHByZXZSb3dbal0gPSBuZXh0Q29sO1xuICAgICAgICB9XG4gICAgICB9XG4gICAgICBlbHNlIHtcbiAgICAgICAgLy8gY2FsY3VsYXRlIGN1cnJlbnQgcm93IGRpc3RhbmNlIGZyb20gcHJldmlvdXMgcm93IHdpdGhvdXQgY29sbGF0b3JcbiAgICAgICAgZm9yIChpID0gMDsgaSA8IHN0cjFMZW47ICsraSkge1xuICAgICAgICAgIG5leHRDb2wgPSBpICsgMTtcblxuICAgICAgICAgIGZvciAoaiA9IDA7IGogPCBzdHIyTGVuOyArK2opIHtcbiAgICAgICAgICAgIGN1ckNvbCA9IG5leHRDb2w7XG5cbiAgICAgICAgICAgIC8vIHN1YnN0dXRpb25cbiAgICAgICAgICAgIHN0ckNtcCA9IHN0cjEuY2hhckNvZGVBdChpKSA9PT0gc3RyMkNoYXJbal07XG5cbiAgICAgICAgICAgIG5leHRDb2wgPSBwcmV2Um93W2pdICsgKHN0ckNtcCA/IDAgOiAxKTtcblxuICAgICAgICAgICAgLy8gaW5zZXJ0aW9uXG4gICAgICAgICAgICB0bXAgPSBjdXJDb2wgKyAxO1xuICAgICAgICAgICAgaWYgKG5leHRDb2wgPiB0bXApIHtcbiAgICAgICAgICAgICAgbmV4dENvbCA9IHRtcDtcbiAgICAgICAgICAgIH1cbiAgICAgICAgICAgIC8vIGRlbGV0aW9uXG4gICAgICAgICAgICB0bXAgPSBwcmV2Um93W2ogKyAxXSArIDE7XG4gICAgICAgICAgICBpZiAobmV4dENvbCA+IHRtcCkge1xuICAgICAgICAgICAgICBuZXh0Q29sID0gdG1w
O1xuICAgICAgICAgICAgfVxuXG4gICAgICAgICAgICAvLyBjb3B5IGN1cnJlbnQgY29sIHZhbHVlIGludG8gcHJldmlvdXMgKGluIHByZXBhcmF0aW9uIGZvciBuZXh0IGl0ZXJhdGlvbilcbiAgICAgICAgICAgIHByZXZSb3dbal0gPSBjdXJDb2w7XG4gICAgICAgICAgfVxuXG4gICAgICAgICAgLy8gY29weSBsYXN0IGNvbCB2YWx1ZSBpbnRvIHByZXZpb3VzIChpbiBwcmVwYXJhdGlvbiBmb3IgbmV4dCBpdGVyYXRpb24pXG4gICAgICAgICAgcHJldlJvd1tqXSA9IG5leHRDb2w7XG4gICAgICAgIH1cbiAgICAgIH1cbiAgICAgIHJldHVybiBuZXh0Q29sO1xuICAgIH1cblxuICB9O1xuXG4gIC8vIGFtZFxuICBpZiAodHlwZW9mIGRlZmluZSAhPT0gXCJ1bmRlZmluZWRcIiAmJiBkZWZpbmUgIT09IG51bGwgJiYgZGVmaW5lLmFtZCkge1xuICAgIGRlZmluZShmdW5jdGlvbigpIHtcbiAgICAgIHJldHVybiBMZXZlbnNodGVpbjtcbiAgICB9KTtcbiAgfVxuICAvLyBjb21tb25qc1xuICBlbHNlIGlmICh0eXBlb2YgbW9kdWxlICE9PSBcInVuZGVmaW5lZFwiICYmIG1vZHVsZSAhPT0gbnVsbCAmJiB0eXBlb2YgZXhwb3J0cyAhPT0gXCJ1bmRlZmluZWRcIiAmJiBtb2R1bGUuZXhwb3J0cyA9PT0gZXhwb3J0cykge1xuICAgIG1vZHVsZS5leHBvcnRzID0gTGV2ZW5zaHRlaW47XG4gIH1cbiAgLy8gd2ViIHdvcmtlclxuICBlbHNlIGlmICh0eXBlb2Ygc2VsZiAhPT0gXCJ1bmRlZmluZWRcIiAmJiB0eXBlb2Ygc2VsZi5wb3N0TWVzc2FnZSA9PT0gJ2Z1bmN0aW9uJyAmJiB0eXBlb2Ygc2VsZi5pbXBvcnRTY3JpcHRzID09PSAnZnVuY3Rpb24nKSB7XG4gICAgc2VsZi5MZXZlbnNodGVpbiA9IExldmVuc2h0ZWluO1xuICB9XG4gIC8vIGJyb3dzZXIgbWFpbiB0aHJlYWRcbiAgZWxzZSBpZiAodHlwZW9mIHdpbmRvdyAhPT0gXCJ1bmRlZmluZWRcIiAmJiB3aW5kb3cgIT09IG51bGwpIHtcbiAgICB3aW5kb3cuTGV2ZW5zaHRlaW4gPSBMZXZlbnNodGVpbjtcbiAgfVxufSgpKTtcblxuIl19
"use strict";
{
// Levenshtein edit distance from the fast-levenshtein package, under a local alias.
const edit_distance = require('fast-levenshtein').get;

// Sentence-delimiting marks, keyed by language code. Multi-character entries are
// split into their individual characters when SEN_BREAK is built below.
const SEN_MARK = {
  en: [ ".", "!", "?", "()", "[]" ],
  zh: [ "。", "「」", "!", "?", "()", "[]", "【】", "《》", "〈〉"],
  es: [ ".", "'", '"', ";", "¡!", "¿?", "“”", "‘’" ],
  hi: [ "|", ";", "?", "!", "”" ],
  ar: [ ".", "؟", ":", "“”" ]
};

// Whitespace characters and ranges, written as fragments of a regex character class.
const SPACE_MARK = [
  "\u0009-\u000d",
  "\u0020",
  "\u0085",
  "\u00a0",
  "\u2000-\u200b",
  "\u2028",
  "\u2029",
  "\u202f",
  "\u205f",
  "\u3000"
];

// One character anywhere in U+2E80..U+D7AF.
const ZH_PATTERNS = [
  "[\u2e80-\ud7af]"
];

// Two consecutive characters in U+0021..U+0D7F.
const NONZH_PATTERNS = [
  "[\u0021-\u0d7f][\u0021-\u0d7f]"
];

// Weights of the three components summed by similarity_score:
// I = relative position, T = text edit distance, W = shared emes.
const WEIGHT = {
  I: 0.333,
  T: 0.333,
  W: 0.333
};

const ZH_COMMON = new RegExp( ZH_PATTERNS.join(''), "g" );
const NONZH_COMMON = new RegExp( NONZH_PATTERNS.join(''), "g" );

// One or more whitespace characters in a row.
const WORD_BREAK = new RegExp( "[" + SPACE_MARK.join('') + "]+", "g" );

// One or more sentence marks in a row: every character of every mark, for every
// language, backslash-escaped and collected into a single character class.
const SEN_BREAK = new RegExp(
  "[" +
  Array.from( Object.values(SEN_MARK).map( marks => marks.join('') ).join('') )
    .map( char => "\\" + char )
    .join('') +
  "]+",
  "g"
);
console.log(SEN_BREAK);

// Public surface of the sketch.
const lib = { get_sentences, similarity_score, find_closest };

// Export through `module.exports` when that assignment works; when it throws,
// attach `lib` to `self` instead.
try {
  module.exports = lib;
}
catch (e) {
  Object.assign( self, { lib } );
}
/**
 * Cut a text into Sentence objects at every run of sentence marks.
 *
 * @param text String the text to split.
 * @return Array one Sentence per fragment, each carrying its position and the
 *         total number of fragments.
 */
function get_sentences(text) {
  const fragments = text.split( SEN_BREAK );
  const total = fragments.length;
  return fragments.map( (fragment, position) => new Sentence(fragment, position, total) );
}
/**
 * Find the sentence of `text` that scores highest against `sen`.
 *
 * The query is wrapped as a Sentence at index 0 with the same count as the
 * text's sentences. Intermediate results are written to the console.
 *
 * @param sen String the sentence to look for.
 * @param text String the text to search in.
 * @return Sentence the best-scoring sentence of `text`.
 */
function find_closest( sen, text ) {
  const sens = get_sentences( text );
  const query = new Sentence( sen, 0, sens.length );
  const scores = [];
  for ( const tsen of sens ) {
    scores.push({ score: similarity_score( query, tsen ), tsen });
  }
  console.log( sens, scores );
  // Highest score first.
  scores.sort( (a,b) => b.score - a.score );
  console.log( "top matches", scores.slice(0,5) );
  return scores[0].tsen;
}
/**
 * Score how alike two sentences are as a weighted sum of three components:
 *
 *  - position: 1 minus the gap between the sentences' relative positions
 *    (index / count);
 *  - text: 1 minus the edit distance divided by the combined text length;
 *  - emes: size of the intersection of the two eme sets over the size of their union.
 *
 * A component whose denominator is zero (two empty texts, or two empty eme sets)
 * is scored as a perfect match instead of evaluating 0/0. A NaN component would
 * make the whole score NaN and break ordering by score.
 *
 * @param sen1 Object with `text`, `emes` (Set), `index` and `count`.
 * @param sen2 Object with the same fields.
 * @return Number the weighted sum of the three components.
 */
function similarity_score(sen1,sen2) {
  const idiff = 1.0 - Math.abs(sen1.index/sen1.count - sen2.index/sen2.count);

  const combined_length = sen1.text.length + sen2.text.length;
  const tdiff = combined_length === 0
    ? 1.0
    : 1.0 - edit_distance(sen1.text,sen2.text)/combined_length;

  const union = union_size(sen1.emes,sen2.emes);
  const wdiff = union === 0
    ? 1.0
    : intersection_size(sen1.emes,sen2.emes)/union;

  return WEIGHT.I*idiff + WEIGHT.T*tdiff + WEIGHT.W*wdiff;
}
/**
 * A fragment of text together with its set of emes and its position among
 * the fragments of the text it came from.
 */
class Sentence {
  /**
   * @param s String the fragment's text.
   * @param index Integer position of the fragment.
   * @param count Integer total number of fragments.
   */
  constructor(s,index,count) {
    const tokens = get_emes(s);
    this.text = s;
    // A Set, so repeated emes count once.
    this.emes = new Set(tokens);
    this.index = index;
    this.count = count;
  }
}
/**
 * Number of members two sets have in common, by inclusion-exclusion:
 * |s| + |t| - |s ∪ t|.
 *
 * @param s Set the first set.
 * @param t Set the second set.
 * @return Integer size of the intersection.
 */
function intersection_size(s,t) {
  const combined = s.size + t.size;
  const distinct = union_size(s,t);
  return combined - distinct;
}
/**
 * Number of distinct members across two collections.
 *
 * @param s Iterable the first collection.
 * @param t Iterable the second collection.
 * @return Integer size of the union.
 */
function union_size(s,t) {
  const merged = new Set();
  for ( const member of s ) merged.add(member);
  for ( const member of t ) merged.add(member);
  return merged.size;
}
/**
 * Break a sentence into the units ("emes") that sentences are compared by:
 * single characters when lang() reports 'zh', whitespace-separated words otherwise.
 *
 * @param s String the sentence text.
 * @return Array the emes, in order of appearance.
 */
function get_emes(s) {
  if ( lang(s) === 'zh' ) {
    // Array.from walks the string by code point, so a character outside the BMP
    // stays in one piece; split('') would cut it into two lone surrogates.
    return Array.from(s);
  }
  return s.split(WORD_BREAK);
}
/**
 * Classify a string as 'zh' or 'nonzh'.
 *
 * A string is 'zh' when it matches ZH_COMMON and does not match NONZH_COMMON;
 * anything else is 'nonzh'.
 *
 * @param s String the text to classify.
 * @return String either 'zh' or 'nonzh'.
 */
function lang(s) {
  const has_zh = s.match( ZH_COMMON ) !== null;
  if ( !has_zh ) return 'nonzh';
  const has_nonzh = s.match( NONZH_COMMON ) !== null;
  return has_nonzh ? 'nonzh' : 'zh';
}
}
;}, 0)
{
"name": "requirebin-sketch",
"version": "1.0.0",
"dependencies": {
"fast-levenshtein": "2.0.6"
}
}
<!-- contents of this file will be placed inside the <body> -->
<form id=form>
<p><input name=sentence value="abc">
<p><textarea name=text>abc. def. sdfkhsdf. 1231d.</textarea>
<p><output name=result></output>
<p><button name=find>Find</button>
</form>
<script>
// Click handler for the Find button: look up the sentence of the textarea that is
// closest to the input's value and show its text in the output element.
form.find.onclick = function (event) {
  // Keep the button from submitting the form.
  event.preventDefault();
  const closest = lib.find_closest(form.sentence.value, form.text.value);
  form.result.value = closest.text;
};
</script>
<!-- contents of this file will be placed inside the <head> -->
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment