made with requirebin
Last active
November 1, 2017 07:55
-
-
Save o0101/67c34151467bf34f111e571e62432ad8 to your computer and use it in GitHub Desktop.
requirebin sketch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"use strict"; | |
{ | |
const {get:edit_distance} = require('fast-levenshtein'); | |
/**
 * Sentence-delimiting punctuation grouped by language code.
 * Multi-character entries (e.g. "()") are pairs; every single character of
 * every entry ends up as a delimiter in SEN_BREAK below.
 */
const SEN_MARK = {
  en: [ ".", "!", "?", "()", "[]" ],
  zh: [ "。", "「」", "!", "?", "()", "[]", "【】", "《》", "〈〉"],
  es: [ ".", "'", '"', ";", "¡!", "¿?", "“”", "‘’" ],
  hi: [ "|", ";", "?", "!", "”" ],
  ar: [ ".", "؟", ":", "“”" ]
};

// Whitespace characters and ranges, written so they can be dropped straight
// into a regular-expression character class (see WORD_BREAK).
const SPACE_MARK = [
  "\u0009-\u000d",
  "\u0020",
  "\u0085",
  "\u00a0",
  "\u2000-\u200b",
  "\u2028",
  "\u2029",
  "\u202f",
  "\u205f",
  "\u3000"
];

// A single character in the range U+2E80..U+D7AF.
const ZH_PATTERNS = [
  "[\u2e80-\ud7af]"
];

// Two adjacent characters in the range U+0021..U+0D7F.
const NONZH_PATTERNS = [
  "[\u0021-\u0d7f][\u0021-\u0d7f]"
];

const ZH_COMMON = new RegExp( ZH_PATTERNS.join(''), "g" );
const NONZH_COMMON = new RegExp( NONZH_PATTERNS.join(''), "g" );

// One or more consecutive whitespace characters from SPACE_MARK.
const WORD_BREAK = new RegExp(`[${SPACE_MARK.join('')}]+`,"g");

// Relative weights of the three components of the similarity score:
// I = position (index), T = text edit distance, W = shared tokens.
const WEIGHT = {
  I: 0.333,
  T: 0.333,
  W: 0.333
};

// Every distinct delimiter character across all languages, in first-seen order.
const SEN_CHARS = new Set();
for ( const marks of Object.values(SEN_MARK) ) {
  for ( const mark of marks ) {
    // Iterating a string yields whole code points, so pairs such as "()"
    // contribute each of their characters separately.
    for ( const ch of mark ) SEN_CHARS.add( ch );
  }
}

// Escape only the characters that are significant inside a character class.
// Prefixing every character with a backslash would silently turn a letter or
// digit delimiter into a class escape such as \d or \w.
const escape_class_char = ch => ch.replace( /[\\\]\[^-]/, "\\$&" );

// One or more consecutive sentence-delimiter characters.
const SEN_BREAK = new RegExp(`[${
  [...SEN_CHARS].map( escape_class_char ).join('')
}]+`,"g");
// Public API of this sketch.
const lib = { get_sentences, similarity_score, find_closest };

// Export through CommonJS when a `module` object exists; otherwise publish the
// API as `lib` on the global `self` (browser / worker). Feature-detecting with
// `typeof` avoids using a swallowed exception as control flow.
if ( typeof module === "object" && module !== null ) {
  module.exports = lib;
} else {
  Object.assign( self, {lib} );
}
/**
 * Split text into Sentence objects at runs of sentence-delimiter characters.
 *
 * Fragments that are empty or whitespace-only (for example the empty string
 * produced after a trailing ".") are dropped so they are not scored as
 * candidate sentences and do not distort each sentence's index/count.
 * The result is never empty: text without any content yields one empty Sentence.
 *
 * @param {string} text - Text to split.
 * @returns {Sentence[]} Sentences in order; each records its index and the total count.
 */
function get_sentences(text) {
  const parts = text.split( SEN_BREAK ).filter( s => s.trim() !== '' );
  // Keep at least one entry so callers can always read the first element.
  if ( parts.length === 0 ) parts.push( '' );
  return parts.map( (s,i,S) => new Sentence(s,i,S.length) );
}
/**
 * Find the sentence of `text` that is most similar to `sen`.
 *
 * The query is wrapped in a Sentence with index 0 and the same count as the
 * text, then scored against every sentence with similarity_score. On equal
 * scores the earliest sentence wins.
 *
 * @param {string} sen - Query sentence.
 * @param {string} text - Text to search.
 * @returns {?Sentence} The best-scoring sentence, or null when there are no sentences.
 */
function find_closest( sen, text ) {
  const sens = get_sentences( text );
  // Use a separate binding instead of overwriting the string parameter.
  const query = new Sentence( sen, 0, sens.length );
  let best = null;
  // A single pass is enough to find the maximum; no need to sort every score.
  for ( const tsen of sens ) {
    const score = similarity_score( query, tsen );
    // A NaN score never wins against a real number, and a real number always
    // replaces a NaN that happened to come first.
    if ( best === null || Number.isNaN( best.score ) || score > best.score ) {
      best = { score, tsen };
    }
  }
  return best === null ? null : best.tsen;
}
/**
 * Weighted similarity of two sentences, combining three components:
 *   - idiff: closeness of their relative positions (index / count),
 *   - tdiff: 1 minus the edit distance divided by the sum of both text lengths,
 *   - wdiff: shared tokens divided by all distinct tokens of both sentences.
 * Each component is multiplied by its entry in WEIGHT and the results are summed.
 *
 * @param {Sentence} sen1
 * @param {Sentence} sen2
 * @returns {number} The weighted score; larger means more similar.
 */
function similarity_score(sen1,sen2) {
  // A count of zero would give 0/0; treat such a sentence as being at position 0.
  const position = sen => sen.count > 0 ? sen.index/sen.count : 0;
  const idiff = 1.0 - Math.abs( position(sen1) - position(sen2) );

  // Two empty texts are identical; without this guard the ratio is 0/0 = NaN.
  const total_length = sen1.text.length + sen2.text.length;
  const tdiff = total_length === 0
    ? 1.0
    : 1.0 - edit_distance(sen1.text,sen2.text)/total_length;

  // Two empty token sets are likewise treated as identical.
  const all_emes = union_size(sen1.emes,sen2.emes);
  const wdiff = all_emes === 0
    ? 1.0
    : intersection_size(sen1.emes,sen2.emes)/all_emes;

  return WEIGHT.I*idiff + WEIGHT.T*tdiff + WEIGHT.W*wdiff;
}
/**
 * A sentence together with its tokens and its position in the source text.
 */
class Sentence {
  /**
   * @param {string} s - Sentence text.
   * @param {number} index - Position of this sentence among its siblings.
   * @param {number} count - Total number of sentences in the same text.
   */
  constructor(s,index,count) {
    this.text = s;
    // Distinct tokens of the sentence, as produced by get_emes.
    this.emes = new Set(get_emes(s));
    this.index = index;
    this.count = count;
  }
}
/**
 * Number of elements that two sets have in common.
 *
 * @param {Set} s
 * @param {Set} t
 * @returns {number} Size of the intersection of s and t.
 */
function intersection_size(s,t) {
  // Walk the smaller set and probe the larger one; this counts shared members
  // directly instead of materialising the whole union just to subtract it.
  const [small, large] = s.size <= t.size ? [s, t] : [t, s];
  let shared = 0;
  for ( const item of small ) {
    if ( large.has( item ) ) shared += 1;
  }
  return shared;
}
/**
 * Number of distinct elements found in either collection.
 *
 * @param {Iterable} s
 * @param {Iterable} t
 * @returns {number} Size of the union of s and t.
 */
function union_size(s,t) {
  // Copy s, then add t's members; the Set discards duplicates.
  const merged = new Set( s );
  for ( const item of t ) merged.add( item );
  return merged.size;
}
/**
 * Break a sentence into the tokens used for the shared-token comparison.
 *
 * Text that lang() classifies as 'zh' is split into individual characters;
 * any other text is split into words at runs of whitespace.
 *
 * @param {string} s - Sentence text.
 * @returns {string[]} Tokens; never an empty array.
 */
function get_emes(s) {
  // Array.from iterates by code point, so a character stored as a surrogate
  // pair stays in one piece (split('') would cut it into two halves).
  if ( lang(s) === 'zh' ) return Array.from( s );

  // Leading or trailing whitespace makes split() emit "" tokens; dropping them
  // stops unrelated sentences from appearing to share a word.
  const words = s.split( WORD_BREAK ).filter( word => word !== '' );

  // Fall back to a single empty token so the token set is never empty and a
  // caller dividing by the union size of two token sets never divides by zero.
  return words.length > 0 ? words : [''];
}
/**
 * Classify a string as 'zh' or 'nonzh'.
 *
 * A string is 'zh' when it contains at least one ZH_COMMON match and no
 * NONZH_COMMON match; everything else is 'nonzh'.
 *
 * @param {string} s - Text to classify.
 * @returns {string} Either 'zh' or 'nonzh'.
 */
function lang(s) {
  // search() stops at the first hit and ignores the regex's global flag and
  // lastIndex, so no array of every match is built just to test for presence.
  const has_zh = s.search( ZH_COMMON ) !== -1;
  const has_nonzh = s.search( NONZH_COMMON ) !== -1;
  return has_zh && !has_nonzh ? 'zh' : 'nonzh';
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
setTimeout(function(){ | |
// Generated bundle: a minified module loader followed by the vendored
// 'fast-levenshtein' module. The loader function takes a module table `t`, a
// cache `n` and a list of entry ids `r`. Its inner resolver looks an id up in
// the table, runs the module once with its own require/module/exports and
// caches the exports in `n`; ids missing from the table are passed to an
// already-existing `require` function when there is one, otherwise an Error
// with code MODULE_NOT_FOUND is thrown. The resolver is returned and assigned
// to `require`.
;require=(function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s})({"fast-levenshtein":[function(require,module,exports){ | |
(function() { | |
'use strict'; | |
// Collator used for the optional locale-sensitive comparison. It is null when
// Intl.Collator is unavailable and stays undefined when construction throws;
// both values disable the collator path in get().
var collator; | |
try { | |
collator = (typeof Intl !== "undefined" && typeof Intl.Collator !== "undefined") ? Intl.Collator("generic", { sensitivity: "base" }) : null; | |
} catch (err){ | |
console.log("Collator could not be initialized and wouldn't be used"); | |
} | |
// arrays to re-use | |
// (shared by every call to get(); each call overwrites the entries it needs)
var prevRow = [], | |
str2Char = []; | |
/** | |
* Based on the algorithm at http://en.wikipedia.org/wiki/Levenshtein_distance. | |
*/ | |
var Levenshtein = { | |
/** | |
* Calculate levenshtein distance of the two strings. | |
* | |
* @param str1 String the first string. | |
* @param str2 String the second string. | |
* @param [options] Additional options. | |
* @param [options.useCollator] Use `Intl.Collator` for locale-sensitive string comparison. | |
* @return Integer the levenshtein distance (0 and above). | |
*/ | |
get: function(str1, str2, options) { | |
// The collator path is taken only when the caller asks for it AND a collator exists.
var useCollator = (options && collator && options.useCollator); | |
var str1Len = str1.length, | |
str2Len = str2.length; | |
// base cases | |
// (distance to or from an empty string is the other string's length)
if (str1Len === 0) return str2Len; | |
if (str2Len === 0) return str1Len; | |
// two rows | |
var curCol, nextCol, i, j, tmp; | |
// initialise previous row | |
// (prevRow[i] = i, and the char codes of str2 are cached in str2Char)
for (i=0; i<str2Len; ++i) { | |
prevRow[i] = i; | |
str2Char[i] = str2.charCodeAt(i); | |
} | |
prevRow[str2Len] = str2Len; | |
var strCmp; | |
// The two branches below are identical except for how one character of str1
// is compared with one character of str2.
if (useCollator) { | |
// calculate current row distance from previous row using collator | |
for (i = 0; i < str1Len; ++i) { | |
nextCol = i + 1; | |
for (j = 0; j < str2Len; ++j) { | |
curCol = nextCol; | |
// substitution | |
strCmp = 0 === collator.compare(str1.charAt(i), String.fromCharCode(str2Char[j])); | |
nextCol = prevRow[j] + (strCmp ? 0 : 1); | |
// insertion | |
tmp = curCol + 1; | |
if (nextCol > tmp) { | |
nextCol = tmp; | |
} | |
// deletion | |
tmp = prevRow[j + 1] + 1; | |
if (nextCol > tmp) { | |
nextCol = tmp; | |
} | |
// copy current col value into previous (in preparation for next iteration) | |
prevRow[j] = curCol; | |
} | |
// copy last col value into previous (in preparation for next iteration) | |
prevRow[j] = nextCol; | |
} | |
} | |
else { | |
// calculate current row distance from previous row without collator | |
for (i = 0; i < str1Len; ++i) { | |
nextCol = i + 1; | |
for (j = 0; j < str2Len; ++j) { | |
curCol = nextCol; | |
// substitution | |
strCmp = str1.charCodeAt(i) === str2Char[j]; | |
nextCol = prevRow[j] + (strCmp ? 0 : 1); | |
// insertion | |
tmp = curCol + 1; | |
if (nextCol > tmp) { | |
nextCol = tmp; | |
} | |
// deletion | |
tmp = prevRow[j + 1] + 1; | |
if (nextCol > tmp) { | |
nextCol = tmp; | |
} | |
// copy current col value into previous (in preparation for next iteration) | |
prevRow[j] = curCol; | |
} | |
// copy last col value into previous (in preparation for next iteration) | |
prevRow[j] = nextCol; | |
} | |
} | |
// nextCol now holds the last cell of the last row, i.e. the distance.
return nextCol; | |
} | |
}; | |
// Publish Levenshtein through the first mechanism that is available.
// amd | |
if (typeof define !== "undefined" && define !== null && define.amd) { | |
define(function() { | |
return Levenshtein; | |
}); | |
} | |
// commonjs | |
else if (typeof module !== "undefined" && module !== null && typeof exports !== "undefined" && module.exports === exports) { | |
module.exports = Levenshtein; | |
} | |
// web worker | |
else if (typeof self !== "undefined" && typeof self.postMessage === 'function' && typeof self.importScripts === 'function') { | |
self.Levenshtein = Levenshtein; | |
} | |
// browser main thread | |
else if (typeof window !== "undefined" && window !== null) { | |
window.Levenshtein = Levenshtein; | |
} | |
}()); | |
},{}]},{},[]) | |
//# sourceMappingURL=data:application/json;charset=utf-8;base64,eyJ2ZXJzaW9uIjozLCJzb3VyY2VzIjpbIi4uLy4uLy4uLy4uL2hvbWUvYWRtaW4vYnJvd3NlcmlmeS1jZG4vbm9kZV9tb2R1bGVzL2Jyb3dzZXJpZnkvbm9kZV9tb2R1bGVzL2Jyb3dzZXItcGFjay9fcHJlbHVkZS5qcyIsImZhc3QtbGV2ZW5zaHRlaW4iXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQUE7QUNBQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBO0FBQ0E7QUFDQTtBQUNBIiwiZmlsZSI6ImdlbmVyYXRlZC5qcyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzQ29udGVudCI6WyIoZnVuY3Rpb24gZSh0LG4scil7ZnVuY3Rpb24gcyhvLHUpe2lmKCFuW29dKXtpZighdFtvXSl7dmFyIGE9dHlwZW9mIHJlcXVpcmU9PVwiZnVuY3Rpb25cIiYmcmVxdWlyZTtpZighdSYmYSlyZXR1cm4gYShvLCEwKTtpZihpKXJldHVybiBpKG8sITApO3ZhciBmPW5ldyBFcnJvcihcIkNhbm5vdCBmaW5kIG1vZHVsZSAnXCIrbytcIidcIik7dGhyb3cgZi5jb2RlPVwiTU9EVUxFX05PVF9GT1VORFwiLGZ9dmFyIGw9bltvXT17ZXhwb3J0czp7fX07dFtvXVswXS5jYWxsKGwuZXhwb3J0cyxmdW5jdGlvbihlKXt2YXIgbj10W29dWzFdW2VdO3JldHVybiBzKG4/bjplKX0sbCxsLmV4cG9ydHMsZSx0LG4scil9cmV0dXJuIG5bb10uZXhwb3J0c312YXIgaT10eXBlb2YgcmVxdWlyZT09XCJmdW5jdGlvblwiJiZyZXF1aXJlO2Zvcih2YXIgbz0wO288ci5sZW5ndGg7bysrKXMocltvXSk7cmV0dXJuIHN9KSIsIihmdW5jdGlvbigpIHtcbiAgJ3VzZSBzdHJpY3QnO1xuICBcbiAgdmFyIGNvbGxhdG9yO1xuICB0cnkge1xuICAgIGNvbGxhdG9yID0gKHR5
cGVvZiBJbnRsICE9PSBcInVuZGVmaW5lZFwiICYmIHR5cGVvZiBJbnRsLkNvbGxhdG9yICE9PSBcInVuZGVmaW5lZFwiKSA/IEludGwuQ29sbGF0b3IoXCJnZW5lcmljXCIsIHsgc2Vuc2l0aXZpdHk6IFwiYmFzZVwiIH0pIDogbnVsbDtcbiAgfSBjYXRjaCAoZXJyKXtcbiAgICBjb25zb2xlLmxvZyhcIkNvbGxhdG9yIGNvdWxkIG5vdCBiZSBpbml0aWFsaXplZCBhbmQgd291bGRuJ3QgYmUgdXNlZFwiKTtcbiAgfVxuICAvLyBhcnJheXMgdG8gcmUtdXNlXG4gIHZhciBwcmV2Um93ID0gW10sXG4gICAgc3RyMkNoYXIgPSBbXTtcbiAgXG4gIC8qKlxuICAgKiBCYXNlZCBvbiB0aGUgYWxnb3JpdGhtIGF0IGh0dHA6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvTGV2ZW5zaHRlaW5fZGlzdGFuY2UuXG4gICAqL1xuICB2YXIgTGV2ZW5zaHRlaW4gPSB7XG4gICAgLyoqXG4gICAgICogQ2FsY3VsYXRlIGxldmVuc2h0ZWluIGRpc3RhbmNlIG9mIHRoZSB0d28gc3RyaW5ncy5cbiAgICAgKlxuICAgICAqIEBwYXJhbSBzdHIxIFN0cmluZyB0aGUgZmlyc3Qgc3RyaW5nLlxuICAgICAqIEBwYXJhbSBzdHIyIFN0cmluZyB0aGUgc2Vjb25kIHN0cmluZy5cbiAgICAgKiBAcGFyYW0gW29wdGlvbnNdIEFkZGl0aW9uYWwgb3B0aW9ucy5cbiAgICAgKiBAcGFyYW0gW29wdGlvbnMudXNlQ29sbGF0b3JdIFVzZSBgSW50bC5Db2xsYXRvcmAgZm9yIGxvY2FsZS1zZW5zaXRpdmUgc3RyaW5nIGNvbXBhcmlzb24uXG4gICAgICogQHJldHVybiBJbnRlZ2VyIHRoZSBsZXZlbnNodGVpbiBkaXN0YW5jZSAoMCBhbmQgYWJvdmUpLlxuICAgICAqL1xuICAgIGdldDogZnVuY3Rpb24oc3RyMSwgc3RyMiwgb3B0aW9ucykge1xuICAgICAgdmFyIHVzZUNvbGxhdG9yID0gKG9wdGlvbnMgJiYgY29sbGF0b3IgJiYgb3B0aW9ucy51c2VDb2xsYXRvcik7XG4gICAgICBcbiAgICAgIHZhciBzdHIxTGVuID0gc3RyMS5sZW5ndGgsXG4gICAgICAgIHN0cjJMZW4gPSBzdHIyLmxlbmd0aDtcbiAgICAgIFxuICAgICAgLy8gYmFzZSBjYXNlc1xuICAgICAgaWYgKHN0cjFMZW4gPT09IDApIHJldHVybiBzdHIyTGVuO1xuICAgICAgaWYgKHN0cjJMZW4gPT09IDApIHJldHVybiBzdHIxTGVuO1xuXG4gICAgICAvLyB0d28gcm93c1xuICAgICAgdmFyIGN1ckNvbCwgbmV4dENvbCwgaSwgaiwgdG1wO1xuXG4gICAgICAvLyBpbml0aWFsaXNlIHByZXZpb3VzIHJvd1xuICAgICAgZm9yIChpPTA7IGk8c3RyMkxlbjsgKytpKSB7XG4gICAgICAgIHByZXZSb3dbaV0gPSBpO1xuICAgICAgICBzdHIyQ2hhcltpXSA9IHN0cjIuY2hhckNvZGVBdChpKTtcbiAgICAgIH1cbiAgICAgIHByZXZSb3dbc3RyMkxlbl0gPSBzdHIyTGVuO1xuXG4gICAgICB2YXIgc3RyQ21wO1xuICAgICAgaWYgKHVzZUNvbGxhdG9yKSB7XG4gICAgICAgIC8vIGNhbGN1bGF0ZSBjdXJyZW50IHJvdyBkaXN0YW5jZSBmcm9tIHByZXZpb3VzIHJvdyB1c2luZyBjb2xsYXRvclxuICAgICAgICBmb3IgKGkgPSAwOyBpIDwgc3Ry
MUxlbjsgKytpKSB7XG4gICAgICAgICAgbmV4dENvbCA9IGkgKyAxO1xuXG4gICAgICAgICAgZm9yIChqID0gMDsgaiA8IHN0cjJMZW47ICsraikge1xuICAgICAgICAgICAgY3VyQ29sID0gbmV4dENvbDtcblxuICAgICAgICAgICAgLy8gc3Vic3R1dGlvblxuICAgICAgICAgICAgc3RyQ21wID0gMCA9PT0gY29sbGF0b3IuY29tcGFyZShzdHIxLmNoYXJBdChpKSwgU3RyaW5nLmZyb21DaGFyQ29kZShzdHIyQ2hhcltqXSkpO1xuXG4gICAgICAgICAgICBuZXh0Q29sID0gcHJldlJvd1tqXSArIChzdHJDbXAgPyAwIDogMSk7XG5cbiAgICAgICAgICAgIC8vIGluc2VydGlvblxuICAgICAgICAgICAgdG1wID0gY3VyQ29sICsgMTtcbiAgICAgICAgICAgIGlmIChuZXh0Q29sID4gdG1wKSB7XG4gICAgICAgICAgICAgIG5leHRDb2wgPSB0bXA7XG4gICAgICAgICAgICB9XG4gICAgICAgICAgICAvLyBkZWxldGlvblxuICAgICAgICAgICAgdG1wID0gcHJldlJvd1tqICsgMV0gKyAxO1xuICAgICAgICAgICAgaWYgKG5leHRDb2wgPiB0bXApIHtcbiAgICAgICAgICAgICAgbmV4dENvbCA9IHRtcDtcbiAgICAgICAgICAgIH1cblxuICAgICAgICAgICAgLy8gY29weSBjdXJyZW50IGNvbCB2YWx1ZSBpbnRvIHByZXZpb3VzIChpbiBwcmVwYXJhdGlvbiBmb3IgbmV4dCBpdGVyYXRpb24pXG4gICAgICAgICAgICBwcmV2Um93W2pdID0gY3VyQ29sO1xuICAgICAgICAgIH1cblxuICAgICAgICAgIC8vIGNvcHkgbGFzdCBjb2wgdmFsdWUgaW50byBwcmV2aW91cyAoaW4gcHJlcGFyYXRpb24gZm9yIG5leHQgaXRlcmF0aW9uKVxuICAgICAgICAgIHByZXZSb3dbal0gPSBuZXh0Q29sO1xuICAgICAgICB9XG4gICAgICB9XG4gICAgICBlbHNlIHtcbiAgICAgICAgLy8gY2FsY3VsYXRlIGN1cnJlbnQgcm93IGRpc3RhbmNlIGZyb20gcHJldmlvdXMgcm93IHdpdGhvdXQgY29sbGF0b3JcbiAgICAgICAgZm9yIChpID0gMDsgaSA8IHN0cjFMZW47ICsraSkge1xuICAgICAgICAgIG5leHRDb2wgPSBpICsgMTtcblxuICAgICAgICAgIGZvciAoaiA9IDA7IGogPCBzdHIyTGVuOyArK2opIHtcbiAgICAgICAgICAgIGN1ckNvbCA9IG5leHRDb2w7XG5cbiAgICAgICAgICAgIC8vIHN1YnN0dXRpb25cbiAgICAgICAgICAgIHN0ckNtcCA9IHN0cjEuY2hhckNvZGVBdChpKSA9PT0gc3RyMkNoYXJbal07XG5cbiAgICAgICAgICAgIG5leHRDb2wgPSBwcmV2Um93W2pdICsgKHN0ckNtcCA/IDAgOiAxKTtcblxuICAgICAgICAgICAgLy8gaW5zZXJ0aW9uXG4gICAgICAgICAgICB0bXAgPSBjdXJDb2wgKyAxO1xuICAgICAgICAgICAgaWYgKG5leHRDb2wgPiB0bXApIHtcbiAgICAgICAgICAgICAgbmV4dENvbCA9IHRtcDtcbiAgICAgICAgICAgIH1cbiAgICAgICAgICAgIC8vIGRlbGV0aW9uXG4gICAgICAgICAgICB0bXAgPSBwcmV2Um93W2ogKyAxXSArIDE7XG4gICAgICAgICAgICBpZiAobmV4dENvbCA+IHRtcCkge1xuICAgICAgICAgICAgICBuZXh0Q29sID0gdG1w
O1xuICAgICAgICAgICAgfVxuXG4gICAgICAgICAgICAvLyBjb3B5IGN1cnJlbnQgY29sIHZhbHVlIGludG8gcHJldmlvdXMgKGluIHByZXBhcmF0aW9uIGZvciBuZXh0IGl0ZXJhdGlvbilcbiAgICAgICAgICAgIHByZXZSb3dbal0gPSBjdXJDb2w7XG4gICAgICAgICAgfVxuXG4gICAgICAgICAgLy8gY29weSBsYXN0IGNvbCB2YWx1ZSBpbnRvIHByZXZpb3VzIChpbiBwcmVwYXJhdGlvbiBmb3IgbmV4dCBpdGVyYXRpb24pXG4gICAgICAgICAgcHJldlJvd1tqXSA9IG5leHRDb2w7XG4gICAgICAgIH1cbiAgICAgIH1cbiAgICAgIHJldHVybiBuZXh0Q29sO1xuICAgIH1cblxuICB9O1xuXG4gIC8vIGFtZFxuICBpZiAodHlwZW9mIGRlZmluZSAhPT0gXCJ1bmRlZmluZWRcIiAmJiBkZWZpbmUgIT09IG51bGwgJiYgZGVmaW5lLmFtZCkge1xuICAgIGRlZmluZShmdW5jdGlvbigpIHtcbiAgICAgIHJldHVybiBMZXZlbnNodGVpbjtcbiAgICB9KTtcbiAgfVxuICAvLyBjb21tb25qc1xuICBlbHNlIGlmICh0eXBlb2YgbW9kdWxlICE9PSBcInVuZGVmaW5lZFwiICYmIG1vZHVsZSAhPT0gbnVsbCAmJiB0eXBlb2YgZXhwb3J0cyAhPT0gXCJ1bmRlZmluZWRcIiAmJiBtb2R1bGUuZXhwb3J0cyA9PT0gZXhwb3J0cykge1xuICAgIG1vZHVsZS5leHBvcnRzID0gTGV2ZW5zaHRlaW47XG4gIH1cbiAgLy8gd2ViIHdvcmtlclxuICBlbHNlIGlmICh0eXBlb2Ygc2VsZiAhPT0gXCJ1bmRlZmluZWRcIiAmJiB0eXBlb2Ygc2VsZi5wb3N0TWVzc2FnZSA9PT0gJ2Z1bmN0aW9uJyAmJiB0eXBlb2Ygc2VsZi5pbXBvcnRTY3JpcHRzID09PSAnZnVuY3Rpb24nKSB7XG4gICAgc2VsZi5MZXZlbnNodGVpbiA9IExldmVuc2h0ZWluO1xuICB9XG4gIC8vIGJyb3dzZXIgbWFpbiB0aHJlYWRcbiAgZWxzZSBpZiAodHlwZW9mIHdpbmRvdyAhPT0gXCJ1bmRlZmluZWRcIiAmJiB3aW5kb3cgIT09IG51bGwpIHtcbiAgICB3aW5kb3cuTGV2ZW5zaHRlaW4gPSBMZXZlbnNodGVpbjtcbiAgfVxufSgpKTtcblxuIl19 | |
"use strict"; | |
{ | |
const {get:edit_distance} = require('fast-levenshtein'); | |
/**
 * Sentence-delimiting punctuation grouped by language code.
 * Multi-character entries (e.g. "()") are pairs; every single character of
 * every entry ends up as a delimiter in SEN_BREAK below.
 */
const SEN_MARK = {
  en: [ ".", "!", "?", "()", "[]" ],
  zh: [ "。", "「」", "!", "?", "()", "[]", "【】", "《》", "〈〉"],
  es: [ ".", "'", '"', ";", "¡!", "¿?", "“”", "‘’" ],
  hi: [ "|", ";", "?", "!", "”" ],
  ar: [ ".", "؟", ":", "“”" ]
};

// Whitespace characters and ranges, written so they can be dropped straight
// into a regular-expression character class (see WORD_BREAK).
const SPACE_MARK = [
  "\u0009-\u000d",
  "\u0020",
  "\u0085",
  "\u00a0",
  "\u2000-\u200b",
  "\u2028",
  "\u2029",
  "\u202f",
  "\u205f",
  "\u3000"
];

// A single character in the range U+2E80..U+D7AF.
const ZH_PATTERNS = [
  "[\u2e80-\ud7af]"
];

// Two adjacent characters in the range U+0021..U+0D7F.
const NONZH_PATTERNS = [
  "[\u0021-\u0d7f][\u0021-\u0d7f]"
];

const ZH_COMMON = new RegExp( ZH_PATTERNS.join(''), "g" );
const NONZH_COMMON = new RegExp( NONZH_PATTERNS.join(''), "g" );

// One or more consecutive whitespace characters from SPACE_MARK.
const WORD_BREAK = new RegExp(`[${SPACE_MARK.join('')}]+`,"g");

// Relative weights of the three components of the similarity score:
// I = position (index), T = text edit distance, W = shared tokens.
const WEIGHT = {
  I: 0.333,
  T: 0.333,
  W: 0.333
};

// Every distinct delimiter character across all languages, in first-seen order.
const SEN_CHARS = new Set();
for ( const marks of Object.values(SEN_MARK) ) {
  for ( const mark of marks ) {
    // Iterating a string yields whole code points, so pairs such as "()"
    // contribute each of their characters separately.
    for ( const ch of mark ) SEN_CHARS.add( ch );
  }
}

// Escape only the characters that are significant inside a character class.
// Prefixing every character with a backslash would silently turn a letter or
// digit delimiter into a class escape such as \d or \w.
const escape_class_char = ch => ch.replace( /[\\\]\[^-]/, "\\$&" );

// One or more consecutive sentence-delimiter characters.
const SEN_BREAK = new RegExp(`[${
  [...SEN_CHARS].map( escape_class_char ).join('')
}]+`,"g");
// Public API of this sketch.
const lib = { get_sentences, similarity_score, find_closest };

// Export through CommonJS when a `module` object exists; otherwise publish the
// API as `lib` on the global `self` (browser / worker). Feature-detecting with
// `typeof` avoids using a swallowed exception as control flow.
if ( typeof module === "object" && module !== null ) {
  module.exports = lib;
} else {
  Object.assign( self, {lib} );
}
/**
 * Split text into Sentence objects at runs of sentence-delimiter characters.
 *
 * Fragments that are empty or whitespace-only (for example the empty string
 * produced after a trailing ".") are dropped so they are not scored as
 * candidate sentences and do not distort each sentence's index/count.
 * The result is never empty: text without any content yields one empty Sentence.
 *
 * @param {string} text - Text to split.
 * @returns {Sentence[]} Sentences in order; each records its index and the total count.
 */
function get_sentences(text) {
  const parts = text.split( SEN_BREAK ).filter( s => s.trim() !== '' );
  // Keep at least one entry so callers can always read the first element.
  if ( parts.length === 0 ) parts.push( '' );
  return parts.map( (s,i,S) => new Sentence(s,i,S.length) );
}
/**
 * Find the sentence of `text` that is most similar to `sen`.
 *
 * The query is wrapped in a Sentence with index 0 and the same count as the
 * text, then scored against every sentence with similarity_score. On equal
 * scores the earliest sentence wins.
 *
 * @param {string} sen - Query sentence.
 * @param {string} text - Text to search.
 * @returns {?Sentence} The best-scoring sentence, or null when there are no sentences.
 */
function find_closest( sen, text ) {
  const sens = get_sentences( text );
  // Use a separate binding instead of overwriting the string parameter.
  const query = new Sentence( sen, 0, sens.length );
  let best = null;
  // A single pass is enough to find the maximum; no need to sort every score.
  for ( const tsen of sens ) {
    const score = similarity_score( query, tsen );
    // A NaN score never wins against a real number, and a real number always
    // replaces a NaN that happened to come first.
    if ( best === null || Number.isNaN( best.score ) || score > best.score ) {
      best = { score, tsen };
    }
  }
  return best === null ? null : best.tsen;
}
/**
 * Weighted similarity of two sentences, combining three components:
 *   - idiff: closeness of their relative positions (index / count),
 *   - tdiff: 1 minus the edit distance divided by the sum of both text lengths,
 *   - wdiff: shared tokens divided by all distinct tokens of both sentences.
 * Each component is multiplied by its entry in WEIGHT and the results are summed.
 *
 * @param {Sentence} sen1
 * @param {Sentence} sen2
 * @returns {number} The weighted score; larger means more similar.
 */
function similarity_score(sen1,sen2) {
  // A count of zero would give 0/0; treat such a sentence as being at position 0.
  const position = sen => sen.count > 0 ? sen.index/sen.count : 0;
  const idiff = 1.0 - Math.abs( position(sen1) - position(sen2) );

  // Two empty texts are identical; without this guard the ratio is 0/0 = NaN.
  const total_length = sen1.text.length + sen2.text.length;
  const tdiff = total_length === 0
    ? 1.0
    : 1.0 - edit_distance(sen1.text,sen2.text)/total_length;

  // Two empty token sets are likewise treated as identical.
  const all_emes = union_size(sen1.emes,sen2.emes);
  const wdiff = all_emes === 0
    ? 1.0
    : intersection_size(sen1.emes,sen2.emes)/all_emes;

  return WEIGHT.I*idiff + WEIGHT.T*tdiff + WEIGHT.W*wdiff;
}
/**
 * A sentence together with its tokens and its position in the source text.
 */
class Sentence {
  /**
   * @param {string} s - Sentence text.
   * @param {number} index - Position of this sentence among its siblings.
   * @param {number} count - Total number of sentences in the same text.
   */
  constructor(s,index,count) {
    this.text = s;
    // Distinct tokens of the sentence, as produced by get_emes.
    this.emes = new Set(get_emes(s));
    this.index = index;
    this.count = count;
  }
}
/**
 * Number of elements that two sets have in common.
 *
 * @param {Set} s
 * @param {Set} t
 * @returns {number} Size of the intersection of s and t.
 */
function intersection_size(s,t) {
  // Walk the smaller set and probe the larger one; this counts shared members
  // directly instead of materialising the whole union just to subtract it.
  const [small, large] = s.size <= t.size ? [s, t] : [t, s];
  let shared = 0;
  for ( const item of small ) {
    if ( large.has( item ) ) shared += 1;
  }
  return shared;
}
/**
 * Number of distinct elements found in either collection.
 *
 * @param {Iterable} s
 * @param {Iterable} t
 * @returns {number} Size of the union of s and t.
 */
function union_size(s,t) {
  // Copy s, then add t's members; the Set discards duplicates.
  const merged = new Set( s );
  for ( const item of t ) merged.add( item );
  return merged.size;
}
/**
 * Break a sentence into the tokens used for the shared-token comparison.
 *
 * Text that lang() classifies as 'zh' is split into individual characters;
 * any other text is split into words at runs of whitespace.
 *
 * @param {string} s - Sentence text.
 * @returns {string[]} Tokens; never an empty array.
 */
function get_emes(s) {
  // Array.from iterates by code point, so a character stored as a surrogate
  // pair stays in one piece (split('') would cut it into two halves).
  if ( lang(s) === 'zh' ) return Array.from( s );

  // Leading or trailing whitespace makes split() emit "" tokens; dropping them
  // stops unrelated sentences from appearing to share a word.
  const words = s.split( WORD_BREAK ).filter( word => word !== '' );

  // Fall back to a single empty token so the token set is never empty and a
  // caller dividing by the union size of two token sets never divides by zero.
  return words.length > 0 ? words : [''];
}
/**
 * Classify a string as 'zh' or 'nonzh'.
 *
 * A string is 'zh' when it contains at least one ZH_COMMON match and no
 * NONZH_COMMON match; everything else is 'nonzh'.
 *
 * @param {string} s - Text to classify.
 * @returns {string} Either 'zh' or 'nonzh'.
 */
function lang(s) {
  // search() stops at the first hit and ignores the regex's global flag and
  // lastIndex, so no array of every match is built just to test for presence.
  const has_zh = s.search( ZH_COMMON ) !== -1;
  const has_nonzh = s.search( NONZH_COMMON ) !== -1;
  return has_zh && !has_nonzh ? 'zh' : 'nonzh';
}
} | |
;}, 0) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "requirebin-sketch", | |
"version": "1.0.0", | |
"dependencies": { | |
"fast-levenshtein": "2.0.6" | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- contents of this file will be placed inside the <body> --> | |
<form id=form> | |
<p><input name=sentence value="abc"> | |
<p><textarea name=text>abc. def. sdfkhsdf. 1231d.</textarea> | |
<p><output name=result></output> | |
<p><button name=find>Find</button> | |
</form> | |
<script> | |
// Look the form up explicitly rather than relying on the implicit global that
// the browser creates for an element with id=form.
const demo_form = document.getElementById('form');

// On "Find": run the search with the current field values and show the text
// of the closest sentence in the <output> element.
demo_form.elements.namedItem('find').addEventListener('click', e => {
  // The button sits inside a form; stop the click from submitting it.
  e.preventDefault();
  const { sentence, text, result } = demo_form.elements;
  result.value = lib.find_closest(sentence.value, text.value).text;
});
</script> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- contents of this file will be placed inside the <head> --> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment