Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
langid.js
<!DOCTYPE HTML>
<html>
<head>
<meta charset="utf-8">
<title>langid.js demonstration</title>
<!--
  Load order matters: langid.js reads the externally supplied packed model
  globals while it is being evaluated, so the model file must come first.
  NOTE(review): raw.githubusercontent.com serves files as text/plain together
  with X-Content-Type-Options: nosniff, so browsers may refuse to execute this
  script. Hosting a copy of langid-model-full.js next to this page looks more
  reliable; TODO confirm and switch to a local path.
-->
<script src="https://raw.githubusercontent.com/saffsd/langid.js/master/langid-model-full.js"></script>
<script src="langid.js"></script>
<!-- Fetched over HTTPS so the page also works from an https origin (no mixed-content block). -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js"></script>
<script type="text/javascript">
$(document).ready(function() {
  // Number of result rows available in #rankTable (#lang0..#lang4 / #conf0..#conf4).
  var MAX_ROWS = 5;

  // Rank the current textarea contents and refresh the result table.
  function displayType() {
    var contents = $("#typerArea").val();
    if (contents.length === 0) {
      $("#rankTable").hide();
      return;
    }
    var rank = langid.rank(contents);
    for (var i = 0; i < MAX_ROWS; i++) {
      // A model may know fewer than MAX_ROWS languages; blank the unused rows
      // instead of dereferencing an undefined entry.
      var entry = rank[i];
      // .text() rather than .html(): the values are plain labels/numbers, not markup.
      $("#lang" + i).text(entry ? entry["lang"] : "");
      $("#conf" + i).text(entry ? entry["logprob"] : "");
    }
    $("#rankTable").show();
  }

  $("#typerArea").keyup(displayType);
  // Live update is available, so the manual submit fallback is not needed,
  // and the table stays hidden until there is something to show.
  $("#manualSubmit").remove();
  $("#rankTable").hide();
});
</script>
</head>
<body>
<!--
  Fallback form. When the inline script in the head runs, it removes
  #manualSubmit and hides #rankTable until the textarea has content.
-->
<form method="post">
<center><table>
<tr>
<td>
<textarea name="q" id="typerArea" cols="40" rows="6"></textarea><br>
</td>
</tr>
<tr>
<td>
<!-- One row per ranked language: #langN holds the label, #confN the log-probability. -->
<table id="rankTable">
<tr>
<td id="lang0">
<!-- Only visible when the script did not run; otherwise the table is hidden and this cell is overwritten by the first result. -->
<p>Unable to load jQuery, live update disabled.</p>
</td><td id="conf0"></td>
</tr>
<tr><td id="lang1"></td><td id="conf1"></td></tr>
<tr><td id="lang2"></td><td id="conf2"></td></tr>
<tr><td id="lang3"></td><td id="conf3"></td></tr>
<tr><td id="lang4"></td><td id="conf4"></td></tr>
</table>
<input type="submit" id="manualSubmit" value="submit">
</td>
</tr>
</table></center>
</form>
</body>
</html>
// langid.js is a direct port of langid.py to JavaScript.
// For license conditions, see the LICENSE file in the same repository.
// Marco Lui <saffsd@gmail.com>, July 2014
"use strict";
var langid = (function() {
var my = {};
/**
 * Decode a base64 string into a typed array.
 *
 * The decoded bytes are copied into a fresh buffer, which is then viewed
 * through the requested typed-array constructor.
 *
 * @param {string} encStr - base64-encoded binary data.
 * @param {Function} arraytype - typed-array constructor (e.g. Uint16Array)
 *     used to view the decoded bytes.
 * @returns {*} a new `arraytype` instance over the decoded bytes.
 */
function base64ToArray(encStr, arraytype) {
  var binary = atob(encStr);
  var bytes = new Uint8Array(binary.length);
  var pos = bytes.length;
  // atob yields one character per byte; copy each char code into the buffer.
  while (pos--) {
    bytes[pos] = binary.charCodeAt(pos);
  }
  return new arraytype(bytes.buffer);
}
// Unpack the model. The underscore-prefixed packed values (_tk_nextmove,
// _tk_output, _nb_pc, _nb_ptc, _nb_classes) are externally supplied.
var tk_nextmove = base64ToArray(_tk_nextmove, Uint16Array);
var tk_output_packed = base64ToArray(_tk_output, Uint16Array);
var nb_pc = base64ToArray(_nb_pc, Float64Array);
var nb_ptc = base64ToArray(_nb_ptc, Float64Array);
var nb_classes = _nb_classes;

// Expand tk_output into a state -> feature-index-list lookup.
// Packed layout: element 0 is the number of records; each record is
// [state, count, then `count` entries belonging to that state].
var tk_output = {};
var cursor = 1;
for (var remaining = tk_output_packed[0]; remaining > 0; remaining--) {
  var state = tk_output_packed[cursor];
  var count = tk_output_packed[cursor + 1];
  var first = cursor + 2;
  cursor = first + count;
  // subarray gives a view over the packed buffer rather than a copy
  tk_output[state] = tk_output_packed.subarray(first, cursor);
}

// Derived model dimensions.
var num_langs = nb_classes.length;
var num_features = nb_ptc.length / num_langs;
var num_states = tk_nextmove.length / 256; // 256 transitions per state
console.log("unpacked a langid model: " + num_langs + " langs, " + num_features + " feats, " + num_states + " states.");
/**
 * Convert raw input text into a vector of feature counts.
 *
 * @param {string} str - text to featurize.
 * @returns {Uint32Array} counts indexed by feature, of length num_features.
 */
my.textToFv = function(str){
  // The model operates at byte level, and most of the training data used
  // was UTF8, so the string is UTF8-encoded first: after this call every
  // character of `bytes` carries one encoded byte in its char code.
  var bytes = unescape(encodeURIComponent(str));

  // Walk the transition table from state 0, counting visits per state.
  var visits = new Uint32Array(num_states);
  var state = 0;
  for (var pos = 0; pos < bytes.length; pos++) {
    state = tk_nextmove[(state << 8) + bytes.charCodeAt(pos)];
    visits[state] += 1;
  }

  // Turn state visit counts into feature counts.
  var fv = new Uint32Array(num_features);
  for (var st = 0; st < num_states; st++) {
    var hits = visits[st];
    var emits = hits > 0 && (st in tk_output);
    if (!emits) {
      continue;
    }
    var feats = tk_output[st];
    for (var k = 0; k < feats.length; k++) {
      fv[feats[k]] += hits; // every feature tied to this state gains its visits
    }
  }
  return fv;
};
/**
 * Score every language for a feature-count vector.
 *
 * @param {Uint32Array} fv - feature counts, indexed by feature.
 * @returns {Float64Array} one accumulated score per language, in
 *     nb_classes order.
 */
my.fvToLogprob = function(fv){
  // Start from a copy of nb_pc so the model array itself is not modified.
  var logprob = new Float64Array(nb_pc);
  for (var feat = 0; feat < num_features; feat++) {
    var count = fv[feat];
    if (!(count > 0)) {
      continue; // features that did not fire contribute nothing
    }
    // nb_ptc is laid out feature-major: num_langs consecutive entries per feature
    var row = feat * num_langs;
    for (var lang = 0; lang < num_langs; lang++) {
      logprob[lang] += count * nb_ptc[row + lang];
    }
  }
  return logprob;
};
/**
 * Pick the language with the highest score.
 *
 * Ties keep the earliest index. The chosen prediction is also logged to
 * the console.
 *
 * @param {Float64Array} logprob - one score per language.
 * @returns {*} the nb_classes entry at the best-scoring index.
 */
my.logprobToPred = function(logprob){
  var best = 0;
  var cand = 1;
  while (cand < num_langs) {
    if (logprob[cand] > logprob[best]) {
      best = cand;
    }
    cand++;
  }
  console.log('pred: '+best+ ' lang: '+ nb_classes[best] + ' logprob: ' + logprob[best]);
  return nb_classes[best];
};
/**
 * Build the full list of languages ordered from highest to lowest score.
 *
 * @param {Float64Array} logprob - one score per language.
 * @returns {Array} objects of the form {lang, logprob}, sorted by
 *     descending logprob.
 */
my.logprobToRank= function(logprob){
  var byScoreDescending = function(a, b) {
    return b.logprob - a.logprob;
  };
  var ranked = [];
  for (var idx = 0; idx < num_langs; idx++) {
    var entry = {};
    entry.lang = nb_classes[idx];
    entry.logprob = logprob[idx];
    ranked.push(entry);
  }
  return ranked.sort(byScoreDescending);
};
/**
 * Identify the single most likely language of a text.
 *
 * @param {string} str - text to classify.
 * @returns {*} the best-scoring entry of nb_classes.
 */
my.identify = function(str){
  // text -> feature counts -> per-language scores -> best label
  return my.logprobToPred(my.fvToLogprob(my.textToFv(str)));
};
/**
 * Rank all languages for a text, best first.
 *
 * @param {string} str - text to classify.
 * @returns {Array} {lang, logprob} objects sorted by descending logprob.
 */
my.rank = function(str){
  // text -> feature counts -> per-language scores -> sorted ranking
  return my.logprobToRank(my.fvToLogprob(my.textToFv(str)));
};
return my;
})();

All code taken from: https://github.com/saffsd/langid.js

Introduction

langid.js is a direct port of the language identifier implemented by langid.py. The theory behind the method is described in two published research papers [1,2]. langid.js does not implement the training of the model, and instead provides a tool ldpy2ldjs.py to convert models trained with the langid.py training tools.

Demonstration

Open demo.html in a browser. langid.js uses TypedArrays so a browser that supports them is required.

Usage

The models and the actual classifier are distributed as two separate JavaScript files, and both must be included in a page for langid.js to work. In this repository, I initially provide langid-model-acquis.js, a toy 4-language model based only on JRC-Acquis data that is useful for testing and development, as well as langid-model-full.js, the same model that is packaged by default with langid.py.

References

[1] http://aclweb.org/anthology-new/I/I11/I11-1062.pdf

[2] http://www.aclweb.org/anthology/P/P12/P12-3005.pdf

© 2020 GitHub, Inc.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment