MarZab/ngrams.js

## ngrams.js
'use strict';
/*
 N-Gram-Based Text Categorisation

 Implementation by marko@zabreznik.net
 28.3.2015 All rights reserved.

 Based on the paper by:
 William B. Cavnar and John M. Trenkle
 Environmental Research Institute of Michigan
 P.O. Box 134001
 Ann Arbor MI 48113-4001

 http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.42.4093&rep=rep1&type=pdf

 Needs (npm install):
  IO.JS, for ECMAScript 6 - https://iojs.org/en/index.html
  Collections, for SortedSet - http://www.collectionsjs.com/
  Console.table, for lovely table output - https://github.com/bahmutov/console.table
 */

var SortedSet = require("collections/sorted-set");
require('console.table');
var fs = require('fs');
var async = require('async');

/**
 * Create Map of word occurances from file
 * @param file
 * @param callback
 */
exports.readWords = function (file, debug, callback) {

    console.log(debug + ' - Reading');

    // open file stream
    var stream = fs.createReadStream(file, {
        flags: 'r', encoding: 'utf-8'
    });

    stream.on('open', function () {
        console.time(debug + ' - Reading time'); // time word reading
    });

    // Regex to extract text
    var wordsOnly = /[\w']+/g;
    //var wordsOnly = /[a-zA-Z']+/g; // wayyyy faster
    var words = new Map();

    stream.on('data', function (chunk) {
        // We might have cut off a word here,
        //  but chunk size makes the mistake negligible.
        chunk.replace(wordsOnly, function (word) {
            // We get a word here, store and count it
            var e = words.get(word);
            if (e) {
                words.set(word, 1 + e);
            } else {
                words.set(word, 1);
            }

        });
    });

    stream.on('error', function (err) {
        console.log(debug + ' - ', err);
        callback();
    });

    stream.on('end', function () {
        console.log(debug + ' - Read ' + words.size + ' words');
        console.timeEnd(debug + ' - Reading time');
        callback(words);
    });

};


/**
 * Create list of n-grams from Map of word occurances
 * @param words = MAP(Word=>Count)
 */
exports.nGram = function (words, debug) {

    /*
     2.0 N-Grams
         An N-gram is an N-character slice of a longer
         string. Although in the literature the term can
         include the notion of any co-occurring set of
         characters in a string (e.g., an N-gram made up
         of the first and third character of a word), in this
         paper we use the term for contiguous slices only.
         Typically, one slices the string into a set of overlapping
         N-grams. In our system, we use N-grams
         of several different lengths simultaneously. We
         also append blanks to the beginning and ending
         of the string in order to help with matching
         beginning-of-word and ending-of-word situations.
         (We will use the underscore character (“_”)
         to represent blanks.)
     */

    var N = [1, 2, 3, 4, 5];
    var nGrams = new Map();

    console.time(debug + ' - Creating and counting n-grams');
    words.forEach(function (times, word) {
        var prefixedWord = '_' + word;
        N.forEach(function (n) {
            var paddedWord = prefixedWord + Array(n).join('_');
            for (var i = paddedWord.length - n; i >= 0; i--) {
                // we got an n-gram
                var nGram = paddedWord.substr(i, n);

                var exists = nGrams.get(nGram);
                if (exists) {
                    nGrams.set(nGram, exists + times);
                } else {
                    nGrams.set(nGram, times);
                }
            }
        });
    });
    console.timeEnd(debug + ' - Creating and counting n-grams');
    console.log(debug + ' - Got ' + nGrams.size + ' unique n-grams');

    console.time(debug + ' - Weighting n-grams');
    /*
     3.1.5 Sort those counts into reverse order by the
         number of occurrences. Keep just the Ngrams
         themselves, which are now in
         reverse order of frequency.

     We divert from the paper at this point, we care that ngrams
     with the same weight get the same "distance".

     We got a map of ('ngram' => count)
     put the values (count) into a sorted set
     then set the values of count to the index in the set

     */
    var occurrences = new SortedSet();
    for (var v of nGrams.values()) {
        occurrences.add(v);
    }
    var occurrencesLength = occurrences.length;
    for (var n of nGrams) {
        nGrams.set(n[0], occurrencesLength - occurrences.indexOf(n[1]));
    }
    console.timeEnd(debug + ' - Weighting n-grams'); // this was slow... gotta be a better way .. anywho
    /*
     We return a Map of n-grams with their respective weight
     Save the max weight for faster comparison later on
     */

    nGrams.max = occurrencesLength;
    nGrams.name = debug;

    return nGrams;
};

/**
 * Compare 2 n-grams
 * @param nGram1
 * @param nGram2
 */
exports.compareNGrams = function (trainedNGram, comperativeNgram) {

    /*
     3.2 Comparing and Ranking N-Gram Frequency Profiles
         For each N-gram in the document profile, we
         find its counterpart in the category profile, and
         then calculate how far out of place it is. For
         example, in Figure 3, the N-gram “ING” is at
         rank 2 in the document, but at rank 5 in the category.
         Thus it is 3 ranks out of place. If an N-gram
         (such as “ED” in the figure) is not in the category
         profile, it takes some maximum out-of-place
         value. The sum of all of the out-of-place values
         for all N-grams is the distance measure for the
         document from the category.

     Maximum out ouf place number was not clearly stated in the paper.
     For best performace we picked one more the max oop from the test set.
     */

    var maxoutOfPlace = comperativeNgram.max +1;
    var sum = 0;
    var undef = 0;

    console.log('Comparing', trainedNGram.name, 'with', comperativeNgram.name, maxoutOfPlace);

    for (var cnGram of comperativeNgram) {
        var tnGram = trainedNGram.get(cnGram[0]);
        // is this nGram in the trained set?
        if (tnGram !== undefined && tnGram < 400) {
            // cut the
            sum += Math.abs(cnGram[1]-tnGram);
        } else {
            undef ++;
            sum += maxoutOfPlace;
        }
    }

    return {
        'test set': trainedNGram.name,
        //compared: comperativeNgram.name,
        'hard misses': undef / comperativeNgram.size,
        distance: sum,
        averageOOP: ( sum + undef * maxoutOfPlace ) / comperativeNgram.size
    };
};


exports.getNGramFromFile = function (filename, debug, callback) {
    exports.readWords(filename, debug, function (words) {
        if (words) {
            return callback(exports.nGram(words, debug));
        }
        callback();
    });
};

// test

var test = 'test/sl.test';
var train =  ['test/bg_train.txt', 'test/en_train.txt', 'test/cz_train.txt', 'test/es_train.txt', 'test/train.sl'];

exports.getNGramFromFile(test, test, function (nGram) {

    async.mapSeries(
        train,
        function (file, callback) {
            exports.getNGramFromFile(file, file, function (trainedNGram) {

                /*
                     4.3  ... we also varied
                         the number of the N-gram frequencies available
                         in the profile for the match, by limiting it to
                         statistics for 100, 200, 300 or 400 N-grams. This
                         variable did have an impact on match performance,
                         although by the 400 N-gram level language
                         classification was almost perfect.
                 */

                callback(null, exports.compareNGrams(trainedNGram, nGram));
            });
        },
        function (err, results) {
            console.log('\nResults for', nGram.name);
            results.sort(function (r1, r2) {
                return r1.distance - r2.distance;
            });
            console.table(results);
            console.log('Done.');
        }
    );

});


## package.json
{
  "name": "n-grams",
  "version": "0.0.1",
  "private": true,
  "dependencies": {
    "async": "^0.9.0",
    "collections": "^1.2.2",
    "console.table": "^0.4.0"
  },
    "engines": {
      "iojs": ">= 1.6.2"
  }
}
	'use strict';
	/*
	N-Gram-Based Text Categorisation

	Implementation by marko@zabreznik.net
	28.3.2015 All rights reserved.

	Based on the paper by:
	William B. Cavnar and John M. Trenkle
	Environmental Research Institute of Michigan
	P.O. Box 134001
	Ann Arbor MI 48113-4001

	http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.42.4093&rep=rep1&type=pdf

	Needs (npm install):
	IO.JS, for ECMAScript 6 - https://iojs.org/en/index.html
	Collections, for SortedSet - http://www.collectionsjs.com/
	Console.table, for lovely table output - https://github.com/bahmutov/console.table
	*/

	var SortedSet = require("collections/sorted-set");
	require('console.table');
	var fs = require('fs');
	var async = require('async');

	/**
	* Create Map of word occurances from file
	* @param file
	* @param callback
	*/
	exports.readWords = function (file, debug, callback) {

	console.log(debug + ' - Reading');

	// open file stream
	var stream = fs.createReadStream(file, {
	flags: 'r', encoding: 'utf-8'
	});

	stream.on('open', function () {
	console.time(debug + ' - Reading time'); // time word reading
	});

	// Regex to extract text
	var wordsOnly = /[\w']+/g;
	//var wordsOnly = /[a-zA-Z']+/g; // wayyyy faster
	var words = new Map();

	stream.on('data', function (chunk) {
	// We might have cut off a word here,
	// but chunk size makes the mistake negligible.
	chunk.replace(wordsOnly, function (word) {
	// We get a word here, store and count it
	var e = words.get(word);
	if (e) {
	words.set(word, 1 + e);
	} else {
	words.set(word, 1);
	}

	});
	});

	stream.on('error', function (err) {
	console.log(debug + ' - ', err);
	callback();
	});

	stream.on('end', function () {
	console.log(debug + ' - Read ' + words.size + ' words');
	console.timeEnd(debug + ' - Reading time');
	callback(words);
	});

	};


	/**
	* Create list of n-grams from Map of word occurances
	* @param words = MAP(Word=>Count)
	*/
	exports.nGram = function (words, debug) {

	/*
	2.0 N-Grams
	An N-gram is an N-character slice of a longer
	string. Although in the literature the term can
	include the notion of any co-occurring set of
	characters in a string (e.g., an N-gram made up
	of the first and third character of a word), in this
	paper we use the term for contiguous slices only.
	Typically, one slices the string into a set of overlapping
	N-grams. In our system, we use N-grams
	of several different lengths simultaneously. We
	also append blanks to the beginning and ending
	of the string in order to help with matching
	beginning-of-word and ending-of-word situations.
	(We will use the underscore character (“_”)
	to represent blanks.)
	*/

	var N = [1, 2, 3, 4, 5];
	var nGrams = new Map();

	console.time(debug + ' - Creating and counting n-grams');
	words.forEach(function (times, word) {
	var prefixedWord = '_' + word;
	N.forEach(function (n) {
	var paddedWord = prefixedWord + Array(n).join('_');
	for (var i = paddedWord.length - n; i >= 0; i--) {
	// we got an n-gram
	var nGram = paddedWord.substr(i, n);

	var exists = nGrams.get(nGram);
	if (exists) {
	nGrams.set(nGram, exists + times);
	} else {
	nGrams.set(nGram, times);
	}
	}
	});
	});
	console.timeEnd(debug + ' - Creating and counting n-grams');
	console.log(debug + ' - Got ' + nGrams.size + ' unique n-grams');

	console.time(debug + ' - Weighting n-grams');
	/*
	3.1.5 Sort those counts into reverse order by the
	number of occurrences. Keep just the Ngrams
	themselves, which are now in
	reverse order of frequency.

	We divert from the paper at this point, we care that ngrams
	with the same weight get the same "distance".

	We got a map of ('ngram' => count)
	put the values (count) into a sorted set
	then set the values of count to the index in the set

	*/
	var occurrences = new SortedSet();
	for (var v of nGrams.values()) {
	occurrences.add(v);
	}
	var occurrencesLength = occurrences.length;
	for (var n of nGrams) {
	nGrams.set(n[0], occurrencesLength - occurrences.indexOf(n[1]));
	}
	console.timeEnd(debug + ' - Weighting n-grams'); // this was slow... gotta be a better way .. anywho
	/*
	We return a Map of n-grams with their respective weight
	Save the max weight for faster comparison later on
	*/

	nGrams.max = occurrencesLength;
	nGrams.name = debug;

	return nGrams;
	};

	/**
	* Compare 2 n-grams
	* @param nGram1
	* @param nGram2
	*/
	exports.compareNGrams = function (trainedNGram, comperativeNgram) {

	/*
	3.2 Comparing and Ranking N-Gram Frequency Profiles
	For each N-gram in the document profile, we
	find its counterpart in the category profile, and
	then calculate how far out of place it is. For
	example, in Figure 3, the N-gram “ING” is at
	rank 2 in the document, but at rank 5 in the category.
	Thus it is 3 ranks out of place. If an N-gram
	(such as “ED” in the figure) is not in the category
	profile, it takes some maximum out-of-place
	value. The sum of all of the out-of-place values
	for all N-grams is the distance measure for the
	document from the category.

	Maximum out ouf place number was not clearly stated in the paper.
	For best performace we picked one more the max oop from the test set.
	*/

	var maxoutOfPlace = comperativeNgram.max +1;
	var sum = 0;
	var undef = 0;

	console.log('Comparing', trainedNGram.name, 'with', comperativeNgram.name, maxoutOfPlace);

	for (var cnGram of comperativeNgram) {
	var tnGram = trainedNGram.get(cnGram[0]);
	// is this nGram in the trained set?
	if (tnGram !== undefined && tnGram < 400) {
	// cut the
	sum += Math.abs(cnGram[1]-tnGram);
	} else {
	undef ++;
	sum += maxoutOfPlace;
	}
	}

	return {
	'test set': trainedNGram.name,
	//compared: comperativeNgram.name,
	'hard misses': undef / comperativeNgram.size,
	distance: sum,
	averageOOP: ( sum + undef * maxoutOfPlace ) / comperativeNgram.size
	};
	};


	exports.getNGramFromFile = function (filename, debug, callback) {
	exports.readWords(filename, debug, function (words) {
	if (words) {
	return callback(exports.nGram(words, debug));
	}
	callback();
	});
	};

	// test

	var test = 'test/sl.test';
	var train = ['test/bg_train.txt', 'test/en_train.txt', 'test/cz_train.txt', 'test/es_train.txt', 'test/train.sl'];

	exports.getNGramFromFile(test, test, function (nGram) {

	async.mapSeries(
	train,
	function (file, callback) {
	exports.getNGramFromFile(file, file, function (trainedNGram) {

	/*
	4.3 ... we also varied
	the number of the N-gram frequencies available
	in the profile for the match, by limiting it to
	statistics for 100, 200, 300 or 400 N-grams. This
	variable did have an impact on match performance,
	although by the 400 N-gram level language
	classification was almost perfect.
	*/

	callback(null, exports.compareNGrams(trainedNGram, nGram));
	});
	},
	function (err, results) {
	console.log('\nResults for', nGram.name);
	results.sort(function (r1, r2) {
	return r1.distance - r2.distance;
	});
	console.table(results);
	console.log('Done.');
	}
	);

	});
	{
	"name": "n-grams",
	"version": "0.0.1",
	"private": true,
	"dependencies": {
	"async": "^0.9.0",
	"collections": "^1.2.2",
	"console.table": "^0.4.0"
	},
	"engines": {
	"iojs": ">= 1.6.2"
	}
	}