Last active
April 13, 2017 15:21
-
-
Save GregaMohorko/9a52f1699bbe38b38139edaa72417d9f to your computer and use it in GitHub Desktop.
N-Gram-Based text categorization algorithm that can be used for language detection.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
namespace GM.NLP.TextCategorization | |
{ | |
/// <summary>
/// A category (e.g. language) profile: a name together with its most frequent
/// N-grams, ordered from most to least frequent. The rank order is significant:
/// the out-of-place distance measure compares N-gram positions in this array.
/// </summary>
public struct Profile
{
	// Display name of the category (e.g. the language name).
	// May be null for document profiles generated without a name.
	public string Name;
	// N-grams sorted by descending frequency of occurrence.
	public string[] NGrams;
}
/// <summary>
/// N-Gram-Based Text Categorization algorithm used for language detection.
/// <para>
/// Based on this article: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.53.9367
/// </para>
/// </summary>
public static class LanguageDetection
{
	// Maximum N-gram length generated per token (the article uses N = 1..5).
	private const int MaxNGramLength = 5;
	// The top 300 or so N-grams are almost always highly correlated to the language.
	private const int ProfileSize = 300;

	/// <summary>
	/// Returns the name of the category whose profile is closest to the profile of the provided text.
	/// </summary>
	/// <param name="text">The text to categorize.</param>
	/// <param name="categories">The candidate category profiles. Must contain at least one profile.</param>
	/// <returns>The <see cref="Profile.Name"/> of the best-matching category.</returns>
	/// <exception cref="ArgumentNullException">Thrown when <paramref name="text"/> or <paramref name="categories"/> is null.</exception>
	/// <exception cref="ArgumentException">Thrown when <paramref name="categories"/> is empty.</exception>
	public static string DetectLanguage(string text, List<Profile> categories)
	{
		if(text == null) {
			throw new ArgumentNullException("text");
		}
		if(categories == null) {
			throw new ArgumentNullException("categories");
		}
		if(categories.Count == 0) {
			// previously this surfaced as an IndexOutOfRangeException below; fail explicitly instead
			throw new ArgumentException("At least one category profile must be provided.", "categories");
		}

		Profile document = GenerateProfile(text);

		// take the distance measure from all the category profiles to the document profile and pick the smallest one
		int indexOfTheBest = -1;
		long best = long.MaxValue;
		for(int i = categories.Count - 1; i >= 0; --i) {
			long distance = MeasureProfileDistance(document, categories[i]);
			if(distance < best) {
				indexOfTheBest = i;
				best = distance;
			}
		}
		return categories[indexOfTheBest].Name;
	}

	/// <summary>
	/// Generates an N-gram frequency profile of the provided text.
	/// </summary>
	/// <param name="text">The text to profile.</param>
	/// <param name="name">Optional name of the profile (e.g. the language it represents).</param>
	public static Profile GenerateProfile(string text, string name = null)
	{
		// ToUpperInvariant: profiling must not depend on the current culture
		// (e.g. under the Turkish culture 'i' would not upper-case to 'I').
		string[] tokens = text.ToUpperInvariant().SplitToTokens();

		// hash table which has the N-grams as the key, and its counter for value
		var allNGrams = new Dictionary<string, ulong>();
		foreach(string token in tokens) {
			// scan down each token, generating all possible N-grams, for N=1 to 5
			for(int n = 1; n <= MaxNGramLength; ++n) {
				foreach(string ngram in SplitToNGrams(token, n)) {
					// single hash lookup to read the counter (missing key yields 0), then increment
					ulong count;
					allNGrams.TryGetValue(ngram, out count);
					allNGrams[ngram] = count + 1;
				}
			}
		}

		// sort by the number of occurrences in reverse order, keep just the N-grams themselves,
		// and keep only the top entries, which are highly correlated to the language
		string[] topNGrams = allNGrams
			.OrderByDescending(g => g.Value)
			.Select(g => g.Key)
			.Take(ProfileSize)
			.ToArray();

		Profile result;
		result.Name = name;
		result.NGrams = topNGrams;
		return result;
	}

	/// <summary>
	/// Calculates a simple rank-order statistic, called 'out-of-place' measure. It determines how far
	/// out of place an N-gram in one profile is from its place in the other profile.
	/// </summary>
	private static long MeasureProfileDistance(Profile document, Profile category)
	{
		long maxOutOfPlaceDistance = Math.Max(document.NGrams.LongLength, category.NGrams.LongLength);

		// index the category N-grams by rank once, so the per-N-gram lookup below is O(1)
		// instead of the previous linear scan (O(n*m) overall); on duplicate N-grams the
		// highest rank wins, matching the original reverse scan that broke at the last occurrence
		var categoryRanks = new Dictionary<string, long>();
		for(long j = 0; j < category.NGrams.LongLength; ++j) {
			categoryRanks[category.NGrams[j]] = j;
		}

		// the sum of all of the out-of-place values for all N-grams
		long sum = 0;
		for(long i = document.NGrams.LongLength - 1; i >= 0; --i) {
			long rank;
			if(categoryRanks.TryGetValue(document.NGrams[i], out rank)) {
				// how far out of place this N-gram is
				sum += Math.Abs(i - rank);
			} else {
				// an N-gram that is not in the category profile at all
				// contributes the maximum out-of-place value
				sum += maxOutOfPlaceDistance;
			}
		}
		return sum;
	}

	/// <summary>
	/// Splits the provided text into separate tokens (words) consisting only of letters and apostrophes.
	/// Digits and punctuation are discarded.
	/// </summary>
	private static string[] SplitToTokens(this string text)
	{
		return text.Clean().Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
	}

	/// <summary>
	/// Splits a word to n-grams for the provided word and the specified gram size.
	/// <para>
	/// Example for word 'TEXT':
	/// </para>
	/// <para>
	/// bi-grams: _T, TE, EX, XT, T_
	/// </para>
	/// <para>
	/// tri-grams: _TE, TEX, EXT, XT_, T__
	/// </para>
	/// <para>
	/// quad-grams: _TEX, TEXT, EXT_, XT__, T___
	/// </para>
	/// </summary>
	private static string[] SplitToNGrams(string word, int n)
	{
		// after prepending the '_' start marker the word yields (original length + 1) N-grams,
		// one starting at every position of the marked word
		string[] ngrams = new string[word.Length + 1];
		word = '_' + word;
		for(int i = 0; i < word.Length; ++i) {
			string ngram;
			if(i + n <= word.Length) {
				ngram = word.Substring(i, n);
			} else {
				// N-grams that run off the end of the word are padded with '_'
				int padLength = i + n - word.Length;
				ngram = word.Substring(i, n - padLength) + new string('_', padLength);
			}
			ngrams[i] = ngram;
		}
		return ngrams;
	}

	/// <summary>
	/// Cleans the provided text, leaving only letters, apostrophes and spaces. Digits and punctuation are discarded.
	/// </summary>
	private static string Clean(this string text)
	{
		// replace dot with space in case there is a case of: "word1.word2"
		text = text.Replace('.', ' ');
		var sb = new StringBuilder(text.Length);
		foreach(char c in text) {
			if(char.IsLetter(c) || c == '\'' || c == ' ') {
				sb.Append(c);
			}
		}
		return sb.ToString();
	}
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment