Skip to content

Instantly share code, notes, and snippets.

@GregaMohorko
Last active April 13, 2017 15:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save GregaMohorko/9a52f1699bbe38b38139edaa72417d9f to your computer and use it in GitHub Desktop.
Save GregaMohorko/9a52f1699bbe38b38139edaa72417d9f to your computer and use it in GitHub Desktop.
N-Gram-Based text categorization algorithm that can be used for language detection.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace GM.NLP.TextCategorization
{
public struct Profile
{
public string Name;
public string[] NGrams;
}
/// <summary>
/// N-Gram-Based Text Categorization algorithm used for language detection.
/// <para>
/// Based on this article: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.53.9367
/// </para>
/// </summary>
public static class LanguageDetection
{
/// <summary>
/// Returns the name of the detected category.
/// </summary>
public static string DetectLanguage(string text,List<Profile> categories)
{
Profile document = GenerateProfile(text);
// take the distance measure from all the category profiles to the document profile and pick the smallest one
int indexOfTheBest=-1;
long best = long.MaxValue;
for(int i=categories.Count-1;i>=0;--i) {
Profile category = categories[i];
long distance = MeasureProfileDistance(document, category);
if(distance < best) {
indexOfTheBest = i;
best = distance;
}
}
Profile bestFit = categories[indexOfTheBest];
return bestFit.Name;
}
public static Profile GenerateProfile(string text,string name=null)
{
string[] tokens = text.ToUpper().SplitToTokens();
// hash table which has the NGrams as the key, and its counter for value
Dictionary<string, ulong> allNGrams = new Dictionary<string, ulong>();
foreach(string token in tokens) {
// scan down each token, generating all possible N-grams, for N=1 to 5
for(int n = 1; n <= 5; ++n) {
string[] currentNGrams = SplitToNGrams(token, n);
foreach(string ngram in currentNGrams) {
// insert into the table to find the counter for the N-gram and increment it
if(allNGrams.ContainsKey(ngram))
++allNGrams[ngram];
else
allNGrams[ngram] = 1;
}
}
}
// sort by the number of occurrences in reverse order, keep just the N-grams themselves
var sortedNGrams = allNGrams.OrderByDescending(g => g.Value).Select(g => g.Key);
// the top 300 or so N-grams are almost always highly correlated to the language
string[] top300NGrams = sortedNGrams.Take(300).ToArray();
Profile result;
result.Name = name;
result.NGrams = top300NGrams;
return result;
}
/// <summary>
/// Calculates a simple rank-order statistic, called 'out-of-place' measure. It determines how far out of place an N-gram in one profile is from its place in the other profile.
/// </summary>
private static long MeasureProfileDistance(Profile document,Profile category)
{
long MAX_OUT_OF_PLACE_DISTANCE = Math.Max(document.NGrams.LongLength, category.NGrams.LongLength);
// the sum of all of the out-of-place values for all N-grams
long sum = 0;
// for each N-gram in the document profile
for(long i= document.NGrams.LongLength-1; i>=0;--i) {
string ngram_doc = document.NGrams[i];
// we find its counterpart in the category profile
long j = category.NGrams.LongLength;
while(--j >= 0) {
string ngram_cat = category.NGrams[j];
if(ngram_cat == ngram_doc)
break;
}
// calculate how far out of place it is
long distance;
if(j > -1) {
distance = Math.Abs(i - j);
}else {
// an N-gram is not in the category profile
// take maximum out-of-place value
distance = MAX_OUT_OF_PLACE_DISTANCE;
}
sum += distance;
}
return sum;
}
/// <summary>
/// Splits the provided text into separate tokens (words) consisting only of letters and apostrophes. Digits and punctuation are discarded.
/// </summary>
private static string[] SplitToTokens(this string text)
{
return text.Clean().Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
}
/// <summary>
/// Splits a word to n-grams for the provided word and the specified gram size.
/// <para>
/// Example for word 'TEXT':
/// </para>
/// <para>
/// bi-grams: _T, TE, EX, XT, T_
/// </para>
/// <para>
/// tri-grams: _TE, TEX, EXT, XT_, T__
/// </para>
/// <para>
/// quad-grams: _TEX, TEXT, EXT_, XT__, T___
/// </para>
/// </summary>
private static string[] SplitToNGrams(string word, int n)
{
string[] ngrams = new string[word.Length + 1];
word = '_' + word;
for(int i = 0; i < word.Length; ++i) {
string ngram;
if(i + n <= word.Length) {
ngram = word.Substring(i, n);
} else {
int padLength = i + n - word.Length;
ngram = word.Substring(i, n - padLength)+new string('_',padLength);
}
ngrams[i] = ngram;
}
return ngrams;
}
/// <summary>
/// Cleans the provided text, leaving only letters apostrophes and spaces. Digits and punctuation are discarded.
/// </summary>
private static string Clean(this string text)
{
// replace dot with space in case there is a case of: "word1.word2"
text = text.Replace('.', ' ');
StringBuilder sb = new StringBuilder(text.Length);
foreach(char c in text) {
if(char.IsLetter(c) || c == '\'' || c == ' ')
sb.Append(c);
}
return sb.ToString();
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment