Last active
April 13, 2017 15:21
-
-
Save GregaMohorko/9a52f1699bbe38b38139edaa72417d9f to your computer and use it in GitHub Desktop.
N-Gram-Based text categorization algorithm that can be used for language detection.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
namespace GM.NLP.TextCategorization | |
{ | |
/// <summary>
/// A category (e.g. language) profile: a name together with its most frequent
/// N-grams, ordered from most to least frequent. The rank order is significant:
/// the out-of-place distance measure compares N-gram positions in this array.
/// </summary>
public struct Profile
{
	// Display name of the category (e.g. the language name).
	// May be null for document profiles generated without a name.
	public string Name;
	// N-grams sorted by descending frequency of occurrence.
	public string[] NGrams;
}
/// <summary>
/// N-Gram-Based Text Categorization algorithm used for language detection.
/// <para>
/// Based on this article: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.53.9367
/// </para>
/// </summary>
public static class LanguageDetection
{
	// Maximum N-gram length generated per token (the article uses N = 1..5).
	private const int MaxNGramLength = 5;
	// The top 300 or so N-grams are almost always highly correlated to the language.
	private const int ProfileSize = 300;

	/// <summary>
	/// Returns the name of the category whose profile is closest to the profile of the provided text.
	/// </summary>
	/// <param name="text">The text to categorize.</param>
	/// <param name="categories">The candidate category profiles. Must contain at least one profile.</param>
	/// <returns>The <see cref="Profile.Name"/> of the best-matching category.</returns>
	/// <exception cref="ArgumentNullException">Thrown when <paramref name="text"/> or <paramref name="categories"/> is null.</exception>
	/// <exception cref="ArgumentException">Thrown when <paramref name="categories"/> is empty.</exception>
	public static string DetectLanguage(string text, List<Profile> categories)
	{
		if(text == null) {
			throw new ArgumentNullException("text");
		}
		if(categories == null) {
			throw new ArgumentNullException("categories");
		}
		if(categories.Count == 0) {
			// previously this surfaced as an IndexOutOfRangeException below; fail explicitly instead
			throw new ArgumentException("At least one category profile must be provided.", "categories");
		}

		Profile document = GenerateProfile(text);

		// take the distance measure from all the category profiles to the document profile and pick the smallest one
		int indexOfTheBest = -1;
		long best = long.MaxValue;
		for(int i = categories.Count - 1; i >= 0; --i) {
			long distance = MeasureProfileDistance(document, categories[i]);
			if(distance < best) {
				indexOfTheBest = i;
				best = distance;
			}
		}
		return categories[indexOfTheBest].Name;
	}

	/// <summary>
	/// Generates an N-gram frequency profile of the provided text.
	/// </summary>
	/// <param name="text">The text to profile.</param>
	/// <param name="name">Optional name of the profile (e.g. the language it represents).</param>
	public static Profile GenerateProfile(string text, string name = null)
	{
		// ToUpperInvariant: profiling must not depend on the current culture
		// (e.g. under the Turkish culture 'i' would not upper-case to 'I').
		string[] tokens = text.ToUpperInvariant().SplitToTokens();

		// hash table which has the N-grams as the key, and its counter for value
		var allNGrams = new Dictionary<string, ulong>();
		foreach(string token in tokens) {
			// scan down each token, generating all possible N-grams, for N=1 to 5
			for(int n = 1; n <= MaxNGramLength; ++n) {
				foreach(string ngram in SplitToNGrams(token, n)) {
					// single hash lookup to read the counter (missing key yields 0), then increment
					ulong count;
					allNGrams.TryGetValue(ngram, out count);
					allNGrams[ngram] = count + 1;
				}
			}
		}

		// sort by the number of occurrences in reverse order, keep just the N-grams themselves,
		// and keep only the top entries, which are highly correlated to the language
		string[] topNGrams = allNGrams
			.OrderByDescending(g => g.Value)
			.Select(g => g.Key)
			.Take(ProfileSize)
			.ToArray();

		Profile result;
		result.Name = name;
		result.NGrams = topNGrams;
		return result;
	}

	/// <summary>
	/// Calculates a simple rank-order statistic, called 'out-of-place' measure. It determines how far
	/// out of place an N-gram in one profile is from its place in the other profile.
	/// </summary>
	private static long MeasureProfileDistance(Profile document, Profile category)
	{
		long maxOutOfPlaceDistance = Math.Max(document.NGrams.LongLength, category.NGrams.LongLength);

		// index the category N-grams by rank once, so the per-N-gram lookup below is O(1)
		// instead of the previous linear scan (O(n*m) overall); on duplicate N-grams the
		// highest rank wins, matching the original reverse scan that broke at the last occurrence
		var categoryRanks = new Dictionary<string, long>();
		for(long j = 0; j < category.NGrams.LongLength; ++j) {
			categoryRanks[category.NGrams[j]] = j;
		}

		// the sum of all of the out-of-place values for all N-grams
		long sum = 0;
		for(long i = document.NGrams.LongLength - 1; i >= 0; --i) {
			long rank;
			if(categoryRanks.TryGetValue(document.NGrams[i], out rank)) {
				// how far out of place this N-gram is
				sum += Math.Abs(i - rank);
			} else {
				// an N-gram that is not in the category profile at all
				// contributes the maximum out-of-place value
				sum += maxOutOfPlaceDistance;
			}
		}
		return sum;
	}

	/// <summary>
	/// Splits the provided text into separate tokens (words) consisting only of letters and apostrophes.
	/// Digits and punctuation are discarded.
	/// </summary>
	private static string[] SplitToTokens(this string text)
	{
		return text.Clean().Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
	}

	/// <summary>
	/// Splits a word to n-grams for the provided word and the specified gram size.
	/// <para>
	/// Example for word 'TEXT':
	/// </para>
	/// <para>
	/// bi-grams: _T, TE, EX, XT, T_
	/// </para>
	/// <para>
	/// tri-grams: _TE, TEX, EXT, XT_, T__
	/// </para>
	/// <para>
	/// quad-grams: _TEX, TEXT, EXT_, XT__, T___
	/// </para>
	/// </summary>
	private static string[] SplitToNGrams(string word, int n)
	{
		// after prepending the '_' start marker the word yields (original length + 1) N-grams,
		// one starting at every position of the marked word
		string[] ngrams = new string[word.Length + 1];
		word = '_' + word;
		for(int i = 0; i < word.Length; ++i) {
			string ngram;
			if(i + n <= word.Length) {
				ngram = word.Substring(i, n);
			} else {
				// N-grams that run off the end of the word are padded with '_'
				int padLength = i + n - word.Length;
				ngram = word.Substring(i, n - padLength) + new string('_', padLength);
			}
			ngrams[i] = ngram;
		}
		return ngrams;
	}

	/// <summary>
	/// Cleans the provided text, leaving only letters, apostrophes and spaces. Digits and punctuation are discarded.
	/// </summary>
	private static string Clean(this string text)
	{
		// replace dot with space in case there is a case of: "word1.word2"
		text = text.Replace('.', ' ');
		var sb = new StringBuilder(text.Length);
		foreach(char c in text) {
			if(char.IsLetter(c) || c == '\'' || c == ' ') {
				sb.Append(c);
			}
		}
		return sb.ToString();
	}
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment