Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
WordSegmentationTM: Fast Word segmentation with a Triangular Matrix
//MIT License: Copyright (c) 2018 Wolf Garbe
//https://github.com/wolfgarbe/WordSegmentationTM
/// <summary>Find best word segmentation for input string.</summary>
/// <param name="input">The string being word segmented.</param>
/// <param name="maxSegmentationWordLength">The maximum word length that should be considered.</param>
/// <returns>A tuple representing the suggested word segmented text and the sum of logarithmic word occurence probabilities.</returns>
public static (string segmentedString, decimal probabilityLogSum) WordSegmentationTM(string input, int maxSegmentationWordLength = 20)
{
int arraySize = Math.Min(maxSegmentationWordLength, input.Length);
int arrayWidth = ((input.Length - 1) >> 6) + 1; // /64 bit
int arrayWidthByte = arrayWidth << 3; //*8 byte
//instead of storing the segmented strings, we store only an array of potential space positions: bit set == space (1 bit instead of 1 char)
ulong[,] segmentedSpaceBits = new ulong[arraySize, arrayWidth];
decimal[] probabilityLogSum = new decimal[arraySize];
int circularIndex = -1;
//A Triangular Matrix of parts is generated (increasing part lengths), organized as Circular Array
//with nested loops of dimensions n*m : n=input.length; m=Min(input.length,maximum word length)
//outer loop (column): all possible part start positions
for (int j = 0; j < input.Length; j++)
{
int spaceUlongIndex = (j - 1) >> 6; // /64 bit
int arrayCopyByte = Math.Min(((spaceUlongIndex + 1) << 3), arrayWidthByte); // *8 byte
//best segmentation for the prefix of length j will be combined with + " " + part1; set space bit in row 0
if (j > 0) segmentedSpaceBits[circularIndex, spaceUlongIndex] |= ((ulong)1 << ((j - 1) & 0x3f)); // %64 bit
//inner loop (row): all possible part lengths (from start position): part can't be bigger than longest word in dictionary (other than long unknown word)
int imax = Math.Min(input.Length - j, maxSegmentationWordLength);
for (int i = 1; i <= imax; i++)
{
int destinationIndex = ((i + circularIndex) % arraySize);
//Calculate the Naive Bayes probability of a sequence of words (iterative in logarithmic scale)
string part1 = input.Substring(j, i);
decimal ProbabilityLogPart1 = 0;
if (dictionary.TryGetValue(part1, out long wordCount)) ProbabilityLogPart1 = (decimal)Math.Log10((double)wordCount / (double)N);
//estimation for unknown words
else ProbabilityLogPart1 = (decimal)Math.Log10(10.0 / (N * Math.Pow(10.0, part1.Length)));
//set values in first loop
if (j == 0)
{
probabilityLogSum[destinationIndex] = ProbabilityLogPart1;
}
//replace values if better probabilityLogSum
else if ((i == maxSegmentationWordLength) || (probabilityLogSum[destinationIndex] < probabilityLogSum[circularIndex] + ProbabilityLogPart1))
{
System.Buffer.BlockCopy(segmentedSpaceBits, circularIndex * arrayWidthByte, segmentedSpaceBits, destinationIndex * arrayWidthByte, arrayCopyByte);
probabilityLogSum[destinationIndex] = probabilityLogSum[circularIndex] + ProbabilityLogPart1;
}
}
circularIndex++; if (circularIndex == arraySize) circularIndex = 0;
}
//create segmented string result from input and segmentedSpaceBits
StringBuilder resultString = new StringBuilder(input.Length * 2);
int last = -1;
for (int i = 0; i <= input.Length - 2; i++) if ((segmentedSpaceBits[circularIndex, i >> 6] & ((ulong)1 << (i & 0x3f))) > 0)
{
resultString.Append(input, last + 1, i - last);
resultString.Append(' ');
last = i;
}
return (resultString.Append(input.Substring(last + 1)).ToString(), probabilityLogSum[circularIndex]);
}
Console.WriteLine(WordSegmentationTM("thequickbrownfoxjumpsoverthelazydog", maximumDictionaryWordLength).segmentedString);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment