Last active
April 27, 2018 13:29
-
-
Save wolfgarbe/f2e04bcbb7b3d92c83466e0a9354348d to your computer and use it in GitHub Desktop.
WordSegmentationTM: Fast Word segmentation with a Triangular Matrix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//MIT License: Copyright (c) 2018 Wolf Garbe | |
//https://github.com/wolfgarbe/WordSegmentationTM | |
/// <summary>Find best word segmentation for input string.</summary> | |
/// <param name="input">The string being word segmented.</param> | |
/// <param name="maxSegmentationWordLength">The maximum word length that should be considered.</param> | |
/// <returns>A tuple representing the suggested word segmented text and the sum of logarithmic word occurence probabilities.</returns> | |
public static (string segmentedString, decimal probabilityLogSum) WordSegmentationTM(string input, int maxSegmentationWordLength = 20) | |
{ | |
int arraySize = Math.Min(maxSegmentationWordLength, input.Length); | |
int arrayWidth = ((input.Length - 1) >> 6) + 1; // /64 bit | |
int arrayWidthByte = arrayWidth << 3; //*8 byte | |
//instead of storing the segmented strings, we store only an array of potential space positions: bit set == space (1 bit instead of 1 char) | |
ulong[,] segmentedSpaceBits = new ulong[arraySize, arrayWidth]; | |
decimal[] probabilityLogSum = new decimal[arraySize]; | |
int circularIndex = -1; | |
//A Triangular Matrix of parts is generated (increasing part lengths), organized as Circular Array | |
//with nested loops of dimensions n*m : n=input.length; m=Min(input.length,maximum word length) | |
//outer loop (column): all possible part start positions | |
for (int j = 0; j < input.Length; j++) | |
{ | |
int spaceUlongIndex = (j - 1) >> 6; // /64 bit | |
int arrayCopyByte = Math.Min(((spaceUlongIndex + 1) << 3), arrayWidthByte); // *8 byte | |
//best segmentation for the prefix of length j will be combined with + " " + part1; set space bit in row 0 | |
if (j > 0) segmentedSpaceBits[circularIndex, spaceUlongIndex] |= ((ulong)1 << ((j - 1) & 0x3f)); // %64 bit | |
//inner loop (row): all possible part lengths (from start position): part can't be bigger than longest word in dictionary (other than long unknown word) | |
int imax = Math.Min(input.Length - j, maxSegmentationWordLength); | |
for (int i = 1; i <= imax; i++) | |
{ | |
int destinationIndex = ((i + circularIndex) % arraySize); | |
//Calculate the Naive Bayes probability of a sequence of words (iterative in logarithmic scale) | |
string part1 = input.Substring(j, i); | |
decimal ProbabilityLogPart1 = 0; | |
if (dictionary.TryGetValue(part1, out long wordCount)) ProbabilityLogPart1 = (decimal)Math.Log10((double)wordCount / (double)N); | |
//estimation for unknown words | |
else ProbabilityLogPart1 = (decimal)Math.Log10(10.0 / (N * Math.Pow(10.0, part1.Length))); | |
//set values in first loop | |
if (j == 0) | |
{ | |
probabilityLogSum[destinationIndex] = ProbabilityLogPart1; | |
} | |
//replace values if better probabilityLogSum | |
else if ((i == maxSegmentationWordLength) || (probabilityLogSum[destinationIndex] < probabilityLogSum[circularIndex] + ProbabilityLogPart1)) | |
{ | |
System.Buffer.BlockCopy(segmentedSpaceBits, circularIndex * arrayWidthByte, segmentedSpaceBits, destinationIndex * arrayWidthByte, arrayCopyByte); | |
probabilityLogSum[destinationIndex] = probabilityLogSum[circularIndex] + ProbabilityLogPart1; | |
} | |
} | |
circularIndex++; if (circularIndex == arraySize) circularIndex = 0; | |
} | |
//create segmented string result from input and segmentedSpaceBits | |
StringBuilder resultString = new StringBuilder(input.Length * 2); | |
int last = -1; | |
for (int i = 0; i <= input.Length - 2; i++) if ((segmentedSpaceBits[circularIndex, i >> 6] & ((ulong)1 << (i & 0x3f))) > 0) | |
{ | |
resultString.Append(input, last + 1, i - last); | |
resultString.Append(' '); | |
last = i; | |
} | |
return (resultString.Append(input.Substring(last + 1)).ToString(), probabilityLogSum[circularIndex]); | |
} | |
Console.WriteLine(WordSegmentationTM("thequickbrownfoxjumpsoverthelazydog", maximumDictionaryWordLength).segmentedString); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment