Created
November 15, 2017 01:33
-
-
Save anonymous/86f736b7874711cd42d747540ba70cf1 to your computer and use it in GitHub Desktop.
String Comparison Speed ups
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
NSUInteger mdc_levenshteinDistanceFast(NSString *left, NSString *right) { | |
mdc_normalizeDistanceParameters(&left, &right); | |
//test string & data length. if different, use mdc | |
//or use[left smallestEncoding] == 1/7 | |
NSInteger leftLength = [[left dataUsingEncoding:NSUTF8StringEncoding] length]; | |
NSInteger rightLength = [[right dataUsingEncoding:NSUTF8StringEncoding] length]; | |
if (rightLength == 0) { | |
return leftLength; | |
} | |
char *lBytes = (char *)[[left dataUsingEncoding:NSUTF8StringEncoding] bytes]; | |
char *rBytes = (char *)[[right dataUsingEncoding:NSUTF8StringEncoding] bytes]; | |
MDCDistanceMatrix *matrix = MDCDistanceMatrixCreateWithLengths(leftLength, rightLength); | |
MDCDistanceMatrixWalk(matrix, ^(MDCDistanceMatrix *matrix, NSUInteger x, NSUInteger y){ | |
// Each element in the matrix corresponds to the distance between | |
// the two strings. For example, with "car" and "boat", the matrix | |
// for the first characters of each would look like this: | |
// | |
// 0 c | |
// b 0 | |
// | |
// The distance it would take to turn "c" into "b" via deletion is | |
// the distance it would take to turn "b" into "bc" (1 by insertion), | |
// plus one to delete the extra character. | |
NSUInteger deletion = matrix->distances[y-1][x] + 1; | |
// The distance it would take to turn "c" into "b" via insertion is | |
// the cost of turning "c" into "" (1 by deletion), plus one. | |
NSUInteger insertion = matrix->distances[y][x-1] + 1; | |
// If the two characters are the same, there is no substituion cost, | |
// so the distance is equivalent to the distance between the two previous | |
// strings (the upper-left element in the matrix). | |
NSUInteger substitution = matrix->distances[y-1][x-1]; | |
// But if they're different, the distance is the previous distance plus one. | |
// The matrix is padded with row and column indices, so we must decrement | |
// x and y when accessing characters in the string. | |
// NSLog(@"%c - %c", rightBytes[x-1], leftBytes[y-1]); | |
if (lBytes[y-1] - rBytes[x-1]) { | |
++substitution; | |
} | |
// We're interested in the most efficient edit distance, so use the minimum | |
// of the three calculated costs. | |
matrix->distances[y][x] = MIN(MIN(insertion, deletion), substitution); | |
}); | |
// The corner value is the optimal distance between the two entire strings. | |
NSUInteger distance = MDCDistanceMatrixCornerValue(matrix); | |
MDCDistanceMatrixDestroy(matrix); | |
return distance; | |
} | |
MDCDistanceMatrix *MDCDistanceMatrixCreateWithLengths(NSInteger leftLength, NSInteger rightLength) { | |
MDCDistanceMatrix *matrix = malloc(sizeof(MDCDistanceMatrix)); | |
// Matrix height and width are padded by one | |
// to make room for column and row indices. | |
matrix->height = leftLength + 1; | |
matrix->width = rightLength + 1; | |
// Allocate memory for the matrix and initialize the rows to zero. | |
size_t rowSize = matrix->width * sizeof(NSUInteger); | |
matrix->distances = calloc(matrix->height, rowSize); | |
for (NSUInteger y = 0; y < matrix->height; ++y) { | |
matrix->distances[y] = calloc(1, rowSize); | |
} | |
// Set row and column indices. | |
for (NSUInteger rowIndex = 1; rowIndex < matrix->height; ++rowIndex) { | |
matrix->distances[rowIndex][0] = rowIndex; | |
} | |
for (NSUInteger columnIndex = 1; columnIndex < matrix->width; ++columnIndex) { | |
matrix->distances[0][columnIndex] = columnIndex; | |
} | |
return matrix; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment