Skip to content

Instantly share code, notes, and snippets.

Created November 15, 2017 01:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/86f736b7874711cd42d747540ba70cf1 to your computer and use it in GitHub Desktop.
Save anonymous/86f736b7874711cd42d747540ba70cf1 to your computer and use it in GitHub Desktop.
String Comparison Speed ups
NSUInteger mdc_levenshteinDistanceFast(NSString *left, NSString *right) {
mdc_normalizeDistanceParameters(&left, &right);
//test string & data length. if different, use mdc
//or use[left smallestEncoding] == 1/7
NSInteger leftLength = [[left dataUsingEncoding:NSUTF8StringEncoding] length];
NSInteger rightLength = [[right dataUsingEncoding:NSUTF8StringEncoding] length];
if (rightLength == 0) {
return leftLength;
}
char *lBytes = (char *)[[left dataUsingEncoding:NSUTF8StringEncoding] bytes];
char *rBytes = (char *)[[right dataUsingEncoding:NSUTF8StringEncoding] bytes];
MDCDistanceMatrix *matrix = MDCDistanceMatrixCreateWithLengths(leftLength, rightLength);
MDCDistanceMatrixWalk(matrix, ^(MDCDistanceMatrix *matrix, NSUInteger x, NSUInteger y){
// Each element in the matrix corresponds to the distance between
// the two strings. For example, with "car" and "boat", the matrix
// for the first characters of each would look like this:
//
// 0 c
// b 0
//
// The distance it would take to turn "c" into "b" via deletion is
// the distance it would take to turn "b" into "bc" (1 by insertion),
// plus one to delete the extra character.
NSUInteger deletion = matrix->distances[y-1][x] + 1;
// The distance it would take to turn "c" into "b" via insertion is
// the cost of turning "c" into "" (1 by deletion), plus one.
NSUInteger insertion = matrix->distances[y][x-1] + 1;
// If the two characters are the same, there is no substituion cost,
// so the distance is equivalent to the distance between the two previous
// strings (the upper-left element in the matrix).
NSUInteger substitution = matrix->distances[y-1][x-1];
// But if they're different, the distance is the previous distance plus one.
// The matrix is padded with row and column indices, so we must decrement
// x and y when accessing characters in the string.
// NSLog(@"%c - %c", rightBytes[x-1], leftBytes[y-1]);
if (lBytes[y-1] - rBytes[x-1]) {
++substitution;
}
// We're interested in the most efficient edit distance, so use the minimum
// of the three calculated costs.
matrix->distances[y][x] = MIN(MIN(insertion, deletion), substitution);
});
// The corner value is the optimal distance between the two entire strings.
NSUInteger distance = MDCDistanceMatrixCornerValue(matrix);
MDCDistanceMatrixDestroy(matrix);
return distance;
}
MDCDistanceMatrix *MDCDistanceMatrixCreateWithLengths(NSInteger leftLength, NSInteger rightLength) {
MDCDistanceMatrix *matrix = malloc(sizeof(MDCDistanceMatrix));
// Matrix height and width are padded by one
// to make room for column and row indices.
matrix->height = leftLength + 1;
matrix->width = rightLength + 1;
// Allocate memory for the matrix and initialize the rows to zero.
size_t rowSize = matrix->width * sizeof(NSUInteger);
matrix->distances = calloc(matrix->height, rowSize);
for (NSUInteger y = 0; y < matrix->height; ++y) {
matrix->distances[y] = calloc(1, rowSize);
}
// Set row and column indices.
for (NSUInteger rowIndex = 1; rowIndex < matrix->height; ++rowIndex) {
matrix->distances[rowIndex][0] = rowIndex;
}
for (NSUInteger columnIndex = 1; columnIndex < matrix->width; ++columnIndex) {
matrix->distances[0][columnIndex] = columnIndex;
}
return matrix;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment