Last active
August 7, 2019 11:45
-
-
Save brianewilkins/6f145b801da0726b77d10e0bc782dfb4 to your computer and use it in GitHub Desktop.
Design document that contains a view that implements the Double Metaphone algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"_id": "_design/doubleMetaphone", | |
"views": { | |
"doubleMetaphone": { | |
"map": "'use strict'\n\n// Match vowels (including `Y`).\nvar vowels = /[AEIOUY]/\n\n// Match few Slavo-Germanic values.\nvar slavoGermanic = /W|K|CZ|WITZ/\n\n// Match few Germanic values.\nvar germanic = /^(VAN |VON |SCH)/\n\n// Match initial values of which the first character should be skipped.\nvar initialExceptions = /^(GN|KN|PN|WR|PS)/\n\n// Match initial Greek-like values of which the `CH` sounds like `K`.\nvar initialGreekCh = /^CH(IA|EM|OR([^E])|YM|ARAC|ARIS)/\n\n// Match Greek-like values of which the `CH` sounds like `K`.\nvar greekCh = /ORCHES|ARCHIT|ORCHID/\n\n// Match values which when following `CH`, transform `CH` to sound like `K`.\nvar chForKh = /[ BFHLMNRVW]/\n\n// Match values which when preceding a vowel and `UGH`, sound like `F`.\nvar gForF = /[CGLRT]/\n\n// Match initial values which sound like either `K` or `J`.\nvar initialGForKj = /Y[\\s\\S]|E[BILPRSY]|I[BELN]/\n\n// Match initial values which sound like either `K` or `J`.\nvar initialAngerException = /^[DMR]ANGER/\n\n// Match values which when following `GY`, do not sound like `K` or `J`.\nvar gForKj = /[EGIR]/\n\n// Match values which when following `J`, do not sound `J`.\nvar jForJException = /[LTKSNMBZ]/\n\n// Match values which might sound like `L`.\nvar alle = /AS|OS/\n\n// Match Germanic values preceding `SH` which sound like `S`.\nvar hForS = /EIM|OEK|OLM|OLZ/\n\n// Match Dutch values following `SCH` which sound like either `X` and `SK`,\n// or `SK`.\nvar dutchSch = /E[DMNR]|UY|OO/\n\n// Get the phonetics according to the Double Metaphone algorithm from a value.\n// eslint-disable-next-line complexity\nfunction doubleMetaphone(value) {\n var primary = ''\n var secondary = ''\n var index = 0\n var length = value.length\n var last = length - 1\n var isSlavoGermanic\n var isGermanic\n var subvalue\n var next\n var prev\n var nextnext\n var characters\n\n value = String(value).toUpperCase() + ' '\n isSlavoGermanic = slavoGermanic.test(value)\n isGermanic = germanic.test(value)\n characters = value.split('')\n\n // Skip this at beginning of word.\n if (initialExceptions.test(value)) {\n index++\n }\n\n // Initial X is pronounced Z, which maps to S. Such as `Xavier`.\n if (characters[0] === 'X') {\n primary += 'S'\n secondary += 'S'\n index++\n }\n\n while (index < length) {\n prev = characters[index - 1]\n next = characters[index + 1]\n nextnext = characters[index + 2]\n\n switch (characters[index]) {\n case 'A':\n case 'E':\n case 'I':\n case 'O':\n case 'U':\n case 'Y':\n case 'À':\n case 'Ê':\n case 'É':\n if (index === 0) {\n // All initial vowels now map to `A`.\n primary += 'A'\n secondary += 'A'\n }\n\n index++\n\n break\n case 'B':\n primary += 'P'\n secondary += 'P'\n\n if (next === 'B') {\n index++\n }\n\n index++\n\n break\n case 'Ç':\n primary += 'S'\n secondary += 'S'\n index++\n\n break\n case 'C':\n // Various Germanic:\n if (\n prev === 'A' &&\n next === 'H' &&\n nextnext !== 'I' &&\n !vowels.test(characters[index - 2]) &&\n (nextnext !== 'E' ||\n (subvalue =\n value.slice(index - 2, index + 4) &&\n (subvalue === 'BACHER' || subvalue === 'MACHER')))\n ) {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Special case for `Caesar`.\n if (index === 0 && value.slice(index + 1, index + 6) === 'AESAR') {\n primary += 'S'\n secondary += 'S'\n index += 2\n\n break\n }\n\n // Italian `Chianti`.\n if (value.slice(index + 1, index + 4) === 'HIA') {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n if (next === 'H') {\n // Find `Michael`.\n if (index > 0 && nextnext === 'A' && characters[index + 3] === 'E') {\n primary += 'K'\n secondary += 'X'\n index += 2\n\n break\n }\n\n // Greek roots such as `chemistry`, `chorus`.\n if (index === 0 && initialGreekCh.test(value)) {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Germanic, Greek, or otherwise `CH` for `KH` sound.\n if (\n isGermanic ||\n // Such as 'architect' but not 'arch', orchestra', 'orchid'.\n greekCh.test(value.slice(index - 2, index + 4)) ||\n (nextnext === 'T' || nextnext === 'S') ||\n ((index === 0 ||\n prev === 'A' ||\n prev === 'E' ||\n prev === 'O' ||\n prev === 'U') &&\n // Such as `wachtler`, `weschsler`, but not `tichner`.\n chForKh.test(nextnext))\n ) {\n primary += 'K'\n secondary += 'K'\n } else if (index === 0) {\n primary += 'X'\n secondary += 'X'\n // Such as 'McHugh'.\n } else if (value.slice(0, 2) === 'MC') {\n // Bug? Why matching absolute? what about McHiccup?\n primary += 'K'\n secondary += 'K'\n } else {\n primary += 'X'\n secondary += 'K'\n }\n\n index += 2\n\n break\n }\n\n // Such as `Czerny`.\n if (next === 'Z' && value.slice(index - 2, index) !== 'WI') {\n primary += 'S'\n secondary += 'X'\n index += 2\n\n break\n }\n\n // Such as `Focaccia`.\n if (value.slice(index + 1, index + 4) === 'CIA') {\n primary += 'X'\n secondary += 'X'\n index += 3\n\n break\n }\n\n // Double `C`, but not `McClellan`.\n if (next === 'C' && !(index === 1 && characters[0] === 'M')) {\n // Such as `Bellocchio`, but not `Bacchus`.\n if (\n (nextnext === 'I' || nextnext === 'E' || nextnext === 'H') &&\n value.slice(index + 2, index + 4) !== 'HU'\n ) {\n subvalue = value.slice(index - 1, index + 4)\n\n // Such as `Accident`, `Accede`, `Succeed`.\n if (\n (index === 1 && prev === 'A') ||\n subvalue === 'UCCEE' ||\n subvalue === 'UCCES'\n ) {\n primary += 'KS'\n secondary += 'KS'\n // Such as `Bacci`, `Bertucci`, other Italian.\n } else {\n primary += 'X'\n secondary += 'X'\n }\n\n index += 3\n\n break\n } else {\n // Pierce's rule.\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n }\n\n if (next === 'G' || next === 'K' || next === 'Q') {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Italian.\n if (\n next === 'I' &&\n // Bug: The original algorithm also calls for A (as in CIA), which is\n // already taken care of above.\n (nextnext === 'E' || nextnext === 'O')\n ) {\n primary += 'S'\n secondary += 'X'\n index += 2\n\n break\n }\n\n if (next === 'I' || next === 'E' || next === 'Y') {\n primary += 'S'\n secondary += 'S'\n index += 2\n\n break\n }\n\n primary += 'K'\n secondary += 'K'\n\n // Skip two extra characters ahead in `Mac Caffrey`, `Mac Gregor`.\n if (\n next === ' ' &&\n (nextnext === 'C' || nextnext === 'G' || nextnext === 'Q')\n ) {\n index += 3\n break\n }\n\n // Bug: Already covered above.\n // if (\n // next === 'K' ||\n // next === 'Q' ||\n // (next === 'C' && nextnext !== 'E' && nextnext !== 'I')\n // ) {\n // index++;\n // }\n\n index++\n\n break\n case 'D':\n if (next === 'G') {\n // Such as `edge`.\n if (nextnext === 'E' || nextnext === 'I' || nextnext === 'Y') {\n primary += 'J'\n secondary += 'J'\n index += 3\n // Such as `Edgar`.\n } else {\n primary += 'TK'\n secondary += 'TK'\n index += 2\n }\n\n break\n }\n\n if (next === 'T' || next === 'D') {\n primary += 'T'\n secondary += 'T'\n index += 2\n\n break\n }\n\n primary += 'T'\n secondary += 'T'\n index++\n\n break\n case 'F':\n if (next === 'F') {\n index++\n }\n\n index++\n primary += 'F'\n secondary += 'F'\n\n break\n case 'G':\n if (next === 'H') {\n if (index > 0 && !vowels.test(prev)) {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Such as `Ghislane`, `Ghiradelli`.\n if (index === 0) {\n if (nextnext === 'I') {\n primary += 'J'\n secondary += 'J'\n } else {\n primary += 'K'\n secondary += 'K'\n }\n\n index += 2\n\n break\n }\n\n // Parker's rule (with some further refinements).\n if (\n // Such as `Hugh`. The comma is not a bug.\n ((subvalue = characters[index - 2]),\n subvalue === 'B' || subvalue === 'H' || subvalue === 'D') ||\n // Such as `bough`. The comma is not a bug.\n ((subvalue = characters[index - 3]),\n subvalue === 'B' || subvalue === 'H' || subvalue === 'D') ||\n // Such as `Broughton`. The comma is not a bug.\n ((subvalue = characters[index - 4]),\n subvalue === 'B' || subvalue === 'H')\n ) {\n index += 2\n\n break\n }\n\n // Such as `laugh`, `McLaughlin`, `cough`, `gough`, `rough`, `tough`.\n if (index > 2 && prev === 'U' && gForF.test(characters[index - 3])) {\n primary += 'F'\n secondary += 'F'\n } else if (index > 0 && prev !== 'I') {\n primary += 'K'\n secondary += 'K'\n }\n\n index += 2\n\n break\n }\n\n if (next === 'N') {\n if (index === 1 && vowels.test(characters[0]) && !isSlavoGermanic) {\n primary += 'KN'\n secondary += 'N'\n // Not like `Cagney`.\n } else if (\n value.slice(index + 2, index + 4) !== 'EY' &&\n value.slice(index + 1) !== 'Y' &&\n !isSlavoGermanic\n ) {\n primary += 'N'\n secondary += 'KN'\n } else {\n primary += 'KN'\n secondary += 'KN'\n }\n\n index += 2\n\n break\n }\n\n // Such as `Tagliaro`.\n if (value.slice(index + 1, index + 3) === 'LI' && !isSlavoGermanic) {\n primary += 'KL'\n secondary += 'L'\n index += 2\n\n break\n }\n\n // -ges-, -gep-, -gel- at beginning.\n if (index === 0 && initialGForKj.test(value.slice(1, 3))) {\n primary += 'K'\n secondary += 'J'\n index += 2\n\n break\n }\n\n // -ger-, -gy-.\n if (\n (value.slice(index + 1, index + 3) === 'ER' &&\n prev !== 'I' &&\n prev !== 'E' &&\n !initialAngerException.test(value.slice(0, 6))) ||\n (next === 'Y' && !gForKj.test(prev))\n ) {\n primary += 'K'\n secondary += 'J'\n index += 2\n\n break\n }\n\n // Italian such as `biaggi`.\n if (\n next === 'E' ||\n next === 'I' ||\n next === 'Y' ||\n ((prev === 'A' || prev === 'O') && next === 'G' && nextnext === 'I')\n ) {\n // Obvious Germanic.\n if (value.slice(index + 1, index + 3) === 'ET' || isGermanic) {\n primary += 'K'\n secondary += 'K'\n } else {\n primary += 'J'\n\n // Always soft if French ending.\n if (value.slice(index + 1, index + 5) === 'IER ') {\n secondary += 'J'\n } else {\n secondary += 'K'\n }\n }\n\n index += 2\n\n break\n }\n\n if (next === 'G') {\n index++\n }\n\n index++\n\n primary += 'K'\n secondary += 'K'\n\n break\n case 'H':\n // Only keep if first & before vowel or btw. 2 vowels.\n if (vowels.test(next) && (index === 0 || vowels.test(prev))) {\n primary += 'H'\n secondary += 'H'\n\n index++\n }\n\n index++\n\n break\n case 'J':\n // Obvious Spanish, `jose`, `San Jacinto`.\n if (\n value.slice(index, index + 4) === 'JOSE' ||\n value.slice(0, 4) === 'SAN '\n ) {\n if (\n value.slice(0, 4) === 'SAN ' ||\n (index === 0 && characters[index + 4] === ' ')\n ) {\n primary += 'H'\n secondary += 'H'\n } else {\n primary += 'J'\n secondary += 'H'\n }\n\n index++\n\n break\n }\n\n if (\n index === 0\n // Bug: unreachable (see previous statement).\n // && value.slice(index, index + 4) !== 'JOSE'.\n ) {\n primary += 'J'\n\n // Such as `Yankelovich` or `Jankelowicz`.\n secondary += 'A'\n // Spanish pron. of such as `bajador`.\n } else if (\n !isSlavoGermanic &&\n (next === 'A' || next === 'O') &&\n vowels.test(prev)\n ) {\n primary += 'J'\n secondary += 'H'\n } else if (index === last) {\n primary += 'J'\n } else if (\n prev !== 'S' &&\n prev !== 'K' &&\n prev !== 'L' &&\n !jForJException.test(next)\n ) {\n primary += 'J'\n secondary += 'J'\n // It could happen.\n } else if (next === 'J') {\n index++\n }\n\n index++\n\n break\n case 'K':\n if (next === 'K') {\n index++\n }\n\n primary += 'K'\n secondary += 'K'\n index++\n\n break\n case 'L':\n if (next === 'L') {\n // Spanish such as `cabrillo`, `gallegos`.\n if (\n (index === length - 3 &&\n ((prev === 'A' && nextnext === 'E') ||\n (prev === 'I' && (nextnext === 'O' || nextnext === 'A')))) ||\n (prev === 'A' &&\n nextnext === 'E' &&\n (characters[last] === 'A' ||\n characters[last] === 'O' ||\n alle.test(value.slice(last - 1, length))))\n ) {\n primary += 'L'\n index += 2\n\n break\n }\n\n index++\n }\n\n primary += 'L'\n secondary += 'L'\n index++\n\n break\n case 'M':\n if (\n next === 'M' ||\n // Such as `dumb`, `thumb`.\n (prev === 'U' &&\n next === 'B' &&\n (index + 1 === last || value.slice(index + 2, index + 4) === 'ER'))\n ) {\n index++\n }\n\n index++\n primary += 'M'\n secondary += 'M'\n\n break\n case 'N':\n if (next === 'N') {\n index++\n }\n\n index++\n primary += 'N'\n secondary += 'N'\n\n break\n case 'Ñ':\n index++\n primary += 'N'\n secondary += 'N'\n\n break\n case 'P':\n if (next === 'H') {\n primary += 'F'\n secondary += 'F'\n index += 2\n\n break\n }\n\n // Also account for `campbell` and `raspberry`.\n subvalue = next\n\n if (subvalue === 'P' || subvalue === 'B') {\n index++\n }\n\n index++\n\n primary += 'P'\n secondary += 'P'\n\n break\n case 'Q':\n if (next === 'Q') {\n index++\n }\n\n index++\n primary += 'K'\n secondary += 'K'\n\n break\n case 'R':\n // French such as `Rogier`, but exclude `Hochmeier`.\n if (\n index === last &&\n !isSlavoGermanic &&\n prev === 'E' &&\n characters[index - 2] === 'I' &&\n characters[index - 4] !== 'M' &&\n (characters[index - 3] !== 'E' && characters[index - 3] !== 'A')\n ) {\n secondary += 'R'\n } else {\n primary += 'R'\n secondary += 'R'\n }\n\n if (next === 'R') {\n index++\n }\n\n index++\n\n break\n case 'S':\n // Special cases `island`, `isle`, `carlisle`, `carlysle`.\n if (next === 'L' && (prev === 'I' || prev === 'Y')) {\n index++\n\n break\n }\n\n // Special case `sugar-`.\n if (index === 0 && value.slice(1, 5) === 'UGAR') {\n primary += 'X'\n secondary += 'S'\n index++\n\n break\n }\n\n if (next === 'H') {\n // Germanic.\n if (hForS.test(value.slice(index + 1, index + 5))) {\n primary += 'S'\n secondary += 'S'\n } else {\n primary += 'X'\n secondary += 'X'\n }\n\n index += 2\n break\n }\n\n if (\n next === 'I' &&\n (nextnext === 'O' || nextnext === 'A')\n // Bug: Already covered by previous branch\n // || value.slice(index, index + 4) === 'SIAN'\n ) {\n if (isSlavoGermanic) {\n primary += 'S'\n secondary += 'S'\n } else {\n primary += 'S'\n secondary += 'X'\n }\n\n index += 3\n\n break\n }\n\n // German & Anglicization's, such as `Smith` match `Schmidt`, `snider`\n // match `Schneider`. Also, -sz- in slavic language although in\n // hungarian it is pronounced `s`.\n if (\n next === 'Z' ||\n (index === 0 &&\n (next === 'L' || next === 'M' || next === 'N' || next === 'W'))\n ) {\n primary += 'S'\n secondary += 'X'\n\n if (next === 'Z') {\n index++\n }\n\n index++\n\n break\n }\n\n if (next === 'C') {\n // Schlesinger's rule.\n if (nextnext === 'H') {\n subvalue = value.slice(index + 3, index + 5)\n\n // Dutch origin, such as `school`, `schooner`.\n if (dutchSch.test(subvalue)) {\n // Such as `schermerhorn`, `schenker`.\n if (subvalue === 'ER' || subvalue === 'EN') {\n primary += 'X'\n secondary += 'SK'\n } else {\n primary += 'SK'\n secondary += 'SK'\n }\n\n index += 3\n\n break\n }\n\n if (\n index === 0 &&\n !vowels.test(characters[3]) &&\n characters[3] !== 'W'\n ) {\n primary += 'X'\n secondary += 'S'\n } else {\n primary += 'X'\n secondary += 'X'\n }\n\n index += 3\n\n break\n }\n\n if (nextnext === 'I' || nextnext === 'E' || nextnext === 'Y') {\n primary += 'S'\n secondary += 'S'\n index += 3\n break\n }\n\n primary += 'SK'\n secondary += 'SK'\n index += 3\n\n break\n }\n\n subvalue = value.slice(index - 2, index)\n\n // French such as `resnais`, `artois`.\n if (index === last && (subvalue === 'AI' || subvalue === 'OI')) {\n secondary += 'S'\n } else {\n primary += 'S'\n secondary += 'S'\n }\n\n if (\n next === 'S'\n // Bug: already taken care of by `German & Anglicization's` above:\n // || next === 'Z'\n ) {\n index++\n }\n\n index++\n\n break\n case 'T':\n if (next === 'I' && nextnext === 'O' && characters[index + 3] === 'N') {\n primary += 'X'\n secondary += 'X'\n index += 3\n\n break\n }\n\n subvalue = value.slice(index + 1, index + 3)\n\n if (\n (next === 'I' && nextnext === 'A') ||\n (next === 'C' && nextnext === 'H')\n ) {\n primary += 'X'\n secondary += 'X'\n index += 3\n\n break\n }\n\n if (next === 'H' || (next === 'T' && nextnext === 'H')) {\n // Special case `Thomas`, `Thames` or Germanic.\n if (\n isGermanic ||\n ((nextnext === 'O' || nextnext === 'A') &&\n characters[index + 3] === 'M')\n ) {\n primary += 'T'\n secondary += 'T'\n } else {\n primary += '0'\n secondary += 'T'\n }\n\n index += 2\n\n break\n }\n\n if (next === 'T' || next === 'D') {\n index++\n }\n\n index++\n primary += 'T'\n secondary += 'T'\n\n break\n case 'V':\n if (next === 'V') {\n index++\n }\n\n primary += 'F'\n secondary += 'F'\n index++\n\n break\n case 'W':\n // Can also be in middle of word (as already taken care of for initial).\n if (next === 'R') {\n primary += 'R'\n secondary += 'R'\n index += 2\n\n break\n }\n\n if (index === 0) {\n // `Wasserman` should match `Vasserman`.\n if (vowels.test(next)) {\n primary += 'A'\n secondary += 'F'\n } else if (next === 'H') {\n // Need `Uomo` to match `Womo`.\n primary += 'A'\n secondary += 'A'\n }\n }\n\n // `Arnow` should match `Arnoff`.\n if (\n ((prev === 'E' || prev === 'O') &&\n next === 'S' &&\n nextnext === 'K' &&\n (characters[index + 3] === 'I' || characters[index + 3] === 'Y')) ||\n // Maybe a bug? Shouldn't this be general Germanic?\n value.slice(0, 3) === 'SCH' ||\n (index === last && vowels.test(prev))\n ) {\n secondary += 'F'\n index++\n\n break\n }\n\n // Polish such as `Filipowicz`.\n if (\n next === 'I' &&\n (nextnext === 'C' || nextnext === 'T') &&\n characters[index + 3] === 'Z'\n ) {\n primary += 'TS'\n secondary += 'FX'\n index += 4\n\n break\n }\n\n index++\n\n break\n case 'X':\n // French such as `breaux`.\n if (\n !(\n index === last &&\n // Bug: IAU and EAU also match by AU\n // (/IAU|EAU/.test(value.slice(index - 3, index))) ||\n (prev === 'U' &&\n (characters[index - 2] === 'A' || characters[index - 2] === 'O'))\n )\n ) {\n primary += 'KS'\n secondary += 'KS'\n }\n\n if (next === 'C' || next === 'X') {\n index++\n }\n\n index++\n\n break\n case 'Z':\n // Chinese pinyin such as `Zhao`.\n if (next === 'H') {\n primary += 'J'\n secondary += 'J'\n index += 2\n\n break\n } else if (\n (next === 'Z' &&\n (nextnext === 'A' || nextnext === 'I' || nextnext === 'O')) ||\n (isSlavoGermanic && index > 0 && prev !== 'T')\n ) {\n primary += 'S'\n secondary += 'TS'\n } else {\n primary += 'S'\n secondary += 'S'\n }\n\n if (next === 'Z') {\n index++\n }\n\n index++\n\n break\n default:\n index++\n }\n }\n\n return [primary, secondary]\n}\n\nfunction (doc) {\n emit(doubleMetaphone(doc.name)[0], 1); \n emit(doubleMetaphone(doc.name)[1], 1);\n}" | |
} | |
}, | |
"language": "javascript", | |
"indexes": { | |
"doubleMetaphone": { | |
"analyzer": "keyword", | |
"index": "'use strict'\n\n// Match vowels (including `Y`).\nvar vowels = /[AEIOUY]/\n\n// Match few Slavo-Germanic values.\nvar slavoGermanic = /W|K|CZ|WITZ/\n\n// Match few Germanic values.\nvar germanic = /^(VAN |VON |SCH)/\n\n// Match initial values of which the first character should be skipped.\nvar initialExceptions = /^(GN|KN|PN|WR|PS)/\n\n// Match initial Greek-like values of which the `CH` sounds like `K`.\nvar initialGreekCh = /^CH(IA|EM|OR([^E])|YM|ARAC|ARIS)/\n\n// Match Greek-like values of which the `CH` sounds like `K`.\nvar greekCh = /ORCHES|ARCHIT|ORCHID/\n\n// Match values which when following `CH`, transform `CH` to sound like `K`.\nvar chForKh = /[ BFHLMNRVW]/\n\n// Match values which when preceding a vowel and `UGH`, sound like `F`.\nvar gForF = /[CGLRT]/\n\n// Match initial values which sound like either `K` or `J`.\nvar initialGForKj = /Y[\\s\\S]|E[BILPRSY]|I[BELN]/\n\n// Match initial values which sound like either `K` or `J`.\nvar initialAngerException = /^[DMR]ANGER/\n\n// Match values which when following `GY`, do not sound like `K` or `J`.\nvar gForKj = /[EGIR]/\n\n// Match values which when following `J`, do not sound `J`.\nvar jForJException = /[LTKSNMBZ]/\n\n// Match values which might sound like `L`.\nvar alle = /AS|OS/\n\n// Match Germanic values preceding `SH` which sound like `S`.\nvar hForS = /EIM|OEK|OLM|OLZ/\n\n// Match Dutch values following `SCH` which sound like either `X` and `SK`,\n// or `SK`.\nvar dutchSch = /E[DMNR]|UY|OO/\n\n// Get the phonetics according to the Double Metaphone algorithm from a value.\n// eslint-disable-next-line complexity\nfunction doubleMetaphone(value) {\n var primary = ''\n var secondary = ''\n var index = 0\n var length = value.length\n var last = length - 1\n var isSlavoGermanic\n var isGermanic\n var subvalue\n var next\n var prev\n var nextnext\n var characters\n\n value = String(value).toUpperCase() + ' '\n isSlavoGermanic = slavoGermanic.test(value)\n isGermanic = germanic.test(value)\n characters = value.split('')\n\n // Skip this at beginning of word.\n if (initialExceptions.test(value)) {\n index++\n }\n\n // Initial X is pronounced Z, which maps to S. Such as `Xavier`.\n if (characters[0] === 'X') {\n primary += 'S'\n secondary += 'S'\n index++\n }\n\n while (index < length) {\n prev = characters[index - 1]\n next = characters[index + 1]\n nextnext = characters[index + 2]\n\n switch (characters[index]) {\n case 'A':\n case 'E':\n case 'I':\n case 'O':\n case 'U':\n case 'Y':\n case 'À':\n case 'Ê':\n case 'É':\n if (index === 0) {\n // All initial vowels now map to `A`.\n primary += 'A'\n secondary += 'A'\n }\n\n index++\n\n break\n case 'B':\n primary += 'P'\n secondary += 'P'\n\n if (next === 'B') {\n index++\n }\n\n index++\n\n break\n case 'Ç':\n primary += 'S'\n secondary += 'S'\n index++\n\n break\n case 'C':\n // Various Germanic:\n if (\n prev === 'A' &&\n next === 'H' &&\n nextnext !== 'I' &&\n !vowels.test(characters[index - 2]) &&\n (nextnext !== 'E' ||\n (subvalue =\n value.slice(index - 2, index + 4) &&\n (subvalue === 'BACHER' || subvalue === 'MACHER')))\n ) {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Special case for `Caesar`.\n if (index === 0 && value.slice(index + 1, index + 6) === 'AESAR') {\n primary += 'S'\n secondary += 'S'\n index += 2\n\n break\n }\n\n // Italian `Chianti`.\n if (value.slice(index + 1, index + 4) === 'HIA') {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n if (next === 'H') {\n // Find `Michael`.\n if (index > 0 && nextnext === 'A' && characters[index + 3] === 'E') {\n primary += 'K'\n secondary += 'X'\n index += 2\n\n break\n }\n\n // Greek roots such as `chemistry`, `chorus`.\n if (index === 0 && initialGreekCh.test(value)) {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Germanic, Greek, or otherwise `CH` for `KH` sound.\n if (\n isGermanic ||\n // Such as 'architect' but not 'arch', orchestra', 'orchid'.\n greekCh.test(value.slice(index - 2, index + 4)) ||\n (nextnext === 'T' || nextnext === 'S') ||\n ((index === 0 ||\n prev === 'A' ||\n prev === 'E' ||\n prev === 'O' ||\n prev === 'U') &&\n // Such as `wachtler`, `weschsler`, but not `tichner`.\n chForKh.test(nextnext))\n ) {\n primary += 'K'\n secondary += 'K'\n } else if (index === 0) {\n primary += 'X'\n secondary += 'X'\n // Such as 'McHugh'.\n } else if (value.slice(0, 2) === 'MC') {\n // Bug? Why matching absolute? what about McHiccup?\n primary += 'K'\n secondary += 'K'\n } else {\n primary += 'X'\n secondary += 'K'\n }\n\n index += 2\n\n break\n }\n\n // Such as `Czerny`.\n if (next === 'Z' && value.slice(index - 2, index) !== 'WI') {\n primary += 'S'\n secondary += 'X'\n index += 2\n\n break\n }\n\n // Such as `Focaccia`.\n if (value.slice(index + 1, index + 4) === 'CIA') {\n primary += 'X'\n secondary += 'X'\n index += 3\n\n break\n }\n\n // Double `C`, but not `McClellan`.\n if (next === 'C' && !(index === 1 && characters[0] === 'M')) {\n // Such as `Bellocchio`, but not `Bacchus`.\n if (\n (nextnext === 'I' || nextnext === 'E' || nextnext === 'H') &&\n value.slice(index + 2, index + 4) !== 'HU'\n ) {\n subvalue = value.slice(index - 1, index + 4)\n\n // Such as `Accident`, `Accede`, `Succeed`.\n if (\n (index === 1 && prev === 'A') ||\n subvalue === 'UCCEE' ||\n subvalue === 'UCCES'\n ) {\n primary += 'KS'\n secondary += 'KS'\n // Such as `Bacci`, `Bertucci`, other Italian.\n } else {\n primary += 'X'\n secondary += 'X'\n }\n\n index += 3\n\n break\n } else {\n // Pierce's rule.\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n }\n\n if (next === 'G' || next === 'K' || next === 'Q') {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Italian.\n if (\n next === 'I' &&\n // Bug: The original algorithm also calls for A (as in CIA), which is\n // already taken care of above.\n (nextnext === 'E' || nextnext === 'O')\n ) {\n primary += 'S'\n secondary += 'X'\n index += 2\n\n break\n }\n\n if (next === 'I' || next === 'E' || next === 'Y') {\n primary += 'S'\n secondary += 'S'\n index += 2\n\n break\n }\n\n primary += 'K'\n secondary += 'K'\n\n // Skip two extra characters ahead in `Mac Caffrey`, `Mac Gregor`.\n if (\n next === ' ' &&\n (nextnext === 'C' || nextnext === 'G' || nextnext === 'Q')\n ) {\n index += 3\n break\n }\n\n // Bug: Already covered above.\n // if (\n // next === 'K' ||\n // next === 'Q' ||\n // (next === 'C' && nextnext !== 'E' && nextnext !== 'I')\n // ) {\n // index++;\n // }\n\n index++\n\n break\n case 'D':\n if (next === 'G') {\n // Such as `edge`.\n if (nextnext === 'E' || nextnext === 'I' || nextnext === 'Y') {\n primary += 'J'\n secondary += 'J'\n index += 3\n // Such as `Edgar`.\n } else {\n primary += 'TK'\n secondary += 'TK'\n index += 2\n }\n\n break\n }\n\n if (next === 'T' || next === 'D') {\n primary += 'T'\n secondary += 'T'\n index += 2\n\n break\n }\n\n primary += 'T'\n secondary += 'T'\n index++\n\n break\n case 'F':\n if (next === 'F') {\n index++\n }\n\n index++\n primary += 'F'\n secondary += 'F'\n\n break\n case 'G':\n if (next === 'H') {\n if (index > 0 && !vowels.test(prev)) {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Such as `Ghislane`, `Ghiradelli`.\n if (index === 0) {\n if (nextnext === 'I') {\n primary += 'J'\n secondary += 'J'\n } else {\n primary += 'K'\n secondary += 'K'\n }\n\n index += 2\n\n break\n }\n\n // Parker's rule (with some further refinements).\n if (\n // Such as `Hugh`. The comma is not a bug.\n ((subvalue = characters[index - 2]),\n subvalue === 'B' || subvalue === 'H' || subvalue === 'D') ||\n // Such as `bough`. The comma is not a bug.\n ((subvalue = characters[index - 3]),\n subvalue === 'B' || subvalue === 'H' || subvalue === 'D') ||\n // Such as `Broughton`. The comma is not a bug.\n ((subvalue = characters[index - 4]),\n subvalue === 'B' || subvalue === 'H')\n ) {\n index += 2\n\n break\n }\n\n // Such as `laugh`, `McLaughlin`, `cough`, `gough`, `rough`, `tough`.\n if (index > 2 && prev === 'U' && gForF.test(characters[index - 3])) {\n primary += 'F'\n secondary += 'F'\n } else if (index > 0 && prev !== 'I') {\n primary += 'K'\n secondary += 'K'\n }\n\n index += 2\n\n break\n }\n\n if (next === 'N') {\n if (index === 1 && vowels.test(characters[0]) && !isSlavoGermanic) {\n primary += 'KN'\n secondary += 'N'\n // Not like `Cagney`.\n } else if (\n value.slice(index + 2, index + 4) !== 'EY' &&\n value.slice(index + 1) !== 'Y' &&\n !isSlavoGermanic\n ) {\n primary += 'N'\n secondary += 'KN'\n } else {\n primary += 'KN'\n secondary += 'KN'\n }\n\n index += 2\n\n break\n }\n\n // Such as `Tagliaro`.\n if (value.slice(index + 1, index + 3) === 'LI' && !isSlavoGermanic) {\n primary += 'KL'\n secondary += 'L'\n index += 2\n\n break\n }\n\n // -ges-, -gep-, -gel- at beginning.\n if (index === 0 && initialGForKj.test(value.slice(1, 3))) {\n primary += 'K'\n secondary += 'J'\n index += 2\n\n break\n }\n\n // -ger-, -gy-.\n if (\n (value.slice(index + 1, index + 3) === 'ER' &&\n prev !== 'I' &&\n prev !== 'E' &&\n !initialAngerException.test(value.slice(0, 6))) ||\n (next === 'Y' && !gForKj.test(prev))\n ) {\n primary += 'K'\n secondary += 'J'\n index += 2\n\n break\n }\n\n // Italian such as `biaggi`.\n if (\n next === 'E' ||\n next === 'I' ||\n next === 'Y' ||\n ((prev === 'A' || prev === 'O') && next === 'G' && nextnext === 'I')\n ) {\n // Obvious Germanic.\n if (value.slice(index + 1, index + 3) === 'ET' || isGermanic) {\n primary += 'K'\n secondary += 'K'\n } else {\n primary += 'J'\n\n // Always soft if French ending.\n if (value.slice(index + 1, index + 5) === 'IER ') {\n secondary += 'J'\n } else {\n secondary += 'K'\n }\n }\n\n index += 2\n\n break\n }\n\n if (next === 'G') {\n index++\n }\n\n index++\n\n primary += 'K'\n secondary += 'K'\n\n break\n case 'H':\n // Only keep if first & before vowel or btw. 2 vowels.\n if (vowels.test(next) && (index === 0 || vowels.test(prev))) {\n primary += 'H'\n secondary += 'H'\n\n index++\n }\n\n index++\n\n break\n case 'J':\n // Obvious Spanish, `jose`, `San Jacinto`.\n if (\n value.slice(index, index + 4) === 'JOSE' ||\n value.slice(0, 4) === 'SAN '\n ) {\n if (\n value.slice(0, 4) === 'SAN ' ||\n (index === 0 && characters[index + 4] === ' ')\n ) {\n primary += 'H'\n secondary += 'H'\n } else {\n primary += 'J'\n secondary += 'H'\n }\n\n index++\n\n break\n }\n\n if (\n index === 0\n // Bug: unreachable (see previous statement).\n // && value.slice(index, index + 4) !== 'JOSE'.\n ) {\n primary += 'J'\n\n // Such as `Yankelovich` or `Jankelowicz`.\n secondary += 'A'\n // Spanish pron. of such as `bajador`.\n } else if (\n !isSlavoGermanic &&\n (next === 'A' || next === 'O') &&\n vowels.test(prev)\n ) {\n primary += 'J'\n secondary += 'H'\n } else if (index === last) {\n primary += 'J'\n } else if (\n prev !== 'S' &&\n prev !== 'K' &&\n prev !== 'L' &&\n !jForJException.test(next)\n ) {\n primary += 'J'\n secondary += 'J'\n // It could happen.\n } else if (next === 'J') {\n index++\n }\n\n index++\n\n break\n case 'K':\n if (next === 'K') {\n index++\n }\n\n primary += 'K'\n secondary += 'K'\n index++\n\n break\n case 'L':\n if (next === 'L') {\n // Spanish such as `cabrillo`, `gallegos`.\n if (\n (index === length - 3 &&\n ((prev === 'A' && nextnext === 'E') ||\n (prev === 'I' && (nextnext === 'O' || nextnext === 'A')))) ||\n (prev === 'A' &&\n nextnext === 'E' &&\n (characters[last] === 'A' ||\n characters[last] === 'O' ||\n alle.test(value.slice(last - 1, length))))\n ) {\n primary += 'L'\n index += 2\n\n break\n }\n\n index++\n }\n\n primary += 'L'\n secondary += 'L'\n index++\n\n break\n case 'M':\n if (\n next === 'M' ||\n // Such as `dumb`, `thumb`.\n (prev === 'U' &&\n next === 'B' &&\n (index + 1 === last || value.slice(index + 2, index + 4) === 'ER'))\n ) {\n index++\n }\n\n index++\n primary += 'M'\n secondary += 'M'\n\n break\n case 'N':\n if (next === 'N') {\n index++\n }\n\n index++\n primary += 'N'\n secondary += 'N'\n\n break\n case 'Ñ':\n index++\n primary += 'N'\n secondary += 'N'\n\n break\n case 'P':\n if (next === 'H') {\n primary += 'F'\n secondary += 'F'\n index += 2\n\n break\n }\n\n // Also account for `campbell` and `raspberry`.\n subvalue = next\n\n if (subvalue === 'P' || subvalue === 'B') {\n index++\n }\n\n index++\n\n primary += 'P'\n secondary += 'P'\n\n break\n case 'Q':\n if (next === 'Q') {\n index++\n }\n\n index++\n primary += 'K'\n secondary += 'K'\n\n break\n case 'R':\n // French such as `Rogier`, but exclude `Hochmeier`.\n if (\n index === last &&\n !isSlavoGermanic &&\n prev === 'E' &&\n characters[index - 2] === 'I' &&\n characters[index - 4] !== 'M' &&\n (characters[index - 3] !== 'E' && characters[index - 3] !== 'A')\n ) {\n secondary += 'R'\n } else {\n primary += 'R'\n secondary += 'R'\n }\n\n if (next === 'R') {\n index++\n }\n\n index++\n\n break\n case 'S':\n // Special cases `island`, `isle`, `carlisle`, `carlysle`.\n if (next === 'L' && (prev === 'I' || prev === 'Y')) {\n index++\n\n break\n }\n\n // Special case `sugar-`.\n if (index === 0 && value.slice(1, 5) === 'UGAR') {\n primary += 'X'\n secondary += 'S'\n index++\n\n break\n }\n\n if (next === 'H') {\n // Germanic.\n if (hForS.test(value.slice(index + 1, index + 5))) {\n primary += 'S'\n secondary += 'S'\n } else {\n primary += 'X'\n secondary += 'X'\n }\n\n index += 2\n break\n }\n\n if (\n next === 'I' &&\n (nextnext === 'O' || nextnext === 'A')\n // Bug: Already covered by previous branch\n // || value.slice(index, index + 4) === 'SIAN'\n ) {\n if (isSlavoGermanic) {\n primary += 'S'\n secondary += 'S'\n } else {\n primary += 'S'\n secondary += 'X'\n }\n\n index += 3\n\n break\n }\n\n // German & Anglicization's, such as `Smith` match `Schmidt`, `snider`\n // match `Schneider`. Also, -sz- in slavic language although in\n // hungarian it is pronounced `s`.\n if (\n next === 'Z' ||\n (index === 0 &&\n (next === 'L' || next === 'M' || next === 'N' || next === 'W'))\n ) {\n primary += 'S'\n secondary += 'X'\n\n if (next === 'Z') {\n index++\n }\n\n index++\n\n break\n }\n\n if (next === 'C') {\n // Schlesinger's rule.\n if (nextnext === 'H') {\n subvalue = value.slice(index + 3, index + 5)\n\n // Dutch origin, such as `school`, `schooner`.\n if (dutchSch.test(subvalue)) {\n // Such as `schermerhorn`, `schenker`.\n if (subvalue === 'ER' || subvalue === 'EN') {\n primary += 'X'\n secondary += 'SK'\n } else {\n primary += 'SK'\n secondary += 'SK'\n }\n\n index += 3\n\n break\n }\n\n if (\n index === 0 &&\n !vowels.test(characters[3]) &&\n characters[3] !== 'W'\n ) {\n primary += 'X'\n secondary += 'S'\n } else {\n primary += 'X'\n secondary += 'X'\n }\n\n index += 3\n\n break\n }\n\n if (nextnext === 'I' || nextnext === 'E' || nextnext === 'Y') {\n primary += 'S'\n secondary += 'S'\n index += 3\n break\n }\n\n primary += 'SK'\n secondary += 'SK'\n index += 3\n\n break\n }\n\n subvalue = value.slice(index - 2, index)\n\n // French such as `resnais`, `artois`.\n if (index === last && (subvalue === 'AI' || subvalue === 'OI')) {\n secondary += 'S'\n } else {\n primary += 'S'\n secondary += 'S'\n }\n\n if (\n next === 'S'\n // Bug: already taken care of by `German & Anglicization's` above:\n // || next === 'Z'\n ) {\n index++\n }\n\n index++\n\n break\n case 'T':\n if (next === 'I' && nextnext === 'O' && characters[index + 3] === 'N') {\n primary += 'X'\n secondary += 'X'\n index += 3\n\n break\n }\n\n subvalue = value.slice(index + 1, index + 3)\n\n if (\n (next === 'I' && nextnext === 'A') ||\n (next === 'C' && nextnext === 'H')\n ) {\n primary += 'X'\n secondary += 'X'\n index += 3\n\n break\n }\n\n if (next === 'H' || (next === 'T' && nextnext === 'H')) {\n // Special case `Thomas`, `Thames` or Germanic.\n if (\n isGermanic ||\n ((nextnext === 'O' || nextnext === 'A') &&\n characters[index + 3] === 'M')\n ) {\n primary += 'T'\n secondary += 'T'\n } else {\n primary += '0'\n secondary += 'T'\n }\n\n index += 2\n\n break\n }\n\n if (next === 'T' || next === 'D') {\n index++\n }\n\n index++\n primary += 'T'\n secondary += 'T'\n\n break\n case 'V':\n if (next === 'V') {\n index++\n }\n\n primary += 'F'\n secondary += 'F'\n index++\n\n break\n case 'W':\n // Can also be in middle of word (as already taken care of for initial).\n if (next === 'R') {\n primary += 'R'\n secondary += 'R'\n index += 2\n\n break\n }\n\n if (index === 0) {\n // `Wasserman` should match `Vasserman`.\n if (vowels.test(next)) {\n primary += 'A'\n secondary += 'F'\n } else if (next === 'H') {\n // Need `Uomo` to match `Womo`.\n primary += 'A'\n secondary += 'A'\n }\n }\n\n // `Arnow` should match `Arnoff`.\n if (\n ((prev === 'E' || prev === 'O') &&\n next === 'S' &&\n nextnext === 'K' &&\n (characters[index + 3] === 'I' || characters[index + 3] === 'Y')) ||\n // Maybe a bug? Shouldn't this be general Germanic?\n value.slice(0, 3) === 'SCH' ||\n (index === last && vowels.test(prev))\n ) {\n secondary += 'F'\n index++\n\n break\n }\n\n // Polish such as `Filipowicz`.\n if (\n next === 'I' &&\n (nextnext === 'C' || nextnext === 'T') &&\n characters[index + 3] === 'Z'\n ) {\n primary += 'TS'\n secondary += 'FX'\n index += 4\n\n break\n }\n\n index++\n\n break\n case 'X':\n // French such as `breaux`.\n if (\n !(\n index === last &&\n // Bug: IAU and EAU also match by AU\n // (/IAU|EAU/.test(value.slice(index - 3, index))) ||\n (prev === 'U' &&\n (characters[index - 2] === 'A' || characters[index - 2] === 'O'))\n )\n ) {\n primary += 'KS'\n secondary += 'KS'\n }\n\n if (next === 'C' || next === 'X') {\n index++\n }\n\n index++\n\n break\n case 'Z':\n // Chinese pinyin such as `Zhao`.\n if (next === 'H') {\n primary += 'J'\n secondary += 'J'\n index += 2\n\n break\n } else if (\n (next === 'Z' &&\n (nextnext === 'A' || nextnext === 'I' || nextnext === 'O')) ||\n (isSlavoGermanic && index > 0 && prev !== 'T')\n ) {\n primary += 'S'\n secondary += 'TS'\n } else {\n primary += 'S'\n secondary += 'S'\n }\n\n if (next === 'Z') {\n index++\n }\n\n index++\n\n break\n default:\n index++\n }\n }\n\n return [primary, secondary]\n}\n\nfunction (doc) {\n index(\"name\", doc.name);\n index(\"encoding\", doubleMetaphone(doc.name)[0]);\n index(\"encoding\", doubleMetaphone(doc.name)[1]); \n}\n\n" | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment