Skip to content

Instantly share code, notes, and snippets.

@brianewilkins
Last active August 7, 2019 11:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brianewilkins/6f145b801da0726b77d10e0bc782dfb4 to your computer and use it in GitHub Desktop.
Save brianewilkins/6f145b801da0726b77d10e0bc782dfb4 to your computer and use it in GitHub Desktop.
Design document that contains a view and an index, both of which implement the Double Metaphone algorithm.
{
"_id": "_design/doubleMetaphone",
"views": {
"doubleMetaphone": {
"map": "function (doc) {\n'use strict'\n\n// A map function has to be a single function expression, so the lookup tables\n// and the `doubleMetaphone` helper are scoped inside it instead of preceding it.\n\n// Match vowels (including `Y`).\nvar vowels = /[AEIOUY]/\n\n// Match few Slavo-Germanic values.\nvar slavoGermanic = /W|K|CZ|WITZ/\n\n// Match few Germanic values.\nvar germanic = /^(VAN |VON |SCH)/\n\n// Match initial values of which the first character should be skipped.\nvar initialExceptions = /^(GN|KN|PN|WR|PS)/\n\n// Match initial Greek-like values of which the `CH` sounds like `K`.\nvar initialGreekCh = /^CH(IA|EM|OR([^E])|YM|ARAC|ARIS)/\n\n// Match Greek-like values of which the `CH` sounds like `K`.\nvar greekCh = /ORCHES|ARCHIT|ORCHID/\n\n// Match values which when following `CH`, transform `CH` to sound like `K`.\nvar chForKh = /[ BFHLMNRVW]/\n\n// Match values which when preceding a vowel and `UGH`, sound like `F`.\nvar gForF = /[CGLRT]/\n\n// Match initial values which sound like either `K` or `J`.\nvar initialGForKj = /Y[\\s\\S]|E[BILPRSY]|I[BELN]/\n\n// Match initial values which sound like either `K` or `J`.\nvar initialAngerException = /^[DMR]ANGER/\n\n// Match values which when following `GY`, do not sound like `K` or `J`.\nvar gForKj = /[EGIR]/\n\n// Match values which when following `J`, do not sound `J`.\nvar jForJException = /[LTKSNMBZ]/\n\n// Match values which might sound like `L`.\nvar alle = /AS|OS/\n\n// Match Germanic values preceding `SH` which sound like `S`.\nvar hForS = /EIM|OEK|OLM|OLZ/\n\n// Match Dutch values following `SCH` which sound like either `X` and `SK`,\n// or `SK`.\nvar dutchSch = /E[DMNR]|UY|OO/\n\n// Get the phonetics according to the Double Metaphone algorithm from a value.\n// `value` is coerced to a string; the result is `[primary, secondary]`.\n// eslint-disable-next-line complexity\nfunction doubleMetaphone(value) {\n var primary = ''\n var secondary = ''\n var index = 0\n var length\n var last\n var isSlavoGermanic\n var isGermanic\n var subvalue\n var next\n var prev\n var nextnext\n var characters\n\n // Coerce and upper-case before measuring, so that `undefined`, `null`, or a\n // number cannot throw on `.length`, and `length` agrees with `characters`.\n value = String(value).toUpperCase()\n length = value.length\n last = length - 1\n\n // Pad the value, so that looking ahead at the end of the word finds a space.\n value += ' '\n isSlavoGermanic = slavoGermanic.test(value)\n isGermanic = germanic.test(value)\n characters = value.split('')\n\n // Skip this at beginning of word.\n if (initialExceptions.test(value)) {\n index++\n }\n\n // Initial X is pronounced Z, which maps to S. Such as `Xavier`.\n if (characters[0] === 'X') {\n primary += 'S'\n secondary += 'S'\n index++\n }\n\n while (index < length) {\n prev = characters[index - 1]\n next = characters[index + 1]\n nextnext = characters[index + 2]\n\n switch (characters[index]) {\n case 'A':\n case 'E':\n case 'I':\n case 'O':\n case 'U':\n case 'Y':\n case 'À':\n case 'Ê':\n case 'É':\n if (index === 0) {\n // All initial vowels now map to `A`.\n primary += 'A'\n secondary += 'A'\n }\n\n index++\n\n break\n case 'B':\n primary += 'P'\n secondary += 'P'\n\n if (next === 'B') {\n index++\n }\n\n index++\n\n break\n case 'Ç':\n primary += 'S'\n secondary += 'S'\n index++\n\n break\n case 'C':\n // Various Germanic:\n if (\n prev === 'A' &&\n next === 'H' &&\n nextnext !== 'I' &&\n !vowels.test(characters[index - 2]) &&\n (nextnext !== 'E' ||\n // The assignment is parenthesised on purpose: `=` binds looser than\n // `&&`, so without the parentheses a stale `subvalue` is compared.\n ((subvalue = value.slice(index - 2, index + 4)) &&\n (subvalue === 'BACHER' || subvalue === 'MACHER')))\n ) {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Special case for `Caesar`.\n if (index === 0 && value.slice(index + 1, index + 6) === 'AESAR') {\n primary += 'S'\n secondary += 'S'\n index += 2\n\n break\n }\n\n // Italian `Chianti`.\n if (value.slice(index + 1, index + 4) === 'HIA') {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n if (next === 'H') {\n // Find `Michael`.\n if (index > 0 && nextnext === 'A' && characters[index + 3] === 'E') {\n primary += 'K'\n secondary += 'X'\n index += 2\n\n break\n }\n\n // Greek roots such as `chemistry`, `chorus`.\n if (index === 0 && initialGreekCh.test(value)) {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Germanic, Greek, or otherwise `CH` for `KH` sound.\n if (\n isGermanic ||\n // Such as 'architect' but not 'arch', orchestra', 'orchid'.\n greekCh.test(value.slice(index - 2, index + 4)) ||\n (nextnext === 'T' || nextnext === 'S') ||\n ((index === 0 ||\n prev === 'A' ||\n prev === 'E' ||\n prev === 'O' ||\n prev === 'U') &&\n // Such as `wachtler`, `weschsler`, but not `tichner`.\n chForKh.test(nextnext))\n ) {\n primary += 'K'\n secondary += 'K'\n } else if (index === 0) {\n primary += 'X'\n secondary += 'X'\n // Such as 'McHugh'.\n } else if (value.slice(0, 2) === 'MC') {\n // Bug? Why matching absolute? what about McHiccup?\n primary += 'K'\n secondary += 'K'\n } else {\n primary += 'X'\n secondary += 'K'\n }\n\n index += 2\n\n break\n }\n\n // Such as `Czerny`.\n if (next === 'Z' && value.slice(index - 2, index) !== 'WI') {\n primary += 'S'\n secondary += 'X'\n index += 2\n\n break\n }\n\n // Such as `Focaccia`.\n if (value.slice(index + 1, index + 4) === 'CIA') {\n primary += 'X'\n secondary += 'X'\n index += 3\n\n break\n }\n\n // Double `C`, but not `McClellan`.\n if (next === 'C' && !(index === 1 && characters[0] === 'M')) {\n // Such as `Bellocchio`, but not `Bacchus`.\n if (\n (nextnext === 'I' || nextnext === 'E' || nextnext === 'H') &&\n value.slice(index + 2, index + 4) !== 'HU'\n ) {\n subvalue = value.slice(index - 1, index + 4)\n\n // Such as `Accident`, `Accede`, `Succeed`.\n if (\n (index === 1 && prev === 'A') ||\n subvalue === 'UCCEE' ||\n subvalue === 'UCCES'\n ) {\n primary += 'KS'\n secondary += 'KS'\n // Such as `Bacci`, `Bertucci`, other Italian.\n } else {\n primary += 'X'\n secondary += 'X'\n }\n\n index += 3\n\n break\n } else {\n // Pierce's rule.\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n }\n\n if (next === 'G' || next === 'K' || next === 'Q') {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Italian.\n if (\n next === 'I' &&\n // Bug: The original algorithm also calls for A (as in CIA), which is\n // already taken care of above.\n (nextnext === 'E' || nextnext === 'O')\n ) {\n primary += 'S'\n secondary += 'X'\n index += 2\n\n break\n }\n\n if (next === 'I' || next === 'E' || next === 'Y') {\n primary += 'S'\n secondary += 'S'\n index += 2\n\n break\n }\n\n primary += 'K'\n secondary += 'K'\n\n // Skip two extra characters ahead in `Mac Caffrey`, `Mac Gregor`.\n if (\n next === ' ' &&\n (nextnext === 'C' || nextnext === 'G' || nextnext === 'Q')\n ) {\n index += 3\n break\n }\n\n // Bug: Already covered above.\n // if (\n // next === 'K' ||\n // next === 'Q' ||\n // (next === 'C' && nextnext !== 'E' && nextnext !== 'I')\n // ) {\n // index++;\n // }\n\n index++\n\n break\n case 'D':\n if (next === 'G') {\n // Such as `edge`.\n if (nextnext === 'E' || nextnext === 'I' || nextnext === 'Y') {\n primary += 'J'\n secondary += 'J'\n index += 3\n // Such as `Edgar`.\n } else {\n primary += 'TK'\n secondary += 'TK'\n index += 2\n }\n\n break\n }\n\n if (next === 'T' || next === 'D') {\n primary += 'T'\n secondary += 'T'\n index += 2\n\n break\n }\n\n primary += 'T'\n secondary += 'T'\n index++\n\n break\n case 'F':\n if (next === 'F') {\n index++\n }\n\n index++\n primary += 'F'\n secondary += 'F'\n\n break\n case 'G':\n if (next === 'H') {\n if (index > 0 && !vowels.test(prev)) {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Such as `Ghislane`, `Ghiradelli`.\n if (index === 0) {\n if (nextnext === 'I') {\n primary += 'J'\n secondary += 'J'\n } else {\n primary += 'K'\n secondary += 'K'\n }\n\n index += 2\n\n break\n }\n\n // Parker's rule (with some further refinements).\n if (\n // Such as `Hugh`. The comma is not a bug.\n ((subvalue = characters[index - 2]),\n subvalue === 'B' || subvalue === 'H' || subvalue === 'D') ||\n // Such as `bough`. The comma is not a bug.\n ((subvalue = characters[index - 3]),\n subvalue === 'B' || subvalue === 'H' || subvalue === 'D') ||\n // Such as `Broughton`. The comma is not a bug.\n ((subvalue = characters[index - 4]),\n subvalue === 'B' || subvalue === 'H')\n ) {\n index += 2\n\n break\n }\n\n // Such as `laugh`, `McLaughlin`, `cough`, `gough`, `rough`, `tough`.\n if (index > 2 && prev === 'U' && gForF.test(characters[index - 3])) {\n primary += 'F'\n secondary += 'F'\n } else if (index > 0 && prev !== 'I') {\n primary += 'K'\n secondary += 'K'\n }\n\n index += 2\n\n break\n }\n\n if (next === 'N') {\n if (index === 1 && vowels.test(characters[0]) && !isSlavoGermanic) {\n primary += 'KN'\n secondary += 'N'\n // Not like `Cagney`.\n } else if (\n value.slice(index + 2, index + 4) !== 'EY' &&\n value.slice(index + 1) !== 'Y' &&\n !isSlavoGermanic\n ) {\n primary += 'N'\n secondary += 'KN'\n } else {\n primary += 'KN'\n secondary += 'KN'\n }\n\n index += 2\n\n break\n }\n\n // Such as `Tagliaro`.\n if (value.slice(index + 1, index + 3) === 'LI' && !isSlavoGermanic) {\n primary += 'KL'\n secondary += 'L'\n index += 2\n\n break\n }\n\n // -ges-, -gep-, -gel- at beginning.\n if (index === 0 && initialGForKj.test(value.slice(1, 3))) {\n primary += 'K'\n secondary += 'J'\n index += 2\n\n break\n }\n\n // -ger-, -gy-.\n if (\n (value.slice(index + 1, index + 3) === 'ER' &&\n prev !== 'I' &&\n prev !== 'E' &&\n !initialAngerException.test(value.slice(0, 6))) ||\n (next === 'Y' && !gForKj.test(prev))\n ) {\n primary += 'K'\n secondary += 'J'\n index += 2\n\n break\n }\n\n // Italian such as `biaggi`.\n if (\n next === 'E' ||\n next === 'I' ||\n next === 'Y' ||\n ((prev === 'A' || prev === 'O') && next === 'G' && nextnext === 'I')\n ) {\n // Obvious Germanic.\n if (value.slice(index + 1, index + 3) === 'ET' || isGermanic) {\n primary += 'K'\n secondary += 'K'\n } else {\n primary += 'J'\n\n // Always soft if French ending.\n if (value.slice(index + 1, index + 5) === 'IER ') {\n secondary += 'J'\n } else {\n secondary += 'K'\n }\n }\n\n index += 2\n\n break\n }\n\n if (next === 'G') {\n index++\n }\n\n index++\n\n primary += 'K'\n secondary += 'K'\n\n break\n case 'H':\n // Only keep if first & before vowel or btw. 2 vowels.\n if (vowels.test(next) && (index === 0 || vowels.test(prev))) {\n primary += 'H'\n secondary += 'H'\n\n index++\n }\n\n index++\n\n break\n case 'J':\n // Obvious Spanish, `jose`, `San Jacinto`.\n if (\n value.slice(index, index + 4) === 'JOSE' ||\n value.slice(0, 4) === 'SAN '\n ) {\n if (\n value.slice(0, 4) === 'SAN ' ||\n (index === 0 && characters[index + 4] === ' ')\n ) {\n primary += 'H'\n secondary += 'H'\n } else {\n primary += 'J'\n secondary += 'H'\n }\n\n index++\n\n break\n }\n\n if (\n index === 0\n // Bug: unreachable (see previous statement).\n // && value.slice(index, index + 4) !== 'JOSE'.\n ) {\n primary += 'J'\n\n // Such as `Yankelovich` or `Jankelowicz`.\n secondary += 'A'\n // Spanish pron. of such as `bajador`.\n } else if (\n !isSlavoGermanic &&\n (next === 'A' || next === 'O') &&\n vowels.test(prev)\n ) {\n primary += 'J'\n secondary += 'H'\n } else if (index === last) {\n primary += 'J'\n } else if (\n prev !== 'S' &&\n prev !== 'K' &&\n prev !== 'L' &&\n !jForJException.test(next)\n ) {\n primary += 'J'\n secondary += 'J'\n // It could happen.\n } else if (next === 'J') {\n index++\n }\n\n index++\n\n break\n case 'K':\n if (next === 'K') {\n index++\n }\n\n primary += 'K'\n secondary += 'K'\n index++\n\n break\n case 'L':\n if (next === 'L') {\n // Spanish such as `cabrillo`, `gallegos`.\n if (\n (index === length - 3 &&\n ((prev === 'A' && nextnext === 'E') ||\n (prev === 'I' && (nextnext === 'O' || nextnext === 'A')))) ||\n (prev === 'A' &&\n nextnext === 'E' &&\n (characters[last] === 'A' ||\n characters[last] === 'O' ||\n alle.test(value.slice(last - 1, length))))\n ) {\n primary += 'L'\n index += 2\n\n break\n }\n\n index++\n }\n\n primary += 'L'\n secondary += 'L'\n index++\n\n break\n case 'M':\n if (\n next === 'M' ||\n // Such as `dumb`, `thumb`.\n (prev === 'U' &&\n next === 'B' &&\n (index + 1 === last || value.slice(index + 2, index + 4) === 'ER'))\n ) {\n index++\n }\n\n index++\n primary += 'M'\n secondary += 'M'\n\n break\n case 'N':\n if (next === 'N') {\n index++\n }\n\n index++\n primary += 'N'\n secondary += 'N'\n\n break\n case 'Ñ':\n index++\n primary += 'N'\n secondary += 'N'\n\n break\n case 'P':\n if (next === 'H') {\n primary += 'F'\n secondary += 'F'\n index += 2\n\n break\n }\n\n // Also account for `campbell` and `raspberry`.\n subvalue = next\n\n if (subvalue === 'P' || subvalue === 'B') {\n index++\n }\n\n index++\n\n primary += 'P'\n secondary += 'P'\n\n break\n case 'Q':\n if (next === 'Q') {\n index++\n }\n\n index++\n primary += 'K'\n secondary += 'K'\n\n break\n case 'R':\n // French such as `Rogier`, but exclude `Hochmeier`.\n if (\n index === last &&\n !isSlavoGermanic &&\n prev === 'E' &&\n characters[index - 2] === 'I' &&\n characters[index - 4] !== 'M' &&\n (characters[index - 3] !== 'E' && characters[index - 3] !== 'A')\n ) {\n secondary += 'R'\n } else {\n primary += 'R'\n secondary += 'R'\n }\n\n if (next === 'R') {\n index++\n }\n\n index++\n\n break\n case 'S':\n // Special cases `island`, `isle`, `carlisle`, `carlysle`.\n if (next === 'L' && (prev === 'I' || prev === 'Y')) {\n index++\n\n break\n }\n\n // Special case `sugar-`.\n if (index === 0 && value.slice(1, 5) === 'UGAR') {\n primary += 'X'\n secondary += 'S'\n index++\n\n break\n }\n\n if (next === 'H') {\n // Germanic.\n if (hForS.test(value.slice(index + 1, index + 5))) {\n primary += 'S'\n secondary += 'S'\n } else {\n primary += 'X'\n secondary += 'X'\n }\n\n index += 2\n break\n }\n\n if (\n next === 'I' &&\n (nextnext === 'O' || nextnext === 'A')\n // Bug: Already covered by previous branch\n // || value.slice(index, index + 4) === 'SIAN'\n ) {\n if (isSlavoGermanic) {\n primary += 'S'\n secondary += 'S'\n } else {\n primary += 'S'\n secondary += 'X'\n }\n\n index += 3\n\n break\n }\n\n // German & Anglicization's, such as `Smith` match `Schmidt`, `snider`\n // match `Schneider`. Also, -sz- in slavic language although in\n // hungarian it is pronounced `s`.\n if (\n next === 'Z' ||\n (index === 0 &&\n (next === 'L' || next === 'M' || next === 'N' || next === 'W'))\n ) {\n primary += 'S'\n secondary += 'X'\n\n if (next === 'Z') {\n index++\n }\n\n index++\n\n break\n }\n\n if (next === 'C') {\n // Schlesinger's rule.\n if (nextnext === 'H') {\n subvalue = value.slice(index + 3, index + 5)\n\n // Dutch origin, such as `school`, `schooner`.\n if (dutchSch.test(subvalue)) {\n // Such as `schermerhorn`, `schenker`.\n if (subvalue === 'ER' || subvalue === 'EN') {\n primary += 'X'\n secondary += 'SK'\n } else {\n primary += 'SK'\n secondary += 'SK'\n }\n\n index += 3\n\n break\n }\n\n if (\n index === 0 &&\n !vowels.test(characters[3]) &&\n characters[3] !== 'W'\n ) {\n primary += 'X'\n secondary += 'S'\n } else {\n primary += 'X'\n secondary += 'X'\n }\n\n index += 3\n\n break\n }\n\n if (nextnext === 'I' || nextnext === 'E' || nextnext === 'Y') {\n primary += 'S'\n secondary += 'S'\n index += 3\n break\n }\n\n primary += 'SK'\n secondary += 'SK'\n index += 3\n\n break\n }\n\n subvalue = value.slice(index - 2, index)\n\n // French such as `resnais`, `artois`.\n if (index === last && (subvalue === 'AI' || subvalue === 'OI')) {\n secondary += 'S'\n } else {\n primary += 'S'\n secondary += 'S'\n }\n\n if (\n next === 'S'\n // Bug: already taken care of by `German & Anglicization's` above:\n // || next === 'Z'\n ) {\n index++\n }\n\n index++\n\n break\n case 'T':\n if (next === 'I' && nextnext === 'O' && characters[index + 3] === 'N') {\n primary += 'X'\n secondary += 'X'\n index += 3\n\n break\n }\n\n subvalue = value.slice(index + 1, index + 3)\n\n if (\n (next === 'I' && nextnext === 'A') ||\n (next === 'C' && nextnext === 'H')\n ) {\n primary += 'X'\n secondary += 'X'\n index += 3\n\n break\n }\n\n if (next === 'H' || (next === 'T' && nextnext === 'H')) {\n // Special case `Thomas`, `Thames` or Germanic.\n if (\n isGermanic ||\n ((nextnext === 'O' || nextnext === 'A') &&\n characters[index + 3] === 'M')\n ) {\n primary += 'T'\n secondary += 'T'\n } else {\n primary += '0'\n secondary += 'T'\n }\n\n index += 2\n\n break\n }\n\n if (next === 'T' || next === 'D') {\n index++\n }\n\n index++\n primary += 'T'\n secondary += 'T'\n\n break\n case 'V':\n if (next === 'V') {\n index++\n }\n\n primary += 'F'\n secondary += 'F'\n index++\n\n break\n case 'W':\n // Can also be in middle of word (as already taken care of for initial).\n if (next === 'R') {\n primary += 'R'\n secondary += 'R'\n index += 2\n\n break\n }\n\n if (index === 0) {\n // `Wasserman` should match `Vasserman`.\n if (vowels.test(next)) {\n primary += 'A'\n secondary += 'F'\n } else if (next === 'H') {\n // Need `Uomo` to match `Womo`.\n primary += 'A'\n secondary += 'A'\n }\n }\n\n // `Arnow` should match `Arnoff`.\n if (\n ((prev === 'E' || prev === 'O') &&\n next === 'S' &&\n nextnext === 'K' &&\n (characters[index + 3] === 'I' || characters[index + 3] === 'Y')) ||\n // Maybe a bug? Shouldn't this be general Germanic?\n value.slice(0, 3) === 'SCH' ||\n (index === last && vowels.test(prev))\n ) {\n secondary += 'F'\n index++\n\n break\n }\n\n // Polish such as `Filipowicz`.\n if (\n next === 'I' &&\n (nextnext === 'C' || nextnext === 'T') &&\n characters[index + 3] === 'Z'\n ) {\n primary += 'TS'\n secondary += 'FX'\n index += 4\n\n break\n }\n\n index++\n\n break\n case 'X':\n // French such as `breaux`.\n if (\n !(\n index === last &&\n // Bug: IAU and EAU also match by AU\n // (/IAU|EAU/.test(value.slice(index - 3, index))) ||\n (prev === 'U' &&\n (characters[index - 2] === 'A' || characters[index - 2] === 'O'))\n )\n ) {\n primary += 'KS'\n secondary += 'KS'\n }\n\n if (next === 'C' || next === 'X') {\n index++\n }\n\n index++\n\n break\n case 'Z':\n // Chinese pinyin such as `Zhao`.\n if (next === 'H') {\n primary += 'J'\n secondary += 'J'\n index += 2\n\n break\n } else if (\n (next === 'Z' &&\n (nextnext === 'A' || nextnext === 'I' || nextnext === 'O')) ||\n (isSlavoGermanic && index > 0 && prev !== 'T')\n ) {\n primary += 'S'\n secondary += 'TS'\n } else {\n primary += 'S'\n secondary += 'S'\n }\n\n if (next === 'Z') {\n index++\n }\n\n index++\n\n break\n default:\n index++\n }\n }\n\n return [primary, secondary]\n}\n\nvar codes\n\n// Skip documents without a `name`: there is nothing to encode.\nif (doc.name === undefined || doc.name === null) {\n return\n}\n\n// Encode once, then emit the primary and the secondary code as keys, so that a\n// lookup by either code finds the document.\ncodes = doubleMetaphone(doc.name)\nemit(codes[0], 1)\nemit(codes[1], 1)\n}"
}
},
"language": "javascript",
"indexes": {
"doubleMetaphone": {
"analyzer": "keyword",
"index": "'use strict'\n\n// Match vowels (including `Y`).\nvar vowels = /[AEIOUY]/\n\n// Match few Slavo-Germanic values.\nvar slavoGermanic = /W|K|CZ|WITZ/\n\n// Match few Germanic values.\nvar germanic = /^(VAN |VON |SCH)/\n\n// Match initial values of which the first character should be skipped.\nvar initialExceptions = /^(GN|KN|PN|WR|PS)/\n\n// Match initial Greek-like values of which the `CH` sounds like `K`.\nvar initialGreekCh = /^CH(IA|EM|OR([^E])|YM|ARAC|ARIS)/\n\n// Match Greek-like values of which the `CH` sounds like `K`.\nvar greekCh = /ORCHES|ARCHIT|ORCHID/\n\n// Match values which when following `CH`, transform `CH` to sound like `K`.\nvar chForKh = /[ BFHLMNRVW]/\n\n// Match values which when preceding a vowel and `UGH`, sound like `F`.\nvar gForF = /[CGLRT]/\n\n// Match initial values which sound like either `K` or `J`.\nvar initialGForKj = /Y[\\s\\S]|E[BILPRSY]|I[BELN]/\n\n// Match initial values which sound like either `K` or `J`.\nvar initialAngerException = /^[DMR]ANGER/\n\n// Match values which when following `GY`, do not sound like `K` or `J`.\nvar gForKj = /[EGIR]/\n\n// Match values which when following `J`, do not sound `J`.\nvar jForJException = /[LTKSNMBZ]/\n\n// Match values which might sound like `L`.\nvar alle = /AS|OS/\n\n// Match Germanic values preceding `SH` which sound like `S`.\nvar hForS = /EIM|OEK|OLM|OLZ/\n\n// Match Dutch values following `SCH` which sound like either `X` and `SK`,\n// or `SK`.\nvar dutchSch = /E[DMNR]|UY|OO/\n\n// Get the phonetics according to the Double Metaphone algorithm from a value.\n// eslint-disable-next-line complexity\nfunction doubleMetaphone(value) {\n var primary = ''\n var secondary = ''\n var index = 0\n var length = value.length\n var last = length - 1\n var isSlavoGermanic\n var isGermanic\n var subvalue\n var next\n var prev\n var nextnext\n var characters\n\n value = String(value).toUpperCase() + ' '\n isSlavoGermanic = slavoGermanic.test(value)\n isGermanic = germanic.test(value)\n 
characters = value.split('')\n\n // Skip this at beginning of word.\n if (initialExceptions.test(value)) {\n index++\n }\n\n // Initial X is pronounced Z, which maps to S. Such as `Xavier`.\n if (characters[0] === 'X') {\n primary += 'S'\n secondary += 'S'\n index++\n }\n\n while (index < length) {\n prev = characters[index - 1]\n next = characters[index + 1]\n nextnext = characters[index + 2]\n\n switch (characters[index]) {\n case 'A':\n case 'E':\n case 'I':\n case 'O':\n case 'U':\n case 'Y':\n case 'À':\n case 'Ê':\n case 'É':\n if (index === 0) {\n // All initial vowels now map to `A`.\n primary += 'A'\n secondary += 'A'\n }\n\n index++\n\n break\n case 'B':\n primary += 'P'\n secondary += 'P'\n\n if (next === 'B') {\n index++\n }\n\n index++\n\n break\n case 'Ç':\n primary += 'S'\n secondary += 'S'\n index++\n\n break\n case 'C':\n // Various Germanic:\n if (\n prev === 'A' &&\n next === 'H' &&\n nextnext !== 'I' &&\n !vowels.test(characters[index - 2]) &&\n (nextnext !== 'E' ||\n (subvalue =\n value.slice(index - 2, index + 4) &&\n (subvalue === 'BACHER' || subvalue === 'MACHER')))\n ) {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Special case for `Caesar`.\n if (index === 0 && value.slice(index + 1, index + 6) === 'AESAR') {\n primary += 'S'\n secondary += 'S'\n index += 2\n\n break\n }\n\n // Italian `Chianti`.\n if (value.slice(index + 1, index + 4) === 'HIA') {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n if (next === 'H') {\n // Find `Michael`.\n if (index > 0 && nextnext === 'A' && characters[index + 3] === 'E') {\n primary += 'K'\n secondary += 'X'\n index += 2\n\n break\n }\n\n // Greek roots such as `chemistry`, `chorus`.\n if (index === 0 && initialGreekCh.test(value)) {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Germanic, Greek, or otherwise `CH` for `KH` sound.\n if (\n isGermanic ||\n // Such as 'architect' but not 'arch', orchestra', 'orchid'.\n 
greekCh.test(value.slice(index - 2, index + 4)) ||\n (nextnext === 'T' || nextnext === 'S') ||\n ((index === 0 ||\n prev === 'A' ||\n prev === 'E' ||\n prev === 'O' ||\n prev === 'U') &&\n // Such as `wachtler`, `weschsler`, but not `tichner`.\n chForKh.test(nextnext))\n ) {\n primary += 'K'\n secondary += 'K'\n } else if (index === 0) {\n primary += 'X'\n secondary += 'X'\n // Such as 'McHugh'.\n } else if (value.slice(0, 2) === 'MC') {\n // Bug? Why matching absolute? what about McHiccup?\n primary += 'K'\n secondary += 'K'\n } else {\n primary += 'X'\n secondary += 'K'\n }\n\n index += 2\n\n break\n }\n\n // Such as `Czerny`.\n if (next === 'Z' && value.slice(index - 2, index) !== 'WI') {\n primary += 'S'\n secondary += 'X'\n index += 2\n\n break\n }\n\n // Such as `Focaccia`.\n if (value.slice(index + 1, index + 4) === 'CIA') {\n primary += 'X'\n secondary += 'X'\n index += 3\n\n break\n }\n\n // Double `C`, but not `McClellan`.\n if (next === 'C' && !(index === 1 && characters[0] === 'M')) {\n // Such as `Bellocchio`, but not `Bacchus`.\n if (\n (nextnext === 'I' || nextnext === 'E' || nextnext === 'H') &&\n value.slice(index + 2, index + 4) !== 'HU'\n ) {\n subvalue = value.slice(index - 1, index + 4)\n\n // Such as `Accident`, `Accede`, `Succeed`.\n if (\n (index === 1 && prev === 'A') ||\n subvalue === 'UCCEE' ||\n subvalue === 'UCCES'\n ) {\n primary += 'KS'\n secondary += 'KS'\n // Such as `Bacci`, `Bertucci`, other Italian.\n } else {\n primary += 'X'\n secondary += 'X'\n }\n\n index += 3\n\n break\n } else {\n // Pierce's rule.\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n }\n\n if (next === 'G' || next === 'K' || next === 'Q') {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Italian.\n if (\n next === 'I' &&\n // Bug: The original algorithm also calls for A (as in CIA), which is\n // already taken care of above.\n (nextnext === 'E' || nextnext === 'O')\n ) {\n primary += 'S'\n secondary += 'X'\n index += 2\n\n 
break\n }\n\n if (next === 'I' || next === 'E' || next === 'Y') {\n primary += 'S'\n secondary += 'S'\n index += 2\n\n break\n }\n\n primary += 'K'\n secondary += 'K'\n\n // Skip two extra characters ahead in `Mac Caffrey`, `Mac Gregor`.\n if (\n next === ' ' &&\n (nextnext === 'C' || nextnext === 'G' || nextnext === 'Q')\n ) {\n index += 3\n break\n }\n\n // Bug: Already covered above.\n // if (\n // next === 'K' ||\n // next === 'Q' ||\n // (next === 'C' && nextnext !== 'E' && nextnext !== 'I')\n // ) {\n // index++;\n // }\n\n index++\n\n break\n case 'D':\n if (next === 'G') {\n // Such as `edge`.\n if (nextnext === 'E' || nextnext === 'I' || nextnext === 'Y') {\n primary += 'J'\n secondary += 'J'\n index += 3\n // Such as `Edgar`.\n } else {\n primary += 'TK'\n secondary += 'TK'\n index += 2\n }\n\n break\n }\n\n if (next === 'T' || next === 'D') {\n primary += 'T'\n secondary += 'T'\n index += 2\n\n break\n }\n\n primary += 'T'\n secondary += 'T'\n index++\n\n break\n case 'F':\n if (next === 'F') {\n index++\n }\n\n index++\n primary += 'F'\n secondary += 'F'\n\n break\n case 'G':\n if (next === 'H') {\n if (index > 0 && !vowels.test(prev)) {\n primary += 'K'\n secondary += 'K'\n index += 2\n\n break\n }\n\n // Such as `Ghislane`, `Ghiradelli`.\n if (index === 0) {\n if (nextnext === 'I') {\n primary += 'J'\n secondary += 'J'\n } else {\n primary += 'K'\n secondary += 'K'\n }\n\n index += 2\n\n break\n }\n\n // Parker's rule (with some further refinements).\n if (\n // Such as `Hugh`. The comma is not a bug.\n ((subvalue = characters[index - 2]),\n subvalue === 'B' || subvalue === 'H' || subvalue === 'D') ||\n // Such as `bough`. The comma is not a bug.\n ((subvalue = characters[index - 3]),\n subvalue === 'B' || subvalue === 'H' || subvalue === 'D') ||\n // Such as `Broughton`. 
The comma is not a bug.\n ((subvalue = characters[index - 4]),\n subvalue === 'B' || subvalue === 'H')\n ) {\n index += 2\n\n break\n }\n\n // Such as `laugh`, `McLaughlin`, `cough`, `gough`, `rough`, `tough`.\n if (index > 2 && prev === 'U' && gForF.test(characters[index - 3])) {\n primary += 'F'\n secondary += 'F'\n } else if (index > 0 && prev !== 'I') {\n primary += 'K'\n secondary += 'K'\n }\n\n index += 2\n\n break\n }\n\n if (next === 'N') {\n if (index === 1 && vowels.test(characters[0]) && !isSlavoGermanic) {\n primary += 'KN'\n secondary += 'N'\n // Not like `Cagney`.\n } else if (\n value.slice(index + 2, index + 4) !== 'EY' &&\n value.slice(index + 1) !== 'Y' &&\n !isSlavoGermanic\n ) {\n primary += 'N'\n secondary += 'KN'\n } else {\n primary += 'KN'\n secondary += 'KN'\n }\n\n index += 2\n\n break\n }\n\n // Such as `Tagliaro`.\n if (value.slice(index + 1, index + 3) === 'LI' && !isSlavoGermanic) {\n primary += 'KL'\n secondary += 'L'\n index += 2\n\n break\n }\n\n // -ges-, -gep-, -gel- at beginning.\n if (index === 0 && initialGForKj.test(value.slice(1, 3))) {\n primary += 'K'\n secondary += 'J'\n index += 2\n\n break\n }\n\n // -ger-, -gy-.\n if (\n (value.slice(index + 1, index + 3) === 'ER' &&\n prev !== 'I' &&\n prev !== 'E' &&\n !initialAngerException.test(value.slice(0, 6))) ||\n (next === 'Y' && !gForKj.test(prev))\n ) {\n primary += 'K'\n secondary += 'J'\n index += 2\n\n break\n }\n\n // Italian such as `biaggi`.\n if (\n next === 'E' ||\n next === 'I' ||\n next === 'Y' ||\n ((prev === 'A' || prev === 'O') && next === 'G' && nextnext === 'I')\n ) {\n // Obvious Germanic.\n if (value.slice(index + 1, index + 3) === 'ET' || isGermanic) {\n primary += 'K'\n secondary += 'K'\n } else {\n primary += 'J'\n\n // Always soft if French ending.\n if (value.slice(index + 1, index + 5) === 'IER ') {\n secondary += 'J'\n } else {\n secondary += 'K'\n }\n }\n\n index += 2\n\n break\n }\n\n if (next === 'G') {\n index++\n }\n\n index++\n\n primary += 
'K'\n secondary += 'K'\n\n break\n case 'H':\n // Only keep if first & before vowel or btw. 2 vowels.\n if (vowels.test(next) && (index === 0 || vowels.test(prev))) {\n primary += 'H'\n secondary += 'H'\n\n index++\n }\n\n index++\n\n break\n case 'J':\n // Obvious Spanish, `jose`, `San Jacinto`.\n if (\n value.slice(index, index + 4) === 'JOSE' ||\n value.slice(0, 4) === 'SAN '\n ) {\n if (\n value.slice(0, 4) === 'SAN ' ||\n (index === 0 && characters[index + 4] === ' ')\n ) {\n primary += 'H'\n secondary += 'H'\n } else {\n primary += 'J'\n secondary += 'H'\n }\n\n index++\n\n break\n }\n\n if (\n index === 0\n // Bug: unreachable (see previous statement).\n // && value.slice(index, index + 4) !== 'JOSE'.\n ) {\n primary += 'J'\n\n // Such as `Yankelovich` or `Jankelowicz`.\n secondary += 'A'\n // Spanish pron. of such as `bajador`.\n } else if (\n !isSlavoGermanic &&\n (next === 'A' || next === 'O') &&\n vowels.test(prev)\n ) {\n primary += 'J'\n secondary += 'H'\n } else if (index === last) {\n primary += 'J'\n } else if (\n prev !== 'S' &&\n prev !== 'K' &&\n prev !== 'L' &&\n !jForJException.test(next)\n ) {\n primary += 'J'\n secondary += 'J'\n // It could happen.\n } else if (next === 'J') {\n index++\n }\n\n index++\n\n break\n case 'K':\n if (next === 'K') {\n index++\n }\n\n primary += 'K'\n secondary += 'K'\n index++\n\n break\n case 'L':\n if (next === 'L') {\n // Spanish such as `cabrillo`, `gallegos`.\n if (\n (index === length - 3 &&\n ((prev === 'A' && nextnext === 'E') ||\n (prev === 'I' && (nextnext === 'O' || nextnext === 'A')))) ||\n (prev === 'A' &&\n nextnext === 'E' &&\n (characters[last] === 'A' ||\n characters[last] === 'O' ||\n alle.test(value.slice(last - 1, length))))\n ) {\n primary += 'L'\n index += 2\n\n break\n }\n\n index++\n }\n\n primary += 'L'\n secondary += 'L'\n index++\n\n break\n case 'M':\n if (\n next === 'M' ||\n // Such as `dumb`, `thumb`.\n (prev === 'U' &&\n next === 'B' &&\n (index + 1 === last || value.slice(index + 
2, index + 4) === 'ER'))\n ) {\n index++\n }\n\n index++\n primary += 'M'\n secondary += 'M'\n\n break\n case 'N':\n if (next === 'N') {\n index++\n }\n\n index++\n primary += 'N'\n secondary += 'N'\n\n break\n case 'Ñ':\n index++\n primary += 'N'\n secondary += 'N'\n\n break\n case 'P':\n if (next === 'H') {\n primary += 'F'\n secondary += 'F'\n index += 2\n\n break\n }\n\n // Also account for `campbell` and `raspberry`.\n subvalue = next\n\n if (subvalue === 'P' || subvalue === 'B') {\n index++\n }\n\n index++\n\n primary += 'P'\n secondary += 'P'\n\n break\n case 'Q':\n if (next === 'Q') {\n index++\n }\n\n index++\n primary += 'K'\n secondary += 'K'\n\n break\n case 'R':\n // French such as `Rogier`, but exclude `Hochmeier`.\n if (\n index === last &&\n !isSlavoGermanic &&\n prev === 'E' &&\n characters[index - 2] === 'I' &&\n characters[index - 4] !== 'M' &&\n (characters[index - 3] !== 'E' && characters[index - 3] !== 'A')\n ) {\n secondary += 'R'\n } else {\n primary += 'R'\n secondary += 'R'\n }\n\n if (next === 'R') {\n index++\n }\n\n index++\n\n break\n case 'S':\n // Special cases `island`, `isle`, `carlisle`, `carlysle`.\n if (next === 'L' && (prev === 'I' || prev === 'Y')) {\n index++\n\n break\n }\n\n // Special case `sugar-`.\n if (index === 0 && value.slice(1, 5) === 'UGAR') {\n primary += 'X'\n secondary += 'S'\n index++\n\n break\n }\n\n if (next === 'H') {\n // Germanic.\n if (hForS.test(value.slice(index + 1, index + 5))) {\n primary += 'S'\n secondary += 'S'\n } else {\n primary += 'X'\n secondary += 'X'\n }\n\n index += 2\n break\n }\n\n if (\n next === 'I' &&\n (nextnext === 'O' || nextnext === 'A')\n // Bug: Already covered by previous branch\n // || value.slice(index, index + 4) === 'SIAN'\n ) {\n if (isSlavoGermanic) {\n primary += 'S'\n secondary += 'S'\n } else {\n primary += 'S'\n secondary += 'X'\n }\n\n index += 3\n\n break\n }\n\n // German & Anglicization's, such as `Smith` match `Schmidt`, `snider`\n // match `Schneider`. 
Also, -sz- in slavic language although in\n // hungarian it is pronounced `s`.\n if (\n next === 'Z' ||\n (index === 0 &&\n (next === 'L' || next === 'M' || next === 'N' || next === 'W'))\n ) {\n primary += 'S'\n secondary += 'X'\n\n if (next === 'Z') {\n index++\n }\n\n index++\n\n break\n }\n\n if (next === 'C') {\n // Schlesinger's rule.\n if (nextnext === 'H') {\n subvalue = value.slice(index + 3, index + 5)\n\n // Dutch origin, such as `school`, `schooner`.\n if (dutchSch.test(subvalue)) {\n // Such as `schermerhorn`, `schenker`.\n if (subvalue === 'ER' || subvalue === 'EN') {\n primary += 'X'\n secondary += 'SK'\n } else {\n primary += 'SK'\n secondary += 'SK'\n }\n\n index += 3\n\n break\n }\n\n if (\n index === 0 &&\n !vowels.test(characters[3]) &&\n characters[3] !== 'W'\n ) {\n primary += 'X'\n secondary += 'S'\n } else {\n primary += 'X'\n secondary += 'X'\n }\n\n index += 3\n\n break\n }\n\n if (nextnext === 'I' || nextnext === 'E' || nextnext === 'Y') {\n primary += 'S'\n secondary += 'S'\n index += 3\n break\n }\n\n primary += 'SK'\n secondary += 'SK'\n index += 3\n\n break\n }\n\n subvalue = value.slice(index - 2, index)\n\n // French such as `resnais`, `artois`.\n if (index === last && (subvalue === 'AI' || subvalue === 'OI')) {\n secondary += 'S'\n } else {\n primary += 'S'\n secondary += 'S'\n }\n\n if (\n next === 'S'\n // Bug: already taken care of by `German & Anglicization's` above:\n // || next === 'Z'\n ) {\n index++\n }\n\n index++\n\n break\n case 'T':\n if (next === 'I' && nextnext === 'O' && characters[index + 3] === 'N') {\n primary += 'X'\n secondary += 'X'\n index += 3\n\n break\n }\n\n subvalue = value.slice(index + 1, index + 3)\n\n if (\n (next === 'I' && nextnext === 'A') ||\n (next === 'C' && nextnext === 'H')\n ) {\n primary += 'X'\n secondary += 'X'\n index += 3\n\n break\n }\n\n if (next === 'H' || (next === 'T' && nextnext === 'H')) {\n // Special case `Thomas`, `Thames` or Germanic.\n if (\n isGermanic ||\n ((nextnext === 
'O' || nextnext === 'A') &&\n characters[index + 3] === 'M')\n ) {\n primary += 'T'\n secondary += 'T'\n } else {\n primary += '0'\n secondary += 'T'\n }\n\n index += 2\n\n break\n }\n\n if (next === 'T' || next === 'D') {\n index++\n }\n\n index++\n primary += 'T'\n secondary += 'T'\n\n break\n case 'V':\n if (next === 'V') {\n index++\n }\n\n primary += 'F'\n secondary += 'F'\n index++\n\n break\n case 'W':\n // Can also be in middle of word (as already taken care of for initial).\n if (next === 'R') {\n primary += 'R'\n secondary += 'R'\n index += 2\n\n break\n }\n\n if (index === 0) {\n // `Wasserman` should match `Vasserman`.\n if (vowels.test(next)) {\n primary += 'A'\n secondary += 'F'\n } else if (next === 'H') {\n // Need `Uomo` to match `Womo`.\n primary += 'A'\n secondary += 'A'\n }\n }\n\n // `Arnow` should match `Arnoff`.\n if (\n ((prev === 'E' || prev === 'O') &&\n next === 'S' &&\n nextnext === 'K' &&\n (characters[index + 3] === 'I' || characters[index + 3] === 'Y')) ||\n // Maybe a bug? 
Shouldn't this be general Germanic?\n value.slice(0, 3) === 'SCH' ||\n (index === last && vowels.test(prev))\n ) {\n secondary += 'F'\n index++\n\n break\n }\n\n // Polish such as `Filipowicz`.\n if (\n next === 'I' &&\n (nextnext === 'C' || nextnext === 'T') &&\n characters[index + 3] === 'Z'\n ) {\n primary += 'TS'\n secondary += 'FX'\n index += 4\n\n break\n }\n\n index++\n\n break\n case 'X':\n // French such as `breaux`.\n if (\n !(\n index === last &&\n // Bug: IAU and EAU also match by AU\n // (/IAU|EAU/.test(value.slice(index - 3, index))) ||\n (prev === 'U' &&\n (characters[index - 2] === 'A' || characters[index - 2] === 'O'))\n )\n ) {\n primary += 'KS'\n secondary += 'KS'\n }\n\n if (next === 'C' || next === 'X') {\n index++\n }\n\n index++\n\n break\n case 'Z':\n // Chinese pinyin such as `Zhao`.\n if (next === 'H') {\n primary += 'J'\n secondary += 'J'\n index += 2\n\n break\n } else if (\n (next === 'Z' &&\n (nextnext === 'A' || nextnext === 'I' || nextnext === 'O')) ||\n (isSlavoGermanic && index > 0 && prev !== 'T')\n ) {\n primary += 'S'\n secondary += 'TS'\n } else {\n primary += 'S'\n secondary += 'S'\n }\n\n if (next === 'Z') {\n index++\n }\n\n index++\n\n break\n default:\n index++\n }\n }\n\n return [primary, secondary]\n}\n\nfunction (doc) {\n index(\"name\", doc.name);\n index(\"encoding\", doubleMetaphone(doc.name)[0]);\n index(\"encoding\", doubleMetaphone(doc.name)[1]); \n}\n\n"
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment