Skip to content

Instantly share code, notes, and snippets.

@nyarla
Created February 8, 2017 00:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nyarla/d88917bf19b65143f9fc4107e20ba3df to your computer and use it in GitHub Desktop.
Save nyarla/d88917bf19b65143f9fc4107e20ba3df to your computer and use it in GitHub Desktop.
A reverse port to modern JavaScript of the optimized TinySegmenter implementation written in Julia.
/* TinySegmenter.optmized.js
* =========================
*
* * A reverse port to modern JavaScript of the optimized TinySegmenter implementation written in Julia.
*
* LICENSE
* =======
* (c) 2008 Taku Kudo <taku@chasen.org>
* (c) 2015 Michiaki Ariga a.k.a chezou <chezou@gmail.com>
* (c) 2017 Naoki OKAMURA a.k.a nyarla <nyarla@thotep.net>
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the <ORGANIZATION> nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* */
"use strict";
// Constant term of the scoring model: tokenize() starts every boundary score from this value.
var BIAS = -332;
// Character-class bigram weights: tokenize() adds BC1.get(c2 + c3) to the boundary score.
var BC1 = new Map(Object.entries({
  HH: 6,
  II: 2461,
  KH: 406,
  OH: -1378,
}));
// Character-class bigram weights: tokenize() adds BC2.get(c3 + c4) to the boundary score.
var BC2 = new Map(Object.entries({
  AA: -3267,
  AI: 2744,
  AN: -878,
  HH: -4070,
  HM: -1711,
  HN: 4012,
  HO: 3761,
  IA: 1327,
  IH: -1184,
  II: -1332,
  IK: 1721,
  IO: 5492,
  KI: 3831,
  KK: -8741,
  MH: -3132,
  MK: 3334,
  OO: -2920,
}));
// Character-class bigram weights: tokenize() adds BC3.get(c4 + c5) to the boundary score.
var BC3 = new Map(Object.entries({
  HH: 996,
  HI: 626,
  HK: -721,
  HN: -1307,
  HO: -836,
  IH: -301,
  KK: 2762,
  MK: 1079,
  MM: 4034,
  OA: -1652,
  OH: 266,
}));
// Previous-decision bigram weights: tokenize() adds BP1.get(p1 + p2) to the boundary score,
// where p1..p3 hold the labels ('U', 'O' or 'B') assigned at the three preceding positions.
var BP1 = new Map(Object.entries({
  BB: 295,
  OB: 304,
  OO: -125,
  UB: 352,
}));
// Previous-decision bigram weights: tokenize() adds BP2.get(p2 + p3) to the boundary score.
var BP2 = new Map(Object.entries({
  BO: 60,
  OO: -1762,
}));
// Mixed label/class weights: tokenize() adds BQ1.get(p2 + c2 + c3) to the boundary score.
var BQ1 = new Map(Object.entries({
  BHH: 1150,
  BHM: 1521,
  BII: -1158,
  BIM: 886,
  BMH: 1208,
  BNH: 449,
  BOH: -91,
  BOO: -2597,
  OHI: 451,
  OIH: -296,
  OKA: 1851,
  OKH: -1020,
  OKK: 904,
  OOO: 2965,
}));
// Mixed label/class weights: tokenize() adds BQ2.get(p2 + c3 + c4) to the boundary score.
var BQ2 = new Map(Object.entries({
  BHH: 118,
  BHI: -1159,
  BHM: 466,
  BIH: -919,
  BKK: -1720,
  BKO: 864,
  OHH: -1139,
  OHM: -181,
  OIH: 153,
  UHI: -1146,
}));
// Mixed label/class weights: tokenize() adds BQ3.get(p3 + c2 + c3) to the boundary score.
var BQ3 = new Map(Object.entries({
  BHH: -792,
  BHI: 2664,
  BII: -299,
  BKI: 419,
  BMH: 937,
  BMM: 8335,
  BNN: 998,
  BOH: 775,
  OHH: 2174,
  OHM: 439,
  OII: 280,
  OKH: 1798,
  OKI: -793,
  OKO: -2242,
  OMH: -2402,
  OOO: 11699,
}));
// Mixed label/class weights: tokenize() adds BQ4.get(p3 + c3 + c4) to the boundary score.
var BQ4 = new Map(Object.entries({
  BHH: -3895,
  BIH: 3761,
  BII: -4654,
  BIK: 1348,
  BKK: -1806,
  BMI: -3385,
  BOO: -12396,
  OAH: 926,
  OHH: 266,
  OHK: -2036,
  ONN: -973,
}));
// Character bigram weights: tokenize() adds BW1.get(w2 + w3) to the boundary score.
// Each key is two window slots concatenated; a slot is either one UTF-16 code unit of the
// text or a start marker such as "B1".
// A few keys are listed twice with the same weight (e.g. "B1あ", "」と"); the Map keeps a
// single entry for each, so lookups are unaffected.
var BW1 = new Map([
[",と" , 660 ],
[",同" , 727 ],
["B1あ" , 1404 ],
["B1同" , 542 ],
["、と" , 660 ],
["、同" , 727 ],
["」と" , 1682 ],
["あっ" , 1505 ],
["いう" , 1743 ],
["いっ" , -2055 ],
["いる" , 672 ],
["うし" , -4817 ],
["うん" , 665 ],
["から" , 3472 ],
["がら" , 600 ],
["こう" , -790 ],
["こと" , 2083 ],
["こん" , -1262 ],
["さら" , -4143 ],
["さん" , 4573 ],
["した" , 2641 ],
["して" , 1104 ],
["すで" , -3399 ],
["そこ" , 1977 ],
["それ" , -871 ],
["たち" , 1122 ],
["ため" , 601 ],
["った" , 3463 ],
["つい" , -802 ],
["てい" , 805 ],
["てき" , 1249 ],
["でき" , 1127 ],
["です" , 3445 ],
["では" , 844 ],
["とい" , -4915 ],
["とみ" , 1922 ],
["どこ" , 3887 ],
["ない" , 5713 ],
["なっ" , 3015 ],
["など" , 7379 ],
["なん" , -1113 ],
["にし" , 2468 ],
["には" , 1498 ],
["にも" , 1671 ],
["に対" , -912 ],
["の一" , -501 ],
["の中" , 741 ],
["ませ" , 2448 ],
["まで" , 1711 ],
["まま" , 2600 ],
["まる" , -2155 ],
["やむ" , -1947 ],
["よっ" , -2565 ],
["れた" , 2369 ],
["れで" , -913 ],
["をし" , 1860 ],
["を見" , 731 ],
["亡く" , -1886 ],
["京都" , 2558 ],
["取り" , -2784 ],
["大き" , -2604 ],
["大阪" , 1497 ],
["平方" , -2314 ],
["引き" , -1336 ],
["日本" , -195 ],
["本当" , -2423 ],
["毎日" , -2113 ],
["目指" , -724 ],
["B1あ" , 1404 ],
["B1同" , 542 ],
["」と" , 1682]
]);
// Character bigram weights: tokenize() adds BW2.get(w3 + w4) to the boundary score.
// w3 and w4 are the window slots on either side of the candidate boundary.
// The key "11" is listed twice with the same weight; the Map keeps a single entry for it.
var BW2 = new Map([
[".." , -11822 ],
["11" , -669 ],
["――" , -5730 ],
["−−" , -13175 ],
["いう" , -1609 ],
["うか" , 2490 ],
["かし" , -1350 ],
["かも" , -602 ],
["から" , -7194 ],
["かれ" , 4612 ],
["がい" , 853 ],
["がら" , -3198 ],
["きた" , 1941 ],
["くな" , -1597 ],
["こと" , -8392 ],
["この" , -4193 ],
["させ" , 4533 ],
["され" , 13168 ],
["さん" , -3977 ],
["しい" , -1819 ],
["しか" , -545 ],
["した" , 5078 ],
["して" , 972 ],
["しな" , 939 ],
["その" , -3744 ],
["たい" , -1253 ],
["たた" , -662 ],
["ただ" , -3857 ],
["たち" , -786 ],
["たと" , 1224 ],
["たは" , -939 ],
["った" , 4589 ],
["って" , 1647 ],
["っと" , -2094 ],
["てい" , 6144 ],
["てき" , 3640 ],
["てく" , 2551 ],
["ては" , -3110 ],
["ても" , -3065 ],
["でい" , 2666 ],
["でき" , -1528 ],
["でし" , -3828 ],
["です" , -4761 ],
["でも" , -4203 ],
["とい" , 1890 ],
["とこ" , -1746 ],
["とと" , -2279 ],
["との" , 720 ],
["とみ" , 5168 ],
["とも" , -3941 ],
["ない" , -2488 ],
["なが" , -1313 ],
["など" , -6509 ],
["なの" , 2614 ],
["なん" , 3099 ],
["にお" , -1615 ],
["にし" , 2748 ],
["にな" , 2454 ],
["によ" , -7236 ],
["に対" , -14943 ],
["に従" , -4688 ],
["に関" , -11388 ],
["のか" , 2093 ],
["ので" , -7059 ],
["のに" , -6041 ],
["のの" , -6125 ],
["はい" , 1073 ],
["はが" , -1033 ],
["はず" , -2532 ],
["ばれ" , 1813 ],
["まし" , -1316 ],
["まで" , -6621 ],
["まれ" , 5409 ],
["めて" , -3153 ],
["もい" , 2230 ],
["もの" , -10713 ],
["らか" , -944 ],
["らし" , -1611 ],
["らに" , -1897 ],
["りし" , 651 ],
["りま" , 1620 ],
["れた" , 4270 ],
["れて" , 849 ],
["れば" , 4114 ],
["ろう" , 6067 ],
["われ" , 7901 ],
["を通" , -11877 ],
["んだ" , 728 ],
["んな" , -4115 ],
["一人" , 602 ],
["一方" , -1375 ],
["一日" , 970 ],
["一部" , -1051 ],
["上が" , -4479 ],
["会社" , -1116 ],
["出て" , 2163 ],
["分の" , -7758 ],
["同党" , 970 ],
["同日" , -913 ],
["大阪" , -2471 ],
["委員" , -1250 ],
["少な" , -1050 ],
["年度" , -8669 ],
["年間" , -1626 ],
["府県" , -2363 ],
["手権" , -1982 ],
["新聞" , -4066 ],
["日新" , -722 ],
["日本" , -7068 ],
["日米" , 3372 ],
["曜日" , -601 ],
["朝鮮" , -2355 ],
["本人" , -2697 ],
["東京" , -1543 ],
["然と" , -1384 ],
["社会" , -1276 ],
["立て" , -990 ],
["第に" , -1612 ],
["米国" , -4268 ],
["11" , -669]
]);
// Character bigram weights: tokenize() adds BW3.get(w4 + w5) to the boundary score.
// The key "カ月" is listed twice with the same weight; the Map keeps a single entry for it.
// NOTE(review): "市" is a one-character key, while w4 + w5 is always at least two code
// units long, so this entry looks unreachable from tokenize() — confirm against upstream data.
var BW3 = new Map([
["あた" , -2194 ],
["あり" , 719 ],
["ある" , 3846 ],
["い." , -1185 ],
["い。" , -1185 ],
["いい" , 5308 ],
["いえ" , 2079 ],
["いく" , 3029 ],
["いた" , 2056 ],
["いっ" , 1883 ],
["いる" , 5600 ],
["いわ" , 1527 ],
["うち" , 1117 ],
["うと" , 4798 ],
["えと" , 1454 ],
["か." , 2857 ],
["か。" , 2857 ],
["かけ" , -743 ],
["かっ" , -4098 ],
["かに" , -669 ],
["から" , 6520 ],
["かり" , -2670 ],
["が,", 1816 ],
["が、" , 1816 ],
["がき" , -4855 ],
["がけ" , -1127 ],
["がっ" , -913 ],
["がら" , -4977 ],
["がり" , -2064 ],
["きた" , 1645 ],
["けど" , 1374 ],
["こと" , 7397 ],
["この" , 1542 ],
["ころ" , -2757 ],
["さい" , -714 ],
["さを" , 976 ],
["し,", 1557 ],
["し、" , 1557 ],
["しい" , -3714 ],
["した" , 3562 ],
["して" , 1449 ],
["しな" , 2608 ],
["しま" , 1200 ],
["す." , -1310 ],
["す。" , -1310 ],
["する" , 6521 ],
["ず,", 3426 ],
["ず、" , 3426 ],
["ずに" , 841 ],
["そう" , 428 ],
["た." , 8875 ],
["た。" , 8875 ],
["たい" , -594 ],
["たの" , 812 ],
["たり" , -1183 ],
["たる" , -853 ],
["だ." , 4098 ],
["だ。" , 4098 ],
["だっ" , 1004 ],
["った" , -4748 ],
["って" , 300 ],
["てい" , 6240 ],
["てお" , 855 ],
["ても" , 302 ],
["です" , 1437 ],
["でに" , -1482 ],
["では" , 2295 ],
["とう" , -1387 ],
["とし" , 2266 ],
["との" , 541 ],
["とも" , -3543 ],
["どう" , 4664 ],
["ない" , 1796 ],
["なく" , -903 ],
["など" , 2135 ],
["に,", -1021 ],
["に、" , -1021 ],
["にし" , 1771 ],
["にな" , 1906 ],
["には" , 2644 ],
["の,", -724 ],
["の、" , -724 ],
["の子" , -1000 ],
["は,", 1337 ],
["は、" , 1337 ],
["べき" , 2181 ],
["まし" , 1113 ],
["ます" , 6943 ],
["まっ" , -1549 ],
["まで" , 6154 ],
["まれ" , -793 ],
["らし" , 1479 ],
["られ" , 6820 ],
["るる" , 3818 ],
["れ,", 854 ],
["れ、" , 854 ],
["れた" , 1850 ],
["れて" , 1375 ],
["れば" , -3246 ],
["れる" , 1091 ],
["われ" , -605 ],
["んだ" , 606 ],
["んで" , 798 ],
["カ月" , 990 ],
["会議" , 860 ],
["入り" , 1232 ],
["大会" , 2217 ],
["始め" , 1681 ],
["市" , 965 ],
["新聞" , -5055 ],
["日,", 974 ],
["日、" , 974 ],
["社会" , 2024 ],
["カ月" , 990]
]);
// Character-class trigram weights: tokenize() adds TC1.get(c1 + c2 + c3) to the boundary score.
var TC1 = new Map(Object.entries({
  AAA: 1093,
  HHH: 1029,
  HHM: 580,
  HII: 998,
  HOH: -390,
  HOM: -331,
  IHI: 1169,
  IOH: -142,
  IOI: -1015,
  IOM: 467,
  MMH: 187,
  OOI: -1832,
}));
// Character-class trigram weights: tokenize() adds TC2.get(c2 + c3 + c4) to the boundary score.
var TC2 = new Map(Object.entries({
  HHO: 2088,
  HII: -1023,
  HMM: -1154,
  IHI: -1965,
  KKH: 703,
  OII: -2649,
}));
// Character-class trigram weights: tokenize() adds TC3.get(c3 + c4 + c5) to the boundary score.
var TC3 = new Map(Object.entries({
  AAA: -294,
  HHH: 346,
  HHI: -341,
  HII: -1088,
  HIK: 731,
  HOH: -1486,
  IHH: 128,
  IHI: -3041,
  IHO: -1935,
  IIH: -825,
  IIM: -1035,
  IOI: -542,
  KHH: -1216,
  KKA: 491,
  KKH: -1217,
  KOK: -1009,
  MHH: -2694,
  MHM: -457,
  MHO: 123,
  MMH: -471,
  NNH: -1689,
  NNO: 662,
  OHO: -3393,
}));
// Character-class trigram weights: tokenize() adds TC4.get(c4 + c5 + c6) to the boundary score.
var TC4 = new Map(Object.entries({
  HHH: -203,
  HHI: 1344,
  HHK: 365,
  HHM: -122,
  HHN: 182,
  HHO: 669,
  HIH: 804,
  HII: 679,
  HOH: 446,
  IHH: 695,
  IHO: -2324,
  IIH: 321,
  III: 1497,
  IIO: 656,
  IOO: 54,
  KAK: 4845,
  KKA: 3386,
  KKK: 3065,
  MHH: -405,
  MHI: 201,
  MMH: -241,
  MMM: 661,
  MOM: 841,
}));
// Mixed label/class weights: tokenize() adds TQ1.get(p2 + c1 + c2 + c3) to the boundary score.
var TQ1 = new Map(Object.entries({
  BHHH: -227,
  BHHI: 316,
  BHIH: -132,
  BIHH: 60,
  BIII: 1595,
  BNHH: -744,
  BOHH: 225,
  BOOO: -908,
  OAKK: 482,
  OHHH: 281,
  OHIH: 249,
  OIHI: 200,
  OIIH: -68,
}));
// Mixed label/class weights: tokenize() adds TQ2.get(p2 + c2 + c3 + c4) to the boundary score.
var TQ2 = new Map(Object.entries({
  BIHH: -1401,
  BIII: -1033,
  BKAK: -543,
  BOOO: -5591,
}));
// Mixed label/class weights: tokenize() adds TQ3.get(p3 + c1 + c2 + c3) to the boundary score.
var TQ3 = new Map(Object.entries({
  BHHH: 478,
  BHHM: -1073,
  BHIH: 222,
  BHII: -504,
  BIIH: -116,
  BIII: -105,
  BMHI: -863,
  BMHM: -464,
  BOMH: 620,
  OHHH: 346,
  OHHI: 1729,
  OHII: 997,
  OHMH: 481,
  OIHH: 623,
  OIIH: 1344,
  OKAK: 2792,
  OKHH: 587,
  OKKA: 679,
  OOHH: 110,
  OOII: -685,
}));
// Mixed label/class weights: tokenize() adds TQ4.get(p3 + c2 + c3 + c4) to the boundary score.
var TQ4 = new Map(Object.entries({
  BHHH: -721,
  BHHM: -3604,
  BHII: -966,
  BIIH: -607,
  BIII: -2181,
  OAAA: -2763,
  OAKK: 180,
  OHHH: -294,
  OHHI: 2446,
  OHHO: 480,
  OHIH: -1573,
  OIHH: 1935,
  OIHI: -493,
  OIIH: 626,
  OIII: -4007,
  OKAK: -8156,
}));
// Character trigram weights: tokenize() adds TW1.get(w1 + w2 + w3) to the boundary score.
var TW1 = new Map(Object.entries({
  "につい": -4681,
  "東京都": 2026,
}));
// Character trigram weights: tokenize() adds TW2.get(w2 + w3 + w4) to the boundary score.
var TW2 = new Map(Object.entries({
  "ある程": -2049,
  "いった": -1256,
  "ころが": -2434,
  "しょう": 3873,
  "その後": -4430,
  "だって": -1049,
  "ていた": 1833,
  "として": -4657,
  "ともに": -4517,
  "もので": 1882,
  "一気に": -792,
  "初めて": -1512,
  "同時に": -8097,
  "大きな": -1255,
  "対して": -2721,
  "社会党": -3216,
}));
// Character trigram weights: tokenize() adds TW3.get(w3 + w4 + w5) to the boundary score.
var TW3 = new Map(Object.entries({
  "いただ": -1734,
  "してい": 1314,
  "として": -4314,
  "につい": -5483,
  "にとっ": -5989,
  "に当た": -6247,
  "ので,": -727,
  "ので、": -727,
  "のもの": -600,
  "れから": -3752,
  "十二月": -2287,
}));
// Character trigram weights: tokenize() adds TW4.get(w4 + w5 + w6) to the boundary score.
var TW4 = new Map(Object.entries({
  "いう.": 8576,
  "いう。": 8576,
  "からな": -2348,
  "してい": 2958,
  "たが,": 1516,
  "たが、": 1516,
  "ている": 1538,
  "という": 1349,
  "ました": 5543,
  "ません": 1097,
  "ようと": -4258,
  "よると": 5865,
}));
// Character-class unigram weights: tokenize() adds UC1.get(c1) to the boundary score.
var UC1 = new Map(Object.entries({
  A: 484,
  K: 93,
  M: 645,
  O: -505,
}));
// Character-class unigram weights: tokenize() adds UC2.get(c2) to the boundary score.
var UC2 = new Map(Object.entries({
  A: 819,
  H: 1059,
  I: 409,
  M: 3987,
  N: 5775,
  O: 646,
}));
// Character-class unigram weights for c3. tokenize() does not read this map; it hard-codes
// the same two weights in its `c3 === 'A'` / `c3 === 'I'` branches.
var UC3 = new Map(Object.entries({
  A: -1370,
  I: 2311,
}));
// Character-class unigram weights: tokenize() adds UC4.get(c4) to the boundary score.
var UC4 = new Map(Object.entries({
  A: -2643,
  H: 1809,
  I: -1032,
  K: -3450,
  M: 3565,
  N: 3876,
  O: 6646,
}));
// Character-class unigram weights: tokenize() adds UC5.get(c5) to the boundary score.
var UC5 = new Map(Object.entries({
  H: 313,
  I: -1238,
  K: -799,
  M: 539,
  O: -831,
}));
// Character-class unigram weights: tokenize() adds UC6.get(c6) to the boundary score.
var UC6 = new Map(Object.entries({
  H: -506,
  I: -253,
  K: 87,
  M: 247,
  O: -387,
}));
// Previous-decision unigram weight for p1. tokenize() does not read this map; it hard-codes
// the same weight in its `p1 === 'O'` branch.
var UP1 = new Map(Object.entries({
  O: -214,
}));
// Previous-decision unigram weights for p2. tokenize() does not read this map; it hard-codes
// the same weights in its `p2 === 'B'` / `p2 === 'O'` branches.
var UP2 = new Map(Object.entries({
  B: 69,
  O: 935,
}));
// Previous-decision unigram weight for p3. tokenize() does not read this map; it hard-codes
// the same weight in its `p3 === 'B'` branch.
var UP3 = new Map(Object.entries({
  B: 189,
}));
// Mixed label/class weights: tokenize() adds UQ1.get(p1 + c1) to the boundary score.
var UQ1 = new Map(Object.entries({
  BH: 21,
  BI: -12,
  BK: -99,
  BN: 142,
  BO: -56,
  OH: -95,
  OI: 477,
  OK: 410,
  OO: -2422,
}));
// Mixed label/class weights: tokenize() adds UQ2.get(p2 + c2) to the boundary score.
var UQ2 = new Map(Object.entries({
  BH: 216,
  BI: 113,
  OK: 1759,
}));
// Mixed label/class weights: tokenize() adds UQ3.get(p3 + c3) to the boundary score.
var UQ3 = new Map(Object.entries({
  BA: -479,
  BH: 42,
  BI: 1913,
  BK: -7198,
  BM: 3160,
  BN: 6427,
  BO: 14761,
  OI: -827,
  ON: -3212,
}));
// Character unigram weights: tokenize() adds UW1.get(w1) to the boundary score.
// The keys "「" and "・" are listed twice with the same weight; the Map keeps a single
// entry for each, so lookups are unaffected.
var UW1 = new Map([
["," , 156 ],
["、" , 156 ],
["「" , -463 ],
["あ" , -941 ],
["う" , -127 ],
["が" , -553 ],
["き" , 121 ],
["こ" , 505 ],
["で" , -201 ],
["と" , -547 ],
["ど" , -123 ],
["に" , -789 ],
["の" , -185 ],
["は" , -847 ],
["も" , -466 ],
["や" , -470 ],
["よ" , 182 ],
["ら" , -292 ],
["り" , 208 ],
["れ" , 169 ],
["を" , -446 ],
["ん" , -137 ],
["・" , -135 ],
["主" , -402 ],
["京" , -268 ],
["区" , -912 ],
["午" , 871 ],
["国" , -460 ],
["大" , 561 ],
["委" , 729 ],
["市" , -411 ],
["日" , -141 ],
["理" , 361 ],
["生" , -408 ],
["県" , -386 ],
["都" , -718 ],
["「" , -463 ],
["・" , -135]
]);
// Character unigram weights: tokenize() adds UW2.get(w2) to the boundary score.
// The last six entries ("「", "」", "ッ", "ア", "カ", "キ") repeat earlier keys with the same
// weights; the Map keeps a single entry for each, so lookups are unaffected.
var UW2 = new Map([
["," , -829 ],
["、" , -829 ],
["〇" , 892 ],
["「" , -645 ],
["」" , 3145 ],
["あ" , -538 ],
["い" , 505 ],
["う" , 134 ],
["お" , -502 ],
["か" , 1454 ],
["が" , -856 ],
["く" , -412 ],
["こ" , 1141 ],
["さ" , 878 ],
["ざ" , 540 ],
["し" , 1529 ],
["す" , -675 ],
["せ" , 300 ],
["そ" , -1011 ],
["た" , 188 ],
["だ" , 1837 ],
["つ" , -949 ],
["て" , -291 ],
["で" , -268 ],
["と" , -981 ],
["ど" , 1273 ],
["な" , 1063 ],
["に" , -1764 ],
["の" , 130 ],
["は" , -409 ],
["ひ" , -1273 ],
["べ" , 1261 ],
["ま" , 600 ],
["も" , -1263 ],
["や" , -402 ],
["よ" , 1639 ],
["り" , -579 ],
["る" , -694 ],
["れ" , 571 ],
["を" , -2516 ],
["ん" , 2095 ],
["ア" , -587 ],
["カ" , 306 ],
["キ" , 568 ],
["ッ" , 831 ],
["三" , -758 ],
["不" , -2150 ],
["世" , -302 ],
["中" , -968 ],
["主" , -861 ],
["事" , 492 ],
["人" , -123 ],
["会" , 978 ],
["保" , 362 ],
["入" , 548 ],
["初" , -3025 ],
["副" , -1566 ],
["北" , -3414 ],
["区" , -422 ],
["大" , -1769 ],
["天" , -865 ],
["太" , -483 ],
["子" , -1519 ],
["学" , 760 ],
["実" , 1023 ],
["小" , -2009 ],
["市" , -813 ],
["年" , -1060 ],
["強" , 1067 ],
["手" , -1519 ],
["揺" , -1033 ],
["政" , 1522 ],
["文" , -1355 ],
["新" , -1682 ],
["日" , -1815 ],
["明" , -1462 ],
["最" , -630 ],
["朝" , -1843 ],
["本" , -1650 ],
["東" , -931 ],
["果" , -665 ],
["次" , -2378 ],
["民" , -180 ],
["気" , -1740 ],
["理" , 752 ],
["発" , 529 ],
["目" , -1584 ],
["相" , -242 ],
["県" , -1165 ],
["立" , -763 ],
["第" , 810 ],
["米" , 509 ],
["自" , -1353 ],
["行" , 838 ],
["西" , -744 ],
["見" , -3874 ],
["調" , 1010 ],
["議" , 1198 ],
["込" , 3041 ],
["開" , 1758 ],
["間" , -1257 ],
["「" , -645 ],
["」" , 3145 ],
["ッ" , 831 ],
["ア" , -587 ],
["カ" , 306 ],
["キ" , 568]
]);
// Character unigram weights: tokenize() adds UW3.get(w3) to the boundary score.
// w3 is the window slot immediately before the candidate boundary.
// The last twelve entries (from the second "1" onwards) repeat earlier keys with the same
// weights; the Map keeps a single entry for each, so lookups are unaffected.
var UW3 = new Map([
["," , 4889 ],
["1" , -800 ],
["−" , -1723 ],
["、" , 4889 ],
["々" , -2311 ],
["〇" , 5827 ],
["」" , 2670 ],
["〓" , -3573 ],
["あ" , -2696 ],
["い" , 1006 ],
["う" , 2342 ],
["え" , 1983 ],
["お" , -4864 ],
["か" , -1163 ],
["が" , 3271 ],
["く" , 1004 ],
["け" , 388 ],
["げ" , 401 ],
["こ" , -3552 ],
["ご" , -3116 ],
["さ" , -1058 ],
["し" , -395 ],
["す" , 584 ],
["せ" , 3685 ],
["そ" , -5228 ],
["た" , 842 ],
["ち" , -521 ],
["っ" , -1444 ],
["つ" , -1081 ],
["て" , 6167 ],
["で" , 2318 ],
["と" , 1691 ],
["ど" , -899 ],
["な" , -2788 ],
["に" , 2745 ],
["の" , 4056 ],
["は" , 4555 ],
["ひ" , -2171 ],
["ふ" , -1798 ],
["へ" , 1199 ],
["ほ" , -5516 ],
["ま" , -4384 ],
["み" , -120 ],
["め" , 1205 ],
["も" , 2323 ],
["や" , -788 ],
["よ" , -202 ],
["ら" , 727 ],
["り" , 649 ],
["る" , 5905 ],
["れ" , 2773 ],
["わ" , -1207 ],
["を" , 6620 ],
["ん" , -518 ],
["ア" , 551 ],
["グ" , 1319 ],
["ス" , 874 ],
["ッ" , -1350 ],
["ト" , 521 ],
["ム" , 1109 ],
["ル" , 1591 ],
["ロ" , 2201 ],
["ン" , 278 ],
["・" , -3794 ],
["一" , -1619 ],
["下" , -1759 ],
["世" , -2087 ],
["両" , 3815 ],
["中" , 653 ],
["主" , -758 ],
["予" , -1193 ],
["二" , 974 ],
["人" , 2742 ],
["今" , 792 ],
["他" , 1889 ],
["以" , -1368 ],
["低" , 811 ],
["何" , 4265 ],
["作" , -361 ],
["保" , -2439 ],
["元" , 4858 ],
["党" , 3593 ],
["全" , 1574 ],
["公" , -3030 ],
["六" , 755 ],
["共" , -1880 ],
["円" , 5807 ],
["再" , 3095 ],
["分" , 457 ],
["初" , 2475 ],
["別" , 1129 ],
["前" , 2286 ],
["副" , 4437 ],
["力" , 365 ],
["動" , -949 ],
["務" , -1872 ],
["化" , 1327 ],
["北" , -1038 ],
["区" , 4646 ],
["千" , -2309 ],
["午" , -783 ],
["協" , -1006 ],
["口" , 483 ],
["右" , 1233 ],
["各" , 3588 ],
["合" , -241 ],
["同" , 3906 ],
["和" , -837 ],
["員" , 4513 ],
["国" , 642 ],
["型" , 1389 ],
["場" , 1219 ],
["外" , -241 ],
["妻" , 2016 ],
["学" , -1356 ],
["安" , -423 ],
["実" , -1008 ],
["家" , 1078 ],
["小" , -513 ],
["少" , -3102 ],
["州" , 1155 ],
["市" , 3197 ],
["平" , -1804 ],
["年" , 2416 ],
["広" , -1030 ],
["府" , 1605 ],
["度" , 1452 ],
["建" , -2352 ],
["当" , -3885 ],
["得" , 1905 ],
["思" , -1291 ],
["性" , 1822 ],
["戸" , -488 ],
["指" , -3973 ],
["政" , -2013 ],
["教" , -1479 ],
["数" , 3222 ],
["文" , -1489 ],
["新" , 1764 ],
["日" , 2099 ],
["旧" , 5792 ],
["昨" , -661 ],
["時" , -1248 ],
["曜" , -951 ],
["最" , -937 ],
["月" , 4125 ],
["期" , 360 ],
["李" , 3094 ],
["村" , 364 ],
["東" , -805 ],
["核" , 5156 ],
["森" , 2438 ],
["業" , 484 ],
["氏" , 2613 ],
["民" , -1694 ],
["決" , -1073 ],
["法" , 1868 ],
["海" , -495 ],
["無" , 979 ],
["物" , 461 ],
["特" , -3850 ],
["生" , -273 ],
["用" , 914 ],
["町" , 1215 ],
["的" , 7313 ],
["直" , -1835 ],
["省" , 792 ],
["県" , 6293 ],
["知" , -1528 ],
["私" , 4231 ],
["税" , 401 ],
["立" , -960 ],
["第" , 1201 ],
["米" , 7767 ],
["系" , 3066 ],
["約" , 3663 ],
["級" , 1384 ],
["統" , -4229 ],
["総" , 1163 ],
["線" , 1255 ],
["者" , 6457 ],
["能" , 725 ],
["自" , -2869 ],
["英" , 785 ],
["見" , 1044 ],
["調" , -562 ],
["財" , -733 ],
["費" , 1777 ],
["車" , 1835 ],
["軍" , 1375 ],
["込" , -1504 ],
["通" , -1136 ],
["選" , -681 ],
["郎" , 1026 ],
["郡" , 4404 ],
["部" , 1200 ],
["金" , 2163 ],
["長" , 421 ],
["開" , -1432 ],
["間" , 1302 ],
["関" , -1282 ],
["雨" , 2009 ],
["電" , -1045 ],
["非" , 2066 ],
["駅" , 1620 ],
["1" , -800 ],
["」" , 2670 ],
["・" , -3794 ],
["ッ" , -1350 ],
["ア" , 551 ],
["グ" , 1319 ],
["ス" , 874 ],
["ト" , 521 ],
["ム" , 1109 ],
["ル" , 1591 ],
["ロ" , 2201 ],
["ン" , 278]
]);
// Character unigram weights: tokenize() adds UW4.get(w4) to the boundary score.
// w4 is the window slot immediately after the candidate boundary.
// The last fourteen entries (from the second "「" onwards) repeat earlier keys with the same
// weights; the Map keeps a single entry for each, so lookups are unaffected.
var UW4 = new Map([
["," , 3930 ],
["." , 3508 ],
["―" , -4841 ],
["、" , 3930 ],
["。" , 3508 ],
["〇" , 4999 ],
["「" , 1895 ],
["」" , 3798 ],
["〓" , -5156 ],
["あ" , 4752 ],
["い" , -3435 ],
["う" , -640 ],
["え" , -2514 ],
["お" , 2405 ],
["か" , 530 ],
["が" , 6006 ],
["き" , -4482 ],
["ぎ" , -3821 ],
["く" , -3788 ],
["け" , -4376 ],
["げ" , -4734 ],
["こ" , 2255 ],
["ご" , 1979 ],
["さ" , 2864 ],
["し" , -843 ],
["じ" , -2506 ],
["す" , -731 ],
["ず" , 1251 ],
["せ" , 181 ],
["そ" , 4091 ],
["た" , 5034 ],
["だ" , 5408 ],
["ち" , -3654 ],
["っ" , -5882 ],
["つ" , -1659 ],
["て" , 3994 ],
["で" , 7410 ],
["と" , 4547 ],
["な" , 5433 ],
["に" , 6499 ],
["ぬ" , 1853 ],
["ね" , 1413 ],
["の" , 7396 ],
["は" , 8578 ],
["ば" , 1940 ],
["ひ" , 4249 ],
["び" , -4134 ],
["ふ" , 1345 ],
["へ" , 6665 ],
["べ" , -744 ],
["ほ" , 1464 ],
["ま" , 1051 ],
["み" , -2082 ],
["む" , -882 ],
["め" , -5046 ],
["も" , 4169 ],
["ゃ" , -2666 ],
["や" , 2795 ],
["ょ" , -1544 ],
["よ" , 3351 ],
["ら" , -2922 ],
["り" , -9726 ],
["る" , -14896 ],
["れ" , -2613 ],
["ろ" , -4570 ],
["わ" , -1783 ],
["を" , 13150 ],
["ん" , -2352 ],
["カ" , 2145 ],
["コ" , 1789 ],
["セ" , 1287 ],
["ッ" , -724 ],
["ト" , -403 ],
["メ" , -1635 ],
["ラ" , -881 ],
["リ" , -541 ],
["ル" , -856 ],
["ン" , -3637 ],
["・" , -4371 ],
["ー" , -11870 ],
["一" , -2069 ],
["中" , 2210 ],
["予" , 782 ],
["事" , -190 ],
["井" , -1768 ],
["人" , 1036 ],
["以" , 544 ],
["会" , 950 ],
["体" , -1286 ],
["作" , 530 ],
["側" , 4292 ],
["先" , 601 ],
["党" , -2006 ],
["共" , -1212 ],
["内" , 584 ],
["円" , 788 ],
["初" , 1347 ],
["前" , 1623 ],
["副" , 3879 ],
["力" , -302 ],
["動" , -740 ],
["務" , -2715 ],
["化" , 776 ],
["区" , 4517 ],
["協" , 1013 ],
["参" , 1555 ],
["合" , -1834 ],
["和" , -681 ],
["員" , -910 ],
["器" , -851 ],
["回" , 1500 ],
["国" , -619 ],
["園" , -1200 ],
["地" , 866 ],
["場" , -1410 ],
["塁" , -2094 ],
["士" , -1413 ],
["多" , 1067 ],
["大" , 571 ],
["子" , -4802 ],
["学" , -1397 ],
["定" , -1057 ],
["寺" , -809 ],
["小" , 1910 ],
["屋" , -1328 ],
["山" , -1500 ],
["島" , -2056 ],
["川" , -2667 ],
["市" , 2771 ],
["年" , 374 ],
["庁" , -4556 ],
["後" , 456 ],
["性" , 553 ],
["感" , 916 ],
["所" , -1566 ],
["支" , 856 ],
["改" , 787 ],
["政" , 2182 ],
["教" , 704 ],
["文" , 522 ],
["方" , -856 ],
["日" , 1798 ],
["時" , 1829 ],
["最" , 845 ],
["月" , -9066 ],
["木" , -485 ],
["来" , -442 ],
["校" , -360 ],
["業" , -1043 ],
["氏" , 5388 ],
["民" , -2716 ],
["気" , -910 ],
["沢" , -939 ],
["済" , -543 ],
["物" , -735 ],
["率" , 672 ],
["球" , -1267 ],
["生" , -1286 ],
["産" , -1101 ],
["田" , -2900 ],
["町" , 1826 ],
["的" , 2586 ],
["目" , 922 ],
["省" , -3485 ],
["県" , 2997 ],
["空" , -867 ],
["立" , -2112 ],
["第" , 788 ],
["米" , 2937 ],
["系" , 786 ],
["約" , 2171 ],
["経" , 1146 ],
["統" , -1169 ],
["総" , 940 ],
["線" , -994 ],
["署" , 749 ],
["者" , 2145 ],
["能" , -730 ],
["般" , -852 ],
["行" , -792 ],
["規" , 792 ],
["警" , -1184 ],
["議" , -244 ],
["谷" , -1000 ],
["賞" , 730 ],
["車" , -1481 ],
["軍" , 1158 ],
["輪" , -1433 ],
["込" , -3370 ],
["近" , 929 ],
["道" , -1291 ],
["選" , 2596 ],
["郎" , -4866 ],
["都" , 1192 ],
["野" , -1100 ],
["銀" , -2213 ],
["長" , 357 ],
["間" , -2344 ],
["院" , -2297 ],
["際" , -2604 ],
["電" , -878 ],
["領" , -1659 ],
["題" , -792 ],
["館" , -1984 ],
["首" , 1749 ],
["高" , 2120 ],
["「" , 1895 ],
["」" , 3798 ],
["・" , -4371 ],
["ッ" , -724 ],
["ー" , -11870 ],
["カ" , 2145 ],
["コ" , 1789 ],
["セ" , 1287 ],
["ト" , -403 ],
["メ" , -1635 ],
["ラ" , -881 ],
["リ" , -541 ],
["ル" , -856 ],
["ン" , -3637]
]);
// Character unigram weights: tokenize() adds UW5.get(w5) to the boundary score.
// Besides text characters, w5 can hold an end marker string once the window runs past the
// end of the input.
// The last six entries (from the second "1" onwards) repeat earlier keys with the same
// weights; the Map keeps a single entry for each, so lookups are unaffected.
var UW5 = new Map([
["," , 465 ],
["." , -299 ],
["1" , -514 ],
["E2" , -32768 ],
["]" , -2762 ],
["、" , 465 ],
["。" , -299 ],
["「" , 363 ],
["あ" , 1655 ],
["い" , 331 ],
["う" , -503 ],
["え" , 1199 ],
["お" , 527 ],
["か" , 647 ],
["が" , -421 ],
["き" , 1624 ],
["ぎ" , 1971 ],
["く" , 312 ],
["げ" , -983 ],
["さ" , -1537 ],
["し" , -1371 ],
["す" , -852 ],
["だ" , -1186 ],
["ち" , 1093 ],
["っ" , 52 ],
["つ" , 921 ],
["て" , -18 ],
["で" , -850 ],
["と" , -127 ],
["ど" , 1682 ],
["な" , -787 ],
["に" , -1224 ],
["の" , -635 ],
["は" , -578 ],
["べ" , 1001 ],
["み" , 502 ],
["め" , 865 ],
["ゃ" , 3350 ],
["ょ" , 854 ],
["り" , -208 ],
["る" , 429 ],
["れ" , 504 ],
["わ" , 419 ],
["を" , -1264 ],
["ん" , 327 ],
["イ" , 241 ],
["ル" , 451 ],
["ン" , -343 ],
["中" , -871 ],
["京" , 722 ],
["会" , -1153 ],
["党" , -654 ],
["務" , 3519 ],
["区" , -901 ],
["告" , 848 ],
["員" , 2104 ],
["大" , -1296 ],
["学" , -548 ],
["定" , 1785 ],
["嵐" , -1304 ],
["市" , -2991 ],
["席" , 921 ],
["年" , 1763 ],
["思" , 872 ],
["所" , -814 ],
["挙" , 1618 ],
["新" , -1682 ],
["日" , 218 ],
["月" , -4353 ],
["査" , 932 ],
["格" , 1356 ],
["機" , -1508 ],
["氏" , -1347 ],
["田" , 240 ],
["町" , -3912 ],
["的" , -3149 ],
["相" , 1319 ],
["省" , -1052 ],
["県" , -4003 ],
["研" , -997 ],
["社" , -278 ],
["空" , -813 ],
["統" , 1955 ],
["者" , -2233 ],
["表" , 663 ],
["語" , -1073 ],
["議" , 1219 ],
["選" , -1018 ],
["郎" , -368 ],
["長" , 786 ],
["間" , 1191 ],
["題" , 2368 ],
["館" , -689 ],
["1" , -514 ],
["E2" , -32768 ],
["「" , 363 ],
["イ" , 241 ],
["ル" , 451 ],
["ン" , -343]
]);
// Character unigram weights: tokenize() adds UW6.get(w6) to the boundary score.
// w6 is the last window slot; tokenize() sets it to "E1" or "E2" once the window runs past
// the end of the input, which is why "E1" appears as a key here.
// The last four entries ("1", "E1", "ル", "ン") repeat earlier keys with the same weights;
// the Map keeps a single entry for each, so lookups are unaffected.
var UW6 = new Map([
["," , 227 ],
["." , 808 ],
["1" , -270 ],
["E1" , 306 ],
["、" , 227 ],
["。" , 808 ],
["あ" , -307 ],
["う" , 189 ],
["か" , 241 ],
["が" , -73 ],
["く" , -121 ],
["こ" , -200 ],
["じ" , 1782 ],
["す" , 383 ],
["た" , -428 ],
["っ" , 573 ],
["て" , -1014 ],
["で" , 101 ],
["と" , -105 ],
["な" , -253 ],
["に" , -149 ],
["の" , -417 ],
["は" , -236 ],
["も" , -206 ],
["り" , 187 ],
["る" , -135 ],
["を" , 195 ],
["ル" , -673 ],
["ン" , -496 ],
["一" , -277 ],
["中" , 201 ],
["件" , -800 ],
["会" , 624 ],
["前" , 302 ],
["区" , 1792 ],
["員" , -1212 ],
["委" , 798 ],
["学" , -960 ],
["市" , 887 ],
["広" , -695 ],
["後" , 535 ],
["業" , -697 ],
["相" , 753 ],
["社" , -507 ],
["福" , 974 ],
["空" , -822 ],
["者" , 1811 ],
["連" , 463 ],
["郎" , 1082 ],
["1" , -270 ],
["E1" , 306 ],
["ル" , -673 ],
["ン" , -496]
]);
/**
 * Lookup table from a single UTF-16 code unit to its character-class letter:
 *   'H' kanji (plus 々〆ヵヶ), 'I' hiragana, 'K' katakana, 'A' Latin letters,
 *   'N' digits, 'M' kanji numerals.
 * Characters that are absent from the map are treated as class 'O' by tokenize().
 *
 * The full-width (Ａ-Ｚ, ａ-ｚ, ０-９) and half-width katakana (ｱ-ﾝ) ranges are written as
 * \u escapes so that Unicode width normalisation of this file cannot silently collapse
 * them onto their ASCII / full-width katakana counterparts.
 * NOTE(review): these width-variant ranges follow the upstream TinySegmenter character
 * classes; upstream also classifies 'ー' as katakana, which this table does not — confirm
 * whether that omission is intended.
 */
var CharMap = (() => {
  const table = new Map();

  // Assign `type` to every code unit from `first` to `final`, inclusive.
  const fillRange = ( first, final, type ) => {
    const end = final.charCodeAt(0);
    for ( let code = first.charCodeAt(0); code <= end; code++ ) {
      table.set( String.fromCharCode(code), type );
    }
  };

  fillRange( '一', '龠', 'H' );
  fillRange( 'ぁ', 'ん', 'I' );
  fillRange( 'ァ', 'ヴ', 'K' );
  fillRange( '\uFF71', '\uFF9D', 'K' ); // half-width katakana
  fillRange( 'a', 'z', 'A' );
  fillRange( 'A', 'Z', 'A' );
  fillRange( '\uFF41', '\uFF5A', 'A' ); // full-width a-z
  fillRange( '\uFF21', '\uFF3A', 'A' ); // full-width A-Z
  fillRange( '0', '9', 'N' );
  fillRange( '\uFF10', '\uFF19', 'N' ); // full-width 0-9

  // Kanji numerals fall inside the kanji range above, so they must be set afterwards
  // for 'M' to win over 'H'.
  for ( const ch of '一二三四五六七八九十百千万億兆' ) {
    table.set( ch, 'M' );
  }
  // Iteration marks and small katakana that are counted as kanji.
  for ( const ch of '々〆ヵヶ' ) {
    table.set( ch, 'H' );
  }
  return table;
})();
/**
 * Split text into tokens using the weight tables defined in this file.
 *
 * A six-slot window (w1..w6, with character classes c1..c6 looked up in CharMap)
 * slides over the text one UTF-16 code unit at a time. At each position the weights
 * of all matching features are summed on top of BIAS; a positive total places a
 * boundary between w3 and w4. p1..p3 remember the three previous decisions
 * ('U' = before the text, 'B' = boundary, 'O' = no boundary).
 *
 * @param {string} text - Text to segment. `String` wrapper objects are unwrapped.
 * @returns {string[]} Tokens in input order; joined together they reproduce the
 *   input. An empty string or any non-string value yields `[]`.
 */
function tokenize(text) {
  const input = ( text instanceof String ) ? text.valueOf() : text;
  // `typeof` is used here because `! x instanceof String` negates x first and is
  // therefore always false.
  if ( typeof input !== 'string' || input === '' ) {
    return [];
  }

  const result = [];
  const last = input.length - 1;

  let idx = 0;       // index of the character held in w3 once the window has shifted
  let idx1 = 1;
  let idx2 = 2;
  let start = 0;     // start index of the token currently being accumulated

  let p1 = 'U';
  let p2 = 'U';
  let p3 = 'U';

  let w1 = 'B3';
  let w2 = 'B2';
  let w3 = 'B1';
  let c1 = 'O';
  let c2 = 'O';
  let c3 = 'O';

  let w4 = input[0];
  let c4 = CharMap.get(w4) || 'O';
  let w5;
  let w6;
  let c5;
  let c6;

  if ( last === 0 ) {
    w5 = 'E1';
    c5 = 'O';
    w6 = 'E2';
    c6 = 'O';
  } else {
    w5 = input[1];
    c5 = CharMap.get(w5) || 'O';
    if ( last === 1 ) {
      // Two-character input: the slot after the second character is the end
      // marker. Its class goes into c6; c5 must keep the second character's class.
      w6 = 'E1';
      c6 = 'O';
    } else {
      w6 = input[2];
      c6 = CharMap.get(w6) || 'O';
    }
  }

  while ( idx < last ) {
    // Shift the window one position to the right.
    w1 = w2;
    w2 = w3;
    w3 = w4;
    w4 = w5;
    w5 = w6;
    c1 = c2;
    c2 = c3;
    c3 = c4;
    c4 = c5;
    c5 = c6;

    const idx3 = idx + 3;
    if ( idx3 <= last ) {
      w6 = input[idx3];
      c6 = CharMap.get(w6) || 'O';
    } else if ( idx2 === last ) {
      w6 = 'E1';
      c6 = 'O';
    } else {
      w6 = 'E2';
      c6 = 'O';
    }

    let score = BIAS;

    // The UP1, UP2, UP3 and UC3 weights are hard-coded here instead of being
    // looked up in their maps.
    if ( p1 === 'O' ) {
      score += -214;
    }
    if ( p2 === 'B' ) {
      score += 69;
    } else if ( p2 === 'O' ) {
      score += 935;
    }
    if ( p3 === 'B' ) {
      score += 189;
    }
    if ( c3 === 'A' ) {
      score += -1370;
    } else if ( c3 === 'I' ) {
      score += 2311;
    }

    score += (BP1.get(p1 + p2) || 0);
    score += (BP2.get(p2 + p3) || 0);

    score += (UW1.get(w1) || 0);
    score += (UW2.get(w2) || 0);
    score += (UW3.get(w3) || 0);
    score += (UW4.get(w4) || 0);
    score += (UW5.get(w5) || 0);
    score += (UW6.get(w6) || 0);
    score += (BW1.get(w2 + w3) || 0);
    score += (BW2.get(w3 + w4) || 0);
    score += (BW3.get(w4 + w5) || 0);
    score += (TW1.get(w1 + w2 + w3) || 0);
    score += (TW2.get(w2 + w3 + w4) || 0);
    score += (TW3.get(w3 + w4 + w5) || 0);
    score += (TW4.get(w4 + w5 + w6) || 0);

    score += (UC1.get(c1) || 0);
    score += (UC2.get(c2) || 0);
    score += (UC4.get(c4) || 0);
    score += (UC5.get(c5) || 0);
    score += (UC6.get(c6) || 0);
    score += (BC1.get(c2 + c3) || 0);
    score += (BC2.get(c3 + c4) || 0);
    score += (BC3.get(c4 + c5) || 0);
    score += (TC1.get(c1 + c2 + c3) || 0);
    score += (TC2.get(c2 + c3 + c4) || 0);
    score += (TC3.get(c3 + c4 + c5) || 0);
    score += (TC4.get(c4 + c5 + c6) || 0);

    score += (UQ1.get(p1 + c1) || 0);
    score += (UQ2.get(p2 + c2) || 0);
    score += (UQ3.get(p3 + c3) || 0);
    score += (BQ1.get(p2 + c2 + c3) || 0);
    score += (BQ2.get(p2 + c3 + c4) || 0);
    score += (BQ3.get(p3 + c2 + c3) || 0);
    score += (BQ4.get(p3 + c3 + c4) || 0);
    score += (TQ1.get(p2 + c1 + c2 + c3) || 0);
    score += (TQ2.get(p2 + c2 + c3 + c4) || 0);
    score += (TQ3.get(p3 + c1 + c2 + c3) || 0);
    score += (TQ4.get(p3 + c2 + c3 + c4) || 0);

    let decision = 'O';
    if ( score > 0 ) {
      // Boundary after the character at idx: emit the token collected so far.
      result.push( input.substring(start, idx1) );
      start = idx1;
      decision = 'B';
    }

    p1 = p2;
    p2 = p3;
    p3 = decision;
    idx = idx1;
    idx1 = idx2;
    idx2 = idx3;
  }

  // Whatever follows the last boundary is the final token (idx1 === input.length here).
  result.push( input.substring(start, idx1) );
  return result;
}
// CommonJS export: the tokenize function is this module's sole export.
module.exports = tokenize;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment