Created
January 2, 2022 18:38
-
-
Save o0101/2bd0d2fe508b2ebb03f7a6a57e5259de to your computer and use it in GitHub Desktop.
TinySegmenter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// TinySegmenter 0.1 -- Super compact Japanese tokenizer in Javascript
// (c) 2008 Taku Kudo <taku@chasen.org>
// TinySegmenter is freely distributable under the terms of a new BSD licence.
// For details, see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt
function TinySegmenter() { | |
var patterns = { | |
"[一二三四五å…七八ä¹å百åƒä¸‡å„„å…†]":"M", | |
"[一-é¾ ã€…ã€†ãƒµãƒ¶]":"H", | |
"[ã-ã‚“]":"I", | |
"[ã‚¡-ヴーア-ï¾ï¾žï½°]":"K", | |
"[a-zA-Zï½-zA-Z]":"A", | |
"[0-9ï¼-ï¼™]":"N" | |
} | |
this.chartype_ = []; | |
for (var i in patterns) { | |
var regexp = new RegExp; | |
regexp.compile(i) | |
this.chartype_.push([regexp, patterns[i]]); | |
} | |
this.BIAS__ = -332 | |
this.BC1__ = {"HH":6,"II":2461,"KH":406,"OH":-1378}; | |
this.BC2__ = {"AA":-3267,"AI":2744,"AN":-878,"HH":-4070,"HM":-1711,"HN":4012,"HO":3761,"IA":1327,"IH":-1184,"II":-1332,"IK":1721,"IO":5492,"KI":3831,"KK":-8741,"MH":-3132,"MK":3334,"OO":-2920}; | |
this.BC3__ = {"HH":996,"HI":626,"HK":-721,"HN":-1307,"HO":-836,"IH":-301,"KK":2762,"MK":1079,"MM":4034,"OA":-1652,"OH":266}; | |
this.BP1__ = {"BB":295,"OB":304,"OO":-125,"UB":352}; | |
this.BP2__ = {"BO":60,"OO":-1762}; | |
this.BQ1__ = {"BHH":1150,"BHM":1521,"BII":-1158,"BIM":886,"BMH":1208,"BNH":449,"BOH":-91,"BOO":-2597,"OHI":451,"OIH":-296,"OKA":1851,"OKH":-1020,"OKK":904,"OOO":2965}; | |
this.BQ2__ = {"BHH":118,"BHI":-1159,"BHM":466,"BIH":-919,"BKK":-1720,"BKO":864,"OHH":-1139,"OHM":-181,"OIH":153,"UHI":-1146}; | |
this.BQ3__ = {"BHH":-792,"BHI":2664,"BII":-299,"BKI":419,"BMH":937,"BMM":8335,"BNN":998,"BOH":775,"OHH":2174,"OHM":439,"OII":280,"OKH":1798,"OKI":-793,"OKO":-2242,"OMH":-2402,"OOO":11699}; | |
this.BQ4__ = {"BHH":-3895,"BIH":3761,"BII":-4654,"BIK":1348,"BKK":-1806,"BMI":-3385,"BOO":-12396,"OAH":926,"OHH":266,"OHK":-2036,"ONN":-973}; | |
this.BW1__ = {",ã¨":660,",åŒ":727,"B1ã‚":1404,"B1åŒ":542,"ã€ã¨":660,"ã€åŒ":727,"ã€ã¨":1682,"ã‚ã£":1505,"ã„ã†":1743,"ã„ã£":-2055,"ã„ã‚‹":672,"ã†ã—":-4817,"ã†ã‚“":665,"ã‹ã‚‰":3472,"ãŒã‚‰":600,"ã“ã†":-790,"ã“ã¨":2083,"ã“ã‚“":-1262,"ã•ã‚‰":-4143,"ã•ã‚“":4573,"ã—ãŸ":2641,"ã—ã¦":1104,"ã™ã§":-3399,"ãã“":1977,"ãã‚Œ":-871,"ãŸã¡":1122,"ãŸã‚":601,"ã£ãŸ":3463,"ã¤ã„":-802,"ã¦ã„":805,"ã¦ã":1249,"ã§ã":1127,"ã§ã™":3445,"ã§ã¯":844,"ã¨ã„":-4915,"ã¨ã¿":1922,"ã©ã“":3887,"ãªã„":5713,"ãªã£":3015,"ãªã©":7379,"ãªã‚“":-1113,"ã«ã—":2468,"ã«ã¯":1498,"ã«ã‚‚":1671,"ã«å¯¾":-912,"ã®ä¸€":-501,"ã®ä¸":741,"ã¾ã›":2448,"ã¾ã§":1711,"ã¾ã¾":2600,"ã¾ã‚‹":-2155,"ã‚„ã‚€":-1947,"よã£":-2565,"ã‚ŒãŸ":2369,"ã‚Œã§":-913,"ã‚’ã—":1860,"を見":731,"亡ã":-1886,"京都":2558,"å–ã‚Š":-2784,"大ã":-2604,"大阪":1497,"平方":-2314,"引ã":-1336,"日本":-195,"本当":-2423,"毎日":-2113,"目指":-724,"B1ã‚":1404,"B1åŒ":542,"ï½£ã¨":1682}; | |
this.BW2__ = {"..":-11822,"11":-669,"――":-5730,"−−":-13175,"ã„ã†":-1609,"ã†ã‹":2490,"ã‹ã—":-1350,"ã‹ã‚‚":-602,"ã‹ã‚‰":-7194,"ã‹ã‚Œ":4612,"ãŒã„":853,"ãŒã‚‰":-3198,"ããŸ":1941,"ããª":-1597,"ã“ã¨":-8392,"ã“ã®":-4193,"ã•ã›":4533,"ã•ã‚Œ":13168,"ã•ã‚“":-3977,"ã—ã„":-1819,"ã—ã‹":-545,"ã—ãŸ":5078,"ã—ã¦":972,"ã—ãª":939,"ãã®":-3744,"ãŸã„":-1253,"ãŸãŸ":-662,"ãŸã ":-3857,"ãŸã¡":-786,"ãŸã¨":1224,"ãŸã¯":-939,"ã£ãŸ":4589,"ã£ã¦":1647,"ã£ã¨":-2094,"ã¦ã„":6144,"ã¦ã":3640,"ã¦ã":2551,"ã¦ã¯":-3110,"ã¦ã‚‚":-3065,"ã§ã„":2666,"ã§ã":-1528,"ã§ã—":-3828,"ã§ã™":-4761,"ã§ã‚‚":-4203,"ã¨ã„":1890,"ã¨ã“":-1746,"ã¨ã¨":-2279,"ã¨ã®":720,"ã¨ã¿":5168,"ã¨ã‚‚":-3941,"ãªã„":-2488,"ãªãŒ":-1313,"ãªã©":-6509,"ãªã®":2614,"ãªã‚“":3099,"ã«ãŠ":-1615,"ã«ã—":2748,"ã«ãª":2454,"ã«ã‚ˆ":-7236,"ã«å¯¾":-14943,"ã«å¾“":-4688,"ã«é–¢":-11388,"ã®ã‹":2093,"ã®ã§":-7059,"ã®ã«":-6041,"ã®ã®":-6125,"ã¯ã„":1073,"ã¯ãŒ":-1033,"ã¯ãš":-2532,"ã°ã‚Œ":1813,"ã¾ã—":-1316,"ã¾ã§":-6621,"ã¾ã‚Œ":5409,"ã‚ã¦":-3153,"ã‚‚ã„":2230,"ã‚‚ã®":-10713,"らã‹":-944,"らã—":-1611,"らã«":-1897,"ã‚Šã—":651,"ã‚Šã¾":1620,"ã‚ŒãŸ":4270,"ã‚Œã¦":849,"ã‚Œã°":4114,"ã‚ã†":6067,"ã‚ã‚Œ":7901,"を通":-11877,"ã‚“ã ":728,"ã‚“ãª":-4115,"一人":602,"一方":-1375,"一日":970,"一部":-1051,"上ãŒ":-4479,"会社":-1116,"出ã¦":2163,"分ã®":-7758,"åŒå…š":970,"åŒæ—¥":-913,"大阪":-2471,"委員":-1250,"å°‘ãª":-1050,"年度":-8669,"å¹´é–“":-1626,"府県":-2363,"手権":-1982,"æ–°èž":-4066,"日新":-722,"日本":-7068,"日米":3372,"曜日":-601,"æœé®®":-2355,"本人":-2697,"æ±äº¬":-1543,"然ã¨":-1384,"社会":-1276,"ç«‹ã¦":-990,"第ã«":-1612,"米国":-4268,"11":-669}; | |
this.BW3__ = {"ã‚ãŸ":-2194,"ã‚ã‚Š":719,"ã‚ã‚‹":3846,"ã„.":-1185,"ã„。":-1185,"ã„ã„":5308,"ã„ãˆ":2079,"ã„ã":3029,"ã„ãŸ":2056,"ã„ã£":1883,"ã„ã‚‹":5600,"ã„ã‚":1527,"ã†ã¡":1117,"ã†ã¨":4798,"ãˆã¨":1454,"ã‹.":2857,"ã‹ã€‚":2857,"ã‹ã‘":-743,"ã‹ã£":-4098,"ã‹ã«":-669,"ã‹ã‚‰":6520,"ã‹ã‚Š":-2670,"ãŒ,":1816,"ãŒã€":1816,"ãŒã":-4855,"ãŒã‘":-1127,"ãŒã£":-913,"ãŒã‚‰":-4977,"ãŒã‚Š":-2064,"ããŸ":1645,"ã‘ã©":1374,"ã“ã¨":7397,"ã“ã®":1542,"ã“ã‚":-2757,"ã•ã„":-714,"ã•ã‚’":976,"ã—,":1557,"ã—ã€":1557,"ã—ã„":-3714,"ã—ãŸ":3562,"ã—ã¦":1449,"ã—ãª":2608,"ã—ã¾":1200,"ã™.":-1310,"ã™ã€‚":-1310,"ã™ã‚‹":6521,"ãš,":3426,"ãšã€":3426,"ãšã«":841,"ãã†":428,"ãŸ.":8875,"ãŸã€‚":8875,"ãŸã„":-594,"ãŸã®":812,"ãŸã‚Š":-1183,"ãŸã‚‹":-853,"ã .":4098,"ã 。":4098,"ã ã£":1004,"ã£ãŸ":-4748,"ã£ã¦":300,"ã¦ã„":6240,"ã¦ãŠ":855,"ã¦ã‚‚":302,"ã§ã™":1437,"ã§ã«":-1482,"ã§ã¯":2295,"ã¨ã†":-1387,"ã¨ã—":2266,"ã¨ã®":541,"ã¨ã‚‚":-3543,"ã©ã†":4664,"ãªã„":1796,"ãªã":-903,"ãªã©":2135,"ã«,":-1021,"ã«ã€":-1021,"ã«ã—":1771,"ã«ãª":1906,"ã«ã¯":2644,"ã®,":-724,"ã®ã€":-724,"ã®å":-1000,"ã¯,":1337,"ã¯ã€":1337,"ã¹ã":2181,"ã¾ã—":1113,"ã¾ã™":6943,"ã¾ã£":-1549,"ã¾ã§":6154,"ã¾ã‚Œ":-793,"らã—":1479,"られ":6820,"ã‚‹ã‚‹":3818,"ã‚Œ,":854,"ã‚Œã€":854,"ã‚ŒãŸ":1850,"ã‚Œã¦":1375,"ã‚Œã°":-3246,"れる":1091,"ã‚ã‚Œ":-605,"ã‚“ã ":606,"ã‚“ã§":798,"カ月":990,"会è°":860,"入り":1232,"大会":2217,"始ã‚":1681,"市":965,"æ–°èž":-5055,"æ—¥,":974,"æ—¥ã€":974,"社会":2024,"カ月":990}; | |
this.TC1__ = {"AAA":1093,"HHH":1029,"HHM":580,"HII":998,"HOH":-390,"HOM":-331,"IHI":1169,"IOH":-142,"IOI":-1015,"IOM":467,"MMH":187,"OOI":-1832}; | |
this.TC2__ = {"HHO":2088,"HII":-1023,"HMM":-1154,"IHI":-1965,"KKH":703,"OII":-2649}; | |
this.TC3__ = {"AAA":-294,"HHH":346,"HHI":-341,"HII":-1088,"HIK":731,"HOH":-1486,"IHH":128,"IHI":-3041,"IHO":-1935,"IIH":-825,"IIM":-1035,"IOI":-542,"KHH":-1216,"KKA":491,"KKH":-1217,"KOK":-1009,"MHH":-2694,"MHM":-457,"MHO":123,"MMH":-471,"NNH":-1689,"NNO":662,"OHO":-3393}; | |
this.TC4__ = {"HHH":-203,"HHI":1344,"HHK":365,"HHM":-122,"HHN":182,"HHO":669,"HIH":804,"HII":679,"HOH":446,"IHH":695,"IHO":-2324,"IIH":321,"III":1497,"IIO":656,"IOO":54,"KAK":4845,"KKA":3386,"KKK":3065,"MHH":-405,"MHI":201,"MMH":-241,"MMM":661,"MOM":841}; | |
this.TQ1__ = {"BHHH":-227,"BHHI":316,"BHIH":-132,"BIHH":60,"BIII":1595,"BNHH":-744,"BOHH":225,"BOOO":-908,"OAKK":482,"OHHH":281,"OHIH":249,"OIHI":200,"OIIH":-68}; | |
this.TQ2__ = {"BIHH":-1401,"BIII":-1033,"BKAK":-543,"BOOO":-5591}; | |
this.TQ3__ = {"BHHH":478,"BHHM":-1073,"BHIH":222,"BHII":-504,"BIIH":-116,"BIII":-105,"BMHI":-863,"BMHM":-464,"BOMH":620,"OHHH":346,"OHHI":1729,"OHII":997,"OHMH":481,"OIHH":623,"OIIH":1344,"OKAK":2792,"OKHH":587,"OKKA":679,"OOHH":110,"OOII":-685}; | |
this.TQ4__ = {"BHHH":-721,"BHHM":-3604,"BHII":-966,"BIIH":-607,"BIII":-2181,"OAAA":-2763,"OAKK":180,"OHHH":-294,"OHHI":2446,"OHHO":480,"OHIH":-1573,"OIHH":1935,"OIHI":-493,"OIIH":626,"OIII":-4007,"OKAK":-8156}; | |
this.TW1__ = {"ã«ã¤ã„":-4681,"æ±äº¬éƒ½":2026}; | |
this.TW2__ = {"ã‚る程":-2049,"ã„ã£ãŸ":-1256,"ã“ã‚ãŒ":-2434,"ã—ょã†":3873,"ãã®å¾Œ":-4430,"ã ã£ã¦":-1049,"ã¦ã„ãŸ":1833,"ã¨ã—ã¦":-4657,"ã¨ã‚‚ã«":-4517,"ã‚‚ã®ã§":1882,"一気ã«":-792,"åˆã‚ã¦":-1512,"åŒæ™‚ã«":-8097,"大ããª":-1255,"対ã—ã¦":-2721,"社会党":-3216}; | |
this.TW3__ = {"ã„ãŸã ":-1734,"ã—ã¦ã„":1314,"ã¨ã—ã¦":-4314,"ã«ã¤ã„":-5483,"ã«ã¨ã£":-5989,"ã«å½“ãŸ":-6247,"ã®ã§,":-727,"ã®ã§ã€":-727,"ã®ã‚‚ã®":-600,"ã‚Œã‹ã‚‰":-3752,"å二月":-2287}; | |
this.TW4__ = {"ã„ã†.":8576,"ã„ã†ã€‚":8576,"ã‹ã‚‰ãª":-2348,"ã—ã¦ã„":2958,"ãŸãŒ,":1516,"ãŸãŒã€":1516,"ã¦ã„ã‚‹":1538,"ã¨ã„ã†":1349,"ã¾ã—ãŸ":5543,"ã¾ã›ã‚“":1097,"よã†ã¨":-4258,"よるã¨":5865}; | |
this.UC1__ = {"A":484,"K":93,"M":645,"O":-505}; | |
this.UC2__ = {"A":819,"H":1059,"I":409,"M":3987,"N":5775,"O":646}; | |
this.UC3__ = {"A":-1370,"I":2311}; | |
this.UC4__ = {"A":-2643,"H":1809,"I":-1032,"K":-3450,"M":3565,"N":3876,"O":6646}; | |
this.UC5__ = {"H":313,"I":-1238,"K":-799,"M":539,"O":-831}; | |
this.UC6__ = {"H":-506,"I":-253,"K":87,"M":247,"O":-387}; | |
this.UP1__ = {"O":-214}; | |
this.UP2__ = {"B":69,"O":935}; | |
this.UP3__ = {"B":189}; | |
this.UQ1__ = {"BH":21,"BI":-12,"BK":-99,"BN":142,"BO":-56,"OH":-95,"OI":477,"OK":410,"OO":-2422}; | |
this.UQ2__ = {"BH":216,"BI":113,"OK":1759}; | |
this.UQ3__ = {"BA":-479,"BH":42,"BI":1913,"BK":-7198,"BM":3160,"BN":6427,"BO":14761,"OI":-827,"ON":-3212}; | |
this.UW1__ = {",":156,"ã€":156,"「":-463,"ã‚":-941,"ã†":-127,"ãŒ":-553,"ã":121,"ã“":505,"ã§":-201,"ã¨":-547,"ã©":-123,"ã«":-789,"ã®":-185,"ã¯":-847,"ã‚‚":-466,"ã‚„":-470,"よ":182,"ら":-292,"ã‚Š":208,"ã‚Œ":169,"ã‚’":-446,"ã‚“":-137,"・":-135,"主":-402,"京":-268,"区":-912,"åˆ":871,"国":-460,"大":561,"委":729,"市":-411,"æ—¥":-141,"ç†":361,"生":-408,"県":-386,"都":-718,"ï½¢":-463,"ï½¥":-135}; | |
this.UW2__ = {",":-829,"ã€":-829,"〇":892,"「":-645,"ã€":3145,"ã‚":-538,"ã„":505,"ã†":134,"ãŠ":-502,"ã‹":1454,"ãŒ":-856,"ã":-412,"ã“":1141,"ã•":878,"ã–":540,"ã—":1529,"ã™":-675,"ã›":300,"ã":-1011,"ãŸ":188,"ã ":1837,"ã¤":-949,"ã¦":-291,"ã§":-268,"ã¨":-981,"ã©":1273,"ãª":1063,"ã«":-1764,"ã®":130,"ã¯":-409,"ã²":-1273,"ã¹":1261,"ã¾":600,"ã‚‚":-1263,"ã‚„":-402,"よ":1639,"ã‚Š":-579,"ã‚‹":-694,"ã‚Œ":571,"ã‚’":-2516,"ã‚“":2095,"ã‚¢":-587,"ã‚«":306,"ã‚":568,"ッ":831,"三":-758,"ä¸":-2150,"世":-302,"ä¸":-968,"主":-861,"事":492,"人":-123,"会":978,"ä¿":362,"å…¥":548,"åˆ":-3025,"副":-1566,"北":-3414,"区":-422,"大":-1769,"天":-865,"太":-483,"å":-1519,"å¦":760,"実":1023,"å°":-2009,"市":-813,"å¹´":-1060,"å¼·":1067,"手":-1519,"æº":-1033,"政":1522,"æ–‡":-1355,"æ–°":-1682,"æ—¥":-1815,"明":-1462,"最":-630,"æœ":-1843,"本":-1650,"æ±":-931,"æžœ":-665,"次":-2378,"æ°‘":-180,"æ°—":-1740,"ç†":752,"発":529,"ç›®":-1584,"相":-242,"県":-1165,"ç«‹":-763,"第":810,"ç±³":509,"自":-1353,"è¡Œ":838,"西":-744,"見":-3874,"調":1010,"è°":1198,"è¾¼":3041,"é–‹":1758,"é–“":-1257,"ï½¢":-645,"ï½£":3145,"ッ":831,"ï½±":-587,"カ":306,"ï½·":568}; | |
this.UW3__ = {",":4889,"1":-800,"−":-1723,"ã€":4889,"々":-2311,"〇":5827,"ã€":2670,"〓":-3573,"ã‚":-2696,"ã„":1006,"ã†":2342,"ãˆ":1983,"ãŠ":-4864,"ã‹":-1163,"ãŒ":3271,"ã":1004,"ã‘":388,"ã’":401,"ã“":-3552,"ã”":-3116,"ã•":-1058,"ã—":-395,"ã™":584,"ã›":3685,"ã":-5228,"ãŸ":842,"ã¡":-521,"ã£":-1444,"ã¤":-1081,"ã¦":6167,"ã§":2318,"ã¨":1691,"ã©":-899,"ãª":-2788,"ã«":2745,"ã®":4056,"ã¯":4555,"ã²":-2171,"ãµ":-1798,"ã¸":1199,"ã»":-5516,"ã¾":-4384,"ã¿":-120,"ã‚":1205,"ã‚‚":2323,"ã‚„":-788,"よ":-202,"ら":727,"ã‚Š":649,"ã‚‹":5905,"ã‚Œ":2773,"ã‚":-1207,"ã‚’":6620,"ã‚“":-518,"ã‚¢":551,"ã‚°":1319,"ス":874,"ッ":-1350,"ト":521,"ム":1109,"ル":1591,"ãƒ":2201,"ン":278,"・":-3794,"一":-1619,"下":-1759,"世":-2087,"両":3815,"ä¸":653,"主":-758,"予":-1193,"二":974,"人":2742,"今":792,"ä»–":1889,"以":-1368,"低":811,"何":4265,"作":-361,"ä¿":-2439,"å…ƒ":4858,"å…š":3593,"å…¨":1574,"å…¬":-3030,"å…":755,"å…±":-1880,"円":5807,"å†":3095,"分":457,"åˆ":2475,"別":1129,"å‰":2286,"副":4437,"力":365,"å‹•":-949,"å‹™":-1872,"化":1327,"北":-1038,"区":4646,"åƒ":-2309,"åˆ":-783,"å”":-1006,"å£":483,"å³":1233,"å„":3588,"åˆ":-241,"åŒ":3906,"å’Œ":-837,"å“¡":4513,"国":642,"åž‹":1389,"å ´":1219,"外":-241,"妻":2016,"å¦":-1356,"安":-423,"実":-1008,"家":1078,"å°":-513,"å°‘":-3102,"å·ž":1155,"市":3197,"å¹³":-1804,"å¹´":2416,"広":-1030,"府":1605,"度":1452,"建":-2352,"当":-3885,"å¾—":1905,"æ€":-1291,"性":1822,"戸":-488,"指":-3973,"政":-2013,"æ•™":-1479,"æ•°":3222,"æ–‡":-1489,"æ–°":1764,"æ—¥":2099,"æ—§":5792,"昨":-661,"時":-1248,"曜":-951,"最":-937,"月":4125,"期":360,"æŽ":3094,"æ‘":364,"æ±":-805,"æ 
¸":5156,"森":2438,"æ¥":484,"æ°":2613,"æ°‘":-1694,"決":-1073,"法":1868,"æµ·":-495,"ç„¡":979,"物":461,"特":-3850,"生":-273,"用":914,"町":1215,"çš„":7313,"ç›´":-1835,"çœ":792,"県":6293,"知":-1528,"ç§":4231,"税":401,"ç«‹":-960,"第":1201,"ç±³":7767,"ç³»":3066,"ç´„":3663,"ç´š":1384,"çµ±":-4229,"ç·":1163,"ç·š":1255,"者":6457,"能":725,"自":-2869,"英":785,"見":1044,"調":-562,"財":-733,"è²»":1777,"車":1835,"è»":1375,"è¾¼":-1504,"通":-1136,"é¸":-681,"郎":1026,"郡":4404,"部":1200,"金":2163,"é•·":421,"é–‹":-1432,"é–“":1302,"é–¢":-1282,"雨":2009,"é›»":-1045,"éž":2066,"駅":1620,"1":-800,"ï½£":2670,"ï½¥":-3794,"ッ":-1350,"ï½±":551,"グ":1319,"ï½½":874,"ト":521,"ム":1109,"ï¾™":1591,"ï¾›":2201,"ï¾":278}; | |
this.UW4__ = {",":3930,".":3508,"―":-4841,"ã€":3930,"。":3508,"〇":4999,"「":1895,"ã€":3798,"〓":-5156,"ã‚":4752,"ã„":-3435,"ã†":-640,"ãˆ":-2514,"ãŠ":2405,"ã‹":530,"ãŒ":6006,"ã":-4482,"ãŽ":-3821,"ã":-3788,"ã‘":-4376,"ã’":-4734,"ã“":2255,"ã”":1979,"ã•":2864,"ã—":-843,"ã˜":-2506,"ã™":-731,"ãš":1251,"ã›":181,"ã":4091,"ãŸ":5034,"ã ":5408,"ã¡":-3654,"ã£":-5882,"ã¤":-1659,"ã¦":3994,"ã§":7410,"ã¨":4547,"ãª":5433,"ã«":6499,"ã¬":1853,"ã":1413,"ã®":7396,"ã¯":8578,"ã°":1940,"ã²":4249,"ã³":-4134,"ãµ":1345,"ã¸":6665,"ã¹":-744,"ã»":1464,"ã¾":1051,"ã¿":-2082,"ã‚€":-882,"ã‚":-5046,"ã‚‚":4169,"ゃ":-2666,"ã‚„":2795,"ょ":-1544,"よ":3351,"ら":-2922,"ã‚Š":-9726,"ã‚‹":-14896,"ã‚Œ":-2613,"ã‚":-4570,"ã‚":-1783,"ã‚’":13150,"ã‚“":-2352,"ã‚«":2145,"コ":1789,"ã‚»":1287,"ッ":-724,"ト":-403,"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637,"・":-4371,"ー":-11870,"一":-2069,"ä¸":2210,"予":782,"事":-190,"井":-1768,"人":1036,"以":544,"会":950,"体":-1286,"作":530,"å´":4292,"å…ˆ":601,"å…š":-2006,"å…±":-1212,"内":584,"円":788,"åˆ":1347,"å‰":1623,"副":3879,"力":-302,"å‹•":-740,"å‹™":-2715,"化":776,"区":4517,"å”":1013,"å‚":1555,"åˆ":-1834,"å’Œ":-681,"å“¡":-910,"器":-851,"回":1500,"国":-619,"園":-1200,"地":866,"å ´":-1410,"å¡":-2094,"士":-1413,"多":1067,"大":571,"å":-4802,"å¦":-1397,"定":-1057,"寺":-809,"å°":1910,"屋":-1328,"å±±":-1500,"島":-2056,"å·":-2667,"市":2771,"å¹´":374,"åº":-4556,"後":456,"性":553,"æ„Ÿ":916,"所":-1566,"支":856,"改":787,"政":2182,"æ•™":704,"æ–‡":522,"æ–¹":-856,"æ—¥":1798,"時":1829,"最":845,"月":-9066,"木":-485,"æ¥":-442,"æ 
¡":-360,"æ¥":-1043,"æ°":5388,"æ°‘":-2716,"æ°—":-910,"æ²¢":-939,"済":-543,"物":-735,"率":672,"çƒ":-1267,"生":-1286,"産":-1101,"ç”°":-2900,"町":1826,"çš„":2586,"ç›®":922,"çœ":-3485,"県":2997,"空":-867,"ç«‹":-2112,"第":788,"ç±³":2937,"ç³»":786,"ç´„":2171,"経":1146,"çµ±":-1169,"ç·":940,"ç·š":-994,"ç½²":749,"者":2145,"能":-730,"般":-852,"è¡Œ":-792,"è¦":792,"è¦":-1184,"è°":-244,"è°·":-1000,"賞":730,"車":-1481,"è»":1158,"輪":-1433,"è¾¼":-3370,"è¿‘":929,"é“":-1291,"é¸":2596,"郎":-4866,"都":1192,"野":-1100,"銀":-2213,"é•·":357,"é–“":-2344,"院":-2297,"éš›":-2604,"é›»":-878,"é ˜":-1659,"é¡Œ":-792,"館":-1984,"首":1749,"高":2120,"ï½¢":1895,"ï½£":3798,"ï½¥":-4371,"ッ":-724,"ï½°":-11870,"カ":2145,"コ":1789,"ï½¾":1287,"ト":-403,"ï¾’":-1635,"ï¾—":-881,"リ":-541,"ï¾™":-856,"ï¾":-3637}; | |
this.UW5__ = {",":465,".":-299,"1":-514,"E2":-32768,"]":-2762,"ã€":465,"。":-299,"「":363,"ã‚":1655,"ã„":331,"ã†":-503,"ãˆ":1199,"ãŠ":527,"ã‹":647,"ãŒ":-421,"ã":1624,"ãŽ":1971,"ã":312,"ã’":-983,"ã•":-1537,"ã—":-1371,"ã™":-852,"ã ":-1186,"ã¡":1093,"ã£":52,"ã¤":921,"ã¦":-18,"ã§":-850,"ã¨":-127,"ã©":1682,"ãª":-787,"ã«":-1224,"ã®":-635,"ã¯":-578,"ã¹":1001,"ã¿":502,"ã‚":865,"ゃ":3350,"ょ":854,"ã‚Š":-208,"ã‚‹":429,"ã‚Œ":504,"ã‚":419,"ã‚’":-1264,"ã‚“":327,"イ":241,"ル":451,"ン":-343,"ä¸":-871,"京":722,"会":-1153,"å…š":-654,"å‹™":3519,"区":-901,"å‘Š":848,"å“¡":2104,"大":-1296,"å¦":-548,"定":1785,"åµ":-1304,"市":-2991,"å¸":921,"å¹´":1763,"æ€":872,"所":-814,"挙":1618,"æ–°":-1682,"æ—¥":218,"月":-4353,"査":932,"æ ¼":1356,"æ©Ÿ":-1508,"æ°":-1347,"ç”°":240,"町":-3912,"çš„":-3149,"相":1319,"çœ":-1052,"県":-4003,"ç ”":-997,"社":-278,"空":-813,"çµ±":1955,"者":-2233,"表":663,"語":-1073,"è°":1219,"é¸":-1018,"郎":-368,"é•·":786,"é–“":1191,"é¡Œ":2368,"館":-689,"1":-514,"E2":-32768,"ï½¢":363,"ï½²":241,"ï¾™":451,"ï¾":-343}; | |
this.UW6__ = {",":227,".":808,"1":-270,"E1":306,"ã€":227,"。":808,"ã‚":-307,"ã†":189,"ã‹":241,"ãŒ":-73,"ã":-121,"ã“":-200,"ã˜":1782,"ã™":383,"ãŸ":-428,"ã£":573,"ã¦":-1014,"ã§":101,"ã¨":-105,"ãª":-253,"ã«":-149,"ã®":-417,"ã¯":-236,"ã‚‚":-206,"ã‚Š":187,"ã‚‹":-135,"ã‚’":195,"ル":-673,"ン":-496,"一":-277,"ä¸":201,"件":-800,"会":624,"å‰":302,"区":1792,"å“¡":-1212,"委":798,"å¦":-960,"市":887,"広":-695,"後":535,"æ¥":-697,"相":753,"社":-507,"ç¦":974,"空":-822,"者":1811,"連":463,"郎":1082,"1":-270,"E1":306,"ï¾™":-673,"ï¾":-496}; | |
return this; | |
} | |
/**
 * Classify a character into one of the segmenter's character-class labels.
 *
 * Walks the [RegExp, label] pairs stored in this.chartype_ in order and
 * returns the label of the first pattern that matches.
 *
 * @param {string} str - the character (or short string) to classify.
 * @returns {string} the label of the first matching pattern, or "O" when no
 *     pattern matches.
 */
TinySegmenter.prototype.ctype_ = function(str) {
  // Indexed loop instead of for...in: for...in also visits enumerable
  // properties added to Array.prototype, which are not [RegExp, label] pairs.
  for (var i = 0; i < this.chartype_.length; ++i) {
    if (str.match(this.chartype_[i][0])) {
      return this.chartype_[i][1];
    }
  }
  return "O";
};
/**
 * Turn a weight-table lookup into a value that can be added to a score.
 *
 * @param {*} v - value read from a table; undefined when the key is absent.
 * @returns {*} v when it is truthy, otherwise 0.
 */
TinySegmenter.prototype.ts_ = function(v) {
  return v || 0;
};
/**
 * Split text into segments.
 *
 * Every character after the first is scored by summing this.BIAS__ with the
 * entries found in the instance's weight tables for the surrounding
 * characters (w1..w6), their character classes (c1..c6) and the three
 * previous boundary decisions (p1..p3). A score greater than 0 starts a new
 * segment in front of that character.
 *
 * @param {string} input - text to segment. Non-string values are converted
 *     with String() before processing.
 * @returns {string[]} the segments in order; an empty array when input is
 *     null, undefined, "" or anything else that loosely equals "".
 */
TinySegmenter.prototype.segment = function(input) {
  // Loose comparison kept on purpose so that every value the original guard
  // treated as empty still produces [].
  if (input == null || input == "") {
    return [];
  }
  var text = String(input);
  var result = [];
  // Three sentinel entries pad each end so the w1..w6 / c1..c6 window never
  // reads outside the arrays.
  var seg = ["B3", "B2", "B1"];
  var ctype = ["O", "O", "O"];
  var i;

  // Split into characters, keeping a high+low surrogate pair together so a
  // character outside the BMP can never be cut in two by a boundary.
  for (i = 0; i < text.length; ++i) {
    var ch = text.charAt(i);
    var unit = text.charCodeAt(i);
    if (unit >= 0xD800 && unit <= 0xDBFF && i + 1 < text.length) {
      var nextUnit = text.charCodeAt(i + 1);
      if (nextUnit >= 0xDC00 && nextUnit <= 0xDFFF) {
        ch += text.charAt(i + 1);
        ++i;
      }
    }
    seg.push(ch);
    ctype.push(this.ctype_(ch));
  }
  seg.push("E1", "E2", "E3");
  ctype.push("O", "O", "O");

  // The first real character always opens the first segment.
  var word = seg[3];
  // Previous three boundary decisions: "U" before any decision was made,
  // then "B" (boundary placed) or "O" (no boundary).
  var p1 = "U";
  var p2 = "U";
  var p3 = "U";

  for (i = 4; i < seg.length - 3; ++i) {
    var score = this.BIAS__;
    var w1 = seg[i - 3];
    var w2 = seg[i - 2];
    var w3 = seg[i - 1];
    var w4 = seg[i];
    var w5 = seg[i + 1];
    var w6 = seg[i + 2];
    var c1 = ctype[i - 3];
    var c2 = ctype[i - 2];
    var c3 = ctype[i - 1];
    var c4 = ctype[i];
    var c5 = ctype[i + 1];
    var c6 = ctype[i + 2];

    // Previous-decision features.
    score += this.ts_(this.UP1__[p1]);
    score += this.ts_(this.UP2__[p2]);
    score += this.ts_(this.UP3__[p3]);
    score += this.ts_(this.BP1__[p1 + p2]);
    score += this.ts_(this.BP2__[p2 + p3]);

    // Character features: single, pairs, triples.
    score += this.ts_(this.UW1__[w1]);
    score += this.ts_(this.UW2__[w2]);
    score += this.ts_(this.UW3__[w3]);
    score += this.ts_(this.UW4__[w4]);
    score += this.ts_(this.UW5__[w5]);
    score += this.ts_(this.UW6__[w6]);
    score += this.ts_(this.BW1__[w2 + w3]);
    score += this.ts_(this.BW2__[w3 + w4]);
    score += this.ts_(this.BW3__[w4 + w5]);
    score += this.ts_(this.TW1__[w1 + w2 + w3]);
    score += this.ts_(this.TW2__[w2 + w3 + w4]);
    score += this.ts_(this.TW3__[w3 + w4 + w5]);
    score += this.ts_(this.TW4__[w4 + w5 + w6]);

    // Character-class features: single, pairs, triples.
    score += this.ts_(this.UC1__[c1]);
    score += this.ts_(this.UC2__[c2]);
    score += this.ts_(this.UC3__[c3]);
    score += this.ts_(this.UC4__[c4]);
    score += this.ts_(this.UC5__[c5]);
    score += this.ts_(this.UC6__[c6]);
    score += this.ts_(this.BC1__[c2 + c3]);
    score += this.ts_(this.BC2__[c3 + c4]);
    score += this.ts_(this.BC3__[c4 + c5]);
    score += this.ts_(this.TC1__[c1 + c2 + c3]);
    score += this.ts_(this.TC2__[c2 + c3 + c4]);
    score += this.ts_(this.TC3__[c3 + c4 + c5]);
    score += this.ts_(this.TC4__[c4 + c5 + c6]);

    // Previous decisions combined with character classes.
    score += this.ts_(this.UQ1__[p1 + c1]);
    score += this.ts_(this.UQ2__[p2 + c2]);
    score += this.ts_(this.UQ3__[p3 + c3]);
    score += this.ts_(this.BQ1__[p2 + c2 + c3]);
    score += this.ts_(this.BQ2__[p2 + c3 + c4]);
    score += this.ts_(this.BQ3__[p3 + c2 + c3]);
    score += this.ts_(this.BQ4__[p3 + c3 + c4]);
    score += this.ts_(this.TQ1__[p2 + c1 + c2 + c3]);
    score += this.ts_(this.TQ2__[p2 + c2 + c3 + c4]);
    score += this.ts_(this.TQ3__[p3 + c1 + c2 + c3]);
    score += this.ts_(this.TQ4__[p3 + c2 + c3 + c4]);

    var p = "O";
    if (score > 0) {
      // Boundary: close the current segment before appending seg[i].
      result.push(word);
      word = "";
      p = "B";
    }
    p1 = p2;
    p2 = p3;
    p3 = p;
    word += seg[i];
  }
  result.push(word);
  return result;
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
// TinySegmenter 0.1 -- Super compact Japanese tokenizer in Javascript
// (c) 2008 Taku Kudo taku@chasen.org
// TinySegmenter is freely distributable under the terms of a new BSD licence.
// For details, see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt