Create a gist now

Instantly share code, notes, and snippets.

TinySegmenter written in D
// Written in the D programming language.
import std.datetime;
import std.stdio;
import std.conv;
import std.file;
import tinysegmenter;
/**
 * Benchmark driver: loads the sample text once and times 100 consecutive
 * calls of TinySegmenter.segment over it.
 */
void main()
{
    // readText validates that the file is well-formed UTF-8 and throws on
    // malformed input, instead of blindly reinterpreting the raw bytes as a
    // string the way cast(string)read(...) does.
    string text = readText("timemachineu8j.txt");

    // Workload measured by benchmark: segment the whole text 100 times.
    void f()
    {
        foreach (i; 0..100) {
            TinySegmenter.segment(text);
        }
    }

    // Run the workload once and print the elapsed time of that single run.
    auto times = benchmark!(f)(1);
    writeln("segment: ", to!Duration(times[0]));
}
// Written in the D programming language.
/**
* TinySegmenter written in D.
*
* Original version is here: $(WEB chasen.org/~taku/software/TinySegmenter/,
* TinySegmenter : Javascriptだけで書かれたコンパクトな分かち書きソフトウェア).
*
* Example:
* -----
* auto result = TinySegmenter.segment("それD言語で出来るよ");
* result.join("|") //-> "それ|D|言語|で|出来る|よ"
* -----
*
* Copyright: Copyright Masahiro Nakagawa 2010-.
* License: <a href="http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt">New BSD License</a>.
* Authors: Masahiro Nakagawa
*/
module tinysegmenter;
import std.algorithm : map;
import std.array : Appender;
import std.range : isInputRange;
import std.utf : decode, count, isValidDchar;
import std.stdio; // writeln
/**
* TinySegmenter is a Super compact Japanese tokenizer.
*/
struct TinySegmenter
{
private:
// Model data read by scoreFor and getCharType. Every table is filled in once
// by the shared static constructor of this struct.
static immutable
{
// Constant term every score starts from.
sizediff_t BIAS = -332;
// Keyed by a single boundary tag (scoreFor looks them up with p1, p2, p3).
sizediff_t[string] UP1, UP2, UP3;
// Keyed by a single character-type tag (looked up with c1 .. c6).
sizediff_t[string] UC1, UC2, UC3, UC4, UC5, UC6;
// Keyed by two consecutive boundary tags (p1 p2 and p2 p3).
sizediff_t[string] BP1, BP2;
// Keyed by two consecutive character-type tags (c2 c3, c3 c4, c4 c5).
sizediff_t[string] BC1, BC2, BC3;
// Keyed by one boundary tag followed by one character-type tag.
sizediff_t[string] UQ1, UQ2, UQ3;
// Keyed by three consecutive character-type tags.
sizediff_t[string] TC1, TC2, TC3, TC4;
// Keyed by one boundary tag followed by two character-type tags.
sizediff_t[string] BQ1, BQ2, BQ3, BQ4;
// Keyed by one boundary tag followed by three character-type tags.
sizediff_t[string] TQ1, TQ2, TQ3, TQ4;
// Keyed by one segment of the input (looked up with w1 .. w6).
sizediff_t[string] UW1, UW2, UW3, UW4, UW5, UW6;
// Keyed by two adjacent segments concatenated.
sizediff_t[string] BW1, BW2, BW3;
// Keyed by three adjacent segments concatenated.
sizediff_t[string] TW1, TW2, TW3, TW4;
// Code point -> character-type tag; built by buildCharTypeMap.
char[dchar] charTypeMap;
}
// Module constructor: fills every weight table of the model and builds the
// character-type map. Each table maps a string key to a signed weight that
// scoreFor adds to the running score when the key is present.
//
// NOTE(review): in this copy many keys of the UW1..UW6 tables (and one key in
// BW3) are empty strings, and several tables repeat the same key. This looks
// like characters were lost when the file was copied - TODO restore the keys
// from the upstream model before relying on these tables.
@safe
shared static this()
{
// Tables keyed by boundary tags and character-type tags.
UP1 = ["O":-214];
UP2 = ["B":69,"O":935];
UP3 = ["B":189];
UC1 = ["A":484,"K":93,"M":645,"O":-505];
UC2 = ["A":819,"H":1059,"I":409,"M":3987,"N":5775,"O":646];
UC3 = ["A":-1370,"I":2311];
UC4 = ["A":-2643,"H":1809,"I":-1032,"K":-3450,"M":3565,"N":3876,"O":6646];
UC5 = ["H":313,"I":-1238,"K":-799,"M":539,"O":-831];
UC6 = ["H":-506,"I":-253,"K":87,"M":247,"O":-387];
BP1 = ["BB":295,"OB":304,"OO":-125,"UB":352];
BP2 = ["BO":60,"OO":-1762];
BC1 = ["HH":6,"II":2461,"KH":406,"OH":-1378];
BC2 = ["AA":-3267,"AI":2744,"AN":-878,"HH":-4070,"HM":-1711,"HN":4012,"HO":3761,"IA":1327,
"IH":-1184,"II":-1332,"IK":1721,"IO":5492,"KI":3831,"KK":-8741,"MH":-3132,"MK":3334,"OO":-2920];
BC3 = ["HH":996,"HI":626,"HK":-721,"HN":-1307,"HO":-836,"IH":-301,"KK":2762,"MK":1079,"MM":4034,"OA":-1652,"OH":266];
UQ1 = ["BH":21,"BI":-12,"BK":-99,"BN":142,"BO":-56,"OH":-95,"OI":477,"OK":410,"OO":-2422];
UQ2 = ["BH":216,"BI":113,"OK":1759];
UQ3 = ["BA":-479,"BH":42,"BI":1913,"BK":-7198,"BM":3160,"BN":6427,"BO":14761,"OI":-827,"ON":-3212];
TC1 = ["AAA":1093,"HHH":1029,"HHM":580,"HII":998,"HOH":-390,"HOM":-331,"IHI":1169,
"IOH":-142,"IOI":-1015,"IOM":467,"MMH":187,"OOI":-1832];
TC2 = ["HHO":2088,"HII":-1023,"HMM":-1154,"IHI":-1965,"KKH":703,"OII":-2649];
TC3 = ["AAA":-294,"HHH":346,"HHI":-341,"HII":-1088,"HIK":731,"HOH":-1486,"IHH":128,
"IHI":-3041,"IHO":-1935,"IIH":-825,"IIM":-1035,"IOI":-542,"KHH":-1216,"KKA":491,
"KKH":-1217,"KOK":-1009,"MHH":-2694,"MHM":-457,"MHO":123,"MMH":-471,"NNH":-1689,"NNO":662,"OHO":-3393];
TC4 = ["HHH":-203,"HHI":1344,"HHK":365,"HHM":-122,"HHN":182,"HHO":669,"HIH":804,
"HII":679,"HOH":446,"IHH":695,"IHO":-2324,"IIH":321,"III":1497,"IIO":656,"IOO":54,"KAK":4845,
"KKA":3386,"KKK":3065,"MHH":-405,"MHI":201,"MMH":-241,"MMM":661,"MOM":841];
BQ1 = ["BHH":1150,"BHM":1521,"BII":-1158,"BIM":886,"BMH":1208,"BNH":449,"BOH":-91,
"BOO":-2597,"OHI":451,"OIH":-296,"OKA":1851,"OKH":-1020,"OKK":904,"OOO":2965];
BQ2 = ["BHH":118,"BHI":-1159,"BHM":466,"BIH":-919,"BKK":-1720,"BKO":864,"OHH":-1139,"OHM":-181,"OIH":153,"UHI":-1146];
BQ3 = ["BHH":-792,"BHI":2664,"BII":-299,"BKI":419,"BMH":937,"BMM":8335,"BNN":998,"BOH":775,
"OHH":2174,"OHM":439,"OII":280,"OKH":1798,"OKI":-793,"OKO":-2242,"OMH":-2402,"OOO":11699];
BQ4 = ["BHH":-3895,"BIH":3761,"BII":-4654,"BIK":1348,"BKK":-1806,"BMI":-3385,
"BOO":-12396,"OAH":926,"OHH":266,"OHK":-2036,"ONN":-973];
TQ1 = ["BHHH":-227,"BHHI":316,"BHIH":-132,"BIHH":60,"BIII":1595,"BNHH":-744,"BOHH":225,
"BOOO":-908,"OAKK":482,"OHHH":281,"OHIH":249,"OIHI":200,"OIIH":-68];
TQ2 = ["BIHH":-1401,"BIII":-1033,"BKAK":-543,"BOOO":-5591];
TQ3 = ["BHHH":478,"BHHM":-1073,"BHIH":222,"BHII":-504,"BIIH":-116,"BIII":-105,
"BMHI":-863,"BMHM":-464,"BOMH":620,"OHHH":346,"OHHI":1729,"OHII":997,"OHMH":481,
"OIHH":623,"OIIH":1344,"OKAK":2792,"OKHH":587,"OKKA":679,"OOHH":110,"OOII":-685];
TQ4 = ["BHHH":-721,"BHHM":-3604,"BHII":-966,"BIIH":-607,"BIII":-2181,"OAAA":-2763,"OAKK":180,"OHHH":-294,
"OHHI":2446,"OHHO":480,"OHIH":-1573,"OIHH":1935,"OIHI":-493,"OIIH":626,"OIII":-4007,"OKAK":-8156];
// Tables keyed by a single segment of the input text.
UW1 = [",":156,"":156,"":-463,"":-941,"":-127,"":-553,"":121,"":505,"":-201,"":-547,
"":-123,"":-789,"":-185,"":-847,"":-466,"":-470,"":182,"":-292,"":208,"":
169,"":-446,"":-137,"":-135,"":-402,"":-268,"":-912,"":871,"":-460,"":561,
"":729,"":-411,"":-141,"":361,"":-408,"":-386,"":-718,"":-463,"":-135];
UW2 = [",":-829,"":-829,"":892,"":-645,"":3145,"":-538,"":505,"":134,"":-502,"":1454,
"":-856,"":-412,"":1141,"":878,"":540,"":1529,"":-675,"":300,"":-1011,"":188,
"":1837,"":-949,"":-291,"":-268,"":-981,"":1273,"":1063,"":-1764,"":130,
"":-409,"":-1273,"":1261,"":600,"":-1263,"":-402,"":1639,"":-579,"":-694,
"":571,"":-2516,"":2095,"":-587,"":306,"":568,"":831,"":-758,"":-2150,"":-302,
"":-968,"":-861,"":492,"":-123,"":978,"":362,"":548,"":-3025,"":-1566,"":-3414,
"":-422,"":-1769,"":-865,"":-483,"":-1519,"":760,"":1023,"":-2009,"":-813,
"":-1060,"":1067,"":-1519,"":-1033,"":1522,"":-1355,"":-1682,"":-1815,"":-1462,
"":-630,"":-1843,"":-1650,"":-931,"":-665,"":-2378,"":-180,"":-1740,"":752,
"":529,"":-1584,"":-242,"":-1165,"":-763,"":810,"":509,"":-1353,"":838,"西":-744,
"":-3874,"調":1010,"":1198,"":3041,"":1758,"":-1257,"":-645,"":3145,"":831,"":-587,"":306,"":568];
UW3 = [",":4889,"1":-800,"":-1723,"":4889,"":-2311,"":5827,"":2670,"":-3573,"":-2696,
"":1006,"":2342,"":1983,"":-4864,"":-1163,"":3271,"":1004,"":388,"":401,
"":-3552,"":-3116,"":-1058,"":-395,"":584,"":3685,"":-5228,"":842,"":-521,
"":-1444,"":-1081,"":6167,"":2318,"":1691,"":-899,"":-2788,"":2745,"":4056,
"":4555,"":-2171,"":-1798,"":1199,"":-5516,"":-4384,"":-120,"":1205,"":2323,
"":-788,"":-202,"":727,"":649,"":5905,"":2773,"":-1207,"":6620,"":-518,
"":551,"":1319,"":874,"":-1350,"":521,"":1109,"":1591,"":2201,"":278,
"":-3794,"":-1619,"":-1759,"":-2087,"":3815,"":653,"":-758,"":-1193,"":974,
"":2742,"":792,"":1889,"":-1368,"":811,"":4265,"":-361,"":-2439,"":4858,
"":3593,"":1574,"":-3030,"":755,"":-1880,"":5807,"":3095,"":457,"":2475,
"":1129,"":2286,"":4437,"":365,"":-949,"":-1872,"":1327,"":-1038,"":4646,
"":-2309,"":-783,"":-1006,"":483,"":1233,"":3588,"":-241,"":3906,"":-837,
"":4513,"":642,"":1389,"":1219,"":-241,"":2016,"":-1356,"":-423,"":-1008,
"":1078,"":-513,"":-3102,"":1155,"":3197,"":-1804,"":2416,"":-1030,"":1605,
"":1452,"":-2352,"":-3885,"":1905,"":-1291,"":1822,"":-488,"":-3973,"":-2013,
"":-1479,"":3222,"":-1489,"":1764,"":2099,"":5792,"":-661,"":-1248,"":-951,
"":-937,"":4125,"":360,"":3094,"":364,"":-805,"":5156,"":2438,"":484,"":2613,
"":-1694,"":-1073,"":1868,"":-495,"":979,"":461,"":-3850,"":-273,"":914,
"":1215,"":7313,"":-1835,"":792,"":6293,"":-1528,"":4231,"":401,"":-960,
"":1201,"":7767,"":3066,"":3663,"":1384,"":-4229,"":1163,"":1255,"":6457,
"":725,"":-2869,"":785,"":1044,"調":-562,"":-733,"":1777,"":1835,"":1375,
"":-1504,"":-1136,"":-681,"":1026,"":4404,"":1200,"":2163,"":421,"":-1432,
"":1302,"":-1282,"":2009,"":-1045,"":2066,"":1620,"":-800,"":2670,"":-3794,
"":-1350,"":551,"グ":1319,"":874,"":521,"":1109,"":1591,"":2201,"":278];
UW4 = [",":3930,".":3508,"":-4841,"":3930,"":3508,"":4999,"":1895,"":3798,"":-5156,
"":4752,"":-3435,"":-640,"":-2514,"":2405,"":530,"":6006,"":-4482,"":-3821,
"":-3788,"":-4376,"":-4734,"":2255,"":1979,"":2864,"":-843,"":-2506,"":-731,
"":1251,"":181,"":4091,"":5034,"":5408,"":-3654,"":-5882,"":-1659,"":3994,
"":7410,"":4547,"":5433,"":6499,"":1853,"":1413,"":7396,"":8578,"":1940,
"":4249,"":-4134,"":1345,"":6665,"":-744,"":1464,"":1051,"":-2082,"":-882,
"":-5046,"":4169,"":-2666,"":2795,"":-1544,"":3351,"":-2922,"":-9726,"":-14896,
"":-2613,"":-4570,"":-1783,"":13150,"":-2352,"":2145,"":1789,"":1287,"":-724,"":-403,
"":-1635,"":-881,"":-541,"":-856,"":-3637,"":-4371,"":-11870,"":-2069,"":2210,"":782,
"":-190,"":-1768,"":1036,"":544,"":950,"":-1286,"":530,"":4292,"":601,"":-2006,
"":-1212,"":584,"":788,"":1347,"":1623,"":3879,"":-302,"":-740,"":-2715,"":776,
"":4517,"":1013,"":1555,"":-1834,"":-681,"":-910,"":-851,"":1500,"":-619,"":-1200,
"":866,"":-1410,"":-2094,"":-1413,"":1067,"":571,"":-4802,"":-1397,"":-1057,"":-809,
"":1910,"":-1328,"":-1500,"":-2056,"":-2667,"":2771,"":374,"":-4556,"":456,"":553,
"":916,"":-1566,"":856,"":787,"":2182,"":704,"":522,"":-856,"":1798,"":1829,"":845,
"":-9066,"":-485,"":-442,"":-360,"":-1043,"":5388,"":-2716,"":-910,"":-939,"":-543,
"":-735,"":672,"":-1267,"":-1286,"":-1101,"":-2900,"":1826,"":2586,"":922,"":-3485,
"":2997,"":-867,"":-2112,"":788,"":2937,"":786,"":2171,"":1146,"":-1169,"":940,
"":-994,"":749,"":2145,"":-730,"":-852,"":-792,"":792,"":-1184,"":-244,"":-1000,
"":730,"":-1481,"":1158,"":-1433,"":-3370,"":929,"":-1291,"":2596,"":-4866,"":1192,
"":-1100,"":-2213,"":357,"":-2344,"":-2297,"":-2604,"":-878,"":-1659,"":-792,
"":-1984,"":1749,"":2120,"":1895,"":3798,"":-4371,"":-724,"":-11870,"":2145,"":1789,
"":1287,"":-403,"":-1635,"":-881,"":-541,"":-856,"":-3637];
UW5 = [",":465,".":-299,"1":-514,"E2":-32768,"]":-2762,"":465,"":-299,"":363,"":1655,"":331,
"":-503,"":1199,"":527,"":647,"":-421,"":1624,"":1971,"":312,"":-983,"":-1537,
"":-1371,"":-852,"":-1186,"":1093,"":52,"":921,"":-18,"":-850,"":-127,"":1682,
"":-787,"":-1224,"":-635,"":-578,"":1001,"":502,"":865,"":3350,"":854,"":-208,
"":429,"":504,"":419,"":-1264,"":327,"":241,"":451,"":-343,"":-871,"":722,
"":-1153,"":-654,"":3519,"":-901,"":848,"":2104,"":-1296,"":-548,"":1785,
"":-1304,"":-2991,"":921,"":1763,"":872,"":-814,"":1618,"":-1682,"":218,
"":-4353,"":932,"":1356,"":-1508,"":-1347,"":240,"":-3912,"":-3149,"":1319,
"":-1052,"":-4003,"":-997,"":-278,"":-813,"":1955,"":-2233,"":663,"":-1073,
"":1219,"":-1018,"":-368,"":786,"":1191,"":2368,"":-689,"":-514,"E2":-32768,
"":363,"":241,"":451,"":-343];
UW6 = [",":227,".":808,"1":-270,"E1":306,"":227,"":808,"":-307,"":189,"":241,"":-73,"":-121,
"":-200,"":1782,"":383,"":-428,"":573,"":-1014,"":101,"":-105,"":-253,"":-149,
"":-417,"":-236,"":-206,"":187,"":-135,"":195,"":-673,"":-496,"":-277,"":201,
"":-800,"":624,"":302,"":1792,"":-1212,"":798,"":-960,"":887,"":-695,"":535,
"":-697,"":753,"":-507,"":974,"":-822,"":1811,"":463,"":1082,"":-270,"E1":306,"":-673,"":-496];
// Tables keyed by two adjacent segments concatenated.
BW1 = [",と":660,",同":727,"B1あ":1404,"B1同":542,"、と":660,"、同":727,"」と":1682,"あっ":1505,
"いう":1743,"いっ":-2055,"いる":672,"うし":-4817,"うん":665,"から":3472,"がら":600,"こう":-790,
"こと":2083,"こん":-1262,"さら":-4143,"さん":4573,"した":2641,"して":1104,"すで":-3399,"そこ":1977,
"それ":-871,"たち":1122,"ため":601,"った":3463,"つい":-802,"てい":805,"てき":1249,"でき":1127,
"です":3445,"では":844,"とい":-4915,"とみ":1922,"どこ":3887,"ない":5713,"なっ":3015,"など":7379,
"なん":-1113,"にし":2468,"には":1498,"にも":1671,"に対":-912,"の一":-501,"の中":741,"ませ":2448,
"まで":1711,"まま":2600,"まる":-2155,"やむ":-1947,"よっ":-2565,"れた":2369,"れで":-913,"をし":1860,
"を見":731,"亡く":-1886,"京都":2558,"取り":-2784,"大き":-2604,"大阪":1497,"平方":-2314,"引き":-1336,
"日本":-195,"本当":-2423,"毎日":-2113,"目指":-724,"B1あ":1404,"B1同":542,"」と":1682];
BW2 = ["..":-11822,"11":-669,"――":-5730,"−−":-13175,"いう":-1609,"うか":2490,"かし":-1350,"かも":-602,
"から":-7194,"かれ":4612,"がい":853,"がら":-3198,"きた":1941,"くな":-1597,"こと":-8392,"この":-4193,
"させ":4533,"され":13168,"さん":-3977,"しい":-1819,"しか":-545,"した":5078,"して":972,"しな":939,
"その":-3744,"たい":-1253,"たた":-662,"ただ":-3857,"たち":-786,"たと":1224,"たは":-939,"った":4589,
"って":1647,"っと":-2094,"てい":6144,"てき":3640,"てく":2551,"ては":-3110,"ても":-3065,"でい":2666,
"でき":-1528,"でし":-3828,"です":-4761,"でも":-4203,"とい":1890,"とこ":-1746,"とと":-2279,"との":720,
"とみ":5168,"とも":-3941,"ない":-2488,"なが":-1313,"など":-6509,"なの":2614,"なん":3099,"にお":-1615,
"にし":2748,"にな":2454,"によ":-7236,"に対":-14943,"に従":-4688,"に関":-11388,"のか":2093,"ので":-7059,
"のに":-6041,"のの":-6125,"はい":1073,"はが":-1033,"はず":-2532,"ばれ":1813,"まし":-1316,"まで":-6621,
"まれ":5409,"めて":-3153,"もい":2230,"もの":-10713,"らか":-944,"らし":-1611,"らに":-1897,"りし":651,
"りま":1620,"れた":4270,"れて":849,"れば":4114,"ろう":6067,"われ":7901,"を通":-11877,"んだ":728,
"んな":-4115,"一人":602,"一方":-1375,"一日":970,"一部":-1051,"上が":-4479,"会社":-1116,"出て":2163,
"分の":-7758,"同党":970,"同日":-913,"大阪":-2471,"委員":-1250,"少な":-1050,"年度":-8669,"年間":-1626,
"府県":-2363,"手権":-1982,"新聞":-4066,"日新":-722,"日本":-7068,"日米":3372,"曜日":-601,"朝鮮":-2355,
"本人":-2697,"東京":-1543,"然と":-1384,"社会":-1276,"立て":-990,"第に":-1612,"米国":-4268,"11":-669];
BW3 = ["あた":-2194,"あり":719,"ある":3846,"い.":-1185,"い。":-1185,"いい":5308,"いえ":2079,"いく":3029,
"いた":2056,"いっ":1883,"いる":5600,"いわ":1527,"うち":1117,"うと":4798,"えと":1454,"か.":2857,
"か。":2857,"かけ":-743,"かっ":-4098,"かに":-669,"から":6520,"かり":-2670,"が,":1816,"が、":1816,
"がき":-4855,"がけ":-1127,"がっ":-913,"がら":-4977,"がり":-2064,"きた":1645,"けど":1374,"こと":7397,
"この":1542,"ころ":-2757,"さい":-714,"さを":976,"し,":1557,"し、":1557,"しい":-3714,"した":3562,
"して":1449,"しな":2608,"しま":1200,"す.":-1310,"す。":-1310,"する":6521,"ず,":3426,"ず、":3426,
"ずに":841,"そう":428,"た.":8875,"た。":8875,"たい":-594,"たの":812,"たり":-1183,"たる":-853,"だ.":4098,
"だ。":4098,"だっ":1004,"った":-4748,"って":300,"てい":6240,"てお":855,"ても":302,"です":1437,
"でに":-1482,"では":2295,"とう":-1387,"とし":2266,"との":541,"とも":-3543,"どう":4664,"ない":1796,
"なく":-903,"など":2135,"に,":-1021,"に、":-1021,"にし":1771,"にな":1906,"には":2644,"の,":-724,
"の、":-724,"の子":-1000,"は,":1337,"は、":1337,"べき":2181,"まし":1113,"ます":6943,"まっ":-1549,
"まで":6154,"まれ":-793,"らし":1479,"られ":6820,"るる":3818,"れ,":854,"れ、":854,"れた":1850,
"れて":1375,"れば":-3246,"れる":1091,"われ":-605,"んだ":606,"んで":798,"カ月":990,"会議":860,
"入り":1232,"大会":2217,"始め":1681,"":965,"新聞":-5055,"日,":974,"日、":974,"社会":2024,"カ月":990];
// Tables keyed by three adjacent segments concatenated.
TW1 = ["につい":-4681,"東京都":2026];
TW2 = ["ある程":-2049,"いった":-1256,"ころが":-2434,"しょう":3873,"その後":-4430,
"だって":-1049,"ていた":1833,"として":-4657,"ともに":-4517,"もので":1882,
"一気に":-792,"初めて":-1512,"同時に":-8097,"大きな":-1255,"対して":-2721,"社会党":-3216];
TW3 = ["いただ":-1734,"してい":1314,"として":-4314,"につい":-5483,"にとっ":-5989,"に当た":-6247,
"ので,":-727,"ので、":-727,"のもの":-600,"れから":-3752,"十二月":-2287];
TW4 = ["いう.":8576,"いう。":8576,"からな":-2348,"してい":2958,"たが,":1516,"たが、":1516,"ている":1538,
"という":1349,"ました":5543,"ません":1097,"ようと":-4258,"よると":5865];
// Code point -> character-type tag table used by getCharType.
charTypeMap = buildCharTypeMap();
}
/**
 * Builds the table that maps individual code points to a one-letter
 * character-type tag ('M', 'H', 'I', 'K', 'A' or 'N').
 *
 * Code points are written as \u escapes so the table does not depend on how
 * the source file is encoded or copied.
 *
 * NOTE(review): the half-width katakana and full-width Latin/digit ranges are
 * reconstructed from the character classes of the upstream TinySegmenter
 * ([ァ-ヴーｱ-ﾝﾞｰ], [a-zA-Zａ-ｚＡ-Ｚ], [0-9０-９]) - confirm against upstream.
 *
 * Returns:
 *  the finished table, cast to immutable.
 */
@trusted
static immutable(char[dchar]) buildCharTypeMap()
{
    // Named "table" so it does not shadow the imported std.algorithm.map.
    char[dchar] table;

    // 'M' : [一二三四五六七八九十百千万億兆]
    foreach (dchar c; "\u4E00\u4E8C\u4E09\u56DB\u4E94\u516D\u4E03\u516B\u4E5D\u5341\u767E\u5343\u4E07\u5104\u5146"d) {
        table[c] = 'M';
    }

    // 'H' : [一-龠々〆ヵヶ]. 一-龠 are done by getCharType.
    foreach (dchar c; "\u3005\u3006\u30F5\u30F6"d) { table[c] = 'H'; }

    // 'I' : [ぁ-ん]
    foreach (dchar c; '\u3041'..'\u3093' + 1) { table[c] = 'I'; }

    // 'K' : [ァ-ヴ], half-width [ｱ-ﾝ], and the marks ー (U+30FC), ﾞ (U+FF9E), ｰ (U+FF70)
    foreach (dchar c; '\u30A1'..'\u30F4' + 1) { table[c] = 'K'; }
    foreach (dchar c; '\uFF71'..'\uFF9D' + 1) { table[c] = 'K'; }
    foreach (dchar c; "\u30FC\uFF9E\uFF70"d) { table[c] = 'K'; }

    // 'A' : ASCII [a-zA-Z] and full-width [ａ-ｚＡ-Ｚ]
    foreach (dchar c; 'a'..'z' + 1) { table[c] = 'A'; }
    foreach (dchar c; 'A'..'Z' + 1) { table[c] = 'A'; }
    foreach (dchar c; '\uFF41'..'\uFF5A' + 1) { table[c] = 'A'; }
    foreach (dchar c; '\uFF21'..'\uFF3A' + 1) { table[c] = 'A'; }

    // 'N' : ASCII [0-9] and full-width [０-９]
    foreach (dchar c; '0'..'9' + 1) { table[c] = 'N'; }
    foreach (dchar c; '\uFF10'..'\uFF19' + 1) { table[c] = 'N'; }

    return cast(immutable)table;
}
public:
/**
 * Do morphological analysis.
 *
 * The input is decoded into code points; a score is computed for every
 * position after the first code point, and the text is cut wherever that
 * score is positive.
 *
 * Params:
 *  source = the input to morphological analysis.
 *
 * Returns:
 *  an analyzed result that splits $(D_PARAM source). Every element is a
 *  slice of $(D_PARAM source); an empty input yields an empty array.
 */
@trusted
static string[] segment(in string source)
{
    /**
     * Helper for character-type mapping: returns the tag stored in
     * charTypeMap, 'H' for the range U+4E00 (一) .. U+9FA0 (龠), and 'O'
     * for everything else.
     */
    @safe
    static pure nothrow char getCharType(in dchar c)
    in
    {
        assert(isValidDchar(c), "invalid UTF-32 character");
    }
    body
    {
        auto ct = c in charTypeMap;
        if (ct)
            return *ct;
        // 一-龠 is handled here instead of being stored in the table.
        if ('\u4E00' <= c && c <= '\u9FA0')
            return 'H';
        return 'O'; // other type
    }

    /**
     * Core calculation function.
     *
     * segments and chrTypes hold one entry per code point of src, with three
     * padding entries on each side; the loop below visits every real entry
     * except the first one.
     */
    @trusted
    static string[] doAnalysis(string src, in string[] segments, in char[] chrTypes)
    {
        auto result = Appender!(string[])([]);
        // index: byte offset in src of the position being scored.
        // start: byte offset where the piece currently being built begins.
        size_t index = segments[3].length, start;
        // Tags of the three previous decisions: 'U' before any decision,
        // then 'B' where a cut was made and 'O' where it was not.
        char[3] ctset = ['U', 'U', 'U'];

        foreach (i; 4..segments.length - 3) {
            auto ctype = 'O';
            immutable score = scoreFor(ctset[0], ctset[1], ctset[2], chrTypes[i - 3], chrTypes[i - 2], chrTypes[i - 1],
                                       chrTypes[i], chrTypes[i + 1], chrTypes[i + 2], segments[i - 3], segments[i - 2],
                                       segments[i - 1], segments[i], segments[i + 1], segments[i + 2]);

            if (score > 0) {
                // Positive score: close the current piece before segments[i].
                result.put(src[start..index]);
                start = index;
                ctype = 'B';
            }

            ctset[0] = ctset[1];
            ctset[1] = ctset[2];
            ctset[2] = ctype;
            index += segments[i].length;

            // The original printed an undeclared `word`, which broke -debug
            // builds; print the segment that was just scored instead.
            debug writeln("word: ", segments[i], ", score: ", score, ", ctype: ", ctype);
        }

        // Whatever remains after the last cut is the final piece.
        result.put(src[start..index]);

        return result.data;
    }

    if (source.length == 0)
        return [];

    // Number of code points, plus three padding entries on each side.
    immutable num = source.count();
    auto segments = new string[](num + 6);
    auto chrTypes = new char[](num + 6);

    segments[0..3] = ["B3", "B2", "B1"];
    chrTypes[0..3] = 'O';
    segments[$ - 3..$] = ["E1", "E2", "E3"];
    chrTypes[$ - 3..$] = 'O';

    // decode advances i past one code point; source[k..i] is its UTF-8 slice.
    for (size_t i, j = 3; i < source.length; j++) {
        immutable k = i;
        immutable c = source.decode(i);
        segments[j] = source[k..i];
        chrTypes[j] = getCharType(c);
    }

    return doAnalysis(source, segments, chrTypes);
}
/**
 * std.algorithm's map instantiated with segment: given a range of strings it
 * returns a range whose elements are the results of calling segment on each
 * string.
 *
 * Example:
 * -----
 * // range example is ["text", "テキスト", ...]
 * foreach (splitted; TinySegmenter.segmenter(range)) {
 * ... use splitted strings ...
 * }
 * -----
 */
alias map!segment segmenter;
private:
/**
 * Computes the score of one candidate position by summing BIAS and the
 * weight of every feature key found in the model tables; keys that are not
 * in a table contribute 0.
 *
 * Params:
 *  p1 = boundary tag of the oldest of the three previous decisions.
 *  p2 = boundary tag of the middle previous decision.
 *  p3 = boundary tag of the most recent previous decision.
 *  c1 = character-type tag; c1 .. c6 are six consecutive tags, c4 being the
 *       one at the candidate position.
 *  w1 = segment; w1 .. w6 are the six segments matching c1 .. c6.
 *
 * Returns:
 *  the summed score.
 */
@trusted
static sizediff_t scoreFor(in char p1, in char p2, in char p3,
in char c1, in char c2, in char c3, in char c4, in char c5, in char c6,
in string w1, in string w2, in string w3, in string w4, in string w5, in string w6)
{
    sizediff_t total = BIAS;

    // Boundary-tag features.
    char[1] kUP1 = [p1];
    char[1] kUP2 = [p2];
    char[1] kUP3 = [p3];
    char[2] kBP1 = [p1, p2];
    char[2] kBP2 = [p2, p3];
    total += UP1.get(cast(immutable)kUP1, 0);
    total += UP2.get(cast(immutable)kUP2, 0);
    total += UP3.get(cast(immutable)kUP3, 0);
    total += BP1.get(cast(immutable)kBP1, 0);
    total += BP2.get(cast(immutable)kBP2, 0);

    // Character-type features: one, two and three tags.
    char[1] kUC1 = [c1];
    char[1] kUC2 = [c2];
    char[1] kUC3 = [c3];
    char[1] kUC4 = [c4];
    char[1] kUC5 = [c5];
    char[1] kUC6 = [c6];
    total += UC1.get(cast(immutable)kUC1, 0);
    total += UC2.get(cast(immutable)kUC2, 0);
    total += UC3.get(cast(immutable)kUC3, 0);
    total += UC4.get(cast(immutable)kUC4, 0);
    total += UC5.get(cast(immutable)kUC5, 0);
    total += UC6.get(cast(immutable)kUC6, 0);

    char[2] kBC1 = [c2, c3];
    char[2] kBC2 = [c3, c4];
    char[2] kBC3 = [c4, c5];
    total += BC1.get(cast(immutable)kBC1, 0);
    total += BC2.get(cast(immutable)kBC2, 0);
    total += BC3.get(cast(immutable)kBC3, 0);

    char[3] kTC1 = [c1, c2, c3];
    char[3] kTC2 = [c2, c3, c4];
    char[3] kTC3 = [c3, c4, c5];
    char[3] kTC4 = [c4, c5, c6];
    total += TC1.get(cast(immutable)kTC1, 0);
    total += TC2.get(cast(immutable)kTC2, 0);
    total += TC3.get(cast(immutable)kTC3, 0);
    total += TC4.get(cast(immutable)kTC4, 0);

    // Mixed features: one boundary tag followed by 1, 2 or 3 type tags.
    char[2] kUQ1 = [p1, c1];
    char[2] kUQ2 = [p2, c2];
    char[2] kUQ3 = [p3, c3];
    total += UQ1.get(cast(immutable)kUQ1, 0);
    total += UQ2.get(cast(immutable)kUQ2, 0);
    total += UQ3.get(cast(immutable)kUQ3, 0);

    char[3] kBQ1 = [p2, c2, c3];
    char[3] kBQ2 = [p2, c3, c4];
    char[3] kBQ3 = [p3, c2, c3];
    char[3] kBQ4 = [p3, c3, c4];
    total += BQ1.get(cast(immutable)kBQ1, 0);
    total += BQ2.get(cast(immutable)kBQ2, 0);
    total += BQ3.get(cast(immutable)kBQ3, 0);
    total += BQ4.get(cast(immutable)kBQ4, 0);

    char[4] kTQ1 = [p2, c1, c2, c3];
    char[4] kTQ2 = [p2, c2, c3, c4];
    char[4] kTQ3 = [p3, c1, c2, c3];
    char[4] kTQ4 = [p3, c2, c3, c4];
    total += TQ1.get(cast(immutable)kTQ1, 0);
    total += TQ2.get(cast(immutable)kTQ2, 0);
    total += TQ3.get(cast(immutable)kTQ3, 0);
    total += TQ4.get(cast(immutable)kTQ4, 0);

    // Segment features: single segments, then pairs, then triples.
    total += UW1.get(w1, 0);
    total += UW2.get(w2, 0);
    total += UW3.get(w3, 0);
    total += UW4.get(w4, 0);
    total += UW5.get(w5, 0);
    total += UW6.get(w6, 0);

    immutable pair23 = w2 ~ w3;
    immutable pair34 = w3 ~ w4;
    immutable pair45 = w4 ~ w5;
    total += BW1.get(pair23, 0);
    total += BW2.get(pair34, 0);
    total += BW3.get(pair45, 0);

    immutable triple123 = w1 ~ w2 ~ w3;
    immutable triple234 = w2 ~ w3 ~ w4;
    immutable triple345 = w3 ~ w4 ~ w5;
    immutable triple456 = w4 ~ w5 ~ w6;
    total += TW1.get(triple123, 0);
    total += TW2.get(triple234, 0);
    total += TW3.get(triple345, 0);
    total += TW4.get(triple456, 0);

    // Original code comments out this line.
    //score += TC5.get([c4, c5, c6], 0);

    return total;
}
}
unittest
{
    // An input sentence paired with the segmentation expected for it.
    static struct Test
    {
        string text;
        string[] result;
    }

    // segment returns consecutive non-empty slices of its input, so the
    // expected pieces of each case concatenate back to the input text.
    auto tests = [
        Test("私の名前は中野です", ["私", "の", "名前", "は", "中野", "です"]),
        Test("それD言語で出来るよ", ["それ", "D", "言語", "で", "出来る", "よ"]),
    ];

    foreach (test; tests)
        assert(TinySegmenter.segment(test.text) == test.result);

    // Empty input yields an empty result.
    assert(TinySegmenter.segment("").length == 0);

    // Range test
    size_t i;
    foreach (splitted; TinySegmenter.segmenter([tests[0].text, tests[1].text]))
        assert(splitted == tests[i++].result);
    assert(i == tests.length);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment