Last active
October 25, 2015 02:48
-
-
Save repeatedly/33a74fcc922a1ae529ec to your computer and use it in GitHub Desktop.
TinySegmenter written in D
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Written in the D programming language. | |
import std.datetime; | |
import std.stdio; | |
import std.conv; | |
import std.file; | |
import tinysegmenter; | |
/**
 * Benchmark driver: loads the sample text once and times 100 consecutive
 * calls of TinySegmenter.segment over it, then prints the elapsed time.
 */
void main()
{
    // readText validates the file as UTF-8 while loading it, so malformed
    // input is reported here instead of from inside the timed loop. It also
    // avoids reinterpreting the raw bytes with cast(string).
    string text = readText("timemachineu8j.txt");

    // Unit of work handed to benchmark: 100 segmentations of the same text.
    void f()
    {
        foreach (i; 0..100) {
            TinySegmenter.segment(text);
        }
    }

    // Run f exactly once; times[0] is the duration of that single run.
    auto times = benchmark!(f)(1);
    writeln("segment: ", to!Duration(times[0]));
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Written in the D programming language. | |
/** | |
* TinySegmenter written in D. | |
* | |
* Original version is here: $(WEB chasen.org/~taku/software/TinySegmenter/, | |
* TinySegmenter : Javascriptだけで書かれたコンパクトな分かち書きソフトウェア). | |
* | |
* Example: | |
* ----- | |
* auto result = TinySegmenter.segment("それD言語で出来るよ"); | |
* result.join("|") //-> "それ|D|言語|で|出来る|よ" | |
* ----- | |
* | |
* Copyright: Copyright Masahiro Nakagawa 2010-. | |
* License: <a href="http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt">New BSD License</a>. | |
* Authors: Masahiro Nakagawa | |
*/ | |
module tinysegmenter; | |
import std.algorithm : map; | |
import std.array : Appender; | |
import std.range : isInputRange; | |
import std.utf : decode, count, isValidDchar; | |
import std.stdio; // writeln | |
/** | |
* TinySegmenter is a Super compact Japanese tokenizer. | |
*/ | |
struct TinySegmenter | |
{ | |
private: | |
// Feature weights of the segmentation model plus the character-type map.
// All of them are assigned once in the shared static constructor and are
// only read afterwards (scoreFor and segment are the readers).
static immutable | |
{ | |
// Constant term every score starts from (see scoreFor).
sizediff_t BIAS = -332; | |
// Keyed by a single previous boundary decision ('U', 'O' or 'B').
sizediff_t[string] UP1, UP2, UP3; | |
// Keyed by a single character type, one table per window position.
sizediff_t[string] UC1, UC2, UC3, UC4, UC5, UC6; | |
// Keyed by two consecutive previous boundary decisions.
sizediff_t[string] BP1, BP2; | |
// Keyed by two consecutive character types.
sizediff_t[string] BC1, BC2, BC3; | |
// Keyed by one boundary decision followed by one character type.
sizediff_t[string] UQ1, UQ2, UQ3; | |
// Keyed by three consecutive character types.
sizediff_t[string] TC1, TC2, TC3, TC4; | |
// Keyed by one boundary decision followed by two character types.
sizediff_t[string] BQ1, BQ2, BQ3, BQ4; | |
// Keyed by one boundary decision followed by three character types.
sizediff_t[string] TQ1, TQ2, TQ3, TQ4; | |
// Keyed by a single character of the window (as a UTF-8 string).
sizediff_t[string] UW1, UW2, UW3, UW4, UW5, UW6; | |
// Keyed by two adjacent characters of the window.
sizediff_t[string] BW1, BW2, BW3; | |
// Keyed by three adjacent characters of the window.
sizediff_t[string] TW1, TW2, TW3, TW4; | |
// Code point -> character-type letter; filled by buildCharTypeMap.
char[dchar] charTypeMap; | |
} | |
// Static constructor: fills every immutable weight table and the
// character-type map. Keys that are absent from a table contribute 0 to the
// score because scoreFor looks them up with a default of 0.
// NOTE(review): several literals below list the same key twice with the same
// value (e.g. "「" in UW1, "11" in BW2). This looks like width variants of
// the key collapsed into one form; verify against the upstream model.
@safe | |
shared static this() | |
{ | |
// UP1..UP3: previous boundary decisions p1, p2, p3 taken individually.
UP1 = ["O":-214]; | |
UP2 = ["B":69,"O":935]; | |
UP3 = ["B":189]; | |
// UC1..UC6: character types c1..c6 taken individually.
UC1 = ["A":484,"K":93,"M":645,"O":-505]; | |
UC2 = ["A":819,"H":1059,"I":409,"M":3987,"N":5775,"O":646]; | |
UC3 = ["A":-1370,"I":2311]; | |
UC4 = ["A":-2643,"H":1809,"I":-1032,"K":-3450,"M":3565,"N":3876,"O":6646]; | |
UC5 = ["H":313,"I":-1238,"K":-799,"M":539,"O":-831]; | |
UC6 = ["H":-506,"I":-253,"K":87,"M":247,"O":-387]; | |
// BP1, BP2: pairs of previous boundary decisions (p1 p2, p2 p3).
BP1 = ["BB":295,"OB":304,"OO":-125,"UB":352]; | |
BP2 = ["BO":60,"OO":-1762]; | |
// BC1..BC3: pairs of character types (c2 c3, c3 c4, c4 c5).
BC1 = ["HH":6,"II":2461,"KH":406,"OH":-1378]; | |
BC2 = ["AA":-3267,"AI":2744,"AN":-878,"HH":-4070,"HM":-1711,"HN":4012,"HO":3761,"IA":1327, | |
"IH":-1184,"II":-1332,"IK":1721,"IO":5492,"KI":3831,"KK":-8741,"MH":-3132,"MK":3334,"OO":-2920]; | |
BC3 = ["HH":996,"HI":626,"HK":-721,"HN":-1307,"HO":-836,"IH":-301,"KK":2762,"MK":1079,"MM":4034,"OA":-1652,"OH":266]; | |
// UQ1..UQ3: a boundary decision paired with a character type (p1 c1, p2 c2, p3 c3).
UQ1 = ["BH":21,"BI":-12,"BK":-99,"BN":142,"BO":-56,"OH":-95,"OI":477,"OK":410,"OO":-2422]; | |
UQ2 = ["BH":216,"BI":113,"OK":1759]; | |
UQ3 = ["BA":-479,"BH":42,"BI":1913,"BK":-7198,"BM":3160,"BN":6427,"BO":14761,"OI":-827,"ON":-3212]; | |
// TC1..TC4: triples of consecutive character types, starting at c1, c2, c3, c4.
TC1 = ["AAA":1093,"HHH":1029,"HHM":580,"HII":998,"HOH":-390,"HOM":-331,"IHI":1169, | |
"IOH":-142,"IOI":-1015,"IOM":467,"MMH":187,"OOI":-1832]; | |
TC2 = ["HHO":2088,"HII":-1023,"HMM":-1154,"IHI":-1965,"KKH":703,"OII":-2649]; | |
TC3 = ["AAA":-294,"HHH":346,"HHI":-341,"HII":-1088,"HIK":731,"HOH":-1486,"IHH":128, | |
"IHI":-3041,"IHO":-1935,"IIH":-825,"IIM":-1035,"IOI":-542,"KHH":-1216,"KKA":491, | |
"KKH":-1217,"KOK":-1009,"MHH":-2694,"MHM":-457,"MHO":123,"MMH":-471,"NNH":-1689,"NNO":662,"OHO":-3393]; | |
TC4 = ["HHH":-203,"HHI":1344,"HHK":365,"HHM":-122,"HHN":182,"HHO":669,"HIH":804, | |
"HII":679,"HOH":446,"IHH":695,"IHO":-2324,"IIH":321,"III":1497,"IIO":656,"IOO":54,"KAK":4845, | |
"KKA":3386,"KKK":3065,"MHH":-405,"MHI":201,"MMH":-241,"MMM":661,"MOM":841]; | |
// BQ1..BQ4: a boundary decision (p2 or p3) followed by two character types.
BQ1 = ["BHH":1150,"BHM":1521,"BII":-1158,"BIM":886,"BMH":1208,"BNH":449,"BOH":-91, | |
"BOO":-2597,"OHI":451,"OIH":-296,"OKA":1851,"OKH":-1020,"OKK":904,"OOO":2965]; | |
BQ2 = ["BHH":118,"BHI":-1159,"BHM":466,"BIH":-919,"BKK":-1720,"BKO":864,"OHH":-1139,"OHM":-181,"OIH":153,"UHI":-1146]; | |
BQ3 = ["BHH":-792,"BHI":2664,"BII":-299,"BKI":419,"BMH":937,"BMM":8335,"BNN":998,"BOH":775, | |
"OHH":2174,"OHM":439,"OII":280,"OKH":1798,"OKI":-793,"OKO":-2242,"OMH":-2402,"OOO":11699]; | |
BQ4 = ["BHH":-3895,"BIH":3761,"BII":-4654,"BIK":1348,"BKK":-1806,"BMI":-3385, | |
"BOO":-12396,"OAH":926,"OHH":266,"OHK":-2036,"ONN":-973]; | |
// TQ1..TQ4: a boundary decision (p2 or p3) followed by three character types.
TQ1 = ["BHHH":-227,"BHHI":316,"BHIH":-132,"BIHH":60,"BIII":1595,"BNHH":-744,"BOHH":225, | |
"BOOO":-908,"OAKK":482,"OHHH":281,"OHIH":249,"OIHI":200,"OIIH":-68]; | |
TQ2 = ["BIHH":-1401,"BIII":-1033,"BKAK":-543,"BOOO":-5591]; | |
TQ3 = ["BHHH":478,"BHHM":-1073,"BHIH":222,"BHII":-504,"BIIH":-116,"BIII":-105, | |
"BMHI":-863,"BMHM":-464,"BOMH":620,"OHHH":346,"OHHI":1729,"OHII":997,"OHMH":481, | |
"OIHH":623,"OIIH":1344,"OKAK":2792,"OKHH":587,"OKKA":679,"OOHH":110,"OOII":-685]; | |
TQ4 = ["BHHH":-721,"BHHM":-3604,"BHII":-966,"BIIH":-607,"BIII":-2181,"OAAA":-2763,"OAKK":180,"OHHH":-294, | |
"OHHI":2446,"OHHO":480,"OHIH":-1573,"OIHH":1935,"OIHI":-493,"OIIH":626,"OIII":-4007,"OKAK":-8156]; | |
// UW1..UW6: the window characters w1..w6 taken individually.
UW1 = [",":156,"、":156,"「":-463,"あ":-941,"う":-127,"が":-553,"き":121,"こ":505,"で":-201,"と":-547, | |
"ど":-123,"に":-789,"の":-185,"は":-847,"も":-466,"や":-470,"よ":182,"ら":-292,"り":208,"れ": | |
169,"を":-446,"ん":-137,"・":-135,"主":-402,"京":-268,"区":-912,"午":871,"国":-460,"大":561, | |
"委":729,"市":-411,"日":-141,"理":361,"生":-408,"県":-386,"都":-718,"「":-463,"・":-135]; | |
UW2 = [",":-829,"、":-829,"〇":892,"「":-645,"」":3145,"あ":-538,"い":505,"う":134,"お":-502,"か":1454, | |
"が":-856,"く":-412,"こ":1141,"さ":878,"ざ":540,"し":1529,"す":-675,"せ":300,"そ":-1011,"た":188, | |
"だ":1837,"つ":-949,"て":-291,"で":-268,"と":-981,"ど":1273,"な":1063,"に":-1764,"の":130, | |
"は":-409,"ひ":-1273,"べ":1261,"ま":600,"も":-1263,"や":-402,"よ":1639,"り":-579,"る":-694, | |
"れ":571,"を":-2516,"ん":2095,"ア":-587,"カ":306,"キ":568,"ッ":831,"三":-758,"不":-2150,"世":-302, | |
"中":-968,"主":-861,"事":492,"人":-123,"会":978,"保":362,"入":548,"初":-3025,"副":-1566,"北":-3414, | |
"区":-422,"大":-1769,"天":-865,"太":-483,"子":-1519,"学":760,"実":1023,"小":-2009,"市":-813, | |
"年":-1060,"強":1067,"手":-1519,"揺":-1033,"政":1522,"文":-1355,"新":-1682,"日":-1815,"明":-1462, | |
"最":-630,"朝":-1843,"本":-1650,"東":-931,"果":-665,"次":-2378,"民":-180,"気":-1740,"理":752, | |
"発":529,"目":-1584,"相":-242,"県":-1165,"立":-763,"第":810,"米":509,"自":-1353,"行":838,"西":-744, | |
"見":-3874,"調":1010,"議":1198,"込":3041,"開":1758,"間":-1257,"「":-645,"」":3145,"ッ":831,"ア":-587,"カ":306,"キ":568]; | |
UW3 = [",":4889,"1":-800,"−":-1723,"、":4889,"々":-2311,"〇":5827,"」":2670,"〓":-3573,"あ":-2696, | |
"い":1006,"う":2342,"え":1983,"お":-4864,"か":-1163,"が":3271,"く":1004,"け":388,"げ":401, | |
"こ":-3552,"ご":-3116,"さ":-1058,"し":-395,"す":584,"せ":3685,"そ":-5228,"た":842,"ち":-521, | |
"っ":-1444,"つ":-1081,"て":6167,"で":2318,"と":1691,"ど":-899,"な":-2788,"に":2745,"の":4056, | |
"は":4555,"ひ":-2171,"ふ":-1798,"へ":1199,"ほ":-5516,"ま":-4384,"み":-120,"め":1205,"も":2323, | |
"や":-788,"よ":-202,"ら":727,"り":649,"る":5905,"れ":2773,"わ":-1207,"を":6620,"ん":-518, | |
"ア":551,"グ":1319,"ス":874,"ッ":-1350,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278, | |
"・":-3794,"一":-1619,"下":-1759,"世":-2087,"両":3815,"中":653,"主":-758,"予":-1193,"二":974, | |
"人":2742,"今":792,"他":1889,"以":-1368,"低":811,"何":4265,"作":-361,"保":-2439,"元":4858, | |
"党":3593,"全":1574,"公":-3030,"六":755,"共":-1880,"円":5807,"再":3095,"分":457,"初":2475, | |
"別":1129,"前":2286,"副":4437,"力":365,"動":-949,"務":-1872,"化":1327,"北":-1038,"区":4646, | |
"千":-2309,"午":-783,"協":-1006,"口":483,"右":1233,"各":3588,"合":-241,"同":3906,"和":-837, | |
"員":4513,"国":642,"型":1389,"場":1219,"外":-241,"妻":2016,"学":-1356,"安":-423,"実":-1008, | |
"家":1078,"小":-513,"少":-3102,"州":1155,"市":3197,"平":-1804,"年":2416,"広":-1030,"府":1605, | |
"度":1452,"建":-2352,"当":-3885,"得":1905,"思":-1291,"性":1822,"戸":-488,"指":-3973,"政":-2013, | |
"教":-1479,"数":3222,"文":-1489,"新":1764,"日":2099,"旧":5792,"昨":-661,"時":-1248,"曜":-951, | |
"最":-937,"月":4125,"期":360,"李":3094,"村":364,"東":-805,"核":5156,"森":2438,"業":484,"氏":2613, | |
"民":-1694,"決":-1073,"法":1868,"海":-495,"無":979,"物":461,"特":-3850,"生":-273,"用":914, | |
"町":1215,"的":7313,"直":-1835,"省":792,"県":6293,"知":-1528,"私":4231,"税":401,"立":-960, | |
"第":1201,"米":7767,"系":3066,"約":3663,"級":1384,"統":-4229,"総":1163,"線":1255,"者":6457, | |
"能":725,"自":-2869,"英":785,"見":1044,"調":-562,"財":-733,"費":1777,"車":1835,"軍":1375, | |
"込":-1504,"通":-1136,"選":-681,"郎":1026,"郡":4404,"部":1200,"金":2163,"長":421,"開":-1432, | |
"間":1302,"関":-1282,"雨":2009,"電":-1045,"非":2066,"駅":1620,"1":-800,"」":2670,"・":-3794, | |
"ッ":-1350,"ア":551,"グ":1319,"ス":874,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278]; | |
UW4 = [",":3930,".":3508,"―":-4841,"、":3930,"。":3508,"〇":4999,"「":1895,"」":3798,"〓":-5156, | |
"あ":4752,"い":-3435,"う":-640,"え":-2514,"お":2405,"か":530,"が":6006,"き":-4482,"ぎ":-3821, | |
"く":-3788,"け":-4376,"げ":-4734,"こ":2255,"ご":1979,"さ":2864,"し":-843,"じ":-2506,"す":-731, | |
"ず":1251,"せ":181,"そ":4091,"た":5034,"だ":5408,"ち":-3654,"っ":-5882,"つ":-1659,"て":3994, | |
"で":7410,"と":4547,"な":5433,"に":6499,"ぬ":1853,"ね":1413,"の":7396,"は":8578,"ば":1940, | |
"ひ":4249,"び":-4134,"ふ":1345,"へ":6665,"べ":-744,"ほ":1464,"ま":1051,"み":-2082,"む":-882, | |
"め":-5046,"も":4169,"ゃ":-2666,"や":2795,"ょ":-1544,"よ":3351,"ら":-2922,"り":-9726,"る":-14896, | |
"れ":-2613,"ろ":-4570,"わ":-1783,"を":13150,"ん":-2352,"カ":2145,"コ":1789,"セ":1287,"ッ":-724,"ト":-403, | |
"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637,"・":-4371,"ー":-11870,"一":-2069,"中":2210,"予":782, | |
"事":-190,"井":-1768,"人":1036,"以":544,"会":950,"体":-1286,"作":530,"側":4292,"先":601,"党":-2006, | |
"共":-1212,"内":584,"円":788,"初":1347,"前":1623,"副":3879,"力":-302,"動":-740,"務":-2715,"化":776, | |
"区":4517,"協":1013,"参":1555,"合":-1834,"和":-681,"員":-910,"器":-851,"回":1500,"国":-619,"園":-1200, | |
"地":866,"場":-1410,"塁":-2094,"士":-1413,"多":1067,"大":571,"子":-4802,"学":-1397,"定":-1057,"寺":-809, | |
"小":1910,"屋":-1328,"山":-1500,"島":-2056,"川":-2667,"市":2771,"年":374,"庁":-4556,"後":456,"性":553, | |
"感":916,"所":-1566,"支":856,"改":787,"政":2182,"教":704,"文":522,"方":-856,"日":1798,"時":1829,"最":845, | |
"月":-9066,"木":-485,"来":-442,"校":-360,"業":-1043,"氏":5388,"民":-2716,"気":-910,"沢":-939,"済":-543, | |
"物":-735,"率":672,"球":-1267,"生":-1286,"産":-1101,"田":-2900,"町":1826,"的":2586,"目":922,"省":-3485, | |
"県":2997,"空":-867,"立":-2112,"第":788,"米":2937,"系":786,"約":2171,"経":1146,"統":-1169,"総":940, | |
"線":-994,"署":749,"者":2145,"能":-730,"般":-852,"行":-792,"規":792,"警":-1184,"議":-244,"谷":-1000, | |
"賞":730,"車":-1481,"軍":1158,"輪":-1433,"込":-3370,"近":929,"道":-1291,"選":2596,"郎":-4866,"都":1192, | |
"野":-1100,"銀":-2213,"長":357,"間":-2344,"院":-2297,"際":-2604,"電":-878,"領":-1659,"題":-792, | |
"館":-1984,"首":1749,"高":2120,"「":1895,"」":3798,"・":-4371,"ッ":-724,"ー":-11870,"カ":2145,"コ":1789, | |
"セ":1287,"ト":-403,"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637]; | |
UW5 = [",":465,".":-299,"1":-514,"E2":-32768,"]":-2762,"、":465,"。":-299,"「":363,"あ":1655,"い":331, | |
"う":-503,"え":1199,"お":527,"か":647,"が":-421,"き":1624,"ぎ":1971,"く":312,"げ":-983,"さ":-1537, | |
"し":-1371,"す":-852,"だ":-1186,"ち":1093,"っ":52,"つ":921,"て":-18,"で":-850,"と":-127,"ど":1682, | |
"な":-787,"に":-1224,"の":-635,"は":-578,"べ":1001,"み":502,"め":865,"ゃ":3350,"ょ":854,"り":-208, | |
"る":429,"れ":504,"わ":419,"を":-1264,"ん":327,"イ":241,"ル":451,"ン":-343,"中":-871,"京":722, | |
"会":-1153,"党":-654,"務":3519,"区":-901,"告":848,"員":2104,"大":-1296,"学":-548,"定":1785, | |
"嵐":-1304,"市":-2991,"席":921,"年":1763,"思":872,"所":-814,"挙":1618,"新":-1682,"日":218, | |
"月":-4353,"査":932,"格":1356,"機":-1508,"氏":-1347,"田":240,"町":-3912,"的":-3149,"相":1319, | |
"省":-1052,"県":-4003,"研":-997,"社":-278,"空":-813,"統":1955,"者":-2233,"表":663,"語":-1073, | |
"議":1219,"選":-1018,"郎":-368,"長":786,"間":1191,"題":2368,"館":-689,"1":-514,"E2":-32768, | |
"「":363,"イ":241,"ル":451,"ン":-343]; | |
UW6 = [",":227,".":808,"1":-270,"E1":306,"、":227,"。":808,"あ":-307,"う":189,"か":241,"が":-73,"く":-121, | |
"こ":-200,"じ":1782,"す":383,"た":-428,"っ":573,"て":-1014,"で":101,"と":-105,"な":-253,"に":-149, | |
"の":-417,"は":-236,"も":-206,"り":187,"る":-135,"を":195,"ル":-673,"ン":-496,"一":-277,"中":201, | |
"件":-800,"会":624,"前":302,"区":1792,"員":-1212,"委":798,"学":-960,"市":887,"広":-695,"後":535, | |
"業":-697,"相":753,"社":-507,"福":974,"空":-822,"者":1811,"連":463,"郎":1082,"1":-270,"E1":306,"ル":-673,"ン":-496]; | |
// BW1..BW3: two adjacent window characters (w2 w3, w3 w4, w4 w5).
BW1 = [",と":660,",同":727,"B1あ":1404,"B1同":542,"、と":660,"、同":727,"」と":1682,"あっ":1505, | |
"いう":1743,"いっ":-2055,"いる":672,"うし":-4817,"うん":665,"から":3472,"がら":600,"こう":-790, | |
"こと":2083,"こん":-1262,"さら":-4143,"さん":4573,"した":2641,"して":1104,"すで":-3399,"そこ":1977, | |
"それ":-871,"たち":1122,"ため":601,"った":3463,"つい":-802,"てい":805,"てき":1249,"でき":1127, | |
"です":3445,"では":844,"とい":-4915,"とみ":1922,"どこ":3887,"ない":5713,"なっ":3015,"など":7379, | |
"なん":-1113,"にし":2468,"には":1498,"にも":1671,"に対":-912,"の一":-501,"の中":741,"ませ":2448, | |
"まで":1711,"まま":2600,"まる":-2155,"やむ":-1947,"よっ":-2565,"れた":2369,"れで":-913,"をし":1860, | |
"を見":731,"亡く":-1886,"京都":2558,"取り":-2784,"大き":-2604,"大阪":1497,"平方":-2314,"引き":-1336, | |
"日本":-195,"本当":-2423,"毎日":-2113,"目指":-724,"B1あ":1404,"B1同":542,"」と":1682]; | |
BW2 = ["..":-11822,"11":-669,"――":-5730,"−−":-13175,"いう":-1609,"うか":2490,"かし":-1350,"かも":-602, | |
"から":-7194,"かれ":4612,"がい":853,"がら":-3198,"きた":1941,"くな":-1597,"こと":-8392,"この":-4193, | |
"させ":4533,"され":13168,"さん":-3977,"しい":-1819,"しか":-545,"した":5078,"して":972,"しな":939, | |
"その":-3744,"たい":-1253,"たた":-662,"ただ":-3857,"たち":-786,"たと":1224,"たは":-939,"った":4589, | |
"って":1647,"っと":-2094,"てい":6144,"てき":3640,"てく":2551,"ては":-3110,"ても":-3065,"でい":2666, | |
"でき":-1528,"でし":-3828,"です":-4761,"でも":-4203,"とい":1890,"とこ":-1746,"とと":-2279,"との":720, | |
"とみ":5168,"とも":-3941,"ない":-2488,"なが":-1313,"など":-6509,"なの":2614,"なん":3099,"にお":-1615, | |
"にし":2748,"にな":2454,"によ":-7236,"に対":-14943,"に従":-4688,"に関":-11388,"のか":2093,"ので":-7059, | |
"のに":-6041,"のの":-6125,"はい":1073,"はが":-1033,"はず":-2532,"ばれ":1813,"まし":-1316,"まで":-6621, | |
"まれ":5409,"めて":-3153,"もい":2230,"もの":-10713,"らか":-944,"らし":-1611,"らに":-1897,"りし":651, | |
"りま":1620,"れた":4270,"れて":849,"れば":4114,"ろう":6067,"われ":7901,"を通":-11877,"んだ":728, | |
"んな":-4115,"一人":602,"一方":-1375,"一日":970,"一部":-1051,"上が":-4479,"会社":-1116,"出て":2163, | |
"分の":-7758,"同党":970,"同日":-913,"大阪":-2471,"委員":-1250,"少な":-1050,"年度":-8669,"年間":-1626, | |
"府県":-2363,"手権":-1982,"新聞":-4066,"日新":-722,"日本":-7068,"日米":3372,"曜日":-601,"朝鮮":-2355, | |
"本人":-2697,"東京":-1543,"然と":-1384,"社会":-1276,"立て":-990,"第に":-1612,"米国":-4268,"11":-669]; | |
BW3 = ["あた":-2194,"あり":719,"ある":3846,"い.":-1185,"い。":-1185,"いい":5308,"いえ":2079,"いく":3029, | |
"いた":2056,"いっ":1883,"いる":5600,"いわ":1527,"うち":1117,"うと":4798,"えと":1454,"か.":2857, | |
"か。":2857,"かけ":-743,"かっ":-4098,"かに":-669,"から":6520,"かり":-2670,"が,":1816,"が、":1816, | |
"がき":-4855,"がけ":-1127,"がっ":-913,"がら":-4977,"がり":-2064,"きた":1645,"けど":1374,"こと":7397, | |
"この":1542,"ころ":-2757,"さい":-714,"さを":976,"し,":1557,"し、":1557,"しい":-3714,"した":3562, | |
"して":1449,"しな":2608,"しま":1200,"す.":-1310,"す。":-1310,"する":6521,"ず,":3426,"ず、":3426, | |
"ずに":841,"そう":428,"た.":8875,"た。":8875,"たい":-594,"たの":812,"たり":-1183,"たる":-853,"だ.":4098, | |
"だ。":4098,"だっ":1004,"った":-4748,"って":300,"てい":6240,"てお":855,"ても":302,"です":1437, | |
"でに":-1482,"では":2295,"とう":-1387,"とし":2266,"との":541,"とも":-3543,"どう":4664,"ない":1796, | |
"なく":-903,"など":2135,"に,":-1021,"に、":-1021,"にし":1771,"にな":1906,"には":2644,"の,":-724, | |
"の、":-724,"の子":-1000,"は,":1337,"は、":1337,"べき":2181,"まし":1113,"ます":6943,"まっ":-1549, | |
"まで":6154,"まれ":-793,"らし":1479,"られ":6820,"るる":3818,"れ,":854,"れ、":854,"れた":1850, | |
"れて":1375,"れば":-3246,"れる":1091,"われ":-605,"んだ":606,"んで":798,"カ月":990,"会議":860, | |
"入り":1232,"大会":2217,"始め":1681,"市":965,"新聞":-5055,"日,":974,"日、":974,"社会":2024,"カ月":990]; | |
// TW1..TW4: three adjacent window characters, starting at w1, w2, w3, w4.
TW1 = ["につい":-4681,"東京都":2026]; | |
TW2 = ["ある程":-2049,"いった":-1256,"ころが":-2434,"しょう":3873,"その後":-4430, | |
"だって":-1049,"ていた":1833,"として":-4657,"ともに":-4517,"もので":1882, | |
"一気に":-792,"初めて":-1512,"同時に":-8097,"大きな":-1255,"対して":-2721,"社会党":-3216]; | |
TW3 = ["いただ":-1734,"してい":1314,"として":-4314,"につい":-5483,"にとっ":-5989,"に当た":-6247, | |
"ので,":-727,"ので、":-727,"のもの":-600,"れから":-3752,"十二月":-2287]; | |
TW4 = ["いう.":8576,"いう。":8576,"からな":-2348,"してい":2958,"たが,":1516,"たが、":1516,"ている":1538, | |
"という":1349,"ました":5543,"ません":1097,"ようと":-4258,"よると":5865]; | |
// Code point -> character-type letter, used by segment's getCharType helper.
charTypeMap = buildCharTypeMap(); | |
} | |
/**
 * Builds the lookup table that maps a code point to its character-type
 * letter.
 *
 * The fullwidth and halfwidth compatibility forms are written as \u escapes
 * on purpose: when they were spelled as literal characters, the fullwidth
 * Latin/digit loops were indistinguishable from (and had degraded into
 * copies of) the ASCII loops, and the halfwidth katakana range was empty,
 * so those characters fell through to type 'O'.
 *
 * Returns:
 *  an immutable associative array from code point to one of
 *  'M', 'H', 'I', 'K', 'A' or 'N'. Code points that are absent are
 *  classified by the caller.
 */
@trusted
static immutable(char[dchar]) buildCharTypeMap()
{
    char[dchar] table;

    // Tags every code point in the inclusive range [first, last].
    void assignRange(dchar first, dchar last, char type)
    {
        for (dchar c = first; c <= last; ++c)
            table[c] = type;
    }

    // 'M' : kanji numerals.
    foreach (dchar c; "一二三四五六七八九十百千万億兆") { table[c] = 'M'; }

    // 'H' : iteration/abbreviation marks that are treated like kanji.
    // The contiguous kanji range itself is handled by the caller.
    foreach (dchar c; "々〆ヵヶ") { table[c] = 'H'; }

    // 'I' : hiragana.
    assignRange('ぁ', 'ん', 'I');

    // 'K' : katakana, the prolonged sound mark, and their halfwidth forms.
    assignRange('ァ', 'ヴ', 'K');
    table['ー'] = 'K';
    assignRange('\uFF71', '\uFF9E', 'K');  // halfwidth katakana A .. voiced sound mark
    table['\uFF70'] = 'K';                 // halfwidth prolonged sound mark

    // 'A' : Latin letters, ASCII and fullwidth.
    assignRange('a', 'z', 'A');
    assignRange('A', 'Z', 'A');
    assignRange('\uFF41', '\uFF5A', 'A');  // fullwidth a .. z
    assignRange('\uFF21', '\uFF3A', 'A');  // fullwidth A .. Z

    // 'N' : digits, ASCII and fullwidth.
    assignRange('0', '9', 'N');
    assignRange('\uFF10', '\uFF19', 'N');  // fullwidth 0 .. 9

    // The table is never mutated after this point, so freezing it is safe.
    return cast(immutable)table;
}
public: | |
/**
 * Do morphological analysis.
 *
 * The input is decoded one code point at a time. Every position between
 * two code points is scored from a six-character window and the boundary
 * decisions already taken; a positive score starts a new word there.
 *
 * Params:
 *  source = the input to morphological analysis.
 *
 * Returns:
 *  an analyzed result that splits $(D_PARAM source). Every element is a
 *  slice of $(D_PARAM source); an empty input yields an empty array.
 */
@trusted
static string[] segment(in string source)
{
    /**
     * Helper for character-type mapping: table lookup first, then the
     * contiguous kanji range, otherwise 'O'.
     */
    @safe
    static pure nothrow char getCharType(in dchar c)
    {
        // Checked inside the function instead of an in/body contract so the
        // deprecated `body` keyword is not needed.
        assert(isValidDchar(c), "invalid UTF-32 character");

        if (auto ct = c in charTypeMap)
            return *ct;
        if ('一' <= c && c <= '龠')
            return 'H';
        return 'O';  // other type
    }

    /**
     * Core calculation function.
     *
     * segments and chrTypes hold one entry per code point of src, padded
     * with three sentinel entries at each end, so the six-entry window
     * [i - 3, i + 2] is always in bounds.
     */
    @trusted
    static string[] doAnalysis(string src, in string[] segments, in char[] chrTypes)
    {
        auto result = Appender!(string[])([]);
        // index is the byte offset in src where segments[i] begins; the
        // first character can never start a new word, so skip past it.
        size_t index = segments[3].length, start;
        // The three most recent boundary decisions ('U' = not decided yet).
        char[3] ctset = ['U', 'U', 'U'];

        foreach (i; 4..segments.length - 3) {
            auto ctype = 'O';
            immutable score = scoreFor(ctset[0], ctset[1], ctset[2], chrTypes[i - 3], chrTypes[i - 2], chrTypes[i - 1],
                                       chrTypes[i], chrTypes[i + 1], chrTypes[i + 2], segments[i - 3], segments[i - 2],
                                       segments[i - 1], segments[i], segments[i + 1], segments[i + 2]);

            if (score > 0) {
                // Word boundary right before segments[i].
                result.put(src[start..index]);
                start = index;
                ctype = 'B';
            }

            ctset[0] = ctset[1];
            ctset[1] = ctset[2];
            ctset[2] = ctype;
            index += segments[i].length;

            // Previously printed an undeclared identifier `word`, which
            // broke compilation with -debug.
            debug writeln("word: ", segments[i], ", score: ", score, ", ctype: ", ctype);
        }

        // Whatever follows the last boundary is the final word.
        result.put(src[start..index]);

        return result.data;
    }

    if (source.length == 0)
        return [];

    immutable num = source.count();
    auto segments = new string[](num + 6);
    auto chrTypes = new char[](num + 6);

    // Sentinels around the real characters; the weight tables contain
    // keys such as "B1" and "E1" that match them.
    segments[0..3] = ["B3", "B2", "B1"];
    chrTypes[0..3] = 'O';
    segments[$ - 3..$] = ["E1", "E2", "E3"];
    chrTypes[$ - 3..$] = 'O';

    // decode advances i past one code point, so source[k..i] is exactly
    // the UTF-8 encoding of that code point.
    for (size_t i, j = 3; i < source.length; j++) {
        immutable k = i;
        immutable c = source.decode(i);
        segments[j] = source[k..i];
        chrTypes[j] = getCharType(c);
    }

    return doAnalysis(source, segments, chrTypes);
}
/**
 * Range adaptor that applies segment to every element of an input range.
 *
 * Example:
 * -----
 * // range example is ["text", "テキスト", ...]
 * foreach (splitted; TinySegmenter.segmenter(range)) {
 *     ... each splitted value is the string[] produced for one element ...
 * }
 * -----
 */
alias segmenter = map!segment;
private: | |
/**
 * Sums the model weights of every feature that fires for one candidate
 * boundary. A feature whose key is missing from its table contributes 0.
 *
 * Params:
 *  p1, p2, p3 = the three most recent boundary decisions, oldest first.
 *  c1 .. c6   = character types of the six-character window.
 *  w1 .. w6   = the six characters of the window, each as a UTF-8 string.
 *
 * Returns:
 *  BIAS plus all matching weights; the caller treats a positive value as
 *  "a word starts at w4".
 */
@trusted
static sizediff_t scoreFor(in char p1, in char p2, in char p3,
                           in char c1, in char c2, in char c3, in char c4, in char c5, in char c6,
                           in string w1, in string w2, in string w3, in string w4, in string w5, in string w6)
{
    sizediff_t total = BIAS;

    // Scratch keys, refilled before each lookup. The fixed-size buffers are
    // cast to immutable so they can be used as string keys.
    char[1] one;
    char[2] two;
    char[3] three;
    char[4] four;

    // Single boundary decisions.
    one[0] = p1; total += UP1.get(cast(immutable)one, 0);
    one[0] = p2; total += UP2.get(cast(immutable)one, 0);
    one[0] = p3; total += UP3.get(cast(immutable)one, 0);

    // Single character types.
    one[0] = c1; total += UC1.get(cast(immutable)one, 0);
    one[0] = c2; total += UC2.get(cast(immutable)one, 0);
    one[0] = c3; total += UC3.get(cast(immutable)one, 0);
    one[0] = c4; total += UC4.get(cast(immutable)one, 0);
    one[0] = c5; total += UC5.get(cast(immutable)one, 0);
    one[0] = c6; total += UC6.get(cast(immutable)one, 0);

    // Pairs of boundary decisions.
    two[0] = p1; two[1] = p2; total += BP1.get(cast(immutable)two, 0);
    two[0] = p2; two[1] = p3; total += BP2.get(cast(immutable)two, 0);

    // Pairs of character types.
    two[0] = c2; two[1] = c3; total += BC1.get(cast(immutable)two, 0);
    two[0] = c3; two[1] = c4; total += BC2.get(cast(immutable)two, 0);
    two[0] = c4; two[1] = c5; total += BC3.get(cast(immutable)two, 0);

    // One boundary decision with one character type.
    two[0] = p1; two[1] = c1; total += UQ1.get(cast(immutable)two, 0);
    two[0] = p2; two[1] = c2; total += UQ2.get(cast(immutable)two, 0);
    two[0] = p3; two[1] = c3; total += UQ3.get(cast(immutable)two, 0);

    // Triples of character types.
    three[0] = c1; three[1] = c2; three[2] = c3; total += TC1.get(cast(immutable)three, 0);
    three[0] = c2; three[1] = c3; three[2] = c4; total += TC2.get(cast(immutable)three, 0);
    three[0] = c3; three[1] = c4; three[2] = c5; total += TC3.get(cast(immutable)three, 0);
    three[0] = c4; three[1] = c5; three[2] = c6; total += TC4.get(cast(immutable)three, 0);

    // One boundary decision with two character types.
    three[0] = p2; three[1] = c2; three[2] = c3; total += BQ1.get(cast(immutable)three, 0);
    three[0] = p2; three[1] = c3; three[2] = c4; total += BQ2.get(cast(immutable)three, 0);
    three[0] = p3; three[1] = c2; three[2] = c3; total += BQ3.get(cast(immutable)three, 0);
    three[0] = p3; three[1] = c3; three[2] = c4; total += BQ4.get(cast(immutable)three, 0);

    // One boundary decision with three character types.
    four[0] = p2; four[1] = c1; four[2] = c2; four[3] = c3; total += TQ1.get(cast(immutable)four, 0);
    four[0] = p2; four[1] = c2; four[2] = c3; four[3] = c4; total += TQ2.get(cast(immutable)four, 0);
    four[0] = p3; four[1] = c1; four[2] = c2; four[3] = c3; total += TQ3.get(cast(immutable)four, 0);
    four[0] = p3; four[1] = c2; four[2] = c3; four[3] = c4; total += TQ4.get(cast(immutable)four, 0);

    // Single characters.
    total += UW1.get(w1, 0);
    total += UW2.get(w2, 0);
    total += UW3.get(w3, 0);
    total += UW4.get(w4, 0);
    total += UW5.get(w5, 0);
    total += UW6.get(w6, 0);

    // Adjacent character pairs; kept in locals because the triples below
    // extend them.
    string w23 = w2 ~ w3;
    string w34 = w3 ~ w4;
    string w45 = w4 ~ w5;
    total += BW1.get(w23, 0);
    total += BW2.get(w34, 0);
    total += BW3.get(w45, 0);

    // Adjacent character triples.
    total += TW1.get(w1 ~ w23, 0);
    total += TW2.get(w23 ~ w4, 0);
    total += TW3.get(w34 ~ w5, 0);
    total += TW4.get(w45 ~ w6, 0);

    // Original code comments out this line.
    //total += TC5.get([c4, c5, c6], 0);

    return total;
}
} | |
// Checks segment on two known sentences, then checks that segmenter yields
// the same results when mapped over a range of inputs.
unittest
{
    static struct Case
    {
        string input;
        string[] expected;
    }

    auto cases = [
        Case("私の名前は中野です", ["私", "の", "名前", "は", "中野", "です"]),
        Case("それD言語で出来るよ", ["それ", "D", "言語", "で", "出来る", "よ"]),
    ];

    // Direct calls.
    foreach (ref c; cases)
        assert(TinySegmenter.segment(c.input) == c.expected);

    // Range test
    string[] inputs;
    foreach (ref c; cases)
        inputs ~= c.input;

    size_t seen = 0;
    foreach (words; TinySegmenter.segmenter(inputs)) {
        assert(words == cases[seen].expected);
        ++seen;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment