Skip to content

Instantly share code, notes, and snippets.

@davidalbertonogueira
Created April 4, 2019 16:53
Show Gist options
  • Save davidalbertonogueira/bce113101469ac4e95395c6488e492df to your computer and use it in GitHub Desktop.
Save davidalbertonogueira/bce113101469ac4e95395c6488e492df to your computer and use it in GitHub Desktop.
pyICU tokenizer wrapper
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from icu import Locale, BreakIterator\n",
"import six\n",
"from six import text_type as unicode"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"class Sequence(object):\n",
"    \"\"\"Unicode text paired with the offsets that segment it.\n",
"\n",
"    ``idx`` is a list of boundary offsets into ``text``; every pair of\n",
"    consecutive offsets delimits one segment. A fresh instance holds the\n",
"    single segment ``[0, len(text)]``.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, text):\n",
"        \"\"\"Wrap ``text`` as a single, not yet segmented, span.\n",
"\n",
"        Raises:\n",
"            ValueError: if ``text`` is empty or is not unicode text.\n",
"        \"\"\"\n",
"        if not text:\n",
"            raise ValueError(\"This Sequence is Empty\")\n",
"        if not isinstance(text, unicode):\n",
"            raise ValueError(\"This is not unicode text instead {}\".format(type(text)))\n",
"\n",
"        self.__text = text\n",
"        self.idx = [0, len(self.text)]\n",
"\n",
"    @property\n",
"    def text(self):\n",
"        \"\"\"The wrapped text (read-only).\"\"\"\n",
"        return self.__text\n",
"\n",
"    def __iter__(self):\n",
"        \"\"\"Yield the slice of ``text`` between each pair of adjacent offsets.\"\"\"\n",
"        for start, end in zip(self.idx[:-1], self.idx[1:]):\n",
"            yield self.text[start: end]\n",
"\n",
"    def tokens(self):\n",
"        \"\"\"Return the segments with whitespace stripped, dropping blank ones.\"\"\"\n",
"        stripped = (segment.strip() for segment in self)\n",
"        return [token for token in stripped if token]\n",
"\n",
"    def __str__(self):\n",
"        \"\"\"Return the tokens one per line (UTF-8 encoded bytes on Python 2).\"\"\"\n",
"        if six.PY3:\n",
"            return self.__unicode__()\n",
"        return self.__unicode__().encode(\"utf-8\")\n",
"\n",
"    def __unicode__(self):\n",
"        \"\"\"Return the tokens joined by newlines.\"\"\"\n",
"        return u'\\n'.join(self.tokens())\n",
"\n",
"    def split(self, sequence):\n",
"        \"\"\"Yield one sub-``Sequence`` per segment of ``sequence``.\n",
"\n",
"        ``sequence`` supplies the coarse boundaries (its ``idx``); each\n",
"        yielded piece covers one coarse segment and carries this object's\n",
"        finer boundaries that fall inside it, shifted to start at 0.\n",
"\n",
"        Raises:\n",
"            ValueError: if a boundary of ``sequence`` is not also a boundary\n",
"                of this object, or if a coarse segment is empty.\n",
"        \"\"\"\n",
"        major_idx = sequence.idx\n",
"        idx2 = 0\n",
"        for start, end in zip(major_idx[:-1], major_idx[1:]):\n",
"            idx1 = self.idx.index(start, idx2)\n",
"            idx2 = self.idx.index(end, idx2)\n",
"            seq = Sequence(self.text[start:end])\n",
"            # Include the closing boundary (idx2 + 1): iteration pairs up\n",
"            # adjacent offsets, so leaving it out loses the final segment.\n",
"            seq.idx = [x - start for x in self.idx[idx1:idx2 + 1]]\n",
"            yield seq\n",
"\n",
"    def __len__(self):\n",
"        \"\"\"Number of segments (one fewer than the number of offsets).\"\"\"\n",
"        return len(self.idx) - 1\n",
"\n",
"    def empty(self):\n",
"        \"\"\"Return True when the text contains only whitespace.\"\"\"\n",
"        return not self.text.strip()\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"class Breaker(object):\n",
"    \"\"\"Base class to segment text.\n",
"\n",
"    Subclasses are expected to assign an ICU break iterator to\n",
"    ``self.breaker``; the base class leaves it as ``None``.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, locale):\n",
"        \"\"\"Store the ICU ``Locale`` built from the ``locale`` code.\"\"\"\n",
"        # Pass the caller's locale code itself, not a string literal, so\n",
"        # the requested locale is the one handed to the break iterator.\n",
"        self.locale = Locale(locale)\n",
"        self.breaker = None\n",
"\n",
"    def transform(self, sequence):\n",
"        \"\"\"Return a new ``Sequence`` over the same text, re-segmented.\n",
"\n",
"        Each existing segment of ``sequence`` is handed to ``self.breaker``\n",
"        and the boundaries it reports are collected into the result.\n",
"        \"\"\"\n",
"        seq = Sequence(sequence.text)\n",
"        seq.idx = [0]\n",
"        for segment in sequence:\n",
"            # The breaker only sees this segment, so shift its offsets by\n",
"            # the position reached so far in the full text.\n",
"            # NOTE(review): ICU presumably reports UTF-16 offsets; confirm\n",
"            # behaviour for text containing non-BMP characters.\n",
"            offset = seq.idx[-1]\n",
"            self.breaker.setText(segment)\n",
"            seq.idx.extend([offset + x for x in self.breaker])\n",
"        return seq\n",
" \n",
"class SentenceTokenizer(Breaker):\n",
"    \"\"\"Breaker that cuts text at sentence boundaries.\"\"\"\n",
"\n",
"    def __init__(self, locale='en'):\n",
"        super(SentenceTokenizer, self).__init__(locale)\n",
"        make_breaker = BreakIterator.createSentenceInstance\n",
"        self.breaker = make_breaker(self.locale)\n",
"\n",
"\n",
"class WordTokenizer(Breaker):\n",
"    \"\"\"Breaker that cuts text at word (token) boundaries.\"\"\"\n",
"\n",
"    def __init__(self, locale='en'):\n",
"        super(WordTokenizer, self).__init__(locale)\n",
"        make_breaker = BreakIterator.createWordInstance\n",
"        self.breaker = make_breaker(self.locale)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Build a word tokenizer for English.\n",
"en_word = WordTokenizer(locale='en')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"en_text = u\"\"\"A Ukrainian separatist leader is calling on Russia to \"absorb\" the eastern region of Donetsk after Sunday's referendum on self rule. Self-declared Donetsk People's Republic leader Denis Pushilin urged Moscow to listen to the \"will of the people\". In neighbouring Luhansk, where a vote was also held, rebels declared independence. Ukraine, the EU and US have declared the referendums illegal but Russia says the results should be \"implemented\". Moscow has so far not commented on the call for Donetsk to become part of Russia but has appealed for dialogue between the militants and Kiev, with the participation of the Organisation for Security and Co-operation in Europe.\n",
"\"\"\"\n",
"\n",
"ar_text = u\"\"\"عبر أحد قادة المتمردين الموالين لروسيا في أوكرانيا عن مساندته لفكرة الوحدة مع روسيا في أعقاب الإعلان عن نتائج الاستفتاء المثير للجدل في شرق البلاد. وقال رومان لياجين، رئيس لجنة المتمردين للانتخابات في دونيتسك إن الانضمام لروسيا \"قد يكون خطوة مناسبة\".\n",
"\"\"\"\n",
"\n",
"ja_text = u\"\"\"日本国(にほんこく、にっぽんこく)、または日本(にほん、にっぽん)は、日本列島(北海道・本州・四国・九州の主要四島およびそれに付随する島々)及び、南西諸島・伊豆諸島・小笠原諸島などから成る東アジアの島国[1][2]。議会制民主主義国家である。首都は東京都。 気候は四季の変化に富み、その国土の多くは山地で、人口は平野部に集中している。国内には行政区分として47の都道府県があり、日本人や少数の先住民族のアイヌおよび外国人系の人々が居住し、事実上の公用語として日本語が使用される。内政では、明治維新を経て立憲国家となり、第二次世界大戦後の1947年にGHQの指導の下、現行の日本国憲法を施行。 1940年代に起きた太平洋戦争から復興を遂げ、1960年代からの高度経済成長により工業化が加速し、科学技術立国が推進された結果経済大国にもなったが、1980年代末のバブル崩壊後は経済停滞期に入った。また先進国のひとつとして数えられており、G7、G8およびG20のひとつ。外交では、1956年から国際連合に加盟し、国連中心主義をとっている。\n",
"\"\"\"\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# English: wrap the text, build both tokenizers, segment.\n",
"en_seq = Sequence(en_text)\n",
"en_sent = SentenceTokenizer(locale='en')\n",
"en_word = WordTokenizer(locale='en')\n",
"en_sents = en_sent.transform(en_seq)\n",
"en_words = en_word.transform(en_seq)\n",
"\n",
"# Arabic\n",
"ar_seq = Sequence(ar_text)\n",
"ar_sent = SentenceTokenizer(locale='ar')\n",
"ar_word = WordTokenizer(locale='ar')\n",
"ar_sents = ar_sent.transform(ar_seq)\n",
"ar_words = ar_word.transform(ar_seq)\n",
"\n",
"# Japanese\n",
"ja_seq = Sequence(ja_text)\n",
"ja_sent = SentenceTokenizer(locale='ja')\n",
"ja_word = WordTokenizer(locale='ja')\n",
"ja_sents = ja_sent.transform(ja_seq)\n",
"ja_words = ja_word.transform(ja_seq)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A Ukrainian separatist leader is calling on Russia to \"absorb\" the eastern region of Donetsk after Sunday's referendum on self rule.\n",
"Self-declared Donetsk People's Republic leader Denis Pushilin urged Moscow to listen to the \"will of the people\".\n",
"In neighbouring Luhansk, where a vote was also held, rebels declared independence.\n",
"Ukraine, the EU and US have declared the referendums illegal but Russia says the results should be \"implemented\".\n",
"Moscow has so far not commented on the call for Donetsk to become part of Russia but has appealed for dialogue between the militants and Kiev, with the participation of the Organisation for Security and Co-operation in Europe.\n",
"عبر أحد قادة المتمردين الموالين لروسيا في أوكرانيا عن مساندته لفكرة الوحدة مع روسيا في أعقاب الإعلان عن نتائج الاستفتاء المثير للجدل في شرق البلاد.\n",
"وقال رومان لياجين، رئيس لجنة المتمردين للانتخابات في دونيتسك إن الانضمام لروسيا \"قد يكون خطوة مناسبة\".\n",
"日本国(にほんこく、にっぽんこく)、または日本(にほん、にっぽん)は、日本列島(北海道・本州・四国・九州の主要四島およびそれに付随する島々)及び、南西諸島・伊豆諸島・小笠原諸島などから成る東アジアの島国[1][2]。\n",
"議会制民主主義国家である。\n",
"首都は東京都。\n",
"気候は四季の変化に富み、その国土の多くは山地で、人口は平野部に集中している。\n",
"国内には行政区分として47の都道府県があり、日本人や少数の先住民族のアイヌおよび外国人系の人々が居住し、事実上の公用語として日本語が使用される。\n",
"内政では、明治維新を経て立憲国家となり、第二次世界大戦後の1947年にGHQの指導の下、現行の日本国憲法を施行。\n",
"1940年代に起きた太平洋戦争から復興を遂げ、1960年代からの高度経済成長により工業化が加速し、科学技術立国が推進された結果経済大国にもなったが、1980年代末のバブル崩壊後は経済停滞期に入った。\n",
"また先進国のひとつとして数えられており、G7、G8およびG20のひとつ。\n",
"外交では、1956年から国際連合に加盟し、国連中心主義をとっている。\n",
"A\n",
"Ukrainian\n",
"separatist\n",
"leader\n",
"is\n",
"calling\n",
"on\n",
"Russia\n",
"to\n",
"\"\n",
"absorb\n",
"\"\n",
"the\n",
"eastern\n",
"region\n",
"of\n",
"Donetsk\n",
"after\n",
"Sunday's\n",
"referendum\n",
"on\n",
"self\n",
"rule\n",
".\n",
"Self\n",
"-\n",
"declared\n",
"Donetsk\n",
"People's\n",
"Republic\n",
"leader\n",
"Denis\n",
"Pushilin\n",
"urged\n",
"Moscow\n",
"to\n",
"listen\n",
"to\n",
"the\n",
"\"\n",
"will\n",
"of\n",
"the\n",
"people\n",
"\"\n",
".\n",
"In\n",
"neighbouring\n",
"Luhansk\n",
",\n",
"where\n",
"a\n",
"vote\n",
"was\n",
"also\n",
"held\n",
",\n",
"rebels\n",
"declared\n",
"independence\n",
".\n",
"Ukraine\n",
",\n",
"the\n",
"EU\n",
"and\n",
"US\n",
"have\n",
"declared\n",
"the\n",
"referendums\n",
"illegal\n",
"but\n",
"Russia\n",
"says\n",
"the\n",
"results\n",
"should\n",
"be\n",
"\"\n",
"implemented\n",
"\"\n",
".\n",
"Moscow\n",
"has\n",
"so\n",
"far\n",
"not\n",
"commented\n",
"on\n",
"the\n",
"call\n",
"for\n",
"Donetsk\n",
"to\n",
"become\n",
"part\n",
"of\n",
"Russia\n",
"but\n",
"has\n",
"appealed\n",
"for\n",
"dialogue\n",
"between\n",
"the\n",
"militants\n",
"and\n",
"Kiev\n",
",\n",
"with\n",
"the\n",
"participation\n",
"of\n",
"the\n",
"Organisation\n",
"for\n",
"Security\n",
"and\n",
"Co\n",
"-\n",
"operation\n",
"in\n",
"Europe\n",
".\n",
"عبر\n",
"أحد\n",
"قادة\n",
"المتمردين\n",
"الموالين\n",
"لروسيا\n",
"في\n",
"أوكرانيا\n",
"عن\n",
"مساندته\n",
"لفكرة\n",
"الوحدة\n",
"مع\n",
"روسيا\n",
"في\n",
"أعقاب\n",
"الإعلان\n",
"عن\n",
"نتائج\n",
"الاستفتاء\n",
"المثير\n",
"للجدل\n",
"في\n",
"شرق\n",
"البلاد\n",
".\n",
"وقال\n",
"رومان\n",
"لياجين\n",
"،\n",
"رئيس\n",
"لجنة\n",
"المتمردين\n",
"للانتخابات\n",
"في\n",
"دونيتسك\n",
"إن\n",
"الانضمام\n",
"لروسيا\n",
"\"\n",
"قد\n",
"يكون\n",
"خطوة\n",
"مناسبة\n",
"\"\n",
".\n",
"日本国\n",
"(\n",
"に\n",
"ほん\n",
"こく\n",
"、\n",
"にっぽん\n",
"こく\n",
")\n",
"、\n",
"または\n",
"日本\n",
"(\n",
"に\n",
"ほん\n",
"、\n",
"にっぽん\n",
")\n",
"は\n",
"、\n",
"日本\n",
"列島\n",
"(\n",
"北海道\n",
"・\n",
"本州\n",
"・\n",
"四国\n",
"・\n",
"九州\n",
"の\n",
"主要\n",
"四\n",
"島\n",
"および\n",
"それに\n",
"付随\n",
"する\n",
"島々\n",
")\n",
"及び\n",
"、\n",
"南西諸島\n",
"・\n",
"伊豆諸島\n",
"・\n",
"小笠原諸島\n",
"など\n",
"から\n",
"成る\n",
"東アジア\n",
"の\n",
"島国\n",
"[\n",
"1\n",
"]\n",
"[\n",
"2\n",
"]\n",
"。\n",
"議会\n",
"制\n",
"民主主義\n",
"国家\n",
"で\n",
"ある\n",
"。\n",
"首都\n",
"は\n",
"東京\n",
"都\n",
"。\n",
"気候\n",
"は\n",
"四季\n",
"の\n",
"変化\n",
"に\n",
"富\n",
"み\n",
"、\n",
"その\n",
"国土\n",
"の\n",
"多く\n",
"は\n",
"山地\n",
"で\n",
"、\n",
"人口\n",
"は\n",
"平野\n",
"部\n",
"に\n",
"集中\n",
"し\n",
"て\n",
"いる\n",
"。\n",
"国内\n",
"に\n",
"は\n",
"行政\n",
"区分\n",
"として\n",
"47\n",
"の\n",
"都道府県\n",
"が\n",
"あり\n",
"、\n",
"日本人\n",
"や\n",
"少数\n",
"の\n",
"先住民\n",
"族\n",
"の\n",
"アイヌ\n",
"および\n",
"外国\n",
"人\n",
"系\n",
"の\n",
"人々\n",
"が\n",
"居住\n",
"し\n",
"、\n",
"事実\n",
"上の\n",
"公\n",
"用語\n",
"として\n",
"日本語\n",
"が\n",
"使用\n",
"さ\n",
"れる\n",
"。\n",
"内政\n",
"では\n",
"、\n",
"明治維新\n",
"を\n",
"経\n",
"て\n",
"立憲\n",
"国家\n",
"となり\n",
"、\n",
"第二次\n",
"世界\n",
"大\n",
"戦後\n",
"の\n",
"1947\n",
"年\n",
"に\n",
"GHQ\n",
"の\n",
"指導\n",
"の\n",
"下\n",
"、\n",
"現行\n",
"の\n",
"日本国\n",
"憲法\n",
"を\n",
"施行\n",
"。\n",
"1940\n",
"年代\n",
"に\n",
"起\n",
"きた\n",
"太平洋戦争\n",
"から\n",
"復興\n",
"を\n",
"遂\n",
"げ\n",
"、\n",
"1960\n",
"年代\n",
"から\n",
"の\n",
"高度\n",
"経済\n",
"成長\n",
"により\n",
"工業\n",
"化\n",
"が\n",
"加速\n",
"し\n",
"、\n",
"科学\n",
"技術\n",
"立国\n",
"が\n",
"推進\n",
"さ\n",
"れ\n",
"た\n",
"結果\n",
"経済\n",
"大国\n",
"に\n",
"も\n",
"な\n",
"っ\n",
"た\n",
"が\n",
"、\n",
"1980\n",
"年代\n",
"末\n",
"の\n",
"バブル\n",
"崩壊\n",
"後\n",
"は\n",
"経済\n",
"停滞\n",
"期\n",
"に\n",
"入\n",
"っ\n",
"た\n",
"。\n",
"また\n",
"先進\n",
"国\n",
"の\n",
"ひとつ\n",
"として\n",
"数\n",
"え\n",
"ら\n",
"れ\n",
"て\n",
"おり\n",
"、\n",
"G7\n",
"、\n",
"G8\n",
"および\n",
"G20\n",
"の\n",
"ひとつ\n",
"。\n",
"外交\n",
"では\n",
"、\n",
"1956\n",
"年\n",
"から\n",
"国際\n",
"連合\n",
"に\n",
"加盟\n",
"し\n",
"、\n",
"国\n",
"連\n",
"中心\n",
"主義\n",
"を\n",
"と\n",
"って\n",
"いる\n",
"。\n"
]
}
],
"source": [
"# Sentences first, then words, each in en / ar / ja order.\n",
"for segmented in (en_sents, ar_sents, ja_sents, en_words, ar_words, ja_words):\n",
"    print(segmented)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment