{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import tokenizations"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"å\t->\tabc\n",
"BC\t->\tabc\n",
"åBC\t<-\tabc\n"
]
}
],
"source": [
"# token列同士のアラインメント\n",
"\n",
"tokens_a = [\"å\", \"BC\"]\n",
"tokens_b = [\"abc\"]\n",
"a2b, b2a = tokenizations.get_alignments(tokens_a, tokens_b)\n",
"# a2b[i] is a list representing the alignment from tokens_a to tokens_b.\n",
"for b_component, token_a in zip(a2b, tokens_a):\n",
" print( '{}\\t->\\t{}'.format(token_a, ''.join([tokens_b[i] for i in b_component])) )\n",
"for a_component, token_b in zip(b2a, tokens_b):\n",
" print( '{}\\t<-\\t{}'.format(''.join([tokens_a[i] for i in a_component]), token_b) )\n"
]
},
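{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch, not from the original gist: a token with no counterpart is\n",
"# expected to come back as an empty alignment list (an assumption\n",
"# worth verifying against the installed version), so unaligned tokens\n",
"# can be detected with a simple truthiness check.\n",
"noisy_tokens = [\"New\", \"York\", \"!\"]\n",
"clean_tokens = [\"new\", \"york\"]\n",
"n2c, _ = tokenizations.get_alignments(noisy_tokens, clean_tokens)\n",
"for token, component in zip(noisy_tokens, n2c):\n",
"    if component:\n",
"        print('{}\\t->\\t{}'.format(token, [clean_tokens[i] for i in component]))\n",
"    else:\n",
"        print('{}\\t->\\t(no counterpart)'.format(token))\n"
]
},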
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WORD\t->\tSUBWORDS\n",
"John\t->\t['john']\n",
"Johanson\t->\t['johan', '##son']\n",
"'s\t->\t[\"'\", 's']\n",
"house\t->\t['house']\n"
]
}
],
"source": [
"# 使用例. word-tokenとwordpiece-tokenの対応関係取得\n",
"tokens_word = [\"John\", \"Johanson\", \"'s\", \"house\"]\n",
"tokens_subword = [\"john\", \"johan\", \"##son\", \"'\", \"s\", \"house\"]\n",
"w2s, _ = tokenizations.get_alignments(tokens_word, tokens_subword)\n",
"\n",
"print('WORD\\t->\\tSUBWORDS')\n",
"for s_component, token_word in zip(w2s, tokens_word):\n",
" print( '{}\\t->\\t{}'.format(token_word, [tokens_subword[i] for i in s_component]) )"
]
},
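{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch of a common downstream use, not from the original gist:\n",
"# projecting word-level labels onto the wordpiece tokens via the w2s\n",
"# alignment computed above. The BIO labels here are made up for\n",
"# illustration.\n",
"word_labels = [\"B-PER\", \"I-PER\", \"O\", \"O\"]  # one label per word token\n",
"subword_labels = [\"O\"] * len(tokens_subword)\n",
"for s_component, label in zip(w2s, word_labels):\n",
"    for k, subword_index in enumerate(s_component):\n",
"        # only the first subword of a B- word keeps the B- prefix\n",
"        if label.startswith(\"B-\") and k > 0:\n",
"            subword_labels[subword_index] = \"I-\" + label[2:]\n",
"        else:\n",
"            subword_labels[subword_index] = label\n",
"print(list(zip(tokens_subword, subword_labels)))\n"
]
},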
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"f\t->\tF\n",
"o\t->\tO\n",
"o\t->\to\n",
"b\t->\tB\n",
"a\t->\tå\n",
"r\t->\tR\n",
"b\t->\tb\n",
"a\t->\ta\n",
"z\t->\tZ\n",
"f\t<-\tF\n",
"o\t<-\tO\n",
"None\t<-\t \n",
"o\t<-\to\n",
"None\t<-\t.\n",
"b\t<-\tB\n",
"a\t<-\tå\n",
"r\t<-\tR\n",
"None\t<-\t \n",
"b\t<-\tb\n",
"a\t<-\ta\n",
"z\t<-\tZ\n"
]
}
],
"source": [
"# 文字列同士のアラインメント\n",
"chars_a = \"foobarbaz\"\n",
"chars_b = \"FO o.BåR baZ\"\n",
"a2b, b2a = tokenizations.get_charmap(chars_a, chars_b)\n",
"\n",
"for b_component, char_a in zip(a2b, chars_a):\n",
" print( '{}\\t->\\t{}'.format(char_a, ''.join([chars_b[i] for i in b_component])) )\n",
"# for a_component, token_b in zip(b2a, tokens_b):\n",
"# print( '{}\\t<-\\t{}'.format(''.join([tokens_a[i] for i in a_component]), token_b) )\n",
"for a_component, char_b in zip(b2a, chars_b):\n",
" print( '{}\\t<-\\t{}'.format(''.join([chars_a[i] for i in a_component] if a_component else 'None'), char_b ))"
]
},
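{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch, not from the original gist: projecting a span over chars_a\n",
"# onto chars_b via the charmap, assuming (as the cell above does) that\n",
"# a2b holds one list of target indices per character. Characters with\n",
"# no counterpart simply contribute nothing. project_span is a\n",
"# hypothetical helper, not part of the library.\n",
"def project_span(start, end, charmap):\n",
"    \"\"\"Map a [start, end) span over chars_a to a span over chars_b.\"\"\"\n",
"    indices = [i for component in charmap[start:end] for i in component]\n",
"    return (min(indices), max(indices) + 1) if indices else None\n",
"\n",
"span = project_span(0, 3, a2b)  # where did \"foo\" end up in chars_b?\n",
"print(span, repr(chars_b[span[0]:span[1]]) if span else None)\n"
]
},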
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"トークン\t->\t正規化前トークン\n",
"foo\t->\tFO o\n",
"bar\t->\tBåR\n"
]
}
],
"source": [
"# トークン列と文字列のアラインメント(スパンで取得)\n",
"# 与えられたトークン列に対応する、正規化前文字列のスパンを復元する\n",
"tokens = [\"foo\", \"bar\"]\n",
"original_text = \"FO o.BåR baZ\"\n",
"original_spans = tokenizations.get_original_spans(tokens, original_text)\n",
"\n",
"print('トークン\\t->\\t正規化前トークン')\n",
"for token, (orgs, orge) in zip(tokens, original_spans):\n",
" print('{}\\t->\\t{}'.format(token, original_text[orgs:orge]))"
]
},
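{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch, not from the original gist: get_original_spans should return\n",
"# None for a token it cannot locate in the original text (an assumption\n",
"# to verify against the installed version), so real pipelines should\n",
"# guard before slicing. \"qux\" below is expected to have no match.\n",
"noisy = [\"foo\", \"qux\", \"bar\"]\n",
"maybe_spans = tokenizations.get_original_spans(noisy, original_text)\n",
"for token, span in zip(noisy, maybe_spans):\n",
"    if span is None:\n",
"        print('{}\\t->\\t(not found)'.format(token))\n",
"    else:\n",
"        print('{}\\t->\\t{}'.format(token, original_text[span[0]:span[1]]))\n"
]
},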
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"foo -> FO\n",
"foo -> o\n",
"bar -> BåR\n"
]
}
],
"source": [
"# 文字列間のアラインメントをとり対応するスパンを復元(tokenizations.get_original_spansよりやや厳密な対応を取得)\n",
"import textspan\n",
"\n",
"spans = [(0, 3), (3, 6)]\n",
"text = \"foobarbaz\"\n",
"\n",
"original_text = \"FO o.BåR baZ\"\n",
"original_spans = textspan.align_spans(spans, text, original_text)\n",
"# 内部的には tokenizations.get_charmap(text, original_text) を経由\n",
"for (s, e), orgspans in zip(spans, original_spans):\n",
" for orgs, orge in orgspans:\n",
" print('{} -> {}'.format(text[s:e], original_text[orgs:orge]))"
]
}
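,
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch, not from the original gist: textspan also exposes\n",
"# get_original_spans, which (an assumption to verify against the\n",
"# installed version) returns a list of spans per token, so a token\n",
"# scattered across the original text (\"foo\" -> \"FO\" + \"o\") keeps\n",
"# every fragment instead of one merged span.\n",
"fragments = textspan.get_original_spans([\"foo\", \"bar\"], original_text)\n",
"for token, spans in zip([\"foo\", \"bar\"], fragments):\n",
"    print('{} -> {}'.format(token, [original_text[s:e] for s, e in spans]))\n"
]
}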
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}