-
-
Save TaiToTo/ebec39b166a18b7ba1f0739d0f0e483c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "sudden-surprise", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"from bpemb import BPEmb\n", | |
"\n", | |
"# Load one BPEmb model per language (vocabulary size 10000, embedding dimension 100).\n", | |
"bpemb_de = BPEmb(lang='de', vs=10000, dim=100)\n", | |
"bpemb_en = BPEmb(lang='en', vs=10000, dim=100)\n", | |
"bpemb_ja = BPEmb(lang='ja', vs=10000, dim=100)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "confused-there", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[386, 9937, 9927, 7, 491, 272, 305, 3838, 5447, 819, 26, 7, 1645, 9935]" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Let's see how sentences in three different languages are encoded into integer IDs and then decoded back.\n", | |
"\n", | |
"# Encoding in English.\n", | |
"bpemb_en.encode_ids(\"I'm the only one who cannot speak German in the office.\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "developmental-cleveland", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[3077,\n", | |
" 4725,\n", | |
" 26,\n", | |
" 3478,\n", | |
" 39,\n", | |
" 131,\n", | |
" 4969,\n", | |
" 9927,\n", | |
" 9940,\n", | |
" 26,\n", | |
" 1502,\n", | |
" 250,\n", | |
" 7690,\n", | |
" 713,\n", | |
" 9935]" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Encoding in German. \n", | |
"bpemb_de.encode_ids(\"Ich bin der Einzige in dem Büro, der kein Deutsch sprechen kann.\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "distinguished-small", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[2025,\n", | |
" 4406,\n", | |
" 6056,\n", | |
" 6047,\n", | |
" 421,\n", | |
" 6310,\n", | |
" 6049,\n", | |
" 6406,\n", | |
" 6234,\n", | |
" 41,\n", | |
" 382,\n", | |
" 6988,\n", | |
" 1755,\n", | |
" 6085,\n", | |
" 6037]" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Encoding in Japanese. \n", | |
"bpemb_ja.encode_ids(\"そのオフィスでドイツ語が話せないのは私だけです。\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "prostate-ballot", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"i'm the only one who cannot speak german in the office.\n", | |
"ich bin der einzige in dem büro, der kein deutsch sprechen kann.\n", | |
"そのオフィスでドイツ語が話せないのは私だけです。\n" | |
] | |
} | |
], | |
"source": [ | |
"# Conversely, the encoded IDs can be decoded back into the original sentences with only minor changes (e.g., the text comes back lowercased).\n", | |
"print( bpemb_en.decode_ids([386, 9937, 9927, 7, 491, 272, 305, 3838, 5447, 819, 26, 7, 1645, 9935]) )\n", | |
"print( bpemb_de.decode_ids([3077, 4725, 26, 3478, 39, 131, 4969, 9927, 9940, 26, 1502, 250, 7690, 713, 9935]) )\n", | |
"print( bpemb_ja.decode_ids([2025, 4406, 6056, 6047, 421, 6310, 6049, 6406, 6234, 41, 382, 6988, 1755, 6085, 6037]) )" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment