Last active
April 2, 2016 14:17
-
-
Save ikegami-yukino/d71350cbb9fbec7f42c507c494bbe8b1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# 日本語文字列変換モジュールの比較" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## ドグラマグラをとってくる" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[' % Total % Received % Xferd Average Speed Time Time Time Current',\n", | |
" ' Dload Upload Total Spent Left Speed',\n", | |
" '',\n", | |
" ' 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0',\n", | |
" '100 411k 100 411k 0 0 707k 0 --:--:-- --:--:-- --:--:-- 707k',\n", | |
" 'Archive: /tmp/2093_ruby_28087.zip',\n", | |
" 'Made with MacWinZipper™',\n", | |
" ' inflating: /tmp/dogura_magura.txt ']" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%system\n", | |
"# Stop at the first failing step so a bad download is never unzipped or re-encoded.\n", | |
"set -e\n", | |
"rm -f /tmp/2093_ruby_28087.zip /tmp/dogura_magura.txt\n", | |
"# -f: fail on HTTP errors instead of saving the error page; -L: follow redirects.\n", | |
"curl -fL -o /tmp/2093_ruby_28087.zip http://www.aozora.gr.jp/cards/000096/files/2093_ruby_28087.zip\n", | |
"unzip -d /tmp /tmp/2093_ruby_28087.zip\n", | |
"# Convert the extracted text from Shift_JIS (-S) to UTF-8 (-w) in place.\n", | |
"nkf -Sw --overwrite /tmp/dogura_magura.txt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# The download cell re-encoded this file to UTF-8 (nkf -Sw), so decode it\n", | |
"# explicitly as UTF-8 instead of relying on the locale's default encoding.\n", | |
"with open('/tmp/dogura_magura.txt', encoding='utf-8') as fd:\n", | |
"    dogura_magura = fd.read()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 短文を半角→全角" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10000 loops, best of 3: 27.1 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"import jaconv\n", | |
"%timeit jaconv.h2z(dogura_magura[:140], ascii=False, digit=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10000 loops, best of 3: 96.4 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"import cnvk\n", | |
"%timeit cnvk.convert(dogura_magura[:140], cnvk.Z_KATA, cnvk.H_ASCII, cnvk.H_SPACE)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"100000 loops, best of 3: 5.04 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"import mojimoji\n", | |
"%timeit mojimoji.han_to_zen(dogura_magura[:140])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10000 loops, best of 3: 75.8 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"import zenhan\n", | |
"%timeit zenhan.h2z(dogura_magura[:140])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1000 loops, best of 3: 222 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"import rfZenHan\n", | |
"%timeit rfZenHan.h2z(dogura_magura[:140])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10000 loops, best of 3: 23 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"import nkf\n", | |
"%timeit nkf.nkf('-x -m0 --z1', dogura_magura[:140])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 長文を半角→全角" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 89.9 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"import jaconv\n", | |
"%timeit jaconv.h2z(dogura_magura, ascii=False, digit=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 38.6 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"import cnvk\n", | |
"%timeit cnvk.convert(dogura_magura, cnvk.Z_KATA, cnvk.H_ASCII, cnvk.H_SPACE)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 23.1 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"import mojimoji\n", | |
"%timeit mojimoji.han_to_zen(dogura_magura)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loop, best of 3: 360 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"import zenhan\n", | |
"%timeit zenhan.h2z(dogura_magura)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loop, best of 3: 237 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"import rfZenHan\n", | |
"%timeit rfZenHan.h2z(dogura_magura)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 95.4 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit nkf.nkf('-x -m0 --z1', dogura_magura)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 短文をひらがな→カタカナ" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"100000 loops, best of 3: 18.1 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit jaconv.hira2kata(dogura_magura[:140])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10000 loops, best of 3: 79.1 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit cnvk.convert(dogura_magura[:140], cnvk.HIRA2KATA, cnvk.Z_KATA)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10000 loops, best of 3: 25.4 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"import mohayonao\n", | |
"%timeit mohayonao.katakana(dogura_magura[:140])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The slowest run took 4.00 times longer than the fastest. This could mean that an intermediate result is being cached.\n", | |
"10000 loops, best of 3: 23.2 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit nkf.nkf('-x -m0 --katakana', dogura_magura[:140])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 長文をひらがな→カタカナ" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 51.6 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit jaconv.hira2kata(dogura_magura)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 41.8 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit cnvk.convert(dogura_magura, cnvk.HIRA2KATA, cnvk.Z_KATA)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loop, best of 3: 246 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit mohayonao.katakana(dogura_magura)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 98.6 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit nkf.nkf('-x -m0 --katakana', dogura_magura)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment