Skip to content

Instantly share code, notes, and snippets.

@riow1983
Created July 16, 2017 01:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save riow1983/989cadcd1b7c8c3873b8ee67d4ec505d to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from bayes import BayesianFilter\n",
"bf = BayesianFilter()"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Train on the labelled text samples\n",
"bf.fit(\"Adam Smith\", \"外国人\")\n",
"bf.fit(\"Napoleon Bonaparte\", \"外国人\")\n",
"bf.fit(\"Adolf Hitler\", \"外国人\")\n",
"bf.fit(\"Gabriel Lippmann\", \"外国人\")\n",
"bf.fit(\"トーマス ベイズ\", \"カタカナ外国人\")\n",
"bf.fit(\"カール ハイド\",\"カタカナ外国人\")\n",
"bf.fit(\"マーク ザッカーバーグ\", \"カタカナ外国人\")\n",
"bf.fit(\"リー クワンユー\", \"カタカナ外国人\")\n",
"bf.fit(\"湯川 秀樹\", \"日本人\")\n",
"bf.fit(\"朝永 振一郎\",\"日本人\")\n",
"bf.fit(\"小林 誠\",\"日本人\")\n",
"bf.fit(\"益川 敏英\",\"日本人\")\n",
"bf.fit(\"毛 沢東\", \"漢字外国人\")\n",
"bf.fit(\"習 近平\", \"漢字外国人\")\n",
"bf.fit(\"金 日成\", \"漢字外国人\")\n",
"bf.fit(\"江 沢民\", \"漢字外国人\")"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"カタカナ外国人 [('外国人', -12.936737166250065), ('カタカナ外国人', -11.327299253815966), ('日本人', -11.390459481409463), ('漢字外国人', -11.327299253815966)]\n"
]
}
],
"source": [
"# Predict the category\n",
"pre, scorelist = bf.predict(\"近衛 文麿\")\n",
"print(pre, scorelist)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{' ',\n",
" 'Adam',\n",
" 'Adolf',\n",
" 'Bonaparte',\n",
" 'Gabriel',\n",
" 'Hitler',\n",
" 'Lippmann',\n",
" 'Napoleon',\n",
" 'Smith',\n",
" '\\u3000',\n",
" 'カール',\n",
" 'クワンユー',\n",
" 'ザッカーバーグ',\n",
" 'トーマス',\n",
" 'ハイド',\n",
" 'ベイズ',\n",
" 'マーク',\n",
" 'リー',\n",
" '小林',\n",
" '川',\n",
" '振一郎',\n",
" '敏英',\n",
" '日成',\n",
" '朝永',\n",
" '毛',\n",
" '江',\n",
" '沢東',\n",
" '沢民',\n",
" '湯川',\n",
" '益',\n",
" '秀樹',\n",
" '習',\n",
" '誠',\n",
" '近平',\n",
" '金'}"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get the set that records every word seen so far\n",
"wos = bf.words\n",
"wosd = dict.fromkeys(wos, 0)\n",
"wos"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'カタカナ外国人': {'\\u3000': 4,\n",
" 'カール': 1,\n",
" 'クワンユー': 1,\n",
" 'ザッカーバーグ': 1,\n",
" 'トーマス': 1,\n",
" 'ハイド': 1,\n",
" 'ベイズ': 1,\n",
" 'マーク': 1,\n",
" 'リー': 1},\n",
" '外国人': {' ': 4,\n",
" 'Adam': 1,\n",
" 'Adolf': 1,\n",
" 'Bonaparte': 1,\n",
" 'Gabriel': 1,\n",
" 'Hitler': 1,\n",
" 'Lippmann': 1,\n",
" 'Napoleon': 1,\n",
" 'Smith': 1},\n",
" '日本人': {'\\u3000': 4,\n",
" '小林': 1,\n",
" '川': 1,\n",
" '振一郎': 1,\n",
" '敏英': 1,\n",
" '朝永': 1,\n",
" '湯川': 1,\n",
" '益': 1,\n",
" '秀樹': 1,\n",
" '誠': 1},\n",
" '漢字外国人': {'\\u3000': 4,\n",
" '日成': 1,\n",
" '毛': 1,\n",
" '江': 1,\n",
" '沢東': 1,\n",
" '沢民': 1,\n",
" '習': 1,\n",
" '近平': 1,\n",
" '金': 1}}"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get the dict that records word occurrence counts per category\n",
"wod = bf.word_dict\n",
"wod"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'カタカナ外国人': 4, '外国人': 4, '日本人': 4, '漢字外国人': 4}"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get the dict that records how many times each category occurred\n",
"cad = bf.category_dict\n",
"cad"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Save each dict object to its own JSON file\n",
"with open('wosd.json', 'w') as fp:\n",
" json.dump(wosd, fp)\n",
"\n",
"with open('wod.json', 'w') as fp:\n",
" json.dump(wod, fp)\n",
" \n",
"with open('cad.json', 'w') as fp:\n",
" json.dump(cad, fp)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Recreate the dict objects from the JSON files\n",
"wosd = json.loads(open('wosd.json').read())\n",
"wos = set(wosd.keys())\n",
"\n",
"wod = json.loads(open('wod.json').read())\n",
"cad = json.loads(open('cad.json').read())"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Re-initialize once, then carry over the previously saved training state\n",
"from bayes import BayesianFilter\n",
"bf = BayesianFilter()"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"set()"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.words"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{}"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.word_dict"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{}"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.category_dict"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"bf.words = wos\n",
"bf.word_dict = wod\n",
"bf.category_dict = cad"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('外国人',\n",
" [('外国人', -11.327299253815966),\n",
" ('カタカナ外国人', -12.936737166250065),\n",
" ('日本人', -12.999897393843563),\n",
" ('漢字外国人', -12.936737166250065)])"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"Tom Hanks\")"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('外国人',\n",
" [('外国人', -17.41815654480198),\n",
" ('カタカナ外国人', -20.637032369670184),\n",
" ('日本人', -20.742299415659346),\n",
" ('漢字外国人', -20.637032369670184)])"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"Robert De Niro\")"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('外国人',\n",
" [('外国人', -11.327299253815966),\n",
" ('カタカナ外国人', -12.936737166250065),\n",
" ('日本人', -12.999897393843563),\n",
" ('漢字外国人', -12.936737166250065)])"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"Gen Hoshino\")"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('漢字外国人',\n",
" [('外国人', -12.936737166250065),\n",
" ('カタカナ外国人', -11.327299253815966),\n",
" ('日本人', -11.390459481409463),\n",
" ('漢字外国人', -10.634152073256022)])"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"金 正男\")"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('カタカナ外国人',\n",
" [('外国人', -12.936737166250065),\n",
" ('カタカナ外国人', -11.327299253815966),\n",
" ('日本人', -11.390459481409463),\n",
" ('漢字外国人', -11.327299253815966)])"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"朴 璐美\")"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('カタカナ外国人',\n",
" [('外国人', -16.786884767960125),\n",
" ('カタカナ外国人', -15.177446855526025),\n",
" ('日本人', -15.261660492317354),\n",
" ('漢字外国人', -15.177446855526025)])"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"李 小龍\")"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('カタカナ外国人',\n",
" [('外国人', -20.637032369670184),\n",
" ('カタカナ外国人', -19.027594457236084),\n",
" ('日本人', -19.132861503225246),\n",
" ('漢字外国人', -19.027594457236084)])"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"古歩道 ベンジャミン\")"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('カタカナ外国人',\n",
" [('外国人', -12.936737166250065),\n",
" ('カタカナ外国人', -11.327299253815966),\n",
" ('日本人', -11.390459481409463),\n",
" ('漢字外国人', -11.327299253815966)])"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"キム イルソン\")"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('外国人',\n",
" [('外国人', -11.327299253815966),\n",
" ('カタカナ外国人', -12.936737166250065),\n",
" ('日本人', -12.999897393843563),\n",
" ('漢字外国人', -12.936737166250065)])"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"山下 奉文\")"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('カタカナ外国人',\n",
" [('外国人', -12.936737166250065),\n",
" ('カタカナ外国人', -11.327299253815966),\n",
" ('日本人', -11.390459481409463),\n",
" ('漢字外国人', -11.327299253815966)])"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"宮沢 賢治\")"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('カタカナ外国人',\n",
" [('外国人', -12.936737166250065),\n",
" ('カタカナ外国人', -11.327299253815966),\n",
" ('日本人', -11.390459481409463),\n",
" ('漢字外国人', -11.327299253815966)])"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"徳川 家康\")"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('カタカナ外国人',\n",
" [('外国人', -12.936737166250065),\n",
" ('カタカナ外国人', -11.327299253815966),\n",
" ('日本人', -11.390459481409463),\n",
" ('漢字外国人', -11.327299253815966)])"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"井浦 新\")"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('カタカナ外国人',\n",
" [('外国人', -16.786884767960125),\n",
" ('カタカナ外国人', -15.177446855526025),\n",
" ('日本人', -15.261660492317354),\n",
" ('漢字外国人', -15.177446855526025)])"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"窪塚 洋介\")"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('カタカナ外国人',\n",
" [('外国人', -12.936737166250065),\n",
" ('カタカナ外国人', -11.327299253815966),\n",
" ('日本人', -11.390459481409463),\n",
" ('漢字外国人', -11.327299253815966)])"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bf.predict(\"伊藤 博文\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment