Created
April 23, 2018 15:35
-
-
Save hurutoriya/1c417853a723206d5deb19b5083538ff to your computer and use it in GitHub Desktop.
Google Colaboratory で mecab-ipadic-NEologd を使うまで。日本語のNLPで、最近の語句にも対応した形態素解析を行うために必須の辞書 mecab-ipadic-NEologd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "colab-mecab-ipadic-NEologd.ipynb", | |
"version": "0.3.2", | |
"views": {}, | |
"default_view": {}, | |
"provenance": [], | |
"collapsed_sections": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"metadata": { | |
"id": "CSVnUqVaEkmM", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"base_uri": "https://localhost:8080/", | |
"height": 136 | |
}, | |
"outputId": "99bab0e4-3afa-4210-8503-c4464b035d10", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1524496839265, | |
"user_tz": -540, | |
"elapsed": 251651, | |
"user": { | |
"displayName": "Shunya Ueta", | |
"photoUrl": "//lh5.googleusercontent.com/-h1SgqhkQ2rE/AAAAAAAAAAI/AAAAAAAA4rM/Lbs_FhJHV1o/s50-c-k-no/photo.jpg", | |
"userId": "102611377204768639960" | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Install MeCab, its headers and the UTF-8 IPA dictionary, plus helper tools (sudo, file, git, curl) presumably needed by the NEologd installer.\n", | |
"# python-mecab (the Python 2 binding) is left out: this notebook runs a python3 kernel and gets its binding from pip below.\n", | |
"!apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl > /dev/null\n", | |
"# Shallow-clone NEologd unless it is already present, so re-running the cell does not fail on an existing directory.\n", | |
"# git reports progress on stderr, so use -q rather than redirecting stdout.\n", | |
"![ -d mecab-ipadic-neologd ] || git clone -q --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git\n", | |
"# Build and install the dictionary; `echo yes` answers the installer's confirmation prompt.\n", | |
"!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n > /dev/null 2>&1\n", | |
"# %pip targets the running kernel's environment; -q replaces the stdout redirect.\n", | |
"%pip install -q mecab-python3" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Cloning into 'mecab-ipadic-neologd'...\n", | |
"remote: Counting objects: 72, done.\u001b[K\n", | |
"remote: Compressing objects: 100% (71/71), done.\u001b[K\n", | |
"remote: Total 72 (delta 5), reused 53 (delta 0), pack-reused 0\u001b[K\n", | |
"Unpacking objects: 100% (72/72), done.\n", | |
"\u001b[33mYou are using pip version 9.0.3, however version 10.0.1 is available.\n", | |
"You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "Z-sWhqjhE8Ar", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "7a73d990-e6a5-4b38-b511-cc37162a12d2", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1524496871540, | |
"user_tz": -540, | |
"elapsed": 1656, | |
"user": { | |
"displayName": "Shunya Ueta", | |
"photoUrl": "//lh5.googleusercontent.com/-h1SgqhkQ2rE/AAAAAAAAAAI/AAAAAAAA4rM/Lbs_FhJHV1o/s50-c-k-no/photo.jpg", | |
"userId": "102611377204768639960" | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Print where the NEologd dictionary is expected: MeCab's dictionary directory plus the NEologd folder name.\n", | |
"!echo \"$(mecab-config --dicdir)/mecab-ipadic-neologd\"" | |
], | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd\r\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "ui5sRgj-GCnG", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"import subprocess\n", | |
"\n", | |
"import MeCab" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "Tjmm-nA1GKUd", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"base_uri": "https://localhost:8080/", | |
"height": 442 | |
}, | |
"outputId": "5e8ed53f-fa29-4362-f801-e857da256256", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1524496988864, | |
"user_tz": -540, | |
"elapsed": 680, | |
"user": { | |
"displayName": "Shunya Ueta", | |
"photoUrl": "//lh5.googleusercontent.com/-h1SgqhkQ2rE/AAAAAAAAAAI/AAAAAAAA4rM/Lbs_FhJHV1o/s50-c-k-no/photo.jpg", | |
"userId": "102611377204768639960" | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Ask MeCab for its dictionary directory instead of hard-coding a distro/architecture-specific path.\n", | |
"dic_root = subprocess.check_output([\"mecab-config\", \"--dicdir\"], universal_newlines=True).strip()\n", | |
"\n", | |
"sample_txt = \"彼女はペンパイナッポーアッポーペンと恋ダンスを踊った。\"\n", | |
"\n", | |
"# Baseline: tagger built with no options, i.e. MeCab's default dictionary.\n", | |
"m = MeCab.Tagger()\n", | |
"print(\"Mecab:\\n\", m.parse(sample_txt))\n", | |
"\n", | |
"# Same sentence parsed with the NEologd dictionary selected through MeCab's -d option.\n", | |
"path = \"-d \" + dic_root + \"/mecab-ipadic-neologd\"\n", | |
"m = MeCab.Tagger(path)\n", | |
"print(\"Mecab ipadic NEologd:\\n\",m.parse(sample_txt))" | |
], | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Mecab:\n", | |
" 彼女\t名詞,代名詞,一般,*,*,*,彼女,カノジョ,カノジョ\n", | |
"は\t助詞,係助詞,*,*,*,*,は,ハ,ワ\n", | |
"ペンパイナッポーアッポーペン\t名詞,一般,*,*,*,*,*\n", | |
"と\t助詞,並立助詞,*,*,*,*,と,ト,ト\n", | |
"恋\t名詞,一般,*,*,*,*,恋,コイ,コイ\n", | |
"ダンス\t名詞,サ変接続,*,*,*,*,ダンス,ダンス,ダンス\n", | |
"を\t助詞,格助詞,一般,*,*,*,を,ヲ,ヲ\n", | |
"踊っ\t動詞,自立,*,*,五段・ラ行,連用タ接続,踊る,オドッ,オドッ\n", | |
"た\t助動詞,*,*,*,特殊・タ,基本形,た,タ,タ\n", | |
"。\t記号,句点,*,*,*,*,。,。,。\n", | |
"EOS\n", | |
"\n", | |
"Mecab ipadic NEologd:\n", | |
" 彼女\t名詞,代名詞,一般,*,*,*,彼女,カノジョ,カノジョ\n", | |
"は\t助詞,係助詞,*,*,*,*,は,ハ,ワ\n", | |
"ペンパイナッポーアッポーペン\t名詞,固有名詞,一般,*,*,*,Pen-Pineapple-Apple-Pen,ペンパイナッポーアッポーペン,ペンパイナッポーアッポーペン\n", | |
"と\t助詞,並立助詞,*,*,*,*,と,ト,ト\n", | |
"恋ダンス\t名詞,固有名詞,一般,*,*,*,恋ダンス,コイダンス,コイダンス\n", | |
"を\t助詞,格助詞,一般,*,*,*,を,ヲ,ヲ\n", | |
"踊っ\t動詞,自立,*,*,五段・ラ行,連用タ接続,踊る,オドッ,オドッ\n", | |
"た\t助動詞,*,*,*,特殊・タ,基本形,た,タ,タ\n", | |
"。\t記号,句点,*,*,*,*,。,。,。\n", | |
"EOS\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "jEBJ-tGMGWPa", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment