Created
May 4, 2014 10:01
-
-
Save nezuQ/11513898 to your computer and use it in GitHub Desktop.
NLTKのplotで日本語を使う方法 ref: http://qiita.com/nezuq/items/7d165448638d15993451
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Draw an NLTK dispersion plot with Japanese labels.
#
# NOTE: this script targets Python 2 -- it relies on the `reload` builtin,
# `sys.setdefaultencoding` and the `unicode` type, none of which exist in
# Python 3.
import sys
# `sys.setdefaultencoding` is removed from the `sys` namespace at startup;
# reloading the module makes it available again.
reload(sys)
sys.setdefaultencoding('UTF-8')
import MeCab
import nltk
from numpy import *
from nltk.corpus.reader import *
from nltk.corpus.reader.util import *
from nltk.text import Text
import jptokenizer

### Set matplotlib's default font ### <- Point 1: explicitly specify a Japanese font
import matplotlib
import matplotlib.font_manager as font_manager
# Path to the TTF font file
font_path = '/usr/share/fonts/truetype/fonts-japanese-gothic.ttf'
# Load the font's properties from that file
font_prop = font_manager.FontProperties(fname = font_path)
# Use the font's name as matplotlib's default font family
matplotlib.rcParams['font.family'] = font_prop.get_name()

### Build the Japanese corpus (unicode) ### <- Point 2: keep the words as unicode
# Load the corpus
jp_sent_tokenizer = nltk.RegexpTokenizer(u'[^ 「」!?。]*[!?。]')
reader = PlaintextCorpusReader("/home/ユーザ/デスクトップ", r'NKMK.txt',
                               encoding='utf-8',
                               para_block_reader=read_line_block,
                               sent_tokenizer=jp_sent_tokenizer,
                               word_tokenizer=jptokenizer.JPMeCabTokenizer())
# Get the words from the corpus, explicitly converted to unicode
nkmk = Text(unicode(w) for w in reader.words())

### Plot ### <- Point 3: pass the arguments as unicode too
nkmk.dispersion_plot([u'にこ',u'真姫',u'ここあ',u'こころ'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment