Skip to content

Instantly share code, notes, and snippets.

@nezuQ
Created April 19, 2014 10:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nezuQ/11079953 to your computer and use it in GitHub Desktop.
Save nezuQ/11079953 to your computer and use it in GitHub Desktop.
ニコニコ大百科データからMeCab辞書を生成する ref: http://qiita.com/nezuq/items/875ac806d9b1afb3798d
python nc2mecab.py
# -*- encoding: utf-8 -*-
import os
import csv
import re
def main(pth='head', wtnme='ncnc.csv'):
    """Convert Nico Nico Pedia CSV dumps into a MeCab dictionary CSV.

    Every file in the directory ``pth`` is parsed as CSV.  Rows whose
    fourth column equals ``'a'`` are kept; the second column is used as
    the word and the third column is copied into the two fields that
    follow the base form (presumably reading and pronunciation -- confirm
    against the input data specification).

    Before being written, each word is cleaned up:

    * a word that consists solely of a month/day tag (e.g. ``3月5日``)
      is dropped;
    * a trailing parenthesised suffix (ASCII or full-width parentheses),
      such as a genre name, is removed;
    * the remainder is lower-cased, and skipped if nothing is left.

    Args:
        pth: Directory containing the input CSV files.
        wtnme: Path of the dictionary CSV file to create (overwritten).

    Raises:
        OSError: If ``pth`` cannot be listed or a file cannot be opened.
    """
    # Pattern of text to strip from a word: a bare "M月D日" date tag, or a
    # parenthesised suffix at the end of the tag.  Note the quantifier is
    # {1,2}; "[1,2]" would be a character class matching '1', ',' or '2'.
    rmvptn = re.compile(r'(^\d{1,2}月\d{1,2}日$)|([((].+[))]$)')

    # The csv module needs text-mode files opened with newline='' so that
    # it controls line endings and embedded newlines itself.
    with open(wtnme, 'w', encoding='utf-8', newline='') as wtfh:
        wt = csv.writer(wtfh)
        # Sorted so the output order does not depend on the file system.
        for fnme in sorted(os.listdir(pth)):
            src = os.path.join(pth, fnme)
            with open(src, encoding='utf-8', newline='') as rdfh:
                for row in csv.reader(rdfh):
                    # Skip blank/short rows and rows not flagged 'a'.
                    if len(row) < 4 or row[3] != 'a':
                        continue
                    wrd = rmvptn.sub('', row[1]).lower()
                    if not wrd:
                        continue
                    # Longer words get a lower cost, floored at -32768.
                    # Length is counted in characters.
                    cost = int(max(-32768.0, 6000 - 200 * (len(wrd) ** 1.3)))
                    wt.writerow([
                        wrd, '0', '0', cost,
                        '名詞', '一般', '*', '*', '*', '*',
                        wrd, row[2], row[2], 'ニコニコ大百科',
                    ])
# Run the conversion only when this file is executed as a script
# (e.g. ``python nc2mecab.py``), not when it is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment