@shunsukeaihara
Created May 22, 2013 08:32
Base of a CaboCha wrapper
# -*- coding: utf-8 -*-
import CaboCha
import chardet
import syslog
import math
from collections import defaultdict
import unicodedata

syslog.openlog('cabocha_wrapper', syslog.LOG_PID | syslog.LOG_PERROR, syslog.LOG_SYSLOG)

AVG_LENGTH = 200.0  # decay constant (in tokens) for the position-based weighting below
def is_japanese_string(string):
    # Not implemented in this gist.
    pass
def to_utf8(string, encode=None):
    # Decode (guessing the encoding when not given), NFKC-normalize, and re-encode as UTF-8.
    if not encode:
        encode = chardet.detect(string)["encoding"]
    uni = unicode(string, encode)
    return unicodedata.normalize("NFKC", uni).encode('utf-8')
def is_contentword(token):
    # Return the base form of a content word (verb, noun, adjective, adverb), or None otherwise.
    pos_base = token.feature_list(0)
    if pos_base == "動詞":
        return token.feature_list(6)
    elif pos_base == "名詞":
        return token.surface  # surface is an attribute, not a method; use the surface form for nouns
    elif pos_base == "形容詞":
        return token.feature_list(6)
    elif pos_base == "副詞":
        return token.feature_list(6)
    else:
        return None
class Phrase(object):
    """One bunsetsu chunk: its tokens and its dependency links within the sentence."""
    def __init__(self, chunk, phrases, cid):
        self._chunk = chunk
        self._sentence = phrases  # shared list of all phrases in the sentence
        self._cid = cid
        self._dependants = []
        self._tokens = []
        self._head = None

    def push_token(self, token):
        self._tokens.append(token)

    def set_dependency(self):
        # chunk.link is the index of the head chunk, or -1 when there is none
        if self._chunk.link >= 0:
            self._head = self._sentence[self._chunk.link]
            self._head.add_dependant(self._cid)

    def add_dependant(self, cid):
        self._dependants.append(cid)

    def is_ne(self):
        # Return the first non-'O' named-entity tag in the phrase, or None.
        for token in self._tokens:
            if token.ne != 'O':
                return token.ne
        return None

    def is_num(self):
        # True if the phrase contains a numeral token.
        for token in self._tokens:
            if token.feature_list(1) == "数":
                return True
        return False

    @property
    def surface(self):
        return "".join([token.surface for token in self._tokens])

    @property
    def tokens(self):
        return self._tokens

    @property
    def base_tokens(self):
        return [t.feature_list(6) for t in self._tokens]
class CaboChaWrapper(object):
    def __init__(self, option):
        self._cabo = CaboCha.Parser(option)

    def parse_sentence(self, sentence):
        # Parse one sentence and group its tokens into Phrase objects (one per chunk).
        tree = self._cabo.parse(sentence)
        phrases = []
        try:
            size = tree.size()
        except Exception:
            syslog.syslog(syslog.LOG_ALERT, 'cabocha failed! %s' % sentence)
            return phrases
        phrase = None
        cid = 0
        for i in xrange(size):
            token = tree.token(i)
            if token.chunk:  # a new chunk starts at this token
                if phrase:
                    phrases.append(phrase)
                phrase = Phrase(token.chunk, phrases, cid)
                cid += 1
            phrase.push_token(token)
        if phrase:
            phrases.append(phrase)
        # dependencies can only be resolved once every phrase is in the list
        for phrase in phrases:
            phrase.set_dependency()
        return phrases
class TextFeatureExtractor(object):
    def __init__(self, option=None):
        if option is None:
            option = ""
        self._cabo = CaboChaWrapper(option)

    def to_dictionary(self, vec):
        # Plain term-frequency dictionary, ignoring None entries.
        dic = defaultdict(int)
        for v in vec:
            if not v:
                continue
            dic[v] += 1
        return dic

    def to_weighted_dictionary(self, vec):
        # Term frequencies weighted by position: earlier tokens count more (exp(-i / AVG_LENGTH)).
        dic = defaultdict(float)
        for i, v in enumerate(vec):
            if not v:
                continue
            dic[unicode(v, 'utf-8')] += math.exp(-i / AVG_LENGTH)
        return dic

    def remove_none(self, vec):
        # Drop the None placeholders and decode the remaining tokens.
        return [unicode(token, 'utf-8') for token in vec if token]
    def parse(self, string):
        """
        Parse a text and return a flat list of content-word base forms
        (None for tokens that are skipped).  Convert the result with:
          normal:   to_dictionary -- term frequency dict
          weighted: to_weighted_dictionary -- word position weighted frequency dict
          list:     remove_none -- return a list, not a dictionary
        """
        tokens = []
        string = to_utf8(string)
        for sentence in string.split('。'):
            phrases = self._cabo.parse_sentence(sentence + "。")
            for phrase in phrases:
                if phrase.is_num():  # the phrase contains a numeral
                    # TODO: add numeral normalization later; for now skip noun phrases containing numerals
                    for token in phrase.tokens:
                        tokens.append(None)
                else:  # TODO: decide later how to handle named entities
                    for token in phrase.tokens:
                        tokens.append(is_contentword(token))
        return tokens
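
# ------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original gist): it only
# illustrates how the classes above are meant to be combined; the sample
# text below is an assumption made for the example.
if __name__ == '__main__':
    extractor = TextFeatureExtractor()                    # default CaboCha options ("")
    text = "太郎は花子に本を渡した。花子はその本を読んだ。"
    tokens = extractor.parse(text)                        # base forms, None for skipped tokens
    freq = extractor.to_dictionary(tokens)                # term frequency dict
    weighted = extractor.to_weighted_dictionary(tokens)   # position-weighted dict
    words = extractor.remove_none(tokens)                 # plain unicode word list
    for word, count in freq.items():
        print word, count                                 # keys are UTF-8 byte strings (Python 2)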