@shunsukeaihara
Created May 22, 2013 08:32
Base of a CaboCha wrapper
# -*- coding: utf-8 -*-
import CaboCha
import chardet
import syslog
import math
from collections import defaultdict
import unicodedata

syslog.openlog('cabocha_wrapper', syslog.LOG_PID | syslog.LOG_PERROR, syslog.LOG_SYSLOG)

AVG_LENGTH = 200.0  # decay constant (in tokens) for the position-based weighting below
def is_japanese_string(string):
    # Not implemented in this gist.
    pass
def to_utf8(string, encode=None):
    # Decode (guessing the encoding when not given), NFKC-normalize, and re-encode as UTF-8.
    if not encode:
        encode = chardet.detect(string)["encoding"]
    uni = unicode(string, encode)
    return unicodedata.normalize("NFKC", uni).encode('utf-8')
def is_contentword(token):
    # Return the base form of a content word (verb, noun, adjective, adverb), or None otherwise.
    pos_base = token.feature_list(0)
    if pos_base == "動詞":
        return token.feature_list(6)
    elif pos_base == "名詞":
        return token.surface  # surface is an attribute, not a method; use the surface form for nouns
    elif pos_base == "形容詞":
        return token.feature_list(6)
    elif pos_base == "副詞":
        return token.feature_list(6)
    else:
        return None
class Phrase(object):
    """One bunsetsu chunk: its tokens and its dependency links within the sentence."""
    def __init__(self, chunk, phrases, cid):
        self._chunk = chunk
        self._sentence = phrases  # shared list of all phrases in the sentence
        self._cid = cid
        self._dependants = []
        self._tokens = []
        self._head = None

    def push_token(self, token):
        self._tokens.append(token)

    def set_dependency(self):
        # chunk.link is the index of the head chunk, or -1 when there is none
        if self._chunk.link >= 0:
            self._head = self._sentence[self._chunk.link]
            self._head.add_dependant(self._cid)

    def add_dependant(self, cid):
        self._dependants.append(cid)

    def is_ne(self):
        # Return the first non-'O' named-entity tag in the phrase, or None.
        for token in self._tokens:
            if token.ne != 'O':
                return token.ne
        return None

    def is_num(self):
        # True if the phrase contains a numeral token.
        for token in self._tokens:
            if token.feature_list(1) == "数":
                return True
        return False

    @property
    def surface(self):
        return "".join([token.surface for token in self._tokens])

    @property
    def tokens(self):
        return self._tokens

    @property
    def base_tokens(self):
        return [t.feature_list(6) for t in self._tokens]
class CaboChaWrapper(object):
    def __init__(self, option):
        self._cabo = CaboCha.Parser(option)

    def parse_sentence(self, sentence):
        # Parse one sentence and group its tokens into Phrase objects (one per chunk).
        tree = self._cabo.parse(sentence)
        phrases = []
        try:
            size = tree.size()
        except Exception:
            syslog.syslog(syslog.LOG_ALERT, 'cabocha failed! %s' % sentence)
            return phrases
        phrase = None
        cid = 0
        for i in xrange(size):
            token = tree.token(i)
            if token.chunk:  # a new chunk starts at this token
                if phrase:
                    phrases.append(phrase)
                phrase = Phrase(token.chunk, phrases, cid)
                cid += 1
            phrase.push_token(token)
        if phrase:
            phrases.append(phrase)
        # dependencies can only be resolved once every phrase is in the list
        for phrase in phrases:
            phrase.set_dependency()
        return phrases
class TextFeatureExtractor(object):
    def __init__(self, option=None):
        if option is None:
            option = ""
        self._cabo = CaboChaWrapper(option)

    def to_dictionary(self, vec):
        # Plain term-frequency dictionary, ignoring None entries.
        dic = defaultdict(int)
        for v in vec:
            if not v:
                continue
            dic[v] += 1
        return dic

    def to_weighted_dictionary(self, vec):
        # Term frequencies weighted by position: earlier tokens count more (exp(-i / AVG_LENGTH)).
        dic = defaultdict(float)
        for i, v in enumerate(vec):
            if not v:
                continue
            dic[unicode(v, 'utf-8')] += math.exp(-i / AVG_LENGTH)
        return dic

    def remove_none(self, vec):
        # Drop the None placeholders and decode the remaining tokens.
        return [unicode(token, 'utf-8') for token in vec if token]
    def parse(self, string):
        """
        Parse a text and return a flat list of content-word base forms
        (None for tokens that are skipped).  Convert the result with:
          normal:   to_dictionary -- term frequency dict
          weighted: to_weighted_dictionary -- word position weighted frequency dict
          list:     remove_none -- return a list, not a dictionary
        """
        tokens = []
        string = to_utf8(string)
        for sentence in string.split('。'):
            phrases = self._cabo.parse_sentence(sentence + "。")
            for phrase in phrases:
                if phrase.is_num():  # the phrase contains a numeral
                    # TODO: add numeral normalization later; for now skip noun phrases containing numerals
                    for token in phrase.tokens:
                        tokens.append(None)
                else:  # TODO: decide later how to handle named entities
                    for token in phrase.tokens:
                        tokens.append(is_contentword(token))
        return tokens
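
# ------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original gist): it only
# illustrates how the classes above are meant to be combined; the sample
# text below is an assumption made for the example.
if __name__ == '__main__':
    extractor = TextFeatureExtractor()                    # default CaboCha options ("")
    text = "太郎は花子に本を渡した。花子はその本を読んだ。"
    tokens = extractor.parse(text)                        # base forms, None for skipped tokens
    freq = extractor.to_dictionary(tokens)                # term frequency dict
    weighted = extractor.to_weighted_dictionary(tokens)   # position-weighted dict
    words = extractor.remove_none(tokens)                 # plain unicode word list
    for word, count in freq.items():
        print word, count                                 # keys are UTF-8 byte strings (Python 2)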