# legendmohe/corrector.py

## corrector.py
#!/usr/bin/env python
# encoding: utf-8
# Copyright 2014 Xinyu, He <legendmohe@foxmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import re
import json
import itertools
# from pprint import pprint


# word_order = ['while', 'if', 'then', 'else', 'delay', 'trigger', 'stop', 'finish',
#          'action', 'target', 'next', 'logical', 'compare']

class Corrector(object):
    """Snap noisy command text onto the closest known command phrase.

    The vocabulary is read from the JSON file named by ``path``. Its
    ``"command"`` object maps category names to lists of words:

    * words of the ``sep_words`` categories are separators that split an
      input into sub-commands (they are copied to the output unchanged);
    * one word from each of the ``words`` categories, concatenated in
      order, forms a candidate phrase;
    * every word of the ``single_words`` categories is a candidate on its
      own.

    Matching is character based: an input and a candidate are compared as
    *sets* of characters, so character order and repetition are ignored.
    """

    path = 'init.json'
    words = ['action', 'target']
    single_words = ['action']
    sep_words = ['while', 'if', 'then', 'else', 'next', 'logical', 'compare']

    def __init__(self):
        """Load the vocabulary from ``Corrector.path`` and build the matchers.

        Raises:
            ValueError: if the JSON document is empty (or is not valid JSON).
            KeyError: if the document has no ``"command"`` entry.
        """
        # Read bytes and decode explicitly so loading does not depend on the
        # platform's default text encoding.
        with open(Corrector.path, "rb") as f:
            json_obj = json.loads(f.read().decode("utf-8"))
        if not json_obj:
            raise ValueError("invaild json file.")

        cmd_object = json_obj["command"]

        # Separator words are joined verbatim into one alternation; empty
        # entries are skipped because an empty alternative matches everywhere.
        sep_list = []
        for key in Corrector.sep_words:
            sep_list.extend(word for word in cmd_object.get(key, []) if word)
        if sep_list:
            self._pattern = re.compile(u"|".join(sep_list))
        else:
            # "(?!)" never matches, so an input without configured separators
            # is handled as a single command instead of being shredded by an
            # empty pattern.
            self._pattern = re.compile(u"(?!)")
        self._msg_pattern = re.compile(u"#[\\S]+#")

        # Each candidate is stored as (set of its characters, phrase).
        word_arrays = [cmd_object[x] for x in Corrector.words if x in cmd_object]
        term_set = []
        if word_arrays:
            for combo in itertools.product(*word_arrays):
                term = u"".join(combo)
                term_set.append((set(term), term))
        for key in Corrector.single_words:
            for word in cmd_object.get(key, []):
                term_set.append((set(word), word))
        self._term_set = term_set

    def evalueate(self, src, target):
        """Score how similar two character sets are.

        Returns the number of shared elements divided by one plus the number
        of elements found in only one of the sets, as a float. Identical
        sets score ``len(src)``; disjoint sets score ``0.0``.
        """
        shared = len(src.intersection(target))
        differing = len(src.symmetric_difference(target))
        return shared * 1.0 / (differing + 1)

    def correct_cmd(self, src):
        """Correct one separator-free command fragment.

        ``#...#`` spans (runs of non-whitespace between two ``#``) are treated
        as free-form messages: they are excluded from matching and their
        contents are re-attached, concatenated, as a single ``#msg#`` suffix.

        Returns the best-scoring candidate phrase (the earliest one on a
        tie), or ``u""`` when no candidate shares a character with the
        command text.
        """
        msg_parts = []
        cmd_set = set()
        last_end = 0
        for m in self._msg_pattern.finditer(src):
            msg_parts.append(m.group(0)[1:-1])
            cmd_set.update(src[last_end:m.start()])
            last_end = m.end()
        # Text after the last message belongs to the command as well.
        cmd_set.update(src[last_end:])

        if not cmd_set or not self._term_set:
            return u""

        scored = [(self.evalueate(chars, cmd_set), term)
                  for chars, term in self._term_set]
        best_score, best_term = max(scored, key=lambda pair: pair[0])
        if best_score == 0:
            return u""

        msg = u"".join(msg_parts)
        if msg:
            return best_term + u"#%s#" % msg
        return best_term

    def correct(self, src):
        """Correct a whole sentence.

        The input is split on the configured separator words, every fragment
        between separators is corrected with ``correct_cmd``, and the result
        is reassembled with the separators kept in place.
        """
        pieces = []
        last_end = 0
        for m in self._pattern.finditer(src):
            pieces.append(self.correct_cmd(src[last_end:m.start()]))
            pieces.append(m.group(0))
            last_end = m.end()
        pieces.append(self.correct_cmd(src[last_end:]))
        return u"".join(pieces)

if __name__ == "__main__":
    # Small demo: correct one sample sentence using the vocabulary found at
    # Corrector.path and show the input next to the result.
    s = u"停止播放"
    corrector = Corrector()
    # A single pre-formatted argument prints identically whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print(u"%s -> %s" % (s, corrector.correct(s)))
	#!/usr/bin/env python
	# encoding: utf-8
	# Copyright 2014 Xinyu, He <legendmohe@foxmail.com>
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.


	import os
	import re
	import json
	import itertools
	# from pprint import pprint


	# word_order = ['while', 'if', 'then', 'else', 'delay', 'trigger', 'stop', 'finish',
	# 'action', 'target', 'next', 'logical', 'compare']

	class Corrector(object):
	    """Snap noisy command text onto the closest known command phrase.

	    The vocabulary is read from the JSON file named by ``path``. Its
	    ``"command"`` object maps category names to lists of words:

	    * words of the ``sep_words`` categories are separators that split an
	      input into sub-commands (they are copied to the output unchanged);
	    * one word from each of the ``words`` categories, concatenated in
	      order, forms a candidate phrase;
	    * every word of the ``single_words`` categories is a candidate on its
	      own.

	    Matching is character based: an input and a candidate are compared as
	    *sets* of characters, so character order and repetition are ignored.
	    """

	    path = 'init.json'
	    words = ['action', 'target']
	    single_words = ['action']
	    sep_words = ['while', 'if', 'then', 'else', 'next', 'logical', 'compare']

	    def __init__(self):
	        """Load the vocabulary from ``Corrector.path`` and build the matchers.

	        Raises:
	            ValueError: if the JSON document is empty (or is not valid JSON).
	            KeyError: if the document has no ``"command"`` entry.
	        """
	        # Read bytes and decode explicitly so loading does not depend on the
	        # platform's default text encoding.
	        with open(Corrector.path, "rb") as f:
	            json_obj = json.loads(f.read().decode("utf-8"))
	        if not json_obj:
	            raise ValueError("invaild json file.")

	        cmd_object = json_obj["command"]

	        # Separator words are joined with a real alternation bar (an escaped
	        # "\|" would only match a literal pipe between the words). Empty
	        # entries are skipped because an empty alternative matches everywhere.
	        sep_list = []
	        for key in Corrector.sep_words:
	            sep_list.extend(word for word in cmd_object.get(key, []) if word)
	        if sep_list:
	            self._pattern = re.compile(u"|".join(sep_list))
	        else:
	            # "(?!)" never matches, so an input without configured separators
	            # is handled as a single command instead of being shredded by an
	            # empty pattern.
	            self._pattern = re.compile(u"(?!)")
	        self._msg_pattern = re.compile(u"#[\\S]+#")

	        # Each candidate is stored as (set of its characters, phrase).
	        word_arrays = [cmd_object[x] for x in Corrector.words if x in cmd_object]
	        term_set = []
	        if word_arrays:
	            for combo in itertools.product(*word_arrays):
	                term = u"".join(combo)
	                term_set.append((set(term), term))
	        for key in Corrector.single_words:
	            for word in cmd_object.get(key, []):
	                term_set.append((set(word), word))
	        self._term_set = term_set

	    def evalueate(self, src, target):
	        """Score how similar two character sets are.

	        Returns the number of shared elements divided by one plus the number
	        of elements found in only one of the sets, as a float. Identical
	        sets score ``len(src)``; disjoint sets score ``0.0``.
	        """
	        shared = len(src.intersection(target))
	        differing = len(src.symmetric_difference(target))
	        return shared * 1.0 / (differing + 1)

	    def correct_cmd(self, src):
	        """Correct one separator-free command fragment.

	        ``#...#`` spans (runs of non-whitespace between two ``#``) are treated
	        as free-form messages: they are excluded from matching and their
	        contents are re-attached, concatenated, as a single ``#msg#`` suffix.

	        Returns the best-scoring candidate phrase (the earliest one on a
	        tie), or ``u""`` when no candidate shares a character with the
	        command text.
	        """
	        msg_parts = []
	        cmd_set = set()
	        last_end = 0
	        for m in self._msg_pattern.finditer(src):
	            msg_parts.append(m.group(0)[1:-1])
	            cmd_set.update(src[last_end:m.start()])
	            last_end = m.end()
	        # Text after the last message belongs to the command as well.
	        cmd_set.update(src[last_end:])

	        if not cmd_set or not self._term_set:
	            return u""

	        scored = [(self.evalueate(chars, cmd_set), term)
	                  for chars, term in self._term_set]
	        best_score, best_term = max(scored, key=lambda pair: pair[0])
	        if best_score == 0:
	            return u""

	        msg = u"".join(msg_parts)
	        if msg:
	            return best_term + u"#%s#" % msg
	        return best_term

	    def correct(self, src):
	        """Correct a whole sentence.

	        The input is split on the configured separator words, every fragment
	        between separators is corrected with ``correct_cmd``, and the result
	        is reassembled with the separators kept in place.
	        """
	        pieces = []
	        last_end = 0
	        for m in self._pattern.finditer(src):
	            pieces.append(self.correct_cmd(src[last_end:m.start()]))
	            pieces.append(m.group(0))
	            last_end = m.end()
	        pieces.append(self.correct_cmd(src[last_end:]))
	        return u"".join(pieces)

	if __name__ == "__main__":
	    # Small demo: correct one sample sentence using the vocabulary found at
	    # Corrector.path and show the input next to the result.
	    s = u"停止播放"
	    corrector = Corrector()
	    # A single pre-formatted argument prints identically whether ``print``
	    # is the Python 2 statement or the Python 3 function.
	    print(u"%s -> %s" % (s, corrector.correct(s)))