Skip to content

Instantly share code, notes, and snippets.

@legendmohe
Created November 11, 2014 09:15
Show Gist options
  • Save legendmohe/ef72632d4ec5ae5defc2 to your computer and use it in GitHub Desktop.
Save legendmohe/ef72632d4ec5ae5defc2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# encoding: utf-8
# Copyright 2014 Xinyu, He <legendmohe@foxmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import json
import itertools
# from pprint import pprint
# word_order = ['while', 'if', 'then', 'else', 'delay', 'trigger', 'stop', 'finish',
# 'action', 'target', 'next', 'logical', 'compare']
class Corrector(object):
    """Fuzzy-correct command strings against a vocabulary loaded from JSON.

    The vocabulary is read from ``Corrector.path``.  Its ``"command"`` entry
    is used as a mapping from category name to a list of words (that is how
    the values are consumed below).  Candidate commands are built from those
    words, and an input fragment is replaced by the candidate whose set of
    characters overlaps it best.
    """

    # JSON vocabulary file; a relative path is resolved by ``open`` against
    # the current working directory.
    path = 'init.json'
    # Categories combined (one word from each, in this order) into candidates.
    words = ['action', 'target']
    # Categories whose words are also candidates on their own.
    single_words = ['action']
    # Categories whose words split an input into separately corrected parts.
    sep_words = ['while', 'if', 'then', 'else', 'next', 'logical', 'compare']

    def __init__(self):
        """Load the vocabulary and precompute the lookup tables.

        Raises:
            Exception: if the parsed JSON document is falsy (e.g. ``{}``).
            KeyError: if the document has no ``"command"`` entry.
        """
        # Read as bytes so the json module does the decoding itself instead
        # of relying on the platform's default text encoding.
        with open(Corrector.path, "rb") as f:
            json_obj = json.load(f)
        if not json_obj:
            raise Exception("invaild json file.")
        cmd_object = json_obj["command"]

        # Collect separator words with an explicit loop: a side-effecting
        # ``map`` is lazy on Python 3 and would leave the list empty.
        sep_list = []
        for name in Corrector.sep_words:
            # Skip empty words; an empty alternative matches at every position.
            sep_list.extend(w for w in cmd_object.get(name, ()) if w)
        # The words are joined as regex alternatives exactly as given.
        # ``None`` means "no separators configured"; compiling an empty
        # pattern would instead split the input between every character.
        self._pattern = re.compile(u"|".join(sep_list)) if sep_list else None
        # A message is any run of non-whitespace characters wrapped in '#'.
        self._msg_pattern = re.compile(u"#[\\S]+#")

        # Each entry is (set of characters, candidate text).  Order matters:
        # on equal scores the earliest candidate wins.
        term_set = []
        word_arrays = [cmd_object[x] for x in Corrector.words if x in cmd_object]
        if word_arrays:
            # Joining the whole combination works for any number of present
            # categories, not only when exactly two of them are configured.
            for combo in itertools.product(*word_arrays):
                term = u"".join(combo)
                term_set.append((set(term), term))
        for name in Corrector.single_words:
            for word in cmd_object.get(name, ()):
                term_set.append((set(word), word))
        self._term_set = term_set

    def evalueate(self, src, target):
        """Score how well two character sets overlap.

        Args:
            src: a set of characters.
            target: the characters to compare against.

        Returns:
            ``len(intersection) / (len(symmetric difference) + 1)`` as a
            float; the ``+ 1`` keeps the divisor non-zero for equal sets.
        """
        shared = len(src.intersection(target))
        differing = len(src.symmetric_difference(target))
        return shared * 1.0 / (differing + 1)

    def correct_cmd(self, src):
        """Correct one command fragment (no separators inside).

        ``#...#`` messages are cut out of ``src`` before matching and are
        re-attached, concatenated into a single ``#...#`` suffix, after the
        chosen candidate.

        Args:
            src: the fragment to correct.

        Returns:
            The best-scoring candidate (plus the message suffix, if any), or
            an empty string when nothing overlaps or no candidates exist.
        """
        if not self._term_set:
            # Nothing to match against; ``max`` below would raise on empty input.
            return u""

        msg_parts = []
        cmd_chars = set()
        cursor = 0
        for m in self._msg_pattern.finditer(src):
            # Strip the surrounding '#' from the message.
            msg_parts.append(src[m.start() + 1:m.end() - 1])
            cmd_chars.update(src[cursor:m.start()])
            cursor = m.end()
        # Text after the last message (or the whole input when there is no
        # message) is part of the command as well.
        cmd_chars.update(src[cursor:])

        scored = [(self.evalueate(chars, cmd_chars), term)
                  for chars, term in self._term_set]
        best_score, best_term = max(scored, key=lambda item: item[0])
        if best_score == 0:
            return u""

        msg = u"".join(msg_parts)
        if msg:
            return best_term + u"#%s#" % msg
        return best_term

    def correct(self, src):
        """Correct a full input string.

        The input is split on the configured separator words; every part in
        between is corrected with ``correct_cmd`` and the separators are put
        back unchanged in their original positions.

        Args:
            src: the string to correct.

        Returns:
            The corrected string.
        """
        if self._pattern is None:
            return self.correct_cmd(src)

        pieces = []
        cursor = 0
        for m in self._pattern.finditer(src):
            pieces.append(self.correct_cmd(src[cursor:m.start()]))
            pieces.append(m.group(0))
            cursor = m.end()
        pieces.append(self.correct_cmd(src[cursor:]))
        return u"".join(pieces)
if __name__ == "__main__":
    # Small demo: correct one sample command and show it next to the input.
    s = u"停止播放"
    corrector = Corrector()
    # A single pre-formatted argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print(u"%s -> %s" % (s, corrector.correct(s)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment