Created
May 6, 2017 11:09
-
-
Save ratsgo/9d67443a515f0fed62da66d647575d4b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class BranchingEntropy:
    """Count left/right one-character extensions of substrings and score them.

    For every space-free substring ("word") whose length lies between
    ``min_length`` and ``max_length``, ``train`` records how often each
    left-extended and right-extended form occurs.  The counts live in two
    nested mappings keyed by encoder indices:

    * ``self.L[word_index][left_extension_index]  -> count``
    * ``self.R[word_index][right_extension_index] -> count``

    Strings are mapped to integers through ``self.encoder`` (an
    ``IntegerEncoder`` defined outside this block).  This class only relies
    on its ``fit``, ``encode``, ``decode``, ``load`` and ``save`` methods and
    its ``inverse`` attribute.
    NOTE(review): ``encode`` is presumed to return ``-1`` for unknown strings
    (the branch lookups treat ``-1`` as "not found") -- confirm against the
    encoder implementation.
    """

    def __init__(self, min_length=2, max_length=7):
        """Create an empty model.

        Args:
            min_length: Shortest substring length that is counted.
            max_length: Longest substring length that is counted.
        """
        self.min_length = min_length
        self.max_length = max_length
        self.encoder = IntegerEncoder()
        self.L = defaultdict(lambda: defaultdict(int))
        self.R = defaultdict(lambda: defaultdict(int))

    def get_all_access_variety(self):
        """Return ``{word_index: (n_left, n_right)}`` for every counted word.

        Keys are the raw encoder indices (they are not decoded to strings).
        """
        word_indices = set(self.L) | set(self.R)
        return {index: self.get_access_variety(index) for index in word_indices}

    def get_access_variety(self, word, ignore_space=False):
        """Return the number of distinct (left, right) extensions of ``word``.

        Args:
            word: A string, or an ``int`` that is used directly as the index.
            ignore_space: Drop extensions whose decoded text contains a space.
        """
        return (
            len(self.get_left_branch(word, ignore_space)),
            len(self.get_right_branch(word, ignore_space)),
        )

    def get_all_branching_entropies(self, ignore_space=False):
        """Return ``{decoded_word: (left_entropy, right_entropy)}`` for all words."""
        word_indices = set(self.L) | set(self.R)
        return {
            self.encoder.decode(index): self.get_branching_entropy(index, ignore_space)
            for index in word_indices
        }

    def get_branching_entropy(self, word, ignore_space=False):
        """Return ``(left_entropy, right_entropy)`` of the extensions of ``word``."""
        be_l = self.entropy(self.get_left_branch(word, ignore_space))
        be_r = self.entropy(self.get_right_branch(word, ignore_space))
        return (be_l, be_r)

    def entropy(self, dic):
        """Return the Shannon entropy (natural log) of a ``{key: count}`` mapping.

        An empty mapping, or one whose counts sum to zero, yields ``0.0``.
        Zero counts contribute nothing (``0 * log 0`` is taken as ``0``) rather
        than raising a math domain error.
        """
        if not dic:
            return 0.0
        total = sum(dic.values())
        if not total:
            return 0.0
        acc = 0
        for freq in dic.values():
            if not freq:
                continue
            prob = freq / total
            acc += prob * math.log(prob)
        return -1 * acc

    def _get_branch(self, counter, word, ignore_space):
        """Look up the extension counts of ``word`` in ``counter`` (``L`` or ``R``)."""
        word_index = word if isinstance(word, int) else self.encoder.encode(word)
        # Membership is tested before indexing so the defaultdict does not
        # grow an empty entry for unknown words.
        if word_index == -1 or word_index not in counter:
            return {}
        branch = counter[word_index]
        if not ignore_space:
            return branch
        decode = self.encoder.decode
        return {
            ext: freq
            for ext, freq in branch.items()
            if ' ' not in decode(ext, unknown=' ')
        }

    def get_left_branch(self, word, ignore_space=False):
        """Return ``{left_extension_index: count}`` for ``word`` (``{}`` if unknown).

        Without ``ignore_space`` the stored inner mapping itself is returned,
        not a copy.
        """
        return self._get_branch(self.L, word, ignore_space)

    def get_right_branch(self, word, ignore_space=False):
        """Return ``{right_extension_index: count}`` for ``word`` (``{}`` if unknown).

        Without ``ignore_space`` the stored inner mapping itself is returned,
        not a copy.
        """
        return self._get_branch(self.R, word, ignore_space)

    def counter_size(self):
        """Return ``(len(self.L), len(self.R))``."""
        return (len(self.L), len(self.R))

    def prune_extreme_case(self, min_count):
        """Drop words whose total extension count is not above ``min_count``.

        Returns:
            ``(before_size, after_size)`` where each is a ``counter_size()`` tuple.
        """
        # TODO: encoder remove & compatify
        before_size = self.counter_size()
        self.L = defaultdict(
            lambda: defaultdict(int),
            {word: dic for word, dic in self.L.items() if sum(dic.values()) > min_count},
        )
        self.R = defaultdict(
            lambda: defaultdict(int),
            {word: dic for word, dic in self.R.items() if sum(dic.values()) > min_count},
        )
        after_size = self.counter_size()
        return (before_size, after_size)

    def train(self, sents, min_count=5, num_for_pruning=10000):
        """Accumulate extension counts from an iterable of sentences.

        Each sentence is stripped and padded with one space on both sides.
        For every start position and every window length in
        ``[min_length, max_length]`` the space-free substring is counted
        together with its left and right extension.  When the neighbouring
        character is the padding/space, the extension reaches one character
        further (clamped to the sentence bounds).

        Args:
            sents: Iterable of strings; blank sentences are skipped.
            min_count: Threshold handed to ``prune_extreme_case``.
            num_for_pruning: Prune whenever ``(sentence_number + 1)`` is a
                multiple of this value; ``0`` or a negative value disables
                pruning.
        """
        num_sent = -1  # stays -1 when ``sents`` is empty
        for num_sent, sent in enumerate(sents):
            sent = sent.strip()
            if not sent:
                continue
            sent = ' ' + sent + ' '
            length = len(sent)
            for i in range(1, length - 1):
                for window in range(self.min_length, self.max_length + 1):
                    end = i + window
                    # Longer windows only run further past the end or keep
                    # containing the same space, so stop at the first miss.
                    if end - 1 >= length:
                        break
                    word = sent[i:end]
                    if ' ' in word:
                        break
                    # Encoder calls stay in the order word, left, right so the
                    # assigned indices are unchanged.
                    word_index = self.encoder.fit(word)
                    if sent[i - 1] == ' ':
                        left_extension = sent[max(0, i - 2):end]
                    else:
                        left_extension = sent[i - 1:end]
                    # ``end`` is a valid index here: the padded sentence ends
                    # with a space, so a space-free word cannot reach it.
                    if sent[end] == ' ':
                        right_extension = sent[i:min(length, end + 2)]
                    else:
                        right_extension = sent[i:end + 1]
                    left_index = self.encoder.fit(left_extension)
                    right_index = self.encoder.fit(right_extension)
                    self.L[word_index][left_index] += 1
                    self.R[word_index][right_index] += 1
            if (num_for_pruning > 0) and ((num_sent + 1) % num_for_pruning == 0):
                before, after = self.prune_extreme_case(min_count)
                sys.stdout.write('\rnum sent = %d: %s --> %s' % (num_sent, str(before), str(after)))
        # Final prune: covers the case where the last sentence was blank and
        # therefore skipped the in-loop check.  ``before``/``after`` are taken
        # from this call so the report never uses stale or unbound values.
        if (num_sent >= 0) and (num_for_pruning > 0) and ((num_sent + 1) % num_for_pruning == 0):
            before, after = self.prune_extreme_case(min_count)
            sys.stdout.write('\rnum_sent = %d: %s --> %s' % (num_sent, str(before), str(after)))

    def load(self, model_fname, encoder_fname):
        """Load the encoder and the extension counts written by ``save``.

        The model file has a parameter header, the ``min_length max_length``
        line, a left-side header followed by ``word extension freq`` records,
        then a ``# right side extension`` header followed by right-side
        records.  Loaded counts are written into the existing ``L``/``R``
        mappings.  Errors while reading the model file are printed, not raised.
        """
        self.encoder.load(encoder_fname)
        try:
            with open(model_fname, encoding='utf-8') as f:
                next(f)  # SKIP: parameters (min_length, max_length)
                token = next(f).split()
                self.min_length = int(token[0])
                self.max_length = int(token[1])
                next(f)  # SKIP: left side extension
                # Records go to the left counter until the right-side header.
                counter = self.L
                for line in f:
                    if '# right side extension' in line:
                        counter = self.R
                        continue
                    token = line.split()
                    if not token:
                        continue  # tolerate blank lines
                    word = int(token[0])
                    extension = int(token[1])
                    freq = int(token[2])
                    counter[word][extension] = freq
        except Exception as e:
            print(e)

    def save(self, model_fname, encoder_fname):
        """Save the encoder and write the extension counts as a UTF-8 text file.

        Errors while writing the model file are printed, not raised.
        """
        self.encoder.save(encoder_fname)
        try:
            with open(model_fname, 'w', encoding='utf-8') as f:
                f.write("# parameters (min_length max_length)\n")
                f.write('%d %d\n' % (self.min_length, self.max_length))
                sections = (
                    ('# left side extension\n', self.L),
                    ('# right side extension\n', self.R),
                )
                for header, counter in sections:
                    f.write(header)
                    for word, extension_dict in counter.items():
                        for extension, freq in extension_dict.items():
                            f.write('%d %d %d\n' % (word, extension, freq))
        except Exception as e:
            print(e)

    def words(self):
        """Return the set built from ``self.encoder.inverse``."""
        return set(self.encoder.inverse)
class MaxScoreTokenizer:
    """Split whitespace-separated tokens into non-overlapping subtokens.

    Every substring of length >= 2 (up to ``max_length``) is a candidate.
    Candidates are taken greedily in the order produced by ``_initialize``;
    each chosen candidate discards all candidates overlapping it, and any
    uncovered stretch of the token is emitted as its own subtoken.

    Subtokens are represented as 5-tuples
    ``(text, begin, end, score, length)``.
    """

    # Upper bound on selection rounds in ``_find``.
    _MAX_FIND_ITERATIONS = 100

    def __init__(self, max_length=10, scores=None, default_score=0.0):
        """Create a tokenizer.

        Args:
            max_length: Longest candidate subtoken length.
            scores: Mapping ``subtoken -> score``; stored by reference.  A
                fresh empty dict is used when omitted, so instances never
                share state through the default argument.
            default_score: Score assigned to subtokens missing from ``scores``.
        """
        self.max_length = max_length
        self.scores = {} if scores is None else scores
        self.ds = default_score

    def tokenize(self, sentence):
        """Return the flat list of subtoken strings for ``sentence``.

        The sentence is split on whitespace and each piece is segmented with
        ``_recursive_tokenize``; only the text of each subtoken is kept.
        """
        result = []
        for token in sentence.split():
            result.extend(piece[0] for piece in self._recursive_tokenize(token))
        return result

    def _recursive_tokenize(self, token, range_l=0, debug=False):
        """Segment one whitespace-free ``token`` into subtoken tuples.

        Args:
            token: The string to segment.
            range_l: Longest candidate length; ``0`` means
                ``min(self.max_length, len(token))``.
            debug: Pretty-print the ordered candidate list when true.

        Returns:
            Subtoken tuples sorted by their begin offset.  Tokens of length
            <= 2, and tokens for which no candidate exists, come back as a
            single subtoken carrying the default score.
        """
        length = len(token)
        if length <= 2:
            return [(token, 0, length, self.ds, length)]
        if range_l == 0:
            range_l = min(self.max_length, length)
        scores = self._initialize(token, range_l, length)
        if debug:
            from pprint import pprint
            pprint(scores)
        result = self._find(scores)
        if not result:
            # No candidate of length >= 2 fits (e.g. max_length < 2): keep the
            # token whole instead of failing on result[-1] below.
            return [(token, 0, length, self.ds, length)]
        adds = self._add_inter_subtokens(token, result)
        if result[-1][2] != length:
            adds += self._add_first_subtoken(token, result)
        if result[0][1] != 0:
            adds += self._add_last_subtoken(token, result)
        return sorted(result + adds, key=lambda x: x[1])

    def _initialize(self, token, range_l, length):
        """Enumerate candidate subtokens of ``token`` in selection order.

        Candidates are all substrings with ``2 <= len <= range_l``.

        NOTE(review): the ordering key is ``(subtoken text, begin)``
        descending, so the looked-up score does not influence which candidate
        is chosen first.  An earlier revision ordered by ``(score, length)``
        descending; confirm which ordering is intended before changing it,
        since existing output depends on the current one.
        """
        candidates = []
        for b in range(length - 1):
            for r in range(2, range_l + 1):
                e = b + r
                if e > length:
                    break  # longer spans from this start also overrun
                subtoken = token[b:e]
                score = self.scores.get(subtoken, self.ds)
                candidates.append((subtoken, b, e, score, r))
        return sorted(candidates, key=lambda x: (x[0], x[1]), reverse=True)

    def _find(self, scores):
        """Greedily pick non-overlapping candidates from an ordered list.

        The head of the list is chosen, every remaining candidate whose
        ``[begin, end)`` span intersects it is dropped, and the process
        repeats until no candidate is left or the round limit is hit.  The
        input list is left unmodified.

        Returns:
            The chosen subtoken tuples sorted by begin offset.
        """
        candidates = list(scores)
        result = []
        num_iter = 0
        while candidates:
            chosen = candidates[0]
            result.append(chosen)
            b, e = chosen[1], chosen[2]
            # Two half-open spans intersect iff each starts before the other ends.
            candidates = [
                cand for cand in candidates[1:]
                if not (cand[1] < e and b < cand[2])
            ]
            num_iter += 1
            if num_iter > self._MAX_FIND_ITERATIONS:
                break
        return sorted(result, key=lambda x: x[1])

    def _add_inter_subtokens(self, token, result):
        """Return subtokens filling the gaps between consecutive chosen spans.

        Gap subtokens always carry the default score.
        """
        adds = []
        for left, right in zip(result, result[1:]):
            b, e = left[2], right[1]
            if b == e:
                continue
            adds.append((token[b:e], b, e, self.ds, e - b))
        return adds

    def _add_first_subtoken(self, token, result):
        """Return the uncovered tail of ``token`` after the last chosen span.

        Despite the name, this produces the trailing piece
        (``token[result[-1].end:]``).
        """
        b = result[-1][2]
        subtoken = token[b:]
        score = self.scores.get(subtoken, self.ds)
        return [(subtoken, b, len(token), score, len(subtoken))]

    def _add_last_subtoken(self, token, result):
        """Return the uncovered head of ``token`` before the first chosen span.

        Despite the name, this produces the leading piece
        (``token[:result[0].begin]``).
        """
        e = result[0][1]
        subtoken = token[0:e]
        score = self.scores.get(subtoken, self.ds)
        return [(subtoken, 0, e, score, e)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
['\ufeff', '특히', '기', '판과', '기', '판이', '합착된', '표준규격의', '표시패널을', '재가공', '하여', '표준', '규격과는', '다른', '크기,', '형상으로', '제작되는', '표시패널에', '관한', '것', '이다.', '최근,', '음극선관의', '단', '점인', '무게와', '부', '피를', '줄일', '수', '있는', '평판', '표시장치들이', '개', '발되고', '있다.', '평판', '표시장치는', '액정', '표시소자,', '전계', '방출', '표시소자,', '플라즈마', '디스', '플레이', '패널,', '전계발광소자,', '전기영동', '표시소자,', '플렉서블', '디스', '플레이', '등이', '있다.', '전계발광소자는', '무기', '전계발광소자와', '유기', '전계발광소자를', '포', '함한다.', '유기', '전계발광소자는', '유기발광다', '이오드소', '자를', '포', '함한다.기존', '디스', '플레이', '모듈', '메이', '커는', '아래의', '표', '1과', '같은', '표준', '해상도를', '만족', '하는', '규격', '으로', '표시패널을', '제작', '하고', '있다.', '해상', ':Y', '최근에는', '디스', '플레이의', '활용', '범위', '가', '확대되면서', '표준', '규격', '이외의', '비', '표준', '크기나', '특이한', '형태의', '디스', '플레이를', '요구', '하는', '수요', '가', '증가', '하고', '있다.', '극', '장,', '결', '혼식장,', '호텔', '로', '비,', '관공', '서,', '관', '광지', '장소에서', '안내와', '홍보를', '위한', '디', '지털', '광고', '판으로', '사', '용되는', '디', '지털', '정보', '디스', '플레이,', '자동', '차,', '선박,', '비', '행기', '항법장치로', '사', '용되는', '표시장치', '등이', '디스', '플레이업계의', '새로운', '제', '품군으로', '서', '주목을', '받고', '있다.', '현재', '사', '용되고', '있는', 'LCD,', 'FED,', 'E', 'LD,', 'E', 'PD,', 'PDP,', '플렉서블', '디스', '플레이', '표시장치는', '표준규격으로', '제', '조되기', '때문', '에,', '다', '양한', '크기', '가', '요구되는', 'D', 'ID나', '항법표시장치로', '사', '용되는', '것이', '어려운', '실상', '이다.기존', '디스', '플레이', '모듈', '메이', '커는', '비', '표준', '규격의', '수요', '가', '있더라도', '시', '장이', '표준', '규격의', '디스', '플레이', '시', '장에', '비', '하여', '매우', '작기', '때', '문에', '비', '표준', '규격의', '표시패널을', '생산', '하지', '않고', '있다.', '다', '양한', '크기의', '표시패널을', '제작', '하려면', '크기별로', '포토마스크를', '디', '자인,', '설계', '및', '제작', '하여야', '하는데,', '포토마스크의', '디', '자인,', '설계', '및', '제작에는', '고가의', '비', '용이', '소', '요되기', '때', '문에', '비', '표준의', '표시패널을', '별도의', '포토마스크를', '이용', '하여', '제작', '하는', '것은', '매우', '비', '효율적이다.', '비', '표준', '규격의', '표시패널', '또는,', '기존', '디스', '플레이', '모듈', '업', '체에서', '제작', '하지', '않은', '크기나', '형태의', '표시패널', '수', '요를', '충족할', '수', '있는', '방', '법이', '요구되고', '있다.', '위', '하여,', '최근에는', '기존', '디스', '플레이', '모듈', '메이', '커에', 
'의해', '완성된', '표준', '규격의', '표시패널을', '재가공', '하여', '비', '표준', '규격의', '표시패널이나', '특이한', '형태의', '표시패널을', '제작', '하는', '방', '법에', '대', '하여', '연구', '가', '진', '행되고', '있다.', '이', '방', '법은', '완성된', '표준', '규격의', '표시패널의', '일부를', '원', '하는', '크기나', '형태로', '커팅', '하고', '커', '팅면을', '실링', '하여야', '하나,', '커', '팅면을', '실링', '하기', '가', '쉽지', '않고', '커', '팅면을', '실링한', '후에도', '실링이', '견고', '하지', '않아', '실링', '불량이', '발', '생되고', '있다.액', '티브', '매', '트릭스', '박막', '트랜지스터', '액정', '표시장치의', '기존', 'LCD', '모듈', '메이', '커에', '의해', '완성된', '액정', '표시패널은', '및', '상부', '유리기', '판과', '하부', '유리기판', '사', '이에', '액정', '층이', '실런', '트로', '밀', '봉되어', '있다.', '상부', '유리기', '판에는', '컬러', '필터', '어레이', '가', '형성되고', '하부', '유리기', '판에는', 'TFT', '어레이', '가', '형성되며,', '사', '이에는', '액정', '층의', '셀갭을', '유지', '하기', '위한', '스', '페이서', '가', '형성된다.', '커', '팅전에,', '액정', '표시패널의', '내부', '진공에', '의', '해서', '액정', '층의', '셀갭이', '스', '페이서', '높', '이만큼', '유', '지되', '지만,', '커', '팅과', '동', '시에', '액정', '표시패널', '진공압이', '변', '하여', '액', '정층', '내에', '공기', '가', '혼입되거나', '자', '중에', '의해', '휘거나', '틀어지면서', '액', '정이', '부분', '적으로', '모', '이거나', '줄어들게', '된다.', '따', '라서', '커팅', '후에', '발생', '하는', '문', '제를', '해결', '해야', '할', '필요성이', '대', '두되', '었다.', '고가의', '포토마스크를', '별도로', '제작', '하지', '않고도', '표준', '규격의', '표시패널을', '재가공', '하여', '소비자', '가', '원', '하는', '크기나', '형상의', '표시패널을', '제작할', '수', '있으므로,', '저렴한', '비용', '으로', '비', '표준', '규격을', '사용', '하는', '표시장치에', '적용될', '수', '있는', '효과를', '얻을', '수', '있을', '뿐', '아니라', '보다', '신뢰성', '있고', '견', '고한', '표시패널을', '얻을', '수', '있게', '된다.', '목', '적은', '종래', '기', '술의', '문', '제점들을', '해결하고자', '안', '출된', '발명', '으로', '서,', '표시패널의', '커팅', '후의', '문', '제를', '발', '생시', '키기', '않을', '뿐', '아니라', '고가의', '포토마스크를', '별도로', '제작', '하지', '않고도', '원', '하는', '크기와', '형상을', '갖는', '표시패널을', '제공', '하는데', '있다.', '목', '적을', '달성', '하기', '위', '하여,', '표시패널은', '신', '호라인들이', '형성된', '표시영역을', '갖는', '기', '판과,', '기', '판의', '표시영역에', '대응', '하는', '표시영역을', '갖는', '기', '판과,', '기', '판과', '기', '판을', '합착하기', '위해', '기', '판의', '표시영역', '및', '기', '판의', '표시영역의', '어느', '하나의', '표시영역', '내에', '적어도', 
'일부', '가', '형성되는', '실런', '트를', '포', '함하는', '것을', '특징으로', '한다.', '다른', '표시패널은', '신', '호라인들이', '형성된', '표시영역을', '갖는', '기', '판;', '기', '판의', '표시영역에', '대응', '하는', '표시영역을', '갖는', '기', '판;', '및', '기', '판과', '기', '판을', '합착하기', '위해', '기', '판의', '표시영역', '컬러', '필터와', '적어도', '일부', '가', '중', '첩되도록', '형성되는', '실런', '트를', '포', '함하는', '것을', '특징으로', '한다.', '구성', '에서,', '표시패널은', '액정', '표시소자의', '표시패널,', '전계', '방출', '표시소자의', '표시패널,', '플라즈마', '디스', '플레이', '패널의', '표시패널,', '전계발광소자의', '표시패널,', '전기영동', '표시소자의', '표시패널', '플렉서블', '표시소자의', '표시패널', '중', '어느', '하나인', '것을', '특징으로', '한다.또한,', '실런', '트는', '폐루', '프형', '또는', '일부분이', '개', '방된', '개', '루프', '형으로', '형성된', '것을', '특징으로', '한다.', '실런', '트는', '기', '판의', '표시영역에', '형성된', '신', '호라인들', '및', '박막', '트랜지스터들', '중의', '적어도', '하나와', '중', '첩되거나', '교', '차되도록', '형성되는', '것을', '특징으로', '한다.']