-
-
Save ratsgo/9d67443a515f0fed62da66d647575d4b to your computer and use it in GitHub Desktop.
class BranchingEntropy:
    """Count left/right one-character extensions of substrings and score them.

    For every space-free substring whose length lies in
    ``[min_length, max_length]``, ``train`` records which longer substrings
    extend it by one character to the left (``self.L``) and to the right
    (``self.R``).  Both counters map ``word index -> {extension index -> freq}``
    where the indices come from ``self.encoder``.

    NOTE(review): ``IntegerEncoder`` is defined outside this block.  The code
    relies on its ``fit``, ``encode``, ``decode(index, unknown=...)``, ``load``,
    ``save`` methods and its ``inverse`` attribute, and treats an ``encode``
    result of ``-1`` as "unknown word" -- confirm against the encoder.
    """

    def __init__(self, min_length=2, max_length=7):
        self.min_length = min_length
        self.max_length = max_length
        self.encoder = IntegerEncoder()
        # word index -> {extension index -> frequency}
        self.L = defaultdict(lambda: defaultdict(int))
        self.R = defaultdict(lambda: defaultdict(int))

    def get_all_access_variety(self):
        """Return ``{word index: (n_left, n_right)}`` for every counted word.

        Keys are the encoded indices (they are not decoded here).
        """
        # (Translated original note: "modified by me" -- union of both key sets.)
        words = set(self.L) | set(self.R)
        return {word: self.get_access_variety(word) for word in words}

    def get_access_variety(self, word, ignore_space=False):
        """Return the number of distinct (left, right) extensions of ``word``."""
        return (
            len(self.get_left_branch(word, ignore_space)),
            len(self.get_right_branch(word, ignore_space)),
        )

    def get_all_branching_entropies(self, ignore_space=False):
        """Return ``{decoded word: (left entropy, right entropy)}`` for all words."""
        words = set(self.L) | set(self.R)
        return {
            self.encoder.decode(word): self.get_branching_entropy(word, ignore_space)
            for word in words
        }

    def get_branching_entropy(self, word, ignore_space=False):
        """Return the ``(left, right)`` branching entropy of ``word``."""
        be_l = self.entropy(self.get_left_branch(word, ignore_space))
        be_r = self.entropy(self.get_right_branch(word, ignore_space))
        return (be_l, be_r)

    def entropy(self, dic):
        """Return the Shannon entropy (natural log) of a ``{key: count}`` mapping.

        An empty mapping yields ``0.0``.
        """
        if not dic:
            return 0.0
        sum_count = sum(dic.values())
        entropy = 0
        for freq in dic.values():
            prob = freq / sum_count
            entropy += prob * math.log(prob)
        return -1 * entropy

    def _get_branch(self, counter, word, ignore_space):
        """Look up the extension counts of ``word`` in ``counter`` (``L`` or ``R``).

        ``word`` may be an already-encoded ``int`` index or a raw word that is
        passed through ``self.encoder.encode``.  Returns ``{}`` when the word is
        unknown or has no entry; the membership test avoids inserting an empty
        entry into the defaultdict.
        """
        word_index = word if isinstance(word, int) else self.encoder.encode(word)
        if word_index == -1 or word_index not in counter:
            return {}
        branch = counter[word_index]
        if not ignore_space:
            return branch
        # Drop extensions whose decoded text contains a space (word boundary).
        return {
            w: f
            for w, f in branch.items()
            if ' ' not in self.encoder.decode(w, unknown=' ')
        }

    def get_left_branch(self, word, ignore_space=False):
        """Return ``{extension index: freq}`` of the left extensions of ``word``."""
        return self._get_branch(self.L, word, ignore_space)

    def get_right_branch(self, word, ignore_space=False):
        """Return ``{extension index: freq}`` of the right extensions of ``word``."""
        return self._get_branch(self.R, word, ignore_space)

    def counter_size(self):
        """Return ``(len(L), len(R))``."""
        return (len(self.L), len(self.R))

    def prune_extreme_case(self, min_count):
        """Drop words whose total extension count is not above ``min_count``.

        Returns ``(size before, size after)`` as reported by ``counter_size``.
        """
        # TODO: encoder remove & compatify
        before_size = self.counter_size()
        self.L = defaultdict(
            lambda: defaultdict(int),
            {word: dic for word, dic in self.L.items() if sum(dic.values()) > min_count},
        )
        self.R = defaultdict(
            lambda: defaultdict(int),
            {word: dic for word, dic in self.R.items() if sum(dic.values()) > min_count},
        )
        after_size = self.counter_size()
        return (before_size, after_size)

    def train(self, sents, min_count=5, num_for_pruning=10000):
        """Accumulate extension counts from an iterable of sentences.

        Every ``num_for_pruning`` sentences (when positive) the counters are
        pruned with ``prune_extreme_case(min_count)`` and progress is written to
        stdout.  Blank sentences are skipped entirely, including the pruning
        check for that sentence.
        """
        # Stays -1 when ``sents`` is empty so the final check cannot hit an
        # unbound loop variable.
        num_sent = -1
        for num_sent, sent in enumerate(sents):
            sent = sent.strip()
            if not sent:
                continue
            # Pad with one space on each side so boundary lookups stay in range.
            sent = ' ' + sent + ' '
            length = len(sent)
            for i in range(1, length - 1):
                for window in range(self.min_length, self.max_length + 1):
                    # Windows only grow, so once one runs past the sentence or
                    # includes a space every longer one does as well.
                    if i + window - 1 >= length:
                        break
                    word = sent[i:i + window]
                    if ' ' in word:
                        break
                    word_index = self.encoder.fit(word)
                    # When the neighbouring character is a space, reach one
                    # character further so the extension spans the boundary.
                    if sent[i - 1] == ' ':
                        left_extension = sent[max(0, i - 2):i + window]
                    else:
                        left_extension = sent[i - 1:i + window]
                    if sent[i + window] == ' ':
                        right_extension = sent[i:min(length, i + window + 2)]
                    else:
                        right_extension = sent[i:i + window + 1]
                    left_index = self.encoder.fit(left_extension)
                    right_index = self.encoder.fit(right_extension)
                    self.L[word_index][left_index] += 1
                    self.R[word_index][right_index] += 1
            if (num_for_pruning > 0) and ((num_sent + 1) % num_for_pruning == 0):
                before, after = self.prune_extreme_case(min_count)
                sys.stdout.write('\rnum sent = %d: %s --> %s' % (num_sent, str(before), str(after)))
        if (num_sent >= 0) and (num_for_pruning > 0) and ((num_sent + 1) % num_for_pruning == 0):
            # Report the sizes of this final prune rather than stale values.
            before, after = self.prune_extreme_case(min_count)
            sys.stdout.write('\rnum_sent = %d: %s --> %s' % (num_sent, str(before), str(after)))

    def load(self, model_fname, encoder_fname):
        """Load the encoder and the counters written by ``save``.

        Errors while reading the model file are printed, not raised
        (best-effort behaviour kept from the original).
        """
        self.encoder.load(encoder_fname)
        try:
            with open(model_fname, encoding='utf-8') as f:
                next(f)  # SKIP: parameters (min_length, max_length)
                token = next(f).split()
                self.min_length = int(token[0])
                self.max_length = int(token[1])
                next(f)  # SKIP: left side extension
                # The left-side section comes first; switch to the right side
                # only once its header line is seen.
                is_right_side = False
                for line in f:
                    if '# right side extension' in line:
                        is_right_side = True
                        continue
                    token = line.split()
                    if not token:
                        continue  # tolerate blank lines
                    word = int(token[0])
                    extension = int(token[1])
                    freq = int(token[2])
                    if is_right_side:
                        self.R[word][extension] = freq
                    else:
                        self.L[word][extension] = freq
        except Exception as e:
            print(e)

    def save(self, model_fname, encoder_fname):
        """Save the encoder and write the counters as text.

        Layout: a parameters header and the ``min_length max_length`` line,
        then a left-side header followed by ``word extension freq`` rows, then
        a right-side header followed by its rows.  Errors while writing are
        printed, not raised.
        """
        self.encoder.save(encoder_fname)
        try:
            with open(model_fname, 'w', encoding='utf-8') as f:
                f.write("# parameters (min_length max_length)\n")
                f.write('%d %d\n' % (self.min_length, self.max_length))
                f.write('# left side extension\n')
                for word, extension_dict in self.L.items():
                    for extension, freq in extension_dict.items():
                        f.write('%d %d %d\n' % (word, extension, freq))
                f.write('# right side extension\n')
                for word, extension_dict in self.R.items():
                    for extension, freq in extension_dict.items():
                        f.write('%d %d %d\n' % (word, extension, freq))
        except Exception as e:
            print(e)

    def words(self):
        """Return the set built from ``self.encoder.inverse``."""
        return set(self.encoder.inverse)
class MaxScoreTokenizer:
    """Split whitespace-separated tokens into subtokens, best score first.

    Candidates are tuples ``(subtoken, begin, end, score, length)`` where
    ``begin``/``end`` are slice offsets into the token.  The highest-scoring
    candidate is taken first, overlapping candidates are discarded, and any
    uncovered stretches are filled in with the default score.

    ``scores`` maps subtoken text to a score.  Scores must be mutually
    comparable with ``default_score`` because candidates are sorted by score.
    """

    def __init__(self, max_length=10, scores=None, default_score=0.0):
        self.max_length = max_length
        # A fresh dict per instance; a shared ``{}`` default would leak
        # entries between tokenizers.
        self.scores = {} if scores is None else scores
        self.ds = default_score

    def tokenize(self, sentence):
        """Return the flat list of subtoken strings for ``sentence``."""
        return [
            subtoken[0]
            for token in sentence.split()
            for subtoken in self._recursive_tokenize(token)
        ]

    def _recursive_tokenize(self, token, range_l=0, debug=False):
        """Return candidate tuples covering ``token``, ordered by begin offset.

        Tokens of length <= 2 are returned whole with the default score.
        ``range_l`` caps the candidate length; ``0`` means
        ``min(self.max_length, len(token))``.
        """
        length = len(token)
        if length <= 2:
            return [(token, 0, length, self.ds, length)]
        if range_l == 0:
            range_l = min(self.max_length, length)
        scores = self._initialize(token, range_l, length)
        if debug:
            from pprint import pprint
            pprint(scores)
        result = self._find(scores)
        if not result:
            # No candidate of length >= 2 fits (e.g. max_length < 2): keep the
            # token whole rather than indexing into an empty list.
            return [(token, 0, length, self.ds, length)]
        adds = self._add_inter_subtokens(token, result)
        # Despite its name, _add_first_subtoken supplies the uncovered tail ...
        if result[-1][2] != length:
            adds += self._add_first_subtoken(token, result)
        # ... and _add_last_subtoken supplies the uncovered head.
        if result[0][1] != 0:
            adds += self._add_last_subtoken(token, result)
        return sorted(result + adds, key=lambda x: x[1])

    def _initialize(self, token, range_l, length):
        """Enumerate substrings of length 2..``range_l`` as scored candidates.

        The list is sorted by ``(score, length)`` descending so that ``_find``
        meets the best-scoring (then longest) candidate first.
        """
        scores = []
        for b in range(0, length - 1):
            for r in range(2, range_l + 1):
                e = b + r
                if e > length:
                    break  # longer windows from this offset overrun too
                subtoken = token[b:e]
                score = self.scores.get(subtoken, self.ds)
                scores.append((subtoken, b, e, score, r))
        return sorted(scores, key=lambda x: (x[3], x[4]), reverse=True)

    def _find(self, scores):
        """Greedily pick non-overlapping candidates in the given order.

        Returns the picked candidates sorted by begin offset.  The input list
        is not modified.
        """
        candidates = list(scores)
        result = []
        num_iter = 0
        while candidates:
            best = candidates[0]
            _, b, e, _, _ = best
            result.append(best)
            # Keep only candidates whose [begin, end) span does not intersect
            # the span just selected.
            candidates = [c for c in candidates[1:] if not (c[1] < e and b < c[2])]
            if not candidates:
                break
            num_iter += 1
            # Iteration cap kept from the original; anything left unselected
            # is filled in afterwards by the _add_* helpers.
            if num_iter > 100:
                break
        return sorted(result, key=lambda x: x[1])

    def _add_inter_subtokens(self, token, result):
        """Return default-scored candidates for gaps between adjacent results."""
        adds = []
        for i, base in enumerate(result[:-1]):
            b = base[2]
            e = result[i + 1][1]
            if b == e:
                continue
            adds.append((token[b:e], b, e, self.ds, e - b))
        return adds

    def _add_first_subtoken(self, token, result):
        """Return the candidate from the end of the last result to the token end."""
        b = result[-1][2]
        subtoken = token[b:]
        score = self.scores.get(subtoken, self.ds)
        return [(subtoken, b, len(token), score, len(subtoken))]

    def _add_last_subtoken(self, token, result):
        """Return the candidate from the token start to the first result."""
        e = result[0][1]
        subtoken = token[0:e]
        score = self.scores.get(subtoken, self.ds)
        return [(subtoken, 0, e, score, e)]
['\ufeff', '특히', '기', '판과', '기', '판이', '합착된', '표준규격의', '표시패널을', '재가공', '하여', '표준', '규격과는', '다른', '크기,', '형상으로', '제작되는', '표시패널에', '관한', '것', '이다.', '최근,', '음극선관의', '단', '점인', '무게와', '부', '피를', '줄일', '수', '있는', '평판', '표시장치들이', '개', '발되고', '있다.', '평판', '표시장치는', '액정', '표시소자,', '전계', '방출', '표시소자,', '플라즈마', '디스', '플레이', '패널,', '전계발광소자,', '전기영동', '표시소자,', '플렉서블', '디스', '플레이', '등이', '있다.', '전계발광소자는', '무기', '전계발광소자와', '유기', '전계발광소자를', '포', '함한다.', '유기', '전계발광소자는', '유기발광다', '이오드소', '자를', '포', '함한다.기존', '디스', '플레이', '모듈', '메이', '커는', '아래의', '표', '1과', '같은', '표준', '해상도를', '만족', '하는', '규격', '으로', '표시패널을', '제작', '하고', '있다.', '해상', ':Y', '최근에는', '디스', '플레이의', '활용', '범위', '가', '확대되면서', '표준', '규격', '이외의', '비', '표준', '크기나', '특이한', '형태의', '디스', '플레이를', '요구', '하는', '수요', '가', '증가', '하고', '있다.', '극', '장,', '결', '혼식장,', '호텔', '로', '비,', '관공', '서,', '관', '광지', '장소에서', '안내와', '홍보를', '위한', '디', '지털', '광고', '판으로', '사', '용되는', '디', '지털', '정보', '디스', '플레이,', '자동', '차,', '선박,', '비', '행기', '항법장치로', '사', '용되는', '표시장치', '등이', '디스', '플레이업계의', '새로운', '제', '품군으로', '서', '주목을', '받고', '있다.', '현재', '사', '용되고', '있는', 'LCD,', 'FED,', 'E', 'LD,', 'E', 'PD,', 'PDP,', '플렉서블', '디스', '플레이', '표시장치는', '표준규격으로', '제', '조되기', '때문', '에,', '다', '양한', '크기', '가', '요구되는', 'D', 'ID나', '항법표시장치로', '사', '용되는', '것이', '어려운', '실상', '이다.기존', '디스', '플레이', '모듈', '메이', '커는', '비', '표준', '규격의', '수요', '가', '있더라도', '시', '장이', '표준', '규격의', '디스', '플레이', '시', '장에', '비', '하여', '매우', '작기', '때', '문에', '비', '표준', '규격의', '표시패널을', '생산', '하지', '않고', '있다.', '다', '양한', '크기의', '표시패널을', '제작', '하려면', '크기별로', '포토마스크를', '디', '자인,', '설계', '및', '제작', '하여야', '하는데,', '포토마스크의', '디', '자인,', '설계', '및', '제작에는', '고가의', '비', '용이', '소', '요되기', '때', '문에', '비', '표준의', '표시패널을', '별도의', '포토마스크를', '이용', '하여', '제작', '하는', '것은', '매우', '비', '효율적이다.', '비', '표준', '규격의', '표시패널', '또는,', '기존', '디스', '플레이', '모듈', '업', '체에서', '제작', '하지', '않은', '크기나', '형태의', '표시패널', '수', '요를', '충족할', '수', '있는', '방', '법이', '요구되고', '있다.', '위', '하여,', '최근에는', '기존', '디스', '플레이', '모듈', '메이', '커에', 
'의해', '완성된', '표준', '규격의', '표시패널을', '재가공', '하여', '비', '표준', '규격의', '표시패널이나', '특이한', '형태의', '표시패널을', '제작', '하는', '방', '법에', '대', '하여', '연구', '가', '진', '행되고', '있다.', '이', '방', '법은', '완성된', '표준', '규격의', '표시패널의', '일부를', '원', '하는', '크기나', '형태로', '커팅', '하고', '커', '팅면을', '실링', '하여야', '하나,', '커', '팅면을', '실링', '하기', '가', '쉽지', '않고', '커', '팅면을', '실링한', '후에도', '실링이', '견고', '하지', '않아', '실링', '불량이', '발', '생되고', '있다.액', '티브', '매', '트릭스', '박막', '트랜지스터', '액정', '표시장치의', '기존', 'LCD', '모듈', '메이', '커에', '의해', '완성된', '액정', '표시패널은', '및', '상부', '유리기', '판과', '하부', '유리기판', '사', '이에', '액정', '층이', '실런', '트로', '밀', '봉되어', '있다.', '상부', '유리기', '판에는', '컬러', '필터', '어레이', '가', '형성되고', '하부', '유리기', '판에는', 'TFT', '어레이', '가', '형성되며,', '사', '이에는', '액정', '층의', '셀갭을', '유지', '하기', '위한', '스', '페이서', '가', '형성된다.', '커', '팅전에,', '액정', '표시패널의', '내부', '진공에', '의', '해서', '액정', '층의', '셀갭이', '스', '페이서', '높', '이만큼', '유', '지되', '지만,', '커', '팅과', '동', '시에', '액정', '표시패널', '진공압이', '변', '하여', '액', '정층', '내에', '공기', '가', '혼입되거나', '자', '중에', '의해', '휘거나', '틀어지면서', '액', '정이', '부분', '적으로', '모', '이거나', '줄어들게', '된다.', '따', '라서', '커팅', '후에', '발생', '하는', '문', '제를', '해결', '해야', '할', '필요성이', '대', '두되', '었다.', '고가의', '포토마스크를', '별도로', '제작', '하지', '않고도', '표준', '규격의', '표시패널을', '재가공', '하여', '소비자', '가', '원', '하는', '크기나', '형상의', '표시패널을', '제작할', '수', '있으므로,', '저렴한', '비용', '으로', '비', '표준', '규격을', '사용', '하는', '표시장치에', '적용될', '수', '있는', '효과를', '얻을', '수', '있을', '뿐', '아니라', '보다', '신뢰성', '있고', '견', '고한', '표시패널을', '얻을', '수', '있게', '된다.', '목', '적은', '종래', '기', '술의', '문', '제점들을', '해결하고자', '안', '출된', '발명', '으로', '서,', '표시패널의', '커팅', '후의', '문', '제를', '발', '생시', '키기', '않을', '뿐', '아니라', '고가의', '포토마스크를', '별도로', '제작', '하지', '않고도', '원', '하는', '크기와', '형상을', '갖는', '표시패널을', '제공', '하는데', '있다.', '목', '적을', '달성', '하기', '위', '하여,', '표시패널은', '신', '호라인들이', '형성된', '표시영역을', '갖는', '기', '판과,', '기', '판의', '표시영역에', '대응', '하는', '표시영역을', '갖는', '기', '판과,', '기', '판과', '기', '판을', '합착하기', '위해', '기', '판의', '표시영역', '및', '기', '판의', '표시영역의', '어느', '하나의', '표시영역', '내에', '적어도', 
'일부', '가', '형성되는', '실런', '트를', '포', '함하는', '것을', '특징으로', '한다.', '다른', '표시패널은', '신', '호라인들이', '형성된', '표시영역을', '갖는', '기', '판;', '기', '판의', '표시영역에', '대응', '하는', '표시영역을', '갖는', '기', '판;', '및', '기', '판과', '기', '판을', '합착하기', '위해', '기', '판의', '표시영역', '컬러', '필터와', '적어도', '일부', '가', '중', '첩되도록', '형성되는', '실런', '트를', '포', '함하는', '것을', '특징으로', '한다.', '구성', '에서,', '표시패널은', '액정', '표시소자의', '표시패널,', '전계', '방출', '표시소자의', '표시패널,', '플라즈마', '디스', '플레이', '패널의', '표시패널,', '전계발광소자의', '표시패널,', '전기영동', '표시소자의', '표시패널', '플렉서블', '표시소자의', '표시패널', '중', '어느', '하나인', '것을', '특징으로', '한다.또한,', '실런', '트는', '폐루', '프형', '또는', '일부분이', '개', '방된', '개', '루프', '형으로', '형성된', '것을', '특징으로', '한다.', '실런', '트는', '기', '판의', '표시영역에', '형성된', '신', '호라인들', '및', '박막', '트랜지스터들', '중의', '적어도', '하나와', '중', '첩되거나', '교', '차되도록', '형성되는', '것을', '특징으로', '한다.']
zz 결과 되게 웃김