Skip to content

Instantly share code, notes, and snippets.

@ratsgo
Created May 6, 2017 11:09
Show Gist options
  • Save ratsgo/9d67443a515f0fed62da66d647575d4b to your computer and use it in GitHub Desktop.
Save ratsgo/9d67443a515f0fed62da66d647575d4b to your computer and use it in GitHub Desktop.
class BranchingEntropy:
    """Count left/right character extensions of substrings and score them.

    For every space-free substring ("word") whose length lies in
    ``[min_length, max_length]``, ``train`` records which longer strings
    appear around it.  ``L`` maps a word index to a counter of its left
    extensions and ``R`` does the same for right extensions.  All strings
    are stored as integer indices produced by ``self.encoder``.

    NOTE(review): ``IntegerEncoder`` is not defined in this file; this class
    only relies on it exposing ``fit``, ``encode``, ``decode``, ``load``,
    ``save`` and an ``inverse`` attribute -- confirm against its definition.
    """

    def __init__(self, min_length=2, max_length=7):
        """Create an empty model for words of ``min_length``..``max_length`` characters."""
        self.min_length = min_length
        self.max_length = max_length
        self.encoder = IntegerEncoder()
        # word index -> {extension index -> frequency}
        self.L = defaultdict(lambda: defaultdict(int))
        self.R = defaultdict(lambda: defaultdict(int))

    def get_all_access_variety(self):
        """Return ``{word_index: (n_left, n_right)}`` for every counted word.

        Keys are the encoded integer indices (they are not decoded here,
        unlike ``get_all_branching_entropies``).
        """
        words = set(self.L) | set(self.R)
        return {word: self.get_access_variety(word) for word in words}

    def get_access_variety(self, word, ignore_space=False):
        """Return the number of distinct (left, right) extensions of ``word``."""
        return (len(self.get_left_branch(word, ignore_space)),
                len(self.get_right_branch(word, ignore_space)))

    def get_all_branching_entropies(self, ignore_space=False):
        """Return ``{decoded_word: (left_entropy, right_entropy)}`` for every counted word."""
        words = set(self.L) | set(self.R)
        return {
            self.encoder.decode(word): self.get_branching_entropy(word, ignore_space)
            for word in words
        }

    def get_branching_entropy(self, word, ignore_space=False):
        """Return ``(left_entropy, right_entropy)`` of ``word``."""
        be_l = self.entropy(self.get_left_branch(word, ignore_space))
        be_r = self.entropy(self.get_right_branch(word, ignore_space))
        return (be_l, be_r)

    def entropy(self, dic):
        """Return the Shannon entropy (natural log) of a frequency dict.

        An empty dict, or one whose frequencies do not sum to a positive
        value, yields ``0.0``.  Non-positive frequencies are skipped because
        ``math.log`` is undefined for them.
        """
        if not dic:
            return 0.0
        total = sum(dic.values())
        if total <= 0:
            return 0.0
        acc = 0.0
        for freq in dic.values():
            if freq <= 0:
                continue
            prob = freq / total
            acc += prob * math.log(prob)
        return -1 * acc

    def _get_branch(self, counter, word, ignore_space):
        """Look up the extension counter of ``word`` in ``counter`` (``self.L`` or ``self.R``)."""
        if isinstance(word, int):
            word_index = word
        else:
            word_index = self.encoder.encode(word)
        # -1 is treated as "unknown word" (presumably what the encoder
        # returns for unseen strings -- confirm against IntegerEncoder).
        if word_index == -1 or word_index not in counter:
            return {}
        branch = counter[word_index]
        if not ignore_space:
            return branch
        # Drop extensions whose decoded text contains a space.
        return {
            w: f for w, f in branch.items()
            if ' ' not in self.encoder.decode(w, unknown=' ')
        }

    def get_left_branch(self, word, ignore_space=False):
        """Return ``{extension_index: frequency}`` of the left extensions of ``word``.

        ``word`` may be a string or an already encoded integer index.  An
        unknown word yields an empty dict.  With ``ignore_space`` a filtered
        copy is returned; otherwise the internal counter itself is returned.
        """
        return self._get_branch(self.L, word, ignore_space)

    def get_right_branch(self, word, ignore_space=False):
        """Return ``{extension_index: frequency}`` of the right extensions of ``word``.

        Same contract as ``get_left_branch`` but reads ``self.R``.
        """
        return self._get_branch(self.R, word, ignore_space)

    def counter_size(self):
        """Return ``(len(L), len(R))``: the number of words tracked on each side."""
        return (len(self.L), len(self.R))

    def prune_extreme_case(self, min_count):
        """Drop words whose total extension count is not strictly above ``min_count``.

        Returns ``(size_before, size_after)`` as given by ``counter_size``.
        """
        # TODO: encoder remove & compatify
        before_size = self.counter_size()
        self.L = defaultdict(
            lambda: defaultdict(int),
            {word: dic for word, dic in self.L.items() if sum(dic.values()) > min_count})
        self.R = defaultdict(
            lambda: defaultdict(int),
            {word: dic for word, dic in self.R.items() if sum(dic.values()) > min_count})
        after_size = self.counter_size()
        return (before_size, after_size)

    def _count_extensions(self, sent):
        """Update ``L``/``R`` from one sentence that is padded with one space on each side."""
        length = len(sent)
        for i in range(1, length - 1):
            for window in range(self.min_length, self.max_length + 1):
                end = i + window
                if end - 1 >= length:
                    continue
                word = sent[i:end]
                # Because of the trailing pad, a space-free word always
                # ends before the last character, so sent[end] is valid.
                if ' ' in word:
                    continue
                word_index = self.encoder.fit(word)
                # When the neighbour is a space, reach one character
                # further so the extension crosses the word boundary.
                if sent[i - 1] == ' ':
                    left_extension = sent[max(0, i - 2):end]
                else:
                    left_extension = sent[i - 1:end]
                if sent[end] == ' ':
                    right_extension = sent[i:min(length, end + 2)]
                else:
                    right_extension = sent[i:end + 1]
                left_index = self.encoder.fit(left_extension)
                right_index = self.encoder.fit(right_extension)
                self.L[word_index][left_index] += 1
                self.R[word_index][right_index] += 1

    def train(self, sents, min_count=5, num_for_pruning=10000):
        """Accumulate extension counts from an iterable of sentences.

        Blank sentences are skipped.  When ``num_for_pruning`` is positive,
        ``prune_extreme_case(min_count)`` runs after every
        ``num_for_pruning``-th sentence and progress is written to stdout.
        An empty ``sents`` is a no-op.
        """
        num_sent = None
        for num_sent, sent in enumerate(sents):
            sent = sent.strip()
            if not sent:
                continue
            self._count_extensions(' ' + sent + ' ')
            if (num_for_pruning > 0) and ((num_sent + 1) % num_for_pruning == 0):
                before, after = self.prune_extreme_case(min_count)
                sys.stdout.write('\rnum sent = %d: %s --> %s' % (num_sent, str(before), str(after)))
        # Final pass: covers the case where the last sentence hit the pruning
        # boundary but was blank (and therefore skipped above).  ``num_sent``
        # stays None for empty input, and before/after are recomputed so the
        # report never uses stale or undefined values.
        if (num_sent is not None and num_for_pruning > 0
                and (num_sent + 1) % num_for_pruning == 0):
            before, after = self.prune_extreme_case(min_count)
            sys.stdout.write('\rnum_sent = %d: %s --> %s' % (num_sent, str(before), str(after)))

    def load(self, model_fname, encoder_fname):
        """Load the encoder and the L/R counters written by ``save``.

        Records are merged into the current counters.  Any error while
        reading the model file is printed rather than raised.
        """
        self.encoder.load(encoder_fname)
        try:
            with open(model_fname, encoding='utf-8') as f:
                next(f)  # SKIP: parameters (min_length, max_length)
                token = next(f).split()
                self.min_length = int(token[0])
                self.max_length = int(token[1])
                next(f)  # SKIP: left side extension
                # Records belong to the left side until the right-side
                # header line is seen.
                is_right_side = False
                for line in f:
                    if '# right side extension' in line:
                        is_right_side = True
                        continue
                    token = line.split()
                    if not token:
                        continue  # tolerate blank lines
                    word = int(token[0])
                    extension = int(token[1])
                    freq = int(token[2])
                    if is_right_side:
                        self.R[word][extension] = freq
                    else:
                        self.L[word][extension] = freq
        except Exception as e:
            print(e)

    def save(self, model_fname, encoder_fname):
        """Write the encoder and the L/R counters to disk.

        The model file holds a parameter header, then one
        ``word extension freq`` line per record for each side.  Any error
        while writing the model file is printed rather than raised.
        """
        self.encoder.save(encoder_fname)
        try:
            with open(model_fname, 'w', encoding='utf-8') as f:
                f.write("# parameters (min_length max_length)\n")
                f.write('%d %d\n' % (self.min_length, self.max_length))
                f.write('# left side extension\n')
                for word, extension_dict in self.L.items():
                    for extension, freq in extension_dict.items():
                        f.write('%d %d %d\n' % (word, extension, freq))
                f.write('# right side extension\n')
                for word, extension_dict in self.R.items():
                    for extension, freq in extension_dict.items():
                        f.write('%d %d %d\n' % (word, extension, freq))
        except Exception as e:
            print(e)

    def words(self):
        """Return the set built from ``self.encoder.inverse``."""
        return set(self.encoder.inverse)
class MaxScoreTokenizer:
    """Split whitespace-separated tokens into subtokens by greedy score maximisation.

    Every substring of a token (length 2 up to ``max_length``) is a candidate
    scored by ``scores`` (``default_score`` when absent).  Candidates are
    accepted best-first, discarding any that overlap an accepted one; the
    uncovered gaps become subtokens of their own.

    Subtokens are represented internally as tuples
    ``(text, begin, end, score, length)``.
    """

    def __init__(self, max_length=10, scores=None, default_score=0.0):
        """Store the configuration.

        ``scores`` maps substring -> score.  ``None`` means "no scores" and
        creates a fresh dict per instance, so instances never share state.
        """
        self.max_length = max_length
        self.scores = {} if scores is None else scores
        self.ds = default_score

    def tokenize(self, sentence):
        """Return the flat list of subtoken strings for ``sentence``."""
        return [
            subtoken[0]
            for token in sentence.split()
            for subtoken in self._recursive_tokenize(token)
        ]

    def _recursive_tokenize(self, token, range_l=0, debug=False):
        """Return the subtoken tuples of one space-free ``token``, ordered by position.

        Tokens of length <= 2 are returned whole.  ``range_l`` caps the
        candidate length; 0 means ``min(self.max_length, len(token))``.
        """
        length = len(token)
        if length <= 2:
            return [(token, 0, length, self.ds, length)]
        if range_l == 0:
            range_l = min(self.max_length, length)
        scores = self._initialize(token, range_l, length)
        if debug:
            from pprint import pprint
            pprint(scores)
        result = self._find(scores)
        adds = self._add_inter_subtokens(token, result)
        # Fill the uncovered tail and head of the token, if any.
        if result[-1][2] != length:
            adds += self._add_first_subtoken(token, result)
        if result[0][1] != 0:
            adds += self._add_last_subtoken(token, result)
        return sorted(result + adds, key=lambda x: x[1])

    def _initialize(self, token, range_l, length):
        """Enumerate candidate subtokens, best first.

        Candidates are ordered by score, then by length, both descending, so
        the highest-scoring (and, on ties, longest) substring is tried first.
        """
        scores = []
        for b in range(0, length - 1):
            for r in range(2, range_l + 1):
                e = b + r
                if e > length:
                    continue
                subtoken = token[b:e]
                score = self.scores.get(subtoken, self.ds)
                scores.append((subtoken, b, e, score, r))
        return sorted(scores, key=lambda x: (x[3], x[4]), reverse=True)

    def _find(self, scores):
        """Greedily pick non-overlapping candidates from the best-first list ``scores``.

        Returns the picked tuples sorted by begin position.  At most 101
        candidates are picked per token (iteration cap kept from the
        original implementation).
        """
        result = []
        candidates = list(scores)
        num_iter = 0
        while candidates:
            best = candidates[0]
            result.append(best)
            b, e = best[1], best[2]
            # Keep only the candidates that do not overlap [b, e).
            candidates = [c for c in candidates[1:] if not (c[1] < e and b < c[2])]
            num_iter += 1
            if num_iter > 100:
                break
        return sorted(result, key=lambda x: x[1])

    def _add_inter_subtokens(self, token, result):
        """Return default-scored subtokens for the gaps between consecutive picks."""
        adds = []
        for base, following in zip(result, result[1:]):
            b = base[2]
            e = following[1]
            if b == e:
                continue
            adds.append((token[b:e], b, e, self.ds, e - b))
        return adds

    def _add_first_subtoken(self, token, result):
        """Return the uncovered piece AFTER the last pick (despite the name)."""
        b = result[-1][2]
        subtoken = token[b:]
        score = self.scores.get(subtoken, self.ds)
        return [(subtoken, b, len(token), score, len(subtoken))]

    def _add_last_subtoken(self, token, result):
        """Return the uncovered piece BEFORE the first pick (despite the name)."""
        e = result[0][1]
        subtoken = token[0:e]
        score = self.scores.get(subtoken, self.ds)
        return [(subtoken, 0, e, score, e)]
@dladmschd
Copy link

['\ufeff', '특히', '기', '판과', '기', '판이', '합착된', '표준규격의', '표시패널을', '재가공', '하여', '표준', '규격과는', '다른', '크기,', '형상으로', '제작되는', '표시패널에', '관한', '것', '이다.', '최근,', '음극선관의', '단', '점인', '무게와', '부', '피를', '줄일', '수', '있는', '평판', '표시장치들이', '개', '발되고', '있다.', '평판', '표시장치는', '액정', '표시소자,', '전계', '방출', '표시소자,', '플라즈마', '디스', '플레이', '패널,', '전계발광소자,', '전기영동', '표시소자,', '플렉서블', '디스', '플레이', '등이', '있다.', '전계발광소자는', '무기', '전계발광소자와', '유기', '전계발광소자를', '포', '함한다.', '유기', '전계발광소자는', '유기발광다', '이오드소', '자를', '포', '함한다.기존', '디스', '플레이', '모듈', '메이', '커는', '아래의', '표', '1과', '같은', '표준', '해상도를', '만족', '하는', '규격', '으로', '표시패널을', '제작', '하고', '있다.', '해상', ':Y', '최근에는', '디스', '플레이의', '활용', '범위', '가', '확대되면서', '표준', '규격', '이외의', '비', '표준', '크기나', '특이한', '형태의', '디스', '플레이를', '요구', '하는', '수요', '가', '증가', '하고', '있다.', '극', '장,', '결', '혼식장,', '호텔', '로', '비,', '관공', '서,', '관', '광지', '장소에서', '안내와', '홍보를', '위한', '디', '지털', '광고', '판으로', '사', '용되는', '디', '지털', '정보', '디스', '플레이,', '자동', '차,', '선박,', '비', '행기', '항법장치로', '사', '용되는', '표시장치', '등이', '디스', '플레이업계의', '새로운', '제', '품군으로', '서', '주목을', '받고', '있다.', '현재', '사', '용되고', '있는', 'LCD,', 'FED,', 'E', 'LD,', 'E', 'PD,', 'PDP,', '플렉서블', '디스', '플레이', '표시장치는', '표준규격으로', '제', '조되기', '때문', '에,', '다', '양한', '크기', '가', '요구되는', 'D', 'ID나', '항법표시장치로', '사', '용되는', '것이', '어려운', '실상', '이다.기존', '디스', '플레이', '모듈', '메이', '커는', '비', '표준', '규격의', '수요', '가', '있더라도', '시', '장이', '표준', '규격의', '디스', '플레이', '시', '장에', '비', '하여', '매우', '작기', '때', '문에', '비', '표준', '규격의', '표시패널을', '생산', '하지', '않고', '있다.', '다', '양한', '크기의', '표시패널을', '제작', '하려면', '크기별로', '포토마스크를', '디', '자인,', '설계', '및', '제작', '하여야', '하는데,', '포토마스크의', '디', '자인,', '설계', '및', '제작에는', '고가의', '비', '용이', '소', '요되기', '때', '문에', '비', '표준의', '표시패널을', '별도의', '포토마스크를', '이용', '하여', '제작', '하는', '것은', '매우', '비', '효율적이다.', '비', '표준', '규격의', '표시패널', '또는,', '기존', '디스', '플레이', '모듈', '업', '체에서', '제작', '하지', '않은', '크기나', '형태의', '표시패널', '수', '요를', '충족할', '수', '있는', '방', '법이', '요구되고', '있다.', '위', '하여,', '최근에는', '기존', '디스', '플레이', '모듈', '메이', '커에', 
'의해', '완성된', '표준', '규격의', '표시패널을', '재가공', '하여', '비', '표준', '규격의', '표시패널이나', '특이한', '형태의', '표시패널을', '제작', '하는', '방', '법에', '대', '하여', '연구', '가', '진', '행되고', '있다.', '이', '방', '법은', '완성된', '표준', '규격의', '표시패널의', '일부를', '원', '하는', '크기나', '형태로', '커팅', '하고', '커', '팅면을', '실링', '하여야', '하나,', '커', '팅면을', '실링', '하기', '가', '쉽지', '않고', '커', '팅면을', '실링한', '후에도', '실링이', '견고', '하지', '않아', '실링', '불량이', '발', '생되고', '있다.액', '티브', '매', '트릭스', '박막', '트랜지스터', '액정', '표시장치의', '기존', 'LCD', '모듈', '메이', '커에', '의해', '완성된', '액정', '표시패널은', '및', '상부', '유리기', '판과', '하부', '유리기판', '사', '이에', '액정', '층이', '실런', '트로', '밀', '봉되어', '있다.', '상부', '유리기', '판에는', '컬러', '필터', '어레이', '가', '형성되고', '하부', '유리기', '판에는', 'TFT', '어레이', '가', '형성되며,', '사', '이에는', '액정', '층의', '셀갭을', '유지', '하기', '위한', '스', '페이서', '가', '형성된다.', '커', '팅전에,', '액정', '표시패널의', '내부', '진공에', '의', '해서', '액정', '층의', '셀갭이', '스', '페이서', '높', '이만큼', '유', '지되', '지만,', '커', '팅과', '동', '시에', '액정', '표시패널', '진공압이', '변', '하여', '액', '정층', '내에', '공기', '가', '혼입되거나', '자', '중에', '의해', '휘거나', '틀어지면서', '액', '정이', '부분', '적으로', '모', '이거나', '줄어들게', '된다.', '따', '라서', '커팅', '후에', '발생', '하는', '문', '제를', '해결', '해야', '할', '필요성이', '대', '두되', '었다.', '고가의', '포토마스크를', '별도로', '제작', '하지', '않고도', '표준', '규격의', '표시패널을', '재가공', '하여', '소비자', '가', '원', '하는', '크기나', '형상의', '표시패널을', '제작할', '수', '있으므로,', '저렴한', '비용', '으로', '비', '표준', '규격을', '사용', '하는', '표시장치에', '적용될', '수', '있는', '효과를', '얻을', '수', '있을', '뿐', '아니라', '보다', '신뢰성', '있고', '견', '고한', '표시패널을', '얻을', '수', '있게', '된다.', '목', '적은', '종래', '기', '술의', '문', '제점들을', '해결하고자', '안', '출된', '발명', '으로', '서,', '표시패널의', '커팅', '후의', '문', '제를', '발', '생시', '키기', '않을', '뿐', '아니라', '고가의', '포토마스크를', '별도로', '제작', '하지', '않고도', '원', '하는', '크기와', '형상을', '갖는', '표시패널을', '제공', '하는데', '있다.', '목', '적을', '달성', '하기', '위', '하여,', '표시패널은', '신', '호라인들이', '형성된', '표시영역을', '갖는', '기', '판과,', '기', '판의', '표시영역에', '대응', '하는', '표시영역을', '갖는', '기', '판과,', '기', '판과', '기', '판을', '합착하기', '위해', '기', '판의', '표시영역', '및', '기', '판의', '표시영역의', '어느', '하나의', '표시영역', '내에', '적어도', 
'일부', '가', '형성되는', '실런', '트를', '포', '함하는', '것을', '특징으로', '한다.', '다른', '표시패널은', '신', '호라인들이', '형성된', '표시영역을', '갖는', '기', '판;', '기', '판의', '표시영역에', '대응', '하는', '표시영역을', '갖는', '기', '판;', '및', '기', '판과', '기', '판을', '합착하기', '위해', '기', '판의', '표시영역', '컬러', '필터와', '적어도', '일부', '가', '중', '첩되도록', '형성되는', '실런', '트를', '포', '함하는', '것을', '특징으로', '한다.', '구성', '에서,', '표시패널은', '액정', '표시소자의', '표시패널,', '전계', '방출', '표시소자의', '표시패널,', '플라즈마', '디스', '플레이', '패널의', '표시패널,', '전계발광소자의', '표시패널,', '전기영동', '표시소자의', '표시패널', '플렉서블', '표시소자의', '표시패널', '중', '어느', '하나인', '것을', '특징으로', '한다.또한,', '실런', '트는', '폐루', '프형', '또는', '일부분이', '개', '방된', '개', '루프', '형으로', '형성된', '것을', '특징으로', '한다.', '실런', '트는', '기', '판의', '표시영역에', '형성된', '신', '호라인들', '및', '박막', '트랜지스터들', '중의', '적어도', '하나와', '중', '첩되거나', '교', '차되도록', '형성되는', '것을', '특징으로', '한다.']

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment