Skip to content

Instantly share code, notes, and snippets.

@BrambleXu
Last active June 3, 2021 11:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BrambleXu/2d443e3c894f230195bf1b098c63b963 to your computer and use it in GitHub Desktop.
Save BrambleXu/2d443e3c894f230195bf1b098c63b963 to your computer and use it in GitHub Desktop.
import unicodedata
from typing import List
from pathlib import Path
from collections import defaultdict
from ahocorasick import Automaton
def read_dictionary(dict_path: str) -> dict:
    """Load company names from a text file holding one name per line.

    Every line is stripped of surrounding whitespace and NFKC-normalized.
    Lines that end up empty are skipped, so the empty string never becomes
    a dictionary key.

    Args:
        dict_path: Location of the UTF-8 encoded dictionary file; it is
            handed to ``open()`` unchanged.

    Returns:
        Mapping of normalized name to the 1-based number of the line it was
        read from. If a name occurs several times, the last line number wins.
    """
    company_dict = {}
    with open(dict_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            name = unicodedata.normalize('NFKC', line.strip())
            if not name:
                # Blank line: still counted by enumerate, so the line
                # numbers of the following entries are unaffected.
                continue
            company_dict[name] = line_no
    print('Read dictionary done.')
    return company_dict
def build_trie(company_dict: dict) -> Automaton:
    """Create an Aho-Corasick automaton from the company dictionary.

    Each key of ``company_dict`` is registered as a word whose stored value
    is the tuple ``(index, name)``; the automaton is finalized before it is
    returned.
    """
    automaton = Automaton()
    for company_name in company_dict:
        payload = (company_dict[company_name], company_name)
        automaton.add_word(company_name, payload)
    automaton.make_automaton()
    print('Build dictionary trie done.')
    return automaton
def filter_chunks(chunks: list) -> list:
    """Drop matches that are covered by a longer match.

    Args:
        chunks: Matches given as ``[start_idx, end_idx, text]`` sequences,
            in any order.

    Returns:
        The surviving chunks ordered by start index. A chunk is removed when
        it shares its start with a longer chunk or lies entirely inside the
        previously kept chunk. Chunks that merely overlap are both kept.
        An empty input yields an empty list.
    """
    kept = []
    # Sorting orders by start, then by end, so for equal starts the longest
    # candidate always comes last.
    for chunk in sorted(chunks):
        start_idx, end_idx = chunk[0], chunk[1]
        if kept:
            last_start, last_end = kept[-1][0], kept[-1][1]
            if start_idx == last_start:
                # [131, 135, 'ジャパン'], [131, 139, 'ジャパンエナジー']
                kept[-1] = chunk
                continue
            if end_idx <= last_end:
                # [131, 139, 'ジャパンエナジー'], [133, 134, 'パ']
                continue
        # First chunk, or one that extends past the previously kept chunk:
        # [48, 53, '愛知学泉大']
        kept.append(chunk)
    return kept
def tag_with_dict(company_trie: Automaton, sents: list, duplicate=None) -> list:
    """Find dictionary company names in the given sentences.

    Args:
        company_trie: Automaton whose stored values are ``(index, name)``
            tuples.
        sents: Sentences to scan. Each one is either a string or a sequence
            of string pieces; the pieces are joined before matching.
        duplicate: Not used by this function; kept so existing calls that
            pass it continue to work.

    Returns:
        ``[start_idx, end_idx, name]`` lists for the last sentence in
        ``sents`` (end exclusive), or an empty list when ``sents`` is empty.
        The offsets refer to the stripped, NFKC-normalized text rather than
        to the raw input.
    """
    # NOTE(review): only one sentence's matches are returned. The visible
    # caller passes a single sentence; confirm before relying on this with
    # several sentences.
    chunks = []
    for sent in sents:
        text = unicodedata.normalize('NFKC', ''.join(sent).strip())
        # The automaton reports the index of the last matched character,
        # so convert it to an exclusive end and derive the start from the
        # length of the matched name.
        chunks = []
        for last_idx, (_, word) in company_trie.iter(text):
            end_idx = last_idx + 1
            chunks.append([end_idx - len(word), end_idx, word])
        if chunks:
            # Keep only the longest, non-nested matches.
            chunks = filter_chunks(chunks)
    return chunks
if __name__ == "__main__":
    import sys

    # The dictionary defaults to the author's local copy of jcl_slim.csv.
    # Pass a different location as the first command-line argument to run
    # the example on another machine.
    default_dict_path = '/Users/smap10/Project/japanese-company-lexicon/data/dictionaries/output/jcl_slim.csv'
    dict_path = Path(sys.argv[1] if len(sys.argv) > 1 else default_dict_path)

    company_dict = read_dictionary(dict_path)
    company_trie = build_trie(company_dict)

    # Example sentence to scan for company names.
    sents = ['TISインテックグループのTIS株式会社は、自然言語処理で企業名認識を行うための辞書JCLdic(日本会社名辞書)を無償公開。']
    chunks = tag_with_dict(company_trie, sents)
    print(chunks)
@shuxinjin
Copy link

not working.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment