Skip to content

Instantly share code, notes, and snippets.

@woodongk
Created May 26, 2020 04:48
Show Gist options
  • Save woodongk/69823183dec55d33f0a1c4c0388422d7 to your computer and use it in GitHub Desktop.
Save woodongk/69823183dec55d33f0a1c4c0388422d7 to your computer and use it in GitHub Desktop.
말뭉치 ngram counter
from collections import Counter
from itertools import chain
def ngram_count(docs_tokenized, n, n_display=50):
    """Count the n-grams of a tokenized corpus and print the most frequent.

    Args:
        docs_tokenized: Corpus as a 2-D list of tokens, one inner list per
            document, e.g.
            [["solar", "power", "plant", "solar", "power"],
             ["power", "plant", "closed"]].
        n: Size of the n-gram to count (1 for unigrams, 2 for bigrams, ...).
        n_display: Number of top entries to print.

    Returns:
        A tuple ``(keys, n_vocab)``. ``keys`` is a list of
        ``(ngram, count)`` pairs sorted by descending count; for ``n == 1``
        each ngram is the bare token, otherwise it is a tuple of ``n``
        tokens. ``n_vocab`` is the number of distinct tokens in the corpus
        (always counted at the unigram level, whatever ``n`` is).

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # n == 0 would count empty tuples and a negative n would slice with
        # a negative end index; neither is a meaningful n-gram.
        raise ValueError("n must be a positive integer, got {0!r}".format(n))

    # Collect every contiguous window of n tokens, document by document, so
    # that n-grams never span a document boundary.
    ngrams = []
    for doc in docs_tokenized:
        for start in range(len(doc) - n + 1):
            window = tuple(doc[start:start + n])
            # Unigrams are reported as the bare token, not as a 1-tuple.
            ngrams.append(window[0] if n == 1 else window)

    # most_common() yields (ngram, count) pairs ordered by descending count.
    keys = Counter(ngrams).most_common()

    for word, count in keys[:n_display]:
        print("{0}({1}) ".format(word, count), end="")

    # Flatten the nested token lists to obtain the set of distinct tokens.
    n_vocab = len(set(chain.from_iterable(docs_tokenized)))

    print()
    print()
    print("Total Vocab: ", n_vocab)
    print()
    return keys, n_vocab
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment