# woodongk/count_ngram.py

## count_ngram.py
from collections import Counter
from itertools import chain

def ngram_count(docs_tokenized, n, n_display=50):
    """Count n-grams over tokenized documents and print the most frequent.

    Args:
        docs_tokenized: Iterable of tokenized documents, each a sequence of
            hashable tokens supporting ``len()`` and slicing, e.g.
            ``[['nuclear', 'power', 'plant'], ['power', 'grid']]``.
            It is traversed exactly once, so a generator is accepted.
        n: Size of the n-gram (1 for unigrams, 2 for bigrams, ...).
        n_display: How many of the most frequent n-grams to print.

    Returns:
        A tuple ``(keys, n_vocab)``. ``keys`` is a list of
        ``(ngram, count)`` pairs sorted by descending count (ties keep
        first-seen order); an n-gram is the bare token when ``n == 1`` and
        a tuple of tokens otherwise. ``n_vocab`` is the number of distinct
        tokens (not distinct n-grams) across all documents.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    counts = Counter()
    vocab = set()
    # Gather n-gram counts and the vocabulary in the same pass so that
    # one-shot iterables (generators) are not exhausted before the
    # vocabulary is computed.
    for doc in docs_tokenized:
        vocab.update(doc)
        if n == 1:
            # Unigrams are stored as bare tokens rather than 1-tuples.
            counts.update(doc)
        else:
            # Documents shorter than n yield an empty range, i.e. nothing.
            counts.update(
                tuple(doc[i:i + n]) for i in range(len(doc) - n + 1)
            )

    keys = counts.most_common()
    for word, count in keys[:n_display]:
        print("{0}({1}) ".format(word, count), end="")

    n_vocab = len(vocab)
    print()
    print()
    print("Total Vocab: ", n_vocab)
    print()

    return keys, n_vocab
	from collections import Counter
	from itertools import chain

	def ngram_count(docs_tokenized, n, n_display=50):
		"""Count n-grams over tokenized documents and print the most frequent.

		Args:
			docs_tokenized: Iterable of tokenized documents, each a sequence
				of hashable tokens supporting ``len()`` and slicing, e.g.
				``[['nuclear', 'power', 'plant'], ['power', 'grid']]``.
				It is traversed exactly once, so a generator is accepted.
			n: Size of the n-gram (1 for unigrams, 2 for bigrams, ...).
			n_display: How many of the most frequent n-grams to print.

		Returns:
			A tuple ``(keys, n_vocab)``. ``keys`` is a list of
			``(ngram, count)`` pairs sorted by descending count (ties keep
			first-seen order); an n-gram is the bare token when ``n == 1``
			and a tuple of tokens otherwise. ``n_vocab`` is the number of
			distinct tokens (not distinct n-grams) across all documents.

		Raises:
			ValueError: If ``n`` is smaller than 1.
		"""
		if n < 1:
			raise ValueError("n must be a positive integer, got {!r}".format(n))

		counts = Counter()
		vocab = set()
		# Gather n-gram counts and the vocabulary in the same pass so that
		# one-shot iterables (generators) are not exhausted before the
		# vocabulary is computed.
		for doc in docs_tokenized:
			vocab.update(doc)
			if n == 1:
				# Unigrams are stored as bare tokens rather than 1-tuples.
				counts.update(doc)
			else:
				# Documents shorter than n yield an empty range, i.e. nothing.
				counts.update(
					tuple(doc[i:i + n]) for i in range(len(doc) - n + 1)
				)

		keys = counts.most_common()
		for word, count in keys[:n_display]:
			print("{0}({1}) ".format(word, count), end="")

		n_vocab = len(vocab)
		print()
		print()
		print("Total Vocab: ", n_vocab)
		print()

		return keys, n_vocab