# bowbowbow/extract_morpheme_from_sejong_corpus.py

## extract_morpheme_from_sejong_corpus.py
import glob
from bs4 import BeautifulSoup
from tqdm import tqdm
import json

def extract_sentences(lines):
    """Group tab-separated morpheme lines into sentences.

    Each line is split on tabs and its first column is discarded
    (presumably a token/line identifier -- confirm against the corpus
    format).  When at least two columns remain, they are kept as one
    "morpheme pair".  Any line that does not yield a pair (blank line,
    markup residue, ...) ends the sentence being collected.

    Args:
        lines: Iterable of text lines without trailing newlines.

    Returns:
        A list of sentences, each a non-empty list of morpheme pairs
        (lists of strings).
    """
    sentences = []
    current = []
    for line in lines:
        morpheme_pair = line.split('\t')[1:]
        if len(morpheme_pair) >= 2:
            current.append(morpheme_pair)
            continue
        # Separator line: close the sentence in progress, if any.
        if current:
            sentences.append(current)
        current = []
    # Flush a final sentence that is not followed by a separator line;
    # without this the last sentence of such a text would be lost.
    if current:
        sentences.append(current)
    return sentences


def main():
    """Collect morpheme pairs from ./corpus-utf8/*.txt into ./output.json.

    Only files whose <title> contains '형태소' ("morpheme") are processed.
    The result is a JSON list of sentences; the number of pairs and the
    number of sentences are printed at the end.
    """
    paths = glob.glob('./corpus-utf8/*.txt')
    print('len(paths):', len(paths))

    pair_count = 0
    data = []
    for path in tqdm(paths):
        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')

        title = soup.select('title')
        # Skip files that have no title at all instead of crashing on them.
        if not title or '형태소' not in title[0].get_text():
            continue

        texts = soup.select('text body p, text p')
        if not texts:
            # For dialogue files such as 8CT_0042.txt the content sits
            # directly inside the text tag.
            texts = soup.select('text')

        for text in texts:
            sentences = extract_sentences(text.get_text().split('\n'))
            pair_count += sum(len(sentence) for sentence in sentences)
            data.extend(sentences)

    # ensure_ascii=False writes raw non-ASCII characters, so the output
    # file must be opened with an encoding that can represent them.
    with open("./output.json", "w", encoding='utf-8') as f:
        json.dump(data, f, indent=2, sort_keys=True, ensure_ascii=False)

    print('pair_count :', pair_count)
    print('sent_count :', len(data))


if __name__ == '__main__':
    main()
	import glob
	from bs4 import BeautifulSoup
	from tqdm import tqdm
	import json

	if __name__ == '__main__':
	    # Collect morpheme pairs from every ./corpus-utf8/*.txt file whose
	    # <title> contains '형태소' ("morpheme") and dump them, grouped into
	    # sentences, to ./output.json.
	    paths = glob.glob('./corpus-utf8/*.txt')
	    print('len(paths):', len(paths))

	    pair_count = 0
	    data = []
	    for path in tqdm(paths):
	        # Read explicitly as UTF-8 so the result does not depend on the
	        # platform's default locale encoding.
	        with open(path, 'r', encoding='utf-8') as f:
	            soup = BeautifulSoup(f, 'html.parser')

	        title = soup.select('title')
	        # Skip files that have no title at all instead of crashing.
	        if not title or '형태소' not in title[0].get_text():
	            continue

	        texts = soup.select('text body p, text p')
	        if not texts:
	            # For dialogue files such as 8CT_0042.txt the content sits
	            # directly inside the text tag.
	            texts = soup.select('text')

	        for text in texts:
	            item = []
	            for line in text.get_text().split('\n'):
	                # Drop the first tab-separated column; the remaining
	                # columns form one morpheme pair.
	                morpheme_pair = line.split('\t')[1:]
	                if len(morpheme_pair) >= 2:
	                    item.append(morpheme_pair)
	                    continue
	                # A line without a pair closes the current sentence.
	                if item:
	                    pair_count += len(item)
	                    data.append(item)
	                item = []
	            # Flush a final sentence that has no separator line after it.
	            if item:
	                pair_count += len(item)
	                data.append(item)

	    # ensure_ascii=False writes raw non-ASCII characters, so open the
	    # output with an encoding that can represent them.
	    with open("./output.json", "w", encoding='utf-8') as f:
	        json.dump(data, f, indent=2, sort_keys=True, ensure_ascii=False)

	    print('pair_count :', pair_count)
	    print('sent_count :', len(data))