Skip to content

Instantly share code, notes, and snippets.

@bowbowbow
Last active December 16, 2019 19:51
Show Gist options
  • Save bowbowbow/a037ff61c61c13947a56f6a2df52047d to your computer and use it in GitHub Desktop.
Save bowbowbow/a037ff61c61c13947a56f6a2df52047d to your computer and use it in GitHub Desktop.
세종 코퍼스에서 형태소 분석 데이터 추출
import glob
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
def split_sentences(raw_text):
    """Group tab-separated corpus lines into sentences of morpheme fields.

    Each line is split on tabs and its first column is discarded. A line
    that leaves fewer than two fields (blank line, markup residue, ...)
    is treated as a sentence boundary rather than data.

    Args:
        raw_text: Text content of one corpus element, lines separated by
            ``\\n``.

    Returns:
        A list of sentences; each sentence is a list of field lists (the
        columns after the first one of every data line).
    """
    sentences = []
    current = []
    for line in raw_text.split('\n'):
        # Drop the first column; only the remaining columns are kept.
        fields = line.split('\t')[1:]
        if len(fields) < 2:
            # Boundary line: close the sentence collected so far, if any.
            if current:
                sentences.append(current)
                current = []
            continue
        current.append(fields)
    # Text that does not end with a boundary line would otherwise lose its
    # final sentence.
    if current:
        sentences.append(current)
    return sentences


if __name__ == '__main__':
    # Collect every corpus file, keep only the morphologically analysed
    # ones, and dump all sentences into a single JSON file.
    paths = glob.glob('./corpus-utf8/*.txt')
    print('len(paths):', len(paths))

    pair_count = 0
    data = []
    for path in tqdm(paths):
        # Explicit encoding so decoding does not depend on the platform
        # locale (the directory name indicates UTF-8 files).
        with open(path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')

        title = soup.select('title')
        # '형태소' is Korean for "morpheme"; files whose title lacks it (or
        # that have no title at all) are skipped.
        if not title or '형태소' not in title[0].get_text():
            continue

        texts = soup.select('text body p, text p')
        if not texts:
            # Dialogue files such as 8CT_0042.txt keep their content
            # directly inside the <text> tag.
            texts = soup.select('text')

        for text in texts:
            for sentence in split_sentences(text.get_text()):
                pair_count += len(sentence)
                data.append(sentence)

    # ensure_ascii=False writes raw Korean characters, so the output file
    # must be opened as UTF-8 explicitly.
    with open("./output.json", "w", encoding='utf-8') as f:
        json.dump(data, f, indent=2, sort_keys=True, ensure_ascii=False)

    print('pair_count :', pair_count)
    print('sent_count :', len(data))
@bowbowbow
Copy link
Author

https://github.com/coolengineer/sejong-corpus 를 통해 세종 코퍼스 문서를 다운받은 뒤 추출을 진행함

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment