# kohiro37/preprocess_kokoro.py

## preprocess_kokoro.py
"""青空文庫にある夏目漱石の『こころ』のテキストを章ごとに分割して形態素解析する

【テキストデータ】
青空文庫の夏目漱石の『こころ』（773_ruby_5968.zip）を解凍したファイル（kokoro.txt）
https://www.aozora.gr.jp/cards/000148/card773.html

【動作環境】
-Ubuntu 20.04
-Python 3.8.5
-MeCab 0.996
https://taku910.github.io/mecab/

【必要なPythonライブラリ】
-mecab-python3 1.0.3
"""
import re
import MeCab

# (part of speech, POS subcategory 1) pairs whose tokens are kept by
# chapter2bform; every other token is discarded.
TARGET_POS = [
    ('形容詞', '自立'),  # adjective, independent
    ('動詞', '自立'),  # verb, independent
    ('副詞', '一般'),  # adverb, general
    ('名詞', 'サ変接続'),  # noun, suru-verb stem
    ('名詞', 'ナイ形容詞語幹'),  # noun, nai-adjective stem
    ('名詞', '一般'),  # noun, general
    ('名詞', '形容動詞語幹'),  # noun, adjectival-noun (na-adjective) stem
    ('名詞', '固有名詞'),  # noun, proper noun
    ]

# Stop words: base forms that chapter2bform drops even when the POS matches.
# NOTE(review): '*' is presumably the placeholder MeCab emits when a token has
# no base form in the dictionary -- confirm against the installed dictionary.
STOP_WORDS = ['*']

def ruby2txt(ruby):
    """Strip ruby markup and non-body sections, leaving only the work's text.

    The input is expected to be laid out as: a title block, a notation-legend
    block fenced by two rules of 50 or more hyphens, the body, and a trailing
    colophon that starts with "底本：".

    Args:
        ruby: Full contents of the text file as a single string.

    Returns:
        The body text with ruby readings (《...》), editor notes (［＃...］)
        and ruby start markers (｜) removed, and with leading and trailing
        whitespace stripped. When the input contains fewer than two hyphen
        rules, whatever follows the last rule (or the whole input when there
        is none) is treated as the body instead of raising IndexError.
    """
    # Drop everything up to and including the notation-legend block at the top.
    parts = re.split(r'-{50,}', ruby)
    txt = parts[2] if len(parts) > 2 else parts[-1]

    # Drop the colophon at the bottom.
    txt = txt.split('底本：', 1)[0]

    # Remove ruby readings, editor notes and the marker that opens a ruby span.
    txt = re.sub(r'《.*?》|［＃.*?］|｜', '', txt)

    return txt.strip()


def remove_blank(chapter):
    """Drop blank lines and strip surrounding whitespace from every line.

    Args:
        chapter: Text of one chapter, possibly spanning several lines.

    Returns:
        The non-blank lines joined with a newline character. Lines that hold
        nothing but whitespace (for example a lone full-width space used as a
        paragraph indent) are dropped as well, so the result never contains
        empty lines.
    """
    # Strip first, then filter: testing the raw length would let
    # whitespace-only lines through as empty strings.
    stripped = (line.strip() for line in chapter.splitlines())
    return '\n'.join(line for line in stripped if line)

def doc2chapter(doc):
    """Split the body text into chapters.

    The three part titles are removed, then the text is cut at every chapter
    heading, i.e. a line that consists solely of one to three kanji numerals.
    The number of chapters found is printed to standard output.

    Args:
        doc: Body text with markup already removed.

    Returns:
        A list with one string per chapter, each cleaned by remove_blank.
    """
    # Remove the part titles.
    for title in ('上　先生と私', '中　両親と私', '下　先生と遺書'):
        doc = doc.replace(title, '')

    # Split on chapter-number lines. The pattern is anchored to the start of a
    # line so that a body line which merely ends in a numeral does not open a
    # spurious chapter.
    doc_split = re.split(
        r'^[一二三四五六七八九十]{1,3}\n', doc, flags=re.MULTILINE)

    # The piece before the first heading holds no chapter text.
    chapters = doc_split[1:]

    print('Total chapter number: ', len(chapters))

    return [remove_blank(chapter) for chapter in chapters]

def chapter2bform(chapters):
    """Tokenise each chapter with MeCab and collect base forms of kept words.

    A token is kept when its (part of speech, subcategory) pair is listed in
    TARGET_POS and its base form is not listed in STOP_WORDS. The number of
    kept terms is printed for every chapter.

    Args:
        chapters: Iterable of chapter strings.

    Returns:
        A list with one list of base-form strings per chapter, in input order.
    """
    tagger = MeCab.Tagger()

    texts = []
    for number, chapter in enumerate(chapters, start=1):
        bforms = []

        # Walk the linked list of nodes MeCab returns for this chapter.
        node = tagger.parseToNode(chapter)
        while node:
            # NOTE(review): assumes the dictionary's feature string is
            # comma-separated with POS at index 0, subcategory at index 1 and
            # base form at index 6 -- confirm for the installed dictionary.
            fields = node.feature.split(',')
            pos = (fields[0], fields[1])
            lemma = fields[6]

            if lemma not in STOP_WORDS and pos in TARGET_POS:
                bforms.append(lemma)

            node = node.next

        texts.append(bforms)
        print('Term number of chapter {}: '.format(number), len(bforms))

    return texts

def preprocess():
    """Run the whole pipeline on kokoro.txt and return base forms per chapter.

    Reads 'kokoro.txt' (Shift_JIS) from the current working directory, strips
    the markup with ruby2txt, splits the text with doc2chapter and analyses
    each chapter with chapter2bform.

    Returns:
        The list of per-chapter base-form lists produced by chapter2bform.
    """
    # Load the Aozora Bunko text of "Kokoro".
    with open('kokoro.txt', mode='r', encoding='shift-jis') as source:
        raw = source.read()

    # Markup removal -> chapter split -> morphological analysis.
    return chapter2bform(doc2chapter(ruby2txt(raw)))
	"""青空文庫にある夏目漱石の『こころ』のテキストを章ごとに分割して形態素解析する

	【テキストデータ】
	青空文庫の夏目漱石の『こころ』（773_ruby_5968.zip）を解凍したファイル（kokoro.txt）
	https://www.aozora.gr.jp/cards/000148/card773.html

	【動作環境】
	-Ubuntu 20.04
	-Python 3.8.5
	-MeCab 0.996
	https://taku910.github.io/mecab/

	【必要なPythonライブラリ】
	-mecab-python3 1.0.3
	"""
	import re
	import MeCab

	# (part of speech, POS subcategory 1) pairs whose tokens are kept by
	# chapter2bform; every other token is discarded.
	TARGET_POS = [
	('形容詞', '自立'),  # adjective, independent
	('動詞', '自立'),  # verb, independent
	('副詞', '一般'),  # adverb, general
	('名詞', 'サ変接続'),  # noun, suru-verb stem
	('名詞', 'ナイ形容詞語幹'),  # noun, nai-adjective stem
	('名詞', '一般'),  # noun, general
	('名詞', '形容動詞語幹'),  # noun, adjectival-noun (na-adjective) stem
	('名詞', '固有名詞'),  # noun, proper noun
	]

	# Stop words: base forms that chapter2bform drops even when the POS matches.
	# NOTE(review): '*' is presumably the placeholder MeCab emits when a token
	# has no base form in the dictionary -- confirm against the dictionary.
	STOP_WORDS = ['*']

	def ruby2txt(ruby):
		"""Strip ruby markup and non-body sections, leaving only the work's text.

		The input is expected to be laid out as: a title block, a
		notation-legend block fenced by two rules of 50 or more hyphens, the
		body, and a trailing colophon that starts with "底本：".

		Args:
			ruby: Full contents of the text file as a single string.

		Returns:
			The body text with ruby readings (《...》), editor notes (［＃...］)
			and ruby start markers (｜) removed, and with leading and trailing
			whitespace stripped. When the input contains fewer than two hyphen
			rules, whatever follows the last rule (or the whole input when
			there is none) is treated as the body instead of raising
			IndexError.
		"""
		# Drop everything up to and including the notation-legend block.
		parts = re.split(r'-{50,}', ruby)
		txt = parts[2] if len(parts) > 2 else parts[-1]

		# Drop the colophon at the bottom.
		txt = txt.split('底本：', 1)[0]

		# Remove ruby readings, editor notes and the marker that opens a ruby
		# span. The alternation bars must stay unescaped and the wildcards
		# non-greedy, otherwise nothing (or far too much) is removed.
		txt = re.sub(r'《.*?》|［＃.*?］|｜', '', txt)

		return txt.strip()


	def remove_blank(chapter):
		"""Drop blank lines and strip surrounding whitespace from every line.

		Args:
			chapter: Text of one chapter, possibly spanning several lines.

		Returns:
			The non-blank lines joined with a newline character. Lines that
			hold nothing but whitespace (for example a lone full-width space)
			are dropped as well, so the result never contains empty lines.
		"""
		# Strip first, then filter: testing the raw length would let
		# whitespace-only lines through as empty strings.
		stripped = (line.strip() for line in chapter.splitlines())
		return '\n'.join(line for line in stripped if line)

	def doc2chapter(doc):
		"""Split the body text into chapters.

		The three part titles are removed, then the text is cut at every
		chapter heading, i.e. a line that consists solely of one to three
		kanji numerals. The number of chapters found is printed to standard
		output.

		Args:
			doc: Body text with markup already removed.

		Returns:
			A list with one string per chapter, each cleaned by remove_blank.
		"""
		# Remove the part titles.
		for title in ('上　先生と私', '中　両親と私', '下　先生と遺書'):
			doc = doc.replace(title, '')

		# Split on chapter-number lines. The pattern is anchored to the start
		# of a line so that a body line which merely ends in a numeral does
		# not open a spurious chapter.
		doc_split = re.split(
			r'^[一二三四五六七八九十]{1,3}\n', doc, flags=re.MULTILINE)

		# The piece before the first heading holds no chapter text.
		chapters = doc_split[1:]

		print('Total chapter number: ', len(chapters))

		return [remove_blank(chapter) for chapter in chapters]

	def chapter2bform(chapters):
		"""Tokenise each chapter with MeCab and collect base forms of kept words.

		A token is kept when its (part of speech, subcategory) pair is listed
		in TARGET_POS and its base form is not listed in STOP_WORDS. The
		number of kept terms is printed for every chapter.

		Args:
			chapters: Iterable of chapter strings.

		Returns:
			A list with one list of base-form strings per chapter, in input
			order.
		"""
		m = MeCab.Tagger()

		texts = []
		for i, ch in enumerate(chapters):
			# Walk the linked list of nodes MeCab returns for this chapter.
			node = m.parseToNode(ch)

			bforms = []
			while node:
				# NOTE(review): assumes the dictionary's feature string is
				# comma-separated with POS at index 0, subcategory at index 1
				# and base form at index 6 -- confirm for the installed
				# dictionary.
				feature_split = node.feature.split(',')

				pos1 = feature_split[0]
				pos2 = feature_split[1]
				base_form = feature_split[6]

				if (pos1, pos2) in TARGET_POS and base_form not in STOP_WORDS:
					bforms.append(base_form)

				node = node.next

			texts.append(bforms)
			print('Term number of chapter {}: '.format(i+1), len(bforms))

		return texts

	def preprocess():
		"""Run the whole pipeline on kokoro.txt and return base forms per chapter.

		Reads 'kokoro.txt' (Shift_JIS) from the current working directory,
		strips the markup with ruby2txt, splits the text with doc2chapter and
		analyses each chapter with chapter2bform.

		Returns:
			The list of per-chapter base-form lists produced by chapter2bform.
		"""
		# Load the Aozora Bunko text of "Kokoro".
		with open('kokoro.txt', mode='r', encoding='shift-jis') as f:
			kokoro = f.read()

		# Remove ruby and other markup.
		text = ruby2txt(kokoro)

		# Split the text into chapters.
		chapters = doc2chapter(text)

		return chapter2bform(chapters)