naoh16/edu-reportcheck-level2.py

## edu-reportcheck-level2.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''Edu report checker: Level 2

形態素解析も行いながら，発展的な内容もチェックしています．

Message Examples
-------------------

- [LV2-1-1] 1段落に{:d}単語含まれています．多くとも 600 単語以下となるよう，文や段落の構成を考え直してみましょう．
- [LV2-2-1] 1文に{:d}単語含まれているようです．多くとも 60 単語以下となるよう，短い文にすることを考えてみましょう．
- [LV2-2-2] 1文に読点が{:d}個含まれています．多くとも 6 個以下になるよう，文の区切りを考え直してみましょう．
- [LV2-3-1] 助詞の「の」が{:d}回繰り返されており，わかりにくい文になっています．3 回未満を目安として，適切に読点を入れたり，1文の長さを再考する，などを考えてみましょう．
- [LV2-3-2] 二重否定と思われるセンテンス「{:s}」が含まれています．修正を検討しましょう．
- [LV2-4-1] 敬体（「です，ます」など）が使われています．レポートでは常体（「である」など）を使いましょう（ただし，感想を除く）．
- [LV2-4-2] 弱い表現が使われています．レポートでは主体的に主張した文章を書きましょう（ただし，感想を除く）．

Note
-----

- 利用には `Janome <https://github.com/mocobeta/janome>`_ が必要です．

'''
import sys
import re

# Tunable thresholds and message templates, grouped by check number.
# Templates that begin with '> {:.32s}...' first quote the offending text,
# truncated to at most 32 characters, and put the message on the next line.

# Parameters for Check (1): words per paragraph.
# A warning is issued when the count exceeds the threshold.
THRESHOLD_MAX_WORDS_IN_PARAGRAPH = 600
FORMAT_MAX_WORDS_IN_PARAGRAPH = '* [LV2-1-1] 1段落に{:d}単語含まれています．多くとも{:d}単語以下となるよう，文や段落の構成を考え直してみましょう．'

# Parameters for Check (2a): words per sentence (warned when exceeded).
THRESHOLD_MAX_WORDS_IN_SENTENCE  = 60
FORMAT_MAX_WORDS_IN_SENTENCE  = '> {:.32s}...\n  * [LV2-2-1] 1文に{:d}単語含まれているようです．多くとも{:d}単語以下となるよう，短い文にすることを考えてみましょう．'
# Parameters for Check (2b): commas per sentence (warned when exceeded).
THRESHOLD_MAX_COMMA_IN_SENTENCE  = 6
FORMAT_MAX_COMMA_IN_SENTENCE  = '> {:.32s}...\n  * [LV2-2-2] 1文に読点が{:d}個含まれています．多くとも{:d}個以下になるよう，文の区切りを考え直してみましょう．'

# Parameters for Check (3a): repeated particle "no".
# Unlike the limits above, a warning is issued as soon as the count
# reaches this value.  The name is misspelt ("THREASHOLD") but is kept
# because the checking code refers to it under this spelling.
THREASHOLD_MAX_JOSHI_NO = 3
FORMAT_MAX_JOSHI_NO = '> {:.32s}...\n  * [LV2-3-1] 助詞の「の」が{:d}回繰り返されており，わかりにくい文になっています．{:d}回未満を目安として，適切に読点を入れたり，1文の長さを再考する，などを考えてみましょう．'
# Parameters for Check (3b): double negative.  The second placeholder
# receives the text of the suspected double-negative phrase.
# ("NEGTIVE" is likewise a misspelling kept for the existing references.)
FORMAT_DOUBLE_NEGTIVE = '> {:.32s}...\n  * [LV2-3-2] 二重否定と思われるセンテンス「{:s}」が含まれています．修正を検討しましょう．'

# Parameters for Check (4a): polite (desu/masu) style.
FORMAT_DESUMASU = '> {:.32s}...\n  * [LV2-4-1] 敬体（「です，ます」など）が使われています．レポートでは常体（「である」など）を使いましょう（ただし，感想を除く）．'
# Parameters for Check (4b): weak expressions.
FORMAT_WEAK_WORD = '> {:.32s}...\n  * [LV2-4-2] 弱い表現が使われています．レポートでは主体的に主張した文章を書きましょう（ただし，感想を除く）．'

def warning(linenum_st, linenum_en, source_text, messages):
    '''Print one warning block on standard output.

    The block consists of a header line (wrapped in ANSI bold-green escape
    codes) giving the line range and the source text, then every message
    on its own line, then one blank separator line.

    Args:
        linenum_st: first line number of the reported range (int).
        linenum_en: last line number of the reported range (int).
        source_text: text shown after the header (str).
        messages: iterable of messages to print, in order.
    '''
    header = '\033[1;32mWARNING: Lines {:d}--{:d}\033[0m: {:s}'.format(
        linenum_st, linenum_en, source_text)
    body = ['{}'.format(entry) for entry in messages]
    # The trailing empty element produces the blank separator line.
    print('\n'.join([header] + body + ['']))

def load_texfile(src_filename):
    '''Read a LaTeX file and split its running text into paragraphs.

    Every line is right-stripped.  Lines inside verbatim, table, figure,
    itemize and enumerate environments (including the begin/end lines
    themselves) are skipped.  On the remaining lines, comments, sectioning
    commands, label/ref/cite commands and the small/large/huge size
    switches are removed, and the LaTeX logo command is replaced by the
    plain word "LaTeX".  The cleaned lines are concatenated without a
    separator.

    A paragraph ends at a line that is empty after cleaning (so a
    comment-only line also ends it), at a line consisting solely of the
    par command, or at the end of the file.

    Args:
        src_filename: path of the UTF-8 encoded LaTeX file.

    Returns:
        A list of dicts, one per paragraph, with the keys
        'line_start' (1-based number of the first non-empty line),
        'line_end' (number of the line before the terminating line, or of
        the last line read when the paragraph is closed by end of file)
        and 'str' (the concatenated text).
    '''
    # (begin, end) patterns of the environments whose content is ignored.
    # A single "currently skipping until ..." state replaces one flag per
    # environment; the original set the verbatim flag on a table begin and
    # then never found the matching end.
    skipped_environments = (
        (r'\\begin\{verbatim\}', r'\\end\{verbatim\}'),
        (r'\\begin\{table\}', r'\\end\{table\}'),
        (r'\\begin\{figure\}', r'\\end\{figure\}'),
        (r'\\begin\{(itemize|enumerate)\}', r'\\end\{(itemize|enumerate)\}'),
    )

    paragraphs = []
    pending = ''        # text accumulated for the paragraph being built
    first_line = 0      # line number where it started; 0 means "not started"
    skip_until = None   # end pattern of the environment being skipped
    lineno = 0

    with open(src_filename, 'r', encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.rstrip()

            if skip_until is not None:
                if re.search(skip_until, line):
                    skip_until = None
                continue
            for begin_pattern, end_pattern in skipped_environments:
                if re.search(begin_pattern, line):
                    skip_until = end_pattern
                    break
            if skip_until is not None:
                continue

            # remove comments (a percent sign not escaped by a backslash)
            line = re.sub(r'(?<!\\)%.*', '', line)
            # remove some markups
            line = re.sub(r'\\(sub)*section\{[^\}]+\}', '', line)
            line = re.sub(r'\\(label|ref|cite)\{[^\}]+\}', '', line)
            line = re.sub(r'\\(small|large|huge)', '', line)
            line = re.sub(r'\\LaTeX\s*', 'LaTeX', line)

            if first_line == 0 and line != '':
                first_line = lineno

            pending += line
            if (line == '' and pending) or line == '\\par':
                # `line` is '' here unless it is a bare \par, which is then
                # appended a second time exactly as the original did.
                paragraphs.append({'line_start': first_line,
                                   'line_end': lineno - 1,
                                   'str': pending + line})
                pending = ''
                first_line = 0

    # A file that does not end with a blank line still has an open
    # paragraph; emit it instead of silently dropping it.
    if pending:
        paragraphs.append({'line_start': first_line, 'line_end': lineno,
                           'str': pending})

    return paragraphs

def preprocess_sentences(paragraphs):
    '''Strip the remaining LaTeX markup from every paragraph, in place.

    Each dict in `paragraphs` has its 'str' entry rewritten; nothing is
    returned.  The substitutions run in this order:

    1. an inline verb command with its delimited argument becomes the
       placeholder ' VERB ';
    2. inline math loses its surrounding dollar signs;
    3. any remaining control word (a backslash followed by word
       characters) is deleted;
    4. curly braces are deleted;
    5. whitespace that does not directly follow a word character is
       deleted.
    '''
    substitutions = (
        (r'\\verb(.)([^\1]*?)\1', r' VERB '),
        (r'\$([^\$]*)\$', r'\g<1>'),
        (r'\\\w+', ''),
        (r'\{|\}', ''),
        (r'(?<!\w)\s+', ''),
    )
    for entry in paragraphs:
        text = entry['str']
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        entry['str'] = text

def report_check_level2(src_filename):
    '''Run the level-2 checks on one LaTeX file and print the warnings.

    The file is split into paragraphs, markup is stripped, and every
    paragraph is tokenized with Janome.  Counters and flags are updated
    token by token and evaluated at clause boundaries (tokens whose second
    part-of-speech field is '読点') and at sentence boundaries ('句点'):

    - (1)  words per paragraph
    - (2a) words per sentence, (2b) commas per sentence
    - (3a) repeated particle 'の', (3b) double negative
    - (4a) polite desu/masu style, (4b) weak expressions

    Text after the last sentence-ending token of a paragraph only
    contributes to check (1).  All messages of a paragraph are printed
    together through warning(); nothing is returned.

    Args:
        src_filename: path of the LaTeX source file to check.
    '''
    # Janome is imported lazily, as in the original, so the module itself
    # can be imported without the third-party package installed.
    from janome.tokenizer import Tokenizer
    pos_tagger = Tokenizer()

    # Load text
    paragraphs = load_texfile(src_filename)
    preprocess_sentences(paragraphs)

    for par in paragraphs:
        num_of_words_in_paragraph = 0
        num_of_words_in_sentence  = 0
        num_of_comma_in_sentence  = 0
        num_of_joshi_no = 0
        has_negative = False
        has_desumasu = False
        has_weak_word = False

        str_sentence = ""        # text of the current sentence
        str_short_sentence = ""  # text of the current clause (since last comma)
        str_negative = ""        # text collected since a negative adjective

        warning_messages = []

        for token in pos_tagger.tokenize(par['str']):
            # part_of_speech is a comma-separated string; only the first
            # two fields are used.
            pos0, pos1 = token.part_of_speech.split(',')[:2]

            str_sentence += token.surface
            str_short_sentence += token.surface

            # Symbols and full-width parentheses do not count as words.
            if pos0 != '記号' and token.surface != '（' and token.surface != '）':
                num_of_words_in_paragraph += 1
                num_of_words_in_sentence  += 1

            if pos0 == '助詞':
                if token.surface == 'の':
                    num_of_joshi_no += 1
                else:
                    # Check (3a): any other particle ends the current run.
                    if num_of_joshi_no >= THREASHOLD_MAX_JOSHI_NO:
                        warning_messages.append( FORMAT_MAX_JOSHI_NO.format(
                            str_short_sentence, num_of_joshi_no, THREASHOLD_MAX_JOSHI_NO))

                    num_of_joshi_no = 0

            # Check (4a) DESU, MASU
            if pos0 == '助動詞' and re.search(r'です|ます', token.base_form):
                has_desumasu = True

            # Check (4b) Weak words
            if pos1 == '副助詞' and token.base_form == 'かも':
                has_weak_word = True
            # NOTE(review): presumably targets forms such as "だろ(う)" -- confirm.
            if pos0 == '助動詞' and token.infl_form == '未然形' and token.base_form == 'だ':
                has_weak_word = True
            if (pos0 == '助動詞' or pos0 == '形容詞') and token.infl_form == '未然ウ接続':
                has_weak_word = True
            if pos0 == '形容詞' and token.base_form == '難い':
                has_weak_word = True

            # A negative adjective opens a candidate double negative; the
            # next auxiliary verb closes it.
            if pos0 == '形容詞' and (re.search(r'無い|ない', token.base_form)):
                has_negative = True
            if has_negative:
                str_negative += token.surface
            if has_negative and pos0 == '助動詞':
                # Check (3b) Double Negative
                if token.surface == 'ない':
                    warning_messages.append( FORMAT_DOUBLE_NEGTIVE.format(
                        str_short_sentence, str_negative))

                has_negative = False
                str_negative = ''

            if pos1 == '読点':
                num_of_comma_in_sentence += 1

                # Check (3a)
                if num_of_joshi_no >= THREASHOLD_MAX_JOSHI_NO:
                    warning_messages.append( FORMAT_MAX_JOSHI_NO.format(
                        str_short_sentence, num_of_joshi_no, THREASHOLD_MAX_JOSHI_NO))
                # Check (4a) DESU, MASU
                if has_desumasu:
                    warning_messages.append( FORMAT_DESUMASU.format(str_short_sentence) )
                # Check (4b) Weak words
                if has_weak_word:
                    warning_messages.append( FORMAT_WEAK_WORD.format(str_short_sentence) )

                # reset the per-clause state
                num_of_joshi_no = 0
                str_short_sentence = '...，'
                has_negative = False
                # Reset together with the other clause flags; otherwise one
                # desu/masu clause makes every later clause of the same
                # sentence report LV2-4-1 as well.
                has_desumasu = False
                has_weak_word = False

            if pos1 == '句点':  # '．'
                # Check (2a)
                if num_of_words_in_sentence > THRESHOLD_MAX_WORDS_IN_SENTENCE:
                    warning_messages.append( FORMAT_MAX_WORDS_IN_SENTENCE.format(
                        str_sentence, num_of_words_in_sentence, THRESHOLD_MAX_WORDS_IN_SENTENCE))
                # Check (2b)
                if num_of_comma_in_sentence > THRESHOLD_MAX_COMMA_IN_SENTENCE:
                    warning_messages.append( FORMAT_MAX_COMMA_IN_SENTENCE.format(
                        str_sentence, num_of_comma_in_sentence, THRESHOLD_MAX_COMMA_IN_SENTENCE))

                # Check (3a)
                if num_of_joshi_no >= THREASHOLD_MAX_JOSHI_NO:
                    warning_messages.append( FORMAT_MAX_JOSHI_NO.format(
                        str_short_sentence, num_of_joshi_no, THREASHOLD_MAX_JOSHI_NO))
                # Check (4a) DESU, MASU
                if has_desumasu:
                    warning_messages.append( FORMAT_DESUMASU.format(str_short_sentence) )
                # Check (4b) Weak words
                if has_weak_word:
                    warning_messages.append( FORMAT_WEAK_WORD.format(str_short_sentence) )

                # reset the per-sentence and per-clause state
                num_of_words_in_sentence = 0
                num_of_comma_in_sentence = 0
                str_sentence = ''
                num_of_joshi_no = 0
                str_short_sentence = ''
                has_negative = False
                has_desumasu = False
                has_weak_word = False

        # Check (1)
        if num_of_words_in_paragraph > THRESHOLD_MAX_WORDS_IN_PARAGRAPH:
            warning_messages.append( FORMAT_MAX_WORDS_IN_PARAGRAPH.format(
                num_of_words_in_paragraph, THRESHOLD_MAX_WORDS_IN_PARAGRAPH))

        if warning_messages:
            warning(par['line_start'], par['line_end'], '', warning_messages)

if __name__ == '__main__':
    # Command-line entry point: check every file named on the command line.
    if len(sys.argv) == 1:
        print('Usage: python {:s} filename1.tex [filename2.tex ...]'.format(sys.argv[0]))
        # sys.exit() rather than the bare exit(): the latter is added by the
        # site module for interactive use and is not guaranteed to exist.
        sys.exit(1)
    for filename in sys.argv[1:]:
        report_check_level2(filename)
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	'''Edu report checker: Level 2

	形態素解析も行いながら，発展的な内容もチェックしています．

	Message Examples
	-------------------

	- [LV2-1-1] 1段落に{:d}単語含まれています．多くとも 600 単語以下となるよう，文や段落の構成を考え直してみましょう．
	- [LV2-2-1] 1文に{:d}単語含まれているようです．多くとも 60 単語以下となるよう，短い文にすることを考えてみましょう．
	- [LV2-2-2] 1文に読点が{:d}個含まれています．多くとも 6 個以下になるよう，文の区切りを考え直してみましょう．
	- [LV2-3-1] 助詞の「の」が{:d}回繰り返されており，わかりにくい文になっています．3 回未満を目安として，適切に読点を入れたり，1文の長さを再考する，などを考えてみましょう．
	- [LV2-3-2] 二重否定と思われるセンテンス「{:s}」が含まれています．修正を検討しましょう．
	- [LV2-4-1] 敬体（「です，ます」など）が使われています．レポートでは常体（「である」など）を使いましょう（ただし，感想を除く）．
	- [LV2-4-2] 弱い表現が使われています．レポートでは主体的に主張した文章を書きましょう（ただし，感想を除く）．

	Note
	-----

	- 利用には `Janome <https://github.com/mocobeta/janome>`_ が必要です．

	'''
	import sys
	import re

	# Tunable thresholds and message templates, grouped by check number.
	# Templates that begin with '> {:.32s}...' first quote the offending text,
	# truncated to at most 32 characters, and put the message on the next line.
	# The two-space indent before '* [LV2-...]' had been collapsed to one space
	# in this copy; it is restored so the output matches the definitions
	# earlier in the file.

	# Parameters for Check (1): words per paragraph (warned when exceeded).
	THRESHOLD_MAX_WORDS_IN_PARAGRAPH = 600
	FORMAT_MAX_WORDS_IN_PARAGRAPH = '* [LV2-1-1] 1段落に{:d}単語含まれています．多くとも{:d}単語以下となるよう，文や段落の構成を考え直してみましょう．'

	# Parameters for Check (2a): words per sentence (warned when exceeded).
	THRESHOLD_MAX_WORDS_IN_SENTENCE = 60
	FORMAT_MAX_WORDS_IN_SENTENCE = '> {:.32s}...\n  * [LV2-2-1] 1文に{:d}単語含まれているようです．多くとも{:d}単語以下となるよう，短い文にすることを考えてみましょう．'
	# Parameters for Check (2b): commas per sentence (warned when exceeded).
	THRESHOLD_MAX_COMMA_IN_SENTENCE = 6
	FORMAT_MAX_COMMA_IN_SENTENCE = '> {:.32s}...\n  * [LV2-2-2] 1文に読点が{:d}個含まれています．多くとも{:d}個以下になるよう，文の区切りを考え直してみましょう．'

	# Parameters for Check (3a): repeated particle "no" (warned as soon as the
	# count reaches this value).  The misspelt name is kept because the
	# checking code refers to it under this spelling.
	THREASHOLD_MAX_JOSHI_NO = 3
	FORMAT_MAX_JOSHI_NO = '> {:.32s}...\n  * [LV2-3-1] 助詞の「の」が{:d}回繰り返されており，わかりにくい文になっています．{:d}回未満を目安として，適切に読点を入れたり，1文の長さを再考する，などを考えてみましょう．'
	# Parameters for Check (3b): double negative.  The second placeholder
	# receives the text of the suspected phrase.
	FORMAT_DOUBLE_NEGTIVE = '> {:.32s}...\n  * [LV2-3-2] 二重否定と思われるセンテンス「{:s}」が含まれています．修正を検討しましょう．'

	# Parameters for Check (4a): polite (desu/masu) style.
	FORMAT_DESUMASU = '> {:.32s}...\n  * [LV2-4-1] 敬体（「です，ます」など）が使われています．レポートでは常体（「である」など）を使いましょう（ただし，感想を除く）．'
	# Parameters for Check (4b): weak expressions.
	FORMAT_WEAK_WORD = '> {:.32s}...\n  * [LV2-4-2] 弱い表現が使われています．レポートでは主体的に主張した文章を書きましょう（ただし，感想を除く）．'

	def warning(linenum_st, linenum_en, source_text, messages):
	    '''Print one warning block on standard output.

	    The block consists of a header line (wrapped in ANSI bold-green
	    escape codes) giving the line range and the source text, then every
	    message on its own line, then one blank separator line.

	    Args:
	        linenum_st: first line number of the reported range (int).
	        linenum_en: last line number of the reported range (int).
	        source_text: text shown after the header (str).
	        messages: iterable of messages to print, in order.
	    '''
	    print('\033[1;32mWARNING: Lines {:d}--{:d}\033[0m: {:s}'.format(linenum_st, linenum_en, source_text))
	    for msg in messages:
	        print('{}'.format(msg))
	    # blank line separating this block from the next one
	    print()

	def load_texfile(src_filename):
	    '''Read a LaTeX file and split its running text into paragraphs.

	    Every line is right-stripped.  Lines inside verbatim, table, figure,
	    itemize and enumerate environments (including the begin/end lines
	    themselves) are skipped.  On the remaining lines, comments,
	    sectioning commands, label/ref/cite commands and the small/large/huge
	    size switches are removed, and the LaTeX logo command is replaced by
	    the plain word "LaTeX".  The cleaned lines are concatenated without a
	    separator.

	    A paragraph ends at a line that is empty after cleaning (so a
	    comment-only line also ends it), at a line consisting solely of the
	    par command, or at the end of the file.

	    Args:
	        src_filename: path of the UTF-8 encoded LaTeX file.

	    Returns:
	        A list of dicts, one per paragraph, with the keys
	        'line_start' (1-based number of the first non-empty line),
	        'line_end' (number of the line before the terminating line, or
	        of the last line read when the paragraph is closed by end of
	        file) and 'str' (the concatenated text).
	    '''
	    # (begin, end) patterns of the environments whose content is ignored.
	    # The alternations use a plain '|'; this copy had them escaped, which
	    # made them match a literal pipe character.  A single "skipping
	    # until ..." state replaces one flag per environment, so a table
	    # begin can no longer switch on the verbatim flag by mistake.
	    skipped_environments = (
	        (r'\\begin\{verbatim\}', r'\\end\{verbatim\}'),
	        (r'\\begin\{table\}', r'\\end\{table\}'),
	        (r'\\begin\{figure\}', r'\\end\{figure\}'),
	        (r'\\begin\{(itemize|enumerate)\}', r'\\end\{(itemize|enumerate)\}'),
	    )

	    paragraphs = []
	    pending = ''        # text accumulated for the paragraph being built
	    first_line = 0      # line number where it started; 0 means "not started"
	    skip_until = None   # end pattern of the environment being skipped
	    lineno = 0

	    with open(src_filename, 'r', encoding="utf-8") as f:
	        for lineno, line in enumerate(f, start=1):
	            line = line.rstrip()

	            if skip_until is not None:
	                if re.search(skip_until, line):
	                    skip_until = None
	                continue
	            for begin_pattern, end_pattern in skipped_environments:
	                if re.search(begin_pattern, line):
	                    skip_until = end_pattern
	                    break
	            if skip_until is not None:
	                continue

	            # remove comments (a percent sign not escaped by a backslash)
	            line = re.sub(r'(?<!\\)%.*', '', line)
	            # remove some markups
	            line = re.sub(r'\\(sub)*section\{[^\}]+\}', '', line)
	            line = re.sub(r'\\(label|ref|cite)\{[^\}]+\}', '', line)
	            line = re.sub(r'\\(small|large|huge)', '', line)
	            line = re.sub(r'\\LaTeX\s*', 'LaTeX', line)

	            if first_line == 0 and line != '':
	                first_line = lineno

	            pending += line
	            if (line == '' and pending) or line == '\\par':
	                # `line` is '' here unless it is a bare \par, which is
	                # then appended a second time.
	                paragraphs.append({'line_start': first_line,
	                                   'line_end': lineno - 1,
	                                   'str': pending + line})
	                pending = ''
	                first_line = 0

	    # A file that does not end with a blank line still has an open
	    # paragraph; emit it instead of silently dropping it.
	    if pending:
	        paragraphs.append({'line_start': first_line, 'line_end': lineno,
	                           'str': pending})

	    return paragraphs

	def preprocess_sentences(paragraphs):
	    '''Strip the remaining LaTeX markup from every paragraph, in place.

	    Each dict in `paragraphs` has its 'str' entry rewritten; nothing is
	    returned.  The substitutions run in this order:

	    1. an inline verb command with its delimited argument becomes the
	       placeholder ' VERB ';
	    2. inline math loses its surrounding dollar signs;
	    3. any remaining control word (a backslash followed by word
	       characters) is deleted;
	    4. curly braces are deleted;
	    5. whitespace that does not directly follow a word character is
	       deleted.
	    '''
	    substitutions = (
	        (r'\\verb(.)([^\1]*?)\1', r' VERB '),
	        (r'\$([^\$]*)\$', r'\g<1>'),
	        (r'\\\w+', ''),
	        # Plain alternation: the escaped pipe in this copy only matched
	        # the literal three-character sequence "{|}".
	        (r'\{|\}', ''),
	        (r'(?<!\w)\s+', ''),
	    )
	    for entry in paragraphs:
	        text = entry['str']
	        for pattern, replacement in substitutions:
	            text = re.sub(pattern, replacement, text)
	        entry['str'] = text

	def report_check_level2(src_filename):
	    '''Run the level-2 checks on one LaTeX file and print the warnings.

	    The file is split into paragraphs, markup is stripped, and every
	    paragraph is tokenized with Janome.  Counters and flags are updated
	    token by token and evaluated at clause boundaries (tokens whose
	    second part-of-speech field is '読点') and at sentence boundaries
	    ('句点'):

	    - (1)  words per paragraph
	    - (2a) words per sentence, (2b) commas per sentence
	    - (3a) repeated particle 'の', (3b) double negative
	    - (4a) polite desu/masu style, (4b) weak expressions

	    Text after the last sentence-ending token of a paragraph only
	    contributes to check (1).  All messages of a paragraph are printed
	    together through warning(); nothing is returned.

	    Args:
	        src_filename: path of the LaTeX source file to check.
	    '''
	    # Janome is imported lazily so the module itself can be imported
	    # without the third-party package installed.
	    from janome.tokenizer import Tokenizer
	    pos_tagger = Tokenizer()

	    # Load text
	    paragraphs = load_texfile(src_filename)
	    preprocess_sentences(paragraphs)

	    for par in paragraphs:
	        num_of_words_in_paragraph = 0
	        num_of_words_in_sentence = 0
	        num_of_comma_in_sentence = 0
	        num_of_joshi_no = 0
	        has_negative = False
	        has_desumasu = False
	        has_weak_word = False

	        str_sentence = ""        # text of the current sentence
	        str_short_sentence = ""  # text of the current clause (since last comma)
	        str_negative = ""        # text collected since a negative adjective

	        warning_messages = []

	        for token in pos_tagger.tokenize(par['str']):
	            # part_of_speech is a comma-separated string; only the first
	            # two fields are used.
	            pos0, pos1 = token.part_of_speech.split(',')[:2]

	            str_sentence += token.surface
	            str_short_sentence += token.surface

	            # Symbols and full-width parentheses do not count as words.
	            if pos0 != '記号' and token.surface != '（' and token.surface != '）':
	                num_of_words_in_paragraph += 1
	                num_of_words_in_sentence += 1

	            if pos0 == '助詞':
	                if token.surface == 'の':
	                    num_of_joshi_no += 1
	                else:
	                    # Check (3a): any other particle ends the current run.
	                    if num_of_joshi_no >= THREASHOLD_MAX_JOSHI_NO:
	                        warning_messages.append( FORMAT_MAX_JOSHI_NO.format(
	                            str_short_sentence, num_of_joshi_no, THREASHOLD_MAX_JOSHI_NO))

	                    num_of_joshi_no = 0

	            # Check (4a) DESU, MASU
	            # (plain '|' alternation; this copy had it escaped, which
	            # only matched a literal pipe character)
	            if pos0 == '助動詞' and re.search(r'です|ます', token.base_form):
	                has_desumasu = True

	            # Check (4b) Weak words
	            if pos1 == '副助詞' and token.base_form == 'かも':
	                has_weak_word = True
	            # NOTE(review): presumably targets forms such as "だろ(う)" -- confirm.
	            if pos0 == '助動詞' and token.infl_form == '未然形' and token.base_form == 'だ':
	                has_weak_word = True
	            if (pos0 == '助動詞' or pos0 == '形容詞') and token.infl_form == '未然ウ接続':
	                has_weak_word = True
	            if pos0 == '形容詞' and token.base_form == '難い':
	                has_weak_word = True

	            # A negative adjective opens a candidate double negative; the
	            # next auxiliary verb closes it.
	            if pos0 == '形容詞' and (re.search(r'無い|ない', token.base_form)):
	                has_negative = True
	            if has_negative:
	                str_negative += token.surface
	            if has_negative and pos0 == '助動詞':
	                # Check (3b) Double Negative
	                if token.surface == 'ない':
	                    warning_messages.append( FORMAT_DOUBLE_NEGTIVE.format(
	                        str_short_sentence, str_negative))

	                has_negative = False
	                str_negative = ''

	            if pos1 == '読点':
	                num_of_comma_in_sentence += 1

	                # Check (3a)
	                if num_of_joshi_no >= THREASHOLD_MAX_JOSHI_NO:
	                    warning_messages.append( FORMAT_MAX_JOSHI_NO.format(
	                        str_short_sentence, num_of_joshi_no, THREASHOLD_MAX_JOSHI_NO))
	                # Check (4a) DESU, MASU
	                if has_desumasu:
	                    warning_messages.append( FORMAT_DESUMASU.format(str_short_sentence) )
	                # Check (4b) Weak words
	                if has_weak_word:
	                    warning_messages.append( FORMAT_WEAK_WORD.format(str_short_sentence) )

	                # reset the per-clause state
	                num_of_joshi_no = 0
	                str_short_sentence = '...，'
	                has_negative = False
	                # Reset together with the other clause flags; otherwise
	                # one desu/masu clause makes every later clause of the
	                # same sentence report LV2-4-1 as well.
	                has_desumasu = False
	                has_weak_word = False

	            if pos1 == '句点':  # '．'
	                # Check (2a)
	                if num_of_words_in_sentence > THRESHOLD_MAX_WORDS_IN_SENTENCE:
	                    warning_messages.append( FORMAT_MAX_WORDS_IN_SENTENCE.format(
	                        str_sentence, num_of_words_in_sentence, THRESHOLD_MAX_WORDS_IN_SENTENCE))
	                # Check (2b)
	                if num_of_comma_in_sentence > THRESHOLD_MAX_COMMA_IN_SENTENCE:
	                    warning_messages.append( FORMAT_MAX_COMMA_IN_SENTENCE.format(
	                        str_sentence, num_of_comma_in_sentence, THRESHOLD_MAX_COMMA_IN_SENTENCE))

	                # Check (3a)
	                if num_of_joshi_no >= THREASHOLD_MAX_JOSHI_NO:
	                    warning_messages.append( FORMAT_MAX_JOSHI_NO.format(
	                        str_short_sentence, num_of_joshi_no, THREASHOLD_MAX_JOSHI_NO))
	                # Check (4a) DESU, MASU
	                if has_desumasu:
	                    warning_messages.append( FORMAT_DESUMASU.format(str_short_sentence) )
	                # Check (4b) Weak words
	                if has_weak_word:
	                    warning_messages.append( FORMAT_WEAK_WORD.format(str_short_sentence) )

	                # reset the per-sentence and per-clause state
	                num_of_words_in_sentence = 0
	                num_of_comma_in_sentence = 0
	                str_sentence = ''
	                num_of_joshi_no = 0
	                str_short_sentence = ''
	                has_negative = False
	                has_desumasu = False
	                has_weak_word = False

	        # Check (1)
	        if num_of_words_in_paragraph > THRESHOLD_MAX_WORDS_IN_PARAGRAPH:
	            warning_messages.append( FORMAT_MAX_WORDS_IN_PARAGRAPH.format(
	                num_of_words_in_paragraph, THRESHOLD_MAX_WORDS_IN_PARAGRAPH))

	        if warning_messages:
	            warning(par['line_start'], par['line_end'], '', warning_messages)

	if __name__ == '__main__':
	    # Command-line entry point: check every file named on the command line.
	    if len(sys.argv) == 1:
	        print('Usage: python {:s} filename1.tex [filename2.tex ...]'.format(sys.argv[0]))
	        # sys.exit() rather than the bare exit(): the latter is added by
	        # the site module for interactive use and is not guaranteed to exist.
	        sys.exit(1)
	    for filename in sys.argv[1:]:
	        report_check_level2(filename)