# naoh16/edu-reportcheck-level1.py

## edu-reportcheck-level1.py
#!env python
# -*- coding: utf-8 -*-
'''Edu report checker: Level 1

主にLaTeXの使い方のミスに起因するような，機械的に見つけられるレベルの問題を指摘しています．

Message Examples
-------------------

- [LV1-1-1] 全角句点には「．」を使いましょう．
- [LV1-1-2] 全角読点には「，」を使いましょう．
- [LV1-1-3] 日本語文の句点には全角ピリオドを使いましょう．
- [LV1-1-4] 句点相当の半角ピリオドの直後にはスペースを入れましょう．
- [LV1-1-5] 半角カンマの直後にはスペースを入れましょう．
- [LV1-1-6] 全角読点の後ろに半角スペースは不要ですので，削除しましょう．
- [LV1-1-7] 全角英数字ではなく，半角英数字を使いましょう．
- [LV1-2-1] 本文中での改行コマンド[\\\\]は利用しないようにしましょう．[代替案：空行による適切な段落分け，\\parの利用]
- [LV1-2-2] 全角スペースは利用しないようにしましょう．[代替案：空行による適切な段落分け，\\quadの利用]
- [LV1-3-1] 図表にはcaptionをつけましょう．
- [LV1-3-2] 図のcaptionは下側につけましょう．
- [LV1-3-3] 表のcaptionは上側につけましょう．
- [LV1-4-1] verbatim環境内で1行に半角 MAX_VERBATIM_WIDTH 文字以上を書くと，余白にはみ出る恐れがあります．
            読みやすくなる位置に改行を加えるなどして，文章を修正しましょう．（参考：現在の文字幅＝半角 CHAR_NUM 文字相当）
- [LV1-5-1] 階層 SECTION の次に SUBSUBSECTION が出現しています．
            階層 SECTION の次の階層は SUBSECTION となるように，階層構造を修正しましょう．

'''

import sys
import re

MAX_VERBATIM_WIDTH = 86

def warning(linenum, source_text, messages):
    '''Print one highlighted warning header followed by its messages.

    Args:
        linenum: Line number shown in the header (formatted with ``{:2d}``).
        source_text: The offending source line, echoed after the header.
        messages: Iterable of messages; each is printed as a ``*`` bullet.

    The header uses ANSI bold-green escape codes.  If ``source_text`` holds
    characters the output stream cannot encode, those characters are shown
    as backslash escapes instead of the whole header being dropped, so the
    line number is always reported.  A blank line is printed at the end.
    '''
    header = '\033[1;32mWARNING: Line {:2d}\033[0m: {:s}'
    try:
        print(header.format(linenum, source_text))
    except UnicodeEncodeError:
        # Re-encode with the stream's own codec so only the characters that
        # cannot be represented are replaced by \uXXXX-style escapes.
        encoding = getattr(sys.stdout, 'encoding', None) or 'ascii'
        printable = source_text.encode(encoding, 'backslashreplace').decode(encoding)
        print(header.format(linenum, printable))
    for msg in messages:
        print('   * {}'.format(msg))
    print()

class LatexStatus:
    r'''Line-by-line tracker of the LaTeX context the checker is currently in.

    Feed every source line to :meth:`update`; afterwards the attributes
    describe the state that applies to that line.
    '''

    def __init__(self):
        # True unless we are between a \documentclass line and the
        # following \begin{document} line (i.e. inside the preamble).
        self.body = True
        # Flags that are True while the corresponding environment is open.
        self.env_table = False
        self.env_figure = False
        self.env_verbatim = False
        self.env_tabular = False
        self.env_equation = False
        # A \caption was seen since the most recent table/figure was opened.
        self.float_caption = False
        # A tabular was opened since the most recent table was opened.
        self.float_table_contents = False
        # An \includegraphics was seen since the most recent figure was opened.
        self.float_figure_contents = False
        # Name of the environment closed by the latest line, or "" if none.
        self.ending = ""

    def update(self, text):
        r'''Advance the state using one source line.

        Args:
            text: A single line of LaTeX source.

        ``ending`` is reset on every call and then set to "equation",
        "table", "figure" or "tabular" when the line closes that
        environment; if several close on one line, the check that runs
        last wins.  An environment opened and closed on the same line ends
        up closed.
        '''
        self.ending = ""

        # Preamble detection.
        if re.search(r'\\documentclass', text):
            self.body = False
        if re.search(r'\\begin\s*{document}', text):
            self.body = True

        # Display-math environments.
        if re.search(r'\\begin\s*{(align|equation|eqnarray)\*?}', text):
            self.env_equation = True
        if self.env_equation and re.search(r'\\end\s*{(align|equation|eqnarray)\*?}', text):
            self.env_equation = False
            self.ending = "equation"

        # table float: opening one resets the caption/contents bookkeeping.
        if not self.env_table and re.search(r'\\begin\s*{table\*?}', text):
            self.env_table = True
            self.float_caption = False
            self.float_table_contents = False
        if self.env_table and re.search(r'\\end\s*{table\*?}', text):
            self.env_table = False
            self.ending = "table"

        # figure float: same bookkeeping as for table.
        if not self.env_figure and re.search(r'\\begin\s*{figure\*?}', text):
            self.env_figure = True
            self.float_caption = False
            self.float_figure_contents = False
        if self.env_figure and re.search(r'\\end\s*{figure\*?}', text):
            self.env_figure = False
            self.ending = "figure"

        if not self.env_verbatim and re.search(r'\\begin\s*{verbatim\*?}', text):
            self.env_verbatim = True
        if self.env_verbatim and re.search(r'\\end\s*{verbatim\*?}', text):
            self.env_verbatim = False

        # tabular: accept optional whitespace and the starred form, exactly
        # like the other environments above.
        if re.search(r'\\begin\s*{tabular\*?}', text):
            self.env_tabular = True
            self.float_table_contents = True
        if re.search(r'\\end\s*{tabular\*?}', text):
            self.env_tabular = False
            self.ending = "tabular"

        if re.search(r'\\includegraphics', text):
            self.float_figure_contents = True

        if re.search(r'\\caption', text):
            self.float_caption = True

def remove_unecessary_elements(text):
    r'''Strip the parts of one source line that the checks should ignore.

    Removed, in this order:

    1. Comments: an unescaped ``%`` through the end of the line.
    2. ``\ref``, ``\label``, ``\cite`` and ``\include...`` commands together
       with their optional ``[...]`` and mandatory ``{...}`` argument.
    3. Inline math written as ``$...$``.  Escaped dollars (``\$``) are
       literal currency signs, not delimiters, so they neither open nor
       close a math span.

    Line terminators are not touched here.

    Args:
        text: One line of LaTeX source.

    Returns:
        The line with the elements above deleted.
    '''
    text = re.sub(r'(?<!\\)%.*', '', text)
    text = re.sub(r'\\(ref|label|cite|include\w*)(\[[^\]]+\])?{[^\}]+}', '', text)
    # Opening and closing "$" must be unescaped; "\$" inside the span is
    # consumed as ordinary content.
    text = re.sub(r'(?<!\\)\$(?:\\\$|[^$])+(?<!\\)\$', '', text)
    return text

import unicodedata
def len_japanese(text):
    '''Return the display width of ``text`` measured in half-width columns.

    Characters whose East Asian Width class is Fullwidth (``F``), Wide
    (``W``) or Ambiguous (``A``) count as 2 columns, a tab counts as 4, and
    everything else -- including Halfwidth (``H``) forms such as half-width
    katakana -- counts as 1.

    Args:
        text: The string to measure.

    Returns:
        The total width as an int; 0 for an empty string.
    '''
    width = 0
    for ch in text:
        if ch == '\t':
            width += 4
        elif unicodedata.east_asian_width(ch) in ('F', 'W', 'A'):
            width += 2
        else:
            width += 1
    return width

dict_section_depth = {'chapter': 0, 'section': 1, 'subsection': 2, 'subsubsection': 3}

def report_check_level1(src_filename):
    '''Run all level-1 checks on one LaTeX file and print the findings.

    The file is read as UTF-8, one line at a time.  Lines in the preamble
    (between ``\\documentclass`` and ``\\begin{document}``) are skipped.  For
    every other line the following groups of checks are applied, and any
    resulting messages are reported through ``warning``:

    1. Punctuation and character usage (LV1-1-x).
    2. LaTeX command usage: forced line breaks, full-width spaces (LV1-2-x).
    3. Presence and position of captions in table/figure floats (LV1-3-x).
    4. Over-long lines inside verbatim (LV1-4-1).
    5. Skipped levels in the sectioning hierarchy (LV1-5-1).

    Args:
        src_filename: Path of the LaTeX source file to check.
    '''
    latex_status = LatexStatus()
    cur_section_depth = 0
    # Reverse lookup (depth -> command name) used to build LV1-5-1 messages.
    section_tag_by_depth = {depth: tag for tag, depth in dict_section_depth.items()}

    # "with" makes sure the file is closed even when a check raises.
    with open(src_filename, 'r', encoding='utf8') as src_file:
        for n, line in enumerate(src_file, start=1):
            # Normalise the text:
            # 1. drop the line terminator;
            # 2. strip elements the checks should ignore (the original line
            #    is kept for display).
            line = line.rstrip('\r\n')
            untagged_text = remove_unecessary_elements(line)

            messages = []
            latex_status.update(untagged_text)

            # Everything before \begin{document} is preamble: ignore it.
            if not latex_status.body:
                continue

            # Contexts in which a bare "." or "," is legitimate.
            in_literal_env = latex_status.env_equation or latex_status.env_verbatim
            uses_verb_or_tt = re.search(r'\\(verb|tt)', untagged_text)

            # (1) Basic character usage.
            if re.search(r'。', untagged_text):
                messages.append('[LV1-1-1] 全角句点には「．」を使いましょう．')
            if re.search(r'、', untagged_text):
                messages.append('[LV1-1-2] 全角読点には「，」を使いましょう．')
            if re.search(r'(?<=[あ-ん])\.', untagged_text):
                messages.append('[LV1-1-3] 日本語文の句点には全角ピリオドを使いましょう．')
            if re.search(r'\.(?!\s|~|$)', untagged_text):
                # Allowed: math/verbatim environments, a decimal such as
                # 2.38 on the line, or \verb / \texttt on the line.
                period_ok = (in_literal_env
                             or re.search(r'\d\.\d', untagged_text)
                             or uses_verb_or_tt)
                if not period_ok:
                    messages.append('[LV1-1-4] 句点相当の半角ピリオドの直後にはスペースを入れましょう．')
                    messages.append('（参考：本文中でファイル名や構造体変数等を表現するなら\\verbや\\textttを使う）')
            if re.search(r'\,(?!\s|~|$)', untagged_text):
                # Allowed: math/verbatim environments, a thousands separator
                # such as 1,024 on the line, or \verb / \texttt on the line.
                comma_ok = (in_literal_env
                            or re.search(r'\d,\d{3}', untagged_text)
                            or uses_verb_or_tt)
                if not comma_ok:
                    messages.append('[LV1-1-5] 半角カンマの直後にはスペースを入れましょう．')
            if re.search(r'，\s', untagged_text):
                messages.append('[LV1-1-6] 全角読点の後ろに半角スペースは不要ですので，削除しましょう．')
            if re.search(r'[０-９ａ-ｚＡ-Ｚ]+', untagged_text):
                messages.append('[LV1-1-7] 全角英数字ではなく，半角英数字を使いましょう．')

            # (2) LaTeX command usage.
            if re.search(r'\\\\', untagged_text):
                # "\\" is fine in tabular/math/verbatim and on \item lines.
                linebreak_ok = (latex_status.env_tabular
                                or in_literal_env
                                or re.search(r'\\item', untagged_text))
                if not linebreak_ok:
                    messages.append('[LV1-2-1] 本文中での改行コマンド[\\\\]は利用しないようにしましょう．[代替案：空行による適切な段落分け，\\parの利用]')
            if re.search(r'[　]+', untagged_text) and not latex_status.env_verbatim:
                messages.append('[LV1-2-2] 全角スペースは利用しないようにしましょう．[代替案：空行による適切な段落分け，\\quadの利用]')
                line = line.replace('　', '◇')   # make full-width spaces visible in the echo

            # (3) Captions inside float environments.
            if re.search(r'table|figure', latex_status.ending) and not latex_status.float_caption:
                messages.append('[LV1-3-1] 図表にはcaptionをつけましょう．')
            if re.search(r'\\caption', untagged_text):
                if latex_status.env_figure and not latex_status.float_figure_contents:
                    messages.append('[LV1-3-2] 図のcaptionは下側につけましょう．')
                if latex_status.env_table and latex_status.float_table_contents:
                    messages.append('[LV1-3-3] 表のcaptionは上側につけましょう．')

            # (4) Line width inside verbatim.
            line_len = len_japanese(line)
            if latex_status.env_verbatim and line_len >= MAX_VERBATIM_WIDTH:
                messages.append(
                    '[LV1-4-1] verbatim環境内で1行に半角{:d}文字以上を書くと，余白にはみ出る恐れがあります．'
                    '読みやすくなる位置に改行を加えるなどして，文章を修正しましょう．'
                    '（参考：現在の文字幅＝半角{:d}文字相当）'
                    .format(MAX_VERBATIM_WIDTH, line_len))

            # (5) Sectioning hierarchy: flag a jump of more than one level.
            match_section = re.search(r'\\(chapter|section|subsection|subsubsection)', untagged_text)
            if match_section:
                new_section_depth = dict_section_depth[match_section.group(1)]

                if new_section_depth - cur_section_depth > 1:
                    cur_section_tag = section_tag_by_depth[cur_section_depth]
                    new_section_tag = section_tag_by_depth[new_section_depth]
                    preferred_section_tag = section_tag_by_depth[cur_section_depth + 1]
                    messages.append(
                        '[LV1-5-1] 階層 {:s} の次に階層 {:s} が出現しています．'
                        '階層 {:s} の次の階層は {:s} となるように，階層構造を修正しましょう．'
                        .format(cur_section_tag, new_section_tag, cur_section_tag, preferred_section_tag))

                cur_section_depth = new_section_depth

            # Report only when at least one problem was found on this line.
            if messages:
                warning(n, line, messages)


def main(argv=None):
    '''Command-line entry point.

    Args:
        argv: Argument vector in ``sys.argv`` form (program name first).
            Defaults to ``sys.argv``.

    Returns:
        1 after printing the usage line when no file name was given,
        otherwise 0 once every named file has been checked.
    '''
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        program = argv[0] if argv else sys.argv[0]
        print('Usage: python {:s} filename1.tex [filename2.tex ...]'.format(program))
        return 1
    for filename in argv[1:]:
        report_check_level1(filename)
    return 0


if __name__ == '__main__':
    # sys.exit rather than the site-provided exit(), which is meant for
    # interactive sessions and is absent under "python -S".
    sys.exit(main())
	#!env python
	# -- coding: utf-8 --
	'''Edu report checker: Level 1

	主にLaTeXの使い方のミスに起因するような，機械的に見つけられるレベルの問題を指摘しています．

	Message Examples
	-------------------

	- [LV1-1-1] 全角句点には「．」を使いましょう．
	- [LV1-1-2] 全角読点には「，」を使いましょう．
	- [LV1-1-3] 日本語文の句点には全角ピリオドを使いましょう．
	- [LV1-1-4] 句点相当の半角ピリオドの直後にはスペースを入れましょう．
	- [LV1-1-5] 半角カンマの直後にはスペースを入れましょう．
	- [LV1-1-6] 全角読点の後ろに半角スペースは不要ですので，削除しましょう．
	- [LV1-1-7] 全角英数字ではなく，半角英数字を使いましょう．
	- [LV1-2-1] 本文中での改行コマンド[\\\\]は利用しないようにしましょう．[代替案：空行による適切な段落分け，\\parの利用]
	- [LV1-2-2] 全角スペースは利用しないようにしましょう．[代替案：空行による適切な段落分け，\\quadの利用]
	- [LV1-3-1] 図表にはcaptionをつけましょう．
	- [LV1-3-2] 図のcaptionは下側につけましょう．
	- [LV1-3-3] 表のcaptionは上側につけましょう．
	- [LV1-4-1] verbatim環境内で1行に半角 MAX_VERBATIM_WIDTH 文字以上を書くと，余白にはみ出る恐れがあります．
	読みやすくなる位置に改行を加えるなどして，文章を修正しましょう．（参考：現在の文字幅＝半角 CHAR_NUM 文字相当）
	- [LV1-5-1] 階層 SECTION の次に SUBSUBSECTION が出現しています．
	階層 SECTION の次の階層は SUBSECTION となるように，階層構造を修正しましょう．

	'''

	import sys
	import re

	MAX_VERBATIM_WIDTH = 86

	def warning(linenum, source_text, messages):
	try:
	print('\033[1;32mWARNING: Line {:2d}\033[0m: {:s}'.format(linenum, source_text))
	except UnicodeEncodeError:
	pass
	for msg in messages:
	print(' * {}'.format(msg))
	print()

	class LatexStatus:
	def __init__(self):
	self.body = True
	self.env_table = False
	self.env_figure = False
	self.env_verbatim = False
	self.env_tabular = False
	self.env_equation = False
	self.float_caption = False
	self.float_table_contents = False
	self.float_figure_contents = False
	self.ending = ""

	def update(self, text):
	self.ending = ""

	if re.search(r'\\documentclass', text):
	self.body = False
	if re.search(r'\\begin\s*{document}', text):
	self.body = True

	if re.search(r'\\begin\s{(align\|equation\|eqnarray)\?}', text):
	self.env_equation = True
	if self.env_equation and re.search(r'\\end\s{(align\|equation\|eqnarray)\?}', text):
	self.env_equation = False
	self.ending = "equation"

	if not self.env_table and re.search(r'\\begin\s{table\?}', text):
	self.env_table = True
	self.float_caption = False
	self.float_table_contents = False
	if self.env_table and re.search(r'\\end\s{table\?}', text):
	self.env_table = False
	self.ending = "table"

	if not self.env_figure and re.search(r'\\begin\s{figure\?}', text):
	self.env_figure = True
	self.float_caption = False
	self.float_figure_contents = False
	if self.env_figure and re.search(r'\\end\s{figure\?}', text):
	self.env_figure = False
	self.ending = "figure"

	if not self.env_verbatim and re.search(r'\\begin\s{verbatim\?}', text):
	self.env_verbatim = True
	if self.env_verbatim and re.search(r'\\end\s{verbatim\?}', text):
	self.env_verbatim = False

	if re.search(r'\\begin{tabular}', text):
	self.env_tabular = True
	self.float_table_contents = True
	if re.search(r'\\end{tabular}', text):
	self.env_tabular = False
	self.ending = "tabular"

	if re.search(r'\\includegraphics', text):
	self.float_figure_contents = True

	if re.search(r'\\caption', text):
	self.float_caption = True

	def remove_unecessary_elements(text):
	''' 1文から不要な要素を取り除く
	1. 行末の改行を削除
	2. コメント文を削除
	3. includeXXX, ref, label, cite などは見る必要がない
	4. 部分数式の中も無視する
	'''
	text = re.sub(r'(?<!\\)%.*', '', text)
	text = re.sub(r'\\(ref\|label\|cite\|include\w*)(\[[^\]]+\])?{[^\}]+}', '', text)
	text = re.sub(r'\$[^\$]+\$', '', text)
	return text

	import unicodedata
	def len_japanese(text):
	n = 0
	for c in text:
	cx = unicodedata.east_asian_width(c)
	if cx == 'W' or cx == 'H' or cx == 'A':
	n += 2
	elif c == '\t':
	n += 4
	else:
	n += 1
	return n

	dict_section_depth = {'chapter': 0, 'section': 1, 'subsection': 2, 'subsubsection': 3}

	def report_check_level1(src_filename):
	n = 0
	latex_status = LatexStatus()
	cur_section_depth = 0

	for line in open(src_filename, 'r', encoding='utf8'):
	n = n + 1

	# テキストの整形
	# 1. 行末の改行を削除
	# 2. 不要な文を削除（ただし，画面表示のために原文も残しておく）
	line = line.rstrip('\r\n')
	untagged_text = remove_unecessary_elements(line)

	messages = []
	latex_status.update(untagged_text)

	# \begin{document}に至るまではプレアンブルなので無視する
	if not latex_status.body:
	continue

	# (1) 基本的な文字の使い方についてのチェック
	if re.search(r'。', untagged_text):
	messages.append('[LV1-1-1] 全角句点には「．」を使いましょう．')
	if re.search(r'、', untagged_text):
	messages.append('[LV1-1-2] 全角読点には「，」を使いましょう．')
	if re.search(r'(?<=[あ-ん])\.', untagged_text):
	messages.append('[LV1-1-3] 日本語文の句点には全角ピリオドを使いましょう．')
	if re.search(r'\.(?!\s\|~\|$)', untagged_text):
	if latex_status.env_equation or latex_status.env_verbatim:
	pass # 数式環境内やverbatim環境内ではOKとする
	elif re.search(r'\d\.\d', untagged_text):
	pass # 数字で挟まれてる場合，2.38などであればOKとする
	elif re.search(r'\\(verb\|tt)', untagged_text):
	pass # verbの中，textttの中ならOKとする
	else:
	messages.append('[LV1-1-4] 句点相当の半角ピリオドの直後にはスペースを入れましょう．')
	messages.append('（参考：本文中でファイル名や構造体変数等を表現するなら\\verbや\\textttを使う）')
	if re.search(r'\,(?!\s\|~\|$)', untagged_text):
	if latex_status.env_equation or latex_status.env_verbatim:
	pass
	elif re.search(r'\d,\d{3}', untagged_text):
	pass # 4桁以上の数字，1,024などであればOKとする
	elif re.search(r'\\(verb\|tt)', untagged_text):
	pass # verbの中，textttの中ならOKとする
	else:
	messages.append('[LV1-1-5] 半角カンマの直後にはスペースを入れましょう．')
	if re.search(r'，\s', untagged_text):
	messages.append('[LV1-1-6] 全角読点の後ろに半角スペースは不要ですので，削除しましょう．')
	if re.search(r'[０-９ａ-ｚＡ-Ｚ]+', untagged_text):
	messages.append('[LV1-1-7] 全角英数字ではなく，半角英数字を使いましょう．')

	# (2) LaTeXコマンドの使い方についてのチェック
	if re.search(r'\\\\', untagged_text):
	if latex_status.env_tabular or latex_status.env_equation or latex_status.env_verbatim:
	pass
	elif re.search(r'\\item', untagged_text):
	pass
	else:
	messages.append('[LV1-2-1] 本文中での改行コマンド[\\\\]は利用しないようにしましょう．[代替案：空行による適切な段落分け，\\parの利用]')
	if re.search(r'[　]+', untagged_text) and not latex_status.env_verbatim:
	messages.append('[LV1-2-2] 全角スペースは利用しないようにしましょう．[代替案：空行による適切な段落分け，\\quadの利用]')
	line = line.replace('　', '◇') # 全角空白の強調

	# (3) Float環境内のcaptionのチェック
	if re.search(r'table\|figure', latex_status.ending) and not latex_status.float_caption:
	messages.append('[LV1-3-1] 図表にはcaptionをつけましょう．')
	if re.search(r'\\caption', untagged_text):
	if latex_status.env_figure and not latex_status.float_figure_contents:
	messages.append('[LV1-3-2] 図のcaptionは下側につけましょう．')
	if latex_status.env_table and latex_status.float_table_contents:
	messages.append('[LV1-3-3] 表のcaptionは上側につけましょう．')

	# (4) Verbatim環境のチェック
	line_len = len_japanese(line)
	if latex_status.env_verbatim and line_len >= MAX_VERBATIM_WIDTH:
	messages.append('[LV1-4-1] verbatim環境内で1行に半角{:d}文字以上を書くと，余白にはみ出る恐れがあります．'\
	'読みやすくなる位置に改行を加えるなどして，文章を修正しましょう．'\
	'（参考：現在の文字幅＝半角{:d}文字相当）'\
	.format(MAX_VERBATIM_WIDTH, line_len))

	# (5) Sectionの階層チェック
	if latex_status.body:
	match_section = re.search(r'\\(chapter\|section\|subsection\|subsubsection)', untagged_text)
	if match_section:
	new_section_depth = dict_section_depth[match_section.group(1)]

	if new_section_depth - cur_section_depth > 1:
	cur_section_tag = [k for k, v in dict_section_depth.items() if v == cur_section_depth][0]
	new_section_tag = [k for k, v in dict_section_depth.items() if v == new_section_depth][0]
	preferred_section_tag = [k for k, v in dict_section_depth.items() if v == (cur_section_depth+1)][0]
	messages.append('[LV1-5-1] 階層 {:s} の次に階層 {:s} が出現しています．'\
	'階層 {:s} の次の階層は {:s} となるように，階層構造を修正しましょう．'\
	.format(cur_section_tag, new_section_tag, cur_section_tag, preferred_section_tag))

	cur_section_depth = new_section_depth

	# 最終的に問題があればメッセージを表示する
	if len(messages) > 0:
	warning(n, line, messages)


	if __name__ == '__main__':
	if len(sys.argv) == 1:
	print('Usage: python {:s} filename1.tex [filename2.tex ...]'.format(sys.argv[0]))
	exit(1)
	else:
	for filename in sys.argv[1:]:
	report_check_level1(filename)