Skip to content

Instantly share code, notes, and snippets.

@naoh16
Last active January 11, 2019 02:05
Show Gist options
  • Save naoh16/78265a0aedce491fdda0d8784b4eee5e to your computer and use it in GitHub Desktop.
Save naoh16/78265a0aedce491fdda0d8784b4eee5e to your computer and use it in GitHub Desktop.
Simple latex report checker: Level 2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''Edu report checker: Level 2
Runs morphological analysis on the report text and checks more advanced writing issues.
Message Examples
-------------------
- [LV2-1-1] 1段落に{:d}単語含まれています.多くとも 600 単語以下となるよう,文や段落の構成を考え直してみましょう.
- [LV2-2-1] 1文に{:d}単語含まれているようです.多くとも 60 単語以下となるよう,短い文にすることを考えてみましょう.
- [LV2-2-2] 1文に読点が{:d}個含まれています.多くとも 6 個以下になるよう,文の区切りを考え直してみましょう.
- [LV2-3-1] 助詞の「の」が{:d}回繰り返されており,わかりにくい文になっています.3 回未満を目安として,適切に読点を入れたり,1文の長さを再考する,などを考えてみましょう.
- [LV2-3-2] 二重否定と思われるセンテンス「{:s}」が含まれています.修正を検討しましょう.
- [LV2-4-1] 敬体(「です,ます」など)が使われています.レポートでは常体(「である」など)を使いましょう(ただし,感想を除く).
- [LV2-4-2] 弱い表現が使われています.レポートでは主体的に主張した文章を書きましょう(ただし,感想を除く).
Note
-----
- Requires `Janome <https://github.com/mocobeta/janome>`_.
'''
import sys
import re
# Thresholds and message templates for the Level 2 checks.
# In the sentence-level templates the leading '{:.32s}' placeholder quotes at
# most the first 32 characters of the offending text.
# NOTE: THREASHOLD_MAX_JOSHI_NO and FORMAT_DOUBLE_NEGTIVE are misspelled, but
# other code in this file refers to them under exactly these names.

# Parameters for Check (1): number of words in one paragraph
THRESHOLD_MAX_WORDS_IN_PARAGRAPH = 600
FORMAT_MAX_WORDS_IN_PARAGRAPH = '* [LV2-1-1] 1段落に{:d}単語含まれています.多くとも{:d}単語以下となるよう,文や段落の構成を考え直してみましょう.'
# Parameters for Check (2a): number of words in one sentence
THRESHOLD_MAX_WORDS_IN_SENTENCE = 60
FORMAT_MAX_WORDS_IN_SENTENCE = '> {:.32s}...\n * [LV2-2-1] 1文に{:d}単語含まれているようです.多くとも{:d}単語以下となるよう,短い文にすることを考えてみましょう.'
# Parameters for Check (2b): number of commas in one sentence
THRESHOLD_MAX_COMMA_IN_SENTENCE = 6
FORMAT_MAX_COMMA_IN_SENTENCE = '> {:.32s}...\n * [LV2-2-2] 1文に読点が{:d}個含まれています.多くとも{:d}個以下になるよう,文の区切りを考え直してみましょう.'
# Parameters for Check (3a): repeated use of the particle 'の'
THREASHOLD_MAX_JOSHI_NO = 3
FORMAT_MAX_JOSHI_NO = '> {:.32s}...\n * [LV2-3-1] 助詞の「の」が{:d}回繰り返されており,わかりにくい文になっています.{:d}回未満を目安として,適切に読点を入れたり,1文の長さを再考する,などを考えてみましょう.'
# Parameters for Check (3b): double negative
FORMAT_DOUBLE_NEGTIVE = '> {:.32s}...\n * [LV2-3-2] 二重否定と思われるセンテンス「{:s}」が含まれています.修正を検討しましょう.'
# Parameters for Check (4a): polite style (desu/masu)
FORMAT_DESUMASU = '> {:.32s}...\n * [LV2-4-1] 敬体(「です,ます」など)が使われています.レポートでは常体(「である」など)を使いましょう(ただし,感想を除く).'
# Parameters for Check (4b): weak expressions
FORMAT_WEAK_WORD = '> {:.32s}...\n * [LV2-4-2] 弱い表現が使われています.レポートでは主体的に主張した文章を書きましょう(ただし,感想を除く).'
def warning(linenum_st, linenum_en, source_text, messages):
    '''Print one warning group for a range of source lines to stdout.

    Args:
        linenum_st: First line number of the affected range (int).
        linenum_en: Last line number of the affected range (int).
        source_text: Text shown after the header (str, may be empty).
        messages: Iterable of message strings, printed one per line.
    '''
    # The header is highlighted in bold green with ANSI escape sequences.
    print('\033[1;32mWARNING: Lines {:d}--{:d}\033[0m: {:s}'.format(
        linenum_st, linenum_en, source_text))
    for msg in messages:
        print(msg)
    # NOTE(review): the blank separator line is assumed to be printed once
    # per group (after the loop), not once per message -- confirm.
    print()
# Environments whose contents are not prose and are skipped entirely, as
# (begin-pattern, end-pattern) pairs.  itemize and enumerate share one entry,
# so either \end closes either \begin.
_SKIPPED_ENVIRONMENTS = tuple(
    (re.compile(r'\\begin\{' + names + r'\}'),
     re.compile(r'\\end\{' + names + r'\}'))
    for names in ('verbatim', 'table', 'figure', '(itemize|enumerate)')
)


def load_texfile(src_filename):
    '''Read a LaTeX file and split its prose into paragraphs.

    Lines inside verbatim, table, figure, itemize and enumerate environments
    are skipped.  Comments, sectioning commands, \\label/\\ref/\\cite and a
    few size commands are removed from the remaining lines.  The lines of one
    paragraph are concatenated without a separator.  A paragraph ends at an
    empty line, at a line consisting only of ``\\par``, or at end of file.

    Args:
        src_filename: Path of the UTF-8 encoded LaTeX file.

    Returns:
        A list of dicts with the keys 'line_start' and 'line_end'
        (1-based line numbers) and 'str' (the paragraph text).
    '''
    paragraphs = []
    with open(src_filename, 'r', encoding="utf-8") as f:
        n = 0
        n_parstart = 0
        sentence = ""
        # End pattern of the environment currently being skipped, or None.
        # A single state variable replaces the per-environment flags; the
        # table branch used to set the verbatim flag by mistake, which hid
        # everything that followed a table.
        skip_until = None

        for n, line in enumerate(f, start=1):
            line = line.rstrip()

            if skip_until is not None:
                if skip_until.search(line):
                    skip_until = None
                continue

            for begin_pat, end_pat in _SKIPPED_ENVIRONMENTS:
                if begin_pat.search(line):
                    skip_until = end_pat
                    break
            if skip_until is not None:
                continue

            # remove comments (an escaped \% is kept)
            line = re.sub(r'(?<!\\)%.*', '', line)

            # remove some markups
            line = re.sub(r'\\(sub)*section\{[^\}]+\}', '', line)
            line = re.sub(r'\\(label|ref|cite)\{[^\}]+\}', '', line)
            line = re.sub(r'\\(small|large|huge)', '', line)
            line = re.sub(r'\\LaTeX\s*', 'LaTeX', line)

            if n_parstart == 0 and line != '':
                n_parstart = n

            sentence += line

            if (line == '' and sentence) or line == '\\par':
                # `sentence` already contains `line`; appending it again
                # would duplicate a trailing \par.
                paragraphs.append({'line_start': n_parstart, 'line_end': n - 1,
                                   'str': sentence})
                sentence = ''
                n_parstart = 0

    # Flush the last paragraph when the file does not end with a blank line.
    if sentence:
        paragraphs.append({'line_start': n_parstart, 'line_end': n,
                           'str': sentence})
    return paragraphs
def preprocess_sentences(paragraphs):
    '''Strip LaTeX markup from every paragraph before tokenization.

    The list is modified in place: each par['str'] is replaced by the
    cleaned text.  Nothing is returned.

    Args:
        paragraphs: List of dicts that each have a 'str' entry.
    '''
    for par in paragraphs:
        text = par['str']
        # \verb<d>...<d> becomes a placeholder word.  A backreference cannot
        # be used inside a character class, so a lazy '.*?' finds the
        # closing delimiter.
        text = re.sub(r'\\verb(.)(.*?)\1', ' VERB ', text)
        # Inline math: keep the contents, drop the dollar signs.
        text = re.sub(r'\$([^\$]*)\$', r'\g<1>', text)
        # Drop command names.  Only ASCII letters are matched: \w is
        # Unicode-aware and would also swallow Japanese text that directly
        # follows a command.
        text = re.sub(r'\\[A-Za-z]+', '', text)
        text = re.sub(r'\{|\}', '', text)
        # Remove whitespace unless it directly follows a word character.
        text = re.sub(r'(?<!\w)\s+', '', text)
        par['str'] = text
def _clause_warnings(clause, joshi_no_count, has_desumasu, has_weak_word):
    '''Return the messages of checks (3a), (4a) and (4b) for one clause.'''
    found = []
    # Check (3a)
    if joshi_no_count >= THREASHOLD_MAX_JOSHI_NO:
        found.append(FORMAT_MAX_JOSHI_NO.format(
            clause, joshi_no_count, THREASHOLD_MAX_JOSHI_NO))
    # Check (4a) DESU, MASU
    if has_desumasu:
        found.append(FORMAT_DESUMASU.format(clause))
    # Check (4b) Weak words
    if has_weak_word:
        found.append(FORMAT_WEAK_WORD.format(clause))
    return found


def report_check_level2(src_filename):
    '''Run the Level 2 checks on one LaTeX file and print the warnings.

    The file is loaded with load_texfile(), cleaned with
    preprocess_sentences() and tokenized with Janome.  Checks (3a), (4a) and
    (4b) are evaluated per clause (at every comma and period), checks (2a)
    and (2b) per sentence (at every period) and check (1) per paragraph.
    All messages of a paragraph are reported with one warning() call.

    Args:
        src_filename: Path of the LaTeX file to check.
    '''
    # Imported here so that the module can be loaded without Janome.
    from janome.tokenizer import Tokenizer
    pos_tagger = Tokenizer()

    # Load text
    paragraphs = load_texfile(src_filename)
    preprocess_sentences(paragraphs)

    for par in paragraphs:
        num_of_words_in_paragraph = 0
        num_of_words_in_sentence = 0
        num_of_comma_in_sentence = 0
        num_of_joshi_no = 0
        has_negative = False
        has_desumasu = False
        has_weak_word = False
        str_sentence = ""
        str_short_sentence = ""
        str_negative = ""
        warning_messages = []

        for token in pos_tagger.tokenize(par['str']):
            pos0, pos1 = token.part_of_speech.split(',')[:2]

            str_sentence += token.surface
            str_short_sentence += token.surface

            # Symbols and parentheses are not counted as words.
            if pos0 != '記号' and token.surface != '(' and token.surface != ')':
                num_of_words_in_paragraph += 1
                num_of_words_in_sentence += 1

            if pos0 == '助詞':
                if token.surface == 'の':
                    num_of_joshi_no += 1
                else:
                    # NOTE(review): the nesting was reconstructed -- a run of
                    # 'の' is taken to end at the next particle that is not
                    # 'の'; confirm against the intended behaviour.
                    # Check (3a)
                    if num_of_joshi_no >= THREASHOLD_MAX_JOSHI_NO:
                        warning_messages.append(FORMAT_MAX_JOSHI_NO.format(
                            str_short_sentence, num_of_joshi_no,
                            THREASHOLD_MAX_JOSHI_NO))
                    num_of_joshi_no = 0

            # Check (4a) DESU, MASU
            if pos0 == '助動詞' and re.search(r'です|ます', token.base_form):
                has_desumasu = True

            # Check (4b) Weak words
            if pos1 == '副助詞' and token.base_form == 'かも':
                has_weak_word = True
            if pos0 == '助動詞' and token.infl_form == '未然形' and token.base_form == 'だ':
                has_weak_word = True
            if (pos0 == '助動詞' or pos0 == '形容詞') and token.infl_form == '未然ウ接続':
                has_weak_word = True
            if pos0 == '形容詞' and token.base_form == '難い':
                has_weak_word = True

            # A negative adjective opens a candidate double negative; the
            # tokens from there on are collected for the message.
            if pos0 == '形容詞' and re.search(r'無い|ない', token.base_form):
                has_negative = True
            if has_negative:
                str_negative += token.surface
            if has_negative and pos0 == '助動詞':
                # Check (3b) Double Negative
                if token.surface == 'ない':
                    warning_messages.append(FORMAT_DOUBLE_NEGTIVE.format(
                        str_short_sentence, str_negative))
                # NOTE(review): the nesting was reconstructed -- the
                # candidate is assumed to be closed by any auxiliary verb,
                # not only by 'ない'; confirm.
                has_negative = False
                str_negative = ''

            if pos1 == '読点':
                num_of_comma_in_sentence += 1
                warning_messages.extend(_clause_warnings(
                    str_short_sentence, num_of_joshi_no,
                    has_desumasu, has_weak_word))
                # reset the clause state; has_desumasu and str_negative are
                # cleared as well so that a finding is neither reported again
                # for the next clause nor leaked into its message
                num_of_joshi_no = 0
                str_short_sentence = '...,'
                has_negative = False
                str_negative = ''
                has_desumasu = False
                has_weak_word = False

            if pos1 == '句点':  # '.'
                # Check (2a)
                if num_of_words_in_sentence > THRESHOLD_MAX_WORDS_IN_SENTENCE:
                    warning_messages.append(FORMAT_MAX_WORDS_IN_SENTENCE.format(
                        str_sentence, num_of_words_in_sentence,
                        THRESHOLD_MAX_WORDS_IN_SENTENCE))
                # Check (2b)
                if num_of_comma_in_sentence > THRESHOLD_MAX_COMMA_IN_SENTENCE:
                    warning_messages.append(FORMAT_MAX_COMMA_IN_SENTENCE.format(
                        str_sentence, num_of_comma_in_sentence,
                        THRESHOLD_MAX_COMMA_IN_SENTENCE))
                warning_messages.extend(_clause_warnings(
                    str_short_sentence, num_of_joshi_no,
                    has_desumasu, has_weak_word))
                # reset the sentence and clause state
                num_of_words_in_sentence = 0
                num_of_comma_in_sentence = 0
                str_sentence = ''
                num_of_joshi_no = 0
                str_short_sentence = ''
                has_negative = False
                str_negative = ''
                has_desumasu = False
                has_weak_word = False

        # Check (1)
        if num_of_words_in_paragraph > THRESHOLD_MAX_WORDS_IN_PARAGRAPH:
            warning_messages.append(FORMAT_MAX_WORDS_IN_PARAGRAPH.format(
                num_of_words_in_paragraph, THRESHOLD_MAX_WORDS_IN_PARAGRAPH))

        if warning_messages:
            warning(par['line_start'], par['line_end'], '', warning_messages)
if __name__ == '__main__':
    # Every command-line argument is checked as a separate LaTeX file.
    if len(sys.argv) == 1:
        print('Usage: python {:s} filename1.tex [filename2.tex ...]'.format(sys.argv[0]))
        # sys.exit() instead of exit(): the latter is a helper added by the
        # site module and is not guaranteed to exist.
        sys.exit(1)
    else:
        for filename in sys.argv[1:]:
            report_check_level2(filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment