theo-m/style_checker.py

## style_checker.py

import re
import argparse
import collections


class TermColors:
    """ANSI escape sequences used to decorate text printed to a terminal."""

    # Colour codes.
    header = '\033[95m'
    okblue = '\033[94m'
    okgreen = '\033[92m'
    warning = '\033[93m'
    fail = '\033[91m'

    # Text attributes.
    bold = '\033[1m'
    underline = '\033[4m'

    # Emitted after a decorated token to switch the decoration off again.
    endc = '\033[0m'


# Vague intensifiers, quantifiers and cliches that weasels_finder looks for.
WEASELS = [
    "many", "various", "very", "fairly",
    "several", "extremely", "exceedingly", "quite",
    "remarkably", "few", "surprisingly", "mostly",
    "largely", "huge", "tiny",
    "are a number", "is a number",
    "excellent", "interestingly", "significantly", "substantially",
    "clearly", "vast", "relatively", "completely", "literally",
    "not rocket science", "outside the box",
]

# Words that repeating_words leaves out of its frequency report.
# NOTE: 'were' sits out of alphabetical order; the original order is kept.
STOPWORDS = [
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an',
    'and', 'any', 'are', 'as', 'at',
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both',
    'but', 'by',
    'could', 'did', 'do', 'does', 'doing', 'down', 'were', 'during',
    'each', 'few', 'for', 'from', 'further',
    'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself',
    'him', 'himself', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself',
    'me', 'more', 'most', 'my', 'myself',
    'nor',
    'of', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours',
    'ourselves', 'out', 'over', 'own',
    'same', 'she', 'should', 'so', 'some', 'such',
    'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then',
    'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too',
    'under', 'until', 'up',
    'very',
    'was', 'we', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
    'why', 'with', 'would',
    'you', 'your', 'yours', 'yourself', 'yourselves',
]


# Captures every run of non-word characters so that tokenizer can pad the
# run with spaces and split it off as a token of its own.
rgx_tok = re.compile(r'(\W+)')


def tokenizer(line: str) -> list:
    """Split *line* into word tokens and punctuation tokens.

    Surrounding whitespace is stripped, every run of non-word characters
    is padded with a space on each side, and the result is split on
    whitespace, so punctuation comes out as tokens of its own.
    """
    padded = rgx_tok.sub(r' \1 ', line.strip())
    return padded.split()


def untokenizer(tokens: list) -> str:
    """Join *tokens* back into one line of text.

    The tokens are joined with single spaces; the space in front of a run
    of non-word characters that has a space on both sides is then removed,
    and any remaining whitespace runs are collapsed to one space.
    """
    glued = re.sub(r' (\W+) ', r'\1 ', ' '.join(tokens))
    pieces = glued.split()
    return ' '.join(pieces)


def latex_keep_line(line: str) -> bool:
    """Tell whether *line* should be style-checked.

    After stripping surrounding whitespace, a line is rejected when it is
    empty, starts with a backslash, or starts with ``%``.
    """
    stripped = line.strip()
    if not stripped:
        return False
    return not stripped.startswith(('\\', '%'))


def insert_color(tokens: list, idx: int, color=TermColors.okblue) -> list:
    """Return a copy of *tokens* with the token at *idx* wrapped in colour codes.

    The token is replaced by four elements: the bold code, *color*, the
    token itself, and the reset code. The input sequence is not modified.
    """
    before, after = tokens[:idx], tokens[idx + 1:]
    wrapped = [TermColors.bold, color, tokens[idx], TermColors.endc]
    return [*before, *wrapped, *after]


def weasels_finder(tokens: list, weasels=None) -> 'list | int':
    """Locate weasel words and phrases in a tokenized line.

    Matching ignores case, so a weasel word opening a sentence is found
    too. Entries made of several words are compared against runs of
    consecutive tokens; a single token can never equal such an entry.

    Args:
        tokens: the tokens of one line.
        weasels: optional iterable of words or phrases to look for;
            the module-level ``WEASELS`` list is used when omitted.

    Returns:
        The sorted indices of every token that belongs to a match, or
        ``-1`` when nothing matches.
    """
    vocabulary = WEASELS if weasels is None else weasels

    # Separate one-word entries (set lookup) from multi-word phrases.
    single_words = set()
    phrases = []
    for entry in vocabulary:
        parts = entry.lower().split()
        if len(parts) == 1:
            single_words.add(parts[0])
        elif parts:
            phrases.append(parts)

    lowered = [tok.lower() for tok in tokens]
    hits = {i for i, tok in enumerate(lowered) if tok in single_words}

    # Slide each phrase over the line and record every token it covers.
    for parts in phrases:
        width = len(parts)
        for start in range(len(lowered) - width + 1):
            if lowered[start:start + width] == parts:
                hits.update(range(start, start + width))

    return sorted(hits) if hits else -1


def filter_punct(tokens: list) -> list:
    """Return the tokens that do not begin with a non-word character."""
    kept = []
    for token in tokens:
        if re.match(r'\W', token) is None:
            kept.append(token)
    return kept


def word_count(tokens: list) -> int:
    """Count the tokens in *tokens* that ``filter_punct`` keeps."""
    words_only = filter_punct(tokens)
    return len(words_only)


def repeating_words(lines: list, top_n: int = 20) -> list:
    """Return the most frequent non-stopword words found in *lines*.

    Stopwords are discarded before the ranking is cut off. Cutting first
    and filtering afterwards spends most of the slots on stopwords and
    returns only the few that remain.

    Args:
        lines: iterable of text lines; it is consumed in a single pass,
            so a one-shot iterator such as ``filter(...)`` is accepted.
        top_n: maximum number of ``(word, count)`` pairs to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first. Words
        are lower-cased before counting.
    """
    stopwords = set(STOPWORDS)  # built once for constant-time lookups
    wcounter = collections.Counter(
        w
        for line in lines
        for w in filter_punct(tokenizer(line.lower()))
        if w not in stopwords
    )
    return wcounter.most_common(top_n)


def main(argv=None) -> None:
    """Run the selected style checks on every file given on the command line.

    Args:
        argv: argument list to parse; ``None`` makes argparse read
            ``sys.argv``.

    Checks, selected with ``--styles``:
        wc      report lines holding more than 24 words
        weasel  report lines containing weasel words, highlighted
        rw      report the most frequent non-stopword words of the file
    """
    parser = argparse.ArgumentParser()
    # Required: without it args.files is None and the loop below crashes.
    parser.add_argument('--files', type=str, nargs='+', required=True)
    parser.add_argument(
        '--styles', type=str, nargs='+', default=['wc', 'weasel', 'rw'])
    args = parser.parse_args(argv)

    max_words = 24

    for fn in args.files:

        print(f'\n-- {fn} --\n')
        with open(fn) as fi:
            lines = fi.readlines()

        # Line numbers are 1-based so they match what editors display.
        for nl, line in enumerate(lines, start=1):
            if not latex_keep_line(line):
                continue

            tokens = tokenizer(line)

            if 'wc' in args.styles:
                n_words = word_count(tokens)
                if n_words > max_words:
                    print(f'WC = {n_words}: {nl:5d} - {line.strip()}')

            if 'weasel' in args.styles:
                weasels_idxs = weasels_finder(tokens)
                if weasels_idxs != -1:
                    colored = list(tokens)
                    # Wrap from the right-most hit backwards: every call
                    # lengthens the list, but only after the position it
                    # touches, so the remaining indices stay valid.
                    for idx in sorted(weasels_idxs, reverse=True):
                        colored = insert_color(
                            colored, idx, color=TermColors.warning)
                    print(f'WEASELS: {nl:5d} - {untokenizer(colored)}')

        print()
        if 'rw' in args.styles:
            for w, c in repeating_words(filter(latex_keep_line, lines)):
                print(f'FREQWRD: {w:<20} #: {c}')


if __name__ == '__main__':
    main()

	import re
	import argparse
	import collections


	class TermColors:
	    """ANSI escape sequences used to decorate text printed to a terminal."""

	    # Colour codes.
	    header = '\033[95m'
	    okblue = '\033[94m'
	    okgreen = '\033[92m'
	    warning = '\033[93m'
	    fail = '\033[91m'

	    # Text attributes.
	    bold = '\033[1m'
	    underline = '\033[4m'

	    # Emitted after a decorated token to switch the decoration off again.
	    endc = '\033[0m'


	# Vague intensifiers, quantifiers and cliches that weasels_finder looks for.
	WEASELS = [
	    "many", "various", "very", "fairly",
	    "several", "extremely", "exceedingly", "quite",
	    "remarkably", "few", "surprisingly", "mostly",
	    "largely", "huge", "tiny",
	    "are a number", "is a number",
	    "excellent", "interestingly", "significantly", "substantially",
	    "clearly", "vast", "relatively", "completely", "literally",
	    "not rocket science", "outside the box",
	]

	# Words that repeating_words leaves out of its frequency report.
	# NOTE: 'were' sits out of alphabetical order; the original order is kept.
	STOPWORDS = [
	    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an',
	    'and', 'any', 'are', 'as', 'at',
	    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both',
	    'but', 'by',
	    'could', 'did', 'do', 'does', 'doing', 'down', 'were', 'during',
	    'each', 'few', 'for', 'from', 'further',
	    'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself',
	    'him', 'himself', 'his', 'how',
	    'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself',
	    'me', 'more', 'most', 'my', 'myself',
	    'nor',
	    'of', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours',
	    'ourselves', 'out', 'over', 'own',
	    'same', 'she', 'should', 'so', 'some', 'such',
	    'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then',
	    'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too',
	    'under', 'until', 'up',
	    'very',
	    'was', 'we', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
	    'why', 'with', 'would',
	    'you', 'your', 'yours', 'yourself', 'yourselves',
	]


	# Captures every run of non-word characters so that tokenizer can pad the
	# run with spaces and split it off as a token of its own.
	rgx_tok = re.compile(r'(\W+)')


	def tokenizer(line: str) -> list:
	    """Split *line* into word tokens and punctuation tokens.

	    Surrounding whitespace is stripped, every run of non-word characters
	    is padded with a space on each side, and the result is split on
	    whitespace, so punctuation comes out as tokens of its own.
	    """
	    padded = rgx_tok.sub(r' \1 ', line.strip())
	    return padded.split()


	def untokenizer(tokens: list) -> str:
	    """Join *tokens* back into one line of text.

	    The tokens are joined with single spaces; the space in front of a run
	    of non-word characters that has a space on both sides is then removed,
	    and any remaining whitespace runs are collapsed to one space.
	    """
	    glued = re.sub(r' (\W+) ', r'\1 ', ' '.join(tokens))
	    pieces = glued.split()
	    return ' '.join(pieces)


	def latex_keep_line(line: str) -> bool:
	    """Tell whether *line* should be style-checked.

	    After stripping surrounding whitespace, a line is rejected when it is
	    empty, starts with a backslash, or starts with ``%``.
	    """
	    stripped = line.strip()
	    if not stripped:
	        return False
	    return not stripped.startswith(('\\', '%'))


	def insert_color(tokens: list, idx: int, color=TermColors.okblue) -> list:
	    """Return a copy of *tokens* with the token at *idx* wrapped in colour codes.

	    The token is replaced by four elements: the bold code, *color*, the
	    token itself, and the reset code. The input sequence is not modified.
	    """
	    before, after = tokens[:idx], tokens[idx + 1:]
	    wrapped = [TermColors.bold, color, tokens[idx], TermColors.endc]
	    return [*before, *wrapped, *after]


	def weasels_finder(tokens: list, weasels=None) -> 'list | int':
	    """Locate weasel words and phrases in a tokenized line.

	    Matching ignores case, so a weasel word opening a sentence is found
	    too. Entries made of several words are compared against runs of
	    consecutive tokens; a single token can never equal such an entry.

	    Args:
	        tokens: the tokens of one line.
	        weasels: optional iterable of words or phrases to look for;
	            the module-level ``WEASELS`` list is used when omitted.

	    Returns:
	        The sorted indices of every token that belongs to a match, or
	        ``-1`` when nothing matches.
	    """
	    vocabulary = WEASELS if weasels is None else weasels

	    # Separate one-word entries (set lookup) from multi-word phrases.
	    single_words = set()
	    phrases = []
	    for entry in vocabulary:
	        parts = entry.lower().split()
	        if len(parts) == 1:
	            single_words.add(parts[0])
	        elif parts:
	            phrases.append(parts)

	    lowered = [tok.lower() for tok in tokens]
	    hits = {i for i, tok in enumerate(lowered) if tok in single_words}

	    # Slide each phrase over the line and record every token it covers.
	    for parts in phrases:
	        width = len(parts)
	        for start in range(len(lowered) - width + 1):
	            if lowered[start:start + width] == parts:
	                hits.update(range(start, start + width))

	    return sorted(hits) if hits else -1


	def filter_punct(tokens: list) -> list:
	    """Return the tokens that do not begin with a non-word character."""
	    kept = []
	    for token in tokens:
	        if re.match(r'\W', token) is None:
	            kept.append(token)
	    return kept


	def word_count(tokens: list) -> int:
	    """Count the tokens in *tokens* that ``filter_punct`` keeps."""
	    words_only = filter_punct(tokens)
	    return len(words_only)


	def repeating_words(lines: list, top_n: int = 20) -> list:
	    """Return the most frequent non-stopword words found in *lines*.

	    Stopwords are discarded before the ranking is cut off. Cutting first
	    and filtering afterwards spends most of the slots on stopwords and
	    returns only the few that remain.

	    Args:
	        lines: iterable of text lines; it is consumed in a single pass,
	            so a one-shot iterator such as ``filter(...)`` is accepted.
	        top_n: maximum number of ``(word, count)`` pairs to return.

	    Returns:
	        A list of ``(word, count)`` tuples, most frequent first. Words
	        are lower-cased before counting.
	    """
	    stopwords = set(STOPWORDS)  # built once for constant-time lookups
	    wcounter = collections.Counter(
	        w
	        for line in lines
	        for w in filter_punct(tokenizer(line.lower()))
	        if w not in stopwords
	    )
	    return wcounter.most_common(top_n)


	def main(argv=None) -> None:
	    """Run the selected style checks on every file given on the command line.

	    Args:
	        argv: argument list to parse; ``None`` makes argparse read
	            ``sys.argv``.

	    Checks, selected with ``--styles``:
	        wc      report lines holding more than 24 words
	        weasel  report lines containing weasel words, highlighted
	        rw      report the most frequent non-stopword words of the file
	    """
	    parser = argparse.ArgumentParser()
	    # Required: without it args.files is None and the loop below crashes.
	    parser.add_argument('--files', type=str, nargs='+', required=True)
	    parser.add_argument(
	        '--styles', type=str, nargs='+', default=['wc', 'weasel', 'rw'])
	    args = parser.parse_args(argv)

	    max_words = 24

	    for fn in args.files:

	        print(f'\n-- {fn} --\n')
	        with open(fn) as fi:
	            lines = fi.readlines()

	        # Line numbers are 1-based so they match what editors display.
	        for nl, line in enumerate(lines, start=1):
	            if not latex_keep_line(line):
	                continue

	            tokens = tokenizer(line)

	            if 'wc' in args.styles:
	                n_words = word_count(tokens)
	                if n_words > max_words:
	                    print(f'WC = {n_words}: {nl:5d} - {line.strip()}')

	            if 'weasel' in args.styles:
	                weasels_idxs = weasels_finder(tokens)
	                if weasels_idxs != -1:
	                    colored = list(tokens)
	                    # Wrap from the right-most hit backwards: every call
	                    # lengthens the list, but only after the position it
	                    # touches, so the remaining indices stay valid.
	                    for idx in sorted(weasels_idxs, reverse=True):
	                        colored = insert_color(
	                            colored, idx, color=TermColors.warning)
	                    print(f'WEASELS: {nl:5d} - {untokenizer(colored)}')

	        print()
	        if 'rw' in args.styles:
	            for w, c in repeating_words(filter(latex_keep_line, lines)):
	                print(f'FREQWRD: {w:<20} #: {c}')


	if __name__ == '__main__':
	    main()