Created
September 12, 2018 14:31
-
-
Save theo-m/d4a0873a1aaa9f5a81187c0fbc3a1650 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import argparse | |
import collections | |
class TermColors:
    """ANSI escape sequences used to decorate text printed to a terminal."""

    # Text attributes.
    bold = '\033[1m'
    underline = '\033[4m'

    # Reset sequence, emitted after a decorated token to end the styling.
    endc = '\033[0m'

    # Foreground colour codes, named for the role each plays in the output.
    fail = '\033[91m'
    okgreen = '\033[92m'
    warning = '\033[93m'
    okblue = '\033[94m'
    header = '\033[95m'
# Vague quantifiers, empty intensifiers and cliches ("weasel words") that the
# script flags in prose. Entries are lowercase; some are multi-word phrases.
WEASELS = [
    "many", "various", "very", "fairly",
    "several", "extremely", "exceedingly", "quite",
    "remarkably", "few", "surprisingly", "mostly",
    "largely", "huge", "tiny", "are a number",
    "is a number", "excellent", "interestingly", "significantly",
    "substantially", "clearly", "vast", "relatively",
    "completely", "literally", "not rocket science", "outside the box",
]
# Common English function words that are excluded from the frequent-word
# report. Order is kept exactly as originally written.
STOPWORDS = [
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am',
    'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because',
    'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
    'could', 'did', 'do', 'does', 'doing', 'down', 'were', 'during',
    'each', 'few', 'for', 'from', 'further', 'had', 'has', 'have',
    'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself',
    'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it',
    'its', 'itself', 'me', 'more', 'most', 'my', 'myself', 'nor',
    'of', 'on', 'once', 'only', 'or', 'other', 'ought', 'our',
    'ours', 'ourselves', 'out', 'over', 'own', 'same', 'she', 'should',
    'so', 'some', 'such', 'than', 'that', 'the', 'their', 'theirs',
    'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
    'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was',
    'we', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
    'why', 'with', 'would', 'you', 'your', 'yours', 'yourself', 'yourselves',
]
# One or more consecutive non-word characters (punctuation and whitespace
# alike). The capture group makes `split` return the separators as well.
rgx_tok = re.compile(r'(\W+)')


def tokenizer(line: str) -> list:
    """Split a line into word tokens and punctuation tokens.

    Runs of word characters become one token each. Every run of non-word
    characters is further divided on whitespace, so punctuation that is not
    separated by spaces (e.g. ``"),"``) stays together as a single token.

    Args:
        line: The text to tokenize; surrounding whitespace is ignored.

    Returns:
        The tokens in their original order (empty list for a blank line).
    """
    pieces = rgx_tok.split(line.strip())
    # Word pieces contain no whitespace, so `split()` yields them unchanged;
    # separator pieces lose their whitespace and keep only the punctuation.
    return [tok for piece in pieces for tok in piece.split()]
def untokenizer(tokens: list) -> str:
    """Join tokens back into a readable line.

    Tokens are joined with single spaces, then every run of non-word
    characters that stands as its own token is re-attached to the token on
    its left (``"Hello , world !"`` becomes ``"Hello, world!"``).

    Args:
        tokens: String tokens, typically produced by ``tokenizer``.

    Returns:
        The reassembled line with whitespace normalised to single spaces.
    """
    line = ' '.join(tokens)
    # `(?: |$)` also accepts end-of-string: requiring a following space, as
    # before, left punctuation at the end of the line detached ("world !").
    glued = re.sub(r' (\W+)(?: |$)', r'\1 ', line)
    # Collapse repeated spaces and drop the trailing one the substitution
    # may have added at the end of the line.
    return ' '.join(glued.split())
def latex_keep_line(line: str) -> bool:
    """Tell whether a line of a LaTeX file should be analysed.

    A line is skipped when, after stripping surrounding whitespace, it is
    empty, starts with a backslash, or starts with ``%``.

    Args:
        line: One raw line of the input file.

    Returns:
        True if the line should be kept, False if it should be skipped.
    """
    stripped = line.strip()
    if not stripped:
        return False
    return not stripped.startswith(('\\', '%'))
def insert_color(tokens: list, idx: int, color=TermColors.okblue) -> list:
    """Return a copy of ``tokens`` with the token at ``idx`` highlighted.

    The highlighted token is preceded by the bold sequence and ``color`` and
    followed by the reset sequence, so the result is three items longer than
    the input. The input sequence is not modified.

    Args:
        tokens: The token sequence.
        idx: Position of the token to highlight.
        color: Escape sequence to apply; defaults to ``TermColors.okblue``.

    Returns:
        A new list containing the original tokens plus the escape sequences.
    """
    before, target, after = tokens[:idx], tokens[idx], tokens[idx + 1:]
    return [*before, TermColors.bold, color, target, TermColors.endc, *after]
def weasels_finder(tokens: list) -> "list[int] | int":
    """Locate weasel words and phrases in a token sequence.

    Matching ignores case, and multi-word entries of ``WEASELS`` (such as
    "are a number") are matched against consecutive tokens. Previously such
    entries could never match, because each token is a single word.

    Args:
        tokens: String tokens of one line, typically from ``tokenizer``.

    Returns:
        The sorted indexes of every token that belongs to a match, or ``-1``
        when nothing matched (the sentinel callers already test for).
    """
    lowered = [tok.lower() for tok in tokens]
    hits = set()
    for phrase in WEASELS:
        words = phrase.split()
        span = len(words)
        # Slide a window of the phrase's length over the tokens.
        for start in range(len(lowered) - span + 1):
            if lowered[start:start + span] == words:
                hits.update(range(start, start + span))
    return sorted(hits) if hits else -1
def filter_punct(tokens: list) -> list:
    """Drop the tokens that begin with a non-word character.

    Args:
        tokens: String tokens.

    Returns:
        A new list with the remaining tokens, in their original order.
    """
    kept = []
    for tok in tokens:
        # `re.match` anchors at the start, so only the first character counts.
        if re.match(r'\W', tok) is None:
            kept.append(tok)
    return kept
def word_count(tokens: list) -> int:
    """Count the tokens that remain after ``filter_punct`` is applied.

    Args:
        tokens: String tokens of one line.

    Returns:
        The number of tokens kept by ``filter_punct``.
    """
    words = filter_punct(tokens)
    return len(words)
def repeating_words(lines: list, top_n: int = 20) -> list:
    """Return the most frequent non-stopword words across ``lines``.

    Each line is stripped, lowercased and tokenized; punctuation tokens and
    entries of ``STOPWORDS`` are discarded before counting. Stopwords used to
    be removed only after the top 20 had been selected, which left little or
    nothing in the result because the most common words in prose are mostly
    stopwords.

    Args:
        lines: Iterable of text lines.
        top_n: Maximum number of words to return (default 20).

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    # A set gives constant-time membership tests inside the counting loop.
    stopwords = set(STOPWORDS)
    counts = collections.Counter(
        word
        for line in lines
        for word in filter_punct(tokenizer(line.strip().lower()))
        if word not in stopwords
    )
    return counts.most_common(top_n)
if __name__ == '__main__':
    # Command-line entry point. For every file given with --files, run the
    # checks selected with --styles:
    #   wc     - report lines with more than 24 words
    #   weasel - report lines containing weasel words, highlighted
    #   rw     - report the most frequent words of the file
    parser = argparse.ArgumentParser()
    # Required: without it args.files would be None and the loop below would
    # fail with a TypeError instead of a usage message.
    parser.add_argument('--files', type=str, nargs='+', required=True)
    parser.add_argument(
        '--styles', type=str, nargs='+', default=['wc', 'weasel', 'rw'])
    args = parser.parse_args()

    for fn in args.files:
        print(f'\n-- {fn} --\n')
        with open(fn) as fi:
            lines = fi.readlines()

        # Line numbers are reported 1-based.
        for nl, line in enumerate(lines, start=1):
            if not latex_keep_line(line):
                continue
            tokens = tokenizer(line)

            if 'wc' in args.styles:
                n_words = word_count(tokens)
                if n_words > 24:
                    print(f'WC = {n_words}: {nl:5d} - {line.strip()}')

            if 'weasel' in args.styles:
                weasels_idxs = weasels_finder(tokens)
                if weasels_idxs != -1:
                    colored = list(tokens)
                    # Insert from right to left so the positions still to be
                    # processed are not shifted by earlier insertions (each
                    # insert_color call makes the list three items longer).
                    for idx in sorted(weasels_idxs, reverse=True):
                        colored = insert_color(
                            colored, idx, color=TermColors.warning)
                    print(f'WEASELS: {nl:5d} - {untokenizer(colored)}')
        print()

        if 'rw' in args.styles:
            for w, c in repeating_words(filter(latex_keep_line, lines)):
                print(f'FREQWRD: {w:<20} #: {c}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment