Skip to content

Instantly share code, notes, and snippets.

@theo-m
Created September 12, 2018 14:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save theo-m/d4a0873a1aaa9f5a81187c0fbc3a1650 to your computer and use it in GitHub Desktop.
import re
import argparse
import collections
class TermColors:
    """ANSI escape sequences used to highlight matches in terminal output."""
    header = '\033[95m'
    okblue = '\033[94m'
    okgreen = '\033[92m'
    warning = '\033[93m'
    fail = '\033[91m'
    endc = '\033[0m'      # reset all attributes
    bold = '\033[1m'
    underline = '\033[4m'
# Weasel words and phrases that weaken technical writing.  Multi-word
# entries are phrases matched as consecutive tokens.
WEASELS = [
    "many", "various", "very", "fairly", "several",
    "extremely", "exceedingly", "quite", "remarkably", "few",
    "surprisingly", "mostly", "largely", "huge", "tiny",
    "are a number", "is a number", "excellent", "interestingly",
    "significantly", "substantially", "clearly", "vast",
    "relatively", "completely", "literally",
    "not rocket science", "outside the box",
]
# Common English stopwords, excluded from the word-frequency report so
# it surfaces content words only.
STOPWORDS = [
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am',
    'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been',
    'before', 'being', 'below', 'between', 'both', 'but', 'by', 'could',
    'did', 'do', 'does', 'doing', 'down', 'were', 'during', 'each',
    'few', 'for', 'from', 'further', 'had', 'has', 'have', 'having',
    'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his',
    'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself', 'me',
    'more', 'most', 'my', 'myself', 'nor', 'of', 'on', 'once', 'only',
    'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over',
    'own', 'same', 'she', 'should', 'so', 'some', 'such', 'than',
    'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then',
    'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too',
    'under', 'until', 'up', 'very', 'was', 'we', 'what', 'when',
    'where', 'which', 'while', 'who', 'whom', 'why', 'with', 'would',
    'you', 'your', 'yours', 'yourself', 'yourselves',
]
# One or more non-word characters, captured so the tokenizer can keep
# punctuation runs as separate tokens.
rgx_tok = re.compile(r'(\W+)')


def tokenizer(line: str) -> list:
    """Split *line* into word and punctuation tokens.

    Punctuation runs are padded with spaces and the result is split on
    whitespace, so 'hello, world' -> ['hello', ',', 'world'].
    """
    spaced = rgx_tok.sub(r' \1 ', line.strip())
    return spaced.split()
def untokenizer(tokens: list) -> str:
    """Join *tokens* back into a line, inverse of tokenizer().

    Punctuation tokens are glued onto the preceding word and runs of
    whitespace are collapsed to single spaces.
    """
    joined = ' '.join(tokens)
    # ' , ' -> ', ' : re-attach punctuation to the word before it.
    glued = re.sub(r' (\W+) ', r'\1 ', joined)
    return ' '.join(glued.split())
def latex_keep_line(line: str) -> bool:
    """True when *line* is prose worth linting.

    Rejects blank lines, lines starting with a LaTeX command ('\\'),
    and LaTeX comments ('%').
    """
    stripped = line.strip()
    if not stripped:
        return False
    return not stripped.startswith(('\\', '%'))
def insert_color(tokens: list, idx: int, color=TermColors.okblue) -> list:
    """Return a copy of *tokens* with tokens[idx] wrapped in ANSI codes.

    The highlighted token is surrounded by bold + *color* before and a
    reset after, so the result is three tokens longer than the input.
    """
    head = list(tokens[:idx])
    tail = list(tokens[idx + 1:])
    highlighted = [TermColors.bold, color, tokens[idx], TermColors.endc]
    return head + highlighted + tail
def weasels_finder(tokens: list, weasels=None):
    """Find weasel words/phrases in a tokenized line.

    The previous implementation compared single tokens only, so the
    multi-word WEASELS entries (e.g. "are a number", "outside the box")
    could never match; this version also matches multi-token phrases
    and reports the index of the phrase's first token.

    Args:
        tokens: tokenized line (see tokenizer()).
        weasels: optional list of weasel words/phrases; defaults to the
            module-level WEASELS list.

    Returns:
        A list of int indices where a match starts, or -1 when nothing
        matched (callers compare the result against -1).
    """
    if weasels is None:
        weasels = WEASELS
    # Pre-split each entry once so phrases match consecutive tokens.
    phrases = [w.split() for w in weasels]
    hits = []
    for i in range(len(tokens)):
        if any(tokens[i:i + len(p)] == p for p in phrases):
            hits.append(i)
    return hits if hits else -1
def filter_punct(tokens: list) -> list:
    """Return *tokens* with pure-punctuation entries removed.

    A token is dropped when its first character is a non-word character.
    """
    return [tok for tok in tokens if re.match(r'\W', tok) is None]
def word_count(tokens: list) -> int:
    """Number of word (non-punctuation) tokens in *tokens*."""
    return sum(1 for tok in tokens if not re.match(r'\W', tok))
def repeating_words(lines: list) -> list:
    """Report frequently repeated content words across *lines*.

    Each line is lowercased and tokenized, punctuation tokens are
    dropped, and word frequencies are accumulated.  Returns the
    (word, count) pairs among the 20 most common words that are not
    stopwords.
    """
    counter = collections.Counter()
    for line in lines:
        counter.update(filter_punct(tokenizer(line.strip().lower())))
    return [
        (word, count)
        for word, count in counter.most_common(20)
        if word not in STOPWORDS
    ]
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--files', type=str, nargs='+')
    parser.add_argument(
        '--styles', type=str, nargs='+', default=['wc', 'weasel', 'rw'])
    args = parser.parse_args()

    for fn in args.files:
        print(f'\n-- {fn} --\n')
        with open(fn) as fi:
            lines = fi.readlines()

        for nl, line in enumerate(lines):
            if not latex_keep_line(line):
                continue
            tokens = tokenizer(line)

            if 'wc' in args.styles:
                # Flag overly long lines (more than 24 words).
                wc = word_count(tokens)
                if wc > 24:
                    print(f'WC = {wc}: {nl:5d} - {line.strip()}')

            if 'weasel' in args.styles:
                weasels_idxs = weasels_finder(tokens)
                if weasels_idxs != -1:
                    colored = list(tokens)
                    for ii, idx in enumerate(weasels_idxs):
                        # insert_color grows the list by 3 tokens per
                        # highlight (bold, color, reset), so later match
                        # indices must shift by 3 * ii.  The original
                        # shifted by 2 * ii and mis-colored the second
                        # and later weasels on a line.
                        colored = insert_color(
                            colored, idx + 3 * ii, color=TermColors.warning)
                    print(f'WEASELS: {nl:5d} - {untokenizer(colored)}')

        print()

        if 'rw' in args.styles:
            for w, c in repeating_words(filter(latex_keep_line, lines)):
                print(f'FREQWRD: {w:<20} #: {c}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment