Skip to content

Instantly share code, notes, and snippets.

@mpacer
Last active February 10, 2016 20:54
Show Gist options
  • Save mpacer/02326b51a7e0009c2c04 to your computer and use it in GitHub Desktop.
Save mpacer/02326b51a7e0009c2c04 to your computer and use it in GitHub Desktop.
Prose clock: an extensible framework for regularly clocking how much prose (in tex) you've written + some other stats
import csv
import datetime
import os
import os.path
import re
from collections import Counter
from difflib import SequenceMatcher

import pandas as pd
def map_feat_to_vals(features, vals, f, words, contents):
    """Pair each feature name with the value at the same position.

    Args:
        features: Iterable of feature (column) names.
        vals: Iterable of values, in the same order as ``features``.
        f: Unused; kept so existing call sites keep working.
        words: Unused; kept so existing call sites keep working.
        contents: Unused; kept so existing call sites keep working.

    Returns:
        A dict mapping each feature name to its value. If the two iterables
        differ in length, the extra items of the longer one are ignored
        (``zip`` semantics).
    """
    return dict(zip(features, vals))
def test_pandas_io_check(pd1, pd2):
    """Return whether ``pd1.equals(pd2)`` reports the two objects as equal.

    The script uses this to confirm that a frame written to disk and read
    back again is unchanged.

    Args:
        pd1: Object exposing an ``equals`` method (a pandas DataFrame here).
        pd2: Object to compare against ``pd1``.

    Returns:
        The result of ``pd1.equals(pd2)``.
    """
    return pd1.equals(pd2)


# Despite the ``test_`` prefix this is a helper that requires arguments, not
# a test case; mark it so pytest-style collectors do not try to run it.
test_pandas_io_check.__test__ = False
def test_valid_dict_encoding(change_table, bag_o_words, sorted_word_set, sorted_word_counts):
    """Check that the last row's parallel word/count lists rebuild ``bag_o_words``.

    Args:
        change_table: Non-empty sequence of row dicts; only the last row is
            inspected.
        bag_o_words: Mapping of word -> count that the row should encode.
        sorted_word_set: Key in the row holding the list of words
            (at time of writing: ``"unique_words"``).
        sorted_word_counts: Key in the row holding the matching counts
            (at time of writing: ``"bag_o_words_vals"``).

    Returns:
        True when zipping the two lists reproduces ``bag_o_words`` exactly.

    Raises:
        IndexError: If ``change_table`` is empty.
        KeyError: If either key is missing from the last row.
    """
    last_row = change_table[-1]
    rebuilt = dict(zip(last_row[sorted_word_set], last_row[sorted_word_counts]))
    return rebuilt == bag_o_words


# Despite the ``test_`` prefix this is a helper that requires arguments, not
# a test case; mark it so pytest-style collectors do not try to run it.
test_valid_dict_encoding.__test__ = False
# Matches any path containing ".ipynb" (e.g. Jupyter's ".ipynb_checkpoints"
# directories); such directories are skipped when looking for .tex sources.
blah = re.compile(r'\.ipynb.*$')

# Recursively collect every .tex file below the current working directory.
# NOTE: followlinks=True makes os.walk descend into symlinked directories;
# os.walk does not track visited directories, so a link that points back at
# one of its own parents would make this loop forever.
f_path_list = []
for dirname, _subdirs, filenames in os.walk(os.getcwd(), followlinks=True):
    if blah.search(dirname):
        continue
    f_path_list.extend(
        os.path.join(dirname, name)
        for name in filenames
        if name.endswith(".tex")
    )
# One timestamp shared by every row recorded during this run.
time = datetime.datetime.now()
# Row dicts gathered during this run (one per new or changed file).
change_table = []
# Column names, and their order, for the stored DataFrame.
features = [
    'f_name',
    'time',
    'num_words',
    'num_unique_words',
    'unique_words',
    'bag_o_words_vals',
    'file_str',
    'word_list_words',
    'file_path',
]
db_name = 'db.pkl'
try:
    db = pd.read_pickle(db_name)
except FileNotFoundError:
    # First run: no database yet, so create an empty one on disk.
    # Only a missing file is handled here. Any other failure (for example a
    # corrupt pickle) propagates instead of silently replacing the existing
    # history with an empty frame.
    db = pd.DataFrame(columns=features)
    db.to_pickle(db_name)
    flag_not_empty = False
else:
    flag_not_empty = True
# Build one row per .tex file that is new or whose text has changed since
# the last snapshot stored in ``db``.
for f in f_path_list:
    with open(f, 'r') as fh:
        contents = fh.read()

    if flag_not_empty:
        # Look the file up by its full path (not just its basename) so that
        # same-named files in different directories are tracked separately.
        history = db[db['file_path'] == f]
        if not history.empty:
            latest_text = history.sort_values(by='time')['file_str'].iloc[-1]
            if contents == latest_text:
                # Unchanged since the most recent snapshot: nothing to record.
                continue

    # Tokenise by splitting on single spaces only; double quotes are removed
    # and newline / tab characters are left in place (they stay attached to,
    # or become, tokens). Empty strings produced by runs of spaces are dropped.
    spaced = contents.replace(" \n", "\n ").replace("\t", " \t ").replace('"', "")
    words = sorted(word for word in spaced.split(" ") if word)

    # Count occurrences of each token. (Counting with ``contents.count(word)``
    # would count substring hits, e.g. "a" inside "cat".)
    bag_o_words = dict(Counter(words))
    unique_words = sorted(bag_o_words)

    vals_for_feats = [
        os.path.basename(f),
        time,
        len(words),
        len(unique_words),
        unique_words,
        [bag_o_words[word] for word in unique_words],
        contents,
        words,
        f,
    ]
    change_table.append(map_feat_to_vals(features, vals_for_feats, f, words, contents))
# Persist the rows gathered during this run, if there are any.
if change_table:
    df = pd.DataFrame(change_table)
    df = df[features]  # enforce the canonical column order

    # Round-trip the new rows through a scratch pickle before touching the
    # real database, to confirm the on-disk format reproduces the frame.
    # If you want to save in any other format, be sure to run it through
    # test_pandas_io_check first: CSV and JSON round trips have been shown
    # to fail that check.
    test_invertibility = 'test_db_file.pkl'
    try:
        df.to_pickle(test_invertibility)
        d2 = pd.read_pickle(test_invertibility)
        assert test_pandas_io_check(df, d2), "pickle round trip altered the data"
    finally:
        # Remove the scratch file even when the check above fails.
        if os.path.exists(test_invertibility):
            os.remove(test_invertibility)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    db = pd.concat([db, df], ignore_index=True)
    db.to_pickle(db_name)
# Use /bin/bash to run commands, overriding the default shell set by cron.
SHELL=/bin/bash
# Set PATH explicitly so that `sysctl` and other utilities can be found.
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
# Run proseclock at minute 42 of every hour.
# The job first changes into the proseclock directory and then runs the script
# from there. NOTE(review): proseclock.py appears to search the current working
# directory for .tex files and to keep its db.pkl there, so the `cd` matters;
# confirm against the script.
# Replace `~/your/proseclock/path/here/` with the path to the directory that you put proseclock in.
# Replace `~/your/executable/python3` with the path to `python3`; try `which python3` to get this path.
# For me that path was `/usr/local/bin/python3`.
42 * * * * cd ~/your/proseclock/path/here/ && ~/your/executable/python3 proseclock.py
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment