mpacer/proseclock.py

## proseclock.py
import csv
import datetime
import os
import os.path
import re
from collections import Counter
from difflib import SequenceMatcher

import pandas as pd

def map_feat_to_vals(features, vals, f, words, contents):
    """Build a record that maps each feature name to its value.

    Names and values are paired by position; pairing stops at the shorter
    of `features` and `vals`.  The `f`, `words` and `contents` arguments are
    accepted but not used by this function.
    """
    return dict(zip(features, vals))

def test_pandas_io_check(pd1,pd2):
    return pd1.equals(pd2)

def test_valid_dict_encoding(change_table,bag_o_words,sorted_word_set,sorted_word_counts):
    # at time of writing sorted_word_set = "unique_words"; sorted_word_counts="bag_o_words_vals"
    return {k : v for k,v in zip(change_table[-1][sorted_word_set],change_table[-1][sorted_word_counts])} == bag_o_words


# Directories whose path contains ".ipynb" (for example ".ipynb_checkpoints")
# are skipped.  The pattern is a raw string so that "\." reaches the regex
# engine as an escaped dot instead of being an invalid string escape.
blah = re.compile(r'\.ipynb.*$')

# Collect every ".tex" file below the current working directory, following
# symlinked directories.  (An earlier one-level os.listdir pass was removed:
# its result was overwritten before it was ever used.)
f_path_list = []
for dirname, dirnames, filenames in os.walk(os.getcwd(), followlinks=True):
    if blah.search(dirname):
        continue

    f_path_list.extend(
        os.path.join(dirname, f) for f in filenames if f.endswith(".tex")
    )

# A single timestamp is taken for the whole run, so every snapshot recorded
# by this invocation carries the same `time` value.
time = datetime.datetime.now()
# Records (one dict per new or changed file) gathered during this run.
change_table = []
# Column names of the history table, in the order the values are built.
features = [
    'f_name',
    'time',
    'num_words',
    'num_unique_words',
    'unique_words',
    'bag_o_words_vals',
    'file_str',
    'word_list_words',
    'file_path',
]
db_name = 'db.pkl'

# Load the snapshot history, or create an empty one on the first run.
# Only a missing file triggers creation: any other failure (corrupt pickle,
# permission problem, interrupt) propagates, so an unreadable history is
# never silently overwritten with an empty table.
# NOTE: unpickling executes code from the file; only load a db.pkl that this
# script wrote itself.
try:
    db = pd.read_pickle(db_name)
except FileNotFoundError:
    db = pd.DataFrame(columns=features)
    db.to_pickle(db_name)
    flag_not_empty = False
else:
    flag_not_empty = True

# Record a snapshot for every .tex file that has no history yet or whose
# text differs from the most recent snapshot stored for that same path.
for f in f_path_list:
    with open(f, 'r') as fh:
        contents = fh.read()

    if flag_not_empty:
        # Look the file up by its full path, not its base name: files in
        # different directories may share a name but have separate histories.
        history = db[db['file_path'] == f]
        if not history.empty:
            latest = history.sort_values(by='time')['file_str'].iloc[-1]
            if contents == latest:
                continue  # unchanged since the last recorded snapshot

    # Tokenise on single spaces after three rewrites: " \n" becomes "\n ",
    # each tab is padded with spaces so it becomes its own token, and double
    # quotes are removed.  Empty tokens (from runs of spaces) are dropped.
    tokens = contents.replace(" \n", "\n ").replace("\t", " \t ").replace('"', "").split(" ")
    words = sorted(word for word in tokens if word)

    # Count tokens, not substrings of the raw text: str.count would also hit
    # occurrences inside longer words and would miss tokens that the
    # rewrites above changed.
    bag_o_words = dict(Counter(words))
    unique_words = sorted(bag_o_words)

    # Same order as `features`.
    vals_for_feats = [
        os.path.basename(f),
        time,
        len(words),
        len(unique_words),
        unique_words,
        [bag_o_words[word] for word in unique_words],
        contents,
        words,
        f,
    ]

    change_table.append(map_feat_to_vals(features, vals_for_feats, f, words, contents))

# Persist the snapshots gathered during this run, if there are any.
if change_table:
    df = pd.DataFrame(change_table)
    df = df[features]  # put the columns in the canonical order

    # Round-trip the new rows through a scratch pickle before touching the
    # real history, to confirm the storage format reproduces them exactly.
    # Any other format must pass test_pandas_io_check as well; the original
    # notes say CSV (to_csv/read_csv) and JSON (to_json/read_json) did not.
    test_invertibility = 'test_db_file.pkl'
    try:
        df.to_pickle(test_invertibility)
        d2 = pd.read_pickle(test_invertibility)
    finally:
        # Remove the scratch file even when the round trip itself raises.
        if os.path.exists(test_invertibility):
            os.remove(test_invertibility)

    # Explicit raise rather than `assert`, which is stripped under `python -O`.
    if not test_pandas_io_check(df, d2):
        raise RuntimeError('pickle round trip did not reproduce the new snapshot rows')

    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    db = pd.concat([db, df], ignore_index=True)
    db.to_pickle(db_name)

## proseclock_crontab.txt
#use /bin/bash to run commands, overriding the default set by cron
SHELL=/bin/bash
#change path to be able to access `sysctl` and other utilities
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
# run proseclock at minute 42 of every hour
# replace `~/your/proseclock/path/here/` with the path to the directory that contains proseclock.py
# replace `~/your/executable/python3` with the path to your `python3` executable; run `which python3` to find it
# (on the author's machine that path was `/usr/local/bin/python3`)
42 * * * * cd ~/your/proseclock/path/here/ && ~/your/executable/python3 proseclock.py
	import os
	import re
	import os.path
	import datetime
	import csv
	import pandas as pd
	from difflib import SequenceMatcher

	def map_feat_to_vals(features, vals, f, words, contents):
	    """Build a record that maps each feature name to its value.

	    Names and values are paired by position; pairing stops at the shorter
	    of `features` and `vals`.  The `f`, `words` and `contents` arguments
	    are accepted but not used by this function.
	    """
	    return dict(zip(features, vals))

	def test_pandas_io_check(pd1,pd2):
	return pd1.equals(pd2)

	def test_valid_dict_encoding(change_table,bag_o_words,sorted_word_set,sorted_word_counts):
	# at time of writing sorted_word_set = "unique_words"; sorted_word_counts="bag_o_words_vals"
	return {k : v for k,v in zip(change_table[-1][sorted_word_set],change_table[-1][sorted_word_counts])} == bag_o_words



	# Directories whose path contains ".ipynb" (for example
	# ".ipynb_checkpoints") are skipped.  The pattern is a raw string so that
	# "\." reaches the regex engine as an escaped dot instead of being an
	# invalid string escape.
	blah = re.compile(r'\.ipynb.*$')

	# Collect every ".tex" file below the current working directory, following
	# symlinked directories.  (An earlier one-level os.listdir pass was
	# removed: its result was overwritten before it was ever used.)
	f_path_list = []
	for dirname, dirnames, filenames in os.walk(os.getcwd(), followlinks=True):
	    if blah.search(dirname):
	        continue

	    f_path_list.extend(
	        os.path.join(dirname, f) for f in filenames if f.endswith(".tex")
	    )

	# A single timestamp is taken for the whole run, so every snapshot
	# recorded by this invocation carries the same `time` value.
	time = datetime.datetime.now()
	# Records (one dict per new or changed file) gathered during this run.
	change_table = []
	# Column names of the history table, in the order the values are built.
	features = [
	    'f_name',
	    'time',
	    'num_words',
	    'num_unique_words',
	    'unique_words',
	    'bag_o_words_vals',
	    'file_str',
	    'word_list_words',
	    'file_path',
	]
	db_name = 'db.pkl'

	# Load the snapshot history, or create an empty one on the first run.
	# Only a missing file triggers creation: any other failure (corrupt
	# pickle, permission problem, interrupt) propagates, so an unreadable
	# history is never silently overwritten with an empty table.
	# NOTE: unpickling executes code from the file; only load a db.pkl that
	# this script wrote itself.
	try:
	    db = pd.read_pickle(db_name)
	except FileNotFoundError:
	    db = pd.DataFrame(columns=features)
	    db.to_pickle(db_name)
	    flag_not_empty = False
	else:
	    flag_not_empty = True

	# Record a snapshot for every .tex file that has no history yet or whose
	# text differs from the most recent snapshot stored for that same path.
	for f in f_path_list:
	    with open(f, 'r') as fh:
	        contents = fh.read()

	    if flag_not_empty:
	        # Look the file up by its full path, not its base name: files in
	        # different directories may share a name but have separate
	        # histories.
	        history = db[db['file_path'] == f]
	        if not history.empty:
	            latest = history.sort_values(by='time')['file_str'].iloc[-1]
	            if contents == latest:
	                continue  # unchanged since the last recorded snapshot

	    # Tokenise on single spaces after three rewrites: " \n" becomes "\n ",
	    # each tab is padded with spaces so it becomes its own token, and
	    # double quotes are removed.  Empty tokens are dropped.
	    tokens = contents.replace(" \n", "\n ").replace("\t", " \t ").replace('"', "").split(" ")
	    words = sorted(word for word in tokens if word)

	    # Count tokens, not substrings of the raw text: str.count would also
	    # hit occurrences inside longer words and would miss tokens that the
	    # rewrites above changed.
	    bag_o_words = dict(Counter(words))
	    unique_words = sorted(bag_o_words)

	    # Same order as `features`.
	    vals_for_feats = [
	        os.path.basename(f),
	        time,
	        len(words),
	        len(unique_words),
	        unique_words,
	        [bag_o_words[word] for word in unique_words],
	        contents,
	        words,
	        f,
	    ]

	    change_table.append(map_feat_to_vals(features, vals_for_feats, f, words, contents))

	# Persist the snapshots gathered during this run, if there are any.
	if change_table:
	    df = pd.DataFrame(change_table)
	    df = df[features]  # put the columns in the canonical order

	    # Round-trip the new rows through a scratch pickle before touching the
	    # real history, to confirm the storage format reproduces them exactly.
	    # Any other format must pass test_pandas_io_check as well; the original
	    # notes say CSV (to_csv/read_csv) and JSON (to_json/read_json) did not.
	    test_invertibility = 'test_db_file.pkl'
	    try:
	        df.to_pickle(test_invertibility)
	        d2 = pd.read_pickle(test_invertibility)
	    finally:
	        # Remove the scratch file even when the round trip itself raises.
	        if os.path.exists(test_invertibility):
	            os.remove(test_invertibility)

	    # Explicit raise rather than `assert`, which is stripped under
	    # `python -O`.
	    if not test_pandas_io_check(df, d2):
	        raise RuntimeError('pickle round trip did not reproduce the new snapshot rows')

	    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
	    db = pd.concat([db, df], ignore_index=True)
	    db.to_pickle(db_name)
	#use /bin/bash to run commands, overriding the default set by cron
	SHELL=/bin/bash
	#change path to be able to access `sysctl` and other utilities
	PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
	# run proseclock at minute 42 of every hour
	# replace `~/your/proseclock/path/here/` with the path to the directory that contains proseclock.py
	# replace `~/your/executable/python3` with the path to your `python3` executable; run `which python3` to find it
	# (on the author's machine that path was `/usr/local/bin/python3`)
	42 * * * * cd ~/your/proseclock/path/here/ && ~/your/executable/python3 proseclock.py