Skip to content

Instantly share code, notes, and snippets.

@aadm
Last active April 29, 2021 14:29
Show Gist options
  • Save aadm/1dfee048ff9a2716c39ece8486df7241 to your computer and use it in GitHub Desktop.
Save aadm/1dfee048ff9a2716c39ece8486df7241 to your computer and use it in GitHub Desktop.
Script to analyse sport activity log created with jrnl

This Python script is meant to be run on a journal file created with jrnl.sh where I log all my sports activities.

For example, if today I've run 7km in 40 minutes:

$ jrnl 40m 7km @CORSA

If I forgot to add last Sunday's bike ride and yesterday's run:

$ jrnl yesterday: 20m 5km @CORSA
$ jrnl sunday: 40km @BICI

The resulting jrnl file has this format:

2021-03-30 29m 5km @CORSA
2021-04-10 54km @BICI
2021-04-11 72m 12.5km @CORSA
2021-04-25 10km+800m @TRAIL
2021-04-25 18km @BICI
2021-04-27 39m 7.4km @CORSA

The names of the activities are derived from the tag assigned to each entry. In the example jrnl file above I have:

  • @CORSA for running
  • @BICI for biking
  • @TRAIL for trail running

In sportlog.py, the name of the jrnl file is hardcoded so remember to change it.

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
from datetime import datetime, timedelta
import re
import sys
def read_jrnl_sport(jrnlfile, colormap='Pastel1'):
    """Read a jrnl sport log and return it as a DataFrame plus a color map.

    Every line is split at ``@``: the part before it holds the date (the
    text between square brackets) followed by the free-form description,
    the part after it is the activity tag.

    Parameters
    ----------
    jrnlfile : str or path-like
        Journal file to read (passed straight to ``pandas.read_csv``).
    colormap : str
        Name of the matplotlib colormap the activity colors are taken from.

    Returns
    -------
    df : pandas.DataFrame
        Indexed by ``date`` with the columns ``sport`` (categorical),
        ``desc`` (text after the date) and ``sport_id`` (category code).
    cdict : dict
        Maps every activity name to one color.
    """
    # convert datafile to Pandas DataFrame
    df = pd.read_csv(jrnlfile, sep='@', names=['tmp', 'sport'])
    # extract the date (raw string: `\[` is not a valid str escape)
    date_text = df['tmp'].str.extract(r'.*\[(.*)\].*', expand=False)
    df['date'] = date_text.astype('datetime64[ns]')
    df = df.set_index('date')
    # the description is whatever follows the closing bracket of the date;
    # stripping by pattern avoids relying on a fixed 13-character prefix
    df['desc'] = df['tmp'].str.replace(r'^[^\]]*\]\s?', '', n=1, regex=True)
    # tags may carry stray whitespace, which would break later comparisons
    # such as `df.sport == 'CORSA'`
    df['sport'] = df['sport'].str.strip().astype('category')
    # numerical ids corresponding to the categories
    df['sport_id'] = df['sport'].cat.codes
    # assign one color to each sport activity
    allsports = df['sport'].cat.categories
    # `matplotlib.cm.get_cmap` is gone in recent matplotlib releases,
    # `pyplot.get_cmap` is the stable spelling
    cmap = plt.get_cmap(colormap)
    palette = getattr(cmap, 'colors', None)
    if palette is None:
        # continuous colormaps have no discrete color list: sample them
        palette = cmap(np.linspace(0, 1, max(len(allsports), 2)))
    # cycle through the palette so every sport gets a color even when
    # there are more sports than palette entries
    cdict = {
        sport: palette[pos % len(palette)]
        for pos, sport in enumerate(allsports)
    }
    # `tmp` was only needed for parsing
    return df.drop(columns='tmp'), cdict
def select_period(df, year=None, date=None, days=30, forward=False):
    """Return the rows of ``df`` that fall inside the requested period.

    The period is either a whole calendar year or a window of ``days``
    days anchored on ``date`` (both ends included).

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a datetime index.
    year : int, optional
        If given, select that calendar year and ignore all other arguments.
    date : str or datetime-like, optional
        Anchor of the window, either a ``'YYYY-MM-DD'`` string or a
        ``date``/``datetime``/``Timestamp`` object. Defaults to now.
    days : int
        Length of the window in days.
    forward : bool
        If False (default) the window ends on ``date``; if True it starts
        on ``date``.

    Examples
    --------
    Select a single year:
    > df_sel = select_period(df, year=2021)
    Select the last 30 days up to now:
    > df_sel = select_period(df, days=30)
    Select the 30 days leading up to a specific date:
    > df_sel = select_period(df, date='2020-06-19', days=30)
    Select the 30 days following a specific date:
    > df_sel = select_period(df, date='2020-06-19', days=30, forward=True)
    """
    if year is not None:
        return df.loc[df.index.year == year]

    if date is None:
        anchor = datetime.now()
    elif isinstance(date, str):
        anchor = datetime.strptime(date, '%Y-%m-%d')
    else:
        # already a date-like object: normalise it instead of failing
        anchor = pd.Timestamp(date)

    span = timedelta(days=days)
    if forward:
        start_date, end_date = anchor, anchor + span
    else:
        start_date, end_date = anchor - span, anchor

    mask = (df.index >= start_date) & (df.index <= end_date)
    return df.loc[mask]
def _parse_quantity(descriptions, unit):
    """Return the first ``<number><unit>`` of each description as a float array."""
    pattern = re.compile(r'((?:[0-9]*[.])?[0-9]+)' + re.escape(unit))
    values = []
    for text in descriptions:
        # missing descriptions (NaN) simply yield no value
        found = pattern.search(text) if isinstance(text, str) else None
        values.append(float(found.group(1)) if found else np.nan)
    return np.asarray(values, dtype=float)


def get_runs(df):
    """Extract runs and trail runs with their distance, time and speed.

    Parameters
    ----------
    df : pandas.DataFrame
        Activity log with a ``sport`` column and a free-text ``desc``
        column containing entries such as ``39m 7.4km``.

    Returns
    -------
    pandas.DataFrame
        The ``CORSA`` and ``TRAIL`` rows of ``df`` sorted by index, with the
        extra columns ``distance`` (km), ``time`` (minutes) and ``speed``
        (minutes per km). For ``CORSA`` rows a missing time or distance is
        estimated from the average speed of the fully recorded runs;
        ``TRAIL`` rows only get ``distance``, time and speed stay NaN.
    """
    # ----------------------------------
    # FIRST, get trail runs: only the distance is of interest
    trails = df[df.sport == 'TRAIL']
    trails = trails.assign(distance=_parse_quantity(trails.desc, 'km'))

    # ----------------------------------
    # SECOND, get normal runs with time and distance
    runs = df[df.sport == 'CORSA']
    runs = runs.assign(
        distance=_parse_quantity(runs.desc, 'km'),
        time=_parse_quantity(runs.desc, 'm'),
    )
    nodist = runs['distance'].isna()
    notime = runs['time'].isna()

    # speed of each activity in minutes/km;
    # NaN where either time or distance is not registered
    runs['speed'] = runs['time'] / runs['distance']
    avgspeed = runs['speed'].mean()

    # runs where only the distance has been recorded:
    # time is estimated from the average speed
    onlydist = notime & ~nodist
    runs.loc[onlydist, 'time'] = runs.loc[onlydist, 'distance'] * avgspeed
    runs.loc[onlydist, 'speed'] = avgspeed

    # runs where only the time has been recorded:
    # distance is estimated from the average speed
    onlytime = ~notime & nodist
    runs.loc[onlytime, 'distance'] = runs.loc[onlytime, 'time'] / avgspeed
    runs.loc[onlytime, 'speed'] = avgspeed

    # ----------------------------------
    # LAST, combine runs and trails sorted on dates
    # (`DataFrame.append` no longer exists in pandas 2, use `concat`)
    return pd.concat([runs, trails]).sort_index()
def plot_sportpie(df, cdict, ax=None):
    """Draw a pie chart with the number of activities per sport.

    Parameters
    ----------
    df : pandas.DataFrame
        Activities with a datetime index and a categorical ``sport`` column.
    cdict : dict
        Maps each sport name to the color of its slice.
    ax : matplotlib axes, optional
        Axes to draw on; a new figure (number 1) is created when omitted.
    """
    # sports that do not occur in `df` are dropped so they get no slice
    used_sports = df['sport'].cat.remove_unused_categories()
    counts = df.groupby(used_sports, sort=False, observed=True).size()
    # take slice names from the very Series that is plotted so that the
    # colors always line up with the slices
    sports = counts.index.tolist()
    total = counts.sum()

    def label_pie_slice(val):
        # `val` is the slice percentage; convert it back to a count
        return f'{val/100*total:.0f} ({val:.0f}%)'

    date_min = df.index.min().strftime('%Y-%m-%d')
    date_max = df.index.max().strftime('%Y-%m-%d')
    if ax is None:
        _, ax = plt.subplots(constrained_layout=True, num=1)
    counts.plot(kind='pie', autopct=label_pie_slice,
                colors=[cdict[key] for key in sports], ax=ax)
    ax.set_title('START: {}\nEND: {}'.format(date_min, date_max), fontsize='x-large')
    ax.set_ylabel('')
def plot_sportbar(df, cdict, ax=None):
    """Draw a bar chart with the number of activities per sport.

    ``df`` needs a datetime index and a categorical ``sport`` column,
    ``cdict`` maps sport names to bar colors. When ``ax`` is omitted a new
    figure (number 2) is created.
    """
    first_day = df.index.min().strftime('%Y-%m-%d')
    last_day = df.index.max().strftime('%Y-%m-%d')
    # one bar color per category of the `sport` column
    bar_colors = [cdict[name] for name in df['sport'].cat.categories.to_list()]
    if ax is None:
        _, ax = plt.subplots(constrained_layout=True, num=2)
    per_sport = df['sport'].value_counts(sort=False)
    per_sport.plot(kind='bar', rot=45, ax=ax, color=bar_colors)
    title = 'START: {}\nEND: {}'.format(first_day, last_day)
    ax.set_title(title, fontsize='x-large')
    ax.set_ylabel('Number of activities')
def plot_runs_hist(df, ax=None):
    """Draw a histogram of run distances with mean, P25 and P75 markers.

    ``df`` needs the columns ``distance`` and ``sport``; rows whose sport is
    ``TRAIL`` are additionally marked along the x axis. When ``ax`` is
    omitted a new figure (number 3) is created.
    """
    # distance statistics
    dist_mean = df.distance.mean()
    summary = df.distance.describe()
    quart_lo = summary.loc['25%']
    quart_hi = summary.loc['75%']
    # trail runs get their own markers
    trail_rows = df[df.sport == 'TRAIL']
    # shared style of the vertical lines and of the trail markers
    line_style = dict(lw=2, alpha=0.5)
    trail_style = dict(marker=2, color='g', ls='none', ms=10, label='Trail Runs')
    if ax is None:
        _, ax = plt.subplots(constrained_layout=True, figsize=(8, 4), num=3)
    df.distance.plot.hist(bins=50, color='k', alpha=0.25, ax=ax, label='')
    vertical_lines = (
        (dist_mean, '--', 'Mean: {:.1f} km'),
        (quart_lo, ':', 'P25: {:.1f} km'),
        (quart_hi, ':', 'P75: {:.1f} km'),
    )
    for position, dashes, template in vertical_lines:
        ax.axvline(position, color='r', ls=dashes,
                   label=template.format(position), **line_style)
    n_trails = trail_rows.shape[0]
    if n_trails > 0:
        ax.plot(trail_rows.distance, np.zeros(n_trails), **trail_style)
    ax.legend()
    ax.set_xlabel('Distance (km)')
def _format_pace(pace):
    """Format a pace given in decimal minutes/km as ``M'SS"`` ('n/a' if undefined)."""
    if not np.isfinite(pace):
        return 'n/a'
    # round on whole seconds first so that e.g. 5.9999 becomes 6'00", not 5'60"
    minutes, seconds = divmod(int(round(pace * 60)), 60)
    return f"{minutes}'{seconds:02d}\""


def plot_runs_stats(df):
    """Plot time, distance and speed of every run plus a box of summary stats.

    Parameters
    ----------
    df : pandas.DataFrame
        Runs sorted by their datetime index, with the numeric columns
        ``distance`` (km), ``time`` (minutes) and ``speed`` (minutes/km).

    A figure (number 4) with three stacked subplots is created: the first
    two show per-run bars with weekly sums on a twin axis, the third shows
    per-run speed with a 7-day rolling mean.
    """
    # weekly bins run from Sunday to Sunday; start them at the first
    # calendar Sunday on or after the first run so the first week is complete
    # (this no longer requires a run to have happened on a Sunday)
    first_day = df.index.min().normalize()
    # dayofweek: Monday=0 ... Sunday=6
    start_weekly_stats = first_day + pd.Timedelta(days=(6 - first_day.dayofweek) % 7)
    in_full_weeks = df.index >= start_weekly_stats
    # sum only the numeric columns that are used; summing the text and
    # categorical columns is not supported by current pandas
    weekly_runs = df.loc[in_full_weeks, ['distance', 'time']].resample('W', closed='left').sum()
    avg_weekd = weekly_runs.distance.mean()
    max_weekd = weekly_runs.distance.max()

    # calculate stats
    runs = df.shape[0]
    # count at least one week to avoid dividing by zero on short periods
    weeks = max((df.index[-1] - df.index[0]).days // 7, 1)
    freq = runs / weeks
    avgdst = df['distance'].mean()
    pb_dst = df['distance'].max()
    totdst = df['distance'].sum()
    avgspd = _format_pace(df['speed'].mean())
    # speed is stored as min/km, so the personal best is the minimum
    pb_spd = _format_pace(df['speed'].min())
    textstr = '\n'.join([
        '',
        f'Runs: {runs}',
        f'Frequency: {freq:.1f} runs/week',
        f'Avg distance: {avgdst:.1f} km',
        f'Avg speed: {avgspd} min/km',
        f'PB distance: {pb_dst:.1f} km',
        f'PB speed: {pb_spd} mins/km',
        f'Total distance: {totdst:.0f} km',
        f'Avg weekly: {avg_weekd:.1f} km',
        f'Max weekly: {max_weekd:.1f} km',
        '',
    ])

    opt = dict(color='k', marker='_', ls='none')
    f, ax = plt.subplots(nrows=3, num=4,
                         sharex=True, figsize=(10, 6))

    # first subplot: TIME
    ax[0].bar(df.index, height=df.time, width=1.5, alpha=0.2, color='b')
    ax[0].set_ylabel('Time (min)', color='b')
    ax[0].tick_params(axis='y', colors='b')
    axwt = ax[0].twinx()
    # NOTE: the plotted values are weekly sums although the label says 'Avg'
    axwt.plot(weekly_runs.index, weekly_runs.time, **opt)
    axwt.set_ylabel('Weekly Avg', color='k')
    axwt.tick_params(axis='y', colors='k')

    # second subplot: DISTANCES
    ax[1].bar(df.index, height=df.distance, width=1.5, alpha=0.2, color='g')
    ax[1].set_ylabel('Distance (km)', color='g')
    ax[1].tick_params(axis='y', colors='g')
    axwd = ax[1].twinx()
    axwd.plot(weekly_runs.index, weekly_runs.distance, **opt)
    axwd.set_ylabel('Weekly Avg', color='k')
    axwd.tick_params(axis='y', colors='k')

    # third subplot: SPEED
    ax[2].bar(df.index, height=df.speed, width=1.5, alpha=0.2, color='r')
    # roll over the speed column only: the frame also holds text columns
    ax[2].plot(df['speed'].rolling('7d').mean(), '-r')
    ax[2].set_ylabel('Speed (min/km)', color='r')
    ax[2].tick_params(axis='y', colors='r')
    ax[2].tick_params(axis='x', rotation=45)
    speed_lo, speed_hi = df.speed.min(), df.speed.max()
    # axis limits must be finite; skip them when no speed is available
    if np.isfinite(speed_lo) and np.isfinite(speed_hi):
        ax[2].set_ylim(np.floor(speed_lo), np.ceil(speed_hi))

    # turn on horizontal gridlines
    for aa in ax:
        aa.grid(axis='y')

    # add stats in the free space right of the subplots
    props = dict(boxstyle='round', facecolor='white', alpha=0.7)
    f.text(0.78, 0.5, textstr, va='center', bbox=props)
    f.subplots_adjust(right=0.7, left=0.1, top=0.95, bottom=0.1)
def make_plots(df, cdict):
    """Create every figure for the activities in ``df`` and display them.

    ``cdict`` maps sport names to colors and is used by the two overview
    charts; the run charts are based on the output of ``get_runs``.
    """
    # overview charts covering all sports
    for draw_overview in (plot_sportpie, plot_sportbar):
        draw_overview(df, cdict)
    # charts dedicated to runs and trail runs
    running_log = get_runs(df)
    for draw_runs in (plot_runs_hist, plot_runs_stats):
        draw_runs(running_log)
    plt.show()
# Path of the jrnl file holding the sport log; change it to match your setup.
JRNL_FILE = '~/GOOGLEDRIVE/Apps/journal_sport.txt'


def main(argv=None):
    """Run the script: parse the command line, load the log and plot it.

    Parameters
    ----------
    argv : list of str, optional
        Command line including the program name; defaults to ``sys.argv``.
        Accepted forms are ``year YYYY``, ``days DD`` and ``all``.

    Returns
    -------
    int
        Process exit status: 0 on success, 1 when the arguments are
        missing or invalid (the usage line is printed in that case).
    """
    argv = sys.argv if argv is None else argv
    command = argv[1] if len(argv) > 1 else None

    if command in ('year', 'days'):
        # both commands need an integer value, passed on as the keyword
        # argument of the same name to `select_period`
        try:
            selection = {command: int(argv[2])}
        except (IndexError, ValueError):
            selection = None
    elif command == 'all':
        selection = {}
    else:
        selection = None

    if selection is None:
        print('Usage: python', argv[0], '[year YYYY | days DD | all]')
        return 1

    df, cdict = read_jrnl_sport(JRNL_FILE)
    if selection:
        df = select_period(df, **selection)
    make_plots(df, cdict)
    return 0


if __name__ == '__main__':
    sys.exit(main())
@aadm
Copy link
Author

aadm commented Apr 28, 2021

Examples of output plots

Pie chart with all activities in selected period:

Figure_1

Histogram of running distances:

Figure_3

Duration, distances and speed of runs; the grey markers on the duration and distance plots mark weekly averages, while the continuous red line on the speed plot is a running average over a period of a week:

Figure_4

Distances are in kilometers and speeds in minutes per kilometer because that's what I'm used to.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment