Skip to content

Instantly share code, notes, and snippets.

@TheDataLeek
Last active December 12, 2017 21:40
Show Gist options
  • Save TheDataLeek/c5aff9ca5ee2a8db61fe8dbc5fca95c9 to your computer and use it in GitHub Desktop.
Save TheDataLeek/c5aff9ca5ee2a8db61fe8dbc5fca95c9 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import sys
import os
import argparse
import re
from pprint import pprint as pp
import pytz
import dateutil
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io as sc_io
import tqdm
# Folder that receives every image written by the plotting functions.
IMG_FOLDER = './img'
# Default user handle for the per-user plots.
ZOE = '@SpatulaFish#8544'

# exist_ok=True removes the check-then-create race of exists() + mkdir(),
# and makedirs also works if IMG_FOLDER is ever changed to a nested path.
os.makedirs(IMG_FOLDER, exist_ok=True)
# Weekday names indexed by the integer stored in the ``day_of_week`` column
# (position 0 is Monday); used to title the per-day subplots.
days_of_week = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
    'Saturday', 'Sunday',
]
def getpath(filename):
    """Return the path of *filename* inside the image output folder."""
    target = os.path.join(IMG_FOLDER, filename)
    return target
def main():
    """Load the chat logs and render every plot.

    Reads the log directory from the command line, draws the global plots
    (message counts, activity over time) and then the time-of-day and
    time-of-week charts for each distinct user.
    """
    args = get_args()
    df = get_data(args.directory)

    plot_message_counts(df)
    plot_over_time(df)

    for user in tqdm.tqdm(df['user'].unique()):
        try:
            plot_time_of_day(df, name=user)
            plot_time_of_week(df, name=user)
        except TypeError as e:
            # Best effort: report the user whose chart failed and carry on.
            print(f'{user}: {e}')
        finally:
            # Every plotting call opens new figures; release them here so
            # memory does not grow with the number of users.
            plt.close('all')
def get_data(directory):
    """Parse every log file under *directory* into a DataFrame of messages.

    A message line is expected to look like::

        2016-03-16 21:10:32 #general @SpatulaFish#8544: test

    Text that does not match this shape is ignored.

    Args:
        directory: Root folder. It is walked recursively and every file
            found is read.

    Returns:
        A DataFrame with one row per message and the columns
        ``timestamp`` (logged time shifted back by seven hours),
        ``channel``, ``user``, ``text``, ``time_of_day`` (timedelta since
        midnight of the shifted timestamp) and ``day_of_week``
        (0 = Monday).

    Side effects:
        Pretty-prints the sorted list of distinct users.
    """
    num = '[0-9]'
    # Raw string: '\-' in a normal literal is an invalid escape sequence.
    channel = r'#[a-zA-Z\-]+'
    # Non-greedy, so a "@name#1234: " appearing later in the message text is
    # not swallowed into the user field.
    user = f'@.+?#{num}{{4}}'
    date = f'{num}{{4}}-{num}{{2}}-{num}{{2}}'
    time = f'{num}{{2}}:{num}{{2}}:{num}{{2}}'
    message_re = re.compile(f'({date} {time}) ({channel}) ({user}): (.*)')

    messages = []
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            path = os.path.join(root, filename)
            # NOTE(review): assumes the logs are UTF-8 - confirm. Undecodable
            # bytes are replaced so one stray file cannot abort the run.
            with open(path, 'r', encoding='utf-8', errors='replace') as fileobj:
                messages.extend(message_re.findall(fileobj.read()))

    df = pd.DataFrame(messages,
                      columns=['timestamp', 'channel', 'user', 'text'])
    # NOTE(review): fixed seven-hour shift, presumably from the logged zone
    # to local time; it does not follow daylight saving - confirm.
    df['timestamp'] = pd.to_datetime(df['timestamp']) - pd.Timedelta('07:00:00')
    # Vectorised equivalents of the per-row computations; these also keep
    # proper dtypes when no message was found at all.
    df['time_of_day'] = df['timestamp'] - df['timestamp'].dt.normalize()
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    pp(sorted(df.user.unique()))
    return df
def get_user_counts(df):
    """Return the number of messages per user, smallest count first.

    The result has the columns ``username`` and ``messagecount``; the count
    is the number of non-null ``channel`` entries for that user.
    """
    per_user = df[['user', 'channel']].groupby('user').count().reset_index()
    per_user = per_user.rename(
        columns={'user': 'username', 'channel': 'messagecount'}
    )
    return per_user.sort_values('messagecount')
def plot_message_counts(df):
    """Save horizontal bar charts of the message count per user.

    Only users with more than 10 messages are shown. Two images are written
    through ``getpath``: ``messagecount.png`` (raw counts) and
    ``log(messagecount).png`` (base-10 logarithm of the counts).

    Args:
        df: Message frame with at least ``user`` and ``channel`` columns.
    """
    user_counts = get_user_counts(df)
    # .copy() detaches the filtered rows from the original frame, so adding
    # a column below does not raise pandas' SettingWithCopyWarning.
    user_counts = user_counts[user_counts.messagecount > 10].copy()
    if user_counts.empty:
        # Plotting an empty frame raises; there is nothing to draw anyway.
        print('No user has more than 10 messages; skipping message count plots')
        return
    user_counts['log(messagecount)'] = np.log10(user_counts.messagecount.values)

    for column in ['messagecount', 'log(messagecount)']:
        ax = user_counts.plot(x='username',
                              y=column,
                              kind='barh',
                              figsize=(8, 8),
                              title=f'{column} per user')
        fig = ax.get_figure()
        try:
            fig.tight_layout()
            fig.savefig(getpath(f'{column}.png'))
        finally:
            # Release the figure once it is on disk.
            plt.close(fig)
def plot_over_time(df, min_messages=10000, fft_size=2048):
    """Plot the weekly activity of the most active users.

    Three images are written through ``getpath``:

    * ``overtime.png`` - messages per week, one line per user;
    * ``scatter.png`` - a users x users grid. On and below the diagonal each
      cell shows the scaled cross-correlation of two users' weekly counts;
      above the diagonal it overlays the two raw weekly series;
    * ``frequency.png`` - FFT magnitude of each user's weekly counts.

    Args:
        df: Message frame with at least ``user``, ``timestamp`` and
            ``channel`` columns.
        min_messages: Only users with strictly more messages than this are
            plotted.
        fft_size: Number of FFT points; each series is zero-padded or
            truncated to this length.
    """
    overtime = _weekly_counts(df)
    user_counts = get_user_counts(df)
    most_frequent = sorted(
        user_counts[user_counts.messagecount > min_messages]['username'].values
    )
    if not most_frequent:
        # An empty subplot grid cannot be created, so stop before plotting.
        print(f'No user has more than {min_messages} messages; '
              'skipping plots over time')
        return

    _plot_timeline(overtime, most_frequent)
    _plot_pairwise(_weekly_matrix(overtime, most_frequent), most_frequent)

    # The spectrum uses only the weeks in which the user actually posted
    # (weeks without messages are not zero-filled here).
    spectra = [
        np.abs(np.fft.fft(
            overtime.loc[overtime.username == user, 'count'].values, fft_size))
        for user in most_frequent
    ]
    _plot_spectra(most_frequent, spectra, fft_size)


def _weekly_counts(df):
    """Return one row per (week, user) with that user's message count."""
    overtime = df[['user', 'timestamp', 'channel']]\
        .groupby([pd.Grouper(key='timestamp', freq='1W'), 'user'])\
        .count()\
        .reset_index()
    overtime.columns = ['timestamp', 'username', 'count']
    return overtime


def _weekly_matrix(overtime, users):
    """Return a weeks x users frame of counts, 0.0 where a user was silent."""
    matrix = overtime.pivot(index='timestamp', columns='username', values='count')
    return matrix.reindex(columns=users)\
        .fillna(0)\
        .astype(float)\
        .reset_index(drop=True)


def _plot_timeline(overtime, users):
    """Save overtime.png: weekly message counts, one line per user."""
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    try:
        for user in users:
            overtime[overtime.username == user].plot(
                x='timestamp',
                y='count',
                kind='line',
                ax=ax
            )
        fig.tight_layout()
        ax.set_ylabel('Messages per Week')
        ax.set_title('Most Frequent Users Posting Timeline')
        # Replace the per-line 'count' labels with the user names.
        ax.legend(users, loc=0)
        fig.savefig(getpath('overtime.png'))
    finally:
        plt.close(fig)


def _plot_pairwise(weekly, users):
    """Save scatter.png: the users x users comparison grid."""
    size = len(users)
    # squeeze=False keeps axarr two-dimensional even for a single user.
    fig, axarr = plt.subplots(size, size, figsize=(16, 16), squeeze=False)
    try:
        for i, user1 in enumerate(users):
            for j, user2 in enumerate(users):
                ax = axarr[i, j]
                if i >= j:
                    # NOTE(review): the "/ len(corr)" and "- 1" scaling is
                    # kept exactly as found - confirm it is the intended
                    # normalisation.
                    corr = np.correlate(
                        weekly[user1].values / weekly[user1].std(),
                        weekly[user2].values / weekly[user2].std(),
                        mode='full'
                    )
                    corr /= len(corr)
                    corr -= 1
                    ax.plot(np.arange(len(corr)), corr, label=f'{user1}\n{user2}')
                    # Faint zero line for reference.
                    ax.plot(np.linspace(0, len(corr), 10), np.zeros(10), 'k-', alpha=0.5)
                    ax.set_ylim((-1, 1))
                    ax.set_xlim((0, len(corr)))
                    ax.tick_params(
                        axis='both',
                        which='both',
                        direction='in'
                    )
                else:
                    for user in (user1, user2):
                        series = weekly[user].values
                        ax.plot(np.arange(len(series)), series, label=user)
                ax.legend(loc=0)
        fig.tight_layout()
        fig.savefig(getpath('scatter.png'))
    finally:
        plt.close(fig)


def _plot_spectra(users, spectra, fft_size):
    """Save frequency.png: FFT magnitude per user, divided by the FFT size."""
    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        for name, freq in zip(users, spectra):
            ax.plot(range(len(freq)), freq / fft_size, label=name)
        ax.set_xlabel('Normalized Frequency')
        ax.set_ylabel('FFT Values')
        ax.legend(loc=0)
        fig.savefig(getpath('frequency.png'))
    finally:
        plt.close(fig)
def plot_time_of_day(df, name=ZOE):
    """Save a bar chart of one user's messages per 30-minute slot of the day.

    The image is written through ``getpath`` as ``<name>_tod.png``.

    Args:
        df: Message frame with at least ``user``, ``time_of_day`` and
            ``channel`` columns.
        name: User handle to plot.
    """
    # Bin over all users first and filter afterwards, so the slot boundaries
    # come from the whole data set and are the same for every user.
    # '30min' is the same frequency as the deprecated alias '30T'.
    tod = df[['user', 'time_of_day', 'channel']]\
        .groupby(['user', pd.Grouper(key='time_of_day', freq='30min')])\
        .count()
    tod = tod.reset_index()
    tod.columns = ['user', 'tod', 'count']
    tod = tod[tod.user == name].sort_values('tod')

    # A path separator inside the handle would point the file into a
    # directory that does not exist, so replace it.
    safe_name = name
    for sep in (os.sep, os.altsep):
        if sep:
            safe_name = safe_name.replace(sep, '_')

    ax = tod.plot(
        x='tod',
        y='count',
        kind='bar'
    )
    fig = ax.get_figure()
    try:
        ax.set_title(f'{name}\'s Day')
        fig.tight_layout()
        fig.savefig(getpath(f'{safe_name}_tod.png'))
    finally:
        # Release the figure; this function is called once per user.
        plt.close(fig)
def plot_time_of_week(df, name=ZOE):
    """Save seven stacked bar charts of one user's activity, one per weekday.

    Each chart shows messages per 30-minute slot for that weekday; all seven
    share the same y-range. The image is written through ``getpath`` as
    ``<name>_tow.png``.

    Args:
        df: Message frame with at least ``user``, ``time_of_day``,
            ``day_of_week`` and ``channel`` columns.
        name: User handle to plot.
    """
    # Bin over all users first and filter afterwards, so the slot boundaries
    # come from the whole data set. '30min' is the same frequency as the
    # deprecated alias '30T'.
    tod = df[['user', 'time_of_day', 'day_of_week', 'channel']]\
        .groupby(['user', 'day_of_week', pd.Grouper(key='time_of_day', freq='30min')])\
        .count()
    tod = tod.reset_index()
    tod.columns = ['user', 'dow', 'tod', 'count']
    tod = tod[tod.user == name].sort_values(['dow', 'tod'])
    maxval = tod['count'].values.max()

    # A path separator inside the handle would point the file into a
    # directory that does not exist, so replace it.
    safe_name = name
    for sep in (os.sep, os.altsep):
        if sep:
            safe_name = safe_name.replace(sep, '_')

    fig, axarr = plt.subplots(7, 1, figsize=(8, 8))
    try:
        for i, ax in enumerate(axarr):
            day_rows = tod[tod.dow == i]
            # Plotting an empty frame raises TypeError, which used to cost
            # the user the whole chart; leave such a weekday blank instead.
            if not day_rows.empty:
                day_rows.plot(
                    x='tod',
                    y='count',
                    kind='bar',
                    ax=ax,
                    legend=False
                )
            ax.xaxis.set_visible(False)
            ax.set_title(f'{days_of_week[i]}')
            ax.set_ylim([0, maxval])
        fig.suptitle(f'{name}\'s Week')
        fig.tight_layout()
        fig.savefig(getpath(f'{safe_name}_tow.png'))
    finally:
        # Release the figure; this function is called once per user.
        plt.close(fig)
def get_args():
    """Parse the command line.

    Returns:
        The parsed namespace; ``directory`` holds the log folder given with
        ``-d``/``--directory`` and defaults to ``./cortex``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-d', '--directory',
        type=str,
        default='./cortex',
        help='Directory with logfiles',
    )
    return cli.parse_args()
# Script entry point: run the analysis and hand main()'s return value to the
# interpreter as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment