Email statistics with Ruby and mu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (C) 2020 Ana Maria Martinez Gomez <anamaria@martinezgomez.name> | |
# | |
# This program is free software: you can redistribute it and/or modify it under | |
# the terms of the GNU General Public License as published by the Free Software | |
# Foundation, either version 3 of the License, or (at your option) any later | |
# version. | |
# | |
# This program is distributed in the hope that it will be useful, but WITHOUT | |
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more | |
# details. | |
# | |
# See <https://www.gnu.org/licenses>. | |
# | |
# SPDX-License-Identifier: GPL-3.0-or-later
# INSTRUCTIONS | |
# To run it just execute (tried with Ruby 2.6): | |
# ruby email_statistics.rb | |
# | |
# Uses 'mu find' to get emails information and prints the following statistics: | |
# - Histogram of average sent emails per hour | |
# - Histogram of average sent emails per hour for every day of the week | |
# - Distance between every pair of per-day histograms (Chi-Squared distance
#   is used)
# - Number of emails sent per person | |
# - Average reply times per person | |
# | |
# Emails can be hidden by setting the $hide_emails variable to true | |
require 'date'

# Addresses treated as "mine": the mu query only selects messages sent from one of them.
$my_emails = ['am5320@columbia.edu', 'anamma06@gmail.com', 'anamaria@martinezgomez.name']
# Last day of the analysed period. The month is written as a plain decimal:
# a zero-padded literal such as 04 is read as octal, so 08 or 09 would not parse.
$end_date = Date.new(2020, 4, 23)
# Length of the analysed period in days; also the divisor of the overall hourly average.
$num_days = 30
# Number of decimal places kept when values are rounded for printing.
$decimals = 2
# When true, the part of every printed address before the '@' is masked.
$hide_emails = true
# Formats a number for display: values without a fractional part are returned
# as Integers, anything else is rounded to $decimals places.
# NaN and infinities are returned unchanged, because calling to_i on them
# raises FloatDomainError.
def round(float)
  return float if float.respond_to?(:finite?) && !float.finite?

  whole = float.to_i
  whole == float ? whole : float.round($decimals)
end
# Value printed next to a histogram bar: nil for a zero count (so nothing is
# printed for empty hours), otherwise the count formatted by #round.
def count_for_print(float)
  return if float.zero?

  round(float)
end
# Maps a Monday-first index (0 = Monday ... 6 = Sunday) to the English day name.
# Date::DAYNAMES starts on Sunday, hence the shift by one.
def wday_s(i)
  Date::DAYNAMES[(i + 1) % 7]
end
# Prints one table cell without a trailing newline: the content (nil prints as
# blank) right-aligned in 10 columns, followed by " |".
# Kernel#format is used instead of String#% so that the content is always
# formatted as a single value.
def print_cell(content = nil)
  print "#{format('% 10s', content)} |"
end
# Returns the address as it should be printed. With $hide_emails unset the
# address is returned untouched; otherwise everything before the first '@'
# (the whole string when there is no '@') is replaced by '*****'.
def email_for_print(email)
  return email unless $hide_emails

  # Anchored substitution keeps whatever follows the local part intact,
  # including a trailing '@' that String#split would silently drop.
  email.sub(/\A[^@]*/, '*****')
end
# Prints a text histogram: a title line, a dashed rule, then one row per
# element of count_array showing its index, a bar of 15 asterisks per unit
# (left-aligned in 30 columns) and the value from #count_for_print.
def print_histogram(count_array, title = nil)
  puts "\n #{title}\n #{'-' * (38 + $decimals)}"
  count_array.each_with_index do |count, index|
    bar = '*' * (count * 15)
    puts " #{format('% 2s', index)} | #{format('%-30s', bar)} #{count_for_print(count)}"
  end
end
# Chi-Squared distance between two histograms x and y (expected to have the
# same length): the sum over all bins of (x_i - y_i)^2 / (x_i + y_i).
# Bins where x_i + y_i is zero are skipped to avoid dividing by zero.
# The result is formatted by #round.
def distance(x, y)
  total = x.zip(y).reduce(0) do |acc, (x_i, y_i)|
    bin_sum = x_i + y_i
    # fdiv keeps the quotient fractional even when the counts are Integers.
    bin_sum.zero? ? acc : acc + ((x_i - y_i)**2).fdiv(bin_sum)
  end
  round(total)
end
# Prints a 7x7 table of the Chi-Squared distances between the per-day
# histograms in day_counts (rows and columns labelled through #wday_s), with
# an extra last column holding the sum of each row.
def print_distances(day_counts)
  puts "\n\n\nDistances between histograms (Chi-Squared distance):\n\n"

  # Header row: empty corner cell, one cell per day, then the 'Sum' column.
  print_cell
  7.times { |i| print_cell(wday_s(i)) }
  print_cell('Sum')

  7.times do |i|
    puts
    print_cell(wday_s(i))
    row = Array.new(7) { |j| distance(day_counts[i], day_counts[j]) }
    row.each { |cell| print_cell(cell) }
    print_cell(round(row.sum))
  end
end
# Analysed period: exactly $num_days days, ending on and including $end_date.
# The range below is inclusive at both ends, so the start is $num_days - 1
# days before the end; this keeps it consistent with the overall average,
# which is divided by $num_days.
start_date = $end_date - ($num_days - 1)
range_dates = start_date..$end_date
# Ask mu for the messages sent from any of $my_emails within the period.
# The output is parsed as one line per message with three '~'-separated parts:
# date, recipients and file path (presumably mu field codes d, t, c and l --
# confirm against the mu-find manual).
email_data = `mu find date:#{range_dates} and \\(from:#{ $my_emails.join(' or from:') }\\) -f d~t,c~l -s d -u`
# Messages sent per hour of the day, over the whole period.
total_count = Array.new(24, 0)
# Messages sent per hour of the day for each weekday, indexed by Date#wday (0 = Sunday).
day_counts = Array.new(7) { Array.new(24, 0) }
# Number of messages sent to each recipient.
to_count = Hash.new(0)
# Reply delays in minutes, per recipient.
reply_times = {}
# Parse email_data: one line per sent message, made of three '~'-separated
# parts (date, comma-separated recipients, path of the message file).
email_data.each_line(chomp: true) do |line|
  next if line.strip.empty?

  time_s, to_emails, location = line.split('~', 3)
  time = DateTime.parse(time_s)
  total_count[time.hour] += 1
  day_counts[time.wday][time.hour] += 1

  # Find the Message-ID this mail replies to, if any. grep and mu are started
  # with an argument array, so neither the file path nor the Message-ID (which
  # comes from a mail header) is ever interpreted by a shell.
  reply_header = IO.popen(['grep', 'In-Reply-To', location], &:read).scrub
  msgid = reply_header[/<([^>]*)>/, 1]

  # Date of the message being replied to; stays nil when this mail is not a
  # reply or when mu prints nothing for that Message-ID.
  time_0 = nil
  if msgid
    original_date = IO.popen(['mu', 'find', "msgid:#{msgid}", '-f', 'd', '-u'], &:read).lines.first.to_s.strip
    time_0 = DateTime.parse(original_date) unless original_date.empty?
  end

  to_emails.split(',').each do |to|
    # Prefer the address between angle brackets; fall back to the raw entry.
    to_email = to[/<([^>]*)>/, 1] || to.strip
    next if to_email.empty?

    to_count[to_email] += 1
    next unless time_0

    # DateTime subtraction yields days; 1440 minutes per day.
    (reply_times[to_email] ||= []) << ((time - time_0) * 1440).round(0)
  end
end
# Print histogram of the average number of emails sent per hour
total_count.map! { |count| count.fdiv($num_days) }
print_histogram(total_count, 'All')

day_counts.rotate! # I like Mondays first!

# Print histogram per day
day_counts.each_with_index do |day_count, i|
  wday = (i + 1) % 7 # Date#wday value (0 = Sunday) of the i-th Monday-first row
  number_days = range_dates.count { |day| day.wday == wday }
  # A weekday that never occurs in a short period has nothing to average;
  # dividing by zero would yield NaN and make the histogram printing raise.
  day_count.map! { |count| number_days.zero? ? 0.0 : count.fdiv(number_days) }
  print_histogram(day_count, wday_s(i))
end

# Print distances between histograms using Chi-Squared distance
print_distances(day_counts)

# Print number of emails sent per person, most frequent recipient first
puts "\n\n\n I sent emails to #{ to_count.size } different people:\n\n"
to_count.sort_by { |_to, count| -count }.each do |to, count|
  puts "#{ '% 4s' % count } | #{ email_for_print(to) }"
end

# Print average reply times per person, fastest first
puts "\n\n Average reply times (in minutes):\n\n"
reply_avgs = reply_times.map do |to, times_array|
  [to, times_array.sum.fdiv(times_array.length)]
end
reply_avgs.sort_by { |_to, avg| avg }.each do |to, avg|
  puts "#{ '% 6s' % avg.round(0)} | #{ email_for_print(to) }"
end
puts
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment