Skip to content

Instantly share code, notes, and snippets.

@Ana06
Last active April 24, 2020 13:20
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Ana06/21762eb2d52467ff2b9339c702898e18 to your computer and use it in GitHub Desktop.
Save Ana06/21762eb2d52467ff2b9339c702898e18 to your computer and use it in GitHub Desktop.
Email statistics with Ruby and mu
# Copyright (C) 2020 Ana Maria Martinez Gomez <anamaria@martinezgomez.name>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# See <https://www.gnu.org/licenses>.
#
# SPDX-License-Identifier: GPLv3-or-later
# INSTRUCTIONS
# To run it just execute (tried with Ruby 2.6):
# ruby email_statistics.rb
#
# Uses 'mu find' to get emails information and prints the following statistics:
# - Histogram of average sent emails per hour
# - Histogram of average sent emails per hour for every day of the week
# - Distance between every pair of day-of-the-week histograms (Chi-Squared
# distance is used)
# - Number of emails sent per person
# - Average reply times per person
#
# Emails can be hidden by setting the $hide_emails variable to true
require 'date'
# Configuration.
# Addresses whose sent emails are analysed.
$my_emails = %w[am5320@columbia.edu anamma06@gmail.com anamaria@martinezgomez.name]
# Last day of the analysed period.
$end_date = Date.new(2020, 4, 23)
# How many days before $end_date the analysed period starts.
$num_days = 30
# Number of decimals kept when printing non-integer values.
$decimals = 2
# When true, the part of every address before the '@' is masked in the output.
$hide_emails = true
# Returns the value as an Integer when it has no fractional part, otherwise
# the value rounded to $decimals decimals.
def round(float)
  whole = float.to_i
  return whole if whole == float

  float.round($decimals)
end
# Value shown next to a histogram bar: nothing (nil) for a zero count,
# otherwise the count as formatted by round.
def count_for_print(float)
  return nil if float == 0

  round(float)
end
# Name of the i-th day of a week that starts on Monday (0 => Monday,
# 6 => Sunday). Date::DAYNAMES starts on Sunday, hence the shift by one.
def wday_s(i)
  sunday_based_index = (i + 1).modulo(7)
  Date::DAYNAMES[sunday_based_index]
end
# Prints one table cell without a trailing newline: the content right-aligned
# in 10 characters, followed by ' |'. An empty cell is printed when no content
# is given.
def print_cell(content=nil)
  padded = '% 10s' % content
  print "#{padded} |"
end
# Returns the address as it should be displayed. When $hide_emails is set, the
# part before the first '@' is replaced by '*****'; otherwise the address is
# returned untouched.
def email_for_print(email)
  if $hide_emails
    _local_part, *rest = email.split('@')
    ['*****', *rest].join('@')
  else
    email
  end
end
# Prints a text histogram: a title, a dashed rule, and one row per element of
# count_array. Each row shows the element index, a bar of 15 asterisks per
# unit (left-aligned in 30 characters) and the value from count_for_print.
def print_histogram(count_array, title=nil)
  rule = '-' * (38 + $decimals)
  puts "\n #{title}\n #{rule}"
  count_array.each.with_index do |count, index|
    label = '% 2s' % index
    bar = '%-30s' % ('*' * (count * 15))
    puts " #{label} | #{bar} #{count_for_print(count)}"
  end
end
# Chi-Squared distance between two histograms given as arrays of numbers:
# the sum over all bins of (x_i - y_i)^2 / (x_i + y_i).
# Bins where x_i + y_i is zero are skipped, as they would divide by zero.
# The result is formatted with round (Integer when integral, otherwise
# rounded to $decimals decimals).
def distance(x, y)
  total = 0
  x.zip(y) do |x_i, y_i|
    bin_sum = x_i + y_i
    next if bin_sum == 0

    # fdiv always performs a floating point division; a plain '/' would
    # truncate the term when both histograms hold Integer counts.
    total += ((x_i - y_i)**2).fdiv(bin_sum)
  end
  round(total)
end
# Prints a 7x7 table with the Chi-Squared distance between every pair of
# histograms in day_counts, plus a last column with the sum of each row.
# Rows and columns are labelled with wday_s, so day_counts[0] is labelled
# as Monday.
def print_distances(day_counts)
  puts "\n\n\nDistances between histograms (Chi-Squared distance):\n\n"

  # Header row: empty corner cell, one column per day, then the 'Sum' column.
  print_cell
  7.times { |i| print_cell(wday_s(i)) }
  print_cell('Sum')

  7.times do |i|
    puts # every row starts on a new line
    print_cell(wday_s(i))
    row = Array.new(7) { |j| distance(day_counts[i], day_counts[j]) }
    row.each { |value| print_cell(value) }
    print_cell(round(row.sum))
  end
end
# Period covered by the statistics: from $num_days before $end_date up to
# $end_date.
start_date = $end_date - $num_days
range_dates = (start_date..$end_date)
# Ask mu for the emails sent from any of my addresses in that period. The
# output format 'd~t,c~l' is split on '~' below into date, recipients and
# location; '-s d' sorts by date.
from_query = "from:#{$my_emails.join(' or from:')}"
email_data = `mu find date:#{range_dates} and \\(#{from_query}\\) -f d~t,c~l -s d -u`
# Accumulators filled while parsing email_data.
total_count = [0] * 24                 # emails per hour of the day
day_counts = Array.new(7) { [0] * 24 } # emails per weekday (Date#wday) and hour
to_count = Hash.new(0)                 # emails per recipient
reply_times = {}                       # recipient => list of reply times in minutes
# Parse email_data. Every line is expected to have the form
# "<date>~<recipients>~<location>", as requested from 'mu find' with
# '-f d~t,c~l'.
email_data.each_line(chomp: true) do |line|
  next if line.strip.empty?

  # Limit the split so that a '~' inside the file location is kept.
  time_s, to_emails, location = line.split('~', 3)
  time = DateTime.parse(time_s)
  total_count[time.hour] += 1
  day_counts[time.wday][time.hour] += 1

  # Look for the message this email replies to. The commands are passed as
  # argument arrays so that no shell is involved: the location and, above all,
  # the Message-ID (which comes from the email itself) must never be
  # interpreted as shell syntax.
  result = IO.popen(['grep', '--', 'In-Reply-To', location], &:read)
  msgid = result[/<([^>]+)>/, 1]
  reply_minutes = nil
  if msgid
    original_date = IO.popen(['mu', 'find', "msgid:#{msgid}", '-f', 'd', '-u'], &:read).strip
    # No output means mu found no such message (presumably it is not in the
    # mu database). DateTime.parse would raise on an empty string, so no
    # reply time is recorded in that case.
    unless original_date.empty?
      time_0 = DateTime.parse(original_date)
      # The difference between two DateTime objects is expressed in days and
      # a day has 1440 minutes.
      reply_minutes = ((time - time_0) * 1440).round(0)
    end
  end

  to_emails.split(',').each do |to|
    # Keep only the address of a 'Name <address>' recipient. Surrounding
    # whitespace is removed so that the same address is always counted under
    # the same key.
    to_email = (to[/<(.*)>/, 1] || to).strip
    next if to_email.empty? # e.g. the recipients field starts with ','

    to_count[to_email] += 1
    (reply_times[to_email] ||= []) << reply_minutes if reply_minutes
  end
end
# Print histogram of the average number of emails per hour over the whole
# period. range_dates includes both of its ends, so it spans one day more than
# $num_days; dividing by its size keeps this average consistent with the
# per-day averages below.
days_in_range = range_dates.count
total_count.map! { |count| count.fdiv(days_in_range) }
print_histogram(total_count, 'All')
day_counts.rotate! # I like Mondays first!
# Print histogram per day
day_counts.each_with_index do |day_count, i|
  index = (i + 1) % 7 # I like Mondays first!
  number_days = range_dates.count { |day| day.wday == index }
  # A short period may not contain every day of the week; avoid dividing by
  # zero (which would produce NaN and break the histogram bars).
  day_count.map! { |count| number_days.zero? ? 0.0 : count.fdiv(number_days) }
  print_histogram(day_count, Date::DAYNAMES[index])
end
# Print distances between histograms using Chi-Squared distance
print_distances(day_counts)
# Print number of emails sent per person, highest count first
puts "\n\n\n I sent emails to #{to_count.size} different people:\n\n"
to_count.sort_by { |_to, count| -count }.each do |to, count|
  puts format('% 4s | %s', count, email_for_print(to))
end
# Print average reply times per person, shortest average first
puts "\n\n Average reply times (in minutes):\n\n"
reply_avgs = reply_times.transform_values { |times| times.sum.fdiv(times.length) }
reply_avgs.sort_by { |_to, avg| avg }.each do |to, avg|
  puts format('% 6s | %s', avg.round(0), email_for_print(to))
end
puts
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment