# vmonaco/cmu_powerlaw.py

## cmu_powerlaw.py
'''
Created on May 26, 2015

@author: vinnie, vincent@vmonaco.com

Power-law results from:
"DATA FORENSIC TECHNIQUES USING BENFORD’S LAW AND ZIPF’S LAW FOR KEYSTROKE
DYNAMICS", Aamo Iorliam, Anthony T.S. Ho, Norman Poh, Santosh Tirunagari,
and Patrick Bours. IWBF 2015.

Uses the data from:
"Comparing Anomaly-Detection Algorithms for Keystroke Dynamics,"
Kevin Killourhy and Roy Maxion. DSN 2009. http://www.cs.cmu.edu/~keystroke/

Requires numpy, pandas, powerlaw, and matplotlib. Run the script as:
$ python cmu_powerlaw.py
'''

import numpy as np
import pandas as pd
import powerlaw as pl
import matplotlib.pyplot as plt

# Value handed to pl.Fit as fit_method for all three fits below
FIT_METHOD = 'KS' # Can also be 'Likelihood'
# Location of the CSV that is read with pandas
DATA_URL = 'http://www.cs.cmu.edu/~keystroke/DSL-StrongPasswordData.csv'

# x_min handed to pl.Fit (as xmin) for the up-down, down-down and duration
# fits respectively. Leave an entry as None to have that x_min estimated.
UD_XMIN, DD_XMIN, DUR_XMIN = None, None, None

# Uncomment the line below to use fixed x_min estimates instead.
# Note that the duration x_min estimated by this script is 0.1852, not 0.1818.
# UD_XMIN, DD_XMIN, DUR_XMIN = 0.9801, 0.9880, 0.1818

print('Downloading data from', DATA_URL)
# The first 3 columns (subject, session, repetition) form the row index
df = pd.read_csv(DATA_URL, index_col=[0, 1, 2])


def get_feature_cols(feat):
    '''
    Return the names of the columns of df that belong to one feature type.

    Columns are labeled like: feature_type.keyname[.secondkeyname], so a
    column is selected when its name starts with feat.

    feat: feature type prefix, e.g. 'UD', 'DD' or 'H'
    returns: list of matching column names, in the order they appear in df
    '''
    return [c for c in df.columns if c.startswith(feat)]


# Pool every column of a feature type into one flat array of samples.
# UD can be negative, so use abs values
ud = np.abs(df[get_feature_cols('UD')].values.flatten())  # up-down latency
dd = df[get_feature_cols('DD')].values.flatten()  # down-down latency
dur = df[get_feature_cols('H')].values.flatten()  # hold time

print('Fitting models, may take a while...')
# One fit per feature, built in the order up-down, down-down, duration.
# Each feature is paired with its own x_min setting.
fit_ud, fit_dd, fit_dur = (
    pl.Fit(samples, fit_method=FIT_METHOD, xmin=x_min)
    for samples, x_min in ((ud, UD_XMIN), (dd, DD_XMIN), (dur, DUR_XMIN))
)

def summarize(fit):
    '''
    Return a three-line text summary of the power-law part of a fit.

    fit: object exposing power_law (with xmin, alpha and loglikelihoods)
         and data, such as the pl.Fit results created in this script
    returns: str with x_min, alpha and L on separate lines, each printed
             to four decimals; L is the sum of the values returned by
             fit.power_law.loglikelihoods(fit.data)
    '''
    model = fit.power_law
    total_loglikelihood = model.loglikelihoods(fit.data).sum()
    return 'x_min = %.4f\nalpha = %.4f\nL = %.4f' % (
        model.xmin, model.alpha, total_loglikelihood)

# Print a heading line followed by the summary of each fit
for heading, fitted in (('Up-down\n', fit_ud),
                        ('Down-down\n', fit_dd),
                        ('Duration\n', fit_dur)):
    print(heading, summarize(fitted), sep='')

# Helper to make a nice plot
def make_subplot(name, fit, ax, visiblex=False):
    '''
    Plot the CCDF of a fit and of its power-law model on ax, with a summary.

    name: label placed above the summary text inside the axes
    fit: fit result providing plot_ccdf() and power_law.plot_ccdf(),
         such as the pl.Fit results created in this script
    ax: matplotlib axes to draw on and annotate
    visiblex: whether the x tick labels of ax are shown
    '''
    # Toggle the tick labels of the axes passed in. Using the module-level
    # ax1 here would leave every other panel's labels untouched.
    plt.setp(ax.get_xticklabels(), visible=visiblex)
    # Draw on ax explicitly instead of relying on the current pyplot axes.
    # Solid black is the fit's own CCDF, dashed red is the power-law model.
    fit.plot_ccdf(ax=ax, color='k')
    fit.power_law.plot_ccdf(ax=ax, color='r', linestyle='--')
    # Anchor the annotation in axes coordinates, near the top-right corner
    ax.text(0.9, 0.9, '%s\n%s' % (name, summarize(fit)),
            ha='right', va='top', transform=ax.transAxes)

plt.figure(figsize=(6, 9))

# Three stacked panels. Each one is drawn right after it is created,
# while it is still the current pyplot axes.

# Top panel: carries the title and owns the x axis the others share
ax1 = plt.subplot(3, 1, 1)
ax1.set_title('CMU keystroke power laws')
make_subplot('Up-down', fit_ud, ax1)

# Middle panel
ax2 = plt.subplot(3, 1, 2, sharex=ax1)
make_subplot('Down-down', fit_dd, ax2)

# Bottom panel: the only call that passes visiblex=True
ax3 = plt.subplot(3, 1, 3, sharex=ax1)
make_subplot('Duration', fit_dur, ax3, visiblex=True)

plt.tight_layout()
plt.show()
	'''
	Created on May 26, 2015

	@author: vinnie, vincent@vmonaco.com

	Power-law results from:
	"DATA FORENSIC TECHNIQUES USING BENFORD’S LAW AND ZIPF’S LAW FOR KEYSTROKE
	DYNAMICS", Aamo Iorliam, Anthony T.S. Ho, Norman Poh, Santosh Tirunagari,
	and Patrick Bours. IWBF 2015.

	Uses the data from:
	"Comparing Anomaly-Detection Algorithms for Keystroke Dynamics,"
	Kevin Killourhy and Roy Maxion. DSN 2009. http://www.cs.cmu.edu/~keystroke/

	Requires numpy, pandas, powerlaw, and matplotlib. Run the script as:
	$ python cmu_powerlaw.py
	'''

	import numpy as np
	import pandas as pd
	import powerlaw as pl
	import matplotlib.pyplot as plt

	# Value handed to pl.Fit as fit_method for all three fits below
	FIT_METHOD = 'KS' # Can also be 'Likelihood'
	# Location of the CSV that is read with pandas
	DATA_URL = 'http://www.cs.cmu.edu/~keystroke/DSL-StrongPasswordData.csv'

	# x_min handed to pl.Fit (as xmin) for the up-down, down-down and duration
	# fits respectively. Leave an entry as None to have that x_min estimated.
	UD_XMIN, DD_XMIN, DUR_XMIN = None, None, None

	# Uncomment the line below to use fixed x_min estimates instead.
	# Note that the duration x_min estimated by this script is 0.1852, not 0.1818.
	# UD_XMIN, DD_XMIN, DUR_XMIN = 0.9801, 0.9880, 0.1818

	print('Downloading data from', DATA_URL)
	# The first 3 columns (subject, session, repetition) form the row index
	df = pd.read_csv(DATA_URL, index_col=[0, 1, 2])


	def get_feature_cols(feat):
		'''
		Return the names of the columns of df that belong to one feature type.

		Columns are labeled like: feature_type.keyname[.secondkeyname], so a
		column is selected when its name starts with feat.

		feat: feature type prefix, e.g. 'UD', 'DD' or 'H'
		returns: list of matching column names, in the order they appear in df
		'''
		return [c for c in df.columns if c.startswith(feat)]


	# Pool every column of a feature type into one flat array of samples.
	# UD can be negative, so use abs values
	ud = np.abs(df[get_feature_cols('UD')].values.flatten())  # up-down latency
	dd = df[get_feature_cols('DD')].values.flatten()  # down-down latency
	dur = df[get_feature_cols('H')].values.flatten()  # hold time

	print('Fitting models, may take a while...')
	# One fit per feature, built in the order up-down, down-down, duration.
	# Each feature is paired with its own x_min setting.
	fit_ud, fit_dd, fit_dur = (
		pl.Fit(samples, fit_method=FIT_METHOD, xmin=x_min)
		for samples, x_min in ((ud, UD_XMIN), (dd, DD_XMIN), (dur, DUR_XMIN))
	)

	def summarize(fit):
		'''
		Return a three-line text summary of the power-law part of a fit.

		fit: object exposing power_law (with xmin, alpha and loglikelihoods)
		     and data, such as the pl.Fit results created in this script
		returns: str with x_min, alpha and L on separate lines, each printed
		         to four decimals; L is the sum of the values returned by
		         fit.power_law.loglikelihoods(fit.data)
		'''
		model = fit.power_law
		total_loglikelihood = model.loglikelihoods(fit.data).sum()
		return 'x_min = %.4f\nalpha = %.4f\nL = %.4f' % (
			model.xmin, model.alpha, total_loglikelihood)

	# Print a heading line followed by the summary of each fit
	for heading, fitted in (('Up-down\n', fit_ud),
							('Down-down\n', fit_dd),
							('Duration\n', fit_dur)):
		print(heading, summarize(fitted), sep='')

	# Helper to make a nice plot
	def make_subplot(name, fit, ax, visiblex=False):
		'''
		Plot the CCDF of a fit and of its power-law model on ax, with a summary.

		name: label placed above the summary text inside the axes
		fit: fit result providing plot_ccdf() and power_law.plot_ccdf(),
		     such as the pl.Fit results created in this script
		ax: matplotlib axes to draw on and annotate
		visiblex: whether the x tick labels of ax are shown
		'''
		# Toggle the tick labels of the axes passed in. Using the module-level
		# ax1 here would leave every other panel's labels untouched.
		plt.setp(ax.get_xticklabels(), visible=visiblex)
		# Draw on ax explicitly instead of relying on the current pyplot axes.
		# Solid black is the fit's own CCDF, dashed red is the power-law model.
		fit.plot_ccdf(ax=ax, color='k')
		fit.power_law.plot_ccdf(ax=ax, color='r', linestyle='--')
		# Anchor the annotation in axes coordinates, near the top-right corner
		ax.text(0.9, 0.9, '%s\n%s' % (name, summarize(fit)),
				ha='right', va='top', transform=ax.transAxes)

	plt.figure(figsize=(6, 9))

	# Three stacked panels. Each one is drawn right after it is created,
	# while it is still the current pyplot axes.

	# Top panel: carries the title and owns the x axis the others share
	ax1 = plt.subplot(3, 1, 1)
	ax1.set_title('CMU keystroke power laws')
	make_subplot('Up-down', fit_ud, ax1)

	# Middle panel
	ax2 = plt.subplot(3, 1, 2, sharex=ax1)
	make_subplot('Down-down', fit_dd, ax2)

	# Bottom panel: the only call that passes visiblex=True
	ax3 = plt.subplot(3, 1, 3, sharex=ax1)
	make_subplot('Duration', fit_dur, ax3, visiblex=True)

	plt.tight_layout()
	plt.show()