Skip to content

Instantly share code, notes, and snippets.

@jonathanvdc
Last active January 30, 2020 17:00
Show Gist options
  • Save jonathanvdc/58a27a76e0801874be0d5111d38ca99b to your computer and use it in GitHub Desktop.
Save jonathanvdc/58a27a76e0801874be0d5111d38ca99b to your computer and use it in GitHub Desktop.
Computes Cohen's Kappa Coefficient
#!/usr/bin/env python3
"""This script computes Cohen's Kappa Coefficient. Invoke with `python3 kappy.py data.csv`,
where `data.csv` is a CSV file that contains three columns and no header. The first column
contains identifiers that are not used for the purpose of computing the Kappa Coefficient.
The second column encodes the first rater's findings. The third column encodes the second rater's findings."""
import csv
import sys
def read_csv_data(path):
    """Reads the rater findings from the CSV file at a particular path.

    Every non-blank row must have at least three columns: an identifier
    (ignored), the first rater's category and the second rater's category.
    Columns past the third are ignored and blank lines are skipped, so that
    every element of the result really is a pair.

    Returns a list of `(first, second)` tuples of whitespace-stripped category
    strings, one per data row, in file order.

    Raises ValueError if a non-blank row has fewer than three columns."""
    pairs = []
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # can be misinterpreted.
    with open(path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            if len(row) < 3:
                raise ValueError(
                    '%s, line %d: expected at least 3 columns, found %d'
                    % (path, reader.line_num, len(row)))
            pairs.append((row[1].strip(), row[2].strip()))
    return pairs
def get_categories(data):
    """Collects every distinct category that either rater used.

    `data` is an iterable of category tuples, as produced by the CSV reader.
    Returns the distinct categories as a sorted list."""
    seen = set()
    for pair in data:
        # Both raters' labels count towards the set of known categories.
        seen.update(pair)
    return sorted(seen)
def compose_confusion_matrix(category, data):
    """Builds the one-vs-rest 2x2 confusion matrix of `category` over `data`.

    `data` is an iterable of `(first, second)` pairs holding the categories
    the two raters assigned to each item. The result is a flat list of four
    counts:

        [0] both raters chose `category`
        [1] only the first rater chose `category`
        [2] only the second rater chose `category`
        [3] neither rater chose `category`"""
    counts = [0, 0, 0, 0]
    for rater_one, rater_two in data:
        # The first rater selects the row (0 or 2), the second the column
        # offset (0 or 1) of the flattened matrix.
        row = 0 if rater_one == category else 2
        column = 0 if rater_two == category else 1
        counts[row + column] += 1
    return counts
def compute_kappa(confusion_matrix):
    """Computes Cohen's Kappa Coefficient. Takes a confusion matrix as input.

    `confusion_matrix` is a flat list of four counts: items both raters put in
    the category, items only the first rater did, items only the second rater
    did, and items neither did.

    Produces a (kappa, P_O, P_E) triple, where P_O is the observed agreement
    and P_E the agreement expected by chance.

    Kappa is undefined when P_E is 1, i.e. when both raters put every item in
    the category or neither rater ever uses it; NaN is returned for kappa in
    that case instead of dividing by zero. For an empty matrix all three
    values are NaN."""
    nan = float('nan')
    both, first_only, second_only, neither = confusion_matrix
    total_count = both + first_only + second_only + neither
    if total_count == 0:
        # No items were rated, so no agreement rate can be computed.
        return (nan, nan, nan)
    # Compute the observed agreement.
    agreement_observed = (both + neither) / total_count
    # Compute the rate at which the first rater positively classifies a sample.
    first_rate = (both + first_only) / total_count
    # Compute the rate at which the second rater positively classifies a sample.
    second_rate = (both + second_only) / total_count
    # Compute the expected agreement rate.
    agreement_expected = first_rate * second_rate + (1 - first_rate) * (1 - second_rate)
    # Compute the Kappa Coefficient. The denominator is exactly zero only when
    # both rates are 0 or both are 1, which floating point represents exactly.
    chance_disagreement = 1 - agreement_expected
    if chance_disagreement == 0:
        kappa = nan
    else:
        kappa = (agreement_observed - agreement_expected) / chance_disagreement
    return (kappa, agreement_observed, agreement_expected)
if __name__ == '__main__':
    # Exactly one argument is expected: the path of the CSV file to analyse.
    if len(sys.argv) != 2:
        print('usage: kappa.py data.csv', file=sys.stderr)
        # Use sys.exit rather than the bare `exit` helper: the latter is
        # injected by the `site` module and is missing under `python -S`.
        sys.exit(1)

    data = read_csv_data(sys.argv[1])
    categories = get_categories(data)
    print('Extant categories: %s' % ', '.join(categories))
    # Report a one-vs-rest kappa for each category in turn.
    for category in categories:
        print('Kappa for %s: %.2f (observed agreement: %.2f, expected agreement: %.2f)' % (
            (category,) + compute_kappa(compose_confusion_matrix(category, data))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment