jonathanvdc/kappa.py

## kappa.py
#!/usr/bin/env python3

"""This script computes Cohen's Kappa Coefficient. Invoke with `python3 kappa.py data.csv`,
   where `data.csv` is a CSV file that contains three columns and no header. The first column
   contains identifiers that are not used for the purpose of computing the Kappa Coefficient.
   The second column encodes the first rater's findings. The third column encodes the second rater's findings."""

import csv
import sys

def read_csv_data(path):
    """Reads the rater findings from the CSV file at `path`.

    Each non-blank row must hold exactly three columns: an identifier (which
    is ignored), the first rater's category and the second rater's category.
    Surrounding whitespace is stripped from both categories. Blank lines,
    such as a stray newline at the end of the file, are skipped.

    Returns a list of `(first, second)` tuples, one per data row.

    Raises `ValueError` if a non-blank row does not have exactly three
    columns; the message names the file and the offending line.
    """
    pairs = []
    # `newline=''` leaves line-ending handling to the csv module, as its
    # documentation requires for quoted fields that contain newlines.
    with open(path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line; without
                # this check it would become an empty tuple that cannot be
                # unpacked into a pair later on.
                continue
            if len(row) != 3:
                raise ValueError(
                    '%s, line %d: expected 3 columns, found %d'
                    % (path, reader.line_num, len(row)))
            pairs.append(tuple(elem.strip() for elem in row[1:]))
    return pairs

def get_categories(data):
    """Collects every distinct category found in `data`, in sorted order.

    `data` is an iterable of category tuples; each element of each tuple
    counts as a category.
    """
    seen = set()
    for pair in data:
        seen.update(pair)
    return sorted(seen)

def compose_confusion_matrix(category, data):
    """Tallies a one-vs-rest 2x2 confusion matrix for `category`.

    The matrix is returned as a flat list of four counts:
      [0] both raters chose `category`
      [1] only the first rater chose it
      [2] only the second rater chose it
      [3] neither rater chose it
    """
    counts = [0] * 4
    for first, second in data:
        # Slot index: +2 if the first rater did not pick `category`,
        # +1 if the second rater did not.
        slot = (0 if first == category else 2) + (0 if second == category else 1)
        counts[slot] += 1
    return counts

def compute_kappa(confusion_matrix):
    """Computes Cohen's Kappa Coefficient from a flat 2x2 confusion matrix.

    `confusion_matrix` is a four-element sequence of counts laid out as
    [both positive, first rater only, second rater only, both negative].

    Produces a (kappa, P_O, P_E) triple, where P_O is the observed agreement
    rate and P_E the agreement rate expected by chance. When P_E is exactly 1
    (both raters always, or never, assign the category) kappa is undefined
    and is returned as NaN instead of raising ZeroDivisionError.

    Raises ZeroDivisionError if the matrix contains no samples at all.
    """
    # Compute the observed agreement.
    total_count = sum(confusion_matrix)
    agreement_observed = (confusion_matrix[0] + confusion_matrix[3]) / total_count
    # Compute the rate at which the first rater positively classifies a sample.
    first_rate = (confusion_matrix[0] + confusion_matrix[1]) / total_count
    # Compute the rate at which the second rater positively classifies a sample.
    second_rate = (confusion_matrix[0] + confusion_matrix[2]) / total_count
    # Compute the expected agreement rate.
    agreement_expected = first_rate * second_rate + (1 - first_rate) * (1 - second_rate)
    # Compute the Kappa Coefficient. With P_E == 1 the formula is 0 / 0, so
    # report NaN rather than dividing by zero.
    if agreement_expected == 1:
        kappa = float('nan')
    else:
        kappa = (agreement_observed - agreement_expected) / (1 - agreement_expected)
    return (kappa, agreement_observed, agreement_expected)

if __name__ == '__main__':
    # Command-line entry point: expects exactly one argument, the CSV path.
    if len(sys.argv) != 2:
        print('usage: kappa.py data.csv', file=sys.stderr)
        # Use sys.exit rather than the bare `exit` helper, which is injected
        # by the `site` module for interactive use and may be unavailable
        # (e.g. under `python -S`).
        sys.exit(1)

    # Load the rating pairs, then report a one-vs-rest kappa per category.
    data = read_csv_data(sys.argv[1])
    categories = get_categories(data)
    print('Extant categories: %s' % ', '.join(categories))
    for category in categories:
        kappa, observed, expected = compute_kappa(
            compose_confusion_matrix(category, data))
        print('Kappa for %s: %.2f (observed agreement: %.2f, expected agreement: %.2f)' % (
            category, kappa, observed, expected))
	#!/usr/bin/env python3

	"""This script computes Cohen's Kappa Coefficient. Invoke with `python3 kappa.py data.csv`,
	where `data.csv` is a CSV file that contains three columns and no header. The first column
	contains identifiers that are not used for the purpose of computing the Kappa Coefficient.
	The second column encodes the first rater's findings. The third column encodes the second rater's findings."""

	import csv
	import sys

	def read_csv_data(path):
		"""Reads the rater findings from the CSV file at `path`.

		Each non-blank row must hold exactly three columns: an identifier (which
		is ignored), the first rater's category and the second rater's category.
		Surrounding whitespace is stripped from both categories. Blank lines,
		such as a stray newline at the end of the file, are skipped.

		Returns a list of `(first, second)` tuples, one per data row.

		Raises `ValueError` if a non-blank row does not have exactly three
		columns; the message names the file and the offending line.
		"""
		pairs = []
		# `newline=''` leaves line-ending handling to the csv module, as its
		# documentation requires for quoted fields that contain newlines.
		with open(path, 'r', newline='') as csvfile:
			reader = csv.reader(csvfile, delimiter=',', quotechar='"')
			for row in reader:
				if not row:
					# csv.reader yields an empty list for a blank line; without
					# this check it would become an empty tuple that cannot be
					# unpacked into a pair later on.
					continue
				if len(row) != 3:
					raise ValueError(
						'%s, line %d: expected 3 columns, found %d'
						% (path, reader.line_num, len(row)))
				pairs.append(tuple(elem.strip() for elem in row[1:]))
		return pairs

	def get_categories(data):
		"""Collects every distinct category found in `data`, in sorted order.

		`data` is an iterable of category tuples; each element of each tuple
		counts as a category.
		"""
		seen = set()
		for pair in data:
			seen.update(pair)
		return sorted(seen)

	def compose_confusion_matrix(category, data):
		"""Tallies a one-vs-rest 2x2 confusion matrix for `category`.

		The matrix is returned as a flat list of four counts:
		  [0] both raters chose `category`
		  [1] only the first rater chose it
		  [2] only the second rater chose it
		  [3] neither rater chose it
		"""
		matrix = [0, 0, 0, 0]
		for first, second in data:
			if first == category and second == category:
				matrix[0] += 1
			elif first == category and second != category:
				matrix[1] += 1
			elif first != category and second == category:
				matrix[2] += 1
			else:
				matrix[3] += 1
		return matrix

	def compute_kappa(confusion_matrix):
		"""Computes Cohen's Kappa Coefficient from a flat 2x2 confusion matrix.

		`confusion_matrix` is a four-element sequence of counts laid out as
		[both positive, first rater only, second rater only, both negative].

		Produces a (kappa, P_O, P_E) triple, where P_O is the observed agreement
		rate and P_E the agreement rate expected by chance. When P_E is exactly 1
		(both raters always, or never, assign the category) kappa is undefined
		and is returned as NaN instead of raising ZeroDivisionError.

		Raises ZeroDivisionError if the matrix contains no samples at all.
		"""
		# Compute the observed agreement.
		total_count = sum(confusion_matrix)
		agreement_observed = (confusion_matrix[0] + confusion_matrix[3]) / total_count
		# Compute the rate at which the first rater positively classifies a sample.
		first_rate = (confusion_matrix[0] + confusion_matrix[1]) / total_count
		# Compute the rate at which the second rater positively classifies a sample.
		second_rate = (confusion_matrix[0] + confusion_matrix[2]) / total_count
		# Compute the expected agreement rate.
		agreement_expected = first_rate * second_rate + (1 - first_rate) * (1 - second_rate)
		# Compute the Kappa Coefficient. With P_E == 1 the formula is 0 / 0, so
		# report NaN rather than dividing by zero.
		if agreement_expected == 1:
			kappa = float('nan')
		else:
			kappa = (agreement_observed - agreement_expected) / (1 - agreement_expected)
		return (kappa, agreement_observed, agreement_expected)

	if __name__ == '__main__':
		# Command-line entry point: expects exactly one argument, the CSV path.
		if len(sys.argv) != 2:
			print('usage: kappa.py data.csv', file=sys.stderr)
			# Use sys.exit rather than the bare `exit` helper, which is injected
			# by the `site` module for interactive use and may be unavailable
			# (e.g. under `python -S`).
			sys.exit(1)

		# Load the rating pairs, then report a one-vs-rest kappa per category.
		data = read_csv_data(sys.argv[1])
		categories = get_categories(data)
		print('Extant categories: %s' % ', '.join(categories))
		for category in categories:
			kappa, observed, expected = compute_kappa(
				compose_confusion_matrix(category, data))
			print('Kappa for %s: %.2f (observed agreement: %.2f, expected agreement: %.2f)' % (
				category, kappa, observed, expected))