ED-PELT algorithm
@AndreyAkinshin · Created October 5, 2019 11:37
// Copyright (c) 2019 Andrey Akinshin
// Licensed under The MIT License https://opensource.org/licenses/MIT
using System;
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// The ED-PELT algorithm for changepoint detection.
/// </summary>
/// <remarks>
/// The implementation is based on the following papers:
/// <list type="bullet">
/// <item>
/// <b>[Haynes2017]</b> Haynes, Kaylea, Paul Fearnhead, and Idris A. Eckley.
/// "A computationally efficient nonparametric approach for changepoint detection."
/// Statistics and Computing 27, no. 5 (2017): 1293-1305.
/// https://doi.org/10.1007/s11222-016-9687-5
/// </item>
/// <item>
/// <b>[Killick2012]</b> Killick, Rebecca, Paul Fearnhead, and Idris A. Eckley.
/// "Optimal detection of changepoints with a linear computational cost."
/// Journal of the American Statistical Association 107, no. 500 (2012): 1590-1598.
/// https://arxiv.org/pdf/1101.1438.pdf
/// </item>
/// </list>
/// </remarks>
public class EdPeltChangePointDetector
{
    public static readonly EdPeltChangePointDetector Instance = new EdPeltChangePointDetector();

    /// <summary>
    /// For a given array of `double` values, detects the locations of changepoints that
    /// split the original series of values into "statistically homogeneous" segments.
    /// Such points correspond to moments when the statistical properties of the distribution change.
    ///
    /// This method supports nonparametric distributions and has O(N*log(N)) algorithmic complexity.
    /// </summary>
    /// <param name="data">An array of double values</param>
    /// <param name="minDistance">Minimum distance between changepoints</param>
    /// <returns>
    /// Returns an `int[]` array with 0-based indexes of changepoints.
    /// Changepoints correspond to the ends of the detected segments.
    /// For example, the changepoints for { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2 } are { 5, 11 }.
    /// </returns>
    public int[] GetChangePointIndexes(double[] data, int minDistance = 1)
    {
        // We will use `n` as the number of elements in the `data` array
        int n = data.Length;

        // Checking corner cases
        if (n <= 2)
            return new int[0];
        if (minDistance < 1 || minDistance > n)
            throw new ArgumentOutOfRangeException(
                nameof(minDistance), $"{nameof(minDistance)} should be in range from 1 to data.Length, but was {minDistance}");

        // The penalty which we add to the final cost for each additional changepoint
        // Here we use the Modified Bayesian Information Criterion
        double penalty = 3 * Math.Log(n);

        // `k` is the number of quantiles that we use to approximate an integral during the segment cost evaluation
        // We use `k = Ceiling(4 * log(n))` as suggested in Section 4.3 "Choice of K in ED-PELT" in [Haynes2017]
        // `k` can't be greater than `n`, so we should always use the `Min` function here (important for n <= 8)
        int k = Math.Min(n, (int) Math.Ceiling(4 * Math.Log(n)));

        // We precalculate sums for the empirical CDF; this allows fast evaluation of the segment cost
        var partialSums = GetPartialSums(data, k);

        // Since we use the same values of `partialSums`, `k`, `n` all the time,
        // we introduce a shortcut `Cost(tau1, tau2)` for segment cost evaluation.
        // Hereinafter, we use `tau` to name variables that are changepoint candidates.
        double Cost(int tau1, int tau2) => GetSegmentCost(partialSums, tau1, tau2, k, n);

        // We will use dynamic programming to find the best solution; `bestCost` is the cost array.
        // `bestCost[i]` is the cost for the subarray `data[0..i-1]`.
        // It's a 1-based array (`data[0]`..`data[n-1]` correspond to `bestCost[1]`..`bestCost[n]`)
        var bestCost = new double[n + 1];
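        // `bestCost[0] = -penalty` compensates the `+penalty` that will be added for the first segment,
        // which is not preceded by a real changepoint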
        bestCost[0] = -penalty;
        for (int currentTau = minDistance; currentTau < 2 * minDistance; currentTau++)
            bestCost[currentTau] = Cost(0, currentTau);

        // `previousChangePointIndex` is an array of references to previous changepoints. If the current segment ends at
        // the position `i`, the previous segment ends at the position `previousChangePointIndex[i]`. It's a 1-based
        // array (`data[0]`..`data[n-1]` correspond to `previousChangePointIndex[1]`..`previousChangePointIndex[n]`)
        var previousChangePointIndex = new int[n + 1];

        // We use the PELT (Pruned Exact Linear Time) approach, which means that instead of enumerating all possible
        // previous tau values, we keep a whitelist of "good" tau values that can be used in the optimal solution. If we
        // are 100% sure that some of the tau values will not help us to form the optimal solution, such values should be
        // removed. See [Killick2012] for details.
        var previousTaus = new List<int>(n + 1) { 0, minDistance };
        var costForPreviousTau = new List<double>(n + 1);

        // Following the dynamic programming approach, we enumerate all tau positions. For each `currentTau`, we pretend
        // that it's the end of the last segment and try to find the end of the previous segment.
        for (int currentTau = 2 * minDistance; currentTau < n + 1; currentTau++)
        {
            // For each previous tau, we should calculate the cost of taking this tau as the end of the previous
            // segment. This cost equals the cost for the `previousTau` plus the cost of the new segment (from
            // `previousTau` to `currentTau`) plus the penalty for the new changepoint.
            costForPreviousTau.Clear();
            foreach (int previousTau in previousTaus)
                costForPreviousTau.Add(bestCost[previousTau] + Cost(previousTau, currentTau) + penalty);

            // Now we should choose the tau that provides the minimum possible cost.
            int bestPreviousTauIndex = WhichMin(costForPreviousTau);
            bestCost[currentTau] = costForPreviousTau[bestPreviousTauIndex];
            previousChangePointIndex[currentTau] = previousTaus[bestPreviousTauIndex];

            // Prune phase: we remove "useless" tau values that will not help to achieve the minimum cost in the future
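            // Per [Killick2012], a `tau` can be dropped once `bestCost[tau] + Cost(tau, currentTau) > bestCost[currentTau]`:
            // such a `tau` can never become the optimal end of the previous segment for any later position. Since
            // `costForPreviousTau[i] = bestCost[tau] + Cost(tau, currentTau) + penalty`, the keep-condition below
            // reads `costForPreviousTau[i] < currentBestCost + penalty`.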
            double currentBestCost = bestCost[currentTau];
            int newPreviousTausSize = 0;
            for (int i = 0; i < previousTaus.Count; i++)
                if (costForPreviousTau[i] < currentBestCost + penalty)
                    previousTaus[newPreviousTausSize++] = previousTaus[i];
            previousTaus.RemoveRange(newPreviousTausSize, previousTaus.Count - newPreviousTausSize);

            // We add a new tau value that is located at distance `minDistance` from the next `currentTau` value
            previousTaus.Add(currentTau - (minDistance - 1));
        }

        // Here we collect the result list of changepoint indexes `changePointIndexes` using `previousChangePointIndex`
        var changePointIndexes = new List<int>();
        int currentIndex = previousChangePointIndex[n]; // The index of the end of the last segment is `n`
        while (currentIndex != 0)
        {
            changePointIndexes.Add(currentIndex - 1); // 1-based indexes should be transformed to 0-based indexes
            currentIndex = previousChangePointIndex[currentIndex];
        }
        changePointIndexes.Reverse(); // The resulting changepoints should be sorted in ascending order.
        return changePointIndexes.ToArray();
    }
    /// <summary>
    /// Partial sums for the empirical CDF (formula (2.1) from Section 2.1 "Model" in [Haynes2017])
    /// <code>
    /// partialSums[i, tau] = (count(data[j] &lt; t) * 2 + count(data[j] == t) * 1) for j = 0..tau-1
    /// where t is the i-th quantile value (see Section 3.1 "Discrete approximation" in [Haynes2017] for details)
    /// </code>
    /// </summary>
    /// <remarks>
    /// <list type="bullet">
    /// <item>
    /// We use doubled sum values in order to use <c>int[,]</c> instead of <c>double[,]</c> (it provides a noticeable
    /// performance boost). Thus, the multipliers for <c>count(data[j] &lt; t)</c> and <c>count(data[j] == t)</c> are
    /// 2 and 1 instead of the 1 and 0.5 from [Haynes2017].
    /// </item>
    /// <item>
    /// Note that these quantiles are not uniformly distributed: the tails of the <c>data</c> distribution contain more
    /// quantile values than the center of the distribution.
    /// </item>
    /// </list>
    /// </remarks>
    private static int[,] GetPartialSums(double[] data, int k)
    {
        int n = data.Length;
        var partialSums = new int[k, n + 1];
        var sortedData = data.OrderBy(it => it).ToArray();

        for (int i = 0; i < k; i++)
        {
            double z = -1 + (2 * i + 1.0) / k; // Values from (-1 + 1/k) to (1 - 1/k) with step = 2/k
            double p = 1.0 / (1 + Math.Pow(2 * n - 1, -z)); // Values from 0.0 to 1.0
            double t = sortedData[(int) Math.Truncate((n - 1) * p)]; // Quantile value, formula (2.1) in [Haynes2017]

            for (int tau = 1; tau <= n; tau++)
            {
                partialSums[i, tau] = partialSums[i, tau - 1];
                if (data[tau - 1] < t)
                    partialSums[i, tau] += 2; // We use the doubled value (2) instead of the original 1.0
                if (data[tau - 1] == t)
                    partialSums[i, tau] += 1; // We use the doubled value (1) instead of the original 0.5
            }
        }
        return partialSums;
    }
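    // A hand-checked illustration of GetPartialSums: for data = { 3, 1, 2 } and k = 3
    // (so n = 3 and sortedData = { 1, 2, 3 }), the rows of partialSums are:
    //   i = 0: z = -2/3, p = 1 / (1 + 5^(2/3))  ≈ 0.255, t = sortedData[0] = 1  =>  row { 0, 0, 1, 1 }
    //   i = 1: z =  0,   p = 0.5,                        t = sortedData[1] = 2  =>  row { 0, 0, 2, 3 }
    //   i = 2: z = +2/3, p = 1 / (1 + 5^(-2/3)) ≈ 0.745, t = sortedData[1] = 2  =>  row { 0, 0, 2, 3 }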
    /// <summary>
    /// Calculates the cost of the (tau1; tau2] segment.
    /// </summary>
    private static double GetSegmentCost(int[,] partialSums, int tau1, int tau2, int k, int n)
    {
        double sum = 0;
        for (int i = 0; i < k; i++)
        {
            // actualSum is (count(data[j] < t) * 2 + count(data[j] == t) * 1) for j = tau1..tau2-1
            int actualSum = partialSums[i, tau2] - partialSums[i, tau1];

            // We skip these two cases (they correspond to fit = 0 or fit = 1) because of invalid Math.Log values
            if (actualSum != 0 && actualSum != (tau2 - tau1) * 2)
            {
                // Empirical CDF $\hat{F}_i(t)$ (Section 2.1 "Model" in [Haynes2017])
                double fit = actualSum * 0.5 / (tau2 - tau1);
                // Segment cost $\mathcal{L}_{np}$ (Section 2.2 "Nonparametric maximum likelihood" in [Haynes2017])
                double lnp = (tau2 - tau1) * (fit * Math.Log(fit) + (1 - fit) * Math.Log(1 - fit));
                sum += lnp;
            }
        }
        double c = -Math.Log(2 * n - 1); // Constant from Lemma 3.1 in [Haynes2017]
        return 2.0 * c / k * sum; // See Section 3.1 "Discrete approximation" in [Haynes2017]
    }
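    // Putting the pieces above together, the implemented approximation of the segment cost is
    //   $C(y_{(tau1+1):tau2}) = \frac{2c}{K} \sum_{i=1}^{K} (tau2 - tau1)
    //       \left[ \hat{F}_i \ln \hat{F}_i + (1 - \hat{F}_i) \ln(1 - \hat{F}_i) \right]$,
    // where $c = -\ln(2n - 1)$ (Lemma 3.1 and Section 3.1 in [Haynes2017]).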
    /// <summary>
    /// Returns the index of the minimum element.
    /// If there are several minimum elements in the given list, the index of the first one is returned.
    /// </summary>
    private static int WhichMin(IList<double> values)
    {
        if (values.Count == 0)
            throw new InvalidOperationException("The list should contain at least one element");

        double minValue = values[0];
        int minIndex = 0;
        for (int i = 1; i < values.Count; i++)
            if (values[i] < minValue)
            {
                minValue = values[i];
                minIndex = i;
            }
        return minIndex;
    }
}
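// Usage sketch (illustrative, not part of the detector itself): a minimal console entry point that runs
// the example series from the GetChangePointIndexes documentation. The class name `EdPeltDemo` is hypothetical;
// everything else uses the API defined above.
public static class EdPeltDemo
{
    public static void Main()
    {
        // Three homogeneous segments of six values each (the documentation example above)
        var data = new double[] { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2 };
        int[] changePoints = EdPeltChangePointDetector.Instance.GetChangePointIndexes(data, minDistance: 1);
        Console.WriteLine(string.Join(", ", changePoints)); // Expected output (per the docs above): 5, 11
    }
}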