Skip to content

Instantly share code, notes, and snippets.

@emrul
Created July 13, 2019 20:25
Show Gist options
  • Save emrul/42c6baf23eb16791c48e7226e616d116 to your computer and use it in GitHub Desktop.
Save emrul/42c6baf23eb16791c48e7226e616d116 to your computer and use it in GitHub Desktop.
C#/.NET implementation of P2 (P-Squared) algorithm for calculating percentiles on streaming data
using System;
using System.Collections.Generic;
/*
* References:
* Initial idea: https://www.cse.wustl.edu/~jain/papers/ftp/psqr.pdf
* Based on implementation: https://github.com/skycmoon/P2
*
* Author:
* Emrul Islam <emrul@emrul.com>
*
* Usage:
* Either initialise with a specified quantile:
* P2Engine p2Engine = new P2Engine(0.5);
* or a list of quantiles:
* P2Engine p2Engine = new P2Engine(new[] {0.25, 0.5, 0.75});
*
* Then for each data-point call the add method. E.g.:
* int SAMPLE_SIZE = 1000000;
* Random rnd = new Random(78651);
* double[] values = Enumerable.Repeat(0, SAMPLE_SIZE).Select(i=>rnd.NextDouble()).ToArray();
* foreach (double value in values)
* {
* p2Engine.add(value);
* }
*
* If you passed a single double to the constructor you can call result():
* p2Engine.result();
* // 0.499828816523464
* Or if you passed a list of doubles you must call result with a specific quantile:
* p2Engine.result(0.25);
* // 0.249839636176194
*/
public class P2Engine
{
private double[] q;
private double[] dn;
private double[] np;
private int[] n;
private int count = 0;
private int marker_count;
public P2Engine(double quantile) : this(new[] {quantile})
{
}
public P2Engine(ICollection<double> quantiles)
{
count = 0;
// We need len(quantiles) * 3 + 2 markers
// One marker for start, one for end, and each quantile we're interested in needs 3 markers.
marker_count = quantiles.Count * 3 + 2;
q = new double[marker_count];
dn = new double[marker_count];
np = new double[marker_count];
n = new int[marker_count];
dn[0] = 0; // First Marker is always 0
dn[marker_count-1] = 1; // Last marker is always 1
int pointer = 1;
// Add each quantile
foreach (double quantile in quantiles)
{
dn[pointer] = quantile;
dn[pointer + 1] = quantile / 2;
dn[pointer + 2] = (1 + quantile) / 2;
pointer += 3;
}
// Sort the markers
Array.Sort(dn);
for (int i = 0; i < marker_count; i++)
{
np[i] = (marker_count - 1) * dn[i] + 1;
}
}
private int sign(double d)
{
if (d >= 0)
{
return 1;
}
return -1;
}
private double parabolic(int i, int d)
{
return q[i] + d / ((n[i + 1] - n[i - 1]) *
((n[i] - n[i - 1] + d) * ((q[i + 1] - q[i]) / (n[i + 1] - n[i])) +
(n[i + 1] - (n[i] - d)) * ((q[i] - q[i - 1]) / (n[i] - n[i - 1]))));
}
private double linear(int i, int d)
{
return q[i] + d * ((q[i + d] - q[i]) / (n[i + d] - n[i]));
}
public void add(double data)
{
int i;
int k = 0;
if (count >= marker_count)
{
count++;
// B1
if (data < q[0])
{
q[0] = data;
k = 1;
}
else if (data >= q[marker_count - 1])
{
q[marker_count - 1] = data;
k = marker_count - 1;
}
else
{
for (i = 1; i < marker_count; i++)
{
if (!(data < q[i])) continue;
k = i;
break;
}
}
// B2
for (i = k; i < marker_count; i++)
{
n[i]++;
np[i] = np[i] + dn[i];
}
for (i = 0; i < k; i++)
{
np[i] = np[i] + dn[i];
}
// B3
for (i = 1; i < marker_count - 1; i++)
{
var d = np[i] - n[i];
if ((!(d >= 1) || n[i + 1] - n[i] <= 1) && (!(d <= -1) || n[i - 1] - n[i] >= -1)) continue;
var newq = parabolic(i, sign(d));
if (q[i - 1] < newq && newq < q[i + 1])
{
q[i] = newq;
}
else
{
q[i] = linear(i, sign(d));
}
n[i] = n[i] + sign(d);
}
}
else
{
q[count] = data;
count++;
if (count != marker_count) return;
// We have enough to start the algorithm, initialize
Array.Sort(q);
//bubbleSort(q, marker_count);
for (i = 0; i < marker_count; i++)
{
n[i] = i + 1;
}
}
}
public double result()
{
if (marker_count != 5)
{
// result() NoArg method will only work as long as only 1 quantile was specified during class instantiation.
throw new Exception("result() NoArg method will only work as long as only 1 quantile was specified during class instantiation.");
}
return result(dn[(marker_count - 1) / 2]);
}
// ReSharper disable once MemberCanBePrivate.Global
public double result(double quantile)
{
if (count < marker_count)
{
int closest = 1;
Array.Sort(q);
//bubbleSort(q, count);
for (int i = 2; i < count; i++)
{
if (Math.Abs((double) i / count - quantile) < Math.Abs((double) closest / marker_count - quantile))
{
closest = i;
}
}
return q[closest];
}
else
{
// Figure out which quantile is the one we're looking for by nearest dn
int closest = 1;
for (int i = 2; i < marker_count - 1; i++)
{
if (Math.Abs(dn[i] - quantile) < Math.Abs(dn[closest] - quantile))
{
closest = i;
}
}
return q[closest];
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment