Skip to content

Instantly share code, notes, and snippets.

@christerswahn
Created June 1, 2012 18:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save christerswahn/2854260 to your computer and use it in GitHub Desktop.
Save christerswahn/2854260 to your computer and use it in GitHub Desktop.
A small set of classes that enable efficient collection of data samples and provide statistical measures on them.
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
/** Non-instantiable class containing SampleSet factories and implementations.
 * <P>
 * It also hosts the decimal rounding helpers {@link #roundDec(double, int)} and
 * {@link #getDecMagnitude(double)} that the bucket implementations use.
 */
public final class Samples {

    /** Prevents instantiation; this class only exposes static members. */
    private Samples() {
    }

    /** Gets the IntBucketSampleSet factory singleton.
     * @return the IntBucketSampleSet SampleSetFactory singleton
     */
    public static SampleSetFactory getIntBucketSampleSetFactory() {
        return intBucketSampleSetFactory;
    }

    /** Creates a factory of DecBucketSampleSet instances.
     * Unlike {@link #getIntBucketSampleSetFactory()} this is not a singleton: every call
     * returns a new factory carrying the supplied configuration.
     *
     * @param minOrderOfMagnitude bucket values whose absolute value is below
     *        10^minOrderOfMagnitude are counted in the zero bucket
     * @param decimalPrecision the number of significant decimal digits of the bucket values,
     *        must be equal to or greater than 1
     * @return a new DecBucketSampleSet SampleSetFactory
     */
    public static SampleSetFactory getDecBucketSampleSetFactory(int minOrderOfMagnitude, int decimalPrecision) {
        return new DecBucketSampleSetFactory(minOrderOfMagnitude, decimalPrecision);
    }

    /** the IntBucketSampleSet factory singleton */
    private static final SampleSetFactory intBucketSampleSetFactory = new SampleSetFactory() {
        @Override
        public SampleSet makeSampleSet() {
            return new IntBucketSampleCounter();
        }
    };

    /** Factory that creates DecBucketSampleCounter instances sharing one configuration. */
    private static final class DecBucketSampleSetFactory implements SampleSetFactory {
        private final int minOrderOfMagnitude;
        private final int decimalPrecision;

        private DecBucketSampleSetFactory(int minOrderOfMagnitude, int decimalPrecision) {
            this.minOrderOfMagnitude = minOrderOfMagnitude;
            this.decimalPrecision = decimalPrecision;
        }

        @Override
        public SampleSet makeSampleSet() {
            return new DecBucketSampleCounter(minOrderOfMagnitude, decimalPrecision);
        }
    }

    /** General abstract implementation of SampleSet that uses buckets to store the samples.
     * Each bucket represents a value interval and counts the number of samples
     * that have fallen within that interval.
     * With this approach the memory size grows with the order of magnitude between the smallest
     * and the largest data point, but not with the number of data points.
     * <P>
     * Count, sum, min, max and mean are tracked exactly; median, approximate mean and
     * standard deviation are derived from the bucket values and are therefore approximations.
     * <P>
     * Note: This class is not thread-safe.
     *
     * @author Christer Swahn
     */
    static abstract class BucketSampleCounter implements SampleSet {
        /** The buckets, keyed and sorted by their median value. */
        private final SortedMap<Double,Counter> buckets = new TreeMap<Double,Counter>();
        private int totalCount = 0;
        double minSampleValue = Double.MAX_VALUE;
        double maxSampleValue = -Double.MAX_VALUE;
        private double totalSum = 0;
        private int maxCounter = 0;
        // Lazily calculated values; reset to null whenever a sample is put.
        private Double cachedApproxMean = null;
        private Double cachedMedian = null;
        private Double cachedStdDev = null;

        protected BucketSampleCounter() {
        }

        /** Returns the number of sample buckets currently held. */
        public int getBucketCount() {
            return buckets.size();
        }

        /** Returns the count value of the highest bucket counter. */
        protected int getMaxCounter() {
            return maxCounter;
        }

        @Override
        public int getCount() {
            return totalCount;
        }

        @Override
        public double getSum() {
            return totalSum;
        }

        /** Gets the smallest sample value put into this set, or 0 if the set is empty. */
        @Override
        public double getMin() {
            if (totalCount == 0) {
                return 0;
            }
            return minSampleValue;
        }

        /** Gets the largest sample value put into this set, or 0 if the set is empty. */
        @Override
        public double getMax() {
            if (totalCount == 0) {
                return 0;
            }
            return maxSampleValue;
        }

        /** Puts a sample into this bucket sample counter.
         * @param sampleValue the sample value to insert
         */
        @Override
        public void putSample(double sampleValue) {
            Double bucketMedian = getBucketMedian(sampleValue);
            Counter counter = buckets.get(bucketMedian);
            if (counter == null) {
                counter = new Counter();
                buckets.put(bucketMedian, counter);
            }
            counter.count++;
            maxCounter = Math.max(maxCounter, counter.count);
            totalCount++;
            totalSum += sampleValue;
            minSampleValue = Math.min(minSampleValue, sampleValue);
            maxSampleValue = Math.max(maxSampleValue, sampleValue);
            // clear cached calculated values:
            cachedApproxMean = null;
            cachedMedian = null;
            cachedStdDev = null;
        }

        /** Gets the median value of the bucket that the specified sample value is put into.
         * This is the middle value of the bucket's value range, e.g. if the bucket is for
         * samples with values between 1 and 3, the bucket's median is 2.
         * <P>
         * This method must be implemented by concrete subclasses.
         *
         * @param sampleValue the sample value to assign a bucket
         * @return the median value of the sample value's bucket
         */
        protected abstract double getBucketMedian(double sampleValue);

        /** Gets the approximate mean (average) of this sample bucket set, calculated
         * from the bucket values rather than from the exact sample values.
         * <P>
         * Note that the returned value will probably differ from the true mean since this is
         * an approximating sample set.
         * @return the approximate mean, or 0 if the set is empty
         * @see #getMean()
         */
        public double getApproxMean() {
            if (cachedApproxMean == null) {
                cachedApproxMean = calcApproxMean();
            }
            return cachedApproxMean;
        }

        /** Gets the true mean (average) of this sample bucket set.
         * The return value is equal to getSum()/getCount() if getCount()>0, otherwise 0.
         */
        @Override
        public double getMean() {
            if (totalCount == 0) {
                return 0;
            }
            return totalSum / totalCount;
        }

        /** Gets the median of this sample bucket set, which is an approximation of the true median. */
        @Override
        public double getMedian() {
            if (cachedMedian == null) {
                cachedMedian = calcMedian();
            }
            return cachedMedian;
        }

        /** Gets the so-called sample standard deviation of this sample bucket set,
         * which is an approximation of the true sample standard deviation. */
        @Override
        public double getStdDev() {
            if (cachedStdDev == null) {
                cachedStdDev = calcStdDev();
            }
            return cachedStdDev;
        }

        /** Calculates the bucket-weighted (approximate) mean of this sample bucket set. */
        private double calcApproxMean() {
            if (totalCount == 0) {
                return 0;
            }
            double sum = 0;
            for (Map.Entry<Double,Counter> e : buckets.entrySet()) {
                sum += e.getKey() * e.getValue().count;
            }
            return sum / totalCount;
        }

        /** Calculates the median of this sample bucket set.
         * The result is the value of the bucket holding the middle sample; for an even
         * number of samples the lower of the two middle samples is used.
         */
        private double calcMedian() {
            if (totalCount == 0) {
                return 0;
            }
            long traversedSamplesCount = 0;
            for (Map.Entry<Double,Counter> e : buckets.entrySet()) {
                traversedSamplesCount += e.getValue().count;
                // The middle sample is number ceil(totalCount/2) in sorted order. Comparing the
                // doubled count avoids the truncating integer division, which picked a bucket
                // too low for odd sample counts (e.g. 1 instead of 2 for the samples 1, 2, 3).
                if (2 * traversedSamplesCount >= totalCount) {
                    return e.getKey();
                }
            }
            // The message must not include 'this': toString() calls getMedian() and would recurse.
            assert false : "Program error finding median: totalCount=" + totalCount
                    + ", bucketCount=" + buckets.size();
            return 0; // shouldn't happen
        }

        /** Calculates the so-called sample standard deviation of this sample bucket set,
         * using the bucket values' deviations from the true mean. */
        private double calcStdDev() {
            if (totalCount <= 1) {
                return 0;
            }
            double mean = getMean();
            double varianceSum = 0;
            for (Map.Entry<Double,Counter> e : buckets.entrySet()) {
                double diff = e.getKey() - mean;
                varianceSum += (diff * diff) * e.getValue().count;
            }
            // (For a "population standard deviation", i.e. if the sample values were the complete value population,
            // the denominator would not have been subtracted by 1.)
            int denom = totalCount - 1;
            return Math.sqrt(varianceSum / denom);
        }

        /** Returns one line per bucket of the form "bucket value: count", in ascending
         * bucket order, with the columns right-aligned.
         * @return the formatted bucket lines; an empty array if no samples have been put
         */
        @Override
        public String[] dumpData() {
            String[] dump = new String[buckets.size()];
            if (buckets.isEmpty()) {
                return dump;
            }
            final int bucketP = 1; // number of fraction digits printed for the bucket value

            // Size the bucket column after the widest printed key. Both ends of the key range
            // are considered since a negative key may have the largest magnitude.
            double lowestKey = buckets.firstKey();
            double largestAbsKey = Math.max(Math.abs(lowestKey), Math.abs(buckets.lastKey()));
            // At least one integer digit is always printed ("0.0"); getDecMagnitude() is
            // zero or negative below 1 and not meaningful for zero or infinity, which
            // previously produced an invalid (non-positive or overflowed) format width.
            int intDigits = 1;
            if (largestAbsKey >= 1 && !Double.isInfinite(largestAbsKey)) {
                intDigits = getDecMagnitude(largestAbsKey);
            }
            // take into account: a delimiter for every 3 digits, decimal separator, and fraction digits
            int bucketW = intDigits + ((intDigits-1) / 3) + 1 + bucketP;
            if (lowestKey < 0) {
                bucketW++; // room for the minus sign
            }

            int counterW = getDecMagnitude(maxCounter); // maxCounter >= 1 since there is at least one bucket
            counterW = counterW + ((counterW-1) / 3); // take into account: a delimiter for every 3 digits

            String format = "%," + bucketW + "." + bucketP + "f: %," + counterW + "d";
            int b = 0;
            for (Map.Entry<Double,Counter> e : buckets.entrySet()) {
                dump[b++] = String.format(format, e.getKey(), e.getValue().count);
            }
            return dump;
        }

        @Override
        public String toString() {
            return String.format("A=%.1f xA=%.1f SD=%.1f Md=%.1f Mi=%.1f Ma=%.1f T=%.1f C=%d bc=%d",
                    getMean(), getApproxMean(), getStdDev(), getMedian(),
                    getMin(), getMax(), getSum(), getCount(), getBucketCount());
        }

        /** Wrapper class to handle a mutable int in a collection. */
        private static final class Counter {
            public int count = 0;

            @Override
            public String toString() {
                return String.valueOf(count);
            }
        }
    }

    /** A simple BucketSampleCounter where each bucket represents the interval of width 1
     * around an integer, i.e. the sample value rounded to the closest integer
     * (using {@link Math#rint(double)}, which rounds exact halves to the even integer).
     */
    static class IntBucketSampleCounter extends BucketSampleCounter {
        @Override
        protected double getBucketMedian(double sampleValue) {
            return Math.rint(sampleValue);
        }
    }

    /** A BucketSampleCounter where each bucket represents a value interval with
     * a given decimal precision.
     * For example, with a decimal precision of 2 (the default), for each decimal
     * order of magnitude (e.g. between 1.0 and 9.9, between 10 and 99 and so on)
     * up to 90 intervals (buckets) are stored.
     * <P>
     * Bucket values whose absolute value is below 10^minOrderOfMagnitude are
     * counted in the zero bucket.
     */
    static class DecBucketSampleCounter extends BucketSampleCounter {
        @SuppressWarnings("unused")
        private final int minOrderOfMagnitude;
        private final int decimalPrecision;
        /** Bucket values with an absolute value below this threshold are mapped to 0. */
        private final double minAbsValue;

        /** Creates a DecBucketSampleCounter with a minimum order of magnitude of 0
         * and a decimal precision of 2. */
        DecBucketSampleCounter() {
            this(0);
        }

        /** Creates a DecBucketSampleCounter with the specified minimum order of magnitude
         * and a decimal precision of 2. */
        DecBucketSampleCounter(int minOrderOfMagnitude) {
            this(minOrderOfMagnitude, 2);
        }

        /** Creates a DecBucketSampleCounter with the specified minimum order of magnitude
         * and decimal precision.
         * @param minOrderOfMagnitude bucket values below 10^minOrderOfMagnitude (in absolute
         *        value) are counted in the zero bucket
         * @param decimalPrecision the number of significant decimal digits of the bucket values
         */
        DecBucketSampleCounter(int minOrderOfMagnitude, int decimalPrecision) {
            this.minOrderOfMagnitude = minOrderOfMagnitude;
            this.decimalPrecision = decimalPrecision;
            this.minAbsValue = Math.pow(10, minOrderOfMagnitude);
        }

        @Override
        protected double getBucketMedian(double sampleValue) {
            double median = roundDec(sampleValue, decimalPrecision);
            if (Math.abs(median) < minAbsValue) {
                return 0;
            }
            return median;
        }
    }

    /** Rounds a double value to the closest double value with the specified
     * number of significant decimal digits. E.g:
     * <UL>
     * <LI>roundDec(111, 1) -> 100
     * <LI>roundDec(0.111, 1) -> 0.1
     * <LI>roundDec(111, 2) -> 110
     * <LI>roundDec(111, 3) -> 111
     * <LI>roundDec(111, 4) -> 111
     * <LI>roundDec(-111, 2) -> -110
     * </UL>
     *
     * @param value the value to round
     * @param digits the number of significant decimal digits, must be equal to or greater than 1
     *        (only checked when assertions are enabled)
     * @return the rounded value; 0 if the absolute value is smaller than Double.MIN_NORMAL
     */
    public static final double roundDec(double value, int digits) {
        assert digits > 0 : "digits less than 1: " + digits;
        if (Math.abs(value) < Double.MIN_NORMAL) {
            return 0; // value is equal to or very close to zero
        }
        int mag = getDecMagnitude(value);
        // The value of one unit in the least significant digit that is kept.
        double precisionScale = Math.pow(10, mag-digits);
        return Math.rint(value / precisionScale) * precisionScale;
    }

    /** Gets the decimal order of magnitude of a value. This represents the position
     * of the most significant digit in the decimal representation of the value.
     * (Note that the decimal exponent corresponding to the value's magnitude
     * equals the result of this method minus 1.)
     * The value must not be zero; no exception is thrown for zero but the result
     * is not meaningful.
     * The result is the same regardless of the sign of the passed value.
     * E.g:
     * <UL>
     * <LI> 0.01 -> -1
     * <LI> 0.011-> -1
     * <LI> 0.09 -> -1
     * <LI> 0.1 -> 0
     * <LI> 0.11 -> 0
     * <LI> 0.9 -> 0
     * <LI> 1 -> 1
     * <LI> 2 -> 1
     * <LI> 9 -> 1
     * <LI> 10 -> 2
     * <LI> 11 -> 2
     * <LI> 99 -> 2
     * <LI> 100 -> 3
     * <LI> 101 -> 3
     * </UL>
     *
     * @param value a non-zero value
     * @return the decimal order of magnitude of the value
     */
    public static final int getDecMagnitude(double value) {
        return ((int) Math.floor(Math.log10(Math.abs(value)))) + 1;
    }
}
/** Collects a set of data samples and exposes statistical measures computed over them.
 * <P>
 * General contract: as long as no samples have been inserted, the methods shall not
 * return any value other than 0. Implementing classes may instead choose to throw
 * IllegalStateException in such an illegal use case.
 * <P>
 * Implementing classes are not generally required to be thread-safe.
 */
public interface SampleSet {

    /** Adds a sample to this sample set.
     * @param sampleValue the sample value to insert
     */
    void putSample(double sampleValue);

    /** Gets the total number of samples put into this sample set.
     * @return the number of samples inserted so far
     */
    int getCount();

    /** Gets the minimum sample value put into this sample set.
     * @return the smallest sample value
     */
    double getMin();

    /** Gets the maximum sample value put into this sample set.
     * @return the largest sample value
     */
    double getMax();

    /** Gets the total sum of the sample values put into this sample set.
     * @return the sum of all sample values
     */
    double getSum();

    /** Gets the mean (average) of this sample set.
     * @return the mean of the sample values
     */
    double getMean();

    /** Gets the median of this sample set.
     * Certain implementations may produce an approximate value.
     * @return the median of the sample values
     */
    double getMedian();

    /** Gets the so-called sample standard deviation of this sample set.
     * Certain implementations may produce an approximate value.
     * @return the sample standard deviation of the sample values
     */
    double getStdDev();

    /** Returns a data dump of this sample set suitable for console output.
     * @return the dump lines
     */
    String[] dumpData();
}
/** A factory of SampleSet instances. */
public interface SampleSetFactory {

    /** Creates a new SampleSet.
     * @return a newly created SampleSet instance
     */
    SampleSet makeSampleSet();
}
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
/** The statistics compiler.
 * The static methods are convenient short-hands for obtaining statistics compilers
 * that are kept in a shared registry.
 * <P>
 * Typically one instance is created per measurement area, and samples are put to it
 * for each of the area's metrics, using a unique label per metric.
 * <P>
 * Usage example:
 * <P>
<code><pre>
public class IOPacker {
    public static final StatCompiler STAT = StatCompiler.getStatCompiler(IOPacker.class);
    ...
    public void pack() {
        long startTime = System.nanoTime();
        ...
        STAT.putSample("Pack time [ms]", (System.nanoTime()-startTime)/1e6);
    }
}
</pre></code>
 * <P>
 * The compiled statistics can be logged with the following simple statement,
 * for example just before the program exits:
 * <P>
<code>
StatCompiler.logAllStats(LOG, Level.INFO);
</code>
 * <P>
 * The static methods are thread-safe.
 * <P>
 * Instances of StatCompiler are not thread-safe.
 *
 */
public class StatCompiler {

    /*--- class members ---*/

    /** Interface of a StatisticsFormatter, which renders a SampleSet as text. */
    public static interface StatisticsFormatter {
        public String toString(SampleSet sampleSet);
    }

    /** The default StatisticsFormatter used. */
    private static final class DefaultStatFormatter implements StatisticsFormatter {
        @Override
        public String toString(SampleSet sampleSet) {
            return String.format(
                    "sum: %,7.0f; avg: %,5.1f (sd +- %,.1f);\tcount: %,d; min<med<max: %,5.1f < %,5.1f < %,5.1f",
                    sampleSet.getSum(),
                    sampleSet.getMean(),
                    sampleSet.getStdDev(),
                    sampleSet.getCount(),
                    sampleSet.getMin(),
                    sampleSet.getMedian(),
                    sampleSet.getMax());
        }
    }

    private static final StatisticsFormatter defaultStatFormatter = new DefaultStatFormatter();

    /** Factory handed to compilers created by getStatCompiler(); accessed while holding the 'instances' lock. */
    private static SampleSetFactory defaultSampleSetFactory = Samples.getDecBucketSampleSetFactory(-1, 2);

    /** Registry of the compilers created by getStatCompiler(), in creation order; also used as lock. */
    private static final Map<String,StatCompiler> instances = new LinkedHashMap<String,StatCompiler>();

    /** Sets the SampleSetFactory to be used by default by the getStatCompiler() methods.
     * @param factory the SampleSetFactory to use
     */
    public static void setDefaultSampleSetFactory(SampleSetFactory factory) {
        synchronized (instances) {
            defaultSampleSetFactory = factory;
        }
    }

    /** Gets a StatCompiler that shall be registered (associated) with the specified class.
     * The class' simple name is used as the registration name.
     * This method is thread-safe (though the returned StatCompiler instance is not).
     * @param clazz the class to associate the StatCompiler with
     * @return a new or previously existing StatCompiler
     */
    public static StatCompiler getStatCompiler(Class<?> clazz) {
        String registrationName = clazz.getSimpleName();
        return getStatCompiler(registrationName);
    }

    /** Gets a StatCompiler that shall be registered (associated) with the specified name.
     * A compiler created by this method uses the current default SampleSetFactory and
     * the default StatisticsFormatter.
     * This method is thread-safe (though the returned StatCompiler instance is not).
     * @param name the name to associate the StatCompiler with
     * @return a new or previously existing StatCompiler
     */
    public static StatCompiler getStatCompiler(String name) {
        synchronized (instances) {
            StatCompiler registered = instances.get(name);
            if (registered != null) {
                return registered;
            }
            StatCompiler created = new StatCompiler(name, defaultSampleSetFactory);
            created.setStatFormatter(defaultStatFormatter);
            instances.put(name, created);
            return created;
        }
    }

    /** Outputs all the registered StatCompiler instances' statistics via the provided logger.
     * @param outputLog the logger to use for the output
     * @param logLevel the logging level to use for the output
     */
    public static void logAllStats(Logger outputLog, Level logLevel) {
        // Take a snapshot under the lock; the logging itself runs outside of it.
        List<StatCompiler> snapshot = new ArrayList<StatCompiler>();
        synchronized (instances) {
            snapshot.addAll(instances.values());
        }
        for (StatCompiler compiler : snapshot) {
            compiler.logStats(outputLog, logLevel);
        }
    }

    /*--- instance members ---*/

    private final String name;
    private final SampleSetFactory sampleSetFactory;
    /** The sample set of each metric, keyed by metric label, in order of first use. */
    private final Map<String,SampleSet> sampleSets = new LinkedHashMap<String,SampleSet>();
    private StatisticsFormatter statFormatter;
    private boolean printDataDump = false;

    /** Creates a new StatCompiler instance.
     * @param name the name of this StatCompiler
     * @param sampleSetFactory the factory it will use to create a sample set for each metric
     */
    public StatCompiler(String name, SampleSetFactory sampleSetFactory) {
        this.name = name;
        this.sampleSetFactory = sampleSetFactory;
    }

    /** Gets the name of this StatCompiler. */
    public String getName() {
        return name;
    }

    /** Gets the SampleSetFactory this StatCompiler uses. */
    public SampleSetFactory getSampleSetFactory() {
        return sampleSetFactory;
    }

    /** Gets the StatisticsFormatter this StatCompiler uses. */
    public StatisticsFormatter getStatFormatter() {
        return statFormatter;
    }

    /** Sets the StatisticsFormatter this StatCompiler uses. */
    public void setStatFormatter(StatisticsFormatter statPrinter) {
        this.statFormatter = statPrinter;
    }

    /** Returns true if print-data-dump is enabled for this StatCompiler.
     * If enabled, the sample sets' detailed data will be output along with the
     * compiled statistics. This is disabled by default.
     */
    public boolean isPrintDataDumpEnabled() {
        return printDataDump;
    }

    /** Sets print-data-dump to be enabled or disabled for this StatCompiler.
     * If enabled, the sample sets' detailed data will be output along with the
     * compiled statistics. This is disabled by default.
     */
    public void setPrintDataDumpEnabled(boolean printDataDump) {
        this.printDataDump = printDataDump;
    }

    /** Puts a new data sample to this statistics compiler.
     * The sample set of a metric is created the first time its label is used.
     * @param label the label of the sampled metric
     * @param value the sample value
     */
    public void putSample(String label, double value) {
        SampleSet metricSamples = sampleSets.get(label);
        if (metricSamples == null) {
            metricSamples = sampleSetFactory.makeSampleSet();
            sampleSets.put(label, metricSamples);
        }
        metricSamples.putSample(value);
    }

    /** Outputs the statistics of this StatCompiler to the provided Logger using the specified log level.
     * One row is logged per metric, followed by its data dump rows if print-data-dump is enabled.
     * @param outputLog the logger to use for the output
     * @param logLevel the logging level to use for the output
     */
    public void logStats(Logger outputLog, Level logLevel) {
        final String compilerTag = "[" + name + "] ";
        for (Map.Entry<String,SampleSet> metric : sampleSets.entrySet()) {
            final String rowHeader = compilerTag + metric.getKey() + ": ";
            outputLog.log(logLevel, rowHeader + formatStats(metric.getValue()));
            if (!printDataDump) {
                continue;
            }
            for (String dumpRow : metric.getValue().dumpData()) {
                outputLog.log(logLevel, rowHeader + dumpRow);
            }
        }
    }

    /** Outputs the statistics of this StatCompiler to the provided stream.
     * One row is printed per metric, followed by its data dump rows if print-data-dump is enabled.
     * @param out the PrintStream to print the output to (e.g. System.out)
     */
    public void printStats(PrintStream out) {
        final String compilerTag = "[" + name + "] ";
        for (Map.Entry<String,SampleSet> metric : sampleSets.entrySet()) {
            final String rowHeader = compilerTag + metric.getKey() + ": ";
            out.println(rowHeader + formatStats(metric.getValue()));
            if (!printDataDump) {
                continue;
            }
            for (String dumpRow : metric.getValue().dumpData()) {
                out.println(rowHeader + dumpRow);
            }
        }
    }

    /** Returns a string representation of the provided SampleSet suitable for console display.
     * If a StatisticsFormatter is set, it is used to produce the output. Otherwise the sample set's
     * toString() method is used.
     * This method may be overridden by subclasses to customize the output.
     * @param sampleSet the sample set to print
     * @return a console-friendly string representation of the provided sample set
     */
    public String formatStats(SampleSet sampleSet) {
        return (statFormatter == null)
                ? sampleSet.toString()
                : statFormatter.toString(sampleSet);
    }

    @Override
    public String toString() {
        return getClass().getSimpleName() + "@" + name;
    }

    /** The hash code is based on the name only, consistent with equals(). */
    @Override
    public int hashCode() {
        int nameHash = (name == null) ? 0 : name.hashCode();
        return 31 + nameHash;
    }

    /** Two StatCompilers are equal if they are of the same class and have equal names. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        StatCompiler that = (StatCompiler) obj;
        return (name == null) ? (that.name == null) : name.equals(that.name);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment