lifey/LongAdderBench.java

## LongAdderBench.java
/*
Welcome to the game show "lies and benchmarks".

We will present the participants a required result, and they would build a
benchmark that reaches the predetermined conclusions.

When done professionally, this activity is called "benchmarketing".

Our task for today is to prove LongAdder is slower than AtomicLong. Our
observation is, LongAdder prevents write contention to a single a atomic
variable by spreading the writes to more than one variables. As a result
the read activity is reading from a few fields hence costlier.

We will therefor construct a benchmark that mostly reads, and sometimes writes.

The benchmark need to be tunable, we shall therefor play with the number until
we receive the required results.

Indeed with 16 threads each performs and 100K reads/writes for each thread,
we're getting better result for AtomicLong, than LongAdder.

Of course, with 90% writes, LongAdder is much faster than AtomicLong.

Note that for performing real benchmarks, one have to understand the underlying
architecture, as well as the tested algorithm. It is easy to make fatal flaws
and measure the wrong thing entirely. This is relevant however for actionable
benchmarks you are expected to derive interesting conclusions from. Once you
internalize the fact that the benchmarks are not a tool that supplies
information to you, but you are the one who decides the benchmark's result, you
can neglect all the benchmarking methodology, and just do something that looks
reasonable.

If you made an error that made your benchmark look good - go post it in your
blog before anyone notice the error.

Indeed the presented benchmarks are probably faulty. I made sure the JIT is warm
with the primitive technique of running the benchmark function 100 times and
picking the minimal result, assuming that the minimal result is JIT optimized.
It may very well be that I'm actually measuring the performance of the Java's for
loop. Wrong or not, the graphs would still look very good on the blog post.

An example run looks like (you must use Java 8 of course).
$ javac *.java
$ java -cp . LongAdderBench
for  1% writes=16,000 (16 threads * 100,000 ops each=1,600,000):
  LongAdder  1.518ms
  AtomicLong 0.319ms
AtomicLong faster than LongAdder by 78.99%

for 90% writes=1,440,000 (16 threads * 100,000 ops each=1,600,000):
  LongAdder  3.258ms
  AtomicLong 28.892ms
LongAdder faster than AtomicLong by 786.80%

 */
import java.util.Random;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.LongAdder;

public class LongAdderBench {

    public static final int THREADS = 16;
    public static final int OPS = 1_00_000;
    private static final int MIN_BATCH = (int) (OPS * 0.01);

    public static void main(String[] args) throws Exception {
        adderVsAtomic(0.01d);
        adderVsAtomic(0.90d);
    }

    public static void adderVsAtomic(double percent_write) throws Exception {
        long minAdder = Long.MAX_VALUE, minAtomic = Long.MAX_VALUE;
        for (int i=0; i<100; i++) {
            long atomic = run(false, percent_write);
            long adder = run(true, percent_write);
            minAdder = Math.min(adder, minAdder);
            minAtomic = Math.min(atomic, minAtomic);
        }
        System.out.printf("for %2.0f%% writes=%,.0f (%d threads * %,d ops each=%,d):\n", 100d*percent_write,
                OPS *THREADS*percent_write, THREADS, OPS, OPS *THREADS);
        String adderName = "LongAdder";
        String atomicName = "AtomicLong";
        System.out.printf("  %-10s %,.3fms\n", adderName, minAdder/1_000_000d);
        System.out.printf("  %-10s %,.3fms\n", atomicName, minAtomic/1_000_000d);
        double delta = (double) minAdder - minAtomic;
        String faster = atomicName, slower = adderName;
        if (delta<0) {
            faster = adderName;
            slower = atomicName;
            delta = Math.abs(delta);
        }
        System.out.printf("%s faster than %s by %.2f%%\n\n", faster, slower, 100d*(delta /minAdder));
    }

    public static long run(boolean useAdder, double percent_write) throws Exception {
        ExecutorService svc = Executors.newFixedThreadPool(THREADS);
        final LongAdder adder = new LongAdder();
        final AtomicLong atomic = new AtomicLong();
        CountDownLatch latch = new CountDownLatch(1);
        for (int i = 0; i < THREADS; i++) {
            svc.submit(() -> {
                latch.await();
                int count = 0;
                Random rand = new Random();
                while (count < OPS) {
                    int batch = rand.nextInt(MIN_BATCH * 2) + MIN_BATCH;
                    int nwrites = (int) (batch * percent_write);
                    if (useAdder) {
                        count = writeAdder(adder, count, nwrites);
                        count = readAdder(adder, count, batch - nwrites);
                    } else {
                        count = writeAtomic(atomic, count, nwrites);
                        count = readAtomic(atomic, count, batch - nwrites);
                    }
                }
                return null;
            });
        }
        long start = System.nanoTime();
        latch.countDown();
        svc.shutdown();
        svc.awaitTermination(1, TimeUnit.MINUTES);
        return System.nanoTime()-start;
    }


    private static int readAdder(LongAdder adder, int count, int limit) {
        int i=count;
        for (; i < count+limit; i++) {
            adder.sum();
        }
        return i;
    }

    private static int writeAdder(LongAdder adder, int count, int limit) {
        int i=count;
        for (; i < count+limit; i++) {
            adder.add(1);
        }
        return i;
    }

    private static int readAtomic(AtomicLong adder, int count, int limit) {
        int i=count;
        for (; i < count+limit; i++) {
            adder.get();
        }
        return i;
    }

    private static int writeAtomic(AtomicLong adder, int count, int limit) {
        int i=count;
        for (; i < count+limit; i++) {
            adder.addAndGet(1);
        }
        return i;
    }
}
	/*
	Welcome to the game show "lies and benchmarks".

	We will present the participants a required result, and they would build a
	benchmark that reaches the predetermined conclusions.

	When done professionally, this activity is called "benchmarketing".

	Our task for today is to prove LongAdder is slower than AtomicLong. Our
	observation is, LongAdder prevents write contention to a single a atomic
	variable by spreading the writes to more than one variables. As a result
	the read activity is reading from a few fields hence costlier.

	We will therefor construct a benchmark that mostly reads, and sometimes writes.

	The benchmark need to be tunable, we shall therefor play with the number until
	we receive the required results.

	Indeed with 16 threads each performs and 100K reads/writes for each thread,
	we're getting better result for AtomicLong, than LongAdder.

	Of course, with 90% writes, LongAdder is much faster than AtomicLong.

	Note that for performing real benchmarks, one have to understand the underlying
	architecture, as well as the tested algorithm. It is easy to make fatal flaws
	and measure the wrong thing entirely. This is relevant however for actionable
	benchmarks you are expected to derive interesting conclusions from. Once you
	internalize the fact that the benchmarks are not a tool that supplies
	information to you, but you are the one who decides the benchmark's result, you
	can neglect all the benchmarking methodology, and just do something that looks
	reasonable.

	If you made an error that made your benchmark look good - go post it in your
	blog before anyone notice the error.

	Indeed the presented benchmarks are probably faulty. I made sure the JIT is warm
	with the primitive technique of running the benchmark function 100 times and
	picking the minimal result, assuming that the minimal result is JIT optimized.
	It may very well be that I'm actually measuring the performance of the Java's for
	loop. Wrong or not, the graphs would still look very good on the blog post.

	An example run looks like (you must use Java 8 of course).
	$ javac *.java
	$ java -cp . LongAdderBench
	for 1% writes=16,000 (16 threads * 100,000 ops each=1,600,000):
	LongAdder 1.518ms
	AtomicLong 0.319ms
	AtomicLong faster than LongAdder by 78.99%

	for 90% writes=1,440,000 (16 threads * 100,000 ops each=1,600,000):
	LongAdder 3.258ms
	AtomicLong 28.892ms
	LongAdder faster than AtomicLong by 786.80%

	*/
	import java.util.Random;
	import java.util.concurrent.CountDownLatch;
	import java.util.concurrent.ExecutorService;
	import java.util.concurrent.Executors;
	import java.util.concurrent.TimeUnit;
	import java.util.concurrent.atomic.AtomicLong;
	import java.util.concurrent.atomic.LongAdder;

	public class LongAdderBench {

	public static final int THREADS = 16;
	public static final int OPS = 1_00_000;
	private static final int MIN_BATCH = (int) (OPS * 0.01);

	public static void main(String[] args) throws Exception {
	adderVsAtomic(0.01d);
	adderVsAtomic(0.90d);
	}

	public static void adderVsAtomic(double percent_write) throws Exception {
	long minAdder = Long.MAX_VALUE, minAtomic = Long.MAX_VALUE;
	for (int i=0; i<100; i++) {
	long atomic = run(false, percent_write);
	long adder = run(true, percent_write);
	minAdder = Math.min(adder, minAdder);
	minAtomic = Math.min(atomic, minAtomic);
	}
	System.out.printf("for %2.0f%% writes=%,.0f (%d threads * %,d ops each=%,d):\n", 100d*percent_write,
	OPS THREADSpercent_write, THREADS, OPS, OPS *THREADS);
	String adderName = "LongAdder";
	String atomicName = "AtomicLong";
	System.out.printf(" %-10s %,.3fms\n", adderName, minAdder/1_000_000d);
	System.out.printf(" %-10s %,.3fms\n", atomicName, minAtomic/1_000_000d);
	double delta = (double) minAdder - minAtomic;
	String faster = atomicName, slower = adderName;
	if (delta<0) {
	faster = adderName;
	slower = atomicName;
	delta = Math.abs(delta);
	}
	System.out.printf("%s faster than %s by %.2f%%\n\n", faster, slower, 100d*(delta /minAdder));
	}

	public static long run(boolean useAdder, double percent_write) throws Exception {
	ExecutorService svc = Executors.newFixedThreadPool(THREADS);
	final LongAdder adder = new LongAdder();
	final AtomicLong atomic = new AtomicLong();
	CountDownLatch latch = new CountDownLatch(1);
	for (int i = 0; i < THREADS; i++) {
	svc.submit(() -> {
	latch.await();
	int count = 0;
	Random rand = new Random();
	while (count < OPS) {
	int batch = rand.nextInt(MIN_BATCH * 2) + MIN_BATCH;
	int nwrites = (int) (batch * percent_write);
	if (useAdder) {
	count = writeAdder(adder, count, nwrites);
	count = readAdder(adder, count, batch - nwrites);
	} else {
	count = writeAtomic(atomic, count, nwrites);
	count = readAtomic(atomic, count, batch - nwrites);
	}
	}
	return null;
	});
	}
	long start = System.nanoTime();
	latch.countDown();
	svc.shutdown();
	svc.awaitTermination(1, TimeUnit.MINUTES);
	return System.nanoTime()-start;
	}



	private static int readAdder(LongAdder adder, int count, int limit) {
	int i=count;
	for (; i < count+limit; i++) {
	adder.sum();
	}
	return i;
	}

	private static int writeAdder(LongAdder adder, int count, int limit) {
	int i=count;
	for (; i < count+limit; i++) {
	adder.add(1);
	}
	return i;
	}

	private static int readAtomic(AtomicLong adder, int count, int limit) {
	int i=count;
	for (; i < count+limit; i++) {
	adder.get();
	}
	return i;
	}

	private static int writeAtomic(AtomicLong adder, int count, int limit) {
	int i=count;
	for (; i < count+limit; i++) {
	adder.addAndGet(1);
	}
	return i;
	}
	}