Skip to content

Instantly share code, notes, and snippets.


sklam/main.c Secret

Created December 22, 2016 23:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sklam/11f11a410258ca191e6f263262a4ea65 to your computer and use it in GitHub Desktop.
Save sklam/11f11a410258ca191e6f263262a4ea65 to your computer and use it in GitHub Desktop.
2x performance difference with clang
#include <time.h>
#include <stdio.h>
double apple(double *arr, int size) ;
double orange(double *arr, int size) ;
/*
 * Benchmark driver for apple() and orange().
 *
 * Fills an array with 0..SIZE-1, overwrites the middle element with a
 * negative value (so it becomes the minimum), prints the result of one call
 * to each function, and then times REPEAT calls of each with clock().
 * The elapsed CPU time of each loop is printed in seconds.
 */
int main(void) {
    /* Enum constants keep arr a fixed-size array rather than a VLA. */
    enum { SIZE = 1000, REPEAT = 100000 };
    double arr[SIZE];

    for (int i = 0; i < SIZE; ++i) {
        arr[i] = i;
    }
    arr[SIZE / 2] = -0.123213;

    /* Sanity check: both implementations must report the same minimum. */
    double ra = apple(arr, SIZE);
    double rb = orange(arr, SIZE);
    printf("ra = %f | rb = %f\n", ra, rb);

    // benchmark
    clock_t ts, te;
    double dur;

    ts = clock();
    for (int i = 0; i < REPEAT; ++i) {
        apple(arr, SIZE);
    }
    te = clock();
    dur = (double)(te - ts);
    printf("apple %f\n", dur / CLOCKS_PER_SEC);

    ts = clock();
    for (int i = 0; i < REPEAT; ++i) {
        orange(arr, SIZE);
    }
    te = clock();
    dur = (double)(te - ts);
    printf("orange %f\n", dur / CLOCKS_PER_SEC);

    return 0;
}
#include <math.h>
/*
 * apple - minimum of arr[0..size-1], ignoring NaN elements.
 *
 * arr:  input values; only read.
 * size: number of elements to scan; a value <= 0 scans nothing.
 *
 * Returns the smallest element that compared <= the running minimum.
 * A NaN element never satisfies `ai <= amin`, so it is skipped. If no element
 * was accepted (empty input or all NaN), NAN is returned.
 *
 * The loop index is incremented in the for-header. That placement is the
 * only difference from orange() and is what this file is benchmarking, so
 * the statement order must not be rearranged.
 */
double apple(double *arr, int size) {
    double amin = INFINITY;
    int all_missing = 1;
    int i;
    double ai;

    for (i = 0; i < size; ++i) {  // increment i here
        ai = arr[i];
        if (ai <= amin) {
            amin = ai;
            all_missing = 0;
        }
    }
    if (all_missing) {
        amin = NAN;
    }
    return amin;
}
/*
 * orange - minimum of arr[0..size-1], ignoring NaN elements.
 *
 * arr:  input values; only read.
 * size: number of elements to scan; a value <= 0 scans nothing.
 *
 * Returns the smallest element that compared <= the running minimum.
 * A NaN element never satisfies `ai <= amin`, so it is skipped. If no element
 * was accepted (empty input or all NaN), NAN is returned.
 *
 * Computes the same result as apple(). The only difference is that the loop
 * index is incremented inside the body, right after the load and before the
 * comparison. That placement is what this file is benchmarking, so the
 * statement order must not be rearranged.
 */
double orange(double *arr, int size) {
    double amin = INFINITY;
    int all_missing = 1;
    int i;
    double ai;

    for (i = 0; i < size;) {
        ai = arr[i];
        ++i;  // increment i here
        if (ai <= amin) {
            amin = ai;
            all_missing = 0;
        }
    }
    if (all_missing) {
        amin = NAN;
    }
    return amin;
}
Copy link

sklam commented Dec 23, 2016

Compile with:

$ clang -O3 -c nanmin.c
$ clang -O3 -c main.c
$ clang nanmin.o main.o -o test


$ ./test
ra = -0.123213 | rb = -0.123213
apple 0.202377
orange 0.096341

Clang version
Apple LLVM version 7.3.0 (clang-703.0.31)
Target: x86_64-apple-darwin15.6.0

Copy link

sklam commented Dec 26, 2016

Same slowdown on clang4.0 nightly build

clang version 4.0.0-svn290259-1~exp1 (trunk)
Target: x86_64-unknown-linux-gnu

Copy link

sklam commented Dec 26, 2016

The responsible opt passes are -simplifycfg -sroa -simplifycfg. The second simplifycfg flattens the PHI nodes into selects.

Before last simplifycfg:

; ModuleID = 'nanmin.ll'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"

; Function Attrs: nounwind ssp uwtable
; apple() before the last -simplifycfg run. The body of `if (ai <= amin)` is
; still its own (empty) basic block %8, and the updated values of amin and
; all_missing are merged by phi nodes in %9.
define double @apple(double* %arr, i32 %size) #0 {
  br label %1

; Loop header: the phis carry all_missing, i and amin across iterations.
; 0x7FF0000000000000 is the bit pattern of +infinity (amin's initial value).
; <label>:1                                       ; preds = %9, %0
  %all_missing.0 = phi i32 [ 1, %0 ], [ %all_missing.1, %9 ]
  %i.0 = phi i32 [ 0, %0 ], [ %10, %9 ]
  %amin.0 = phi double [ 0x7FF0000000000000, %0 ], [ %amin.1, %9 ]
  %2 = icmp slt i32 %i.0, %size
  br i1 %2, label %3, label %11

; Loop body: load arr[i] and test `ai <= amin` (ordered compare, false for NaN).
; <label>:3                                       ; preds = %1
  %4 = sext i32 %i.0 to i64
  %5 = getelementptr inbounds double, double* %arr, i64 %4
  %6 = load double, double* %5, align 8
  %7 = fcmp ole double %6, %amin.0
  br i1 %7, label %8, label %9

; Taken branch of the `if`: empty, its effect is expressed by the phis in %9.
; <label>:8                                       ; preds = %3
  br label %9

; Loop latch: select the new all_missing/amin by incoming edge, then ++i.
; <label>:9                                       ; preds = %3, %8
  %all_missing.1 = phi i32 [ 0, %8 ], [ %all_missing.0, %3 ]
  %amin.1 = phi double [ %6, %8 ], [ %amin.0, %3 ]
  %10 = add nsw i32 %i.0, 1
  br label %1

; Loop exit: `if (all_missing)`.
; <label>:11                                      ; preds = %1
  %12 = icmp ne i32 %all_missing.0, 0
  br i1 %12, label %13, label %14

; <label>:13                                      ; preds = %11
  br label %14

; Return NaN (0x7FF8000000000000) when all_missing was set, otherwise amin.
; <label>:14                                      ; preds = %13, %11
  %amin.2 = phi double [ 0x7FF8000000000000, %13 ], [ %amin.0, %11 ]
  ret double %amin.2

After last simplifycfg:

; ModuleID = 'nanmin.ll'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"

; Function Attrs: nounwind ssp uwtable
; apple() after the last -simplifycfg run. The conditional branch around the
; `if (ai <= amin)` body is gone: the loop body is a single basic block and the
; former phi nodes have become `select` instructions keyed on the comparison.
define double @apple(double* %arr, i32 %size) #0 {
  br label %1

; Loop header: the phis carry all_missing, i and amin across iterations.
; 0x7FF0000000000000 is the bit pattern of +infinity (amin's initial value).
; <label>:1                                       ; preds = %3, %0
  %all_missing.0 = phi i32 [ 1, %0 ], [ %all_missing.1, %3 ]
  %i.0 = phi i32 [ 0, %0 ], [ %8, %3 ]
  %amin.0 = phi double [ 0x7FF0000000000000, %0 ], [ %amin.1, %3 ]
  %2 = icmp slt i32 %i.0, %size
  br i1 %2, label %3, label %9

; Loop body: load arr[i], compare, and pick the new all_missing/amin with
; selects instead of branching; then ++i.
; <label>:3                                       ; preds = %1
  %4 = sext i32 %i.0 to i64
  %5 = getelementptr inbounds double, double* %arr, i64 %4
  %6 = load double, double* %5, align 8
  %7 = fcmp ole double %6, %amin.0
  %all_missing.1 = select i1 %7, i32 0, i32 %all_missing.0
  %amin.1 = select i1 %7, double %6, double %amin.0
  %8 = add nsw i32 %i.0, 1
  br label %1

; Loop exit: return NaN (0x7FF8000000000000) when all_missing is still set,
; otherwise the minimum found.
; <label>:9                                       ; preds = %1
  %10 = icmp ne i32 %all_missing.0, 0
  %.amin.0 = select i1 %10, double 0x7FF8000000000000, double %amin.0
  ret double %.amin.0

Copy link

sklam commented Dec 26, 2016

A dummy tail block at the end of the loop body also triggers the slowdown.

/*
 * orange (variant) - minimum of arr[0..size-1], ignoring NaN elements.
 *
 * Same computation as the original orange(): NaN elements never satisfy
 * `ai <= amin` and are skipped; NAN is returned when no element was accepted
 * (empty input or all NaN).
 *
 * The only addition is a no-op expression statement at the end of the loop
 * body, placed after the `if` so that the loop body ends in a separate tail
 * block.
 */
double orange(double *arr, int size) {
    double amin = INFINITY;
    int all_missing = 1;
    int i;
    double ai;

    for (i=0; i<size;) {
        ai = arr[i];
        ++i; // increment i here; without it the loop never terminates
        if ( ai <= amin ) {
            amin = ai;
            all_missing = 0;
        }
        i; // dummy unused line
    }
    if (all_missing) {
        amin = NAN;
    }
    return amin;
}

Copy link

sklam commented Dec 26, 2016

With gcc-4.8, both functions always get the slower performance.

Copy link

hiraditya commented Feb 14, 2017

gcc 7.0 gives the same performance for both functions, apple and orange, and it matches the fast result (0.096341 s).

Copy link

ml31415 commented Feb 15, 2017

michael@nyx:~/w/apples# ./test-3.8 
ra = -0.123213 | rb = -0.123213
apple 0.242737
orange 0.053176
michael@nyx:~/w/apples# ./test-3.6
ra = -0.123213 | rb = -0.123213
apple 0.235650
orange 0.054918

x86_64 Linux
Looks like the speed impact is even more drastic here, and that it's really not a pure 3.9 issue.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment