Skip to content

Instantly share code, notes, and snippets.

@sklam

sklam/main.c Secret

Created December 22, 2016 23:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sklam/11f11a410258ca191e6f263262a4ea65 to your computer and use it in GitHub Desktop.
Save sklam/11f11a410258ca191e6f263262a4ea65 to your computer and use it in GitHub Desktop.
2x different performance with clang
#include <time.h>
#include <stdio.h>
double apple(double *arr, int size) ;
double orange(double *arr, int size) ;
/*
 * Benchmark driver for apple() and orange().
 *
 * Fills a 1000-element array with 0..999, overwrites the middle element with
 * a small negative value, prints both functions' results once, and then
 * times 100000 calls of each function with clock(), printing the elapsed
 * CPU time in seconds.
 */
int main() {
    const int size = 1000;
    double arr[size];

    /* arr[k] = k, except for the middle slot set just below. */
    int idx = 0;
    while (idx < size) {
        arr[idx] = idx;
        ++idx;
    }
    arr[size / 2] = -0.123213;

    /* Sanity check: both variants are expected to print the same value. */
    const double ra = apple(arr, size);
    const double rb = orange(arr, size);
    printf("ra = %f | rb = %f\n", ra, rb);

    /* Benchmark: return values are deliberately ignored in the timed loops. */
    const int repeat = 100000;

    clock_t start = clock();
    for (int rep = 0; rep < repeat; ++rep) {
        apple(arr, size);
    }
    clock_t stop = clock();
    double elapsed = stop - start;
    printf("apple %f\n", elapsed / CLOCKS_PER_SEC);

    start = clock();
    for (int rep = 0; rep < repeat; ++rep) {
        orange(arr, size);
    }
    stop = clock();
    elapsed = stop - start;
    printf("orange %f\n", elapsed / CLOCKS_PER_SEC);

    return 0;
}
#include <math.h>
/*
 * Minimum of arr[0..size-1], with the loop counter incremented in the
 * for-header (contrast with orange(), which increments it inside the body).
 *
 * arr:  array of at least `size` doubles; only read.
 * size: number of elements to scan.
 *
 * Returns the smallest element found by `ai <= amin`, starting from
 * amin = INFINITY. If the comparison never succeeds -- size <= 0, or every
 * element is NaN (comparisons against NaN are false) -- returns NAN.
 *
 * NOTE: the statement order is intentional. In the timings posted with this
 * code, this variant ran roughly 2x slower than orange() under clang -O3.
 */
double apple(double *arr, int size) {
double amin = INFINITY;
/* Stays 1 until at least one element passes the <= test below. */
int all_missing = 1;
int i;
double ai;
for (i=0; i<size; ++i) { // increment i here
ai = arr[i];
/* `<=` (not `<`) so that an element equal to INFINITY still counts as seen. */
if ( ai <= amin ) {
amin = ai;
all_missing = 0;
}
}
/* Nothing compared successfully: report NaN instead of the INFINITY seed. */
if (all_missing) {
amin = NAN;
}
return amin;
}
/*
 * Minimum of arr[0..size-1]; same computation as apple(), but the loop
 * counter is incremented inside the body, right after the load and before
 * the comparison.
 *
 * arr:  array of at least `size` doubles; only read.
 * size: number of elements to scan.
 *
 * Returns the smallest element found by `ai <= amin`, starting from
 * amin = INFINITY. If the comparison never succeeds -- size <= 0, or every
 * element is NaN (comparisons against NaN are false) -- returns NAN.
 *
 * NOTE: the statement order is intentional. In the timings posted with this
 * code, this variant ran roughly 2x faster than apple() under clang -O3.
 */
double orange(double *arr, int size) {
double amin = INFINITY;
/* Stays 1 until at least one element passes the <= test below. */
int all_missing = 1;
int i;
double ai;
/* The for-header has no increment clause; i is advanced in the body. */
for (i=0; i<size;) {
ai = arr[i];
++i; // increment i here
/* `<=` (not `<`) so that an element equal to INFINITY still counts as seen. */
if ( ai <= amin ) {
amin = ai;
all_missing = 0;
}
}
/* Nothing compared successfully: report NaN instead of the INFINITY seed. */
if (all_missing) {
amin = NAN;
}
return amin;
}
@sklam
Copy link
Author

sklam commented Dec 23, 2016

Compile with:

$ clang -O3 -c nanmin.c
$ clang -O3 -c main.c
$ clang nanmin.o main.o -o test

Outputs

$ ./test
ra = -0.123213 | rb = -0.123213
apple 0.202377
orange 0.096341

Clang version
Apple LLVM version 7.3.0 (clang-703.0.31)
Target: x86_64-apple-darwin15.6.0

@sklam
Copy link
Author

sklam commented Dec 26, 2016

Same slowdown on clang4.0 nightly build

clang version 4.0.0-svn290259-1~exp1 (trunk)
Target: x86_64-unknown-linux-gnu

@sklam
Copy link
Author

sklam commented Dec 26, 2016

The responsible opt passes are -simplifycfg -sroa -simplifycfg. The second simplifycfg flattens the PHI nodes into select instructions.

Before last simplifycfg:

; ModuleID = 'nanmin.ll'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"

; Function Attrs: nounwind ssp uwtable
; apple() before the final simplifycfg run. The conditional update of amin and
; all_missing is still expressed as control flow: block %3 branches to %8 when
; the comparison succeeds, and block %9 merges the two paths with phi nodes.
define double @apple(double* %arr, i32 %size) #0 {
  br label %1

; <label>:1                                       ; preds = %9, %0
  ; Loop header: loop-carried all_missing, i and amin.
  ; 0x7FF0000000000000 is +infinity (the initial amin).
  %all_missing.0 = phi i32 [ 1, %0 ], [ %all_missing.1, %9 ]
  %i.0 = phi i32 [ 0, %0 ], [ %10, %9 ]
  %amin.0 = phi double [ 0x7FF0000000000000, %0 ], [ %amin.1, %9 ]
  %2 = icmp slt i32 %i.0, %size
  br i1 %2, label %3, label %11

; <label>:3                                       ; preds = %1
  ; Load arr[i] and test arr[i] <= amin (ordered compare: false for NaN).
  %4 = sext i32 %i.0 to i64
  %5 = getelementptr inbounds double, double* %arr, i64 %4
  %6 = load double, double* %5, align 8
  %7 = fcmp ole double %6, %amin.0
  br i1 %7, label %8, label %9

; <label>:8                                       ; preds = %3
  ; Empty "taken" block; its only effect is selecting the phi inputs in %9.
  br label %9

; <label>:9                                       ; preds = %3, %8
  ; Merge point: new values when coming from %8, old values when from %3.
  %all_missing.1 = phi i32 [ 0, %8 ], [ %all_missing.0, %3 ]
  %amin.1 = phi double [ %6, %8 ], [ %amin.0, %3 ]
  %10 = add nsw i32 %i.0, 1
  br label %1

; <label>:11                                      ; preds = %1
  ; Loop exit: check whether all_missing is still non-zero.
  %12 = icmp ne i32 %all_missing.0, 0
  br i1 %12, label %13, label %14

; <label>:13                                      ; preds = %11
  br label %14

; <label>:14                                      ; preds = %13, %11
  ; 0x7FF8000000000000 is a quiet NaN, returned when all_missing was set.
  %amin.2 = phi double [ 0x7FF8000000000000, %13 ], [ %amin.0, %11 ]
  ret double %amin.2
}

After last simplifycfg:

; ModuleID = 'nanmin.ll'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"

; Function Attrs: nounwind ssp uwtable
; apple() after the final simplifycfg run. The branch around the conditional
; update is gone: the loop body is a single block (%3) and both all_missing
; and amin are updated with select instructions keyed on the same fcmp result.
define double @apple(double* %arr, i32 %size) #0 {
  br label %1

; <label>:1                                       ; preds = %3, %0
  ; Loop header: loop-carried all_missing, i and amin.
  ; 0x7FF0000000000000 is +infinity (the initial amin).
  %all_missing.0 = phi i32 [ 1, %0 ], [ %all_missing.1, %3 ]
  %i.0 = phi i32 [ 0, %0 ], [ %8, %3 ]
  %amin.0 = phi double [ 0x7FF0000000000000, %0 ], [ %amin.1, %3 ]
  %2 = icmp slt i32 %i.0, %size
  br i1 %2, label %3, label %9

; <label>:3                                       ; preds = %1
  ; Load arr[i], compare, and pick new/old values without branching.
  %4 = sext i32 %i.0 to i64
  %5 = getelementptr inbounds double, double* %arr, i64 %4
  %6 = load double, double* %5, align 8
  %7 = fcmp ole double %6, %amin.0
  %all_missing.1 = select i1 %7, i32 0, i32 %all_missing.0
  %amin.1 = select i1 %7, double %6, double %amin.0
  %8 = add nsw i32 %i.0, 1
  br label %1

; <label>:9                                       ; preds = %1
  ; Loop exit: return a quiet NaN (0x7FF8000000000000) if all_missing is
  ; still non-zero, otherwise the accumulated minimum.
  %10 = icmp ne i32 %all_missing.0, 0
  %.amin.0 = select i1 %10, double 0x7FF8000000000000, double %amin.0
  ret double %.amin.0
}

@sklam
Copy link
Author

sklam commented Dec 26, 2016

A dummy tail block at the end of the loop body also triggers the slowdown.

/*
 * Variant of orange() with a no-op expression statement appended to the loop
 * body. The computation is unchanged: minimum of arr[0..size-1] via
 * `ai <= amin` starting from INFINITY, or NAN if the comparison never
 * succeeds (size <= 0, or every element is NaN).
 *
 * The author reports that this extra trailing statement is enough to bring
 * back the slowdown otherwise seen only in apple().
 */
double orange(double *arr, int size) {
    double amin = INFINITY;
    /* Stays 1 until at least one element passes the <= test below. */
    int all_missing = 1;
    int i;
    double ai;

    /* The for-header has no increment clause; i is advanced in the body. */
    for (i=0; i<size;) {
        ai = arr[i];
        ++i;
        if ( ai <= amin ) {
            amin = ai;
            all_missing = 0;
        }
        /* Evaluates i and discards it: no side effect, but it places a
           statement after the if-block at the end of the loop body. */
        i; // dummy unused line
    }
    /* Nothing compared successfully: report NaN instead of the INFINITY seed. */
    if (all_missing) {
        amin = NAN;
    }
    return amin;
}

@sklam
Copy link
Author

sklam commented Dec 26, 2016

gcc-4.8 always produces the slower code for both functions.

@hiraditya
Copy link

hiraditya commented Feb 14, 2017

gcc 7.0 gives the same performance for both functions, apple and orange, and both are fast (0.096341 s).

@ml31415
Copy link

ml31415 commented Feb 15, 2017

michael@nyx:~/w/apples# ./test-3.8 
ra = -0.123213 | rb = -0.123213
apple 0.242737
orange 0.053176
michael@nyx:~/w/apples# ./test-3.6
ra = -0.123213 | rb = -0.123213
apple 0.235650
orange 0.054918

x86_64 Linux
Looks like the speed impact is even more drastic here, and that it's really not a pure 3.9 issue.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment