-
-
Save sklam/11f11a410258ca191e6f263262a4ea65 to your computer and use it in GitHub Desktop.
2x performance difference with clang
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <time.h>
#include <stdio.h>
/* Both functions are defined in a separate translation unit (nanmin.c). */
double apple(double *arr, int size);
double orange(double *arr, int size);

/*
 * Benchmark driver: fills an array, checks that apple() and orange() agree
 * on the minimum, then times REPEAT calls of each and prints the CPU time
 * in seconds as measured by clock().
 */
int main(void) {
    /* enum constants (not `const int`) are integer constant expressions, so
     * arr is an ordinary fixed-size array rather than a variable-length one. */
    enum { ARR_SIZE = 1000, REPEAT = 100000 };
    double arr[ARR_SIZE];

    for (int i = 0; i < ARR_SIZE; ++i) {
        arr[i] = i;
    }
    /* Plant one negative value in the middle so the expected minimum is
     * unambiguous. */
    arr[ARR_SIZE / 2] = -0.123213;

    /* Sanity check: both implementations must report the same minimum. */
    double ra = apple(arr, ARR_SIZE);
    double rb = orange(arr, ARR_SIZE);
    printf("ra = %f | rb = %f\n", ra, rb);

    // benchmark
    clock_t ts, te;
    double dur;

    ts = clock();
    for (int i = 0; i < REPEAT; ++i) {
        apple(arr, ARR_SIZE);
    }
    te = clock();
    dur = (double)(te - ts);
    printf("apple %f\n", dur / CLOCKS_PER_SEC);

    ts = clock();
    for (int i = 0; i < REPEAT; ++i) {
        orange(arr, ARR_SIZE);
    }
    te = clock();
    dur = (double)(te - ts);
    printf("orange %f\n", dur / CLOCKS_PER_SEC);

    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <math.h>
/*
 * Returns the smallest value in arr[0..size-1].
 *
 * An element replaces the running minimum only when `ai <= amin` holds.
 * That comparison is false for NaN, so NaN elements are skipped. If no
 * element ever passes the test (size <= 0, or every element is NaN) the
 * result is NAN.
 *
 * In this variant the loop counter is advanced in the for-header. The
 * statement order is deliberate: it is the one thing that differs from
 * orange(), so do not "tidy" it.
 */
double apple(double *arr, int size) {
    double amin = INFINITY;
    int all_missing = 1; /* stays 1 until some element passes the comparison */
    int i;
    double ai;

    for (i = 0; i < size; ++i) { // increment i here
        ai = arr[i];
        if (ai <= amin) {
            amin = ai;
            all_missing = 0;
        }
    }
    if (all_missing) {
        amin = NAN;
    }
    return amin;
}
/*
 * Returns the smallest value in arr[0..size-1].
 *
 * An element replaces the running minimum only when `ai <= amin` holds.
 * That comparison is false for NaN, so NaN elements are skipped. If no
 * element ever passes the test (size <= 0, or every element is NaN) the
 * result is NAN.
 *
 * In this variant the loop counter is advanced inside the body, right after
 * the load and before the comparison. The statement order is deliberate: it
 * is the one thing that differs from apple(), so do not "tidy" it.
 */
double orange(double *arr, int size) {
    double amin = INFINITY;
    int all_missing = 1; /* stays 1 until some element passes the comparison */
    int i;
    double ai;

    for (i = 0; i < size;) {
        ai = arr[i];
        ++i; // increment i here
        if (ai <= amin) {
            amin = ai;
            all_missing = 0;
        }
    }
    if (all_missing) {
        amin = NAN;
    }
    return amin;
}
Same slowdown on clang4.0 nightly build
clang version 4.0.0-svn290259-1~exp1 (trunk)
Target: x86_64-unknown-linux-gnu
The responsible opt passes are -simplifycfg -sroa -simplifycfg
The second simplifycfg flattens PHI into select.
Before last simplifycfg:
; ModuleID = 'nanmin.ll'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
; Function Attrs: nounwind ssp uwtable
; apple() as it looks before the last -simplifycfg run. The conditional
; updates of amin and all_missing are still expressed as control flow:
; the empty blocks %8 and %13 only give the phi nodes in the join blocks
; (%9 and %14) a distinct predecessor to select the "updated" value from.
define double @apple(double* %arr, i32 %size) #0 {
br label %1
; Loop header: phis carry all_missing (starts at 1), i (starts at 0) and
; amin (starts at 0x7FF0000000000000, i.e. +infinity) around the loop.
; <label>:1 ; preds = %9, %0
%all_missing.0 = phi i32 [ 1, %0 ], [ %all_missing.1, %9 ]
%i.0 = phi i32 [ 0, %0 ], [ %10, %9 ]
%amin.0 = phi double [ 0x7FF0000000000000, %0 ], [ %amin.1, %9 ]
; Signed i < size: continue into the body (%3) or leave the loop (%11).
%2 = icmp slt i32 %i.0, %size
br i1 %2, label %3, label %11
; Loop body: load arr[i] and test it against the running minimum.
; <label>:3 ; preds = %1
%4 = sext i32 %i.0 to i64
%5 = getelementptr inbounds double, double* %arr, i64 %4
%6 = load double, double* %5, align 8
; Ordered <= : false whenever either operand is NaN.
%7 = fcmp ole double %6, %amin.0
br i1 %7, label %8, label %9
; Empty "then" block of the if; only marks the path where the minimum changes.
; <label>:8 ; preds = %3
br label %9
; Join block: pick the new or the old all_missing/amin depending on the
; path taken, then increment i and jump back to the header.
; <label>:9 ; preds = %3, %8
%all_missing.1 = phi i32 [ 0, %8 ], [ %all_missing.0, %3 ]
%amin.1 = phi double [ %6, %8 ], [ %amin.0, %3 ]
%10 = add nsw i32 %i.0, 1
br label %1
; Loop exit: branch on whether all_missing is still non-zero.
; <label>:11 ; preds = %1
%12 = icmp ne i32 %all_missing.0, 0
br i1 %12, label %13, label %14
; Empty block for the all_missing path; feeds the NaN arm of the phi below.
; <label>:13 ; preds = %11
br label %14
; Return NaN (0x7FF8000000000000) if nothing was accepted, else amin.
; <label>:14 ; preds = %13, %11
%amin.2 = phi double [ 0x7FF8000000000000, %13 ], [ %amin.0, %11 ]
ret double %amin.2
}
After last simplifycfg:
; ModuleID = 'nanmin.ll'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
; Function Attrs: nounwind ssp uwtable
; apple() after the last -simplifycfg run. The empty conditional blocks are
; gone: the branch-plus-phi pairs have been flattened into `select`
; instructions, so the loop body is now a single basic block (%3).
define double @apple(double* %arr, i32 %size) #0 {
br label %1
; Loop header: phis carry all_missing (starts at 1), i (starts at 0) and
; amin (starts at 0x7FF0000000000000, i.e. +infinity) around the loop.
; <label>:1 ; preds = %3, %0
%all_missing.0 = phi i32 [ 1, %0 ], [ %all_missing.1, %3 ]
%i.0 = phi i32 [ 0, %0 ], [ %8, %3 ]
%amin.0 = phi double [ 0x7FF0000000000000, %0 ], [ %amin.1, %3 ]
; Signed i < size: continue into the body (%3) or leave the loop (%9).
%2 = icmp slt i32 %i.0, %size
br i1 %2, label %3, label %9
; Loop body: load arr[i], compare, and update both values without branching.
; <label>:3 ; preds = %1
%4 = sext i32 %i.0 to i64
%5 = getelementptr inbounds double, double* %arr, i64 %4
%6 = load double, double* %5, align 8
; Ordered <= : false whenever either operand is NaN.
%7 = fcmp ole double %6, %amin.0
; These two selects replace the phi nodes of the former join block.
%all_missing.1 = select i1 %7, i32 0, i32 %all_missing.0
%amin.1 = select i1 %7, double %6, double %amin.0
%8 = add nsw i32 %i.0, 1
br label %1
; Loop exit: return NaN (0x7FF8000000000000) if all_missing is still
; non-zero, otherwise the running minimum.
; <label>:9 ; preds = %1
%10 = icmp ne i32 %all_missing.0, 0
%.amin.0 = select i1 %10, double 0x7FF8000000000000, double %amin.0
ret double %.amin.0
}
A dummy tail block at the end of the loop body also triggers the slowdown.
/*
 * Variant of orange() used to reproduce the slowdown: identical to the
 * in-body-increment version except for one extra no-op statement at the
 * end of the loop body.
 *
 * Returns the smallest value in arr[0..size-1]. Elements are accepted only
 * when `ai <= amin`, which is false for NaN; if no element is accepted
 * (size <= 0, or every element is NaN) the result is NAN.
 *
 * The statement order is the point of this snippet; do not rearrange it.
 */
double orange(double *arr, int size) {
double amin = INFINITY;
/* Stays 1 until some element passes the comparison below. */
int all_missing = 1;
int i;
double ai;
for (i=0; i<size;) {
ai = arr[i];
/* Counter is advanced here, between the load and the comparison. */
++i;
if ( ai <= amin ) {
amin = ai;
all_missing = 0;
}
/* Expression statement with no effect; it only places a statement after
 * the if, at the tail of the loop body. */
i; // dummy unused line
}
if (all_missing) {
amin = NAN;
}
return amin;
}
gcc-4.8 always produces the slower code for both functions.
gcc 7.0 gives the same timing for both functions, apple and orange, and it is the fast one: 0.096341 s.
michael@nyx:~/w/apples# ./test-3.8
ra = -0.123213 | rb = -0.123213
apple 0.242737
orange 0.053176
michael@nyx:~/w/apples# ./test-3.6
ra = -0.123213 | rb = -0.123213
apple 0.235650
orange 0.054918
x86_64 Linux
Looks like the speed impact is even more drastic here, and that it's really not a pure 3.9 issue.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compile with:
$ clang -O3 -c nanmin.c
$ clang -O3 -c main.c
$ clang nanmin.o main.o -o test
Outputs
Clang version
Apple LLVM version 7.3.0 (clang-703.0.31)
Target: x86_64-apple-darwin15.6.0