Skip to content

Instantly share code, notes, and snippets.

@nicolasvasilache
Last active November 9, 2021 08:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nicolasvasilache/0fe30c83cbfe5b4776ec9f0ee465611a to your computer and use it in GitHub Desktop.
Save nicolasvasilache/0fe30c83cbfe5b4776ec9f0ee465611a to your computer and use it in GitHub Desktop.
LLVM AVX2 transpose 8x8
; Further compiled with:
; clang -x ir -emit-llvm -S -mcpu=haswell -O3 - -o - | llc -O3 -mcpu=haswell - -o -
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; External allocator entry points. They are only declared here; no instruction
; in @transpose_2d_on_tensors below calls either of them.
declare i8* @malloc(i64)
declare void @free(i8*)
; Function Attrs: noinline
; Transposes an 8x8 block of floats held in eight <8 x float> vectors.
;
; Reads eight vectors from %1 at float offsets 0, 8, ..., 56 ("rows"), permutes
; them with three rounds of shufflevector, and writes eight vectors to %8 at
; the same offsets, so that output vector j holds element j of rows 0..7.
;
; Only %1 (source) and %8 (destination) are used by the body; %0, %2-%7 and
; %9-%13 are never read.
; NOTE(review): the 7+7 argument layout looks like two expanded MLIR memref
; descriptors (allocated ptr, aligned ptr, offset, sizes, strides) -- confirm
; against the producer. The row stride of 8 floats is hard-coded here.
; NOTE(review): attribute group #0 is referenced but its definition is not
; visible in this text -- confirm it is provided elsewhere in the module.
define void @transpose_2d_on_tensors(float* %0, float* %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, float* %7, float* %8, i64 %9, i64 %10, i64 %11, i64 %12, i64 %13) #0 {
  ; ---- Stage 0: load the eight source rows (element-aligned, align 4). ----
  %in.p0 = getelementptr float, float* %1, i64 0
  %in.v0 = bitcast float* %in.p0 to <8 x float>*
  %row0 = load <8 x float>, <8 x float>* %in.v0, align 4
  %in.p1 = getelementptr float, float* %1, i64 8
  %in.v1 = bitcast float* %in.p1 to <8 x float>*
  %row1 = load <8 x float>, <8 x float>* %in.v1, align 4
  %in.p2 = getelementptr float, float* %1, i64 16
  %in.v2 = bitcast float* %in.p2 to <8 x float>*
  %row2 = load <8 x float>, <8 x float>* %in.v2, align 4
  %in.p3 = getelementptr float, float* %1, i64 24
  %in.v3 = bitcast float* %in.p3 to <8 x float>*
  %row3 = load <8 x float>, <8 x float>* %in.v3, align 4
  %in.p4 = getelementptr float, float* %1, i64 32
  %in.v4 = bitcast float* %in.p4 to <8 x float>*
  %row4 = load <8 x float>, <8 x float>* %in.v4, align 4
  %in.p5 = getelementptr float, float* %1, i64 40
  %in.v5 = bitcast float* %in.p5 to <8 x float>*
  %row5 = load <8 x float>, <8 x float>* %in.v5, align 4
  %in.p6 = getelementptr float, float* %1, i64 48
  %in.v6 = bitcast float* %in.p6 to <8 x float>*
  %row6 = load <8 x float>, <8 x float>* %in.v6, align 4
  %in.p7 = getelementptr float, float* %1, i64 56
  %in.v7 = bitcast float* %in.p7 to <8 x float>*
  %row7 = load <8 x float>, <8 x float>* %in.v7, align 4

  ; ---- Stage 1: interleave adjacent row pairs within each 4-element half. ----
  ; lo = elements {0,1,4,5} of both rows, hi = elements {2,3,6,7} of both rows.
  %lo01 = shufflevector <8 x float> %row0, <8 x float> %row1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %hi01 = shufflevector <8 x float> %row0, <8 x float> %row1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  %lo23 = shufflevector <8 x float> %row2, <8 x float> %row3, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %hi23 = shufflevector <8 x float> %row2, <8 x float> %row3, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  %lo45 = shufflevector <8 x float> %row4, <8 x float> %row5, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %hi45 = shufflevector <8 x float> %row4, <8 x float> %row5, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  %lo67 = shufflevector <8 x float> %row6, <8 x float> %row7, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %hi67 = shufflevector <8 x float> %row6, <8 x float> %row7, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>

  ; ---- Stage 2: combine pair results into groups of four rows. ----
  ; Each %mix.* is a scratch vector; blending it with one of its inputs yields
  ; vectors named %cXY.rABCD = [column X of rows A..D | column Y of rows A..D].
  %mix.lo0123 = shufflevector <8 x float> %lo01, <8 x float> %lo23, <8 x i32> <i32 2, i32 3, i32 8, i32 9, i32 6, i32 7, i32 12, i32 13>
  %c04.r0123 = shufflevector <8 x float> %lo01, <8 x float> %mix.lo0123, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
  %c15.r0123 = shufflevector <8 x float> %lo23, <8 x float> %mix.lo0123, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>
  %mix.hi0123 = shufflevector <8 x float> %hi01, <8 x float> %hi23, <8 x i32> <i32 2, i32 3, i32 8, i32 9, i32 6, i32 7, i32 12, i32 13>
  %c26.r0123 = shufflevector <8 x float> %hi01, <8 x float> %mix.hi0123, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
  %c37.r0123 = shufflevector <8 x float> %hi23, <8 x float> %mix.hi0123, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>
  %mix.lo4567 = shufflevector <8 x float> %lo45, <8 x float> %lo67, <8 x i32> <i32 2, i32 3, i32 8, i32 9, i32 6, i32 7, i32 12, i32 13>
  %c04.r4567 = shufflevector <8 x float> %lo45, <8 x float> %mix.lo4567, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
  %c15.r4567 = shufflevector <8 x float> %lo67, <8 x float> %mix.lo4567, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>
  %mix.hi4567 = shufflevector <8 x float> %hi45, <8 x float> %hi67, <8 x i32> <i32 2, i32 3, i32 8, i32 9, i32 6, i32 7, i32 12, i32 13>
  %c26.r4567 = shufflevector <8 x float> %hi45, <8 x float> %mix.hi4567, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
  %c37.r4567 = shufflevector <8 x float> %hi67, <8 x float> %mix.hi4567, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>

  ; ---- Stage 3: stitch 4-element halves into full transposed vectors. ----
  ; Low halves of both operands give columns 0..3, high halves give 4..7.
  %col0 = shufflevector <8 x float> %c04.r0123, <8 x float> %c04.r4567, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %col1 = shufflevector <8 x float> %c15.r0123, <8 x float> %c15.r4567, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %col2 = shufflevector <8 x float> %c26.r0123, <8 x float> %c26.r4567, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %col3 = shufflevector <8 x float> %c37.r0123, <8 x float> %c37.r4567, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %col4 = shufflevector <8 x float> %c04.r0123, <8 x float> %c04.r4567, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %col5 = shufflevector <8 x float> %c15.r0123, <8 x float> %c15.r4567, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %col6 = shufflevector <8 x float> %c26.r0123, <8 x float> %c26.r4567, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %col7 = shufflevector <8 x float> %c37.r0123, <8 x float> %c37.r4567, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>

  ; ---- Stage 4: store the transposed vectors (issued after every load). ----
  %out.p0 = getelementptr float, float* %8, i64 0
  %out.v0 = bitcast float* %out.p0 to <8 x float>*
  store <8 x float> %col0, <8 x float>* %out.v0, align 4
  %out.p1 = getelementptr float, float* %8, i64 8
  %out.v1 = bitcast float* %out.p1 to <8 x float>*
  store <8 x float> %col1, <8 x float>* %out.v1, align 4
  %out.p2 = getelementptr float, float* %8, i64 16
  %out.v2 = bitcast float* %out.p2 to <8 x float>*
  store <8 x float> %col2, <8 x float>* %out.v2, align 4
  %out.p3 = getelementptr float, float* %8, i64 24
  %out.v3 = bitcast float* %out.p3 to <8 x float>*
  store <8 x float> %col3, <8 x float>* %out.v3, align 4
  %out.p4 = getelementptr float, float* %8, i64 32
  %out.v4 = bitcast float* %out.p4 to <8 x float>*
  store <8 x float> %col4, <8 x float>* %out.v4, align 4
  %out.p5 = getelementptr float, float* %8, i64 40
  %out.v5 = bitcast float* %out.p5 to <8 x float>*
  store <8 x float> %col5, <8 x float>* %out.v5, align 4
  %out.p6 = getelementptr float, float* %8, i64 48
  %out.v6 = bitcast float* %out.p6 to <8 x float>*
  store <8 x float> %col6, <8 x float>* %out.v6, align 4
  %out.p7 = getelementptr float, float* %8, i64 56
  %out.v7 = bitcast float* %out.p7 to <8 x float>*
  store <8 x float> %col7, <8 x float>* %out.v7, align 4
  ret void
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment