Skip to content

Instantly share code, notes, and snippets.

@NicolasT
Last active December 13, 2015 20:58
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save NicolasT/4974223 to your computer and use it in GitHub Desktop.
Save NicolasT/4974223 to your computer and use it in GitHub Desktop.
Haskell SSE 128-bit SIMD support in GHC 7.7.20130216. Compiled using: ghc-7.7.20130216 -msse2 -O3 -fllvm -funbox-strict-fields -fforce-recomp -keep-llvm-files -optlo-O3 -optlc-O=3 -S simd.hs. simd.ll and simd.s contain the 'most significant' parts of the compilation result.
{-# LANGUAGE MagicHash, UnboxedTuples, BangPatterns #-}
module Main where
import Data.Int
import System.Environment
import GHC.Int
import GHC.Prim
-- | Demonstrates GHC's 128-bit SIMD primops on a single 'Int32'.
--
-- Packs the argument together with the constants 1, 2 and 0xdeadbeef into a
-- four-lane 32-bit integer vector, adds that vector to itself lane-wise,
-- unpacks the four lanes and returns their sum.
--
-- The lane sum is computed in a full-width 'Int#' and can fall outside the
-- 32-bit range (for example, an argument of -2^30 yields -3265561116), so it
-- is passed through 'narrow32Int#' before being boxed. Without that step the
-- resulting 'Int32' would carry a payload that is not a valid 32-bit value.
f :: Int32 -> Int32
f (I32# a#) =
    let -- Lanes 0..3: the argument, 1, 2, 0xdeadbeef.
        !b# = packInt32X4# a# 1# 2# (0xdeadbeef#)
        -- Lane-wise b + b, i.e. every lane doubled.
        !c# = plusInt32X4# b# b#
        !(# d1#, d2#, d3#, d4# #) = unpackInt32X4# c#
    in  I32# (narrow32Int# (d1# +# d2# +# d3# +# d4#))
-- | Program entry point; currently a no-op that just returns unit.
--
-- A variant that reads an 'Int32' from the first command-line argument and
-- prints @f@ applied to it is kept commented out directly below.
-- NOTE(review): presumably stubbed out so that only the code generated for
-- @f@ matters when inspecting the compiler output -- confirm.
main :: IO ()
main = return ()
{-main :: IO ()
main = do
v <- (read . head) `fmap` getArgs
print $ f v-}
; (snip)
; Basic block c1o6 (excerpt): builds a <4 x i32> vector, adds it to itself
; lane-wise and sums the four resulting lanes as 64-bit integers.
c1o6:
; Load the 64-bit word located 7 bytes past the address held in %ls1nc and
; truncate it to 32 bits.
; NOTE(review): presumably this reads the payload of the boxed Int32 argument
; through a tagged pointer -- confirm against the snipped part of the function.
%ln1s2 = load i64* %ls1nc
%ln1s3 = add i64 %ln1s2, 7
%ln1s4 = inttoptr i64 %ln1s3 to i64*
%ln1s5 = load i64* %ln1s4, !tbaa !5
%ln1s6 = trunc i64 %ln1s5 to i32
; Assemble the vector one lane at a time, starting from all zeroes:
; lane 0 = truncated value, lane 1 = 1, lane 2 = 2, lane 3 = -559038737.
%ln1s7 = insertelement <4 x i32> < i32 0, i32 0, i32 0, i32 0 >, i32 %ln1s6, i32 0
%ln1s8 = insertelement <4 x i32> %ln1s7, i32 1, i32 1
%ln1s9 = insertelement <4 x i32> %ln1s8, i32 2, i32 2
%ln1sa = insertelement <4 x i32> %ln1s9, i32 -559038737, i32 3
; Same-type bitcast (no change of representation), then store the vector to
; the slot %ls1mP with byte alignment.
%ln1sb = bitcast <4 x i32> %ln1sa to <4 x i32>
store <4 x i32> %ln1sb, <4 x i32>* %ls1mP, align 1
; Write the address of @base_GHCziInt_I32zh_con_info into the word just below
; the pointer loaded from %Hp_Var.
; NOTE(review): presumably the header word of the result being allocated on
; the heap -- confirm.
%ln1sc = ptrtoint [0 x i64]* @base_GHCziInt_I32zh_con_info to i64
%ln1sd = load i64** %Hp_Var
%ln1se = getelementptr inbounds i64* %ln1sd, i32 -1
store i64 %ln1sc, i64* %ln1se, !tbaa !2
; Load the stored vector twice and add the two copies lane-wise (each lane is
; doubled); the result goes to the slot %ls1na.
%ln1sf = load <4 x i32>* %ls1mP, align 1
%ln1sg = load <4 x i32>* %ls1mP, align 1
%ln1sh = add <4 x i32> %ln1sf, %ln1sg
%ln1si = bitcast <4 x i32> %ln1sh to <4 x i32>
store <4 x i32> %ln1si, <4 x i32>* %ls1na, align 1
; Reload the result vector once per lane, extract lanes 0..3 and sign-extend
; each to 64 bits.
%ln1sj = load <4 x i32>* %ls1na, align 1
%ln1sk = extractelement <4 x i32> %ln1sj, i32 0
%ln1sl = sext i32 %ln1sk to i64
%ln1sm = load <4 x i32>* %ls1na, align 1
%ln1sn = extractelement <4 x i32> %ln1sm, i32 1
%ln1so = sext i32 %ln1sn to i64
%ln1sp = load <4 x i32>* %ls1na, align 1
%ln1sq = extractelement <4 x i32> %ln1sp, i32 2
%ln1sr = sext i32 %ln1sq to i64
%ln1ss = load <4 x i32>* %ls1na, align 1
%ln1st = extractelement <4 x i32> %ln1ss, i32 3
%ln1su = sext i32 %ln1st to i64
; 64-bit sum lane0 + (lane1 + (lane2 + lane3)); no truncation back to 32 bits
; happens within the visible lines.
%ln1sv = add i64 %ln1sr, %ln1su
%ln1sw = add i64 %ln1so, %ln1sv
%ln1sx = add i64 %ln1sl, %ln1sw
; Reload the pointer from %Hp_Var and address its word 0; the use of that
; address lies in the snipped remainder.
%ln1sy = load i64** %Hp_Var
%ln1sz = getelementptr inbounds i64* %ln1sy, i32 0
; (snip)
# (snip)
# Machine code for the IR block c1o6 (excerpt): build the four-lane vector in
# %xmm0, double it, spill it to the stack and sum the lanes as 64-bit values.
.LBB0_6: # %c1o6.i
# Lane 0: 32 bits loaded from 7(%r14).
movd 7(%r14), %xmm0
# Lanes 1..3: the constants 1, 2 and -559038737, each inserted via %ecx.
movl $1, %ecx
pinsrd $1, %ecx, %xmm0
movl $2, %ecx
pinsrd $2, %ecx, %xmm0
movl $-559038737, %ecx # imm = 0xFFFFFFFFDEADBEEF
pinsrd $3, %ecx, %xmm0
# Packed 32-bit add of the vector with itself: every lane is doubled.
paddd %xmm0, %xmm0
# Spill the result to the stack so that lanes 0..3 sit at -24, -20, -16 and
# -12(%rsp) respectively.
movdqa %xmm0, -24(%rsp)
# Store the address of base_GHCziInt_I32zh_con_info at 8(%r12).
# NOTE(review): presumably %r12 is the heap pointer and this is the header
# word of the result object -- confirm.
movq $base_GHCziInt_I32zh_con_info, 8(%r12)
# %rbx = %rax - 7; %rax is set outside this excerpt.
# NOTE(review): presumably a tagged pointer to the result object -- confirm.
leaq -7(%rax), %rbx
# Sign-extend each 32-bit lane to 64 bits and accumulate:
# lane 2 + lane 3, then + lane 0, then + lane 1.
movslq -16(%rsp), %rdx
movslq -12(%rsp), %rcx
addq %rdx, %rcx
movslq -24(%rsp), %rdx
addq %rcx, %rdx
movslq -20(%rsp), %rcx
addq %rdx, %rcx
# Store the full 64-bit sum at (%rax); it is not narrowed to 32 bits first.
movq %rcx, (%rax)
# Load the jump target from (%rbp), copy %rax into %r12 and tail-call.
movq (%rbp), %rcx
movq %rax, %r12
jmpq *%rcx # TAILCALL
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment