30 (3.53 % of base) - System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong
; Assembly listing for method System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 21 single block inlinees; 25 inlinees without PGO data
+; 0 inlinees with PGO data; 25 single block inlinees; 21 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T06] ( 9, 9 ) long -> rdi single-def
; V01 arg1 [V01,T04] ( 15, 12 ) long -> rsi single-def
; V02 arg2 [V02,T11] ( 9, 6 ) long -> rdx single-def
; V03 loc0 [V03,T00] ( 23, 30 ) long -> rax
; V04 loc1 [V04,T12] ( 13, 6.50) int -> r8
;* V05 loc2 [V05 ] ( 0, 0 ) int -> zero-ref
; V06 loc3 [V06,T05] ( 7, 14 ) long -> rcx
; V07 loc4 [V07,T22] ( 5, 2.50) long -> rdx
; V08 loc5 [V08,T16] ( 2, 4.50) long -> r8
;# V09 OutArgs [V09 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V10 tmp1 [V10,T23] ( 3, 1.50) long -> rax "Inline return value spill temp"
; V11 tmp2 [V11,T07] ( 5, 9.50) byref -> rcx single-def "Inline stloc first use temp"
-; V12 tmp3 [V12,T30] ( 14, 17.50) simd64 -> mm0 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+; V12 tmp3 [V12,T32] ( 14, 17.50) simd64 -> mm0 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
; V13 tmp4 [V13,T13] ( 5, 6 ) byref -> rax single-def "Inline stloc first use temp"
;* V14 tmp5 [V14 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
; V15 tmp6 [V15,T01] ( 12, 27 ) long -> r8 "Inline stloc first use temp"
; V16 tmp7 [V16,T17] ( 2, 4.50) long -> r9 "Inline stloc first use temp"
-; V17 tmp8 [V17,T36] ( 3, 12 ) simd64 -> mm3 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+; V17 tmp8 [V17,T38] ( 3, 12 ) simd64 -> mm3 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V18 tmp9 [V18 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
-; V19 tmp10 [V19,T33] ( 2, 16 ) simd64 -> mm0 "Spilling op1 side effects for HWIntrinsic"
+; V19 tmp10 [V19,T35] ( 2, 16 ) simd64 -> mm0 "Spilling op1 side effects for HWIntrinsic"
;* V20 tmp11 [V20 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
;* V21 tmp12 [V21 ] ( 0, 0 ) simd64 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V22 tmp13 [V22 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V23 tmp14 [V23 ] ( 0, 0 ) simd64 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V24 tmp15 [V24 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V25 tmp16 [V25 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V26 tmp17 [V26 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V27 tmp18 [V27 ] ( 0, 0 ) simd64 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V28 tmp19 [V28 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V29 tmp20 [V29,T24] ( 3, 1.50) long -> rax "Inline return value spill temp"
; V30 tmp21 [V30,T08] ( 5, 9.50) byref -> rcx single-def "Inline stloc first use temp"
-; V31 tmp22 [V31,T31] ( 14, 17.50) simd32 -> mm0 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V31 tmp22 [V31,T33] ( 14, 17.50) simd32 -> mm0 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
; V32 tmp23 [V32,T14] ( 5, 6 ) byref -> rax single-def "Inline stloc first use temp"
;* V33 tmp24 [V33 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V34 tmp25 [V34,T02] ( 12, 27 ) long -> r8 "Inline stloc first use temp"
; V35 tmp26 [V35,T18] ( 2, 4.50) long -> r9 "Inline stloc first use temp"
-; V36 tmp27 [V36,T37] ( 3, 12 ) simd32 -> mm2 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V36 tmp27 [V36,T39] ( 3, 12 ) simd32 -> mm2 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
;* V37 tmp28 [V37 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
-; V38 tmp29 [V38,T34] ( 2, 16 ) simd32 -> mm0 "Spilling op1 side effects for HWIntrinsic"
+; V38 tmp29 [V38,T36] ( 2, 16 ) simd32 -> mm0 "Spilling op1 side effects for HWIntrinsic"
;* V39 tmp30 [V39 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
-;* V40 tmp31 [V40 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V41 tmp32 [V41 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V40 tmp31 [V40 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;* V41 tmp32 [V41 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V42 tmp33 [V42 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V43 tmp34 [V43 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V44 tmp35 [V44 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V45 tmp36 [V45 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V44 tmp35 [V44 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V45 tmp36 [V45 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V46 tmp37 [V46 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V47 tmp38 [V47 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V48 tmp39 [V48 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V49 tmp40 [V49 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V50 tmp41 [V50 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V51 tmp42 [V51 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V52 tmp43 [V52 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V53 tmp44 [V53 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V54 tmp45 [V54 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V55 tmp46 [V55 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-; V56 tmp47 [V56,T25] ( 3, 1.50) long -> rax "Inline return value spill temp"
-;* V57 tmp48 [V57,T27] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
-;* V58 tmp49 [V58 ] ( 0, 0 ) long -> zero-ref "Inline stloc first use temp"
-; V59 tmp50 [V59,T09] ( 5, 9.50) byref -> rcx single-def "Inline stloc first use temp"
-; V60 tmp51 [V60,T32] ( 14, 17.50) simd16 -> mm0 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-; V61 tmp52 [V61,T15] ( 5, 6 ) byref -> r8 single-def "Inline stloc first use temp"
-;* V62 tmp53 [V62 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
-; V63 tmp54 [V63,T03] ( 11, 26.50) long -> rax "Inline stloc first use temp"
-; V64 tmp55 [V64,T19] ( 2, 4.50) long -> r9 "Inline stloc first use temp"
-; V65 tmp56 [V65,T38] ( 3, 12 ) simd16 -> mm2 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V66 tmp57 [V66 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
-; V67 tmp58 [V67,T35] ( 2, 16 ) simd16 -> mm0 "Spilling op1 side effects for HWIntrinsic"
-;* V68 tmp59 [V68 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
-;* V69 tmp60 [V69 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V70 tmp61 [V70 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V71 tmp62 [V71 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V72 tmp63 [V72 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V73 tmp64 [V73 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V74 tmp65 [V74 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V75 tmp66 [V75 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V76 tmp67 [V76 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V77 tmp68 [V77 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V78 tmp69 [V78 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V79 tmp70 [V79 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V80 tmp71 [V80 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V81 tmp72 [V81 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
-; V82 tmp73 [V82,T28] ( 3, 24 ) simd16 -> mm1 "dup spill"
-;* V83 tmp74 [V83 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[uint]>
-;* V84 tmp75 [V84 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
-; V85 tmp76 [V85,T20] ( 3, 3 ) byref -> rcx single-def "Inlining Arg"
-; V86 tmp77 [V86,T21] ( 3, 3 ) byref -> rdx "Inlining Arg"
-;* V87 tmp78 [V87,T26] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V88 cse0 [V88,T10] ( 3, 8.50) long -> r10 "CSE #05: conservative"
-; V89 cse1 [V89,T39] ( 5, 6 ) simd64 -> mm1 "CSE #01: conservative"
-; V90 cse2 [V90,T40] ( 5, 6 ) simd32 -> mm1 "CSE #03: conservative"
-; V91 cse3 [V91,T41] ( 5, 6 ) simd16 -> mm1 "CSE #04: conservative"
-; V92 cse4 [V92,T42] ( 5, 6 ) simd64 -> mm2 "CSE #02: conservative"
-; V93 rat0 [V93,T29] ( 3, 24 ) simd64 -> mm4 "ReplaceWithLclVar is creating a new local variable"
+; V48 tmp39 [V48,T25] ( 3, 1.50) long -> rax "Inline return value spill temp"
+;* V49 tmp40 [V49,T27] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
+;* V50 tmp41 [V50 ] ( 0, 0 ) long -> zero-ref "Inline stloc first use temp"
+; V51 tmp42 [V51,T09] ( 5, 9.50) byref -> rcx single-def "Inline stloc first use temp"
+; V52 tmp43 [V52,T34] ( 14, 17.50) simd16 -> mm0 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V53 tmp44 [V53,T15] ( 5, 6 ) byref -> r8 single-def "Inline stloc first use temp"
+;* V54 tmp45 [V54 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+; V55 tmp46 [V55,T03] ( 11, 26.50) long -> rax "Inline stloc first use temp"
+; V56 tmp47 [V56,T19] ( 2, 4.50) long -> r9 "Inline stloc first use temp"
+; V57 tmp48 [V57,T40] ( 3, 12 ) simd16 -> mm2 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V58 tmp49 [V58 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+; V59 tmp50 [V59,T37] ( 2, 16 ) simd16 -> mm0 "Spilling op1 side effects for HWIntrinsic"
+;* V60 tmp51 [V60 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+;* V61 tmp52 [V61 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V62 tmp53 [V62 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V63 tmp54 [V63 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V64 tmp55 [V64 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V65 tmp56 [V65 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V66 tmp57 [V66 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V67 tmp58 [V67 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V68 tmp59 [V68 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V69 tmp60 [V69 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
+; V70 tmp61 [V70,T28] ( 3, 24 ) simd16 -> mm1 "dup spill"
+;* V71 tmp62 [V71 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[uint]>
+;* V72 tmp63 [V72 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
+; V73 tmp64 [V73,T20] ( 3, 3 ) byref -> rcx single-def "Inlining Arg"
+; V74 tmp65 [V74,T21] ( 3, 3 ) byref -> rdx "Inlining Arg"
+;* V75 tmp66 [V75,T26] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V76 cse0 [V76,T10] ( 3, 8.50) long -> r10 "CSE #05: conservative"
+; V77 cse1 [V77,T41] ( 5, 6 ) simd64 -> mm1 "CSE #01: conservative"
+; V78 cse2 [V78,T42] ( 5, 6 ) simd32 -> mm1 "CSE #03: conservative"
+; V79 cse3 [V79,T43] ( 5, 6 ) simd16 -> mm1 "CSE #04: conservative"
+; V80 cse4 [V80,T44] ( 5, 6 ) simd64 -> mm2 "CSE #02: conservative"
+; V81 rat0 [V81,T29] ( 3, 24 ) simd16 -> mm3 "ReplaceWithLclVar is creating a new local variable"
+; V82 rat1 [V82,T30] ( 3, 24 ) simd32 -> mm3 "ReplaceWithLclVar is creating a new local variable"
+; V83 rat2 [V83,T31] ( 3, 24 ) simd64 -> mm4 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
; NarrowUtf16ToAscii: entry and vector-width dispatch (IG01-IG05).
; Register map taken from the local-variable table: rdi = arg0, rsi = arg1, rdx = arg2,
; rax = loc0 (used as the running index and left in rax at the ret in IG29).
; In this report a leading '-' marks baseline codegen and a leading '+' marks diff codegen.
G_M6063_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M6063_IG02:
; rax = 0; when arg2 < 32 every vector path is skipped and control goes to IG21.
xor eax, eax
cmp rdx, 32
jb G_M6063_IG21
;; size=12 bbWeight=1 PerfScore 1.50
G_M6063_IG03:
; Test the first 8 bytes at [rdi] against an immediate mask; a nonzero result goes to IG27.
; NOTE(review): 0xD1FFAB1E looks like the disassembler's placeholder for the real immediate - confirm.
mov rcx, qword ptr [rdi]
mov r8, 0xD1FFAB1E
test rcx, r8
jne G_M6063_IG27
; arg2 >= 128 takes the 512-bit path: zmm1 = RWD00 (0xFF80 in every 16-bit lane).
; vptestmw sets a k1 bit for each word lane where (zmm1 AND zmm0) != 0; kortestd sets ZF when k1 is empty.
cmp rdx, 128
jb SHORT G_M6063_IG04
mov rcx, rdi
vmovups zmm0, zmmword ptr [rcx]
vmovups zmm1, zmmword ptr [reloc @RWD00]
vptestmw k1, zmm1, zmm0
kortestd k1, k1
;; NOP compensation instructions of 3 bytes.
je G_M6063_IG17
; First 512-bit vector has a masked bit set: restart at index 0 in the remainder code.
xor eax, eax
jmp G_M6063_IG21
align [3 bytes for IG08]
;; size=80 bbWeight=0.50 PerfScore 9.62
G_M6063_IG04:
; arg2 >= 64 takes the 256-bit path.
; Diff: only the vptest operand order changes. ZF is set when (op1 AND op2) == 0, which does not
; depend on operand order, and the je below consumes only ZF.
cmp rdx, 64
jb SHORT G_M6063_IG05
mov rcx, rdi
vmovups ymm0, ymmword ptr [rcx]
vmovups ymm1, ymmword ptr [reloc @RWD00]
- vptest ymm0, ymm1
+ vptest ymm1, ymm0
je G_M6063_IG11
xor eax, eax
jmp G_M6063_IG15
;; size=39 bbWeight=0.50 PerfScore 9.38
G_M6063_IG05:
; Otherwise (32 <= arg2 < 64) take the 128-bit path; same operand-order-only change as IG04.
mov rcx, rdi
vmovups xmm0, xmmword ptr [rcx]
vmovups xmm1, xmmword ptr [reloc @RWD00]
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
je SHORT G_M6063_IG06
xor eax, eax
jmp SHORT G_M6063_IG09
;; size=26 bbWeight=0.50 PerfScore 6.75
; NarrowUtf16ToAscii: 128-bit narrowing path (IG06-IG10).
; On entry rcx = rdi (copied in IG05), xmm0 = first 8 source words, xmm1 = RWD00 mask.
G_M6063_IG06:
; Narrow the first vector and store its low 8 bytes at [rsi]; rax = 8.
; When (rsi AND 8) == 0 a second 8-word vector at [rcx+0x10] is checked and stored at [r8+8];
; a masked bit in it exits through IG09 with rax = 8.
mov r8, rsi
vpackuswb xmm0, xmm0, xmm0
vmovsd qword ptr [r8], xmm0
mov eax, 8
test sil, 8
jne SHORT G_M6063_IG07
vmovups xmm0, xmmword ptr [rcx+0x10]
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
jne SHORT G_M6063_IG09
vpackuswb xmm0, xmm0, xmm0
vmovsd qword ptr [r8+0x08], xmm0
;; size=45 bbWeight=0.50 PerfScore 7.88
G_M6063_IG07:
; rax = 16 - (rsi AND 15); r9 = arg2 - 16 is the last index at which the loop may start an iteration.
mov rax, rsi
and rax, 15
neg rax
add rax, 16
lea r9, [rdx-0x10]
;; size=18 bbWeight=0.50 PerfScore 0.75
G_M6063_IG08:
; Main loop: two 8-word vectors per iteration, at [rcx+2*rax] and [rcx+2*(rax+8)].
; Baseline: xmm3 = xmm0 OR xmm2, then vptest against the mask in xmm1.
; Diff: copy xmm0 to xmm3, vpternlogd imm -88 (0xA8) computes (xmm3 OR xmm2) AND xmm1, then
; vptest xmm3, xmm3 checks for any set bit. Same ZF outcome, one more instruction:
; the group grows from 45 to 52 bytes and PerfScore from 69.33 to 71.00.
vmovups xmm0, xmmword ptr [rcx+2*rax]
lea r10, [rax+0x08]
vmovups xmm2, xmmword ptr [rcx+2*r10]
- vpor xmm3, xmm0, xmm2
- vptest xmm3, xmm1
+ vmovaps xmm3, xmm0
+ vpternlogd xmm3, xmm2, xmm1, -88
+ vptest xmm3, xmm3
jne SHORT G_M6063_IG10
; Both vectors passed the mask test: pack to 16 bytes, store at [r8+rax], advance by 16, loop while rax <= r9.
vpackuswb xmm0, xmm0, xmm2
vmovups xmmword ptr [r8+rax], xmm0
add rax, 16
cmp rax, r9
jbe SHORT G_M6063_IG08
- ;; size=45 bbWeight=4 PerfScore 69.33
+ ;; size=52 bbWeight=4 PerfScore 71.00
G_M6063_IG09:
; Common exit to the remainder code; the alignment padding for IG13 grows from 0 to 7 bytes in the diff.
jmp G_M6063_IG21
- align [0 bytes for IG13]
- ;; size=5 bbWeight=0.50 PerfScore 1.00
+ align [7 bytes for IG13]
+ ;; size=12 bbWeight=0.50 PerfScore 1.00
G_M6063_IG10:
; The pair failed the combined test: re-test the first vector alone. If it passes, store its
; 8 narrowed bytes and continue from rax = r10 (= rax + 8).
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
jne SHORT G_M6063_IG09
vpackuswb xmm0, xmm0, xmm0
vmovsd qword ptr [r8+rax], xmm0
mov rax, r10
jmp SHORT G_M6063_IG09
;; size=22 bbWeight=0.50 PerfScore 4.62
; NarrowUtf16ToAscii: 256-bit narrowing path (IG11-IG16).
; On entry rcx = rdi (copied in IG04), ymm0 = first 16 source words, ymm1 = RWD00 mask.
; vpermq imm -40 (0xD8) selects qwords 0,2,1,3 after each vpackuswb, before the store.
G_M6063_IG11:
; Narrow the first vector and store 16 bytes at [rsi]; r8 = 16.
; When (rsi AND 16) == 0 a second vector at [rcx+0x20] is checked and stored at [rax+0x10].
mov rax, rsi
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, -40
vmovups xmmword ptr [rax], xmm0
mov r8d, 16
test sil, 16
jne SHORT G_M6063_IG12
vmovups ymm0, ymmword ptr [rcx+0x20]
- vptest ymm0, ymm1
+ vptest ymm1, ymm0
jne SHORT G_M6063_IG14
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, -40
vmovups xmmword ptr [rax+0x10], xmm0
;; size=56 bbWeight=0.50 PerfScore 11.38
G_M6063_IG12:
; r8 = 32 - (rsi AND 31); r9 = arg2 - 32 is the last index at which the loop may start an iteration.
mov r8, rsi
and r8, 31
neg r8
add r8, 32
lea r9, [rdx-0x20]
;; size=18 bbWeight=0.50 PerfScore 0.75
G_M6063_IG13:
; Main loop: two 16-word vectors per iteration.
; Baseline: vpor + vptest against ymm1. Diff: vmovaps + vpternlogd imm -88 ((ymm3 OR ymm2) AND ymm1)
; + vptest ymm3, ymm3. Same ZF outcome; the group grows from 49 to 56 bytes, PerfScore 91.33 to 93.00.
vmovups ymm0, ymmword ptr [rcx+2*r8]
vmovups ymm2, ymmword ptr [rcx+2*r8+0x20]
- vpor ymm3, ymm0, ymm2
- vptest ymm3, ymm1
+ vmovaps ymm3, ymm0
+ vpternlogd ymm3, ymm2, ymm1, -88
+ vptest ymm3, ymm3
jne SHORT G_M6063_IG16
; Both vectors passed: pack to 32 bytes, store at [rax+r8], advance by 32, loop while r8 <= r9.
vpackuswb ymm0, ymm0, ymm2
vpermq ymm0, ymm0, -40
vmovups ymmword ptr [rax+r8], ymm0
add r8, 32
cmp r8, r9
jbe SHORT G_M6063_IG13
- ;; size=49 bbWeight=4 PerfScore 91.33
+ ;; size=56 bbWeight=4 PerfScore 93.00
G_M6063_IG14:
; Publish the running index (r8) as rax before leaving for the remainder code.
mov rax, r8
;; size=3 bbWeight=0.50 PerfScore 0.12
G_M6063_IG15:
jmp G_M6063_IG21
align [0 bytes for IG19]
;; size=5 bbWeight=0.50 PerfScore 1.00
G_M6063_IG16:
; The pair failed the combined test: re-test the first vector alone; if it passes, store its
; 16 narrowed bytes and advance r8 by 16.
- vptest ymm0, ymm1
+ vptest ymm1, ymm0
jne SHORT G_M6063_IG14
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, -40
vmovups xmmword ptr [rax+r8], xmm0
add r8, 16
jmp SHORT G_M6063_IG14
;; size=29 bbWeight=0.50 PerfScore 6.62
; NarrowUtf16ToAscii: 512-bit narrowing path (IG17-IG20).
; On entry rcx = rdi (copied in IG03), zmm0 = first 32 source words, zmm1 = RWD00 mask.
; The instructions here are identical in baseline and diff; only NOP-compensation notes and
; group sizes change (78 -> 81 and 73 -> 76 bytes).
G_M6063_IG17:
; Narrow the first vector and store 32 bytes at [rsi]; r8 = 32.
; zmm2 = RWD64, the qword index vector (0,2,4,6,1,3,5,7) applied by vpermq after each vpackuswb.
; When (rsi AND 32) == 0 a second vector at [rcx+0x40] is checked and stored at [rax+0x20].
mov rax, rsi
vpackuswb zmm0, zmm0, zmm0
vmovups zmm2, zmmword ptr [reloc @RWD64]
vpermq zmm0, zmm2, zmm0
vmovups ymmword ptr [rax], ymm0
mov r8d, 32
test sil, 32
jne SHORT G_M6063_IG18
vmovups zmm0, zmmword ptr [rcx+0x40]
vptestmw k1, zmm1, zmm0
kortestd k1, k1
+ ;; NOP compensation instructions of 3 bytes.
jne SHORT G_M6063_IG20
vpackuswb zmm0, zmm0, zmm0
vpermq zmm0, zmm2, zmm0
vmovups ymmword ptr [rax+0x20], ymm0
- ;; size=78 bbWeight=0.50 PerfScore 11.88
+ ;; size=81 bbWeight=0.50 PerfScore 11.88
G_M6063_IG18:
; r8 = 64 - (rsi AND 63); r9 = arg2 - 64 is the last index at which the loop may start an iteration.
mov r8, rsi
and r8, 63
neg r8
add r8, 64
lea r9, [rdx-0x40]
;; size=18 bbWeight=0.50 PerfScore 0.75
G_M6063_IG19:
; Main loop: two 32-word vectors per iteration. vpternlogd imm -88 computes (zmm4 OR zmm3) AND zmm1;
; vptestmw/kortestd then detect any nonzero word lane. A hit goes to IG26.
vmovups zmm0, zmmword ptr [rcx+2*r8]
vmovups zmm3, zmmword ptr [rcx+2*r8+0x40]
vmovaps zmm4, zmm0
vpternlogd zmm4, zmm3, zmm1, -88
vptestmw k1, zmm4, zmm4
kortestd k1, k1
+ ;; NOP compensation instructions of 3 bytes.
jne G_M6063_IG26
; Both vectors passed: pack to 64 bytes, store at [rax+r8], advance by 64, loop while r8 <= r9.
vpackuswb zmm0, zmm0, zmm3
vpermq zmm0, zmm2, zmm0
vmovups zmmword ptr [rax+r8], zmm0
add r8, 64
cmp r8, r9
jbe SHORT G_M6063_IG19
- ;; size=73 bbWeight=4 PerfScore 81.00
+ ;; size=76 bbWeight=4 PerfScore 81.00
G_M6063_IG20:
; Publish the running index (r8) as rax and fall through into IG21.
mov rax, r8
;; size=3 bbWeight=0.50 PerfScore 0.12
; NarrowUtf16ToAscii: remainder handling, 512-bit fallback and epilog (IG21-IG29).
; rax = elements already written; rdi is read with a 2-byte stride, rsi written with a 1-byte stride.
; NOTE(review): 0xD1FFAB1E looks like the disassembler's placeholder for the real immediate masks - confirm.
G_M6063_IG21:
; rdx = arg2 - rax (elements left). With fewer than 4 left go to IG23; otherwise
; r8 = rax + rdx - 4 is the last index at which a 4-element step may start.
; Diff: the alignment padding before IG22 grows from 0 to 3 bytes.
sub rdx, rax
cmp rdx, 4
jb SHORT G_M6063_IG23
lea r8, [rax+rdx-0x04]
- align [0 bytes for IG22]
- ;; size=14 bbWeight=0.50 PerfScore 1.25
+ align [3 bytes for IG22]
+ ;; size=17 bbWeight=0.50 PerfScore 1.38
G_M6063_IG22:
; 4 elements per iteration: load 8 bytes, mask-test them (a hit goes to IG27), pack to 4 bytes,
; store at [rsi+rax], advance by 4, loop while rax <= r8.
mov rcx, qword ptr [rdi+2*rax]
mov r9, 0xD1FFAB1E
test rcx, r9
jne G_M6063_IG27
vmovd xmm1, rcx
vpackuswb xmm2, xmm1, xmm1
vmovd dword ptr [rsi+rax], xmm2
add rax, 4
cmp rax, r8
jbe SHORT G_M6063_IG22
;; size=46 bbWeight=4 PerfScore 40.00
G_M6063_IG23:
; Bit 1 of the remaining count set: handle 2 elements from one 4-byte load (a mask hit goes to IG28).
test dl, 2
je SHORT G_M6063_IG24
mov r8d, dword ptr [rdi+2*rax]
test r8d, 0xD1FFAB1E
jne G_M6063_IG28
lea rcx, [rsi+rax]
mov byte ptr [rcx], r8b
shr r8d, 16
mov byte ptr [rcx+0x01], r8b
add rax, 2
;; size=41 bbWeight=0.50 PerfScore 3.88
G_M6063_IG24:
; Bit 0 of the remaining count set: load one 16-bit element; values above 127 exit without storing.
test dl, 1
je SHORT G_M6063_IG29
movzx r8, word ptr [rdi+2*rax]
cmp r8d, 127
ja SHORT G_M6063_IG29
;; size=16 bbWeight=0.50 PerfScore 2.25
G_M6063_IG25:
; Store the single element held in r8b and count it.
mov byte ptr [rsi+rax], r8b
inc rax
jmp SHORT G_M6063_IG29
;; size=9 bbWeight=0.50 PerfScore 1.62
G_M6063_IG26:
; Fallback for the 512-bit loop (IG19): re-test the first vector alone; if it passes, store its
; 32 narrowed bytes at [rax+r8] and advance r8 by 32 before rejoining at IG20.
vptestmw k1, zmm1, zmm0
kortestd k1, k1
jne G_M6063_IG20
vpackuswb zmm0, zmm0, zmm0
vpermq zmm0, zmm2, zmm0
vmovups ymmword ptr [rax+r8], ymm0
add r8, 32
jmp G_M6063_IG20
;; size=44 bbWeight=0.50 PerfScore 6.12
G_M6063_IG27:
; rcx holds 4 elements of which at least one hit the mask. Test the low two (r8d = ecx); if they
; pass, store both, advance rax by 2 and continue with the high two in r8d.
mov r8d, ecx
test r8d, 0xD1FFAB1E
jne SHORT G_M6063_IG28
lea rdx, [rsi+rax]
mov byte ptr [rdx], r8b
shr r8d, 16
mov byte ptr [rdx+0x01], r8b
shr rcx, 32
mov r8d, ecx
add rax, 2
;; size=38 bbWeight=0.50 PerfScore 2.75
G_M6063_IG28:
; If the low 16-bit element of r8d has no bit in 0xFF80 set, IG25 stores it; otherwise fall through to the epilog.
test r8d, 0xFF80
je SHORT G_M6063_IG25
;; size=9 bbWeight=0.50 PerfScore 0.62
G_M6063_IG29:
; Epilog: clear upper vector state, restore rbp, return with the element count in rax.
vzeroupper
pop rbp
ret
;; size=5 bbWeight=1 PerfScore 2.50
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
RWD64 dq 0000000000000000h, 0000000000000002h, 0000000000000004h, 0000000000000006h, 0000000000000001h, 0000000000000003h, 0000000000000005h, 0000000000000007h
-; Total bytes of code 850, prolog size 4, PerfScore 378.04, instruction count 200, allocated bytes for code 859 (MethodHash=53fae850) for method System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 880, prolog size 4, PerfScore 381.50, instruction count 202, allocated bytes for code 883 (MethodHash=53fae850) for method System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong (FullOpts)
14 (4.02 % of base) - System.Text.Ascii:IsValidCore[short](byref,int):ubyte
; Assembly listing for method System.Text.Ascii:IsValidCore[short](byref,int):ubyte (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 2 single block inlinees; 6 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T01] ( 18, 17 ) byref -> rdi
; V01 arg1 [V01,T04] ( 11, 7 ) int -> rsi single-def
; V02 loc0 [V02,T08] ( 5, 2.50) byref -> rcx single-def
;* V03 loc1 [V03 ] ( 0, 0 ) int -> zero-ref
;* V04 loc2 [V04 ] ( 0, 0 ) long -> zero-ref
; V05 loc3 [V05,T03] ( 5, 16.50) long -> rax
;* V06 loc4 [V06 ] ( 0, 0 ) long -> zero-ref
; V07 loc5 [V07,T02] ( 6, 17 ) long -> rdx
;* V08 loc6 [V08 ] ( 0, 0 ) long -> zero-ref
; V09 loc7 [V09,T05] ( 4, 5.50) long -> rsi
; V10 loc8 [V10,T00] ( 5, 20 ) byref -> rax
;* V11 loc9 [V11 ] ( 0, 0 ) long -> zero-ref
;* V12 loc10 [V12 ] ( 0, 0 ) long -> zero-ref
;* V13 loc11 [V13 ] ( 0, 0 ) long -> zero-ref
;* V14 loc12 [V14 ] ( 0, 0 ) byref -> zero-ref
;# V15 OutArgs [V15 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V16 tmp1 [V16 ] ( 0, 0 ) int -> zero-ref
; V17 tmp2 [V17,T10] ( 2, 1 ) ubyte -> rdx "Inline return value spill temp"
;* V18 tmp3 [V18 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
;* V19 tmp4 [V19 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V20 tmp5 [V20 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[short]>
;* V21 tmp6 [V21 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V22 tmp7 [V22 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
;* V23 tmp8 [V23 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V24 tmp9 [V24,T13] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
+; V24 tmp9 [V24,T15] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
;* V25 tmp10 [V25 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
; V26 tmp11 [V26,T11] ( 2, 16 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
;* V27 tmp12 [V27 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V28 tmp13 [V28,T14] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
+; V28 tmp13 [V28,T16] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
; V29 tmp14 [V29,T06] ( 5, 5 ) int -> rdx "Single return block return value"
; V30 tmp15 [V30,T09] ( 2, 2 ) long -> rax "Cast away GC"
-; V31 cse0 [V31,T12] ( 7, 7 ) simd32 -> mm1 multi-def "CSE #04: aggressive"
+; V31 cse0 [V31,T12] ( 7, 7 ) simd32 -> mm2 multi-def "CSE #04: aggressive"
; V32 cse1 [V32,T07] ( 3, 5 ) long -> rsi "CSE #01: aggressive"
+; V33 rat0 [V33,T13] ( 3, 3 ) simd16 -> mm0 "ReplaceWithLclVar is creating a new local variable"
+; V34 rat1 [V34,T14] ( 3, 3 ) simd32 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
; IsValidCore[short]: entry and element-wise loop for short inputs (IG01-IG06).
; Register map taken from the local-variable table: rdi = arg0 (byref), rsi = arg1 (int).
; This range is identical in baseline and diff.
G_M12635_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M12635_IG02:
; arg1 >= 8 (signed) takes the vector paths starting at IG07.
cmp esi, 8
jge SHORT G_M12635_IG07
;; size=5 bbWeight=1 PerfScore 1.25
G_M12635_IG03:
; arg1 >= 4 goes to the two-qword check at IG16; otherwise set up rax = 0 and rsi = zero-extended arg1.
; A zero count skips the loop and returns 1.
movsxd rax, esi
cmp rax, 4
jge G_M12635_IG16
xor eax, eax
mov esi, esi
test rsi, rsi
je SHORT G_M12635_IG05
align [1 bytes for IG04]
;; size=23 bbWeight=0.50 PerfScore 1.75
G_M12635_IG04:
; One 16-bit element per iteration: a value above 127 (unsigned compare) returns 0 through IG11.
cmp word ptr [rdi+2*rax], 127
ja G_M12635_IG11
inc rax
cmp rax, rsi
jb SHORT G_M12635_IG04
;; size=19 bbWeight=4 PerfScore 22.00
G_M12635_IG05:
; Every element passed: return 1.
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M12635_IG06:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; IsValidCore[short]: two-vector checks for 8..32 elements (IG07-IG08).
; rcx = rdi + 2*arg1, the address just past the last element. Each check combines the vector at the
; start of the buffer with the vector ending at rcx and tests the result against the 0xFF80 lane mask.
; Diff: the baseline folds the tail load into vpor and then tests against the mask; the diff loads
; the tail vector into a register, uses vpternlogd imm -88 to form (head OR tail) AND mask, and then
; tests that result against itself. ZF, consumed by sete, is the same either way.
G_M12635_IG07:
; arg1 <= 16: 128-bit check with mask RWD00. Group grows from 41 to 48 bytes, PerfScore 8.62 to 9.12.
movsxd rax, esi
lea rcx, bword ptr [rdi+2*rax]
cmp esi, 16
jg SHORT G_M12635_IG08
vmovups xmm0, xmmword ptr [rdi]
- vpor xmm0, xmm0, xmmword ptr [rcx-0x10]
- vptest xmm0, xmmword ptr [reloc @RWD00]
+ vmovups xmm1, xmmword ptr [rcx-0x10]
+ vpternlogd xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+ vptest xmm0, xmm0
sete dl
movzx rdx, dl
jmp G_M12635_IG17
align [0 bytes for IG10]
- ;; size=41 bbWeight=0.50 PerfScore 8.62
+ ;; size=48 bbWeight=0.50 PerfScore 9.12
G_M12635_IG08:
; arg1 <= 32: 256-bit check with mask RWD32. The mask moves from ymm1 to ymm2 in the diff.
; Group grows from 38 to 45 bytes, PerfScore 10.75 to 12.00.
cmp esi, 32
jg SHORT G_M12635_IG09
vmovups ymm0, ymmword ptr [rdi]
- vpor ymm0, ymm0, ymmword ptr [rcx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vmovups ymm1, ymmword ptr [rcx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vpternlogd ymm0, ymm1, ymm2, -88
+ vptest ymm0, ymm0
sete dl
movzx rdx, dl
jmp G_M12635_IG17
- ;; size=38 bbWeight=0.50 PerfScore 10.75
+ ;; size=45 bbWeight=0.50 PerfScore 12.00
; IsValidCore[short]: four-vector loop for more than 32 elements (IG09-IG15).
; Each step ORs four 256-bit vectors (vpternlogd imm -2 is a three-way OR, then vpor adds the fourth)
; and tests the result against the RWD32 mask (0xFF80 in every 16-bit lane).
; Diff in this range is register renaming (the mask and first-load register move from ymm1 to ymm2,
; the second loop load from ymm2 to ymm1) plus swapped vptest operands. ZF is unaffected by the
; swap, and the group sizes are unchanged.
G_M12635_IG09:
; arg1 <= 64 goes directly to the final overlapping check at IG15.
; Otherwise check the first 64 elements, then rdx = 64 - ((rdi AND 31) >> 1) is the first loop index
; and rsi = arg1 - 64 is the loop bound.
cmp esi, 64
jle SHORT G_M12635_IG15
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rdi+0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rdi+0x60]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rdi+0x60]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
jne SHORT G_M12635_IG11
mov rax, rdi
and rax, 31
shr rax, 1
mov rdx, rax
neg rdx
add rdx, 64
movsxd rsi, esi
add rsi, -64
cmp rdx, rsi
jae SHORT G_M12635_IG14
;; size=74 bbWeight=0.50 PerfScore 15.38
G_M12635_IG10:
; Loop body: 64 elements at rdi + 2*rdx; the mask register set up in IG09 stays live across iterations.
lea rax, bword ptr [rdi+2*rdx]
vmovups ymm0, ymmword ptr [rax]
- vmovups ymm2, ymmword ptr [rax+0x20]
- vpternlogd ymm0, ymm2, ymmword ptr [rax+0x40], -2
+ vmovups ymm1, ymmword ptr [rax+0x20]
+ vpternlogd ymm0, ymm1, ymmword ptr [rax+0x40], -2
vpor ymm0, ymm0, ymmword ptr [rax+0x60]
- vptest ymm0, ymm1
+ vptest ymm2, ymm0
je SHORT G_M12635_IG13
;; size=33 bbWeight=4 PerfScore 90.00
G_M12635_IG11:
; A masked bit was found: return 0.
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M12635_IG12:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M12635_IG13:
; Advance by 64 elements and loop while rdx < rsi.
add rdx, 64
cmp rdx, rsi
jb SHORT G_M12635_IG10
;; size=9 bbWeight=4 PerfScore 6.00
G_M12635_IG14:
; rdi = rdi + 2*(arg1 - 64): start of the final 64 elements.
lea rdi, bword ptr [rdi+2*rsi]
;; size=4 bbWeight=0.50 PerfScore 0.25
G_M12635_IG15:
; Final check: two vectors from rdi plus the two vectors ending at rcx (rcx was set in IG07).
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rcx-0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rcx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rcx-0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rcx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
sete dl
movzx rdx, dl
jmp SHORT G_M12635_IG17
;; size=43 bbWeight=0.50 PerfScore 14.12
; IsValidCore[short]: 4..7 element check and shared return (IG16-IG18). Identical in baseline and diff.
G_M12635_IG16:
; OR the first 8 bytes with the 8 bytes ending at rdi + 2*arg1, then test against an immediate mask;
; dl = 1 when no masked bit is set.
; NOTE(review): 0xD1FFAB1E looks like the disassembler's placeholder for the real immediate - confirm.
mov rdx, qword ptr [rdi]
movsxd rax, esi
or rdx, qword ptr [rdi+2*rax-0x08]
mov rax, 0xD1FFAB1E
test rdx, rax
sete dl
movzx rdx, dl
;; size=30 bbWeight=0.50 PerfScore 3.50
G_M12635_IG17:
; Shared return block: move the 0/1 result from dl into rax.
movzx rax, dl
;; size=3 bbWeight=0.50 PerfScore 0.12
G_M12635_IG18:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
RWD16 dd 00000000h, 00000000h, 00000000h, 00000000h
RWD32 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 348, prolog size 4, PerfScore 179.00, instruction count 95, allocated bytes for code 348 (MethodHash=fb38cea4) for method System.Text.Ascii:IsValidCore[short](byref,int):ubyte (FullOpts)
+; Total bytes of code 362, prolog size 4, PerfScore 180.75, instruction count 97, allocated bytes for code 362 (MethodHash=fb38cea4) for method System.Text.Ascii:IsValidCore[short](byref,int):ubyte (FullOpts)
14 (3.65 % of base) - System.Text.Ascii:IsValidCore[ubyte](byref,int):ubyte
; Assembly listing for method System.Text.Ascii:IsValidCore[ubyte](byref,int):ubyte (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 3 single block inlinees; 6 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 20, 18 ) byref -> rdi
; V01 arg1 [V01,T04] ( 13, 8 ) int -> rsi single-def
; V02 loc0 [V02,T08] ( 5, 2.50) byref -> rax single-def
;* V03 loc1 [V03 ] ( 0, 0 ) int -> zero-ref
;* V04 loc2 [V04 ] ( 0, 0 ) long -> zero-ref
; V05 loc3 [V05,T03] ( 5, 16.50) long -> rax
;* V06 loc4 [V06 ] ( 0, 0 ) long -> zero-ref
; V07 loc5 [V07,T02] ( 6, 17 ) long -> rcx
;* V08 loc6 [V08 ] ( 0, 0 ) long -> zero-ref
; V09 loc7 [V09,T06] ( 4, 5.50) long -> rsi
; V10 loc8 [V10,T01] ( 5, 20 ) byref -> rdx
;* V11 loc9 [V11 ] ( 0, 0 ) long -> zero-ref
;* V12 loc10 [V12 ] ( 0, 0 ) long -> zero-ref
;* V13 loc11 [V13 ] ( 0, 0 ) long -> zero-ref
;* V14 loc12 [V14 ] ( 0, 0 ) byref -> zero-ref
;# V15 OutArgs [V15 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V16 tmp1 [V16 ] ( 0, 0 ) int -> zero-ref
;* V17 tmp2 [V17 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V18 tmp3 [V18 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V19 tmp4 [V19 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
;* V20 tmp5 [V20 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V21 tmp6 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V22 tmp7 [V22 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V23 tmp8 [V23 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V24 tmp9 [V24 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V25 tmp10 [V25,T12] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+; V25 tmp10 [V25,T14] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V26 tmp11 [V26 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
; V27 tmp12 [V27,T10] ( 2, 16 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V28 tmp13 [V28 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V29 tmp14 [V29,T13] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+; V29 tmp14 [V29,T15] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V30 tmp15 [V30,T05] ( 6, 6 ) int -> rcx "Single return block return value"
; V31 tmp16 [V31,T09] ( 2, 2 ) long -> rcx "Cast away GC"
-; V32 cse0 [V32,T11] ( 7, 7 ) simd32 -> mm1 multi-def "CSE #04: aggressive"
+; V32 cse0 [V32,T11] ( 7, 7 ) simd32 -> mm2 multi-def "CSE #04: aggressive"
; V33 cse1 [V33,T07] ( 3, 5 ) long -> rcx "CSE #01: aggressive"
+; V34 rat0 [V34,T12] ( 3, 3 ) simd16 -> mm0 "ReplaceWithLclVar is creating a new local variable"
+; V35 rat1 [V35,T13] ( 3, 3 ) simd32 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
; IsValidCore[ubyte]: entry, 4..7 element check and shared return (IG01-IG05).
; Register map taken from the local-variable table: rdi = arg0 (byref), rsi = arg1 (int).
; This range is identical in baseline and diff.
G_M58774_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M58774_IG02:
; arg1 >= 16 (signed) takes the vector paths starting at IG06.
cmp esi, 16
jge SHORT G_M58774_IG06
;; size=5 bbWeight=1 PerfScore 1.25
G_M58774_IG03:
; arg1 >= 8 goes to IG12 and arg1 < 4 to IG13 (neither is shown in this excerpt).
; For 4..7 elements: OR the first 4 bytes with the 4 bytes ending at rdi + arg1 and test against an
; immediate mask; cl = 1 when no masked bit is set.
; NOTE(review): 0xD1FFAB1E looks like the disassembler's placeholder for the real immediate - confirm.
movsxd rax, esi
cmp rax, 8
jge G_M58774_IG12
cmp esi, 4
jl G_M58774_IG13
mov eax, dword ptr [rdi]
add esi, -4
movsxd rcx, esi
or eax, dword ptr [rdi+rcx]
test eax, 0xD1FFAB1E
sete cl
movzx rcx, cl
;; size=44 bbWeight=0.50 PerfScore 4.88
G_M58774_IG04:
; Shared return block: move the 0/1 result from cl into rax.
movzx rax, cl
;; size=3 bbWeight=0.50 PerfScore 0.12
G_M58774_IG05:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; IsValidCore[ubyte]: two-vector checks for 16..64 elements (IG06-IG07).
; rax = rdi + arg1, the address just past the last byte. Each check combines the vector at the start
; of the buffer with the vector ending at rax and tests the result against a mask constant
; (RWD00 / RWD32; their definitions are not shown in this excerpt).
; Diff: the baseline folds the tail load into vpor and then tests against the mask; the diff loads
; the tail vector into a register, uses vpternlogd imm -88 to form (head OR tail) AND mask, and then
; tests that result against itself. ZF, consumed by sete, is the same either way.
G_M58774_IG06:
; arg1 <= 32: 128-bit check. Group grows from 37 to 44 bytes, PerfScore 8.50 to 9.00.
movsxd rax, esi
add rax, rdi
cmp esi, 32
jg SHORT G_M58774_IG07
vmovups xmm0, xmmword ptr [rdi]
- vpor xmm0, xmm0, xmmword ptr [rax-0x10]
- vptest xmm0, xmmword ptr [reloc @RWD00]
+ vmovups xmm1, xmmword ptr [rax-0x10]
+ vpternlogd xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+ vptest xmm0, xmm0
sete cl
movzx rcx, cl
jmp SHORT G_M58774_IG04
align [0 bytes for IG09]
- ;; size=37 bbWeight=0.50 PerfScore 8.50
+ ;; size=44 bbWeight=0.50 PerfScore 9.00
G_M58774_IG07:
; arg1 <= 64: 256-bit check. The mask moves from ymm1 to ymm2 in the diff.
; Group grows from 35 to 42 bytes, PerfScore 10.75 to 12.00.
cmp esi, 64
jg SHORT G_M58774_IG08
vmovups ymm0, ymmword ptr [rdi]
- vpor ymm0, ymm0, ymmword ptr [rax-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vmovups ymm1, ymmword ptr [rax-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vpternlogd ymm0, ymm1, ymm2, -88
+ vptest ymm0, ymm0
sete cl
movzx rcx, cl
jmp SHORT G_M58774_IG04
- ;; size=35 bbWeight=0.50 PerfScore 10.75
+ ;; size=42 bbWeight=0.50 PerfScore 12.00
G_M58774_IG08:
cmp esi, 128
jle SHORT G_M58774_IG11
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rdi+0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rdi+0x60]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rdi+0x60]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
jne G_M58774_IG17
mov rcx, rdi
and rcx, 31
neg rcx
add rcx, 128
movsxd rsi, esi
add rsi, -128
cmp rcx, rsi
jae SHORT G_M58774_IG10
;; size=78 bbWeight=0.50 PerfScore 15.00
G_M58774_IG09:
lea rdx, bword ptr [rdi+rcx]
vmovups ymm0, ymmword ptr [rdx]
- vmovups ymm2, ymmword ptr [rdx+0x20]
- vpternlogd ymm0, ymm2, ymmword ptr [rdx+0x40], -2
+ vmovups ymm1, ymmword ptr [rdx+0x20]
+ vpternlogd ymm0, ymm1, ymmword ptr [rdx+0x40], -2
vpor ymm0, ymm0, ymmword ptr [rdx+0x60]
- vptest ymm0, ymm1
+ vptest ymm2, ymm0
jne G_M58774_IG17
add rcx, 128
cmp rcx, rsi
jb SHORT G_M58774_IG09
;; size=49 bbWeight=4 PerfScore 96.00
G_M58774_IG10:
add rdi, rsi
;; size=3 bbWeight=0.50 PerfScore 0.12
G_M58774_IG11:
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rax-0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rax-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rax-0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rax-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
sete cl
movzx rcx, cl
jmp G_M58774_IG04
align [0 bytes for IG14]
;; size=46 bbWeight=0.50 PerfScore 14.12
G_M58774_IG12:
mov rcx, qword ptr [rdi]
movsxd rsi, esi
or rcx, qword ptr [rdi+rsi-0x08]
mov rdi, 0xD1FFAB1E
test rcx, rdi
sete cl
movzx rcx, cl
jmp G_M58774_IG04
;; size=35 bbWeight=0.50 PerfScore 4.50
G_M58774_IG13:
xor eax, eax
mov ecx, esi
test rcx, rcx
je SHORT G_M58774_IG15
;; size=9 bbWeight=0.50 PerfScore 0.88
G_M58774_IG14:
cmp byte ptr [rdi+rax], 127
ja SHORT G_M58774_IG17
inc rax
cmp rax, rcx
jb SHORT G_M58774_IG14
;; size=14 bbWeight=4 PerfScore 22.00
G_M58774_IG15:
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M58774_IG16:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M58774_IG17:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M58774_IG18:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
RWD00 dq 8080808080808080h, 8080808080808080h
RWD16 dd 00000000h, 00000000h, 00000000h, 00000000h
RWD32 dq 8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h
-; Total bytes of code 384, prolog size 4, PerfScore 183.38, instruction count 103, allocated bytes for code 384 (MethodHash=d69a1a69) for method System.Text.Ascii:IsValidCore[ubyte](byref,int):ubyte (FullOpts)
+; Total bytes of code 398, prolog size 4, PerfScore 185.12, instruction count 105, allocated bytes for code 398 (MethodHash=d69a1a69) for method System.Text.Ascii:IsValidCore[ubyte](byref,int):ubyte (FullOpts)
12 (3.40 % of base) - System.Text.Ascii:IsValidCore[int](byref,int):ubyte
; Assembly listing for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 2 single block inlinees; 7 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 18, 13 ) byref -> rdi
; V01 arg1 [V01,T03] ( 9, 6 ) int -> rsi single-def
; V02 loc0 [V02,T07] ( 5, 2.50) byref -> rdx single-def
;* V03 loc1 [V03 ] ( 0, 0 ) int -> zero-ref
;* V04 loc2 [V04 ] ( 0, 0 ) long -> zero-ref
;* V05 loc3 [V05,T10] ( 0, 0 ) long -> zero-ref
;* V06 loc4 [V06 ] ( 0, 0 ) long -> zero-ref
; V07 loc5 [V07,T01] ( 6, 17 ) long -> r8
;* V08 loc6 [V08 ] ( 0, 0 ) long -> zero-ref
; V09 loc7 [V09,T04] ( 4, 5.50) long -> rcx
; V10 loc8 [V10,T00] ( 5, 20 ) byref -> rax
;* V11 loc9 [V11 ] ( 0, 0 ) long -> zero-ref
;* V12 loc10 [V12 ] ( 0, 0 ) long -> zero-ref
;* V13 loc11 [V13 ] ( 0, 0 ) long -> zero-ref
;* V14 loc12 [V14 ] ( 0, 0 ) byref -> zero-ref
;# V15 OutArgs [V15 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V16 tmp1 [V16 ] ( 0, 0 ) int -> zero-ref
;* V17 tmp2 [V17 ] ( 0, 0 ) int -> zero-ref ld-addr-op "Inlining Arg"
; V18 tmp3 [V18,T09] ( 2, 1 ) ubyte -> r8 "Inline return value spill temp"
;* V19 tmp4 [V19 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
;* V20 tmp5 [V20 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V21 tmp6 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[int]>
;* V22 tmp7 [V22 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V23 tmp8 [V23 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
;* V24 tmp9 [V24 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V25 tmp10 [V25,T13] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+; V25 tmp10 [V25,T15] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
;* V26 tmp11 [V26 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
; V27 tmp12 [V27,T11] ( 2, 16 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
;* V28 tmp13 [V28 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V29 tmp14 [V29,T14] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+; V29 tmp14 [V29,T16] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
; V30 tmp15 [V30,T05] ( 5, 5 ) int -> r8 "Single return block return value"
; V31 tmp16 [V31,T08] ( 2, 2 ) long -> rax "Cast away GC"
-; V32 cse0 [V32,T12] ( 7, 7 ) simd32 -> mm1 multi-def "CSE #03: aggressive"
+; V32 cse0 [V32,T12] ( 7, 7 ) simd32 -> mm2 multi-def "CSE #03: aggressive"
; V33 cse1 [V33,T06] ( 6, 3 ) long -> rcx multi-def "CSE #01: aggressive"
+; V34 rat0 [V34,T13] ( 3, 3 ) simd16 -> mm0 "ReplaceWithLclVar is creating a new local variable"
+; V35 rat1 [V35,T14] ( 3, 3 ) simd32 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
; Diff of the JIT code for System.Text.Ascii:IsValidCore[int]. Lines starting with '-' are the
; base codegen, lines starting with '+' are the new codegen, unmarked lines are common to both.
; Register roles (from this listing's local-variable table): rdi = arg0 (byref to the data),
; rsi = arg1 (int count of 4-byte elements); the ubyte result is returned in eax.
G_M8346_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
; Counts of 4 or more take the vector paths starting at IG04.
G_M8346_IG02:
cmp esi, 4
jge SHORT G_M8346_IG04
;; size=5 bbWeight=1 PerfScore 1.25
; Count < 4: counts 2..3 use the 64-bit scalar check (IG13), count 0 returns 1 (IG16);
; any remaining count (1 or negative) reaches the throw block IG18.
G_M8346_IG03:
movsxd rcx, esi
cmp rcx, 2
jge G_M8346_IG13
test esi, esi
je G_M8346_IG16
jmp G_M8346_IG18
;; size=26 bbWeight=0.50 PerfScore 2.38
; rcx = sign-extended count, rdx = rdi + 4*count (address just past the last element).
; Counts 4..8: combine the first and the last 16 bytes and test them against the RWD00 mask.
; Base: vpor merges the two loads, then vptest ANDs the result with the mask read from memory.
; New: the last 16 bytes are loaded into xmm1 and vpternlogd with imm8 -88 (0xA8, i.e. (A | B) & C)
; folds the OR and the mask AND into xmm0; vptest xmm0, xmm0 then sets ZF when that result is zero.
; sete produces 1 when no masked bit is set. The group grows from 45 to 50 bytes.
G_M8346_IG04:
movsxd rcx, esi
lea rdx, bword ptr [rdi+4*rcx]
cmp esi, 8
jg SHORT G_M8346_IG05
vmovups xmm0, xmmword ptr [rdi]
- vpor xmm0, xmm0, xmmword ptr [rdx-0x10]
- vptest xmm0, xmmword ptr [reloc @RWD00]
+ vmovups xmm1, xmmword ptr [rdx-0x10]
+ vpternlogd xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+ vptest xmm0, xmm0
sete r8b
movzx r8, r8b
jmp G_M8346_IG14
- align [2 bytes for IG07]
- ;; size=45 bbWeight=0.50 PerfScore 8.62
+ align [0 bytes for IG07]
+ ;; size=50 bbWeight=0.50 PerfScore 9.12
; Counts 9..16: same check on the first and last 32 bytes with the RWD32 mask.
; Base: vpor, then vptest against the mask in ymm1. New: the mask is loaded into ymm2 and
; vpternlogd -88 computes (ymm0 | ymm1) & ymm2 before the self-vptest. Size goes from 40 to 47 bytes.
G_M8346_IG05:
cmp esi, 16
jg SHORT G_M8346_IG06
vmovups ymm0, ymmword ptr [rdi]
- vpor ymm0, ymm0, ymmword ptr [rdx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vmovups ymm1, ymmword ptr [rdx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vpternlogd ymm0, ymm1, ymm2, -88
+ vptest ymm0, ymm0
sete r8b
movzx r8, r8b
jmp G_M8346_IG14
- ;; size=40 bbWeight=0.50 PerfScore 10.75
+ ;; size=47 bbWeight=0.50 PerfScore 12.00
; Counts 17..32 go straight to the tail check in IG12.
; Larger counts: OR the first four 32-byte vectors (vpternlogd imm8 -2 = 0xFE, i.e. A | B | C,
; followed by vpor) and test against the RWD32 mask; a non-zero result returns 0 via IG08.
; The diff in this group is register allocation only (the mask moves from ymm1 to ymm2) plus
; swapped vptest operands, which leaves ZF unchanged because the AND is commutative.
; After the test: rax = (rdi & 31) >> 2 is the offset of rdi within a 32-byte unit, in elements;
; r8 = 32 - rax is the first loop index and rcx = count - 32 is the loop bound.
G_M8346_IG06:
cmp esi, 32
jle SHORT G_M8346_IG12
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rdi+0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rdi+0x60]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rdi+0x60]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
jne SHORT G_M8346_IG08
mov rax, rdi
and rax, 31
shr rax, 2
mov r8, rax
neg r8
add r8, 32
add rcx, -32
cmp r8, rcx
jae SHORT G_M8346_IG11
;; size=72 bbWeight=0.50 PerfScore 15.25
; Loop body: OR four 32-byte vectors starting at element index r8 and test them against the
; mask still held in ymm2 (new) / ymm1 (base); continue at IG10 when no masked bit is set,
; otherwise fall through to the failure exit.
G_M8346_IG07:
lea rax, bword ptr [rdi+4*r8]
vmovups ymm0, ymmword ptr [rax]
- vmovups ymm2, ymmword ptr [rax+0x20]
- vpternlogd ymm0, ymm2, ymmword ptr [rax+0x40], -2
+ vmovups ymm1, ymmword ptr [rax+0x20]
+ vpternlogd ymm0, ymm1, ymmword ptr [rax+0x40], -2
vpor ymm0, ymm0, ymmword ptr [rax+0x60]
- vptest ymm0, ymm1
+ vptest ymm2, ymm0
je SHORT G_M8346_IG10
;; size=33 bbWeight=4 PerfScore 90.00
; Failure exit: return 0.
G_M8346_IG08:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M8346_IG09:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Advance by 32 elements (128 bytes) and loop while r8 < rcx (unsigned compare).
G_M8346_IG10:
add r8, 32
cmp r8, rcx
jb SHORT G_M8346_IG07
;; size=9 bbWeight=4 PerfScore 6.00
; rdi = address of element (count - 32), i.e. 128 bytes before the end address held in rdx.
G_M8346_IG11:
lea rdi, bword ptr [rdi+4*rcx]
;; size=4 bbWeight=0.50 PerfScore 0.25
; Tail check: OR the two vectors at rdi with the last two vectors before rdx, then test against
; the mask. Apart from the ymm1/ymm2 renaming and the vptest operand order, base and new match.
G_M8346_IG12:
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rdx-0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rdx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rdx-0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rdx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
sete r8b
movzx r8, r8b
jmp SHORT G_M8346_IG14
;; size=45 bbWeight=0.50 PerfScore 14.12
; Counts 2..3: OR the first 8 bytes with the last 8 bytes and test against a 64-bit immediate.
; NOTE(review): the immediate is printed as 0xD1FFAB1E, presumably a placeholder; confirm the real value.
G_M8346_IG13:
mov r8, qword ptr [rdi]
or r8, qword ptr [rdi+4*rcx-0x08]
mov rax, 0xD1FFAB1E
test r8, rax
sete r8b
movzx r8, r8b
;; size=29 bbWeight=0.50 PerfScore 3.38
; Shared return path: r8b holds the 0/1 result computed by the block that jumped here.
G_M8346_IG14:
movzx rax, r8b
;; size=4 bbWeight=0.50 PerfScore 0.12
G_M8346_IG15:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Count == 0: return 1.
G_M8346_IG16:
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M8346_IG17:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Reached from IG03 for a non-zero count below 2; calls ThrowNotSupportedException (bbWeight=0).
G_M8346_IG18:
cmp dword ptr [rdi], edi
mov rax, 0xD1FFAB1E ; code for System.ThrowHelper:ThrowNotSupportedException()
call [rax]System.ThrowHelper:ThrowNotSupportedException()
int3
;; size=15 bbWeight=0 PerfScore 0.00
; Read-only data: RWD00 and RWD32 are the 16-byte and 32-byte masks (0xFF80 in every 16-bit lane)
; referenced by the vptest / vpternlogd checks above.
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
RWD16 dd 00000000h, 00000000h, 00000000h, 00000000h
RWD32 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 353, prolog size 4, PerfScore 157.38, instruction count 90, allocated bytes for code 353 (MethodHash=10a8df65) for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
+; Total bytes of code 365, prolog size 4, PerfScore 159.12, instruction count 92, allocated bytes for code 365 (MethodHash=10a8df65) for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
11 (3.12 % of base) - System.Text.Ascii:IsValidCore[double](byref,int):ubyte
; Assembly listing for method System.Text.Ascii:IsValidCore[double](byref,int):ubyte (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 2 single block inlinees; 7 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 18, 13 ) byref -> rdi
; V01 arg1 [V01,T03] ( 9, 6 ) int -> rsi single-def
; V02 loc0 [V02,T07] ( 5, 2.50) byref -> rdx single-def
;* V03 loc1 [V03 ] ( 0, 0 ) int -> zero-ref
;* V04 loc2 [V04 ] ( 0, 0 ) long -> zero-ref
;* V05 loc3 [V05,T10] ( 0, 0 ) long -> zero-ref
;* V06 loc4 [V06 ] ( 0, 0 ) long -> zero-ref
; V07 loc5 [V07,T01] ( 6, 17 ) long -> r8
;* V08 loc6 [V08 ] ( 0, 0 ) long -> zero-ref
; V09 loc7 [V09,T04] ( 4, 5.50) long -> rcx
; V10 loc8 [V10,T00] ( 5, 20 ) byref -> rax
;* V11 loc9 [V11 ] ( 0, 0 ) long -> zero-ref
;* V12 loc10 [V12 ] ( 0, 0 ) long -> zero-ref
;* V13 loc11 [V13 ] ( 0, 0 ) long -> zero-ref
;* V14 loc12 [V14 ] ( 0, 0 ) byref -> zero-ref
;# V15 OutArgs [V15 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V16 tmp1 [V16 ] ( 0, 0 ) int -> zero-ref
;* V17 tmp2 [V17 ] ( 0, 0 ) double -> zero-ref do-not-enreg[F] ld-addr-op "Inlining Arg"
; V18 tmp3 [V18,T09] ( 2, 1 ) ubyte -> r8 "Inline return value spill temp"
;* V19 tmp4 [V19 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
;* V20 tmp5 [V20 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V21 tmp6 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[double]>
;* V22 tmp7 [V22 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V23 tmp8 [V23 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[double]>
;* V24 tmp9 [V24 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V25 tmp10 [V25,T13] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[double]>
+; V25 tmp10 [V25,T15] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[double]>
;* V26 tmp11 [V26 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
; V27 tmp12 [V27,T11] ( 2, 16 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[double]>
;* V28 tmp13 [V28 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V29 tmp14 [V29,T14] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[double]>
+; V29 tmp14 [V29,T16] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[double]>
; V30 tmp15 [V30,T05] ( 5, 5 ) int -> r8 "Single return block return value"
; V31 tmp16 [V31,T08] ( 2, 2 ) long -> rax "Cast away GC"
-; V32 cse0 [V32,T12] ( 7, 7 ) simd32 -> mm1 multi-def "CSE #03: aggressive"
+; V32 cse0 [V32,T12] ( 7, 7 ) simd32 -> mm2 multi-def "CSE #03: aggressive"
; V33 cse1 [V33,T06] ( 6, 3 ) long -> rcx multi-def "CSE #01: aggressive"
+; V34 rat0 [V34,T13] ( 3, 3 ) simd16 -> mm0 "ReplaceWithLclVar is creating a new local variable"
+; V35 rat1 [V35,T14] ( 3, 3 ) simd32 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
; Diff of the JIT code for System.Text.Ascii:IsValidCore[double]. Lines starting with '-' are the
; base codegen, lines starting with '+' are the new codegen, unmarked lines are common to both.
; Register roles (from this listing's local-variable table): rdi = arg0 (byref to the data),
; rsi = arg1 (int count of 8-byte elements); the ubyte result is returned in eax.
G_M10908_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
; Counts of 2 or more take the vector paths starting at IG04.
G_M10908_IG02:
cmp esi, 2
jge SHORT G_M10908_IG04
;; size=5 bbWeight=1 PerfScore 1.25
; Count < 2: count 1 uses the 64-bit scalar check (IG13), count 0 returns 1 (IG16);
; a negative count reaches the throw block IG18.
G_M10908_IG03:
movsxd rcx, esi
test rcx, rcx
jg G_M10908_IG13
test esi, esi
je G_M10908_IG16
jmp G_M10908_IG18
;; size=25 bbWeight=0.50 PerfScore 2.38
; rcx = sign-extended count, rdx = rdi + 8*count (address just past the last element).
; Counts 2..4: combine the first and the last 16 bytes and test them against the RWD00 mask.
; Base: vorpd merges the two loads, then vptest ANDs the result with the mask read from memory.
; New: the last 16 bytes are loaded into xmm1 and vpternlogq with imm8 -88 (0xA8, i.e. (A | B) & C)
; folds the OR and the mask AND into xmm0; vptest xmm0, xmm0 then sets ZF when that result is zero.
; sete produces 1 when no masked bit is set. The group grows from 46 to 50 bytes.
G_M10908_IG04:
movsxd rcx, esi
lea rdx, bword ptr [rdi+8*rcx]
cmp esi, 4
jg SHORT G_M10908_IG05
vmovups xmm0, xmmword ptr [rdi]
- vorpd xmm0, xmm0, xmmword ptr [rdx-0x10]
- vptest xmm0, xmmword ptr [reloc @RWD00]
+ vmovups xmm1, xmmword ptr [rdx-0x10]
+ vpternlogq xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+ vptest xmm0, xmm0
sete r8b
movzx r8, r8b
jmp G_M10908_IG14
- align [3 bytes for IG07]
- ;; size=46 bbWeight=0.50 PerfScore 8.62
+ align [0 bytes for IG07]
+ ;; size=50 bbWeight=0.50 PerfScore 9.12
; Counts 5..8: same check on the first and last 32 bytes with the RWD32 mask.
; Base: vorpd, then vptest against the mask in ymm1. New: the mask is loaded into ymm2 and
; vpternlogq -88 computes (ymm0 | ymm1) & ymm2 before the self-vptest. Size goes from 40 to 47 bytes.
G_M10908_IG05:
cmp esi, 8
jg SHORT G_M10908_IG06
vmovups ymm0, ymmword ptr [rdi]
- vorpd ymm0, ymm0, ymmword ptr [rdx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vmovups ymm1, ymmword ptr [rdx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vpternlogq ymm0, ymm1, ymm2, -88
+ vptest ymm0, ymm0
sete r8b
movzx r8, r8b
jmp G_M10908_IG14
- ;; size=40 bbWeight=0.50 PerfScore 10.75
+ ;; size=47 bbWeight=0.50 PerfScore 12.00
; Counts 9..16 go straight to the tail check in IG12.
; Larger counts: OR the first four 32-byte vectors (vpternlogq imm8 -2 = 0xFE, i.e. A | B | C,
; followed by vorpd) and test against the RWD32 mask; a non-zero result returns 0 via IG08.
; The diff in this group is register allocation only (the mask moves from ymm1 to ymm2) plus
; swapped vptest operands, which leaves ZF unchanged because the AND is commutative.
; After the test: rax = (rdi & 31) >> 3 is the offset of rdi within a 32-byte unit, in elements;
; r8 = 16 - rax is the first loop index and rcx = count - 16 is the loop bound.
G_M10908_IG06:
cmp esi, 16
jle SHORT G_M10908_IG12
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogq ymm1, ymm0, ymmword ptr [rdi+0x40], -2
- vorpd ymm0, ymm1, ymmword ptr [rdi+0x60]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogq ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+ vorpd ymm0, ymm2, ymmword ptr [rdi+0x60]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
jne SHORT G_M10908_IG08
mov rax, rdi
and rax, 31
shr rax, 3
mov r8, rax
neg r8
add r8, 16
add rcx, -16
cmp r8, rcx
jae SHORT G_M10908_IG11
;; size=72 bbWeight=0.50 PerfScore 15.25
; Loop body: OR four 32-byte vectors starting at element index r8 and test them against the
; mask still held in ymm2 (new) / ymm1 (base); continue at IG10 when no masked bit is set,
; otherwise fall through to the failure exit.
G_M10908_IG07:
lea rax, bword ptr [rdi+8*r8]
vmovups ymm0, ymmword ptr [rax]
- vmovups ymm2, ymmword ptr [rax+0x20]
- vpternlogq ymm0, ymm2, ymmword ptr [rax+0x40], -2
+ vmovups ymm1, ymmword ptr [rax+0x20]
+ vpternlogq ymm0, ymm1, ymmword ptr [rax+0x40], -2
vorpd ymm0, ymm0, ymmword ptr [rax+0x60]
- vptest ymm0, ymm1
+ vptest ymm2, ymm0
je SHORT G_M10908_IG10
;; size=33 bbWeight=4 PerfScore 90.00
; Failure exit: return 0.
G_M10908_IG08:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M10908_IG09:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Advance by 16 elements (128 bytes) and loop while r8 < rcx (unsigned compare).
G_M10908_IG10:
add r8, 16
cmp r8, rcx
jb SHORT G_M10908_IG07
;; size=9 bbWeight=4 PerfScore 6.00
; rdi = address of element (count - 16), i.e. 128 bytes before the end address held in rdx.
G_M10908_IG11:
lea rdi, bword ptr [rdi+8*rcx]
;; size=4 bbWeight=0.50 PerfScore 0.25
; Tail check: OR the two vectors at rdi with the last two vectors before rdx, then test against
; the mask. Apart from the ymm1/ymm2 renaming and the vptest operand order, base and new match.
G_M10908_IG12:
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogq ymm1, ymm0, ymmword ptr [rdx-0x40], -2
- vorpd ymm0, ymm1, ymmword ptr [rdx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogq ymm2, ymm0, ymmword ptr [rdx-0x40], -2
+ vorpd ymm0, ymm2, ymmword ptr [rdx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
sete r8b
movzx r8, r8b
jmp SHORT G_M10908_IG14
;; size=45 bbWeight=0.50 PerfScore 14.12
; Count == 1: both memory operands address the same 8 bytes; the OR is tested against a 64-bit immediate.
; NOTE(review): the immediate is printed as 0xD1FFAB1E, presumably a placeholder; confirm the real value.
G_M10908_IG13:
mov r8, qword ptr [rdi]
or r8, qword ptr [rdi+8*rcx-0x08]
mov rax, 0xD1FFAB1E
test r8, rax
sete r8b
movzx r8, r8b
;; size=29 bbWeight=0.50 PerfScore 3.38
; Shared return path: r8b holds the 0/1 result computed by the block that jumped here.
G_M10908_IG14:
movzx rax, r8b
;; size=4 bbWeight=0.50 PerfScore 0.12
G_M10908_IG15:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Count == 0: return 1.
G_M10908_IG16:
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M10908_IG17:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Reached from IG03 for a negative count; calls ThrowNotSupportedException (bbWeight=0).
G_M10908_IG18:
cmp dword ptr [rdi], edi
mov rax, 0xD1FFAB1E ; code for System.ThrowHelper:ThrowNotSupportedException()
call [rax]System.ThrowHelper:ThrowNotSupportedException()
int3
;; size=15 bbWeight=0 PerfScore 0.00
; Read-only data: RWD00 and RWD32 are the 16-byte and 32-byte masks (0xFF80 in every 16-bit lane)
; referenced by the vptest / vpternlogq checks above.
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
RWD16 dd 00000000h, 00000000h, 00000000h, 00000000h
RWD32 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 353, prolog size 4, PerfScore 157.38, instruction count 90, allocated bytes for code 353 (MethodHash=1534d563) for method System.Text.Ascii:IsValidCore[double](byref,int):ubyte (FullOpts)
+; Total bytes of code 364, prolog size 4, PerfScore 159.12, instruction count 92, allocated bytes for code 364 (MethodHash=1534d563) for method System.Text.Ascii:IsValidCore[double](byref,int):ubyte (FullOpts)
11 (3.12 % of base) - System.Text.Ascii:IsValidCore[long](byref,int):ubyte
; Assembly listing for method System.Text.Ascii:IsValidCore[long](byref,int):ubyte (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 2 single block inlinees; 7 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 18, 13 ) byref -> rdi
; V01 arg1 [V01,T03] ( 9, 6 ) int -> rsi single-def
; V02 loc0 [V02,T07] ( 5, 2.50) byref -> rdx single-def
;* V03 loc1 [V03 ] ( 0, 0 ) int -> zero-ref
;* V04 loc2 [V04 ] ( 0, 0 ) long -> zero-ref
;* V05 loc3 [V05,T10] ( 0, 0 ) long -> zero-ref
;* V06 loc4 [V06 ] ( 0, 0 ) long -> zero-ref
; V07 loc5 [V07,T01] ( 6, 17 ) long -> r8
;* V08 loc6 [V08 ] ( 0, 0 ) long -> zero-ref
; V09 loc7 [V09,T04] ( 4, 5.50) long -> rcx
; V10 loc8 [V10,T00] ( 5, 20 ) byref -> rax
;* V11 loc9 [V11 ] ( 0, 0 ) long -> zero-ref
;* V12 loc10 [V12 ] ( 0, 0 ) long -> zero-ref
;* V13 loc11 [V13 ] ( 0, 0 ) long -> zero-ref
;* V14 loc12 [V14 ] ( 0, 0 ) byref -> zero-ref
;# V15 OutArgs [V15 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V16 tmp1 [V16 ] ( 0, 0 ) int -> zero-ref
;* V17 tmp2 [V17 ] ( 0, 0 ) long -> zero-ref ld-addr-op "Inlining Arg"
; V18 tmp3 [V18,T09] ( 2, 1 ) ubyte -> r8 "Inline return value spill temp"
;* V19 tmp4 [V19 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
;* V20 tmp5 [V20 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V21 tmp6 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[long]>
;* V22 tmp7 [V22 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V23 tmp8 [V23 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
;* V24 tmp9 [V24 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V25 tmp10 [V25,T13] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
+; V25 tmp10 [V25,T15] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
;* V26 tmp11 [V26 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
; V27 tmp12 [V27,T11] ( 2, 16 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
;* V28 tmp13 [V28 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V29 tmp14 [V29,T14] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
+; V29 tmp14 [V29,T16] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
; V30 tmp15 [V30,T05] ( 5, 5 ) int -> r8 "Single return block return value"
; V31 tmp16 [V31,T08] ( 2, 2 ) long -> rax "Cast away GC"
-; V32 cse0 [V32,T12] ( 7, 7 ) simd32 -> mm1 multi-def "CSE #03: aggressive"
+; V32 cse0 [V32,T12] ( 7, 7 ) simd32 -> mm2 multi-def "CSE #03: aggressive"
; V33 cse1 [V33,T06] ( 6, 3 ) long -> rcx multi-def "CSE #01: aggressive"
+; V34 rat0 [V34,T13] ( 3, 3 ) simd16 -> mm0 "ReplaceWithLclVar is creating a new local variable"
+; V35 rat1 [V35,T14] ( 3, 3 ) simd32 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
; Diff of the JIT code for System.Text.Ascii:IsValidCore[long]. Lines starting with '-' are the
; base codegen, lines starting with '+' are the new codegen, unmarked lines are common to both.
; Register roles (from this listing's local-variable table): rdi = arg0 (byref to the data),
; rsi = arg1 (int count of 8-byte elements); the ubyte result is returned in eax.
G_M33379_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
; Counts of 2 or more take the vector paths starting at IG04.
G_M33379_IG02:
cmp esi, 2
jge SHORT G_M33379_IG04
;; size=5 bbWeight=1 PerfScore 1.25
; Count < 2: count 1 uses the 64-bit scalar check (IG13), count 0 returns 1 (IG16);
; a negative count reaches the throw block IG18.
G_M33379_IG03:
movsxd rcx, esi
test rcx, rcx
jg G_M33379_IG13
test esi, esi
je G_M33379_IG16
jmp G_M33379_IG18
;; size=25 bbWeight=0.50 PerfScore 2.38
; rcx = sign-extended count, rdx = rdi + 8*count (address just past the last element).
; Counts 2..4: combine the first and the last 16 bytes and test them against the RWD00 mask.
; Base: vpor merges the two loads, then vptest ANDs the result with the mask read from memory.
; New: the last 16 bytes are loaded into xmm1 and vpternlogq with imm8 -88 (0xA8, i.e. (A | B) & C)
; folds the OR and the mask AND into xmm0; vptest xmm0, xmm0 then sets ZF when that result is zero.
; sete produces 1 when no masked bit is set. The group grows from 46 to 50 bytes.
G_M33379_IG04:
movsxd rcx, esi
lea rdx, bword ptr [rdi+8*rcx]
cmp esi, 4
jg SHORT G_M33379_IG05
vmovups xmm0, xmmword ptr [rdi]
- vpor xmm0, xmm0, xmmword ptr [rdx-0x10]
- vptest xmm0, xmmword ptr [reloc @RWD00]
+ vmovups xmm1, xmmword ptr [rdx-0x10]
+ vpternlogq xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+ vptest xmm0, xmm0
sete r8b
movzx r8, r8b
jmp G_M33379_IG14
- align [3 bytes for IG07]
- ;; size=46 bbWeight=0.50 PerfScore 8.62
+ align [0 bytes for IG07]
+ ;; size=50 bbWeight=0.50 PerfScore 9.12
; Counts 5..8: same check on the first and last 32 bytes with the RWD32 mask.
; Base: vpor, then vptest against the mask in ymm1. New: the mask is loaded into ymm2 and
; vpternlogq -88 computes (ymm0 | ymm1) & ymm2 before the self-vptest. Size goes from 40 to 47 bytes.
G_M33379_IG05:
cmp esi, 8
jg SHORT G_M33379_IG06
vmovups ymm0, ymmword ptr [rdi]
- vpor ymm0, ymm0, ymmword ptr [rdx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vmovups ymm1, ymmword ptr [rdx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vpternlogq ymm0, ymm1, ymm2, -88
+ vptest ymm0, ymm0
sete r8b
movzx r8, r8b
jmp G_M33379_IG14
- ;; size=40 bbWeight=0.50 PerfScore 10.75
+ ;; size=47 bbWeight=0.50 PerfScore 12.00
; Counts 9..16 go straight to the tail check in IG12.
; Larger counts: OR the first four 32-byte vectors (vpternlogq imm8 -2 = 0xFE, i.e. A | B | C,
; followed by vpor) and test against the RWD32 mask; a non-zero result returns 0 via IG08.
; The diff in this group is register allocation only (the mask moves from ymm1 to ymm2) plus
; swapped vptest operands, which leaves ZF unchanged because the AND is commutative.
; After the test: rax = (rdi & 31) >> 3 is the offset of rdi within a 32-byte unit, in elements;
; r8 = 16 - rax is the first loop index and rcx = count - 16 is the loop bound.
G_M33379_IG06:
cmp esi, 16
jle SHORT G_M33379_IG12
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogq ymm1, ymm0, ymmword ptr [rdi+0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rdi+0x60]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogq ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rdi+0x60]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
jne SHORT G_M33379_IG08
mov rax, rdi
and rax, 31
shr rax, 3
mov r8, rax
neg r8
add r8, 16
add rcx, -16
cmp r8, rcx
jae SHORT G_M33379_IG11
;; size=72 bbWeight=0.50 PerfScore 15.25
; Loop body: OR four 32-byte vectors starting at element index r8 and test them against the
; mask still held in ymm2 (new) / ymm1 (base); continue at IG10 when no masked bit is set,
; otherwise fall through to the failure exit.
G_M33379_IG07:
lea rax, bword ptr [rdi+8*r8]
vmovups ymm0, ymmword ptr [rax]
- vmovups ymm2, ymmword ptr [rax+0x20]
- vpternlogq ymm0, ymm2, ymmword ptr [rax+0x40], -2
+ vmovups ymm1, ymmword ptr [rax+0x20]
+ vpternlogq ymm0, ymm1, ymmword ptr [rax+0x40], -2
vpor ymm0, ymm0, ymmword ptr [rax+0x60]
- vptest ymm0, ymm1
+ vptest ymm2, ymm0
je SHORT G_M33379_IG10
;; size=33 bbWeight=4 PerfScore 90.00
; Failure exit: return 0.
G_M33379_IG08:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M33379_IG09:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Advance by 16 elements (128 bytes) and loop while r8 < rcx (unsigned compare).
G_M33379_IG10:
add r8, 16
cmp r8, rcx
jb SHORT G_M33379_IG07
;; size=9 bbWeight=4 PerfScore 6.00
; rdi = address of element (count - 16), i.e. 128 bytes before the end address held in rdx.
G_M33379_IG11:
lea rdi, bword ptr [rdi+8*rcx]
;; size=4 bbWeight=0.50 PerfScore 0.25
; Tail check: OR the two vectors at rdi with the last two vectors before rdx, then test against
; the mask. Apart from the ymm1/ymm2 renaming and the vptest operand order, base and new match.
G_M33379_IG12:
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogq ymm1, ymm0, ymmword ptr [rdx-0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rdx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogq ymm2, ymm0, ymmword ptr [rdx-0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rdx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
sete r8b
movzx r8, r8b
jmp SHORT G_M33379_IG14
;; size=45 bbWeight=0.50 PerfScore 14.12
; Count == 1: both memory operands address the same 8 bytes; the OR is tested against a 64-bit immediate.
; NOTE(review): the immediate is printed as 0xD1FFAB1E, presumably a placeholder; confirm the real value.
G_M33379_IG13:
mov r8, qword ptr [rdi]
or r8, qword ptr [rdi+8*rcx-0x08]
mov rax, 0xD1FFAB1E
test r8, rax
sete r8b
movzx r8, r8b
;; size=29 bbWeight=0.50 PerfScore 3.38
; Shared return path: r8b holds the 0/1 result computed by the block that jumped here.
G_M33379_IG14:
movzx rax, r8b
;; size=4 bbWeight=0.50 PerfScore 0.12
G_M33379_IG15:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Count == 0: return 1.
G_M33379_IG16:
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M33379_IG17:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Reached from IG03 for a negative count; calls ThrowNotSupportedException (bbWeight=0).
G_M33379_IG18:
cmp dword ptr [rdi], edi
mov rax, 0xD1FFAB1E ; code for System.ThrowHelper:ThrowNotSupportedException()
call [rax]System.ThrowHelper:ThrowNotSupportedException()
int3
;; size=15 bbWeight=0 PerfScore 0.00
; Read-only data: RWD00 and RWD32 are the 16-byte and 32-byte masks (0xFF80 in every 16-bit lane)
; referenced by the vptest / vpternlogq checks above.
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
RWD16 dd 00000000h, 00000000h, 00000000h, 00000000h
RWD32 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 353, prolog size 4, PerfScore 157.38, instruction count 90, allocated bytes for code 353 (MethodHash=a1267d9c) for method System.Text.Ascii:IsValidCore[long](byref,int):ubyte (FullOpts)
+; Total bytes of code 364, prolog size 4, PerfScore 159.12, instruction count 92, allocated bytes for code 364 (MethodHash=a1267d9c) for method System.Text.Ascii:IsValidCore[long](byref,int):ubyte (FullOpts)
7 (9.86 % of base) - System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this
; Assembly listing for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
-; 0 inlinees with PGO data; 0 single block inlinees; 1 inlinees without PGO data
+; 0 inlinees with PGO data; 1 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) byref -> zero-ref this single-def
; V01 arg1 [V01,T00] ( 4, 4 ) long -> rsi single-def
;* V02 arg2 [V02 ] ( 0, 0 ) long -> zero-ref single-def
;* V03 arg3 [V03 ] ( 0, 0 ) int -> zero-ref single-def
; V04 arg4 [V04,T01] ( 4, 3 ) byref -> r8 single-def
-; V05 loc0 [V05,T02] ( 3, 2.50) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ushort]>
-; V06 loc1 [V06,T03] ( 3, 2.50) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V05 loc0 [V05,T03] ( 3, 2.50) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V06 loc1 [V06,T04] ( 3, 2.50) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[ushort]>
;# V07 OutArgs [V07 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V08 tmp1 [V08 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V09 tmp2 [V09 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V10 tmp3 [V10 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V08 tmp1 [V08 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V09 rat0 [V09,T02] ( 3, 6 ) simd32 -> mm2 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
; Diff of the JIT code for Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256. Lines starting
; with '-' are the base codegen, lines starting with '+' are the new codegen.
; Register roles (from this listing's local-variable table): rsi = arg1 (source address),
; r8 = arg4 (byref that receives the 32-byte result); the ubyte result is returned in eax.
G_M46395_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
; Load two 32-byte vectors and check whether any bit selected by the RWD00 mask is set in either.
; Base: vpor into ymm2, then vptest ANDs ymm2 with the mask read from memory.
; New: ymm0 is copied to ymm2 and vpternlogd with imm8 -88 (0xA8, i.e. (A | B) & C) computes
; (ymm0 | ymm1) & mask in ymm2; vptest ymm2, ymm2 then sets ZF when that result is zero.
; ymm0 and ymm1 stay intact for IG05. The group grows from 24 to 31 bytes.
G_M46395_IG02:
vmovups ymm0, ymmword ptr [rsi]
vmovups ymm1, ymmword ptr [rsi+0x20]
- vpor ymm2, ymm0, ymm1
- vptest ymm2, ymmword ptr [reloc @RWD00]
+ vmovaps ymm2, ymm0
+ vpternlogd ymm2, ymm1, ymmword ptr [reloc @RWD00], -88
+ vptest ymm2, ymm2
je SHORT G_M46395_IG05
- ;; size=24 bbWeight=1 PerfScore 18.33
+ ;; size=31 bbWeight=1 PerfScore 18.25
; A masked bit was set: store an all-zero vector to [r8] and return 0.
G_M46395_IG03:
vxorps ymm0, ymm0, ymm0
vmovups ymmword ptr [r8], ymm0
xor eax, eax
;; size=11 bbWeight=0.50 PerfScore 1.29
G_M46395_IG04:
vzeroupper
ret
;; size=4 bbWeight=0.50 PerfScore 1.00
; No masked bit set: narrow both vectors from 16-bit to 8-bit elements (vpmovwb), place the
; second narrowed half in the upper 128 bits of ymm0, store the result to [r8] and return 1.
G_M46395_IG05:
vpmovwb ymm0, ymm0
vpmovwb ymm1, ymm1
vinserti128 ymm0, ymm0, xmm1, 1
vmovups ymmword ptr [r8], ymm0
mov eax, 1
;; size=28 bbWeight=0.50 PerfScore 5.12
G_M46395_IG06:
vzeroupper
ret
;; size=4 bbWeight=0.50 PerfScore 1.00
; Read-only data: 32-byte mask with 0xFF80 in every 16-bit lane, used by the check in IG02.
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 71, prolog size 0, PerfScore 26.75, instruction count 17, allocated bytes for code 71 (MethodHash=fb454ac4) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this (FullOpts)
+; Total bytes of code 78, prolog size 0, PerfScore 26.67, instruction count 18, allocated bytes for code 78 (MethodHash=fb454ac4) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this (FullOpts)
7 (13.73 % of base) - System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this
; Assembly listing for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this (FullOpts)
; [review] Diff summary: the only instruction change is in IG02, where
; [review] "vpor + vptest against RWD00" becomes "vmovaps + vpternlogd + vptest reg,reg".
; [review] Totals move 51 -> 58 bytes (+7), 13 -> 14 instructions, PerfScore 18.25 -> 18.17.
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 0 single block inlinees; 2 inlinees without PGO data
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) byref -> zero-ref this single-def
; V01 arg1 [V01,T00] ( 4, 4 ) long -> rsi single-def
;* V02 arg2 [V02 ] ( 0, 0 ) long -> zero-ref single-def
;* V03 arg3 [V03 ] ( 0, 0 ) int -> zero-ref single-def
; V04 arg4 [V04,T01] ( 4, 3 ) byref -> r8 single-def
-; V05 loc0 [V05,T02] ( 3, 2.50) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ushort]>
-; V06 loc1 [V06,T03] ( 3, 2.50) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V05 loc0 [V05,T03] ( 3, 2.50) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V06 loc1 [V06,T04] ( 3, 2.50) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[ushort]>
;# V07 OutArgs [V07 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V08 tmp1 [V08 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V09 tmp2 [V09 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V10 tmp3 [V10 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V11 tmp4 [V11 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V09 tmp2 [V09 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V10 tmp3 [V10 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+; V11 rat0 [V11,T02] ( 3, 6 ) simd16 -> mm2 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M11006_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M11006_IG02:
vmovups xmm0, xmmword ptr [rsi] ; first 16 bytes at [rsi]
vmovups xmm1, xmmword ptr [rsi+0x10] ; next 16 bytes at [rsi+16]
; [review] Base: xmm2 = xmm0 | xmm1, then vptest sets ZF when (xmm2 & RWD00) == 0.
; [review] Diff: imm8 -88 is 0xA8, the ternary-logic table for (dst | src1) & src2, so
; [review] xmm2 = (xmm0 | xmm1) & RWD00 and "vptest xmm2, xmm2" sets ZF when that is zero.
; [review] Both forms give the same ZF, which is the only flag the je below reads.
- vpor xmm2, xmm0, xmm1
- vptest xmm2, xmmword ptr [reloc @RWD00]
+ vmovaps xmm2, xmm0
+ vpternlogd xmm2, xmm1, xmmword ptr [reloc @RWD00], -88
+ vptest xmm2, xmm2
je SHORT G_M11006_IG05 ; no 16-bit lane has a bit inside the 0xFF80 mask
- ;; size=24 bbWeight=1 PerfScore 14.33
+ ;; size=31 bbWeight=1 PerfScore 14.25
G_M11006_IG03:
; [review] Mask test failed: store an all-zero vector through r8 and return 0.
vxorps xmm0, xmm0, xmm0
vmovups xmmword ptr [r8], xmm0
xor eax, eax
;; size=11 bbWeight=0.50 PerfScore 1.29
G_M11006_IG04:
ret
;; size=1 bbWeight=0.50 PerfScore 0.50
G_M11006_IG05:
; [review] Mask test passed: narrow the 16 words in xmm0:xmm1 to 16 bytes,
; [review] store them through r8 and return 1.
vpackuswb xmm0, xmm0, xmm1
vmovups xmmword ptr [r8], xmm0
mov eax, 1
;; size=14 bbWeight=0.50 PerfScore 1.62
G_M11006_IG06:
ret
;; size=1 bbWeight=0.50 PerfScore 0.50
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 51, prolog size 0, PerfScore 18.25, instruction count 13, allocated bytes for code 51 (MethodHash=0badd501) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this (FullOpts)
+; Total bytes of code 58, prolog size 0, PerfScore 18.17, instruction count 14, allocated bytes for code 58 (MethodHash=0badd501) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this (FullOpts)
7 (2.06 % of base) - System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Intrinsified(ulong,ulong):ulong
; Assembly listing for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Intrinsified(ulong,ulong):ulong (FullOpts)
; [review] Diff summary: the loop block IG06 replaces "vpor + vptest" with
; [review] "vmovaps + vpternlogd + vptest reg,reg" (33 -> 40 bytes, block PerfScore 55.33 -> 57.00).
; [review] Four single-vector vptest instructions (IG05, IG07, IG09, IG16) only swap operand
; [review] order; ZF is computed from the AND of both operands, so the jne that follows each
; [review] one sees the same result. Totals: 340 -> 347 bytes, 94 -> 95 instructions,
; [review] PerfScore 99.21 -> 100.88.
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 2 single block inlinees; 6 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 32, 34.50) long -> rdi
; V01 arg1 [V01,T01] ( 17, 10 ) long -> rsi
;* V02 loc0 [V02,T08] ( 0, 0 ) int -> zero-ref
-; V03 loc1 [V03,T09] ( 11, 12.50) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ushort]>
-; V04 loc2 [V04,T10] ( 3, 8.50) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V03 loc1 [V03,T10] ( 11, 12.50) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V04 loc2 [V04,T11] ( 3, 8.50) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V05 loc3 [V05 ] ( 0, 0 ) int -> zero-ref
; V06 loc4 [V06,T03] ( 8, 4 ) long -> rax
;* V07 loc5 [V07 ] ( 0, 0 ) long -> zero-ref
; V08 loc6 [V08,T04] ( 3, 1.50) int -> rcx
; V09 loc7 [V09,T02] ( 2, 4.50) long -> rcx
;* V10 loc8 [V10 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V11 loc9 [V11 ] ( 0, 0 ) ref -> zero-ref class-hnd <System.Object>
;* V12 loc10 [V12 ] ( 0, 0 ) ref -> zero-ref class-hnd <System.Object>
; V13 loc11 [V13,T06] ( 2, 1 ) long -> rdi
;# V14 OutArgs [V14 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V15 tmp1 [V15 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V16 tmp2 [V16 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V17 tmp3 [V17 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V18 tmp4 [V18 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V19 tmp5 [V19 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V16 tmp2 [V16 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V17 tmp3 [V17 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V18 tmp4 [V18 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V19 tmp5 [V19 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V20 tmp6 [V20 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V21 tmp7 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V22 tmp8 [V22 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V23 tmp9 [V23 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V24 tmp10 [V24 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V25 tmp11 [V25 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V26 tmp12 [V26,T07] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V27 cse0 [V27,T05] ( 3, 1.50) long -> rcx "CSE #02: moderate"
-; V28 cse1 [V28,T11] ( 6, 6.50) simd16 -> mm1 "CSE #01: aggressive"
+;* V21 tmp7 [V21,T07] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V22 cse0 [V22,T05] ( 3, 1.50) long -> rcx "CSE #02: moderate"
+; V23 cse1 [V23,T12] ( 6, 6.50) simd16 -> mm1 "CSE #01: aggressive"
+; V24 rat0 [V24,T09] ( 3, 24 ) simd16 -> mm3 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M29265_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M29265_IG02:
test rsi, rsi
jne SHORT G_M29265_IG05 ; rsi != 0: do the real work; otherwise fall through and return 0
;; size=5 bbWeight=1 PerfScore 1.25
G_M29265_IG03:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M29265_IG04:
pop rbp
ret
;; size=2 bbWeight=0.50 PerfScore 0.75
G_M29265_IG05:
mov rax, rdi ; rax keeps the starting rdi; IG18 subtracts it from the final rdi
cmp rsi, 8
jb G_M29265_IG10 ; rsi < 8 (unsigned): take the non-vector path
vmovups xmm0, xmmword ptr [rax]
vmovups xmm1, xmmword ptr [reloc @RWD00] ; xmm1 = 0xFF80 in every 16-bit lane, reused by every vptest below
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
jne G_M29265_IG17
add rsi, rsi ; rsi *= 2 (NOTE(review): presumably element count -> byte count; confirm against the C# source)
cmp rsi, 32
jb SHORT G_M29265_IG08
lea rdi, [rax+0x10]
and rdi, -16 ; rdi = first 16-byte-aligned address above rax
mov rcx, rdi
sub rcx, rax
sub rsi, rcx ; rsi -= bytes skipped by the alignment step
cmp rsi, 32
jb SHORT G_M29265_IG07
lea rcx, [rdi+rsi-0x20] ; loop bound used by IG06: rdi + rsi - 32
align [0 bytes for IG06]
;; size=73 bbWeight=0.50 PerfScore 8.88
G_M29265_IG06:
; [review] Loop body: examines 32 bytes (two xmm loads) per iteration.
vmovups xmm0, xmmword ptr [rdi]
vmovups xmm2, xmmword ptr [rdi+0x10]
; [review] Base: xmm3 = xmm0 | xmm2, then ZF = ((xmm3 & xmm1) == 0).
; [review] Diff: imm8 -88 is 0xA8 = (dst | src1) & src2, so xmm3 = (xmm0 | xmm2) & xmm1 and
; [review] "vptest xmm3, xmm3" yields the same ZF for the jne below.
- vpor xmm3, xmm0, xmm2
- vptest xmm3, xmm1
+ vmovaps xmm3, xmm0
+ vpternlogd xmm3, xmm2, xmm1, -88
+ vptest xmm3, xmm3
jne G_M29265_IG16 ; at least one of the two vectors has a masked bit set
add rdi, 32
cmp rdi, rcx
jbe SHORT G_M29265_IG06 ; continue while rdi <= rcx (unsigned)
- ;; size=33 bbWeight=4 PerfScore 55.33
+ ;; size=40 bbWeight=4 PerfScore 57.00
G_M29265_IG07:
test sil, 16
je SHORT G_M29265_IG09 ; bit 4 of rsi clear: skip this single 16-byte check
vmovups xmm0, xmmword ptr [rdi]
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
jne G_M29265_IG17
;; size=21 bbWeight=0.50 PerfScore 4.62
G_M29265_IG08:
add rdi, 16
;; size=4 bbWeight=0.50 PerfScore 0.12
G_M29265_IG09:
movzx rcx, sil
test cl, 15
je G_M29265_IG18 ; low 4 bits of rsi are zero: nothing left to check
and rsi, 15
lea rdi, [rdi+rsi-0x10] ; step back so the final 16-byte load ends at rdi + (rsi & 15)
vmovups xmm0, xmmword ptr [rdi]
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
jne SHORT G_M29265_IG17
add rdi, 16
jmp G_M29265_IG18
;; size=42 bbWeight=0.50 PerfScore 6.50
G_M29265_IG10:
; [review] Non-vector path (rsi < 8): bits 2, 1 and 0 of rsi select an 8-, 4- and 2-byte check.
test sil, 4
je SHORT G_M29265_IG12
mov rdi, qword ptr [rax]
mov rcx, 0xD1FFAB1E
and rcx, rdi
je SHORT G_M29265_IG11
xor esi, esi
tzcnt rsi, rcx
sar esi, 3
movsxd rdi, esi
and rdi, -2
add rdi, rax
jmp SHORT G_M29265_IG18
;; size=46 bbWeight=0.50 PerfScore 5.12
G_M29265_IG11:
lea rdi, [rax+0x08]
;; size=4 bbWeight=0.50 PerfScore 0.25
G_M29265_IG12:
test sil, 2
je SHORT G_M29265_IG13
mov ecx, dword ptr [rdi]
test ecx, 0xD1FFAB1E
jne SHORT G_M29265_IG15
add rdi, 4
;; size=20 bbWeight=0.50 PerfScore 2.38
G_M29265_IG13:
test sil, 1
je SHORT G_M29265_IG18
cmp word ptr [rdi], 127
ja SHORT G_M29265_IG18 ; 16-bit value above 127: do not advance past it
;; size=12 bbWeight=0.50 PerfScore 2.62
G_M29265_IG14:
add rdi, 2
jmp SHORT G_M29265_IG18
;; size=6 bbWeight=0.50 PerfScore 1.12
G_M29265_IG15:
test ecx, 0xFF80
je SHORT G_M29265_IG14 ; low 16 bits of ecx are within 0..127: advance by one 2-byte element
jmp SHORT G_M29265_IG18
;; size=10 bbWeight=0.50 PerfScore 1.62
G_M29265_IG16:
; [review] Reached from IG06 when the combined test fired: test the first vector alone.
; [review] If it is clean, advance rdi by 16 and move the second vector into xmm0 for IG17.
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
jne SHORT G_M29265_IG17
add rdi, 16
vmovaps xmm0, xmm2
;; size=15 bbWeight=0.50 PerfScore 2.25
G_M29265_IG17:
; [review] Locate the first offending lane of xmm0 and add its byte offset to rdi.
vpaddusw xmm0, xmm0, xmmword ptr [reloc @RWD16]
vpmovmskb ecx, xmm0
and ecx, 0xAAAA ; keep only the mask bits that came from the odd-numbered bytes
tzcnt ecx, ecx
lea rdi, [rdi+rcx-0x01] ; rcx indexes an odd byte, so -1 gives the even byte offset
;; size=27 bbWeight=0.50 PerfScore 3.62
G_M29265_IG18:
; [review] Result: rax = (rdi - original rdi) >> 1.
mov rcx, rdi
sub rcx, rax
mov rax, rcx
shr rax, 1
;; size=12 bbWeight=0.50 PerfScore 0.62
G_M29265_IG19:
pop rbp
ret
;; size=2 bbWeight=0.50 PerfScore 0.75
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
RWD16 dq 7F807F807F807F80h, 7F807F807F807F80h
-; Total bytes of code 340, prolog size 4, PerfScore 99.21, instruction count 94, allocated bytes for code 340 (MethodHash=6c288dae) for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Intrinsified(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 347, prolog size 4, PerfScore 100.88, instruction count 95, allocated bytes for code 347 (MethodHash=6c288dae) for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Intrinsified(ulong,ulong):ulong (FullOpts)
7 (3.85 % of base) - System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_256(ulong,ulong,ulong):ulong
; Assembly listing for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_256(ulong,ulong,ulong):ulong (FullOpts)
; [review] Diff summary: the loop block IG07 replaces "vpor + vptest" with
; [review] "vmovaps + vpternlogd + vptest reg,reg" (46 -> 53 bytes, block PerfScore 91.33 -> 93.00).
; [review] Three single-vector vptest instructions (IG02, IG05, IG09) only swap operand order;
; [review] ZF comes from the AND of both operands, so the je/jne after each is unaffected.
; [review] Totals: 182 -> 189 bytes, 50 -> 51 instructions, PerfScore 128.83 -> 130.50.
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 4 single block inlinees; 8 inlinees without PGO data
+; 0 inlinees with PGO data; 8 single block inlinees; 4 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T04] ( 3, 3 ) long -> rdi single-def
; V01 arg1 [V01,T03] ( 5, 3.50) long -> rsi single-def
; V02 arg2 [V02,T05] ( 3, 2.50) long -> rdx single-def
; V03 loc0 [V03,T01] ( 5, 10.50) byref -> rdi single-def
-; V04 loc1 [V04,T07] ( 14, 18.50) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V04 loc1 [V04,T08] ( 14, 18.50) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ushort]>
; V05 loc2 [V05,T02] ( 5, 6 ) byref -> rcx single-def
; V06 loc3 [V06,T00] ( 12, 27 ) long -> rax
; V07 loc4 [V07,T06] ( 2, 4.50) long -> rdx
-; V08 loc5 [V08,T09] ( 3, 12 ) simd32 -> mm2 <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V08 loc5 [V08,T10] ( 3, 12 ) simd32 -> mm2 <System.Runtime.Intrinsics.Vector256`1[ushort]>
;# V09 OutArgs [V09 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V10 tmp1 [V10 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V11 tmp2 [V11 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
-; V12 tmp3 [V12,T08] ( 2, 16 ) simd32 -> mm0 "Spilling op1 side effects for HWIntrinsic"
+; V12 tmp3 [V12,T09] ( 2, 16 ) simd32 -> mm0 "Spilling op1 side effects for HWIntrinsic"
;* V13 tmp4 [V13 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
-;* V14 tmp5 [V14 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V15 tmp6 [V15 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V14 tmp5 [V14 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;* V15 tmp6 [V15 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V16 tmp7 [V16 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V17 tmp8 [V17 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V18 tmp9 [V18 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V19 tmp10 [V19 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V18 tmp9 [V18 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V19 tmp10 [V19 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V20 tmp11 [V20 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V21 tmp12 [V21 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V22 tmp13 [V22 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V23 tmp14 [V23 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V24 tmp15 [V24 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V25 tmp16 [V25 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V26 tmp17 [V26 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V27 tmp18 [V27 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V28 tmp19 [V28 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V29 tmp20 [V29 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-; V30 cse0 [V30,T10] ( 5, 7 ) simd32 -> mm1 "CSE #01: moderate"
+; V22 cse0 [V22,T11] ( 5, 7 ) simd32 -> mm1 "CSE #01: moderate"
+; V23 rat0 [V23,T07] ( 3, 24 ) simd32 -> mm3 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M60588_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M60588_IG02:
vmovups ymm0, ymmword ptr [rdi] ; first 32 bytes at [rdi]
vmovups ymm1, ymmword ptr [reloc @RWD00] ; ymm1 = 0xFF80 in every 16-bit lane, reused by every vptest below
- vptest ymm0, ymm1
+ vptest ymm1, ymm0
je SHORT G_M60588_IG05 ; no lane has a masked bit: go narrow and store
;; size=19 bbWeight=1 PerfScore 15.00
G_M60588_IG03:
xor eax, eax ; first vector failed the mask test: return 0
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M60588_IG04:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M60588_IG05:
mov rcx, rsi ; rcx = store base (copy of rsi)
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, -40 ; imm8 -40 is 0xD8: qword order 0,2,1,3
vmovups xmmword ptr [rcx], xmm0 ; store 16 narrowed bytes
mov eax, 16
test sil, 16
jne SHORT G_M60588_IG06 ; bit 4 of rsi set: skip the second 16-byte step
vmovups ymm0, ymmword ptr [rdi+0x20]
- vptest ymm0, ymm1
+ vptest ymm1, ymm0
jne SHORT G_M60588_IG08 ; masked bit found: return with rax = 16
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, -40
vmovups xmmword ptr [rcx+0x10], xmm0
;; size=55 bbWeight=0.50 PerfScore 11.38
G_M60588_IG06:
; [review] rax = 32 - (rsi & 31); rdx -= 32 becomes the loop bound for IG07.
and rsi, 31
mov rax, rsi
neg rax
add rax, 32
add rdx, -32
align [0 bytes for IG07]
;; size=18 bbWeight=0.50 PerfScore 0.62
G_M60588_IG07:
; [review] Loop body: reads 64 bytes at [rdi+2*rax] and writes 32 bytes at [rcx+rax].
vmovups ymm0, ymmword ptr [rdi+2*rax]
vmovups ymm2, ymmword ptr [rdi+2*rax+0x20]
; [review] Base: ymm3 = ymm0 | ymm2, then ZF = ((ymm3 & ymm1) == 0).
; [review] Diff: imm8 -88 is 0xA8 = (dst | src1) & src2, so ymm3 = (ymm0 | ymm2) & ymm1 and
; [review] "vptest ymm3, ymm3" yields the same ZF for the jne below.
- vpor ymm3, ymm0, ymm2
- vptest ymm3, ymm1
+ vmovaps ymm3, ymm0
+ vpternlogd ymm3, ymm2, ymm1, -88
+ vptest ymm3, ymm3
jne SHORT G_M60588_IG09
vpackuswb ymm0, ymm0, ymm2
vpermq ymm0, ymm0, -40
vmovups ymmword ptr [rcx+rax], ymm0
add rax, 32
cmp rax, rdx
jbe SHORT G_M60588_IG07 ; continue while rax <= rdx (unsigned)
- ;; size=46 bbWeight=4 PerfScore 91.33
+ ;; size=53 bbWeight=4 PerfScore 93.00
G_M60588_IG08:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M60588_IG09:
; [review] Combined test fired: if the first vector alone is clean, narrow and store
; [review] its 16 bytes and advance rax by 16 before returning.
- vptest ymm0, ymm1
+ vptest ymm1, ymm0
jne SHORT G_M60588_IG08
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, -40
vmovups xmmword ptr [rcx+rax], xmm0
add rax, 16
jmp SHORT G_M60588_IG08
;; size=28 bbWeight=0.50 PerfScore 6.62
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 182, prolog size 4, PerfScore 128.83, instruction count 50, allocated bytes for code 182 (MethodHash=910c1353) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_256(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 189, prolog size 4, PerfScore 130.50, instruction count 51, allocated bytes for code 189 (MethodHash=910c1353) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_256(ulong,ulong,ulong):ulong (FullOpts)
7 (4.55 % of base) - System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified(ulong,ulong,ulong):ulong
; Assembly listing for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified(ulong,ulong,ulong):ulong (FullOpts)
; [review] Diff summary: the loop block IG07 replaces "vpor + vptest" with
; [review] "vmovaps + vpternlogd + vptest reg,reg" (43 -> 50 bytes, block PerfScore 69.33 -> 71.00).
; [review] Three single-vector vptest instructions (IG02, IG05, IG09) only swap operand order;
; [review] ZF comes from the AND of both operands, so the je/jne after each is unaffected.
; [review] Totals: 154 -> 161 bytes, 45 -> 46 instructions, PerfScore 96.33 -> 98.00.
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 3 single block inlinees; 8 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T05] ( 3, 3 ) long -> rdi single-def
; V01 arg1 [V01,T04] ( 5, 3.50) long -> rsi single-def
; V02 arg2 [V02,T06] ( 3, 2.50) long -> rdx single-def
;* V03 loc0 [V03,T08] ( 0, 0 ) int -> zero-ref
;* V04 loc1 [V04 ] ( 0, 0 ) long -> zero-ref
; V05 loc2 [V05,T01] ( 5, 10.50) byref -> rdi single-def
-; V06 loc3 [V06,T09] ( 14, 18.50) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V06 loc3 [V06,T10] ( 14, 18.50) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V07 loc4 [V07,T03] ( 5, 6 ) byref -> rcx single-def
; V08 loc5 [V08,T00] ( 11, 26.50) long -> rax
; V09 loc6 [V09,T07] ( 2, 4.50) long -> rdx
-; V10 loc7 [V10,T11] ( 3, 12 ) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V10 loc7 [V10,T12] ( 3, 12 ) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[ushort]>
;# V11 OutArgs [V11 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V12 tmp1 [V12 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
;* V13 tmp2 [V13 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
-; V14 tmp3 [V14,T10] ( 2, 16 ) simd16 -> mm0 "Spilling op1 side effects for HWIntrinsic"
+; V14 tmp3 [V14,T11] ( 2, 16 ) simd16 -> mm0 "Spilling op1 side effects for HWIntrinsic"
;* V15 tmp4 [V15 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
;* V16 tmp5 [V16 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V17 tmp6 [V17 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V18 tmp7 [V18 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V19 tmp8 [V19 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V20 tmp9 [V20 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V21 tmp10 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V17 tmp6 [V17 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V18 tmp7 [V18 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V19 tmp8 [V19 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V20 tmp9 [V20 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V21 tmp10 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V22 tmp11 [V22 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V23 tmp12 [V23 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V24 tmp13 [V24 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V25 tmp14 [V25 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V26 tmp15 [V26 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V27 tmp16 [V27 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-; V28 cse0 [V28,T02] ( 3, 8.50) long -> rsi "CSE #02: aggressive"
-; V29 cse1 [V29,T12] ( 5, 7 ) simd16 -> mm1 "CSE #01: aggressive"
+;* V23 tmp12 [V23 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+; V24 cse0 [V24,T02] ( 3, 8.50) long -> rsi "CSE #02: aggressive"
+; V25 cse1 [V25,T13] ( 5, 7 ) simd16 -> mm1 "CSE #01: aggressive"
+; V26 rat0 [V26,T09] ( 3, 24 ) simd16 -> mm3 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M11650_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M11650_IG02:
vmovups xmm0, xmmword ptr [rdi] ; first 16 bytes at [rdi]
vmovups xmm1, xmmword ptr [reloc @RWD00] ; xmm1 = 0xFF80 in every 16-bit lane, reused by every vptest below
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
je SHORT G_M11650_IG05 ; no lane has a masked bit: go narrow and store
;; size=19 bbWeight=1 PerfScore 11.00
G_M11650_IG03:
xor eax, eax ; first vector failed the mask test: return 0
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M11650_IG04:
pop rbp
ret
;; size=2 bbWeight=0.50 PerfScore 0.75
G_M11650_IG05:
mov rcx, rsi ; rcx = store base (copy of rsi)
vpackuswb xmm0, xmm0, xmm0
vmovsd qword ptr [rcx], xmm0 ; store the low 8 narrowed bytes
mov eax, 8
test sil, 8
jne SHORT G_M11650_IG06 ; bit 3 of rsi set: skip the second 8-byte step
vmovups xmm0, xmmword ptr [rdi+0x10]
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
jne SHORT G_M11650_IG08 ; masked bit found: return with rax = 8
vpackuswb xmm0, xmm0, xmm0
vmovsd qword ptr [rcx+0x08], xmm0
;; size=43 bbWeight=0.50 PerfScore 7.88
G_M11650_IG06:
; [review] rax = 16 - (rsi & 15); rdx -= 16 becomes the loop bound for IG07.
and rsi, 15
mov rax, rsi
neg rax
add rax, 16
add rdx, -16
align [0 bytes for IG07]
;; size=18 bbWeight=0.50 PerfScore 0.62
G_M11650_IG07:
; [review] Loop body: reads 32 bytes starting at [rdi+2*rax] and writes 16 bytes at [rcx+rax].
vmovups xmm0, xmmword ptr [rdi+2*rax]
lea rsi, [rax+0x08] ; rsi = rax + 8; also used by IG09 as the advanced rax
vmovups xmm2, xmmword ptr [rdi+2*rsi]
; [review] Base: xmm3 = xmm0 | xmm2, then ZF = ((xmm3 & xmm1) == 0).
; [review] Diff: imm8 -88 is 0xA8 = (dst | src1) & src2, so xmm3 = (xmm0 | xmm2) & xmm1 and
; [review] "vptest xmm3, xmm3" yields the same ZF for the jne below.
- vpor xmm3, xmm0, xmm2
- vptest xmm3, xmm1
+ vmovaps xmm3, xmm0
+ vpternlogd xmm3, xmm2, xmm1, -88
+ vptest xmm3, xmm3
jne SHORT G_M11650_IG09
vpackuswb xmm0, xmm0, xmm2
vmovups xmmword ptr [rcx+rax], xmm0
add rax, 16
cmp rax, rdx
jbe SHORT G_M11650_IG07 ; continue while rax <= rdx (unsigned)
- ;; size=43 bbWeight=4 PerfScore 69.33
+ ;; size=50 bbWeight=4 PerfScore 71.00
G_M11650_IG08:
pop rbp
ret
;; size=2 bbWeight=0.50 PerfScore 0.75
G_M11650_IG09:
; [review] Combined test fired: if the first vector alone is clean, narrow and store
; [review] its 8 bytes and return rax + 8 (held in rsi).
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
jne SHORT G_M11650_IG08
vpackuswb xmm0, xmm0, xmm0
vmovsd qword ptr [rcx+rax], xmm0
mov rax, rsi
jmp SHORT G_M11650_IG08
;; size=21 bbWeight=0.50 PerfScore 4.62
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 154, prolog size 4, PerfScore 96.33, instruction count 45, allocated bytes for code 154 (MethodHash=8c3ed27d) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 161, prolog size 4, PerfScore 98.00, instruction count 46, allocated bytes for code 161 (MethodHash=8c3ed27d) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified(ulong,ulong,ulong):ulong (FullOpts)
7 (1.77 % of base) - System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong
; Assembly listing for method System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong (FullOpts)
; [review] Diff summary: the only instruction change is in the loop block IG06, where
; [review] "vpor + vptest" becomes "vmovaps + vpternlogd + vptest reg,reg"
; [review] (33 -> 40 bytes, block PerfScore 55.33 -> 57.00). The single-vector vptest
; [review] instructions in IG07, IG09 and IG16 are unchanged context lines.
; [review] Totals: 395 -> 402 bytes, 111 -> 112 instructions, PerfScore 106.50 -> 108.17.
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 2 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 32, 34.50) long -> rbx
; V01 arg1 [V01,T01] ( 17, 10 ) long -> rsi
;* V02 loc0 [V02,T08] ( 0, 0 ) int -> zero-ref
;* V03 loc1 [V03,T09] ( 0, 0 ) int -> zero-ref
-; V04 loc2 [V04,T10] ( 9, 11.50) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[ushort]>
-; V05 loc3 [V05,T11] ( 3, 8.50) simd16 -> mm3 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V04 loc2 [V04,T11] ( 9, 11.50) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V05 loc3 [V05,T12] ( 3, 8.50) simd16 -> mm3 <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V06 loc4 [V06,T04] ( 4, 2 ) int -> r14
; V07 loc5 [V07,T03] ( 8, 4 ) long -> r15
-; V08 loc6 [V08,T12] ( 5, 6 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ushort]>
-; V09 loc7 [V09,T13] ( 3, 1.50) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V08 loc6 [V08,T13] ( 5, 6 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V09 loc7 [V09,T14] ( 3, 1.50) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V10 loc8 [V10,T05] ( 3, 1.50) int -> rdi
; V11 loc9 [V11,T02] ( 2, 4.50) long -> rdi
;* V12 loc10 [V12 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V13 loc11 [V13,T07] ( 2, 1 ) long -> rdi
;* V14 loc12 [V14 ] ( 0, 0 ) int -> zero-ref
;# V15 OutArgs [V15 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V16 cse0 [V16,T06] ( 3, 1.50) long -> rdi "CSE #01: moderate"
+; V17 rat0 [V17,T10] ( 3, 24 ) simd16 -> mm4 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 8
G_M38868_IG01:
; [review] Prolog saves rbp, r15, r14 and rbx; "push rax" provides the 8-byte local frame
; [review] that the epilogs release with "add rsp, 8".
push rbp
push r15
push r14
push rbx
push rax
lea rbp, [rsp+0x20]
mov rbx, rdi ; rbx = working pointer (copy of rdi)
;; size=15 bbWeight=1 PerfScore 5.75
G_M38868_IG02:
test rsi, rsi
jne SHORT G_M38868_IG05 ; rsi != 0: do the real work; otherwise fall through and return 0
;; size=5 bbWeight=1 PerfScore 1.25
G_M38868_IG03:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M38868_IG04:
add rsp, 8
pop rbx
pop r14
pop r15
pop rbp
ret
;; size=11 bbWeight=0.50 PerfScore 1.62
G_M38868_IG05:
mov r15, rbx ; r15 keeps the starting pointer; IG19 subtracts it from the final rbx
cmp rsi, 8
jb G_M38868_IG10 ; rsi < 8 (unsigned): take the non-vector path
vmovups xmm0, xmmword ptr [reloc @RWD00] ; xmm0 = 0xFF00 in every 16-bit lane (vptest mask)
vmovups xmm1, xmmword ptr [reloc @RWD16] ; xmm1 = 0x7F00 in every 16-bit lane (vpaddusw addend)
vpaddusw xmm2, xmm1, xmmword ptr [r15]
vpmovmskb r14d, xmm2
test r14d, 0xAAAA ; look only at the mask bits that came from odd-numbered bytes
jne G_M38868_IG18
add rsi, rsi ; rsi *= 2 (NOTE(review): presumably element count -> byte count; confirm against the C# source)
cmp rsi, 32
jb SHORT G_M38868_IG08
lea rbx, [r15+0x10]
and rbx, -16 ; rbx = first 16-byte-aligned address above r15
add rsi, r15
sub rsi, rbx ; rsi -= bytes skipped by the alignment step
cmp rsi, 32
jb SHORT G_M38868_IG07
lea rdi, [rbx+rsi-0x20] ; loop bound used by IG06: rbx + rsi - 32
align [0 bytes for IG06]
;; size=85 bbWeight=0.50 PerfScore 9.38
G_M38868_IG06:
; [review] Loop body: examines 32 bytes (two aligned xmm loads) per iteration.
vmovdqa xmm2, xmmword ptr [rbx]
vmovdqa xmm3, xmmword ptr [rbx+0x10]
; [review] Base: xmm4 = xmm2 | xmm3, then ZF = ((xmm4 & xmm0) == 0).
; [review] Diff: imm8 -88 is 0xA8 = (dst | src1) & src2, so xmm4 = (xmm2 | xmm3) & xmm0 and
; [review] "vptest xmm4, xmm4" yields the same ZF for the jne below.
- vpor xmm4, xmm2, xmm3
- vptest xmm4, xmm0
+ vmovaps xmm4, xmm2
+ vpternlogd xmm4, xmm3, xmm0, -88
+ vptest xmm4, xmm4
jne G_M38868_IG16 ; at least one of the two vectors has a masked bit set
add rbx, 32
cmp rbx, rdi
jbe SHORT G_M38868_IG06 ; continue while rbx <= rdi (unsigned)
- ;; size=33 bbWeight=4 PerfScore 55.33
+ ;; size=40 bbWeight=4 PerfScore 57.00
G_M38868_IG07:
test sil, 16
je SHORT G_M38868_IG09 ; bit 4 of rsi clear: skip this single 16-byte check
vmovdqa xmm2, xmmword ptr [rbx]
vptest xmm2, xmm0
jne G_M38868_IG17
;; size=21 bbWeight=0.50 PerfScore 4.62
G_M38868_IG08:
add rbx, 16
;; size=4 bbWeight=0.50 PerfScore 0.12
G_M38868_IG09:
movzx rdi, sil
test dil, 15
je G_M38868_IG19 ; low 4 bits of rsi are zero: nothing left to check
and rsi, 15
add rsi, rbx
mov rbx, rsi
sub rbx, 16 ; step back so the final 16-byte load ends at old rbx + (rsi & 15)
vmovups xmm2, xmmword ptr [rbx]
vptest xmm2, xmm0
jne G_M38868_IG17
add rbx, 16
jmp G_M38868_IG19
;; size=52 bbWeight=0.50 PerfScore 6.38
G_M38868_IG10:
; [review] Non-vector path (rsi < 8): bits 2, 1 and 0 of rsi select an 8-, 4- and 2-byte check.
test sil, 4
je SHORT G_M38868_IG12
mov rdi, qword ptr [r15]
mov rax, 0xD1FFAB1E
and rdi, rax
je SHORT G_M38868_IG11
xor ebx, ebx
tzcnt rbx, rdi
shr rbx, 3
and rbx, -2
add rbx, r15
jmp SHORT G_M38868_IG19
;; size=44 bbWeight=0.50 PerfScore 5.00
G_M38868_IG11:
lea rbx, [r15+0x08]
;; size=4 bbWeight=0.50 PerfScore 0.25
G_M38868_IG12:
test sil, 2
je SHORT G_M38868_IG13
mov edi, dword ptr [rbx]
test edi, 0xD1FFAB1E
jne SHORT G_M38868_IG14
add rbx, 4
;; size=20 bbWeight=0.50 PerfScore 2.38
G_M38868_IG13:
test sil, 1
je SHORT G_M38868_IG19
cmp word ptr [rbx], 255
ja SHORT G_M38868_IG19 ; 16-bit value above 255: do not advance past it
jmp SHORT G_M38868_IG15
;; size=15 bbWeight=0.50 PerfScore 3.62
G_M38868_IG14:
; [review] The only call in this method; edi still holds the dword loaded in IG12.
mov rax, 0xD1FFAB1E ; code for System.Text.Latin1Utility:FirstCharInUInt32IsLatin1(uint):ubyte
call [rax]System.Text.Latin1Utility:FirstCharInUInt32IsLatin1(uint):ubyte
test eax, eax
je SHORT G_M38868_IG19 ; helper returned 0: do not advance
;; size=16 bbWeight=0.50 PerfScore 2.25
G_M38868_IG15:
add rbx, 2
jmp SHORT G_M38868_IG19
;; size=6 bbWeight=0.50 PerfScore 1.12
G_M38868_IG16:
; [review] Reached from IG06 when the combined test fired: test the first vector alone.
; [review] If it is clean, advance rbx by 16 and move the second vector into xmm2 for IG17.
vptest xmm2, xmm0
jne SHORT G_M38868_IG17
add rbx, 16
vmovaps xmm2, xmm3
;; size=15 bbWeight=0.50 PerfScore 2.25
G_M38868_IG17:
vpaddusw xmm0, xmm2, xmm1
vpmovmskb r14d, xmm0
;; size=8 bbWeight=0.50 PerfScore 1.17
G_M38868_IG18:
; [review] Turn the byte mask in r14d into a byte offset and add it to rbx.
and r14d, 0xAAAA ; keep only the mask bits that came from the odd-numbered bytes
xor eax, eax
tzcnt eax, r14d
lea rbx, [rbx+rax-0x01] ; rax indexes an odd byte, so -1 gives the even byte offset
;; size=19 bbWeight=0.50 PerfScore 1.75
G_M38868_IG19:
; [review] Result: rax = (rbx - starting pointer in r15) >> 1.
mov rax, rbx
sub rax, r15
shr rax, 1
;; size=9 bbWeight=0.50 PerfScore 0.50
G_M38868_IG20:
add rsp, 8
pop rbx
pop r14
pop r15
pop rbp
ret
;; size=11 bbWeight=0.50 PerfScore 1.62
RWD00 dq FF00FF00FF00FF00h, FF00FF00FF00FF00h
RWD16 dq 7F007F007F007F00h, 7F007F007F007F00h
-; Total bytes of code 395, prolog size 15, PerfScore 106.50, instruction count 111, allocated bytes for code 395 (MethodHash=0f68682b) for method System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 402, prolog size 15, PerfScore 108.17, instruction count 112, allocated bytes for code 402 (MethodHash=0f68682b) for method System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong (FullOpts)
7 (4.70 % of base) - System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong
; Assembly listing for method System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong (FullOpts)
; [review] Diff summary: the only instruction change is in the loop block IG07, where
; [review] "vpor + vptest" becomes "vmovaps + vpternlogd + vptest reg,reg"
; [review] (40 -> 47 bytes, block PerfScore 67.33 -> 69.00). The single-vector vptest
; [review] instructions in IG02, IG05 and IG09 are unchanged context lines.
; [review] Totals: 149 -> 156 bytes, 43 -> 44 instructions, PerfScore 94.21 -> 95.88.
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T01] ( 6, 11.50) long -> rdi single-def
; V01 arg1 [V01,T02] ( 8, 8.50) long -> rsi single-def
; V02 arg2 [V02,T03] ( 3, 2.50) long -> rdx single-def
;* V03 loc0 [V03,T05] ( 0, 0 ) int -> zero-ref
;* V04 loc1 [V04 ] ( 0, 0 ) long -> zero-ref
-; V05 loc2 [V05,T08] ( 5, 7 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[short]>
+; V05 loc2 [V05,T09] ( 5, 7 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[short]>
;* V06 loc3 [V06 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
-; V07 loc4 [V07,T06] ( 14, 18.50) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[short]>
+; V07 loc4 [V07,T07] ( 14, 18.50) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[short]>
;* V08 loc5 [V08 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V09 loc6 [V09,T00] ( 12, 27 ) long -> rax
; V10 loc7 [V10,T04] ( 2, 4.50) long -> rdx
-; V11 loc8 [V11,T07] ( 3, 12 ) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[short]>
+; V11 loc8 [V11,T08] ( 3, 12 ) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[short]>
;* V12 loc9 [V12 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;# V13 OutArgs [V13 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V14 rat0 [V14,T06] ( 3, 24 ) simd16 -> mm3 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M23879_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M23879_IG02:
vmovups xmm0, xmmword ptr [reloc @RWD00] ; xmm0 = 0xFF00 in every 16-bit lane, reused by every vptest below
vmovups xmm1, xmmword ptr [rdi] ; first 16 bytes at [rdi]
vptest xmm1, xmm0
je SHORT G_M23879_IG05 ; no lane has a masked bit: go narrow and store
;; size=19 bbWeight=1 PerfScore 11.00
G_M23879_IG03:
xor eax, eax ; first vector failed the mask test: return 0
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M23879_IG04:
pop rbp
ret
;; size=2 bbWeight=0.50 PerfScore 0.75
G_M23879_IG05:
vpackuswb xmm1, xmm1, xmm1
vmovq qword ptr [rsi], xmm1 ; store the low 8 narrowed bytes
mov eax, 8
test sil, 8
jne SHORT G_M23879_IG06 ; bit 3 of rsi set: skip the second 8-byte step
vmovups xmm1, xmmword ptr [rdi+0x10]
vptest xmm1, xmm0
jne SHORT G_M23879_IG08 ; masked bit found: return with rax = 8
vpackuswb xmm1, xmm1, xmm1
vmovq qword ptr [rsi+0x08], xmm1
;; size=40 bbWeight=0.50 PerfScore 7.75
G_M23879_IG06:
; [review] rax = 16 - (rsi & 15); rdx -= 16 becomes the loop bound for IG07.
mov rax, rsi
and rax, 15
neg rax
add rax, 16
add rdx, -16
align [0 bytes for IG07]
;; size=18 bbWeight=0.50 PerfScore 0.62
G_M23879_IG07:
; [review] Loop body: reads 32 bytes at [rdi+2*rax] and writes 16 bytes at [rsi+rax].
vmovups xmm1, xmmword ptr [rdi+2*rax]
vmovups xmm2, xmmword ptr [rdi+2*rax+0x10]
; [review] Base: xmm3 = xmm1 | xmm2, then ZF = ((xmm3 & xmm0) == 0).
; [review] Diff: imm8 -88 is 0xA8 = (dst | src1) & src2, so xmm3 = (xmm1 | xmm2) & xmm0 and
; [review] "vptest xmm3, xmm3" yields the same ZF for the jne below.
- vpor xmm3, xmm1, xmm2
- vptest xmm3, xmm0
+ vmovaps xmm3, xmm1
+ vpternlogd xmm3, xmm2, xmm0, -88
+ vptest xmm3, xmm3
jne SHORT G_M23879_IG09
vpackuswb xmm1, xmm1, xmm2
vmovdqa xmmword ptr [rsi+rax], xmm1
add rax, 16
cmp rax, rdx
jbe SHORT G_M23879_IG07 ; continue while rax <= rdx (unsigned)
- ;; size=40 bbWeight=4 PerfScore 67.33
+ ;; size=47 bbWeight=4 PerfScore 69.00
G_M23879_IG08:
pop rbp
ret
;; size=2 bbWeight=0.50 PerfScore 0.75
G_M23879_IG09:
; [review] Combined test fired: if the first vector alone is clean, narrow and store
; [review] its 8 bytes and advance rax by 8 before returning.
vptest xmm1, xmm0
jne SHORT G_M23879_IG08
vpackuswb xmm0, xmm1, xmm1
vmovq qword ptr [rsi+rax], xmm0
add rax, 8
jmp SHORT G_M23879_IG08
;; size=22 bbWeight=0.50 PerfScore 4.62
RWD00 dq FF00FF00FF00FF00h, FF00FF00FF00FF00h
-; Total bytes of code 149, prolog size 4, PerfScore 94.21, instruction count 43, allocated bytes for code 149 (MethodHash=f65ba2b8) for method System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 156, prolog size 4, PerfScore 95.88, instruction count 44, allocated bytes for code 156 (MethodHash=f65ba2b8) for method System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong (FullOpts)
4 (1.08 % of base) - System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong
; REVIEW NOTES (annotations only; '-' lines are the base listing, '+' lines the diff listing).
;   * Net effect per the totals at the bottom: 371 -> 375 bytes, 104 -> 105 instructions,
;     PerfScore unchanged at 202.75.
;   * The only instruction change is in the 16-byte loop (IG08): a single vptest with a
;     memory operand becomes vpand into a new temp (V17 rat0, mm1) followed by
;     vptest xmm1, xmm1.
;   * IG05 gains 4 bytes of NOP compensation while its alignment padding shrinks from
;     6 to 2 bytes, so its reported size stays at 37.
; Assembly listing for method System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 8 single block inlinees; 2 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 38, 77 ) long -> rdi
; V01 arg1 [V01,T01] ( 17, 21 ) long -> rsi
; V02 loc0 [V02,T04] ( 12, 7 ) long -> rax
; V03 loc1 [V03,T02] ( 9, 11.50) int -> rcx
; V04 loc2 [V04,T05] ( 2, 4.50) long -> rcx
; V05 loc3 [V05,T06] ( 2, 4.50) long -> rcx
; V06 loc4 [V06,T07] ( 2, 4.50) long -> rcx
; V07 loc5 [V07,T03] ( 3, 8.50) int -> rdx
;# V08 OutArgs [V08 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V09 tmp1 [V09 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
;* V10 tmp2 [V10 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V11 tmp3 [V11 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
;* V12 tmp4 [V12 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V13 tmp5 [V13 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V14 tmp6 [V14 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V15 tmp7 [V15 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
-; V16 cse0 [V16,T08] ( 3, 5 ) simd16 -> mm0 "CSE #02: aggressive"
+; V16 cse0 [V16,T09] ( 3, 5 ) simd16 -> mm0 "CSE #02: aggressive"
+; V17 rat0 [V17,T08] ( 3, 24 ) simd16 -> mm1 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M50024_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
; IG02: keep the original arg0 pointer in rax; take the 64-byte path only when arg1 (rsi) >= 128.
G_M50024_IG02:
mov rax, rdi
cmp rsi, 128
jb SHORT G_M50024_IG05
;; size=12 bbWeight=1 PerfScore 1.50
; IG03: unaligned 64-byte probe at [rax]. vpmovb2m collects each byte's top bit into k1;
; a nonzero mask branches to IG10. Otherwise rcx = rax + rsi - 64 becomes the loop bound
; and rdi = (rax + 64) rounded down to a 64-byte boundary.
G_M50024_IG03:
vmovups zmm0, zmmword ptr [rax]
vpmovb2m k1, zmm0
kmovq rcx, k1
;; NOP compensation instructions of 3 bytes.
test rcx, rcx
jne G_M50024_IG10
lea rcx, [rax+rsi-0x40]
lea rdi, [rax+0x40]
and rdi, -64
align [6 bytes for IG04]
;; size=48 bbWeight=0.50 PerfScore 5.62
; IG04: aligned 64-byte loop; leaves for IG09 on the first nonzero mask or once rdi > rcx.
G_M50024_IG04:
vmovdqa32 zmm0, zmmword ptr [rdi]
vpmovb2m k1, zmm0
kmovq rdx, k1
;; NOP compensation instructions of 3 bytes.
test rdx, rdx
jne G_M50024_IG09
add rdi, 64
cmp rdi, rcx
jbe SHORT G_M50024_IG04
jmp SHORT G_M50024_IG09
;; NOP compensation instructions of 3 bytes.
;; size=43 bbWeight=4 PerfScore 51.00
; IG05: same shape with 32-byte vectors, entered when 64 <= rsi < 128.
; vpmovmskb gathers the byte top bits into ecx.
G_M50024_IG05:
cmp rsi, 64
jb SHORT G_M50024_IG07
vmovups ymm0, ymmword ptr [rax]
vpmovmskb ecx, ymm0
test ecx, ecx
jne SHORT G_M50024_IG10
+ ;; NOP compensation instructions of 4 bytes.
lea rcx, [rax+rsi-0x20]
lea rdi, [rax+0x20]
and rdi, -32
- align [6 bytes for IG06]
+ align [2 bytes for IG06]
;; size=37 bbWeight=0.50 PerfScore 6.25
; IG06: aligned 32-byte loop, same exit conditions as IG04.
G_M50024_IG06:
vmovdqa ymm0, ymmword ptr [rdi]
vpmovmskb edx, ymm0
test edx, edx
jne SHORT G_M50024_IG09
add rdi, 32
cmp rdi, rcx
jbe SHORT G_M50024_IG06
jmp SHORT G_M50024_IG09
;; size=23 bbWeight=4 PerfScore 51.00
; IG07: 16-byte path, entered when 32 <= rsi < 64. xmm0 = RWD00 (0x80 in every byte);
; vptest clears ZF when (mask AND data) != 0, so jne is taken when any byte has its top bit set.
G_M50024_IG07:
cmp rsi, 32
jb SHORT G_M50024_IG10
vmovups xmm0, xmmword ptr [reloc @RWD00]
vptest xmm0, xmmword ptr [rax]
jne SHORT G_M50024_IG10
lea rcx, [rax+rsi-0x10]
lea rdi, [rax+0x10]
and rdi, -16
align [0 bytes for IG08]
;; size=34 bbWeight=0.50 PerfScore 6.50
; IG08: aligned 16-byte loop and the site of the diff. Both forms set ZF exactly when
; (xmm0 AND [rdi]) == 0, but the '+' form materialises the AND in xmm1 first, costing
; one extra instruction and 4 extra bytes (size 16 -> 20). The listing reports the same
; PerfScore (34.00) for both.
G_M50024_IG08:
- vptest xmm0, xmmword ptr [rdi]
+ vpand xmm1, xmm0, xmmword ptr [rdi]
+ vptest xmm1, xmm1
jne SHORT G_M50024_IG09
add rdi, 16
cmp rdi, rcx
jbe SHORT G_M50024_IG08
- ;; size=16 bbWeight=4 PerfScore 34.00
+ ;; size=20 bbWeight=4 PerfScore 34.00
; IG09: rsi = rsi - (rdi - rax), i.e. the count minus the bytes already stepped over.
G_M50024_IG09:
sub rsi, rdi
add rsi, rax
;; size=6 bbWeight=0.50 PerfScore 0.25
; IG10..IG18: scalar tail.
; NOTE(review): 0xD1FFAB1E looks like the disassembler's placeholder for the real
; immediate (presumably the per-byte 0x80 mask) - confirm against a non-diffable dump.
G_M50024_IG10:
cmp rsi, 8
jb SHORT G_M50024_IG15
align [0 bytes for IG11]
;; size=6 bbWeight=1 PerfScore 1.25
; IG11: test two dwords at once through their OR; if no masked bit is set, go to IG14.
G_M50024_IG11:
mov ecx, dword ptr [rdi]
mov edx, dword ptr [rdi+0x04]
mov r8d, ecx
or r8d, edx
test r8d, 0xD1FFAB1E
je SHORT G_M50024_IG14
;; size=20 bbWeight=4 PerfScore 23.00
; IG12: if the first dword is clean, the hit is in the second: switch to it and skip 4 bytes.
G_M50024_IG12:
test ecx, 0xD1FFAB1E
jne SHORT G_M50024_IG13
mov ecx, edx
add rdi, 4
;; size=14 bbWeight=0.50 PerfScore 0.88
; IG13: tzcnt of the masked dword, shifted right by 3, turns the lowest set bit position
; into a byte offset that is added to rdi.
G_M50024_IG13:
and ecx, 0xD1FFAB1E
xor esi, esi
tzcnt esi, ecx
shr esi, 3
mov ecx, esi
add rdi, rcx
jmp SHORT G_M50024_IG18
;; size=22 bbWeight=0.50 PerfScore 2.75
; IG14: consume 8 bytes and repeat while at least 8 remain.
G_M50024_IG14:
add rdi, 8
add rsi, -8
cmp rsi, 8
jae SHORT G_M50024_IG11
;; size=14 bbWeight=4 PerfScore 7.00
; IG15/IG16/IG17: handle the remainder using bits 2, 1 and 0 of rsi (4, 2, then 1 byte).
G_M50024_IG15:
test sil, 4
je SHORT G_M50024_IG16
mov ecx, dword ptr [rdi]
test ecx, 0xD1FFAB1E
jne SHORT G_M50024_IG13
add rdi, 4
;; size=20 bbWeight=0.50 PerfScore 2.38
G_M50024_IG16:
test sil, 2
je SHORT G_M50024_IG17
movzx rcx, word ptr [rdi]
test ecx, 0xD1FFAB1E
jne SHORT G_M50024_IG13
add rdi, 2
;; size=21 bbWeight=0.50 PerfScore 2.38
; IG17: last byte - rdi advances by one only when the byte is non-negative as a signed value.
G_M50024_IG17:
test sil, 1
je SHORT G_M50024_IG18
lea rcx, [rdi+0x01]
cmp byte ptr [rdi], 0
cmovge rdi, rcx
;; size=17 bbWeight=0.50 PerfScore 2.50
; IG18: return value rax = rdi - (original pointer saved in rax).
G_M50024_IG18:
mov rcx, rdi
sub rcx, rax
mov rax, rcx
;; size=9 bbWeight=1 PerfScore 0.75
G_M50024_IG19:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=1 PerfScore 2.50
RWD00 dq 8080808080808080h, 8080808080808080h
-; Total bytes of code 371, prolog size 4, PerfScore 202.75, instruction count 104, allocated bytes for code 371 (MethodHash=58923c97) for method System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 375, prolog size 4, PerfScore 202.75, instruction count 105, allocated bytes for code 375 (MethodHash=58923c97) for method System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong (FullOpts)
4 (1.19 % of base) - System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong
; REVIEW NOTES (annotations only; '-' lines are the base listing, '+' lines the diff listing).
;   * Net effect per the totals at the bottom: 337 -> 341 bytes, 91 -> 93 instructions,
;     PerfScore 197.12 -> 197.00.
;   * Instruction changes are confined to the 32-byte loop (IG06) and the 16-byte loop (IG08):
;     each vptest with a memory operand becomes vpand into a new temp (V22 rat0 / V23 rat1,
;     both mm1) followed by vptest reg, reg. Each loop grows by 4 bytes.
;   * Alignment padding absorbs part of the growth: IG05 trades 4 bytes of align padding
;     for 4 bytes of NOP compensation, and IG07's padding drops from 12 to 8 bytes.
;   * The local table is renumbered because several zero-ref temps from the base listing
;     no longer appear; the inlinee counts in the header also change (8/5 -> 10/3).
; Assembly listing for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 8 single block inlinees; 5 inlinees without PGO data
+; 0 inlinees with PGO data; 10 single block inlinees; 3 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 32, 74 ) long -> rdi
; V01 arg1 [V01,T01] ( 16, 20.50) long -> rsi
; V02 loc0 [V02,T04] ( 12, 7 ) long -> rax
; V03 loc1 [V03,T02] ( 7, 10.50) int -> rcx
; V04 loc2 [V04,T05] ( 2, 4.50) long -> rcx
; V05 loc3 [V05,T06] ( 2, 4.50) long -> rcx
; V06 loc4 [V06,T07] ( 2, 4.50) long -> rcx
; V07 loc5 [V07,T03] ( 3, 8.50) int -> rdx
;# V08 OutArgs [V08 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V09 tmp1 [V09 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
;* V10 tmp2 [V10 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V11 tmp3 [V11 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
;* V12 tmp4 [V12 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
-;* V13 tmp5 [V13 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V14 tmp6 [V14 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V13 tmp5 [V13 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V14 tmp6 [V14 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V15 tmp7 [V15 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V16 tmp8 [V16 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V17 tmp9 [V17 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V18 tmp10 [V18 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V19 tmp11 [V19 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V20 tmp12 [V20 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V21 tmp13 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V22 tmp14 [V22 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V23 tmp15 [V23 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
-;* V24 tmp16 [V24,T08] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V25 cse0 [V25,T09] ( 3, 5 ) simd64 -> mm0 "CSE #01: aggressive"
-; V26 cse1 [V26,T10] ( 3, 5 ) simd32 -> mm0 "CSE #04: aggressive"
-; V27 cse2 [V27,T11] ( 3, 5 ) simd16 -> mm0 "CSE #05: aggressive"
+;* V16 tmp8 [V16 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V17 tmp9 [V17 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
+;* V18 tmp10 [V18,T08] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V19 cse0 [V19,T11] ( 3, 5 ) simd64 -> mm0 "CSE #01: aggressive"
+; V20 cse1 [V20,T12] ( 3, 5 ) simd32 -> mm0 "CSE #04: aggressive"
+; V21 cse2 [V21,T13] ( 3, 5 ) simd16 -> mm0 "CSE #05: aggressive"
+; V22 rat0 [V22,T09] ( 3, 24 ) simd32 -> mm1 "ReplaceWithLclVar is creating a new local variable"
+; V23 rat1 [V23,T10] ( 3, 24 ) simd16 -> mm1 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M42618_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
; IG02: keep the original arg0 pointer in rax; take the 64-byte path only when arg1 (rsi) >= 64.
; The 2*rsi scaling in the bound computations below shows rsi counts 2-byte elements.
G_M42618_IG02:
mov rax, rdi
cmp rsi, 64
jb SHORT G_M42618_IG05
;; size=9 bbWeight=1 PerfScore 1.50
; IG03: zmm0 = RWD00 (0xFF80 in every 16-bit lane). vptestmw sets a k1 bit for each word
; where (mask AND data) != 0; kortestd sets ZF when k1 is all zero, so jne means a hit.
; On a clean probe: rcx = rax + 2*rsi - 64 (loop bound), rdi = (rax + 64) rounded down to 64.
G_M42618_IG03:
vmovups zmm0, zmmword ptr [reloc @RWD00]
vptestmw k1, zmm0, zmmword ptr [rax]
kortestd k1, k1
;; NOP compensation instructions of 3 bytes.
jne G_M42618_IG10
lea rcx, [rax+2*rsi-0x40]
lea rdi, [rax+0x40]
and rdi, -64
align [0 bytes for IG04]
;; size=43 bbWeight=0.50 PerfScore 6.38
; IG04: 64-byte loop; unchanged by the diff (vptestmw keeps its memory operand).
G_M42618_IG04:
vptestmw k1, zmm0, zmmword ptr [rdi]
kortestd k1, k1
;; NOP compensation instructions of 3 bytes.
jne G_M42618_IG09
add rdi, 64
cmp rdi, rcx
jbe SHORT G_M42618_IG04
jmp SHORT G_M42618_IG09
;; NOP compensation instructions of 3 bytes.
;; size=34 bbWeight=4 PerfScore 46.00
; IG05: 32-byte path, entered when 32 <= rsi < 64; the probe at [rax] still uses vptest
; with a memory operand in both listings.
G_M42618_IG05:
cmp rsi, 32
jb SHORT G_M42618_IG07
vmovups ymm0, ymmword ptr [reloc @RWD00]
vptest ymm0, ymmword ptr [rax]
jne SHORT G_M42618_IG10
+ ;; NOP compensation instructions of 4 bytes.
lea rcx, [rax+2*rsi-0x20]
lea rdi, [rax+0x20]
and rdi, -32
- align [4 bytes for IG06]
- ;; size=38 bbWeight=0.50 PerfScore 8.12
+ align [0 bytes for IG06]
+ ;; size=38 bbWeight=0.50 PerfScore 8.00
; IG06: 32-byte loop, first diff site. Both forms set ZF exactly when (ymm0 AND [rdi]) == 0;
; the '+' form computes the AND into ymm1 first (size 18 -> 22, PerfScore reported unchanged).
G_M42618_IG06:
- vptest ymm0, ymmword ptr [rdi]
+ vpand ymm1, ymm0, ymmword ptr [rdi]
+ vptest ymm1, ymm1
jne SHORT G_M42618_IG09
add rdi, 32
cmp rdi, rcx
jbe SHORT G_M42618_IG06
jmp SHORT G_M42618_IG09
- ;; size=18 bbWeight=4 PerfScore 50.00
+ ;; size=22 bbWeight=4 PerfScore 50.00
; IG07: 16-byte path, entered when 16 <= rsi < 32.
G_M42618_IG07:
cmp rsi, 16
jb SHORT G_M42618_IG10
vmovups xmm0, xmmword ptr [reloc @RWD00]
vptest xmm0, xmmword ptr [rax]
jne SHORT G_M42618_IG10
lea rcx, [rax+2*rsi-0x10]
lea rdi, [rax+0x10]
and rdi, -16
- align [12 bytes for IG08]
- ;; size=46 bbWeight=0.50 PerfScore 6.62
+ align [8 bytes for IG08]
+ ;; size=42 bbWeight=0.50 PerfScore 6.62
; IG08: 16-byte loop, second diff site; same vptest -> vpand + vptest rewrite (size 16 -> 20).
G_M42618_IG08:
- vptest xmm0, xmmword ptr [rdi]
+ vpand xmm1, xmm0, xmmword ptr [rdi]
+ vptest xmm1, xmm1
jne SHORT G_M42618_IG09
add rdi, 16
cmp rdi, rcx
jbe SHORT G_M42618_IG08
- ;; size=16 bbWeight=4 PerfScore 34.00
+ ;; size=20 bbWeight=4 PerfScore 34.00
; IG09: rsi -= (rdi - rax) >> 1, i.e. subtract the number of 2-byte elements already stepped over.
G_M42618_IG09:
mov rcx, rdi
sub rcx, rax
shr rcx, 1
sub rsi, rcx
;; size=12 bbWeight=0.50 PerfScore 0.62
; IG10..IG18: scalar tail.
; NOTE(review): 0xD1FFAB1E looks like the disassembler's placeholder for the real
; immediate (presumably a per-word 0xFF80 mask) - confirm against a non-diffable dump.
G_M42618_IG10:
cmp rsi, 4
jb SHORT G_M42618_IG15
align [0 bytes for IG11]
;; size=6 bbWeight=1 PerfScore 1.25
; IG11: test two dwords (four 2-byte elements) at once through their OR.
G_M42618_IG11:
mov ecx, dword ptr [rdi]
mov edx, dword ptr [rdi+0x04]
mov r8d, ecx
or r8d, edx
test r8d, 0xD1FFAB1E
je SHORT G_M42618_IG14
;; size=20 bbWeight=4 PerfScore 23.00
; IG12: if the first dword is clean, the hit is in the second: switch to it and skip 4 bytes.
G_M42618_IG12:
test ecx, 0xD1FFAB1E
jne SHORT G_M42618_IG13
mov ecx, edx
add rdi, 4
;; size=14 bbWeight=0.50 PerfScore 0.88
; IG13: bits 7..15 of ecx decide which element of the pair hit: if any is set, rdi stays
; (IG18); otherwise rdi advances by one element (IG17).
G_M42618_IG13:
test ecx, 0xFF80
jne SHORT G_M42618_IG18
jmp SHORT G_M42618_IG17
;; size=10 bbWeight=0.50 PerfScore 1.62
; IG14: consume 8 bytes (4 elements) and repeat while at least 4 elements remain.
G_M42618_IG14:
add rdi, 8
add rsi, -4
cmp rsi, 4
jae SHORT G_M42618_IG11
;; size=14 bbWeight=4 PerfScore 7.00
; IG15/IG16: handle the remainder using bit 1 (two elements) and bit 0 (one element) of rsi.
G_M42618_IG15:
test sil, 2
je SHORT G_M42618_IG16
mov ecx, dword ptr [rdi]
test ecx, 0xD1FFAB1E
jne SHORT G_M42618_IG13
add rdi, 4
;; size=20 bbWeight=0.50 PerfScore 2.38
; IG16: last element - skip the advance in IG17 when the word is above 127 (unsigned compare).
G_M42618_IG16:
test sil, 1
je SHORT G_M42618_IG18
cmp word ptr [rdi], 127
ja SHORT G_M42618_IG18
;; size=12 bbWeight=0.50 PerfScore 2.62
G_M42618_IG17:
add rdi, 2
;; size=4 bbWeight=0.50 PerfScore 0.12
; IG18: return value rax = (rdi - original pointer) >> 1, a count of 2-byte elements.
G_M42618_IG18:
mov rcx, rdi
sub rcx, rax
mov rax, rcx
shr rax, 1
;; size=12 bbWeight=1 PerfScore 1.25
G_M42618_IG19:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=1 PerfScore 2.50
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 337, prolog size 4, PerfScore 197.12, instruction count 91, allocated bytes for code 337 (MethodHash=bc9a5985) for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 341, prolog size 4, PerfScore 197.00, instruction count 93, allocated bytes for code 341 (MethodHash=bc9a5985) for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong (FullOpts)