Skip to content

Instantly share code, notes, and snippets.

@MihuBot
Created July 14, 2024 23:49
Show Gist options
  • Save MihuBot/b9e3821b9ab1891a35cf85f702d2f8d0 to your computer and use it in GitHub Desktop.

Top method regressions (code size in bytes, diff vs. base)

+36 bytes (+10.68 % of base, 337 -> 373 bytes) - System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong
 ; Assembly listing for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 8 single block inlinees; 5 inlinees without PGO data
+; 0 inlinees with PGO data; 18 single block inlinees; 25 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] ( 32, 74   )    long  ->  rdi        
 ;  V01 arg1         [V01,T01] ( 16, 20.50)    long  ->  rsi        
 ;  V02 loc0         [V02,T04] ( 12,  7   )    long  ->  rax        
 ;  V03 loc1         [V03,T02] (  7, 10.50)     int  ->  rcx        
 ;  V04 loc2         [V04,T05] (  2,  4.50)    long  ->  rcx        
 ;  V05 loc3         [V05,T06] (  2,  4.50)    long  ->  rcx        
 ;  V06 loc4         [V06,T07] (  2,  4.50)    long  ->  rcx        
 ;  V07 loc5         [V07,T03] (  3,  8.50)     int  ->  rdx        
 ;# V08 OutArgs      [V08    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V09 tmp1         [V09    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
-;* V10 tmp2         [V10    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
-;* V11 tmp3         [V11    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
-;* V12 tmp4         [V12    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
-;* V13 tmp5         [V13    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V14 tmp6         [V14    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;  V09 tmp1         [V09,T21] (  2,  2   )  simd16  ->  mm0         "spilled call-like call argument"
+;  V10 tmp2         [V10,T22] (  2,  2   )  simd32  ->  mm0         "spilled call-like call argument"
+;  V11 tmp3         [V11,T23] (  2,  2   )  simd64  ->  mm0         "spilled call-like call argument"
+;* V12 tmp4         [V12    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V13 tmp5         [V13    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V14 tmp6         [V14,T12] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
 ;* V15 tmp7         [V15    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V16 tmp8         [V16    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V17 tmp9         [V17    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V18 tmp10        [V18    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V19 tmp11        [V19    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V20 tmp12        [V20    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V21 tmp13        [V21    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V22 tmp14        [V22    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V23 tmp15        [V23    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
-;* V24 tmp16        [V24,T08] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V25 cse0         [V25,T09] (  3,  5   )  simd64  ->  mm0         "CSE #01: aggressive"
-;  V26 cse1         [V26,T10] (  3,  5   )  simd32  ->  mm0         "CSE #04: aggressive"
-;  V27 cse2         [V27,T11] (  3,  5   )  simd16  ->  mm0         "CSE #05: aggressive"
+;* V16 tmp8         [V16    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V17 tmp9         [V17    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V18 tmp10        [V18    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V19 tmp11        [V19    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V20 tmp12        [V20,T15] (  2, 16   )  simd64  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V21 tmp13        [V21    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V22 tmp14        [V22    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V23 tmp15        [V23,T08] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V24 tmp16        [V24    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V25 tmp17        [V25    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V26 tmp18        [V26    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V27 tmp19        [V27    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V28 tmp20        [V28    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V29 tmp21        [V29    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V30 tmp22        [V30    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V31 tmp23        [V31,T13] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V32 tmp24        [V32    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V33 tmp25        [V33    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V34 tmp26        [V34    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V35 tmp27        [V35    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V36 tmp28        [V36    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V37 tmp29        [V37,T16] (  2, 16   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V38 tmp30        [V38    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V39 tmp31        [V39    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V40 tmp32        [V40,T09] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V41 tmp33        [V41    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V42 tmp34        [V42    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V43 tmp35        [V43    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V44 tmp36        [V44    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V45 tmp37        [V45    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V46 tmp38        [V46    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V47 tmp39        [V47    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V48 tmp40        [V48,T14] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V49 tmp41        [V49    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V50 tmp42        [V50    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V51 tmp43        [V51    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V52 tmp44        [V52    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V53 tmp45        [V53    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V54 tmp46        [V54,T17] (  2, 16   )  simd16  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V55 tmp47        [V55    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V56 tmp48        [V56    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V57 tmp49        [V57,T10] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V58 tmp50        [V58    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V59 tmp51        [V59    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V60 tmp52        [V60    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V61 tmp53        [V61    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V62 tmp54        [V62    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V63 tmp55        [V63    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
+;* V64 tmp56        [V64,T11] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V65 cse0         [V65,T18] (  3,  5   )  simd64  ->  mm1         "CSE #01: moderate"
+;  V66 cse1         [V66,T19] (  3,  5   )  simd32  ->  mm1         "CSE #04: moderate"
+;  V67 cse2         [V67,T20] (  3,  5   )  simd16  ->  mm1         "CSE #05: moderate"
 ;
 ; Lcl frame size = 0
 
 G_M42618_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M42618_IG02:
        mov      rax, rdi
        cmp      rsi, 64
        jb       SHORT G_M42618_IG05
 						;; size=9 bbWeight=1 PerfScore 1.50
 G_M42618_IG03:
-       vmovups  zmm0, zmmword ptr [reloc @RWD00]
-       vptestmw k1, zmm0, zmmword ptr [rax]
+       vmovups  zmm0, zmmword ptr [rax]
+       vmovups  zmm1, zmmword ptr [reloc @RWD00]
+       vptestmw k1, zmm1, zmm0
        kortestd k1, k1
 		  ;; NOP compensation instructions of 3 bytes.
        jne      G_M42618_IG10
        lea      rcx, [rax+2*rsi-0x40]
        lea      rdi, [rax+0x40]
        and      rdi, -64
-       align    [0 bytes for IG04]
-						;; size=43 bbWeight=0.50 PerfScore 6.38
+       align    [2 bytes for IG04]
+						;; size=51 bbWeight=0.50 PerfScore 7.00
 G_M42618_IG04:
-       vptestmw k1, zmm0, zmmword ptr [rdi]
+       vmovdqa32 zmm0, zmmword ptr [rdi]
+       vptestmw k1, zmm1, zmm0
        kortestd k1, k1
 		  ;; NOP compensation instructions of 3 bytes.
        jne      G_M42618_IG09
        add      rdi, 64
        cmp      rdi, rcx
        jbe      SHORT G_M42618_IG04
-       jmp      SHORT G_M42618_IG09
-		  ;; NOP compensation instructions of 3 bytes.
-						;; size=34 bbWeight=4 PerfScore 46.00
+       jmp      G_M42618_IG09
+						;; size=40 bbWeight=4 PerfScore 50.00
 G_M42618_IG05:
        cmp      rsi, 32
        jb       SHORT G_M42618_IG07
-       vmovups  ymm0, ymmword ptr [reloc @RWD00]
-       vptest   ymm0, ymmword ptr [rax]
+       vmovups  ymm0, ymmword ptr [rax]
+       vmovups  ymm1, ymmword ptr [reloc @RWD00]
+       vptest   ymm1, ymm0
        jne      SHORT G_M42618_IG10
+		  ;; NOP compensation instructions of 4 bytes.
        lea      rcx, [rax+2*rsi-0x20]
        lea      rdi, [rax+0x20]
        and      rdi, -32
-       align    [4 bytes for IG06]
-						;; size=38 bbWeight=0.50 PerfScore 8.12
+       align    [14 bytes for IG06]
+						;; size=56 bbWeight=0.50 PerfScore 9.12
 G_M42618_IG06:
-       vptest   ymm0, ymmword ptr [rdi]
+       vmovdqa  ymm0, ymmword ptr [rdi]
+       vptest   ymm1, ymm0
        jne      SHORT G_M42618_IG09
        add      rdi, 32
        cmp      rdi, rcx
        jbe      SHORT G_M42618_IG06
        jmp      SHORT G_M42618_IG09
-						;; size=18 bbWeight=4 PerfScore 50.00
+						;; size=22 bbWeight=4 PerfScore 58.00
 G_M42618_IG07:
        cmp      rsi, 16
        jb       SHORT G_M42618_IG10
-       vmovups  xmm0, xmmword ptr [reloc @RWD00]
-       vptest   xmm0, xmmword ptr [rax]
+       vmovups  xmm0, xmmword ptr [rax]
+       vmovups  xmm1, xmmword ptr [reloc @RWD00]
+       vptest   xmm1, xmm0
        jne      SHORT G_M42618_IG10
        lea      rcx, [rax+2*rsi-0x10]
        lea      rdi, [rax+0x10]
        and      rdi, -16
-       align    [12 bytes for IG08]
-						;; size=46 bbWeight=0.50 PerfScore 6.62
+       align    [4 bytes for IG08]
+						;; size=42 bbWeight=0.50 PerfScore 7.12
 G_M42618_IG08:
-       vptest   xmm0, xmmword ptr [rdi]
+       vmovdqa  xmm0, xmmword ptr [rdi]
+       vptest   xmm1, xmm0
        jne      SHORT G_M42618_IG09
        add      rdi, 16
        cmp      rdi, rcx
        jbe      SHORT G_M42618_IG08
-						;; size=16 bbWeight=4 PerfScore 34.00
+						;; size=20 bbWeight=4 PerfScore 38.00
 G_M42618_IG09:
        mov      rcx, rdi
        sub      rcx, rax
        shr      rcx, 1
        sub      rsi, rcx
 						;; size=12 bbWeight=0.50 PerfScore 0.62
 G_M42618_IG10:
        cmp      rsi, 4
        jb       SHORT G_M42618_IG15
        align    [0 bytes for IG11]
 						;; size=6 bbWeight=1 PerfScore 1.25
 G_M42618_IG11:
        mov      ecx, dword ptr [rdi]
        mov      edx, dword ptr [rdi+0x04]
        mov      r8d, ecx
        or       r8d, edx
        test     r8d, 0xD1FFAB1E
        je       SHORT G_M42618_IG14
 						;; size=20 bbWeight=4 PerfScore 23.00
 G_M42618_IG12:
        test     ecx, 0xD1FFAB1E
        jne      SHORT G_M42618_IG13
        mov      ecx, edx
        add      rdi, 4
 						;; size=14 bbWeight=0.50 PerfScore 0.88
 G_M42618_IG13:
        test     ecx, 0xFF80
        jne      SHORT G_M42618_IG18
        jmp      SHORT G_M42618_IG17
 						;; size=10 bbWeight=0.50 PerfScore 1.62
 G_M42618_IG14:
        add      rdi, 8
        add      rsi, -4
        cmp      rsi, 4
        jae      SHORT G_M42618_IG11
 						;; size=14 bbWeight=4 PerfScore 7.00
 G_M42618_IG15:
        test     sil, 2
        je       SHORT G_M42618_IG16
        mov      ecx, dword ptr [rdi]
        test     ecx, 0xD1FFAB1E
        jne      SHORT G_M42618_IG13
        add      rdi, 4
 						;; size=20 bbWeight=0.50 PerfScore 2.38
 G_M42618_IG16:
        test     sil, 1
        je       SHORT G_M42618_IG18
        cmp      word  ptr [rdi], 127
        ja       SHORT G_M42618_IG18
 						;; size=12 bbWeight=0.50 PerfScore 2.62
 G_M42618_IG17:
        add      rdi, 2
 						;; size=4 bbWeight=0.50 PerfScore 0.12
 G_M42618_IG18:
        mov      rcx, rdi
        sub      rcx, rax
        mov      rax, rcx
        shr      rax, 1
 						;; size=12 bbWeight=1 PerfScore 1.25
 G_M42618_IG19:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=1 PerfScore 2.50
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 337, prolog size 4, PerfScore 197.12, instruction count 91, allocated bytes for code 337 (MethodHash=bc9a5985) for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 373, prolog size 4, PerfScore 215.25, instruction count 97, allocated bytes for code 373 (MethodHash=bc9a5985) for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong (FullOpts)
+11 bytes (+2.96 % of base, 371 -> 382 bytes) - System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong
 ; Assembly listing for method System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 8 single block inlinees; 2 inlinees without PGO data
+; 0 inlinees with PGO data; 12 single block inlinees; 8 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] ( 38, 77   )    long  ->  rdi        
 ;  V01 arg1         [V01,T01] ( 17, 21   )    long  ->  rsi        
 ;  V02 loc0         [V02,T04] ( 12,  7   )    long  ->  rax        
 ;  V03 loc1         [V03,T02] (  9, 11.50)     int  ->  rcx        
 ;  V04 loc2         [V04,T05] (  2,  4.50)    long  ->  rcx        
 ;  V05 loc3         [V05,T06] (  2,  4.50)    long  ->  rcx        
 ;  V06 loc4         [V06,T07] (  2,  4.50)    long  ->  rcx        
 ;  V07 loc5         [V07,T03] (  3,  8.50)     int  ->  rdx        
 ;# V08 OutArgs      [V08    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V09 tmp1         [V09    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;  V09 tmp1         [V09,T12] (  2,  2   )  simd16  ->  mm0         "spilled call-like call argument"
 ;* V10 tmp2         [V10    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;* V11 tmp3         [V11    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
-;* V12 tmp4         [V12    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V13 tmp5         [V13    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V14 tmp6         [V14    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V15 tmp7         [V15    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
-;  V16 cse0         [V16,T08] (  3,  5   )  simd16  ->  mm0         "CSE #02: aggressive"
+;* V12 tmp4         [V12    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V13 tmp5         [V13    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V14 tmp6         [V14,T09] (  0,  0   )   ubyte  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V15 tmp7         [V15    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V16 tmp8         [V16    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V17 tmp9         [V17    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline stloc first use temp"
+;* V18 tmp10        [V18    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
+;* V19 tmp11        [V19    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V20 tmp12        [V20,T10] (  2, 16   )  simd16  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V21 tmp13        [V21    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V22 tmp14        [V22    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V23 tmp15        [V23,T08] (  0,  0   )   ubyte  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V24 tmp16        [V24    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V25 tmp17        [V25    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V26 tmp18        [V26    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline stloc first use temp"
+;* V27 tmp19        [V27    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
+;* V28 tmp20        [V28    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V29 tmp21        [V29    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
+;  V30 cse0         [V30,T11] (  3,  5   )  simd16  ->  mm1         "CSE #02: moderate"
 ;
 ; Lcl frame size = 0
 
 G_M50024_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M50024_IG02:
        mov      rax, rdi
        cmp      rsi, 128
        jb       SHORT G_M50024_IG05
 						;; size=12 bbWeight=1 PerfScore 1.50
 G_M50024_IG03:
        vmovups  zmm0, zmmword ptr [rax]
        vpmovb2m k1, zmm0
        kmovq    rcx, k1
 		  ;; NOP compensation instructions of 3 bytes.
        test     rcx, rcx
        jne      G_M50024_IG10
        lea      rcx, [rax+rsi-0x40]
        lea      rdi, [rax+0x40]
        and      rdi, -64
        align    [6 bytes for IG04]
 						;; size=48 bbWeight=0.50 PerfScore 5.62
 G_M50024_IG04:
        vmovdqa32 zmm0, zmmword ptr [rdi]
        vpmovb2m k1, zmm0
        kmovq    rdx, k1
 		  ;; NOP compensation instructions of 3 bytes.
        test     rdx, rdx
        jne      G_M50024_IG09
        add      rdi, 64
        cmp      rdi, rcx
        jbe      SHORT G_M50024_IG04
        jmp      SHORT G_M50024_IG09
 		  ;; NOP compensation instructions of 3 bytes.
 						;; size=43 bbWeight=4 PerfScore 51.00
 G_M50024_IG05:
        cmp      rsi, 64
        jb       SHORT G_M50024_IG07
        vmovups  ymm0, ymmword ptr [rax]
        vpmovmskb ecx, ymm0
        test     ecx, ecx
        jne      SHORT G_M50024_IG10
+		  ;; NOP compensation instructions of 4 bytes.
        lea      rcx, [rax+rsi-0x20]
        lea      rdi, [rax+0x20]
        and      rdi, -32
-       align    [6 bytes for IG06]
+       align    [2 bytes for IG06]
 						;; size=37 bbWeight=0.50 PerfScore 6.25
 G_M50024_IG06:
        vmovdqa  ymm0, ymmword ptr [rdi]
        vpmovmskb edx, ymm0
        test     edx, edx
        jne      SHORT G_M50024_IG09
        add      rdi, 32
        cmp      rdi, rcx
        jbe      SHORT G_M50024_IG06
        jmp      SHORT G_M50024_IG09
 						;; size=23 bbWeight=4 PerfScore 51.00
 G_M50024_IG07:
        cmp      rsi, 32
        jb       SHORT G_M50024_IG10
-       vmovups  xmm0, xmmword ptr [reloc @RWD00]
-       vptest   xmm0, xmmword ptr [rax]
+       vmovups  xmm0, xmmword ptr [rax]
+       vmovups  xmm1, xmmword ptr [reloc @RWD00]
+       vptest   xmm1, xmm0
        jne      SHORT G_M50024_IG10
        lea      rcx, [rax+rsi-0x10]
        lea      rdi, [rax+0x10]
        and      rdi, -16
-       align    [0 bytes for IG08]
-						;; size=34 bbWeight=0.50 PerfScore 6.50
+       align    [3 bytes for IG08]
+						;; size=41 bbWeight=0.50 PerfScore 7.12
 G_M50024_IG08:
-       vptest   xmm0, xmmword ptr [rdi]
+       vmovdqa  xmm0, xmmword ptr [rdi]
+       vptest   xmm1, xmm0
        jne      SHORT G_M50024_IG09
        add      rdi, 16
        cmp      rdi, rcx
        jbe      SHORT G_M50024_IG08
-						;; size=16 bbWeight=4 PerfScore 34.00
+						;; size=20 bbWeight=4 PerfScore 38.00
 G_M50024_IG09:
        sub      rsi, rdi
        add      rsi, rax
 						;; size=6 bbWeight=0.50 PerfScore 0.25
 G_M50024_IG10:
        cmp      rsi, 8
        jb       SHORT G_M50024_IG15
        align    [0 bytes for IG11]
 						;; size=6 bbWeight=1 PerfScore 1.25
 G_M50024_IG11:
        mov      ecx, dword ptr [rdi]
        mov      edx, dword ptr [rdi+0x04]
        mov      r8d, ecx
        or       r8d, edx
        test     r8d, 0xD1FFAB1E
        je       SHORT G_M50024_IG14
 						;; size=20 bbWeight=4 PerfScore 23.00
 G_M50024_IG12:
        test     ecx, 0xD1FFAB1E
        jne      SHORT G_M50024_IG13
        mov      ecx, edx
        add      rdi, 4
 						;; size=14 bbWeight=0.50 PerfScore 0.88
 G_M50024_IG13:
        and      ecx, 0xD1FFAB1E
        xor      esi, esi
        tzcnt    esi, ecx
        shr      esi, 3
        mov      ecx, esi
        add      rdi, rcx
        jmp      SHORT G_M50024_IG18
 						;; size=22 bbWeight=0.50 PerfScore 2.75
 G_M50024_IG14:
        add      rdi, 8
        add      rsi, -8
        cmp      rsi, 8
        jae      SHORT G_M50024_IG11
 						;; size=14 bbWeight=4 PerfScore 7.00
 G_M50024_IG15:
        test     sil, 4
        je       SHORT G_M50024_IG16
        mov      ecx, dword ptr [rdi]
        test     ecx, 0xD1FFAB1E
        jne      SHORT G_M50024_IG13
        add      rdi, 4
 						;; size=20 bbWeight=0.50 PerfScore 2.38
 G_M50024_IG16:
        test     sil, 2
        je       SHORT G_M50024_IG17
        movzx    rcx, word  ptr [rdi]
        test     ecx, 0xD1FFAB1E
        jne      SHORT G_M50024_IG13
        add      rdi, 2
 						;; size=21 bbWeight=0.50 PerfScore 2.38
 G_M50024_IG17:
        test     sil, 1
        je       SHORT G_M50024_IG18
        lea      rcx, [rdi+0x01]
        cmp      byte  ptr [rdi], 0
        cmovge   rdi, rcx
 						;; size=17 bbWeight=0.50 PerfScore 2.50
 G_M50024_IG18:
        mov      rcx, rdi
        sub      rcx, rax
        mov      rax, rcx
 						;; size=9 bbWeight=1 PerfScore 0.75
 G_M50024_IG19:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=1 PerfScore 2.50
 RWD00  	dq	8080808080808080h, 8080808080808080h
 
 
-; Total bytes of code 371, prolog size 4, PerfScore 202.75, instruction count 104, allocated bytes for code 371 (MethodHash=58923c97) for method System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 382, prolog size 4, PerfScore 207.38, instruction count 106, allocated bytes for code 382 (MethodHash=58923c97) for method System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong (FullOpts)
+9 bytes (+12.68 % of base, 71 -> 80 bytes) - System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this
 ; Assembly listing for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 0 single block inlinees; 1 inlinees without PGO data
+; 0 inlinees with PGO data; 2 single block inlinees; 4 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;* V00 this         [V00    ] (  0,  0   )   byref  ->  zero-ref    this single-def
 ;  V01 arg1         [V01,T00] (  4,  4   )    long  ->  rsi         single-def
 ;* V02 arg2         [V02    ] (  0,  0   )    long  ->  zero-ref    single-def
 ;* V03 arg3         [V03    ] (  0,  0   )     int  ->  zero-ref    single-def
 ;  V04 arg4         [V04,T01] (  4,  3   )   byref  ->   r8         single-def
-;  V05 loc0         [V05,T02] (  3,  2.50)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;  V06 loc1         [V06,T03] (  3,  2.50)  simd32  ->  mm1         <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;  V05 loc0         [V05,T04] (  3,  2.50)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;  V06 loc1         [V06,T05] (  3,  2.50)  simd32  ->  mm1         <System.Runtime.Intrinsics.Vector256`1[ushort]>
 ;# V07 OutArgs      [V07    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V08 tmp1         [V08    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V09 tmp2         [V09    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V10 tmp3         [V10    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;  V08 tmp1         [V08,T03] (  2,  4   )  simd32  ->  mm2         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V09 tmp2         [V09    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V10 tmp3         [V10    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V11 tmp4         [V11,T02] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V12 tmp5         [V12    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V13 tmp6         [V13    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V14 tmp7         [V14    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V15 tmp8         [V15    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V16 tmp9         [V16    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;
-; Lcl frame size = 0
+; Lcl frame size = 8
 
 G_M46395_IG01:
-						;; size=0 bbWeight=1 PerfScore 0.00
+       push     rax
+						;; size=1 bbWeight=1 PerfScore 1.00
 G_M46395_IG02:
        vmovups  ymm0, ymmword ptr [rsi]
        vmovups  ymm1, ymmword ptr [rsi+0x20]
        vpor     ymm2, ymm0, ymm1
        vptest   ymm2, ymmword ptr [reloc @RWD00]
        je       SHORT G_M46395_IG05
 						;; size=24 bbWeight=1 PerfScore 18.33
 G_M46395_IG03:
        vxorps   ymm0, ymm0, ymm0
        vmovups  ymmword ptr [r8], ymm0
        xor      eax, eax
 						;; size=11 bbWeight=0.50 PerfScore 1.29
 G_M46395_IG04:
        vzeroupper 
+       add      rsp, 8
        ret      
-						;; size=4 bbWeight=0.50 PerfScore 1.00
+						;; size=8 bbWeight=0.50 PerfScore 1.12
 G_M46395_IG05:
        vpmovwb  ymm0, ymm0
        vpmovwb  ymm1, ymm1
        vinserti128 ymm0, ymm0, xmm1, 1
        vmovups  ymmword ptr [r8], ymm0
        mov      eax, 1
 						;; size=28 bbWeight=0.50 PerfScore 5.12
 G_M46395_IG06:
        vzeroupper 
+       add      rsp, 8
        ret      
-						;; size=4 bbWeight=0.50 PerfScore 1.00
+						;; size=8 bbWeight=0.50 PerfScore 1.12
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 71, prolog size 0, PerfScore 26.75, instruction count 17, allocated bytes for code 71 (MethodHash=fb454ac4) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this (FullOpts)
+; Total bytes of code 80, prolog size 1, PerfScore 28.00, instruction count 20, allocated bytes for code 80 (MethodHash=fb454ac4) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this (FullOpts)
+9 bytes (+17.65 % of base) - System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this
 ; Assembly listing for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 0 single block inlinees; 2 inlinees without PGO data
+; 0 inlinees with PGO data; 2 single block inlinees; 5 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;* V00 this         [V00    ] (  0,  0   )   byref  ->  zero-ref    this single-def
 ;  V01 arg1         [V01,T00] (  4,  4   )    long  ->  rsi         single-def
 ;* V02 arg2         [V02    ] (  0,  0   )    long  ->  zero-ref    single-def
 ;* V03 arg3         [V03    ] (  0,  0   )     int  ->  zero-ref    single-def
 ;  V04 arg4         [V04,T01] (  4,  3   )   byref  ->   r8         single-def
-;  V05 loc0         [V05,T02] (  3,  2.50)  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V06 loc1         [V06,T03] (  3,  2.50)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V05 loc0         [V05,T04] (  3,  2.50)  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V06 loc1         [V06,T05] (  3,  2.50)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;# V07 OutArgs      [V07    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V08 tmp1         [V08    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V09 tmp2         [V09    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V10 tmp3         [V10    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V11 tmp4         [V11    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;  V08 tmp1         [V08,T03] (  2,  4   )  simd16  ->  mm2         "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V09 tmp2         [V09    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V10 tmp3         [V10    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V11 tmp4         [V11,T02] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V12 tmp5         [V12    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V13 tmp6         [V13    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V14 tmp7         [V14    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V15 tmp8         [V15    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V16 tmp9         [V16    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V17 tmp10        [V17    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;
-; Lcl frame size = 0
+; Lcl frame size = 8
 
 G_M11006_IG01:
-						;; size=0 bbWeight=1 PerfScore 0.00
+       push     rax
+						;; size=1 bbWeight=1 PerfScore 1.00
 G_M11006_IG02:
        vmovups  xmm0, xmmword ptr [rsi]
        vmovups  xmm1, xmmword ptr [rsi+0x10]
        vpor     xmm2, xmm0, xmm1
        vptest   xmm2, xmmword ptr [reloc @RWD00]
        je       SHORT G_M11006_IG05
 						;; size=24 bbWeight=1 PerfScore 14.33
 G_M11006_IG03:
        vxorps   xmm0, xmm0, xmm0
        vmovups  xmmword ptr [r8], xmm0
        xor      eax, eax
 						;; size=11 bbWeight=0.50 PerfScore 1.29
 G_M11006_IG04:
+       add      rsp, 8
        ret      
-						;; size=1 bbWeight=0.50 PerfScore 0.50
+						;; size=5 bbWeight=0.50 PerfScore 0.62
 G_M11006_IG05:
        vpackuswb xmm0, xmm0, xmm1
        vmovups  xmmword ptr [r8], xmm0
        mov      eax, 1
 						;; size=14 bbWeight=0.50 PerfScore 1.62
 G_M11006_IG06:
+       add      rsp, 8
        ret      
-						;; size=1 bbWeight=0.50 PerfScore 0.50
+						;; size=5 bbWeight=0.50 PerfScore 0.62
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 51, prolog size 0, PerfScore 18.25, instruction count 13, allocated bytes for code 51 (MethodHash=0badd501) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this (FullOpts)
+; Total bytes of code 60, prolog size 1, PerfScore 19.50, instruction count 16, allocated bytes for code 60 (MethodHash=0badd501) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this (FullOpts)
8 (2.27 % of base) - System.Text.Ascii:IsValidCore[int](byref,int):ubyte
 ; Assembly listing for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 2 single block inlinees; 7 inlinees without PGO data
+; 0 inlinees with PGO data; 12 single block inlinees; 12 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T02] ( 18, 13   )   byref  ->  rdi        
 ;  V01 arg1         [V01,T03] (  9,  6   )     int  ->  rsi         single-def
 ;  V02 loc0         [V02,T07] (  5,  2.50)   byref  ->  rdx         single-def
 ;* V03 loc1         [V03    ] (  0,  0   )     int  ->  zero-ref   
 ;* V04 loc2         [V04    ] (  0,  0   )    long  ->  zero-ref   
 ;* V05 loc3         [V05,T10] (  0,  0   )    long  ->  zero-ref   
 ;* V06 loc4         [V06    ] (  0,  0   )    long  ->  zero-ref   
 ;  V07 loc5         [V07,T01] (  6, 17   )    long  ->   r8        
 ;* V08 loc6         [V08    ] (  0,  0   )    long  ->  zero-ref   
 ;  V09 loc7         [V09,T04] (  4,  5.50)    long  ->  rcx        
 ;  V10 loc8         [V10,T00] (  5, 20   )   byref  ->  rax        
 ;* V11 loc9         [V11    ] (  0,  0   )    long  ->  zero-ref   
 ;* V12 loc10        [V12    ] (  0,  0   )    long  ->  zero-ref   
 ;* V13 loc11        [V13    ] (  0,  0   )    long  ->  zero-ref   
 ;* V14 loc12        [V14    ] (  0,  0   )   byref  ->  zero-ref   
 ;# V15 OutArgs      [V15    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V16 tmp1         [V16    ] (  0,  0   )     int  ->  zero-ref   
 ;* V17 tmp2         [V17    ] (  0,  0   )     int  ->  zero-ref    ld-addr-op "Inlining Arg"
 ;  V18 tmp3         [V18,T09] (  2,  1   )   ubyte  ->   r8         "Inline return value spill temp"
 ;* V19 tmp4         [V19    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
-;* V20 tmp5         [V20    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V21 tmp6         [V21    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[int]>
-;* V22 tmp7         [V22    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V23 tmp8         [V23    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
-;* V24 tmp9         [V24    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V25 tmp10        [V25,T13] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
-;* V26 tmp11        [V26    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V27 tmp12        [V27,T11] (  2, 16   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
-;* V28 tmp13        [V28    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V29 tmp14        [V29,T14] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
-;  V30 tmp15        [V30,T05] (  5,  5   )     int  ->   r8         "Single return block return value"
-;  V31 tmp16        [V31,T08] (  2,  2   )    long  ->  rax         "Cast away GC"
-;  V32 cse0         [V32,T12] (  7,  7   )  simd32  ->  mm1         multi-def "CSE #03: aggressive"
-;  V33 cse1         [V33,T06] (  6,  3   )    long  ->  rcx         multi-def "CSE #01: aggressive"
+;* V20 tmp5         [V20    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[int]>
+;* V21 tmp6         [V21    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V22 tmp7         [V22    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[int]>
+;* V23 tmp8         [V23    ] (  0,  0   )     int  ->  zero-ref    ld-addr-op "Inline stloc first use temp"
+;* V24 tmp9         [V24    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
+;* V25 tmp10        [V25    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V26 tmp11        [V26    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V27 tmp12        [V27    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V28 tmp13        [V28    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V29 tmp14        [V29    ] (  0,  0   )     int  ->  zero-ref    ld-addr-op "Inline stloc first use temp"
+;* V30 tmp15        [V30    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
+;* V31 tmp16        [V31    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V32 tmp17        [V32,T15] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V33 tmp18        [V33    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V34 tmp19        [V34    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V35 tmp20        [V35    ] (  0,  0   )     int  ->  zero-ref    ld-addr-op "Inline stloc first use temp"
+;* V36 tmp21        [V36    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
+;* V37 tmp22        [V37    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V38 tmp23        [V38,T11] (  2, 16   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V39 tmp24        [V39    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V40 tmp25        [V40    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V41 tmp26        [V41    ] (  0,  0   )     int  ->  zero-ref    ld-addr-op "Inline stloc first use temp"
+;* V42 tmp27        [V42    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
+;* V43 tmp28        [V43    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V44 tmp29        [V44,T16] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V45 tmp30        [V45    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V46 tmp31        [V46    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V47 tmp32        [V47    ] (  0,  0   )     int  ->  zero-ref    ld-addr-op "Inline stloc first use temp"
+;* V48 tmp33        [V48    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
+;* V49 tmp34        [V49    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V50 tmp35        [V50,T05] (  5,  5   )     int  ->   r8         "Single return block return value"
+;  V51 tmp36        [V51,T08] (  2,  2   )    long  ->  rax         "Cast away GC"
+;  V52 cse0         [V52,T12] (  7,  7   )  simd32  ->  mm1         multi-def "CSE #03: aggressive"
+;  V53 cse1         [V53,T06] (  6,  3   )    long  ->  rcx         multi-def "CSE #01: aggressive"
+;  V54 rat0         [V54,T13] (  3,  3   )  simd16  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
+;  V55 rat1         [V55,T14] (  3,  3   )  simd32  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M8346_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M8346_IG02:
        cmp      esi, 4
        jge      SHORT G_M8346_IG04
 						;; size=5 bbWeight=1 PerfScore 1.25
 G_M8346_IG03:
        movsxd   rcx, esi
        cmp      rcx, 2
        jge      G_M8346_IG13
        test     esi, esi
        je       G_M8346_IG16
        jmp      G_M8346_IG18
 						;; size=26 bbWeight=0.50 PerfScore 2.38
 G_M8346_IG04:
        movsxd   rcx, esi
        lea      rdx, bword ptr [rdi+4*rcx]
        cmp      esi, 8
        jg       SHORT G_M8346_IG05
        vmovups  xmm0, xmmword ptr [rdi]
-       vpor     xmm0, xmm0, xmmword ptr [rdx-0x10]
-       vptest   xmm0, xmmword ptr [reloc @RWD00]
+       vmovups  xmm1, xmmword ptr [rdx-0x10]
+       vpternlogd xmm0, xmm1, dword ptr [reloc @RWD00] {1to4}, -88
+       vptest   xmm0, xmm0
        sete     r8b
        movzx    r8, r8b
        jmp      G_M8346_IG14
-       align    [2 bytes for IG07]
-						;; size=45 bbWeight=0.50 PerfScore 8.62
+       align    [0 bytes for IG07]
+						;; size=50 bbWeight=0.50 PerfScore 9.12
 G_M8346_IG05:
        cmp      esi, 16
        jg       SHORT G_M8346_IG06
        vmovups  ymm0, ymmword ptr [rdi]
-       vpor     ymm0, ymm0, ymmword ptr [rdx-0x20]
        vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vpternlogd ymm0, ymm1, ymmword ptr [rdx-0x20], -56
+       vptest   ymm0, ymm0
        sete     r8b
        movzx    r8, r8b
        jmp      G_M8346_IG14
-						;; size=40 bbWeight=0.50 PerfScore 10.75
+						;; size=43 bbWeight=0.50 PerfScore 10.75
 G_M8346_IG06:
        cmp      esi, 32
        jle      SHORT G_M8346_IG12
        vmovups  ymm1, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
        vpternlogd ymm1, ymm0, ymmword ptr [rdi+0x40], -2
        vpor     ymm0, ymm1, ymmword ptr [rdi+0x60]
        vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        jne      SHORT G_M8346_IG08
        mov      rax, rdi
        and      rax, 31
        shr      rax, 2
        mov      r8, rax
        neg      r8
        add      r8, 32
        add      rcx, -32
        cmp      r8, rcx
        jae      SHORT G_M8346_IG11
 						;; size=72 bbWeight=0.50 PerfScore 15.25
 G_M8346_IG07:
        lea      rax, bword ptr [rdi+4*r8]
        vmovups  ymm0, ymmword ptr [rax]
        vmovups  ymm2, ymmword ptr [rax+0x20]
        vpternlogd ymm0, ymm2, ymmword ptr [rax+0x40], -2
        vpor     ymm0, ymm0, ymmword ptr [rax+0x60]
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        je       SHORT G_M8346_IG10
 						;; size=33 bbWeight=4 PerfScore 90.00
 G_M8346_IG08:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M8346_IG09:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M8346_IG10:
        add      r8, 32
        cmp      r8, rcx
        jb       SHORT G_M8346_IG07
 						;; size=9 bbWeight=4 PerfScore 6.00
 G_M8346_IG11:
        lea      rdi, bword ptr [rdi+4*rcx]
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M8346_IG12:
        vmovups  ymm1, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
        vpternlogd ymm1, ymm0, ymmword ptr [rdx-0x40], -2
        vpor     ymm0, ymm1, ymmword ptr [rdx-0x20]
        vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        sete     r8b
        movzx    r8, r8b
        jmp      SHORT G_M8346_IG14
 						;; size=45 bbWeight=0.50 PerfScore 14.12
 G_M8346_IG13:
        mov      r8, qword ptr [rdi]
        or       r8, qword ptr [rdi+4*rcx-0x08]
        mov      rax, 0xD1FFAB1E
        test     r8, rax
        sete     r8b
        movzx    r8, r8b
 						;; size=29 bbWeight=0.50 PerfScore 3.38
 G_M8346_IG14:
        movzx    rax, r8b
 						;; size=4 bbWeight=0.50 PerfScore 0.12
 G_M8346_IG15:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M8346_IG16:
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M8346_IG17:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M8346_IG18:
        cmp      dword ptr [rdi], edi
        mov      rax, 0xD1FFAB1E      ; code for System.ThrowHelper:ThrowNotSupportedException()
        call     [rax]System.ThrowHelper:ThrowNotSupportedException()
        int3     
 						;; size=15 bbWeight=0 PerfScore 0.00
-RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h
-RWD16  	dd	00000000h, 00000000h, 00000000h, 00000000h
-RWD32  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
+RWD00  	dd	FFFFFF80h
+RWD04  	dd	00000000h, 00000000h, 00000000h, 00000000h, 00000000h, 00000000h
+	dd	00000000h
+RWD32  	dq	FFFFFF80FFFFFF80h, FFFFFF80FFFFFF80h, FFFFFF80FFFFFF80h, FFFFFF80FFFFFF80h
 
 
-; Total bytes of code 353, prolog size 4, PerfScore 157.38, instruction count 90, allocated bytes for code 353 (MethodHash=10a8df65) for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
+; Total bytes of code 361, prolog size 4, PerfScore 157.88, instruction count 91, allocated bytes for code 361 (MethodHash=10a8df65) for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
7 (1.77 % of base) - System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong
 ; Assembly listing for method System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 2 single block inlinees; 0 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] ( 32, 34.50)    long  ->  rbx        
 ;  V01 arg1         [V01,T01] ( 17, 10   )    long  ->  rsi        
 ;* V02 loc0         [V02,T08] (  0,  0   )     int  ->  zero-ref   
 ;* V03 loc1         [V03,T09] (  0,  0   )     int  ->  zero-ref   
-;  V04 loc2         [V04,T10] (  9, 11.50)  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V05 loc3         [V05,T11] (  3,  8.50)  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V04 loc2         [V04,T11] (  9, 11.50)  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V05 loc3         [V05,T12] (  3,  8.50)  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;  V06 loc4         [V06,T04] (  4,  2   )     int  ->  r14        
 ;  V07 loc5         [V07,T03] (  8,  4   )    long  ->  r15        
-;  V08 loc6         [V08,T12] (  5,  6   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V09 loc7         [V09,T13] (  3,  1.50)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V08 loc6         [V08,T13] (  5,  6   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V09 loc7         [V09,T14] (  3,  1.50)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;  V10 loc8         [V10,T05] (  3,  1.50)     int  ->  rdi        
 ;  V11 loc9         [V11,T02] (  2,  4.50)    long  ->  rdi        
 ;* V12 loc10        [V12    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;  V13 loc11        [V13,T07] (  2,  1   )    long  ->  rdi        
 ;* V14 loc12        [V14    ] (  0,  0   )     int  ->  zero-ref   
 ;# V15 OutArgs      [V15    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;  V16 cse0         [V16,T06] (  3,  1.50)    long  ->  rdi         "CSE #01: moderate"
+;  V17 rat0         [V17,T10] (  3, 24   )  simd16  ->  mm4         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 8
 
 G_M38868_IG01:
        push     rbp
        push     r15
        push     r14
        push     rbx
        push     rax
        lea      rbp, [rsp+0x20]
        mov      rbx, rdi
 						;; size=15 bbWeight=1 PerfScore 5.75
 G_M38868_IG02:
        test     rsi, rsi
        jne      SHORT G_M38868_IG05
 						;; size=5 bbWeight=1 PerfScore 1.25
 G_M38868_IG03:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M38868_IG04:
        add      rsp, 8
        pop      rbx
        pop      r14
        pop      r15
        pop      rbp
        ret      
 						;; size=11 bbWeight=0.50 PerfScore 1.62
 G_M38868_IG05:
        mov      r15, rbx
        cmp      rsi, 8
        jb       G_M38868_IG10
        vmovups  xmm0, xmmword ptr [reloc @RWD00]
        vmovups  xmm1, xmmword ptr [reloc @RWD16]
        vpaddusw xmm2, xmm1, xmmword ptr [r15]
        vpmovmskb r14d, xmm2
        test     r14d, 0xAAAA
        jne      G_M38868_IG18
        add      rsi, rsi
        cmp      rsi, 32
        jb       SHORT G_M38868_IG08
        lea      rbx, [r15+0x10]
        and      rbx, -16
        add      rsi, r15
        sub      rsi, rbx
        cmp      rsi, 32
        jb       SHORT G_M38868_IG07
        lea      rdi, [rbx+rsi-0x20]
        align    [0 bytes for IG06]
 						;; size=85 bbWeight=0.50 PerfScore 9.38
 G_M38868_IG06:
        vmovdqa  xmm2, xmmword ptr [rbx]
        vmovdqa  xmm3, xmmword ptr [rbx+0x10]
-       vpor     xmm4, xmm2, xmm3
-       vptest   xmm4, xmm0
+       vmovaps  xmm4, xmm2
+       vpternlogd xmm4, xmm3, xmm0, -88
+       vptest   xmm4, xmm4
        jne      G_M38868_IG16
        add      rbx, 32
        cmp      rbx, rdi
        jbe      SHORT G_M38868_IG06
-						;; size=33 bbWeight=4 PerfScore 55.33
+						;; size=40 bbWeight=4 PerfScore 57.00
 G_M38868_IG07:
        test     sil, 16
        je       SHORT G_M38868_IG09
        vmovdqa  xmm2, xmmword ptr [rbx]
        vptest   xmm2, xmm0
        jne      G_M38868_IG17
 						;; size=21 bbWeight=0.50 PerfScore 4.62
 G_M38868_IG08:
        add      rbx, 16
 						;; size=4 bbWeight=0.50 PerfScore 0.12
 G_M38868_IG09:
        movzx    rdi, sil
        test     dil, 15
        je       G_M38868_IG19
        and      rsi, 15
        add      rsi, rbx
        mov      rbx, rsi
        sub      rbx, 16
        vmovups  xmm2, xmmword ptr [rbx]
        vptest   xmm2, xmm0
        jne      G_M38868_IG17
        add      rbx, 16
        jmp      G_M38868_IG19
 						;; size=52 bbWeight=0.50 PerfScore 6.38
 G_M38868_IG10:
        test     sil, 4
        je       SHORT G_M38868_IG12
        mov      rdi, qword ptr [r15]
        mov      rax, 0xD1FFAB1E
        and      rdi, rax
        je       SHORT G_M38868_IG11
        xor      ebx, ebx
        tzcnt    rbx, rdi
        shr      rbx, 3
        and      rbx, -2
        add      rbx, r15
        jmp      SHORT G_M38868_IG19
 						;; size=44 bbWeight=0.50 PerfScore 5.00
 G_M38868_IG11:
        lea      rbx, [r15+0x08]
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M38868_IG12:
        test     sil, 2
        je       SHORT G_M38868_IG13
        mov      edi, dword ptr [rbx]
        test     edi, 0xD1FFAB1E
        jne      SHORT G_M38868_IG14
        add      rbx, 4
 						;; size=20 bbWeight=0.50 PerfScore 2.38
 G_M38868_IG13:
        test     sil, 1
        je       SHORT G_M38868_IG19
        cmp      word  ptr [rbx], 255
        ja       SHORT G_M38868_IG19
        jmp      SHORT G_M38868_IG15
 						;; size=15 bbWeight=0.50 PerfScore 3.62
 G_M38868_IG14:
        mov      rax, 0xD1FFAB1E      ; code for System.Text.Latin1Utility:FirstCharInUInt32IsLatin1(uint):ubyte
        call     [rax]System.Text.Latin1Utility:FirstCharInUInt32IsLatin1(uint):ubyte
        test     eax, eax
        je       SHORT G_M38868_IG19
 						;; size=16 bbWeight=0.50 PerfScore 2.25
 G_M38868_IG15:
        add      rbx, 2
        jmp      SHORT G_M38868_IG19
 						;; size=6 bbWeight=0.50 PerfScore 1.12
 G_M38868_IG16:
        vptest   xmm2, xmm0
        jne      SHORT G_M38868_IG17
        add      rbx, 16
        vmovaps  xmm2, xmm3
 						;; size=15 bbWeight=0.50 PerfScore 2.25
 G_M38868_IG17:
        vpaddusw xmm0, xmm2, xmm1
        vpmovmskb r14d, xmm0
 						;; size=8 bbWeight=0.50 PerfScore 1.17
 G_M38868_IG18:
        and      r14d, 0xAAAA
        xor      eax, eax
        tzcnt    eax, r14d
        lea      rbx, [rbx+rax-0x01]
 						;; size=19 bbWeight=0.50 PerfScore 1.75
 G_M38868_IG19:
        mov      rax, rbx
        sub      rax, r15
        shr      rax, 1
 						;; size=9 bbWeight=0.50 PerfScore 0.50
 G_M38868_IG20:
        add      rsp, 8
        pop      rbx
        pop      r14
        pop      r15
        pop      rbp
        ret      
 						;; size=11 bbWeight=0.50 PerfScore 1.62
 RWD00  	dq	FF00FF00FF00FF00h, FF00FF00FF00FF00h
 RWD16  	dq	7F007F007F007F00h, 7F007F007F007F00h
 
 
-; Total bytes of code 395, prolog size 15, PerfScore 106.50, instruction count 111, allocated bytes for code 395 (MethodHash=0f68682b) for method System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 402, prolog size 15, PerfScore 108.17, instruction count 112, allocated bytes for code 402 (MethodHash=0f68682b) for method System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong (FullOpts)
7 (4.70 % of base) - System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong
 ; Assembly listing for method System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T01] (  6, 11.50)    long  ->  rdi         single-def
 ;  V01 arg1         [V01,T02] (  8,  8.50)    long  ->  rsi         single-def
 ;  V02 arg2         [V02,T03] (  3,  2.50)    long  ->  rdx         single-def
 ;* V03 loc0         [V03,T05] (  0,  0   )     int  ->  zero-ref   
 ;* V04 loc1         [V04    ] (  0,  0   )    long  ->  zero-ref   
-;  V05 loc2         [V05,T08] (  5,  7   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[short]>
+;  V05 loc2         [V05,T09] (  5,  7   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V06 loc3         [V06    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V07 loc4         [V07,T06] ( 14, 18.50)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[short]>
+;  V07 loc4         [V07,T07] ( 14, 18.50)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V08 loc5         [V08    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V09 loc6         [V09,T00] ( 12, 27   )    long  ->  rax        
 ;  V10 loc7         [V10,T04] (  2,  4.50)    long  ->  rdx        
-;  V11 loc8         [V11,T07] (  3, 12   )  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[short]>
+;  V11 loc8         [V11,T08] (  3, 12   )  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V12 loc9         [V12    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;# V13 OutArgs      [V13    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+;  V14 rat0         [V14,T06] (  3, 24   )  simd16  ->  mm3         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M23879_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M23879_IG02:
        vmovups  xmm0, xmmword ptr [reloc @RWD00]
        vmovups  xmm1, xmmword ptr [rdi]
        vptest   xmm1, xmm0
        je       SHORT G_M23879_IG05
 						;; size=19 bbWeight=1 PerfScore 11.00
 G_M23879_IG03:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M23879_IG04:
        pop      rbp
        ret      
 						;; size=2 bbWeight=0.50 PerfScore 0.75
 G_M23879_IG05:
        vpackuswb xmm1, xmm1, xmm1
        vmovq    qword ptr [rsi], xmm1
        mov      eax, 8
        test     sil, 8
        jne      SHORT G_M23879_IG06
        vmovups  xmm1, xmmword ptr [rdi+0x10]
        vptest   xmm1, xmm0
        jne      SHORT G_M23879_IG08
        vpackuswb xmm1, xmm1, xmm1
        vmovq    qword ptr [rsi+0x08], xmm1
 						;; size=40 bbWeight=0.50 PerfScore 7.75
 G_M23879_IG06:
        mov      rax, rsi
        and      rax, 15
        neg      rax
        add      rax, 16
        add      rdx, -16
        align    [0 bytes for IG07]
 						;; size=18 bbWeight=0.50 PerfScore 0.62
 G_M23879_IG07:
        vmovups  xmm1, xmmword ptr [rdi+2*rax]
        vmovups  xmm2, xmmword ptr [rdi+2*rax+0x10]
-       vpor     xmm3, xmm1, xmm2
-       vptest   xmm3, xmm0
+       vmovaps  xmm3, xmm1
+       vpternlogd xmm3, xmm2, xmm0, -88
+       vptest   xmm3, xmm3
        jne      SHORT G_M23879_IG09
        vpackuswb xmm1, xmm1, xmm2
        vmovdqa  xmmword ptr [rsi+rax], xmm1
        add      rax, 16
        cmp      rax, rdx
        jbe      SHORT G_M23879_IG07
-						;; size=40 bbWeight=4 PerfScore 67.33
+						;; size=47 bbWeight=4 PerfScore 69.00
 G_M23879_IG08:
        pop      rbp
        ret      
 						;; size=2 bbWeight=0.50 PerfScore 0.75
 G_M23879_IG09:
        vptest   xmm1, xmm0
        jne      SHORT G_M23879_IG08
        vpackuswb xmm0, xmm1, xmm1
        vmovq    qword ptr [rsi+rax], xmm0
        add      rax, 8
        jmp      SHORT G_M23879_IG08
 						;; size=22 bbWeight=0.50 PerfScore 4.62
 RWD00  	dq	FF00FF00FF00FF00h, FF00FF00FF00FF00h
 
 
-; Total bytes of code 149, prolog size 4, PerfScore 94.21, instruction count 43, allocated bytes for code 149 (MethodHash=f65ba2b8) for method System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 156, prolog size 4, PerfScore 95.88, instruction count 44, allocated bytes for code 156 (MethodHash=f65ba2b8) for method System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong (FullOpts)
6 (12.24 % of base) - System.Text.Ascii+PlainLoader`1[int]:EqualAndAscii256(byref,byref):ubyte
 ; Assembly listing for method System.Text.Ascii+PlainLoader`1[int]:EqualAndAscii256(byref,byref):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; partially interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 0 single block inlinees; 1 inlinees without PGO data
+; 0 inlinees with PGO data; 2 single block inlinees; 2 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] (  3,  3   )   byref  ->  rdi         single-def
 ;  V01 arg1         [V01,T01] (  3,  3   )   byref  ->  rsi         single-def
-;  V02 loc0         [V02,T02] (  3,  2.50)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[int]>
+;  V02 loc0         [V02,T03] (  3,  2.50)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[int]>
 ;* V03 loc1         [V03    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[int]>
 ;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V05 tmp1         [V05    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V05 tmp1         [V05    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V06 tmp2         [V06    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V07 tmp3         [V07    ] (  0,  0   )     int  ->  zero-ref    ld-addr-op "Inline stloc first use temp"
+;* V08 tmp4         [V08    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
+;* V09 tmp5         [V09    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V10 rat0         [V10,T02] (  3,  3   )  simd32  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M61666_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M61666_IG02:
        vmovups  ymm0, ymmword ptr [rdi]
        vpcmpd   k1, ymm0, ymmword ptr [rsi], 4
        kortestb k1, k1
        jne      SHORT G_M61666_IG04
 						;; size=17 bbWeight=1 PerfScore 13.00
 G_M61666_IG03:
-       vptest   ymm0, ymmword ptr [reloc @RWD00]
+       vpandd   ymm0, ymm0, dword ptr [reloc @RWD00] {1to8}
+       vptest   ymm0, ymm0
        je       SHORT G_M61666_IG06
-						;; size=11 bbWeight=0.50 PerfScore 4.00
+						;; size=17 bbWeight=0.50 PerfScore 4.00
 G_M61666_IG04:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M61666_IG05:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M61666_IG06:
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M61666_IG07:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
-RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
+RWD00  	dd	FFFFFF80h
 
 
-; Total bytes of code 49, prolog size 4, PerfScore 21.00, instruction count 16, allocated bytes for code 51 (MethodHash=a7ac0f1d) for method System.Text.Ascii+PlainLoader`1[int]:EqualAndAscii256(byref,byref):ubyte (FullOpts)
+; Total bytes of code 55, prolog size 4, PerfScore 21.00, instruction count 17, allocated bytes for code 57 (MethodHash=a7ac0f1d) for method System.Text.Ascii+PlainLoader`1[int]:EqualAndAscii256(byref,byref):ubyte (FullOpts)
6 (12.24 % of base) - System.Text.Ascii+PlainLoader`1[long]:EqualAndAscii256(byref,byref):ubyte
 ; Assembly listing for method System.Text.Ascii+PlainLoader`1[long]:EqualAndAscii256(byref,byref):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; partially interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 0 single block inlinees; 1 inlinees without PGO data
+; 0 inlinees with PGO data; 2 single block inlinees; 4 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] (  3,  3   )   byref  ->  rdi         single-def
 ;  V01 arg1         [V01,T01] (  3,  3   )   byref  ->  rsi         single-def
-;  V02 loc0         [V02,T02] (  3,  2.50)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[long]>
+;  V02 loc0         [V02,T03] (  3,  2.50)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[long]>
 ;* V03 loc1         [V03    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[long]>
 ;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V05 tmp1         [V05    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V05 tmp1         [V05    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V06 tmp2         [V06    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[long]>
+;* V07 tmp3         [V07    ] (  0,  0   )    long  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V08 tmp4         [V08    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V09 tmp5         [V09    ] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;* V10 tmp6         [V10    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V11 tmp7         [V11    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
+;* V12 tmp8         [V12    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V13 rat0         [V13,T02] (  3,  3   )  simd32  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M4539_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M4539_IG02:
        vmovups  ymm0, ymmword ptr [rdi]
        vpcmpq   k1, ymm0, ymmword ptr [rsi], 4
        kortestb k1, k1
        jne      SHORT G_M4539_IG04
 						;; size=17 bbWeight=1 PerfScore 13.00
 G_M4539_IG03:
-       vptest   ymm0, ymmword ptr [reloc @RWD00]
+       vpandq   ymm0, ymm0, qword ptr [reloc @RWD00] {1to4}
+       vptest   ymm0, ymm0
        je       SHORT G_M4539_IG06
-						;; size=11 bbWeight=0.50 PerfScore 4.00
+						;; size=17 bbWeight=0.50 PerfScore 4.00
 G_M4539_IG04:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M4539_IG05:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M4539_IG06:
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M4539_IG07:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
-RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
+RWD00  	dq	FFFFFFFFFFFFFF80h
 
 
-; Total bytes of code 49, prolog size 4, PerfScore 21.00, instruction count 16, allocated bytes for code 51 (MethodHash=0669ee44) for method System.Text.Ascii+PlainLoader`1[long]:EqualAndAscii256(byref,byref):ubyte (FullOpts)
+; Total bytes of code 55, prolog size 4, PerfScore 21.00, instruction count 17, allocated bytes for code 57 (MethodHash=0669ee44) for method System.Text.Ascii+PlainLoader`1[long]:EqualAndAscii256(byref,byref):ubyte (FullOpts)
2 (2.15 % of base) - System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector512(ulong,ulong,int,byref):ubyte:this
 ; Assembly listing for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector512(ulong,ulong,int,byref):ubyte:this (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data
+; 0 inlinees with PGO data; 4 single block inlinees; 4 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;* V00 this         [V00    ] (  0,  0   )   byref  ->  zero-ref    this single-def
 ;  V01 arg1         [V01,T00] (  4,  4   )    long  ->  rsi         single-def
 ;* V02 arg2         [V02    ] (  0,  0   )    long  ->  zero-ref    single-def
 ;* V03 arg3         [V03    ] (  0,  0   )     int  ->  zero-ref    single-def
 ;  V04 arg4         [V04,T01] (  4,  3   )   byref  ->   r8         single-def
-;  V05 loc0         [V05,T03] (  3,  2.50)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ushort]>
-;  V06 loc1         [V06,T04] (  3,  2.50)  simd64  ->  mm1         <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V05 loc0         [V05,T04] (  3,  2.50)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V06 loc1         [V06,T05] (  3,  2.50)  simd64  ->  mm1         <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;# V07 OutArgs      [V07    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V08 tmp1         [V08    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
-;* V09 tmp2         [V09    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
-;  V10 rat0         [V10,T02] (  3,  6   )  simd64  ->  mm2         "ReplaceWithLclVar is creating a new local variable"
+;  V09 tmp2         [V09,T03] (  2,  4   )  simd64  ->  mm2         "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V10 tmp3         [V10    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V11 tmp4         [V11    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V12 tmp5         [V12,T02] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V13 tmp6         [V13    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V14 tmp7         [V14    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V15 tmp8         [V15    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V16 tmp9         [V16    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V17 tmp10        [V17    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;
-; Lcl frame size = 0
+; Lcl frame size = 8
 
 G_M39699_IG01:
-						;; size=0 bbWeight=1 PerfScore 0.00
+       push     rax
+						;; size=1 bbWeight=1 PerfScore 1.00
 G_M39699_IG02:
        vmovups  zmm0, zmmword ptr [rsi]
        vmovups  zmm1, zmmword ptr [rsi+0x40]
-       vmovaps  zmm2, zmm0
-       vpternlogd zmm2, zmm1, zmmword ptr [reloc @RWD00], -88
-       vptestmw k1, zmm2, zmm2
+       vpord    zmm2, zmm0, zmm1
+       vptestmw k1, zmm2, zmmword ptr [reloc @RWD00]
        kortestd k1, k1
        je       SHORT G_M39699_IG05
-						;; size=43 bbWeight=1 PerfScore 15.25
+						;; size=36 bbWeight=1 PerfScore 15.33
 G_M39699_IG03:
        vxorps   ymm0, ymm0, ymm0
        vmovups  zmmword ptr [r8], zmm0
        xor      eax, eax
 						;; size=12 bbWeight=0.50 PerfScore 1.29
 G_M39699_IG04:
        vzeroupper 
+       add      rsp, 8
        ret      
-						;; size=4 bbWeight=0.50 PerfScore 1.00
+						;; size=8 bbWeight=0.50 PerfScore 1.12
 G_M39699_IG05:
        vpmovwb  zmm0, zmm0
        vpmovwb  zmm1, zmm1
        vinserti64x4 zmm0, zmm0, ymm1, 1
        vmovups  zmmword ptr [r8], zmm0
        mov      eax, 1
 						;; size=30 bbWeight=0.50 PerfScore 5.12
 G_M39699_IG06:
        vzeroupper 
+       add      rsp, 8
        ret      
-						;; size=4 bbWeight=0.50 PerfScore 1.00
+						;; size=8 bbWeight=0.50 PerfScore 1.12
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 93, prolog size 0, PerfScore 23.67, instruction count 19, allocated bytes for code 96 (MethodHash=65b664ec) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector512(ulong,ulong,int,byref):ubyte:this (FullOpts)
+; Total bytes of code 95, prolog size 1, PerfScore 25.00, instruction count 21, allocated bytes for code 98 (MethodHash=65b664ec) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector512(ulong,ulong,int,byref):ubyte:this (FullOpts)
2 (0.57 % of base) - System.Text.Ascii:IsValidCore[long](byref,int):ubyte
 ; Assembly listing for method System.Text.Ascii:IsValidCore[long](byref,int):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 2 single block inlinees; 7 inlinees without PGO data
+; 0 inlinees with PGO data; 12 single block inlinees; 22 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T02] ( 18, 13   )   byref  ->  rdi        
-;  V01 arg1         [V01,T03] (  9,  6   )     int  ->  rsi         single-def
-;  V02 loc0         [V02,T07] (  5,  2.50)   byref  ->  rdx         single-def
+;  V01 arg1         [V01,T03] ( 11,  6.50)     int  ->  rsi         single-def
+;  V02 loc0         [V02,T06] (  5,  2.50)   byref  ->  rcx         single-def
 ;* V03 loc1         [V03    ] (  0,  0   )     int  ->  zero-ref   
 ;* V04 loc2         [V04    ] (  0,  0   )    long  ->  zero-ref   
-;* V05 loc3         [V05,T10] (  0,  0   )    long  ->  zero-ref   
+;* V05 loc3         [V05    ] (  0,  0   )    long  ->  zero-ref   
 ;* V06 loc4         [V06    ] (  0,  0   )    long  ->  zero-ref   
-;  V07 loc5         [V07,T01] (  6, 17   )    long  ->   r8        
+;  V07 loc5         [V07,T01] (  6, 17   )    long  ->  rdx        
 ;* V08 loc6         [V08    ] (  0,  0   )    long  ->  zero-ref   
-;  V09 loc7         [V09,T04] (  4,  5.50)    long  ->  rcx        
+;  V09 loc7         [V09,T04] (  4,  5.50)    long  ->  rsi        
 ;  V10 loc8         [V10,T00] (  5, 20   )   byref  ->  rax        
 ;* V11 loc9         [V11    ] (  0,  0   )    long  ->  zero-ref   
 ;* V12 loc10        [V12    ] (  0,  0   )    long  ->  zero-ref   
 ;* V13 loc11        [V13    ] (  0,  0   )    long  ->  zero-ref   
 ;* V14 loc12        [V14    ] (  0,  0   )   byref  ->  zero-ref   
 ;# V15 OutArgs      [V15    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V16 tmp1         [V16    ] (  0,  0   )     int  ->  zero-ref   
 ;* V17 tmp2         [V17    ] (  0,  0   )    long  ->  zero-ref    ld-addr-op "Inlining Arg"
-;  V18 tmp3         [V18,T09] (  2,  1   )   ubyte  ->   r8         "Inline return value spill temp"
+;  V18 tmp3         [V18,T08] (  2,  1   )   ubyte  ->  rdx         "Inline return value spill temp"
 ;* V19 tmp4         [V19    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
-;* V20 tmp5         [V20    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V21 tmp6         [V21    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[long]>
-;* V22 tmp7         [V22    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V23 tmp8         [V23    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
+;  V20 tmp5         [V20,T12] (  2,  2   )  simd16  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[long]>
+;* V21 tmp6         [V21    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V22 tmp7         [V22    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[long]>
+;* V23 tmp8         [V23    ] (  0,  0   )    long  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
 ;* V24 tmp9         [V24    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V25 tmp10        [V25,T13] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
+;* V25 tmp10        [V25    ] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
 ;* V26 tmp11        [V26    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V27 tmp12        [V27,T11] (  2, 16   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
+;* V27 tmp12        [V27    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
 ;* V28 tmp13        [V28    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V29 tmp14        [V29,T14] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
-;  V30 tmp15        [V30,T05] (  5,  5   )     int  ->   r8         "Single return block return value"
-;  V31 tmp16        [V31,T08] (  2,  2   )    long  ->  rax         "Cast away GC"
-;  V32 cse0         [V32,T12] (  7,  7   )  simd32  ->  mm1         multi-def "CSE #03: aggressive"
-;  V33 cse1         [V33,T06] (  6,  3   )    long  ->  rcx         multi-def "CSE #01: aggressive"
+;  V29 tmp14        [V29,T13] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
+;* V30 tmp15        [V30    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V31 tmp16        [V31    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[long]>
+;* V32 tmp17        [V32    ] (  0,  0   )    long  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V33 tmp18        [V33    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V34 tmp19        [V34    ] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;* V35 tmp20        [V35    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V36 tmp21        [V36    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
+;* V37 tmp22        [V37    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V38 tmp23        [V38,T14] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
+;* V39 tmp24        [V39    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V40 tmp25        [V40    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[long]>
+;* V41 tmp26        [V41    ] (  0,  0   )    long  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V42 tmp27        [V42    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V43 tmp28        [V43    ] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;* V44 tmp29        [V44    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V45 tmp30        [V45    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
+;* V46 tmp31        [V46    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V47 tmp32        [V47,T09] (  2, 16   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
+;* V48 tmp33        [V48    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V49 tmp34        [V49    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[long]>
+;* V50 tmp35        [V50    ] (  0,  0   )    long  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V51 tmp36        [V51    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V52 tmp37        [V52    ] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;* V53 tmp38        [V53    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V54 tmp39        [V54    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
+;* V55 tmp40        [V55    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V56 tmp41        [V56,T15] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
+;* V57 tmp42        [V57    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V58 tmp43        [V58    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[long]>
+;* V59 tmp44        [V59    ] (  0,  0   )    long  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V60 tmp45        [V60    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V61 tmp46        [V61    ] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;* V62 tmp47        [V62    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V63 tmp48        [V63    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
+;* V64 tmp49        [V64    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V65 tmp50        [V65,T05] (  5,  5   )     int  ->  rdx         "Single return block return value"
+;  V66 tmp51        [V66,T07] (  2,  2   )    long  ->  rax         "Cast away GC"
+;  V67 cse0         [V67,T10] (  7,  7   )  simd32  ->  mm1         multi-def "CSE #04: aggressive"
+;  V68 rat0         [V68,T11] (  3,  3   )  simd16  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M33379_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M33379_IG02:
        cmp      esi, 2
        jge      SHORT G_M33379_IG04
 						;; size=5 bbWeight=1 PerfScore 1.25
 G_M33379_IG03:
-       movsxd   rcx, esi
-       test     rcx, rcx
+       movsxd   rax, esi
+       test     rax, rax
        jg       G_M33379_IG13
-       test     esi, esi
-       je       G_M33379_IG16
-       jmp      G_M33379_IG18
-						;; size=25 bbWeight=0.50 PerfScore 2.38
+       jmp      G_M33379_IG16
+						;; size=17 bbWeight=0.50 PerfScore 1.75
 G_M33379_IG04:
-       movsxd   rcx, esi
-       lea      rdx, bword ptr [rdi+8*rcx]
+       movsxd   rax, esi
+       lea      rcx, bword ptr [rdi+8*rax]
        cmp      esi, 4
        jg       SHORT G_M33379_IG05
        vmovups  xmm0, xmmword ptr [rdi]
-       vpor     xmm0, xmm0, xmmword ptr [rdx-0x10]
-       vptest   xmm0, xmmword ptr [reloc @RWD00]
-       sete     r8b
-       movzx    r8, r8b
+       vpor     xmm0, xmm0, xmmword ptr [rcx-0x10]
+       vpandq   xmm0, xmm0, qword ptr [reloc @RWD00] {1to2}
+       vptest   xmm0, xmm0
+       sete     dl
+       movzx    rdx, dl
        jmp      G_M33379_IG14
-       align    [3 bytes for IG07]
-						;; size=46 bbWeight=0.50 PerfScore 8.62
+       align    [6 bytes for IG07]
+						;; size=53 bbWeight=0.50 PerfScore 8.62
 G_M33379_IG05:
        cmp      esi, 8
        jg       SHORT G_M33379_IG06
        vmovups  ymm0, ymmword ptr [rdi]
-       vpor     ymm0, ymm0, ymmword ptr [rdx-0x20]
+       vpor     ymm0, ymm0, ymmword ptr [rcx-0x20]
        vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
-       sete     r8b
-       movzx    r8, r8b
+       vptest   ymm1, ymm0
+       sete     dl
+       movzx    rdx, dl
        jmp      G_M33379_IG14
-						;; size=40 bbWeight=0.50 PerfScore 10.75
+						;; size=38 bbWeight=0.50 PerfScore 10.75
 G_M33379_IG06:
        cmp      esi, 16
        jle      SHORT G_M33379_IG12
        vmovups  ymm1, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
        vpternlogq ymm1, ymm0, ymmword ptr [rdi+0x40], -2
        vpor     ymm0, ymm1, ymmword ptr [rdi+0x60]
        vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        jne      SHORT G_M33379_IG08
        mov      rax, rdi
        and      rax, 31
        shr      rax, 3
-       mov      r8, rax
-       neg      r8
-       add      r8, 16
-       add      rcx, -16
-       cmp      r8, rcx
+       mov      rdx, rax
+       neg      rdx
+       add      rdx, 16
+       movsxd   rsi, esi
+       add      rsi, -16
+       cmp      rdx, rsi
        jae      SHORT G_M33379_IG11
-						;; size=72 bbWeight=0.50 PerfScore 15.25
+						;; size=75 bbWeight=0.50 PerfScore 15.38
 G_M33379_IG07:
-       lea      rax, bword ptr [rdi+8*r8]
+       lea      rax, bword ptr [rdi+8*rdx]
        vmovups  ymm0, ymmword ptr [rax]
        vmovups  ymm2, ymmword ptr [rax+0x20]
        vpternlogq ymm0, ymm2, ymmword ptr [rax+0x40], -2
        vpor     ymm0, ymm0, ymmword ptr [rax+0x60]
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        je       SHORT G_M33379_IG10
 						;; size=33 bbWeight=4 PerfScore 90.00
 G_M33379_IG08:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M33379_IG09:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M33379_IG10:
-       add      r8, 16
-       cmp      r8, rcx
+       add      rdx, 16
+       cmp      rdx, rsi
        jb       SHORT G_M33379_IG07
 						;; size=9 bbWeight=4 PerfScore 6.00
 G_M33379_IG11:
-       lea      rdi, bword ptr [rdi+8*rcx]
+       lea      rdi, bword ptr [rdi+8*rsi]
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M33379_IG12:
        vmovups  ymm1, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
-       vpternlogq ymm1, ymm0, ymmword ptr [rdx-0x40], -2
-       vpor     ymm0, ymm1, ymmword ptr [rdx-0x20]
+       vpternlogq ymm1, ymm0, ymmword ptr [rcx-0x40], -2
+       vpor     ymm0, ymm1, ymmword ptr [rcx-0x20]
        vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
-       sete     r8b
-       movzx    r8, r8b
+       vptest   ymm1, ymm0
+       sete     dl
+       movzx    rdx, dl
        jmp      SHORT G_M33379_IG14
-						;; size=45 bbWeight=0.50 PerfScore 14.12
+						;; size=43 bbWeight=0.50 PerfScore 14.12
 G_M33379_IG13:
-       mov      r8, qword ptr [rdi]
-       or       r8, qword ptr [rdi+8*rcx-0x08]
+       mov      rdx, qword ptr [rdi]
+       movsxd   rax, esi
+       or       rdx, qword ptr [rdi+8*rax-0x08]
        mov      rax, 0xD1FFAB1E
-       test     r8, rax
-       sete     r8b
-       movzx    r8, r8b
-						;; size=29 bbWeight=0.50 PerfScore 3.38
+       test     rdx, rax
+       sete     dl
+       movzx    rdx, dl
+						;; size=30 bbWeight=0.50 PerfScore 3.50
 G_M33379_IG14:
-       movzx    rax, r8b
-						;; size=4 bbWeight=0.50 PerfScore 0.12
+       movzx    rax, dl
+						;; size=3 bbWeight=0.50 PerfScore 0.12
 G_M33379_IG15:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M33379_IG16:
-       mov      eax, 1
-						;; size=5 bbWeight=0.50 PerfScore 0.12
-G_M33379_IG17:
-       vzeroupper 
-       pop      rbp
-       ret      
-						;; size=5 bbWeight=0.50 PerfScore 1.25
-G_M33379_IG18:
+       test     esi, esi
+       je       SHORT G_M33379_IG17
        cmp      dword ptr [rdi], edi
        mov      rax, 0xD1FFAB1E      ; code for System.ThrowHelper:ThrowNotSupportedException()
        call     [rax]System.ThrowHelper:ThrowNotSupportedException()
        int3     
-						;; size=15 bbWeight=0 PerfScore 0.00
-RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h
-RWD16  	dd	00000000h, 00000000h, 00000000h, 00000000h
-RWD32  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
+						;; size=19 bbWeight=0 PerfScore 0.00
+G_M33379_IG17:
+       mov      eax, 1
+						;; size=5 bbWeight=0 PerfScore 0.00
+G_M33379_IG18:
+       vzeroupper 
+       pop      rbp
+       ret      
+						;; size=5 bbWeight=0 PerfScore 0.00
+RWD00  	dq	FFFFFFFFFFFFFF80h
+RWD08  	dd	00000000h, 00000000h, 00000000h, 00000000h, 00000000h, 00000000h
+RWD32  	dq	FFFFFFFFFFFFFF80h, FFFFFFFFFFFFFF80h, FFFFFFFFFFFFFF80h, FFFFFFFFFFFFFF80h
 
 
-; Total bytes of code 353, prolog size 4, PerfScore 157.38, instruction count 90, allocated bytes for code 353 (MethodHash=a1267d9c) for method System.Text.Ascii:IsValidCore[long](byref,int):ubyte (FullOpts)
+; Total bytes of code 355, prolog size 4, PerfScore 155.62, instruction count 93, allocated bytes for code 355 (MethodHash=a1267d9c) for method System.Text.Ascii:IsValidCore[long](byref,int):ubyte (FullOpts)
1 (1.75 % of base) - System.Text.Ascii+PlainLoader`1[ubyte]:EqualAndAscii512(byref,byref):ubyte
 ; Assembly listing for method System.Text.Ascii+PlainLoader`1[ubyte]:EqualAndAscii512(byref,byref):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; partially interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 0 single block inlinees; 1 inlinees without PGO data
+; 0 inlinees with PGO data; 2 single block inlinees; 4 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] (  3,  3   )   byref  ->  rdi         single-def
 ;  V01 arg1         [V01,T01] (  3,  3   )   byref  ->  rsi         single-def
-;  V02 loc0         [V02,T02] (  3,  2.50)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;  V02 loc0         [V02,T03] (  3,  2.50)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V03 loc1         [V03    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V05 tmp1         [V05    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V05 tmp1         [V05    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V06 tmp2         [V06    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V07 tmp3         [V07,T02] (  0,  0   )   ubyte  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V08 tmp4         [V08    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V09 tmp5         [V09    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V10 tmp6         [V10    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline stloc first use temp"
+;* V11 tmp7         [V11    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
+;* V12 tmp8         [V12    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;
 ; Lcl frame size = 0
 
 G_M30537_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M30537_IG02:
        vmovups  zmm0, zmmword ptr [rdi]
        vpcmpub  k1, zmm0, zmmword ptr [rsi], 4
        kortestq k1, k1
        jne      SHORT G_M30537_IG04
 						;; size=20 bbWeight=1 PerfScore 12.00
 G_M30537_IG03:
-       vpmovb2m k1, zmm0
-       kmovq    rax, k1
-       test     rax, rax
+       vptestmb k1, zmm0, zmmword ptr [reloc @RWD00]
+       kortestq k1, k1
        je       SHORT G_M30537_IG06
-						;; size=16 bbWeight=0.50 PerfScore 2.62
+						;; size=17 bbWeight=0.50 PerfScore 3.50
 G_M30537_IG04:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M30537_IG05:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M30537_IG06:
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M30537_IG07:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
+RWD00  	dq	8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h
+
 
-; Total bytes of code 57, prolog size 4, PerfScore 18.62, instruction count 18, allocated bytes for code 63 (MethodHash=8c8d88b6) for method System.Text.Ascii+PlainLoader`1[ubyte]:EqualAndAscii512(byref,byref):ubyte (FullOpts)
+; Total bytes of code 58, prolog size 4, PerfScore 19.50, instruction count 17, allocated bytes for code 64 (MethodHash=8c8d88b6) for method System.Text.Ascii+PlainLoader`1[ubyte]:EqualAndAscii512(byref,byref):ubyte (FullOpts)
1 (0.99 % of base) - System.Text.Ascii+WideningLoader:EqualAndAscii512(byref,byref):ubyte
 ; Assembly listing for method System.Text.Ascii+WideningLoader:EqualAndAscii512(byref,byref):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; partially interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 2 single block inlinees; 1 inlinees without PGO data
+; 0 inlinees with PGO data; 4 single block inlinees; 4 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T01] (  3,  3   )   byref  ->  rdi         single-def
 ;  V01 arg1         [V01,T00] (  4,  3   )   byref  ->  rsi         single-def
-;  V02 loc0         [V02,T02] (  4,  3   )  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;  V02 loc0         [V02,T03] (  4,  3   )  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V03 loc1         [V03    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V04 loc2         [V04    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V05 loc3         [V05    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V06 loc4         [V06    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;# V07 OutArgs      [V07    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V08 tmp1         [V08    ] (  0,  0   )  struct (128) zero-ref    "dup spill" <System.ValueTuple`2[System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort]]>
-;* V09 tmp2         [V09    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V10 tmp3         [V10    ] (  0,  0   )  struct (128) zero-ref    ld-addr-op "NewObj constructor temp" <System.ValueTuple`2[System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort]]>
-;  V11 tmp4         [V11,T04] (  2,  2   )  simd64  ->  mm1         "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
-;  V12 tmp5         [V12,T05] (  2,  2   )  simd64  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
-;* V13 tmp6         [V13    ] (  0,  0   )  simd64  ->  zero-ref    "field V08.Item1 (fldOffset=0x0)" P-INDEP
-;* V14 tmp7         [V14    ] (  0,  0   )  simd64  ->  zero-ref    "field V08.Item2 (fldOffset=0x40)" P-INDEP
-;* V15 tmp8         [V15    ] (  0,  0   )  simd64  ->  zero-ref    "field V10.Item1 (fldOffset=0x0)" P-INDEP
-;* V16 tmp9         [V16    ] (  0,  0   )  simd64  ->  zero-ref    "field V10.Item2 (fldOffset=0x40)" P-INDEP
-;  V17 rat0         [V17,T03] (  3,  3   )  simd64  ->  mm1         "ReplaceWithLclVar is creating a new local variable"
+;* V09 tmp2         [V09    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V10 tmp3         [V10    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V11 tmp4         [V11,T02] (  0,  0   )   ubyte  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V12 tmp5         [V12    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V13 tmp6         [V13    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V14 tmp7         [V14    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline stloc first use temp"
+;* V15 tmp8         [V15    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
+;* V16 tmp9         [V16    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V17 tmp10        [V17    ] (  0,  0   )  struct (128) zero-ref    ld-addr-op "NewObj constructor temp" <System.ValueTuple`2[System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort]]>
+;  V18 tmp11        [V18,T05] (  2,  2   )  simd64  ->  mm1         "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V19 tmp12        [V19,T06] (  2,  2   )  simd64  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V20 tmp13        [V20    ] (  0,  0   )  simd64  ->  zero-ref    "field V08.Item1 (fldOffset=0x0)" P-INDEP
+;* V21 tmp14        [V21    ] (  0,  0   )  simd64  ->  zero-ref    "field V08.Item2 (fldOffset=0x40)" P-INDEP
+;* V22 tmp15        [V22    ] (  0,  0   )  simd64  ->  zero-ref    "field V17.Item1 (fldOffset=0x0)" P-INDEP
+;* V23 tmp16        [V23    ] (  0,  0   )  simd64  ->  zero-ref    "field V17.Item2 (fldOffset=0x40)" P-INDEP
+;  V24 rat0         [V24,T04] (  3,  3   )  simd64  ->  mm1         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 ; Diff of the JIT listing for EqualAndAscii512: '-' lines are the removed side, '+' lines the added side.
 ; As shown below, the code loads 64 bytes from [rdi] and returns eax = 1 only when no loaded byte has
 ; bit 7 set AND the bytes, zero-extended to 16-bit words, equal the 128 bytes at [rsi]; otherwise eax = 0.
 G_M19370_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M19370_IG02:
 ; zmm0 = the 64 input bytes at [rdi].
        vmovups  zmm0, zmmword ptr [rdi]
 ; Removed side: vpmovb2m copies bit 7 of every byte into k1, which is moved to rax and tested for zero.
-       vpmovb2m k1, zmm0
-       kmovq    rax, k1
-       test     rax, rax
 ; Added side: each byte is ANDed with 0x80 (constant RWD00) to form k1; kortestq sets ZF when k1 is all zero.
+       vptestmb k1, zmm0, zmmword ptr [reloc @RWD00]
+       kortestq k1, k1
 ; ZF set (no byte has bit 7 set) -> continue with the comparison in IG05; otherwise fall into IG03.
        je       SHORT G_M19370_IG05
-						;; size=22 bbWeight=1 PerfScore 9.25
+						;; size=23 bbWeight=1 PerfScore 11.00
 G_M19370_IG03:
 ; Failure result: eax = 0. Reached by fall-through from IG02 or by the jne in IG05.
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M19370_IG04:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M19370_IG05:
 ; zmm1 = low 32 input bytes zero-extended to 32 words; zmm0 = high 32 input bytes zero-extended likewise.
        vmovaps  zmm1, zmm0
        vpmovzxbw zmm1, zmm1
        vextracti64x4 ymm0, zmm0, 1
        vpmovzxbw zmm0, zmm0
 ; zmm2 = first 64 bytes at [rsi]; zmm0 ^= next 64 bytes at [rsi+0x40] (nonzero bits mark differences).
        vmovups  zmm2, zmmword ptr [rsi]
        vpxord   zmm0, zmm0, zmmword ptr [rsi+0x40]
 ; imm8 -34 is 0xDE, i.e. zmm1 = zmm0 | (zmm1 ^ zmm2): merges the differences of both halves into zmm1.
        vpternlogd zmm1, zmm0, zmm2, -34
 ; k1 gets one bit per nonzero word of zmm1; kortestd clears ZF if any bit is set, so jne means "mismatch".
        vptestmw k1, zmm1, zmm1
        kortestd k1, k1
        jne      SHORT G_M19370_IG03
 ; No differing word: success result eax = 1.
        mov      eax, 1
 						;; size=63 bbWeight=0.50 PerfScore 8.50
 G_M19370_IG06:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 ; Added side only: 64-byte constant with 0x80 in every byte, the memory operand of vptestmb in IG02.
+RWD00  	dq	8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h
 
-; Total bytes of code 101, prolog size 4, PerfScore 21.62, instruction count 25, allocated bytes for code 107 (MethodHash=9e74b455) for method System.Text.Ascii+WideningLoader:EqualAndAscii512(byref,byref):ubyte (FullOpts)
+
+; Total bytes of code 102, prolog size 4, PerfScore 23.38, instruction count 24, allocated bytes for code 108 (MethodHash=9e74b455) for method System.Text.Ascii+WideningLoader:EqualAndAscii512(byref,byref):ubyte (FullOpts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment