Skip to content

Instantly share code, notes, and snippets.

@MihuBot
Created July 10, 2024 23:43
Show Gist options
  • Save MihuBot/6be1c4e16f21c742c3995e6ba8009dec to your computer and use it in GitHub Desktop.
Save MihuBot/6be1c4e16f21c742c3995e6ba8009dec to your computer and use it in GitHub Desktop.

Top method regressions

30 (3.53 % of base) - System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong
 ; Assembly listing for method System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 21 single block inlinees; 25 inlinees without PGO data
+; 0 inlinees with PGO data; 25 single block inlinees; 21 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T06] (  9,  9   )    long  ->  rdi         single-def
 ;  V01 arg1         [V01,T04] ( 15, 12   )    long  ->  rsi         single-def
 ;  V02 arg2         [V02,T11] (  9,  6   )    long  ->  rdx         single-def
 ;  V03 loc0         [V03,T00] ( 23, 30   )    long  ->  rax        
 ;  V04 loc1         [V04,T12] ( 13,  6.50)     int  ->   r8        
 ;* V05 loc2         [V05    ] (  0,  0   )     int  ->  zero-ref   
 ;  V06 loc3         [V06,T05] (  7, 14   )    long  ->  rcx        
 ;  V07 loc4         [V07,T22] (  5,  2.50)    long  ->  rdx        
 ;  V08 loc5         [V08,T16] (  2,  4.50)    long  ->   r8        
 ;# V09 OutArgs      [V09    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;  V10 tmp1         [V10,T23] (  3,  1.50)    long  ->  rax         "Inline return value spill temp"
 ;  V11 tmp2         [V11,T07] (  5,  9.50)   byref  ->  rcx         single-def "Inline stloc first use temp"
-;  V12 tmp3         [V12,T30] ( 14, 17.50)  simd64  ->  mm0         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V12 tmp3         [V12,T32] ( 14, 17.50)  simd64  ->  mm0         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;  V13 tmp4         [V13,T13] (  5,  6   )   byref  ->  rax         single-def "Inline stloc first use temp"
 ;* V14 tmp5         [V14    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
 ;  V15 tmp6         [V15,T01] ( 12, 27   )    long  ->   r8         "Inline stloc first use temp"
 ;  V16 tmp7         [V16,T17] (  2,  4.50)    long  ->   r9         "Inline stloc first use temp"
-;  V17 tmp8         [V17,T36] (  3, 12   )  simd64  ->  mm3         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V17 tmp8         [V17,T38] (  3, 12   )  simd64  ->  mm3         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V18 tmp9         [V18    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
-;  V19 tmp10        [V19,T33] (  2, 16   )  simd64  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
+;  V19 tmp10        [V19,T35] (  2, 16   )  simd64  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
 ;* V20 tmp11        [V20    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
 ;* V21 tmp12        [V21    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V22 tmp13        [V22    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V23 tmp14        [V23    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V24 tmp15        [V24    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V25 tmp16        [V25    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V26 tmp17        [V26    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V27 tmp18        [V27    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V28 tmp19        [V28    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;  V29 tmp20        [V29,T24] (  3,  1.50)    long  ->  rax         "Inline return value spill temp"
 ;  V30 tmp21        [V30,T08] (  5,  9.50)   byref  ->  rcx         single-def "Inline stloc first use temp"
-;  V31 tmp22        [V31,T31] ( 14, 17.50)  simd32  ->  mm0         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;  V31 tmp22        [V31,T33] ( 14, 17.50)  simd32  ->  mm0         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
 ;  V32 tmp23        [V32,T14] (  5,  6   )   byref  ->  rax         single-def "Inline stloc first use temp"
 ;* V33 tmp24        [V33    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;  V34 tmp25        [V34,T02] ( 12, 27   )    long  ->   r8         "Inline stloc first use temp"
 ;  V35 tmp26        [V35,T18] (  2,  4.50)    long  ->   r9         "Inline stloc first use temp"
-;  V36 tmp27        [V36,T37] (  3, 12   )  simd32  ->  mm2         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;  V36 tmp27        [V36,T39] (  3, 12   )  simd32  ->  mm2         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
 ;* V37 tmp28        [V37    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
-;  V38 tmp29        [V38,T34] (  2, 16   )  simd32  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
+;  V38 tmp29        [V38,T36] (  2, 16   )  simd32  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
 ;* V39 tmp30        [V39    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
-;* V40 tmp31        [V40    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V41 tmp32        [V41    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V40 tmp31        [V40    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;* V41 tmp32        [V41    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V42 tmp33        [V42    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V43 tmp34        [V43    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V44 tmp35        [V44    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V45 tmp36        [V45    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V44 tmp35        [V44    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V45 tmp36        [V45    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V46 tmp37        [V46    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V47 tmp38        [V47    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V48 tmp39        [V48    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V49 tmp40        [V49    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V50 tmp41        [V50    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V51 tmp42        [V51    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V52 tmp43        [V52    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V53 tmp44        [V53    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V54 tmp45        [V54    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V55 tmp46        [V55    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;  V56 tmp47        [V56,T25] (  3,  1.50)    long  ->  rax         "Inline return value spill temp"
-;* V57 tmp48        [V57,T27] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
-;* V58 tmp49        [V58    ] (  0,  0   )    long  ->  zero-ref    "Inline stloc first use temp"
-;  V59 tmp50        [V59,T09] (  5,  9.50)   byref  ->  rcx         single-def "Inline stloc first use temp"
-;  V60 tmp51        [V60,T32] ( 14, 17.50)  simd16  ->  mm0         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V61 tmp52        [V61,T15] (  5,  6   )   byref  ->   r8         single-def "Inline stloc first use temp"
-;* V62 tmp53        [V62    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
-;  V63 tmp54        [V63,T03] ( 11, 26.50)    long  ->  rax         "Inline stloc first use temp"
-;  V64 tmp55        [V64,T19] (  2,  4.50)    long  ->   r9         "Inline stloc first use temp"
-;  V65 tmp56        [V65,T38] (  3, 12   )  simd16  ->  mm2         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V66 tmp57        [V66    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
-;  V67 tmp58        [V67,T35] (  2, 16   )  simd16  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
-;* V68 tmp59        [V68    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
-;* V69 tmp60        [V69    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V70 tmp61        [V70    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V71 tmp62        [V71    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V72 tmp63        [V72    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V73 tmp64        [V73    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V74 tmp65        [V74    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V75 tmp66        [V75    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V76 tmp67        [V76    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V77 tmp68        [V77    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V78 tmp69        [V78    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V79 tmp70        [V79    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V80 tmp71        [V80    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V81 tmp72        [V81    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
-;  V82 tmp73        [V82,T28] (  3, 24   )  simd16  ->  mm1         "dup spill"
-;* V83 tmp74        [V83    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[uint]>
-;* V84 tmp75        [V84    ] (  0,  0   )   byref  ->  zero-ref    "Inlining Arg"
-;  V85 tmp76        [V85,T20] (  3,  3   )   byref  ->  rcx         single-def "Inlining Arg"
-;  V86 tmp77        [V86,T21] (  3,  3   )   byref  ->  rdx         "Inlining Arg"
-;* V87 tmp78        [V87,T26] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V88 cse0         [V88,T10] (  3,  8.50)    long  ->  r10         "CSE #05: conservative"
-;  V89 cse1         [V89,T39] (  5,  6   )  simd64  ->  mm1         "CSE #01: conservative"
-;  V90 cse2         [V90,T40] (  5,  6   )  simd32  ->  mm1         "CSE #03: conservative"
-;  V91 cse3         [V91,T41] (  5,  6   )  simd16  ->  mm1         "CSE #04: conservative"
-;  V92 cse4         [V92,T42] (  5,  6   )  simd64  ->  mm2         "CSE #02: conservative"
-;  V93 rat0         [V93,T29] (  3, 24   )  simd64  ->  mm4         "ReplaceWithLclVar is creating a new local variable"
+;  V48 tmp39        [V48,T25] (  3,  1.50)    long  ->  rax         "Inline return value spill temp"
+;* V49 tmp40        [V49,T27] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;* V50 tmp41        [V50    ] (  0,  0   )    long  ->  zero-ref    "Inline stloc first use temp"
+;  V51 tmp42        [V51,T09] (  5,  9.50)   byref  ->  rcx         single-def "Inline stloc first use temp"
+;  V52 tmp43        [V52,T34] ( 14, 17.50)  simd16  ->  mm0         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V53 tmp44        [V53,T15] (  5,  6   )   byref  ->   r8         single-def "Inline stloc first use temp"
+;* V54 tmp45        [V54    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;  V55 tmp46        [V55,T03] ( 11, 26.50)    long  ->  rax         "Inline stloc first use temp"
+;  V56 tmp47        [V56,T19] (  2,  4.50)    long  ->   r9         "Inline stloc first use temp"
+;  V57 tmp48        [V57,T40] (  3, 12   )  simd16  ->  mm2         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V58 tmp49        [V58    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;  V59 tmp50        [V59,T37] (  2, 16   )  simd16  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
+;* V60 tmp51        [V60    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V61 tmp52        [V61    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V62 tmp53        [V62    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V63 tmp54        [V63    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V64 tmp55        [V64    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V65 tmp56        [V65    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V66 tmp57        [V66    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V67 tmp58        [V67    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V68 tmp59        [V68    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V69 tmp60        [V69    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
+;  V70 tmp61        [V70,T28] (  3, 24   )  simd16  ->  mm1         "dup spill"
+;* V71 tmp62        [V71    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[uint]>
+;* V72 tmp63        [V72    ] (  0,  0   )   byref  ->  zero-ref    "Inlining Arg"
+;  V73 tmp64        [V73,T20] (  3,  3   )   byref  ->  rcx         single-def "Inlining Arg"
+;  V74 tmp65        [V74,T21] (  3,  3   )   byref  ->  rdx         "Inlining Arg"
+;* V75 tmp66        [V75,T26] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V76 cse0         [V76,T10] (  3,  8.50)    long  ->  r10         "CSE #05: conservative"
+;  V77 cse1         [V77,T41] (  5,  6   )  simd64  ->  mm1         "CSE #01: conservative"
+;  V78 cse2         [V78,T42] (  5,  6   )  simd32  ->  mm1         "CSE #03: conservative"
+;  V79 cse3         [V79,T43] (  5,  6   )  simd16  ->  mm1         "CSE #04: conservative"
+;  V80 cse4         [V80,T44] (  5,  6   )  simd64  ->  mm2         "CSE #02: conservative"
+;  V81 rat0         [V81,T29] (  3, 24   )  simd16  ->  mm3         "ReplaceWithLclVar is creating a new local variable"
+;  V82 rat1         [V82,T30] (  3, 24   )  simd32  ->  mm3         "ReplaceWithLclVar is creating a new local variable"
+;  V83 rat2         [V83,T31] (  3, 24   )  simd64  ->  mm4         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M6063_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M6063_IG02:
        xor      eax, eax
        cmp      rdx, 32
        jb       G_M6063_IG21
 						;; size=12 bbWeight=1 PerfScore 1.50
 G_M6063_IG03:
        mov      rcx, qword ptr [rdi]
        mov      r8, 0xD1FFAB1E
        test     rcx, r8
        jne      G_M6063_IG27
        cmp      rdx, 128
        jb       SHORT G_M6063_IG04
        mov      rcx, rdi
        vmovups  zmm0, zmmword ptr [rcx]
        vmovups  zmm1, zmmword ptr [reloc @RWD00]
        vptestmw k1, zmm1, zmm0
        kortestd k1, k1
 		  ;; NOP compensation instructions of 3 bytes.
        je       G_M6063_IG17
        xor      eax, eax
        jmp      G_M6063_IG21
        align    [3 bytes for IG08]
 						;; size=80 bbWeight=0.50 PerfScore 9.62
 G_M6063_IG04:
        cmp      rdx, 64
        jb       SHORT G_M6063_IG05
        mov      rcx, rdi
        vmovups  ymm0, ymmword ptr [rcx]
        vmovups  ymm1, ymmword ptr [reloc @RWD00]
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        je       G_M6063_IG11
        xor      eax, eax
        jmp      G_M6063_IG15
 						;; size=39 bbWeight=0.50 PerfScore 9.38
 G_M6063_IG05:
        mov      rcx, rdi
        vmovups  xmm0, xmmword ptr [rcx]
        vmovups  xmm1, xmmword ptr [reloc @RWD00]
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        je       SHORT G_M6063_IG06
        xor      eax, eax
        jmp      SHORT G_M6063_IG09
 						;; size=26 bbWeight=0.50 PerfScore 6.75
 G_M6063_IG06:
        mov      r8, rsi
        vpackuswb xmm0, xmm0, xmm0
        vmovsd   qword ptr [r8], xmm0
        mov      eax, 8
        test     sil, 8
        jne      SHORT G_M6063_IG07
        vmovups  xmm0, xmmword ptr [rcx+0x10]
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        jne      SHORT G_M6063_IG09
        vpackuswb xmm0, xmm0, xmm0
        vmovsd   qword ptr [r8+0x08], xmm0
 						;; size=45 bbWeight=0.50 PerfScore 7.88
 G_M6063_IG07:
        mov      rax, rsi
        and      rax, 15
        neg      rax
        add      rax, 16
        lea      r9, [rdx-0x10]
 						;; size=18 bbWeight=0.50 PerfScore 0.75
 G_M6063_IG08:
        vmovups  xmm0, xmmword ptr [rcx+2*rax]
        lea      r10, [rax+0x08]
        vmovups  xmm2, xmmword ptr [rcx+2*r10]
-       vpor     xmm3, xmm0, xmm2
-       vptest   xmm3, xmm1
+       vmovaps  xmm3, xmm0
+       vpternlogd xmm3, xmm2, xmm1, -88
+       vptest   xmm3, xmm3
        jne      SHORT G_M6063_IG10
        vpackuswb xmm0, xmm0, xmm2
        vmovups  xmmword ptr [r8+rax], xmm0
        add      rax, 16
        cmp      rax, r9
        jbe      SHORT G_M6063_IG08
-						;; size=45 bbWeight=4 PerfScore 69.33
+						;; size=52 bbWeight=4 PerfScore 71.00
 G_M6063_IG09:
        jmp      G_M6063_IG21
-       align    [0 bytes for IG13]
-						;; size=5 bbWeight=0.50 PerfScore 1.00
+       align    [7 bytes for IG13]
+						;; size=12 bbWeight=0.50 PerfScore 1.00
 G_M6063_IG10:
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        jne      SHORT G_M6063_IG09
        vpackuswb xmm0, xmm0, xmm0
        vmovsd   qword ptr [r8+rax], xmm0
        mov      rax, r10
        jmp      SHORT G_M6063_IG09
 						;; size=22 bbWeight=0.50 PerfScore 4.62
 G_M6063_IG11:
        mov      rax, rsi
        vpackuswb ymm0, ymm0, ymm0
        vpermq   ymm0, ymm0, -40
        vmovups  xmmword ptr [rax], xmm0
        mov      r8d, 16
        test     sil, 16
        jne      SHORT G_M6063_IG12
        vmovups  ymm0, ymmword ptr [rcx+0x20]
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        jne      SHORT G_M6063_IG14
        vpackuswb ymm0, ymm0, ymm0
        vpermq   ymm0, ymm0, -40
        vmovups  xmmword ptr [rax+0x10], xmm0
 						;; size=56 bbWeight=0.50 PerfScore 11.38
 G_M6063_IG12:
        mov      r8, rsi
        and      r8, 31
        neg      r8
        add      r8, 32
        lea      r9, [rdx-0x20]
 						;; size=18 bbWeight=0.50 PerfScore 0.75
 G_M6063_IG13:
        vmovups  ymm0, ymmword ptr [rcx+2*r8]
        vmovups  ymm2, ymmword ptr [rcx+2*r8+0x20]
-       vpor     ymm3, ymm0, ymm2
-       vptest   ymm3, ymm1
+       vmovaps  ymm3, ymm0
+       vpternlogd ymm3, ymm2, ymm1, -88
+       vptest   ymm3, ymm3
        jne      SHORT G_M6063_IG16
        vpackuswb ymm0, ymm0, ymm2
        vpermq   ymm0, ymm0, -40
        vmovups  ymmword ptr [rax+r8], ymm0
        add      r8, 32
        cmp      r8, r9
        jbe      SHORT G_M6063_IG13
-						;; size=49 bbWeight=4 PerfScore 91.33
+						;; size=56 bbWeight=4 PerfScore 93.00
 G_M6063_IG14:
        mov      rax, r8
 						;; size=3 bbWeight=0.50 PerfScore 0.12
 G_M6063_IG15:
        jmp      G_M6063_IG21
        align    [0 bytes for IG19]
 						;; size=5 bbWeight=0.50 PerfScore 1.00
 G_M6063_IG16:
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        jne      SHORT G_M6063_IG14
        vpackuswb ymm0, ymm0, ymm0
        vpermq   ymm0, ymm0, -40
        vmovups  xmmword ptr [rax+r8], xmm0
        add      r8, 16
        jmp      SHORT G_M6063_IG14
 						;; size=29 bbWeight=0.50 PerfScore 6.62
 G_M6063_IG17:
        mov      rax, rsi
        vpackuswb zmm0, zmm0, zmm0
        vmovups  zmm2, zmmword ptr [reloc @RWD64]
        vpermq   zmm0, zmm2, zmm0
        vmovups  ymmword ptr [rax], ymm0
        mov      r8d, 32
        test     sil, 32
        jne      SHORT G_M6063_IG18
        vmovups  zmm0, zmmword ptr [rcx+0x40]
        vptestmw k1, zmm1, zmm0
        kortestd k1, k1
+		  ;; NOP compensation instructions of 3 bytes.
        jne      SHORT G_M6063_IG20
        vpackuswb zmm0, zmm0, zmm0
        vpermq   zmm0, zmm2, zmm0
        vmovups  ymmword ptr [rax+0x20], ymm0
-						;; size=78 bbWeight=0.50 PerfScore 11.88
+						;; size=81 bbWeight=0.50 PerfScore 11.88
 G_M6063_IG18:
        mov      r8, rsi
        and      r8, 63
        neg      r8
        add      r8, 64
        lea      r9, [rdx-0x40]
 						;; size=18 bbWeight=0.50 PerfScore 0.75
 G_M6063_IG19:
        vmovups  zmm0, zmmword ptr [rcx+2*r8]
        vmovups  zmm3, zmmword ptr [rcx+2*r8+0x40]
        vmovaps  zmm4, zmm0
        vpternlogd zmm4, zmm3, zmm1, -88
        vptestmw k1, zmm4, zmm4
        kortestd k1, k1
+		  ;; NOP compensation instructions of 3 bytes.
        jne      G_M6063_IG26
        vpackuswb zmm0, zmm0, zmm3
        vpermq   zmm0, zmm2, zmm0
        vmovups  zmmword ptr [rax+r8], zmm0
        add      r8, 64
        cmp      r8, r9
        jbe      SHORT G_M6063_IG19
-						;; size=73 bbWeight=4 PerfScore 81.00
+						;; size=76 bbWeight=4 PerfScore 81.00
 G_M6063_IG20:
        mov      rax, r8
 						;; size=3 bbWeight=0.50 PerfScore 0.12
 G_M6063_IG21:
        sub      rdx, rax
        cmp      rdx, 4
        jb       SHORT G_M6063_IG23
        lea      r8, [rax+rdx-0x04]
-       align    [0 bytes for IG22]
-						;; size=14 bbWeight=0.50 PerfScore 1.25
+       align    [3 bytes for IG22]
+						;; size=17 bbWeight=0.50 PerfScore 1.38
 G_M6063_IG22:
        mov      rcx, qword ptr [rdi+2*rax]
        mov      r9, 0xD1FFAB1E
        test     rcx, r9
        jne      G_M6063_IG27
        vmovd    xmm1, rcx
        vpackuswb xmm2, xmm1, xmm1
        vmovd    dword ptr [rsi+rax], xmm2
        add      rax, 4
        cmp      rax, r8
        jbe      SHORT G_M6063_IG22
 						;; size=46 bbWeight=4 PerfScore 40.00
 G_M6063_IG23:
        test     dl, 2
        je       SHORT G_M6063_IG24
        mov      r8d, dword ptr [rdi+2*rax]
        test     r8d, 0xD1FFAB1E
        jne      G_M6063_IG28
        lea      rcx, [rsi+rax]
        mov      byte  ptr [rcx], r8b
        shr      r8d, 16
        mov      byte  ptr [rcx+0x01], r8b
        add      rax, 2
 						;; size=41 bbWeight=0.50 PerfScore 3.88
 G_M6063_IG24:
        test     dl, 1
        je       SHORT G_M6063_IG29
        movzx    r8, word  ptr [rdi+2*rax]
        cmp      r8d, 127
        ja       SHORT G_M6063_IG29
 						;; size=16 bbWeight=0.50 PerfScore 2.25
 G_M6063_IG25:
        mov      byte  ptr [rsi+rax], r8b
        inc      rax
        jmp      SHORT G_M6063_IG29
 						;; size=9 bbWeight=0.50 PerfScore 1.62
 G_M6063_IG26:
        vptestmw k1, zmm1, zmm0
        kortestd k1, k1
        jne      G_M6063_IG20
        vpackuswb zmm0, zmm0, zmm0
        vpermq   zmm0, zmm2, zmm0
        vmovups  ymmword ptr [rax+r8], ymm0
        add      r8, 32
        jmp      G_M6063_IG20
 						;; size=44 bbWeight=0.50 PerfScore 6.12
 G_M6063_IG27:
        mov      r8d, ecx
        test     r8d, 0xD1FFAB1E
        jne      SHORT G_M6063_IG28
        lea      rdx, [rsi+rax]
        mov      byte  ptr [rdx], r8b
        shr      r8d, 16
        mov      byte  ptr [rdx+0x01], r8b
        shr      rcx, 32
        mov      r8d, ecx
        add      rax, 2
 						;; size=38 bbWeight=0.50 PerfScore 2.75
 G_M6063_IG28:
        test     r8d, 0xFF80
        je       SHORT G_M6063_IG25
 						;; size=9 bbWeight=0.50 PerfScore 0.62
 G_M6063_IG29:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=1 PerfScore 2.50
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 RWD64  	dq	0000000000000000h, 0000000000000002h, 0000000000000004h, 0000000000000006h, 0000000000000001h, 0000000000000003h, 0000000000000005h, 0000000000000007h
 
 
-; Total bytes of code 850, prolog size 4, PerfScore 378.04, instruction count 200, allocated bytes for code 859 (MethodHash=53fae850) for method System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 880, prolog size 4, PerfScore 381.50, instruction count 202, allocated bytes for code 883 (MethodHash=53fae850) for method System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong (FullOpts)
14 (4.02 % of base) - System.Text.Ascii:IsValidCore[short](byref,int):ubyte
 ; Assembly listing for method System.Text.Ascii:IsValidCore[short](byref,int):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 2 single block inlinees; 6 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T01] ( 18, 17   )   byref  ->  rdi        
 ;  V01 arg1         [V01,T04] ( 11,  7   )     int  ->  rsi         single-def
 ;  V02 loc0         [V02,T08] (  5,  2.50)   byref  ->  rcx         single-def
 ;* V03 loc1         [V03    ] (  0,  0   )     int  ->  zero-ref   
 ;* V04 loc2         [V04    ] (  0,  0   )    long  ->  zero-ref   
 ;  V05 loc3         [V05,T03] (  5, 16.50)    long  ->  rax        
 ;* V06 loc4         [V06    ] (  0,  0   )    long  ->  zero-ref   
 ;  V07 loc5         [V07,T02] (  6, 17   )    long  ->  rdx        
 ;* V08 loc6         [V08    ] (  0,  0   )    long  ->  zero-ref   
 ;  V09 loc7         [V09,T05] (  4,  5.50)    long  ->  rsi        
 ;  V10 loc8         [V10,T00] (  5, 20   )   byref  ->  rax        
 ;* V11 loc9         [V11    ] (  0,  0   )    long  ->  zero-ref   
 ;* V12 loc10        [V12    ] (  0,  0   )    long  ->  zero-ref   
 ;* V13 loc11        [V13    ] (  0,  0   )    long  ->  zero-ref   
 ;* V14 loc12        [V14    ] (  0,  0   )   byref  ->  zero-ref   
 ;# V15 OutArgs      [V15    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V16 tmp1         [V16    ] (  0,  0   )     int  ->  zero-ref   
 ;  V17 tmp2         [V17,T10] (  2,  1   )   ubyte  ->  rdx         "Inline return value spill temp"
 ;* V18 tmp3         [V18    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
 ;* V19 tmp4         [V19    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V20 tmp5         [V20    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V21 tmp6         [V21    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V22 tmp7         [V22    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
 ;* V23 tmp8         [V23    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V24 tmp9         [V24,T13] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
+;  V24 tmp9         [V24,T15] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
 ;* V25 tmp10        [V25    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;  V26 tmp11        [V26,T11] (  2, 16   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
 ;* V27 tmp12        [V27    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V28 tmp13        [V28,T14] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
+;  V28 tmp13        [V28,T16] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V29 tmp14        [V29,T06] (  5,  5   )     int  ->  rdx         "Single return block return value"
 ;  V30 tmp15        [V30,T09] (  2,  2   )    long  ->  rax         "Cast away GC"
-;  V31 cse0         [V31,T12] (  7,  7   )  simd32  ->  mm1         multi-def "CSE #04: aggressive"
+;  V31 cse0         [V31,T12] (  7,  7   )  simd32  ->  mm2         multi-def "CSE #04: aggressive"
 ;  V32 cse1         [V32,T07] (  3,  5   )    long  ->  rsi         "CSE #01: aggressive"
+;  V33 rat0         [V33,T13] (  3,  3   )  simd16  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
+;  V34 rat1         [V34,T14] (  3,  3   )  simd32  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M12635_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M12635_IG02:
        cmp      esi, 8
        jge      SHORT G_M12635_IG07
 						;; size=5 bbWeight=1 PerfScore 1.25
 G_M12635_IG03:
        movsxd   rax, esi
        cmp      rax, 4
        jge      G_M12635_IG16
        xor      eax, eax
        mov      esi, esi
        test     rsi, rsi
        je       SHORT G_M12635_IG05
        align    [1 bytes for IG04]
 						;; size=23 bbWeight=0.50 PerfScore 1.75
 G_M12635_IG04:
        cmp      word  ptr [rdi+2*rax], 127
        ja       G_M12635_IG11
        inc      rax
        cmp      rax, rsi
        jb       SHORT G_M12635_IG04
 						;; size=19 bbWeight=4 PerfScore 22.00
 G_M12635_IG05:
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M12635_IG06:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M12635_IG07:
        movsxd   rax, esi
        lea      rcx, bword ptr [rdi+2*rax]
        cmp      esi, 16
        jg       SHORT G_M12635_IG08
        vmovups  xmm0, xmmword ptr [rdi]
-       vpor     xmm0, xmm0, xmmword ptr [rcx-0x10]
-       vptest   xmm0, xmmword ptr [reloc @RWD00]
+       vmovups  xmm1, xmmword ptr [rcx-0x10]
+       vpternlogd xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+       vptest   xmm0, xmm0
        sete     dl
        movzx    rdx, dl
        jmp      G_M12635_IG17
        align    [0 bytes for IG10]
-						;; size=41 bbWeight=0.50 PerfScore 8.62
+						;; size=48 bbWeight=0.50 PerfScore 9.12
 G_M12635_IG08:
        cmp      esi, 32
        jg       SHORT G_M12635_IG09
        vmovups  ymm0, ymmword ptr [rdi]
-       vpor     ymm0, ymm0, ymmword ptr [rcx-0x20]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vmovups  ymm1, ymmword ptr [rcx-0x20]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vpternlogd ymm0, ymm1, ymm2, -88
+       vptest   ymm0, ymm0
        sete     dl
        movzx    rdx, dl
        jmp      G_M12635_IG17
-						;; size=38 bbWeight=0.50 PerfScore 10.75
+						;; size=45 bbWeight=0.50 PerfScore 12.00
 G_M12635_IG09:
        cmp      esi, 64
        jle      SHORT G_M12635_IG15
-       vmovups  ymm1, ymmword ptr [rdi]
+       vmovups  ymm2, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
-       vpternlogd ymm1, ymm0, ymmword ptr [rdi+0x40], -2
-       vpor     ymm0, ymm1, ymmword ptr [rdi+0x60]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vpternlogd ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+       vpor     ymm0, ymm2, ymmword ptr [rdi+0x60]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vptest   ymm2, ymm0
        jne      SHORT G_M12635_IG11
        mov      rax, rdi
        and      rax, 31
        shr      rax, 1
        mov      rdx, rax
        neg      rdx
        add      rdx, 64
        movsxd   rsi, esi
        add      rsi, -64
        cmp      rdx, rsi
        jae      SHORT G_M12635_IG14
 						;; size=74 bbWeight=0.50 PerfScore 15.38
 G_M12635_IG10:
        lea      rax, bword ptr [rdi+2*rdx]
        vmovups  ymm0, ymmword ptr [rax]
-       vmovups  ymm2, ymmword ptr [rax+0x20]
-       vpternlogd ymm0, ymm2, ymmword ptr [rax+0x40], -2
+       vmovups  ymm1, ymmword ptr [rax+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rax+0x40], -2
        vpor     ymm0, ymm0, ymmword ptr [rax+0x60]
-       vptest   ymm0, ymm1
+       vptest   ymm2, ymm0
        je       SHORT G_M12635_IG13
 						;; size=33 bbWeight=4 PerfScore 90.00
 G_M12635_IG11:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M12635_IG12:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M12635_IG13:
        add      rdx, 64
        cmp      rdx, rsi
        jb       SHORT G_M12635_IG10
 						;; size=9 bbWeight=4 PerfScore 6.00
 G_M12635_IG14:
        lea      rdi, bword ptr [rdi+2*rsi]
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M12635_IG15:
-       vmovups  ymm1, ymmword ptr [rdi]
+       vmovups  ymm2, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
-       vpternlogd ymm1, ymm0, ymmword ptr [rcx-0x40], -2
-       vpor     ymm0, ymm1, ymmword ptr [rcx-0x20]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vpternlogd ymm2, ymm0, ymmword ptr [rcx-0x40], -2
+       vpor     ymm0, ymm2, ymmword ptr [rcx-0x20]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vptest   ymm2, ymm0
        sete     dl
        movzx    rdx, dl
        jmp      SHORT G_M12635_IG17
 						;; size=43 bbWeight=0.50 PerfScore 14.12
 G_M12635_IG16:
        mov      rdx, qword ptr [rdi]
        movsxd   rax, esi
        or       rdx, qword ptr [rdi+2*rax-0x08]
        mov      rax, 0xD1FFAB1E
        test     rdx, rax
        sete     dl
        movzx    rdx, dl
 						;; size=30 bbWeight=0.50 PerfScore 3.50
 G_M12635_IG17:
        movzx    rax, dl
 						;; size=3 bbWeight=0.50 PerfScore 0.12
 G_M12635_IG18:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h
 RWD16  	dd	00000000h, 00000000h, 00000000h, 00000000h
 RWD32  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 348, prolog size 4, PerfScore 179.00, instruction count 95, allocated bytes for code 348 (MethodHash=fb38cea4) for method System.Text.Ascii:IsValidCore[short](byref,int):ubyte (FullOpts)
+; Total bytes of code 362, prolog size 4, PerfScore 180.75, instruction count 97, allocated bytes for code 362 (MethodHash=fb38cea4) for method System.Text.Ascii:IsValidCore[short](byref,int):ubyte (FullOpts)
14 (3.65 % of base) - System.Text.Ascii:IsValidCore[ubyte](byref,int):ubyte
 ; Assembly listing for method System.Text.Ascii:IsValidCore[ubyte](byref,int):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 3 single block inlinees; 6 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] ( 20, 18   )   byref  ->  rdi        
 ;  V01 arg1         [V01,T04] ( 13,  8   )     int  ->  rsi         single-def
 ;  V02 loc0         [V02,T08] (  5,  2.50)   byref  ->  rax         single-def
 ;* V03 loc1         [V03    ] (  0,  0   )     int  ->  zero-ref   
 ;* V04 loc2         [V04    ] (  0,  0   )    long  ->  zero-ref   
 ;  V05 loc3         [V05,T03] (  5, 16.50)    long  ->  rax        
 ;* V06 loc4         [V06    ] (  0,  0   )    long  ->  zero-ref   
 ;  V07 loc5         [V07,T02] (  6, 17   )    long  ->  rcx        
 ;* V08 loc6         [V08    ] (  0,  0   )    long  ->  zero-ref   
 ;  V09 loc7         [V09,T06] (  4,  5.50)    long  ->  rsi        
 ;  V10 loc8         [V10,T01] (  5, 20   )   byref  ->  rdx        
 ;* V11 loc9         [V11    ] (  0,  0   )    long  ->  zero-ref   
 ;* V12 loc10        [V12    ] (  0,  0   )    long  ->  zero-ref   
 ;* V13 loc11        [V13    ] (  0,  0   )    long  ->  zero-ref   
 ;* V14 loc12        [V14    ] (  0,  0   )   byref  ->  zero-ref   
 ;# V15 OutArgs      [V15    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V16 tmp1         [V16    ] (  0,  0   )     int  ->  zero-ref   
 ;* V17 tmp2         [V17    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
 ;* V18 tmp3         [V18    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V19 tmp4         [V19    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
 ;* V20 tmp5         [V20    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V21 tmp6         [V21    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V22 tmp7         [V22    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V23 tmp8         [V23    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V24 tmp9         [V24    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V25 tmp10        [V25,T12] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;  V25 tmp10        [V25,T14] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V26 tmp11        [V26    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;  V27 tmp12        [V27,T10] (  2, 16   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V28 tmp13        [V28    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V29 tmp14        [V29,T13] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;  V29 tmp14        [V29,T15] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V30 tmp15        [V30,T05] (  6,  6   )     int  ->  rcx         "Single return block return value"
 ;  V31 tmp16        [V31,T09] (  2,  2   )    long  ->  rcx         "Cast away GC"
-;  V32 cse0         [V32,T11] (  7,  7   )  simd32  ->  mm1         multi-def "CSE #04: aggressive"
+;  V32 cse0         [V32,T11] (  7,  7   )  simd32  ->  mm2         multi-def "CSE #04: aggressive"
 ;  V33 cse1         [V33,T07] (  3,  5   )    long  ->  rcx         "CSE #01: aggressive"
+;  V34 rat0         [V34,T12] (  3,  3   )  simd16  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
+;  V35 rat1         [V35,T13] (  3,  3   )  simd32  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M58774_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M58774_IG02:
        cmp      esi, 16
        jge      SHORT G_M58774_IG06
 						;; size=5 bbWeight=1 PerfScore 1.25
 G_M58774_IG03:
        movsxd   rax, esi
        cmp      rax, 8
        jge      G_M58774_IG12
        cmp      esi, 4
        jl       G_M58774_IG13
        mov      eax, dword ptr [rdi]
        add      esi, -4
        movsxd   rcx, esi
        or       eax, dword ptr [rdi+rcx]
        test     eax, 0xD1FFAB1E
        sete     cl
        movzx    rcx, cl
 						;; size=44 bbWeight=0.50 PerfScore 4.88
 G_M58774_IG04:
        movzx    rax, cl
 						;; size=3 bbWeight=0.50 PerfScore 0.12
 G_M58774_IG05:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M58774_IG06:
        movsxd   rax, esi
        add      rax, rdi
        cmp      esi, 32
        jg       SHORT G_M58774_IG07
        vmovups  xmm0, xmmword ptr [rdi]
-       vpor     xmm0, xmm0, xmmword ptr [rax-0x10]
-       vptest   xmm0, xmmword ptr [reloc @RWD00]
+       vmovups  xmm1, xmmword ptr [rax-0x10]
+       vpternlogd xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+       vptest   xmm0, xmm0
        sete     cl
        movzx    rcx, cl
        jmp      SHORT G_M58774_IG04
        align    [0 bytes for IG09]
-						;; size=37 bbWeight=0.50 PerfScore 8.50
+						;; size=44 bbWeight=0.50 PerfScore 9.00
 G_M58774_IG07:
        cmp      esi, 64
        jg       SHORT G_M58774_IG08
        vmovups  ymm0, ymmword ptr [rdi]
-       vpor     ymm0, ymm0, ymmword ptr [rax-0x20]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vmovups  ymm1, ymmword ptr [rax-0x20]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vpternlogd ymm0, ymm1, ymm2, -88
+       vptest   ymm0, ymm0
        sete     cl
        movzx    rcx, cl
        jmp      SHORT G_M58774_IG04
-						;; size=35 bbWeight=0.50 PerfScore 10.75
+						;; size=42 bbWeight=0.50 PerfScore 12.00
 G_M58774_IG08:
        cmp      esi, 128
        jle      SHORT G_M58774_IG11
-       vmovups  ymm1, ymmword ptr [rdi]
+       vmovups  ymm2, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
-       vpternlogd ymm1, ymm0, ymmword ptr [rdi+0x40], -2
-       vpor     ymm0, ymm1, ymmword ptr [rdi+0x60]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vpternlogd ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+       vpor     ymm0, ymm2, ymmword ptr [rdi+0x60]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vptest   ymm2, ymm0
        jne      G_M58774_IG17
        mov      rcx, rdi
        and      rcx, 31
        neg      rcx
        add      rcx, 128
        movsxd   rsi, esi
        add      rsi, -128
        cmp      rcx, rsi
        jae      SHORT G_M58774_IG10
 						;; size=78 bbWeight=0.50 PerfScore 15.00
 G_M58774_IG09:
        lea      rdx, bword ptr [rdi+rcx]
        vmovups  ymm0, ymmword ptr [rdx]
-       vmovups  ymm2, ymmword ptr [rdx+0x20]
-       vpternlogd ymm0, ymm2, ymmword ptr [rdx+0x40], -2
+       vmovups  ymm1, ymmword ptr [rdx+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rdx+0x40], -2
        vpor     ymm0, ymm0, ymmword ptr [rdx+0x60]
-       vptest   ymm0, ymm1
+       vptest   ymm2, ymm0
        jne      G_M58774_IG17
        add      rcx, 128
        cmp      rcx, rsi
        jb       SHORT G_M58774_IG09
 						;; size=49 bbWeight=4 PerfScore 96.00
 G_M58774_IG10:
        add      rdi, rsi
 						;; size=3 bbWeight=0.50 PerfScore 0.12
 G_M58774_IG11:
-       vmovups  ymm1, ymmword ptr [rdi]
+       vmovups  ymm2, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
-       vpternlogd ymm1, ymm0, ymmword ptr [rax-0x40], -2
-       vpor     ymm0, ymm1, ymmword ptr [rax-0x20]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vpternlogd ymm2, ymm0, ymmword ptr [rax-0x40], -2
+       vpor     ymm0, ymm2, ymmword ptr [rax-0x20]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vptest   ymm2, ymm0
        sete     cl
        movzx    rcx, cl
        jmp      G_M58774_IG04
        align    [0 bytes for IG14]
 						;; size=46 bbWeight=0.50 PerfScore 14.12
 G_M58774_IG12:
        mov      rcx, qword ptr [rdi]
        movsxd   rsi, esi
        or       rcx, qword ptr [rdi+rsi-0x08]
        mov      rdi, 0xD1FFAB1E
        test     rcx, rdi
        sete     cl
        movzx    rcx, cl
        jmp      G_M58774_IG04
 						;; size=35 bbWeight=0.50 PerfScore 4.50
 G_M58774_IG13:
        xor      eax, eax
        mov      ecx, esi
        test     rcx, rcx
        je       SHORT G_M58774_IG15
 						;; size=9 bbWeight=0.50 PerfScore 0.88
 G_M58774_IG14:
        cmp      byte  ptr [rdi+rax], 127
        ja       SHORT G_M58774_IG17
        inc      rax
        cmp      rax, rcx
        jb       SHORT G_M58774_IG14
 						;; size=14 bbWeight=4 PerfScore 22.00
 G_M58774_IG15:
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M58774_IG16:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M58774_IG17:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M58774_IG18:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 RWD00  	dq	8080808080808080h, 8080808080808080h
 RWD16  	dd	00000000h, 00000000h, 00000000h, 00000000h
 RWD32  	dq	8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h
 
 
-; Total bytes of code 384, prolog size 4, PerfScore 183.38, instruction count 103, allocated bytes for code 384 (MethodHash=d69a1a69) for method System.Text.Ascii:IsValidCore[ubyte](byref,int):ubyte (FullOpts)
+; Total bytes of code 398, prolog size 4, PerfScore 185.12, instruction count 105, allocated bytes for code 398 (MethodHash=d69a1a69) for method System.Text.Ascii:IsValidCore[ubyte](byref,int):ubyte (FullOpts)
12 (3.40 % of base) - System.Text.Ascii:IsValidCore[int](byref,int):ubyte
 ; Assembly listing for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 2 single block inlinees; 7 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T02] ( 18, 13   )   byref  ->  rdi        
 ;  V01 arg1         [V01,T03] (  9,  6   )     int  ->  rsi         single-def
 ;  V02 loc0         [V02,T07] (  5,  2.50)   byref  ->  rdx         single-def
 ;* V03 loc1         [V03    ] (  0,  0   )     int  ->  zero-ref   
 ;* V04 loc2         [V04    ] (  0,  0   )    long  ->  zero-ref   
 ;* V05 loc3         [V05,T10] (  0,  0   )    long  ->  zero-ref   
 ;* V06 loc4         [V06    ] (  0,  0   )    long  ->  zero-ref   
 ;  V07 loc5         [V07,T01] (  6, 17   )    long  ->   r8        
 ;* V08 loc6         [V08    ] (  0,  0   )    long  ->  zero-ref   
 ;  V09 loc7         [V09,T04] (  4,  5.50)    long  ->  rcx        
 ;  V10 loc8         [V10,T00] (  5, 20   )   byref  ->  rax        
 ;* V11 loc9         [V11    ] (  0,  0   )    long  ->  zero-ref   
 ;* V12 loc10        [V12    ] (  0,  0   )    long  ->  zero-ref   
 ;* V13 loc11        [V13    ] (  0,  0   )    long  ->  zero-ref   
 ;* V14 loc12        [V14    ] (  0,  0   )   byref  ->  zero-ref   
 ;# V15 OutArgs      [V15    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V16 tmp1         [V16    ] (  0,  0   )     int  ->  zero-ref   
 ;* V17 tmp2         [V17    ] (  0,  0   )     int  ->  zero-ref    ld-addr-op "Inlining Arg"
 ;  V18 tmp3         [V18,T09] (  2,  1   )   ubyte  ->   r8         "Inline return value spill temp"
 ;* V19 tmp4         [V19    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
 ;* V20 tmp5         [V20    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V21 tmp6         [V21    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[int]>
 ;* V22 tmp7         [V22    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V23 tmp8         [V23    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
 ;* V24 tmp9         [V24    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V25 tmp10        [V25,T13] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+;  V25 tmp10        [V25,T15] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
 ;* V26 tmp11        [V26    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;  V27 tmp12        [V27,T11] (  2, 16   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
 ;* V28 tmp13        [V28    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V29 tmp14        [V29,T14] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+;  V29 tmp14        [V29,T16] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
 ;  V30 tmp15        [V30,T05] (  5,  5   )     int  ->   r8         "Single return block return value"
 ;  V31 tmp16        [V31,T08] (  2,  2   )    long  ->  rax         "Cast away GC"
-;  V32 cse0         [V32,T12] (  7,  7   )  simd32  ->  mm1         multi-def "CSE #03: aggressive"
+;  V32 cse0         [V32,T12] (  7,  7   )  simd32  ->  mm2         multi-def "CSE #03: aggressive"
 ;  V33 cse1         [V33,T06] (  6,  3   )    long  ->  rcx         multi-def "CSE #01: aggressive"
+;  V34 rat0         [V34,T13] (  3,  3   )  simd16  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
+;  V35 rat1         [V35,T14] (  3,  3   )  simd32  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M8346_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M8346_IG02:
        cmp      esi, 4
        jge      SHORT G_M8346_IG04
 						;; size=5 bbWeight=1 PerfScore 1.25
 G_M8346_IG03:
        movsxd   rcx, esi
        cmp      rcx, 2
        jge      G_M8346_IG13
        test     esi, esi
        je       G_M8346_IG16
        jmp      G_M8346_IG18
 						;; size=26 bbWeight=0.50 PerfScore 2.38
 G_M8346_IG04:
        movsxd   rcx, esi
        lea      rdx, bword ptr [rdi+4*rcx]
        cmp      esi, 8
        jg       SHORT G_M8346_IG05
        vmovups  xmm0, xmmword ptr [rdi]
-       vpor     xmm0, xmm0, xmmword ptr [rdx-0x10]
-       vptest   xmm0, xmmword ptr [reloc @RWD00]
+       vmovups  xmm1, xmmword ptr [rdx-0x10]
+       vpternlogd xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+       vptest   xmm0, xmm0
        sete     r8b
        movzx    r8, r8b
        jmp      G_M8346_IG14
-       align    [2 bytes for IG07]
-						;; size=45 bbWeight=0.50 PerfScore 8.62
+       align    [0 bytes for IG07]
+						;; size=50 bbWeight=0.50 PerfScore 9.12
 G_M8346_IG05:
        cmp      esi, 16
        jg       SHORT G_M8346_IG06
        vmovups  ymm0, ymmword ptr [rdi]
-       vpor     ymm0, ymm0, ymmword ptr [rdx-0x20]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vmovups  ymm1, ymmword ptr [rdx-0x20]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vpternlogd ymm0, ymm1, ymm2, -88
+       vptest   ymm0, ymm0
        sete     r8b
        movzx    r8, r8b
        jmp      G_M8346_IG14
-						;; size=40 bbWeight=0.50 PerfScore 10.75
+						;; size=47 bbWeight=0.50 PerfScore 12.00
 G_M8346_IG06:
        cmp      esi, 32
        jle      SHORT G_M8346_IG12
-       vmovups  ymm1, ymmword ptr [rdi]
+       vmovups  ymm2, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
-       vpternlogd ymm1, ymm0, ymmword ptr [rdi+0x40], -2
-       vpor     ymm0, ymm1, ymmword ptr [rdi+0x60]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vpternlogd ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+       vpor     ymm0, ymm2, ymmword ptr [rdi+0x60]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vptest   ymm2, ymm0
        jne      SHORT G_M8346_IG08
        mov      rax, rdi
        and      rax, 31
        shr      rax, 2
        mov      r8, rax
        neg      r8
        add      r8, 32
        add      rcx, -32
        cmp      r8, rcx
        jae      SHORT G_M8346_IG11
 						;; size=72 bbWeight=0.50 PerfScore 15.25
 G_M8346_IG07:
        lea      rax, bword ptr [rdi+4*r8]
        vmovups  ymm0, ymmword ptr [rax]
-       vmovups  ymm2, ymmword ptr [rax+0x20]
-       vpternlogd ymm0, ymm2, ymmword ptr [rax+0x40], -2
+       vmovups  ymm1, ymmword ptr [rax+0x20]
+       vpternlogd ymm0, ymm1, ymmword ptr [rax+0x40], -2
        vpor     ymm0, ymm0, ymmword ptr [rax+0x60]
-       vptest   ymm0, ymm1
+       vptest   ymm2, ymm0
        je       SHORT G_M8346_IG10
 						;; size=33 bbWeight=4 PerfScore 90.00
 G_M8346_IG08:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M8346_IG09:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M8346_IG10:
        add      r8, 32
        cmp      r8, rcx
        jb       SHORT G_M8346_IG07
 						;; size=9 bbWeight=4 PerfScore 6.00
 G_M8346_IG11:
        lea      rdi, bword ptr [rdi+4*rcx]
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M8346_IG12:
-       vmovups  ymm1, ymmword ptr [rdi]
+       vmovups  ymm2, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
-       vpternlogd ymm1, ymm0, ymmword ptr [rdx-0x40], -2
-       vpor     ymm0, ymm1, ymmword ptr [rdx-0x20]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vpternlogd ymm2, ymm0, ymmword ptr [rdx-0x40], -2
+       vpor     ymm0, ymm2, ymmword ptr [rdx-0x20]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vptest   ymm2, ymm0
        sete     r8b
        movzx    r8, r8b
        jmp      SHORT G_M8346_IG14
 						;; size=45 bbWeight=0.50 PerfScore 14.12
 G_M8346_IG13:
        mov      r8, qword ptr [rdi]
        or       r8, qword ptr [rdi+4*rcx-0x08]
        mov      rax, 0xD1FFAB1E
        test     r8, rax
        sete     r8b
        movzx    r8, r8b
 						;; size=29 bbWeight=0.50 PerfScore 3.38
 G_M8346_IG14:
        movzx    rax, r8b
 						;; size=4 bbWeight=0.50 PerfScore 0.12
 G_M8346_IG15:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M8346_IG16:
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M8346_IG17:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M8346_IG18:
        cmp      dword ptr [rdi], edi
        mov      rax, 0xD1FFAB1E      ; code for System.ThrowHelper:ThrowNotSupportedException()
        call     [rax]System.ThrowHelper:ThrowNotSupportedException()
        int3     
 						;; size=15 bbWeight=0 PerfScore 0.00
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h
 RWD16  	dd	00000000h, 00000000h, 00000000h, 00000000h
 RWD32  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 353, prolog size 4, PerfScore 157.38, instruction count 90, allocated bytes for code 353 (MethodHash=10a8df65) for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
+; Total bytes of code 365, prolog size 4, PerfScore 159.12, instruction count 92, allocated bytes for code 365 (MethodHash=10a8df65) for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
11 (3.12 % of base) - System.Text.Ascii:IsValidCore[double](byref,int):ubyte
 ; Assembly listing for method System.Text.Ascii:IsValidCore[double](byref,int):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 2 single block inlinees; 7 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T02] ( 18, 13   )   byref  ->  rdi        
 ;  V01 arg1         [V01,T03] (  9,  6   )     int  ->  rsi         single-def
 ;  V02 loc0         [V02,T07] (  5,  2.50)   byref  ->  rdx         single-def
 ;* V03 loc1         [V03    ] (  0,  0   )     int  ->  zero-ref   
 ;* V04 loc2         [V04    ] (  0,  0   )    long  ->  zero-ref   
 ;* V05 loc3         [V05,T10] (  0,  0   )    long  ->  zero-ref   
 ;* V06 loc4         [V06    ] (  0,  0   )    long  ->  zero-ref   
 ;  V07 loc5         [V07,T01] (  6, 17   )    long  ->   r8        
 ;* V08 loc6         [V08    ] (  0,  0   )    long  ->  zero-ref   
 ;  V09 loc7         [V09,T04] (  4,  5.50)    long  ->  rcx        
 ;  V10 loc8         [V10,T00] (  5, 20   )   byref  ->  rax        
 ;* V11 loc9         [V11    ] (  0,  0   )    long  ->  zero-ref   
 ;* V12 loc10        [V12    ] (  0,  0   )    long  ->  zero-ref   
 ;* V13 loc11        [V13    ] (  0,  0   )    long  ->  zero-ref   
 ;* V14 loc12        [V14    ] (  0,  0   )   byref  ->  zero-ref   
 ;# V15 OutArgs      [V15    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V16 tmp1         [V16    ] (  0,  0   )     int  ->  zero-ref   
 ;* V17 tmp2         [V17    ] (  0,  0   )  double  ->  zero-ref    do-not-enreg[F] ld-addr-op "Inlining Arg"
 ;  V18 tmp3         [V18,T09] (  2,  1   )   ubyte  ->   r8         "Inline return value spill temp"
 ;* V19 tmp4         [V19    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
 ;* V20 tmp5         [V20    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V21 tmp6         [V21    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[double]>
 ;* V22 tmp7         [V22    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V23 tmp8         [V23    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[double]>
 ;* V24 tmp9         [V24    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V25 tmp10        [V25,T13] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[double]>
+;  V25 tmp10        [V25,T15] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[double]>
 ;* V26 tmp11        [V26    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;  V27 tmp12        [V27,T11] (  2, 16   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[double]>
 ;* V28 tmp13        [V28    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V29 tmp14        [V29,T14] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[double]>
+;  V29 tmp14        [V29,T16] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[double]>
 ;  V30 tmp15        [V30,T05] (  5,  5   )     int  ->   r8         "Single return block return value"
 ;  V31 tmp16        [V31,T08] (  2,  2   )    long  ->  rax         "Cast away GC"
-;  V32 cse0         [V32,T12] (  7,  7   )  simd32  ->  mm1         multi-def "CSE #03: aggressive"
+;  V32 cse0         [V32,T12] (  7,  7   )  simd32  ->  mm2         multi-def "CSE #03: aggressive"
 ;  V33 cse1         [V33,T06] (  6,  3   )    long  ->  rcx         multi-def "CSE #01: aggressive"
+;  V34 rat0         [V34,T13] (  3,  3   )  simd16  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
+;  V35 rat1         [V35,T14] (  3,  3   )  simd32  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M10908_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M10908_IG02:
        cmp      esi, 2
        jge      SHORT G_M10908_IG04
 						;; size=5 bbWeight=1 PerfScore 1.25
 G_M10908_IG03:
        movsxd   rcx, esi
        test     rcx, rcx
        jg       G_M10908_IG13
        test     esi, esi
        je       G_M10908_IG16
        jmp      G_M10908_IG18
 						;; size=25 bbWeight=0.50 PerfScore 2.38
 G_M10908_IG04:
        movsxd   rcx, esi
        lea      rdx, bword ptr [rdi+8*rcx]
        cmp      esi, 4
        jg       SHORT G_M10908_IG05
        vmovups  xmm0, xmmword ptr [rdi]
-       vorpd    xmm0, xmm0, xmmword ptr [rdx-0x10]
-       vptest   xmm0, xmmword ptr [reloc @RWD00]
+       vmovups  xmm1, xmmword ptr [rdx-0x10]
+       vpternlogq xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+       vptest   xmm0, xmm0
        sete     r8b
        movzx    r8, r8b
        jmp      G_M10908_IG14
-       align    [3 bytes for IG07]
-						;; size=46 bbWeight=0.50 PerfScore 8.62
+       align    [0 bytes for IG07]
+						;; size=50 bbWeight=0.50 PerfScore 9.12
 G_M10908_IG05:
        cmp      esi, 8
        jg       SHORT G_M10908_IG06
        vmovups  ymm0, ymmword ptr [rdi]
-       vorpd    ymm0, ymm0, ymmword ptr [rdx-0x20]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vmovups  ymm1, ymmword ptr [rdx-0x20]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vpternlogq ymm0, ymm1, ymm2, -88
+       vptest   ymm0, ymm0
        sete     r8b
        movzx    r8, r8b
        jmp      G_M10908_IG14
-						;; size=40 bbWeight=0.50 PerfScore 10.75
+						;; size=47 bbWeight=0.50 PerfScore 12.00
 G_M10908_IG06:
        cmp      esi, 16
        jle      SHORT G_M10908_IG12
-       vmovups  ymm1, ymmword ptr [rdi]
+       vmovups  ymm2, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
-       vpternlogq ymm1, ymm0, ymmword ptr [rdi+0x40], -2
-       vorpd    ymm0, ymm1, ymmword ptr [rdi+0x60]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vpternlogq ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+       vorpd    ymm0, ymm2, ymmword ptr [rdi+0x60]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vptest   ymm2, ymm0
        jne      SHORT G_M10908_IG08
        mov      rax, rdi
        and      rax, 31
        shr      rax, 3
        mov      r8, rax
        neg      r8
        add      r8, 16
        add      rcx, -16
        cmp      r8, rcx
        jae      SHORT G_M10908_IG11
 						;; size=72 bbWeight=0.50 PerfScore 15.25
 G_M10908_IG07:
        lea      rax, bword ptr [rdi+8*r8]
        vmovups  ymm0, ymmword ptr [rax]
-       vmovups  ymm2, ymmword ptr [rax+0x20]
-       vpternlogq ymm0, ymm2, ymmword ptr [rax+0x40], -2
+       vmovups  ymm1, ymmword ptr [rax+0x20]
+       vpternlogq ymm0, ymm1, ymmword ptr [rax+0x40], -2
        vorpd    ymm0, ymm0, ymmword ptr [rax+0x60]
-       vptest   ymm0, ymm1
+       vptest   ymm2, ymm0
        je       SHORT G_M10908_IG10
 						;; size=33 bbWeight=4 PerfScore 90.00
 G_M10908_IG08:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M10908_IG09:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M10908_IG10:
        add      r8, 16
        cmp      r8, rcx
        jb       SHORT G_M10908_IG07
 						;; size=9 bbWeight=4 PerfScore 6.00
 G_M10908_IG11:
        lea      rdi, bword ptr [rdi+8*rcx]
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M10908_IG12:
-       vmovups  ymm1, ymmword ptr [rdi]
+       vmovups  ymm2, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
-       vpternlogq ymm1, ymm0, ymmword ptr [rdx-0x40], -2
-       vorpd    ymm0, ymm1, ymmword ptr [rdx-0x20]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vpternlogq ymm2, ymm0, ymmword ptr [rdx-0x40], -2
+       vorpd    ymm0, ymm2, ymmword ptr [rdx-0x20]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vptest   ymm2, ymm0
        sete     r8b
        movzx    r8, r8b
        jmp      SHORT G_M10908_IG14
 						;; size=45 bbWeight=0.50 PerfScore 14.12
 G_M10908_IG13:
        mov      r8, qword ptr [rdi]
        or       r8, qword ptr [rdi+8*rcx-0x08]
        mov      rax, 0xD1FFAB1E
        test     r8, rax
        sete     r8b
        movzx    r8, r8b
 						;; size=29 bbWeight=0.50 PerfScore 3.38
 G_M10908_IG14:
        movzx    rax, r8b
 						;; size=4 bbWeight=0.50 PerfScore 0.12
 G_M10908_IG15:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M10908_IG16:
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M10908_IG17:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M10908_IG18:
        cmp      dword ptr [rdi], edi
        mov      rax, 0xD1FFAB1E      ; code for System.ThrowHelper:ThrowNotSupportedException()
        call     [rax]System.ThrowHelper:ThrowNotSupportedException()
        int3     
 						;; size=15 bbWeight=0 PerfScore 0.00
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h
 RWD16  	dd	00000000h, 00000000h, 00000000h, 00000000h
 RWD32  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 353, prolog size 4, PerfScore 157.38, instruction count 90, allocated bytes for code 353 (MethodHash=1534d563) for method System.Text.Ascii:IsValidCore[double](byref,int):ubyte (FullOpts)
+; Total bytes of code 364, prolog size 4, PerfScore 159.12, instruction count 92, allocated bytes for code 364 (MethodHash=1534d563) for method System.Text.Ascii:IsValidCore[double](byref,int):ubyte (FullOpts)
11 (3.12 % of base) - System.Text.Ascii:IsValidCore[long](byref,int):ubyte
 ; Assembly listing for method System.Text.Ascii:IsValidCore[long](byref,int):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 2 single block inlinees; 7 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T02] ( 18, 13   )   byref  ->  rdi        
 ;  V01 arg1         [V01,T03] (  9,  6   )     int  ->  rsi         single-def
 ;  V02 loc0         [V02,T07] (  5,  2.50)   byref  ->  rdx         single-def
 ;* V03 loc1         [V03    ] (  0,  0   )     int  ->  zero-ref   
 ;* V04 loc2         [V04    ] (  0,  0   )    long  ->  zero-ref   
 ;* V05 loc3         [V05,T10] (  0,  0   )    long  ->  zero-ref   
 ;* V06 loc4         [V06    ] (  0,  0   )    long  ->  zero-ref   
 ;  V07 loc5         [V07,T01] (  6, 17   )    long  ->   r8        
 ;* V08 loc6         [V08    ] (  0,  0   )    long  ->  zero-ref   
 ;  V09 loc7         [V09,T04] (  4,  5.50)    long  ->  rcx        
 ;  V10 loc8         [V10,T00] (  5, 20   )   byref  ->  rax        
 ;* V11 loc9         [V11    ] (  0,  0   )    long  ->  zero-ref   
 ;* V12 loc10        [V12    ] (  0,  0   )    long  ->  zero-ref   
 ;* V13 loc11        [V13    ] (  0,  0   )    long  ->  zero-ref   
 ;* V14 loc12        [V14    ] (  0,  0   )   byref  ->  zero-ref   
 ;# V15 OutArgs      [V15    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V16 tmp1         [V16    ] (  0,  0   )     int  ->  zero-ref   
 ;* V17 tmp2         [V17    ] (  0,  0   )    long  ->  zero-ref    ld-addr-op "Inlining Arg"
 ;  V18 tmp3         [V18,T09] (  2,  1   )   ubyte  ->   r8         "Inline return value spill temp"
 ;* V19 tmp4         [V19    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
 ;* V20 tmp5         [V20    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V21 tmp6         [V21    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[long]>
 ;* V22 tmp7         [V22    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V23 tmp8         [V23    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
 ;* V24 tmp9         [V24    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V25 tmp10        [V25,T13] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
+;  V25 tmp10        [V25,T15] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
 ;* V26 tmp11        [V26    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;  V27 tmp12        [V27,T11] (  2, 16   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
 ;* V28 tmp13        [V28    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V29 tmp14        [V29,T14] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
+;  V29 tmp14        [V29,T16] (  2,  2   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[long]>
 ;  V30 tmp15        [V30,T05] (  5,  5   )     int  ->   r8         "Single return block return value"
 ;  V31 tmp16        [V31,T08] (  2,  2   )    long  ->  rax         "Cast away GC"
-;  V32 cse0         [V32,T12] (  7,  7   )  simd32  ->  mm1         multi-def "CSE #03: aggressive"
+;  V32 cse0         [V32,T12] (  7,  7   )  simd32  ->  mm2         multi-def "CSE #03: aggressive"
 ;  V33 cse1         [V33,T06] (  6,  3   )    long  ->  rcx         multi-def "CSE #01: aggressive"
+;  V34 rat0         [V34,T13] (  3,  3   )  simd16  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
+;  V35 rat1         [V35,T14] (  3,  3   )  simd32  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M33379_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M33379_IG02:
        cmp      esi, 2
        jge      SHORT G_M33379_IG04
 						;; size=5 bbWeight=1 PerfScore 1.25
 G_M33379_IG03:
        movsxd   rcx, esi
        test     rcx, rcx
        jg       G_M33379_IG13
        test     esi, esi
        je       G_M33379_IG16
        jmp      G_M33379_IG18
 						;; size=25 bbWeight=0.50 PerfScore 2.38
 G_M33379_IG04:
        movsxd   rcx, esi
        lea      rdx, bword ptr [rdi+8*rcx]
        cmp      esi, 4
        jg       SHORT G_M33379_IG05
        vmovups  xmm0, xmmword ptr [rdi]
-       vpor     xmm0, xmm0, xmmword ptr [rdx-0x10]
-       vptest   xmm0, xmmword ptr [reloc @RWD00]
+       vmovups  xmm1, xmmword ptr [rdx-0x10]
+       vpternlogq xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+       vptest   xmm0, xmm0
        sete     r8b
        movzx    r8, r8b
        jmp      G_M33379_IG14
-       align    [3 bytes for IG07]
-						;; size=46 bbWeight=0.50 PerfScore 8.62
+       align    [0 bytes for IG07]
+						;; size=50 bbWeight=0.50 PerfScore 9.12
 G_M33379_IG05:
        cmp      esi, 8
        jg       SHORT G_M33379_IG06
        vmovups  ymm0, ymmword ptr [rdi]
-       vpor     ymm0, ymm0, ymmword ptr [rdx-0x20]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vmovups  ymm1, ymmword ptr [rdx-0x20]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vpternlogq ymm0, ymm1, ymm2, -88
+       vptest   ymm0, ymm0
        sete     r8b
        movzx    r8, r8b
        jmp      G_M33379_IG14
-						;; size=40 bbWeight=0.50 PerfScore 10.75
+						;; size=47 bbWeight=0.50 PerfScore 12.00
 G_M33379_IG06:
        cmp      esi, 16
        jle      SHORT G_M33379_IG12
-       vmovups  ymm1, ymmword ptr [rdi]
+       vmovups  ymm2, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
-       vpternlogq ymm1, ymm0, ymmword ptr [rdi+0x40], -2
-       vpor     ymm0, ymm1, ymmword ptr [rdi+0x60]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vpternlogq ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+       vpor     ymm0, ymm2, ymmword ptr [rdi+0x60]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vptest   ymm2, ymm0
        jne      SHORT G_M33379_IG08
        mov      rax, rdi
        and      rax, 31
        shr      rax, 3
        mov      r8, rax
        neg      r8
        add      r8, 16
        add      rcx, -16
        cmp      r8, rcx
        jae      SHORT G_M33379_IG11
 						;; size=72 bbWeight=0.50 PerfScore 15.25
 G_M33379_IG07:
        lea      rax, bword ptr [rdi+8*r8]
        vmovups  ymm0, ymmword ptr [rax]
-       vmovups  ymm2, ymmword ptr [rax+0x20]
-       vpternlogq ymm0, ymm2, ymmword ptr [rax+0x40], -2
+       vmovups  ymm1, ymmword ptr [rax+0x20]
+       vpternlogq ymm0, ymm1, ymmword ptr [rax+0x40], -2
        vpor     ymm0, ymm0, ymmword ptr [rax+0x60]
-       vptest   ymm0, ymm1
+       vptest   ymm2, ymm0
        je       SHORT G_M33379_IG10
 						;; size=33 bbWeight=4 PerfScore 90.00
 G_M33379_IG08:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M33379_IG09:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M33379_IG10:
        add      r8, 16
        cmp      r8, rcx
        jb       SHORT G_M33379_IG07
 						;; size=9 bbWeight=4 PerfScore 6.00
 G_M33379_IG11:
        lea      rdi, bword ptr [rdi+8*rcx]
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M33379_IG12:
-       vmovups  ymm1, ymmword ptr [rdi]
+       vmovups  ymm2, ymmword ptr [rdi]
        vmovups  ymm0, ymmword ptr [rdi+0x20]
-       vpternlogq ymm1, ymm0, ymmword ptr [rdx-0x40], -2
-       vpor     ymm0, ymm1, ymmword ptr [rdx-0x20]
-       vmovups  ymm1, ymmword ptr [reloc @RWD32]
-       vptest   ymm0, ymm1
+       vpternlogq ymm2, ymm0, ymmword ptr [rdx-0x40], -2
+       vpor     ymm0, ymm2, ymmword ptr [rdx-0x20]
+       vmovups  ymm2, ymmword ptr [reloc @RWD32]
+       vptest   ymm2, ymm0
        sete     r8b
        movzx    r8, r8b
        jmp      SHORT G_M33379_IG14
 						;; size=45 bbWeight=0.50 PerfScore 14.12
 G_M33379_IG13:
        mov      r8, qword ptr [rdi]
        or       r8, qword ptr [rdi+8*rcx-0x08]
        mov      rax, 0xD1FFAB1E
        test     r8, rax
        sete     r8b
        movzx    r8, r8b
 						;; size=29 bbWeight=0.50 PerfScore 3.38
 G_M33379_IG14:
        movzx    rax, r8b
 						;; size=4 bbWeight=0.50 PerfScore 0.12
 G_M33379_IG15:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M33379_IG16:
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M33379_IG17:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M33379_IG18:
        cmp      dword ptr [rdi], edi
        mov      rax, 0xD1FFAB1E      ; code for System.ThrowHelper:ThrowNotSupportedException()
        call     [rax]System.ThrowHelper:ThrowNotSupportedException()
        int3     
 						;; size=15 bbWeight=0 PerfScore 0.00
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h
 RWD16  	dd	00000000h, 00000000h, 00000000h, 00000000h
 RWD32  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 353, prolog size 4, PerfScore 157.38, instruction count 90, allocated bytes for code 353 (MethodHash=a1267d9c) for method System.Text.Ascii:IsValidCore[long](byref,int):ubyte (FullOpts)
+; Total bytes of code 364, prolog size 4, PerfScore 159.12, instruction count 92, allocated bytes for code 364 (MethodHash=a1267d9c) for method System.Text.Ascii:IsValidCore[long](byref,int):ubyte (FullOpts)
7 (9.86 % of base) - System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this
 ; Assembly listing for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 0 single block inlinees; 1 inlinees without PGO data
+; 0 inlinees with PGO data; 1 single block inlinees; 0 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;* V00 this         [V00    ] (  0,  0   )   byref  ->  zero-ref    this single-def
 ;  V01 arg1         [V01,T00] (  4,  4   )    long  ->  rsi         single-def
 ;* V02 arg2         [V02    ] (  0,  0   )    long  ->  zero-ref    single-def
 ;* V03 arg3         [V03    ] (  0,  0   )     int  ->  zero-ref    single-def
 ;  V04 arg4         [V04,T01] (  4,  3   )   byref  ->   r8         single-def
-;  V05 loc0         [V05,T02] (  3,  2.50)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;  V06 loc1         [V06,T03] (  3,  2.50)  simd32  ->  mm1         <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;  V05 loc0         [V05,T03] (  3,  2.50)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;  V06 loc1         [V06,T04] (  3,  2.50)  simd32  ->  mm1         <System.Runtime.Intrinsics.Vector256`1[ushort]>
 ;# V07 OutArgs      [V07    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V08 tmp1         [V08    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V09 tmp2         [V09    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V10 tmp3         [V10    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V08 tmp1         [V08    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;  V09 rat0         [V09,T02] (  3,  6   )  simd32  ->  mm2         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M46395_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M46395_IG02:
        vmovups  ymm0, ymmword ptr [rsi]
        vmovups  ymm1, ymmword ptr [rsi+0x20]
-       vpor     ymm2, ymm0, ymm1
-       vptest   ymm2, ymmword ptr [reloc @RWD00]
+       vmovaps  ymm2, ymm0
+       vpternlogd ymm2, ymm1, ymmword ptr [reloc @RWD00], -88
+       vptest   ymm2, ymm2
        je       SHORT G_M46395_IG05
-						;; size=24 bbWeight=1 PerfScore 18.33
+						;; size=31 bbWeight=1 PerfScore 18.25
 G_M46395_IG03:
        vxorps   ymm0, ymm0, ymm0
        vmovups  ymmword ptr [r8], ymm0
        xor      eax, eax
 						;; size=11 bbWeight=0.50 PerfScore 1.29
 G_M46395_IG04:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=0.50 PerfScore 1.00
 G_M46395_IG05:
        vpmovwb  ymm0, ymm0
        vpmovwb  ymm1, ymm1
        vinserti128 ymm0, ymm0, xmm1, 1
        vmovups  ymmword ptr [r8], ymm0
        mov      eax, 1
 						;; size=28 bbWeight=0.50 PerfScore 5.12
 G_M46395_IG06:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=0.50 PerfScore 1.00
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 71, prolog size 0, PerfScore 26.75, instruction count 17, allocated bytes for code 71 (MethodHash=fb454ac4) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this (FullOpts)
+; Total bytes of code 78, prolog size 0, PerfScore 26.67, instruction count 18, allocated bytes for code 78 (MethodHash=fb454ac4) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this (FullOpts)
7 (13.73 % of base) - System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this
 ; Assembly listing for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 0 single block inlinees; 2 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;* V00 this         [V00    ] (  0,  0   )   byref  ->  zero-ref    this single-def
 ;  V01 arg1         [V01,T00] (  4,  4   )    long  ->  rsi         single-def
 ;* V02 arg2         [V02    ] (  0,  0   )    long  ->  zero-ref    single-def
 ;* V03 arg3         [V03    ] (  0,  0   )     int  ->  zero-ref    single-def
 ;  V04 arg4         [V04,T01] (  4,  3   )   byref  ->   r8         single-def
-;  V05 loc0         [V05,T02] (  3,  2.50)  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V06 loc1         [V06,T03] (  3,  2.50)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V05 loc0         [V05,T03] (  3,  2.50)  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V06 loc1         [V06,T04] (  3,  2.50)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;# V07 OutArgs      [V07    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V08 tmp1         [V08    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V09 tmp2         [V09    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V10 tmp3         [V10    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V11 tmp4         [V11    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V09 tmp2         [V09    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V10 tmp3         [V10    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;  V11 rat0         [V11,T02] (  3,  6   )  simd16  ->  mm2         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M11006_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M11006_IG02:
        vmovups  xmm0, xmmword ptr [rsi]
        vmovups  xmm1, xmmword ptr [rsi+0x10]
-       vpor     xmm2, xmm0, xmm1
-       vptest   xmm2, xmmword ptr [reloc @RWD00]
+       vmovaps  xmm2, xmm0
+       vpternlogd xmm2, xmm1, xmmword ptr [reloc @RWD00], -88
+       vptest   xmm2, xmm2
        je       SHORT G_M11006_IG05
-						;; size=24 bbWeight=1 PerfScore 14.33
+						;; size=31 bbWeight=1 PerfScore 14.25
 G_M11006_IG03:
        vxorps   xmm0, xmm0, xmm0
        vmovups  xmmword ptr [r8], xmm0
        xor      eax, eax
 						;; size=11 bbWeight=0.50 PerfScore 1.29
 G_M11006_IG04:
        ret      
 						;; size=1 bbWeight=0.50 PerfScore 0.50
 G_M11006_IG05:
        vpackuswb xmm0, xmm0, xmm1
        vmovups  xmmword ptr [r8], xmm0
        mov      eax, 1
 						;; size=14 bbWeight=0.50 PerfScore 1.62
 G_M11006_IG06:
        ret      
 						;; size=1 bbWeight=0.50 PerfScore 0.50
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 51, prolog size 0, PerfScore 18.25, instruction count 13, allocated bytes for code 51 (MethodHash=0badd501) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this (FullOpts)
+; Total bytes of code 58, prolog size 0, PerfScore 18.17, instruction count 14, allocated bytes for code 58 (MethodHash=0badd501) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this (FullOpts)
7 (2.06 % of base) - System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Intrinsified(ulong,ulong):ulong
 ; Assembly listing for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Intrinsified(ulong,ulong):ulong (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 2 single block inlinees; 6 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] ( 32, 34.50)    long  ->  rdi        
 ;  V01 arg1         [V01,T01] ( 17, 10   )    long  ->  rsi        
 ;* V02 loc0         [V02,T08] (  0,  0   )     int  ->  zero-ref   
-;  V03 loc1         [V03,T09] ( 11, 12.50)  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V04 loc2         [V04,T10] (  3,  8.50)  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V03 loc1         [V03,T10] ( 11, 12.50)  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V04 loc2         [V04,T11] (  3,  8.50)  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;* V05 loc3         [V05    ] (  0,  0   )     int  ->  zero-ref   
 ;  V06 loc4         [V06,T03] (  8,  4   )    long  ->  rax        
 ;* V07 loc5         [V07    ] (  0,  0   )    long  ->  zero-ref   
 ;  V08 loc6         [V08,T04] (  3,  1.50)     int  ->  rcx        
 ;  V09 loc7         [V09,T02] (  2,  4.50)    long  ->  rcx        
 ;* V10 loc8         [V10    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;* V11 loc9         [V11    ] (  0,  0   )     ref  ->  zero-ref    class-hnd <System.Object>
 ;* V12 loc10        [V12    ] (  0,  0   )     ref  ->  zero-ref    class-hnd <System.Object>
 ;  V13 loc11        [V13,T06] (  2,  1   )    long  ->  rdi        
 ;# V14 OutArgs      [V14    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V15 tmp1         [V15    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V16 tmp2         [V16    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V17 tmp3         [V17    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V18 tmp4         [V18    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V19 tmp5         [V19    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V16 tmp2         [V16    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V17 tmp3         [V17    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V18 tmp4         [V18    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V19 tmp5         [V19    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V20 tmp6         [V20    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V21 tmp7         [V21    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V22 tmp8         [V22    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V23 tmp9         [V23    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V24 tmp10        [V24    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V25 tmp11        [V25    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V26 tmp12        [V26,T07] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V27 cse0         [V27,T05] (  3,  1.50)    long  ->  rcx         "CSE #02: moderate"
-;  V28 cse1         [V28,T11] (  6,  6.50)  simd16  ->  mm1         "CSE #01: aggressive"
+;* V21 tmp7         [V21,T07] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V22 cse0         [V22,T05] (  3,  1.50)    long  ->  rcx         "CSE #02: moderate"
+;  V23 cse1         [V23,T12] (  6,  6.50)  simd16  ->  mm1         "CSE #01: aggressive"
+;  V24 rat0         [V24,T09] (  3, 24   )  simd16  ->  mm3         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M29265_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M29265_IG02:
        test     rsi, rsi
        jne      SHORT G_M29265_IG05
 						;; size=5 bbWeight=1 PerfScore 1.25
 G_M29265_IG03:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M29265_IG04:
        pop      rbp
        ret      
 						;; size=2 bbWeight=0.50 PerfScore 0.75
 G_M29265_IG05:
        mov      rax, rdi
        cmp      rsi, 8
        jb       G_M29265_IG10
        vmovups  xmm0, xmmword ptr [rax]
        vmovups  xmm1, xmmword ptr [reloc @RWD00]
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        jne      G_M29265_IG17
        add      rsi, rsi
        cmp      rsi, 32
        jb       SHORT G_M29265_IG08
        lea      rdi, [rax+0x10]
        and      rdi, -16
        mov      rcx, rdi
        sub      rcx, rax
        sub      rsi, rcx
        cmp      rsi, 32
        jb       SHORT G_M29265_IG07
        lea      rcx, [rdi+rsi-0x20]
        align    [0 bytes for IG06]
 						;; size=73 bbWeight=0.50 PerfScore 8.88
 G_M29265_IG06:
        vmovups  xmm0, xmmword ptr [rdi]
        vmovups  xmm2, xmmword ptr [rdi+0x10]
-       vpor     xmm3, xmm0, xmm2
-       vptest   xmm3, xmm1
+       vmovaps  xmm3, xmm0
+       vpternlogd xmm3, xmm2, xmm1, -88
+       vptest   xmm3, xmm3
        jne      G_M29265_IG16
        add      rdi, 32
        cmp      rdi, rcx
        jbe      SHORT G_M29265_IG06
-						;; size=33 bbWeight=4 PerfScore 55.33
+						;; size=40 bbWeight=4 PerfScore 57.00
 G_M29265_IG07:
        test     sil, 16
        je       SHORT G_M29265_IG09
        vmovups  xmm0, xmmword ptr [rdi]
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        jne      G_M29265_IG17
 						;; size=21 bbWeight=0.50 PerfScore 4.62
 G_M29265_IG08:
        add      rdi, 16
 						;; size=4 bbWeight=0.50 PerfScore 0.12
 G_M29265_IG09:
        movzx    rcx, sil
        test     cl, 15
        je       G_M29265_IG18
        and      rsi, 15
        lea      rdi, [rdi+rsi-0x10]
        vmovups  xmm0, xmmword ptr [rdi]
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        jne      SHORT G_M29265_IG17
        add      rdi, 16
        jmp      G_M29265_IG18
 						;; size=42 bbWeight=0.50 PerfScore 6.50
 G_M29265_IG10:
        test     sil, 4
        je       SHORT G_M29265_IG12
        mov      rdi, qword ptr [rax]
        mov      rcx, 0xD1FFAB1E
        and      rcx, rdi
        je       SHORT G_M29265_IG11
        xor      esi, esi
        tzcnt    rsi, rcx
        sar      esi, 3
        movsxd   rdi, esi
        and      rdi, -2
        add      rdi, rax
        jmp      SHORT G_M29265_IG18
 						;; size=46 bbWeight=0.50 PerfScore 5.12
 G_M29265_IG11:
        lea      rdi, [rax+0x08]
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M29265_IG12:
        test     sil, 2
        je       SHORT G_M29265_IG13
        mov      ecx, dword ptr [rdi]
        test     ecx, 0xD1FFAB1E
        jne      SHORT G_M29265_IG15
        add      rdi, 4
 						;; size=20 bbWeight=0.50 PerfScore 2.38
 G_M29265_IG13:
        test     sil, 1
        je       SHORT G_M29265_IG18
        cmp      word  ptr [rdi], 127
        ja       SHORT G_M29265_IG18
 						;; size=12 bbWeight=0.50 PerfScore 2.62
 G_M29265_IG14:
        add      rdi, 2
        jmp      SHORT G_M29265_IG18
 						;; size=6 bbWeight=0.50 PerfScore 1.12
 G_M29265_IG15:
        test     ecx, 0xFF80
        je       SHORT G_M29265_IG14
        jmp      SHORT G_M29265_IG18
 						;; size=10 bbWeight=0.50 PerfScore 1.62
 G_M29265_IG16:
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        jne      SHORT G_M29265_IG17
        add      rdi, 16
        vmovaps  xmm0, xmm2
 						;; size=15 bbWeight=0.50 PerfScore 2.25
 G_M29265_IG17:
        vpaddusw xmm0, xmm0, xmmword ptr [reloc @RWD16]
        vpmovmskb ecx, xmm0
        and      ecx, 0xAAAA
        tzcnt    ecx, ecx
        lea      rdi, [rdi+rcx-0x01]
 						;; size=27 bbWeight=0.50 PerfScore 3.62
 G_M29265_IG18:
        mov      rcx, rdi
        sub      rcx, rax
        mov      rax, rcx
        shr      rax, 1
 						;; size=12 bbWeight=0.50 PerfScore 0.62
 G_M29265_IG19:
        pop      rbp
        ret      
 						;; size=2 bbWeight=0.50 PerfScore 0.75
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h
 RWD16  	dq	7F807F807F807F80h, 7F807F807F807F80h
 
 
-; Total bytes of code 340, prolog size 4, PerfScore 99.21, instruction count 94, allocated bytes for code 340 (MethodHash=6c288dae) for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Intrinsified(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 347, prolog size 4, PerfScore 100.88, instruction count 95, allocated bytes for code 347 (MethodHash=6c288dae) for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Intrinsified(ulong,ulong):ulong (FullOpts)
7 (3.85 % of base) - System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_256(ulong,ulong,ulong):ulong
 ; Assembly listing for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_256(ulong,ulong,ulong):ulong (FullOpts)
 ;
 ; Review notes on this diff ('-' lines = base JIT output, '+' lines = diff JIT output):
 ;   * Outside the loop, each vptest only has its two operands swapped. Every one
 ;     of them is followed by je/jne, which read ZF only, and
 ;     ZF = ((op1 AND op2) == 0) does not depend on operand order, so the branch
 ;     outcomes are unchanged.
 ;   * In the IG07 loop, "vpor + vptest against the mask" becomes
 ;     "vmovaps + vpternlogd + vptest of the result with itself". The extra
 ;     register copy goes to the new rat0 temp (mm3). This is the whole size
 ;     regression: IG07 grows 46 -> 53 bytes and the method 182 -> 189 bytes.
 ;
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 4 single block inlinees; 8 inlinees without PGO data
+; 0 inlinees with PGO data; 8 single block inlinees; 4 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T04] (  3,  3   )    long  ->  rdi         single-def
 ;  V01 arg1         [V01,T03] (  5,  3.50)    long  ->  rsi         single-def
 ;  V02 arg2         [V02,T05] (  3,  2.50)    long  ->  rdx         single-def
 ;  V03 loc0         [V03,T01] (  5, 10.50)   byref  ->  rdi         single-def
-;  V04 loc1         [V04,T07] ( 14, 18.50)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;  V04 loc1         [V04,T08] ( 14, 18.50)  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ushort]>
 ;  V05 loc2         [V05,T02] (  5,  6   )   byref  ->  rcx         single-def
 ;  V06 loc3         [V06,T00] ( 12, 27   )    long  ->  rax        
 ;  V07 loc4         [V07,T06] (  2,  4.50)    long  ->  rdx        
-;  V08 loc5         [V08,T09] (  3, 12   )  simd32  ->  mm2         <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;  V08 loc5         [V08,T10] (  3, 12   )  simd32  ->  mm2         <System.Runtime.Intrinsics.Vector256`1[ushort]>
 ;# V09 OutArgs      [V09    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V10 tmp1         [V10    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;* V11 tmp2         [V11    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
-;  V12 tmp3         [V12,T08] (  2, 16   )  simd32  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
+;  V12 tmp3         [V12,T09] (  2, 16   )  simd32  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
 ;* V13 tmp4         [V13    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
-;* V14 tmp5         [V14    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V15 tmp6         [V15    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V14 tmp5         [V14    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;* V15 tmp6         [V15    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V16 tmp7         [V16    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V17 tmp8         [V17    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V18 tmp9         [V18    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V19 tmp10        [V19    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V18 tmp9         [V18    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V19 tmp10        [V19    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V20 tmp11        [V20    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V21 tmp12        [V21    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V22 tmp13        [V22    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V23 tmp14        [V23    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V24 tmp15        [V24    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V25 tmp16        [V25    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V26 tmp17        [V26    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V27 tmp18        [V27    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V28 tmp19        [V28    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V29 tmp20        [V29    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;  V30 cse0         [V30,T10] (  5,  7   )  simd32  ->  mm1         "CSE #01: moderate"
+;  V22 cse0         [V22,T11] (  5,  7   )  simd32  ->  mm1         "CSE #01: moderate"
+;  V23 rat0         [V23,T07] (  3, 24   )  simd32  ->  mm3         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M60588_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M60588_IG02:
        vmovups  ymm0, ymmword ptr [rdi]
        vmovups  ymm1, ymmword ptr [reloc @RWD00]      ; ymm1 = RWD00 mask: 0xFF80 in every 16-bit lane (bits 7..15)
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        je       SHORT G_M60588_IG05      ; ZF=1: no masked bit set in the first 32 bytes at [rdi]
 						;; size=19 bbWeight=1 PerfScore 15.00
 G_M60588_IG03:
        xor      eax, eax      ; rax = 0, then fall through to the IG04 return
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M60588_IG04:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M60588_IG05:
        mov      rcx, rsi
        ; vpackuswb packs words to bytes within each 128-bit lane; vpermq -40 (0xD8)
        ; picks qwords 0,2,1,3 so the low 128 bits hold all 16 packed bytes in order.
        vpackuswb ymm0, ymm0, ymm0
        vpermq   ymm0, ymm0, -40
        vmovups  xmmword ptr [rcx], xmm0
        mov      eax, 16
        test     sil, 16      ; bit 4 of the arg1 address decides whether a second 16-byte store is attempted
        jne      SHORT G_M60588_IG06
        vmovups  ymm0, ymmword ptr [rdi+0x20]
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        jne      SHORT G_M60588_IG08      ; masked bit found: return with rax = 16
        vpackuswb ymm0, ymm0, ymm0
        vpermq   ymm0, ymm0, -40
        vmovups  xmmword ptr [rcx+0x10], xmm0
 						;; size=55 bbWeight=0.50 PerfScore 11.38
 G_M60588_IG06:
        ; rax = 32 - (arg1 & 31); rdx = arg2 - 32 becomes the loop bound.
        and      rsi, 31
        mov      rax, rsi
        neg      rax
        add      rax, 32
        add      rdx, -32
        align    [0 bytes for IG07]
 						;; size=18 bbWeight=0.50 PerfScore 0.62
 G_M60588_IG07:
        ; Hot loop: reads 2 x 32 bytes from [rdi+2*rax], writes 32 bytes to [rcx+rax].
        vmovups  ymm0, ymmword ptr [rdi+2*rax]
        vmovups  ymm2, ymmword ptr [rdi+2*rax+0x20]
-       vpor     ymm3, ymm0, ymm2
-       vptest   ymm3, ymm1
+       vmovaps  ymm3, ymm0      ; extra copy: vpternlogd overwrites its first operand
+       vpternlogd ymm3, ymm2, ymm1, -88      ; imm -88 = 0xA8: ymm3 = (ymm3 | ymm2) & ymm1
+       vptest   ymm3, ymm3      ; ZF = (ymm3 == 0), same condition the base vpor/vptest pair tested
        jne      SHORT G_M60588_IG09
        vpackuswb ymm0, ymm0, ymm2
        vpermq   ymm0, ymm0, -40
        vmovups  ymmword ptr [rcx+rax], ymm0
        add      rax, 32
        cmp      rax, rdx
        jbe      SHORT G_M60588_IG07      ; unsigned compare: continue while rax <= rdx
-						;; size=46 bbWeight=4 PerfScore 91.33
+						;; size=53 bbWeight=4 PerfScore 93.00
 G_M60588_IG08:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M60588_IG09:
        ; Loop exit when the combined test failed: re-test only the first vector (ymm0).
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        jne      SHORT G_M60588_IG08
        vpackuswb ymm0, ymm0, ymm0
        vpermq   ymm0, ymm0, -40
        vmovups  xmmword ptr [rcx+rax], xmm0
        add      rax, 16
        jmp      SHORT G_M60588_IG08
 						;; size=28 bbWeight=0.50 PerfScore 6.62
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 182, prolog size 4, PerfScore 128.83, instruction count 50, allocated bytes for code 182 (MethodHash=910c1353) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_256(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 189, prolog size 4, PerfScore 130.50, instruction count 51, allocated bytes for code 189 (MethodHash=910c1353) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_256(ulong,ulong,ulong):ulong (FullOpts)
7 (4.55 % of base) - System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified(ulong,ulong,ulong):ulong
 ; Assembly listing for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified(ulong,ulong,ulong):ulong (FullOpts)
 ;
 ; Review notes on this diff ('-' lines = base JIT output, '+' lines = diff JIT output):
 ;   * Outside the loop, each vptest only has its operands swapped; every one is
 ;     followed by je/jne (ZF only), and ZF = ((op1 AND op2) == 0) is
 ;     order-independent, so control flow is unchanged.
 ;   * In the IG07 loop, vpor + vptest becomes vmovaps + vpternlogd + vptest,
 ;     using the new rat0 temp (mm3). IG07 grows 43 -> 50 bytes and the method
 ;     154 -> 161 bytes; nothing else changes size.
 ;
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 3 single block inlinees; 8 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T05] (  3,  3   )    long  ->  rdi         single-def
 ;  V01 arg1         [V01,T04] (  5,  3.50)    long  ->  rsi         single-def
 ;  V02 arg2         [V02,T06] (  3,  2.50)    long  ->  rdx         single-def
 ;* V03 loc0         [V03,T08] (  0,  0   )     int  ->  zero-ref   
 ;* V04 loc1         [V04    ] (  0,  0   )    long  ->  zero-ref   
 ;  V05 loc2         [V05,T01] (  5, 10.50)   byref  ->  rdi         single-def
-;  V06 loc3         [V06,T09] ( 14, 18.50)  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V06 loc3         [V06,T10] ( 14, 18.50)  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;  V07 loc4         [V07,T03] (  5,  6   )   byref  ->  rcx         single-def
 ;  V08 loc5         [V08,T00] ( 11, 26.50)    long  ->  rax        
 ;  V09 loc6         [V09,T07] (  2,  4.50)    long  ->  rdx        
-;  V10 loc7         [V10,T11] (  3, 12   )  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V10 loc7         [V10,T12] (  3, 12   )  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;# V11 OutArgs      [V11    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V12 tmp1         [V12    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
 ;* V13 tmp2         [V13    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
-;  V14 tmp3         [V14,T10] (  2, 16   )  simd16  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
+;  V14 tmp3         [V14,T11] (  2, 16   )  simd16  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
 ;* V15 tmp4         [V15    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
 ;* V16 tmp5         [V16    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V17 tmp6         [V17    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V18 tmp7         [V18    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V19 tmp8         [V19    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V20 tmp9         [V20    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V21 tmp10        [V21    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V17 tmp6         [V17    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V18 tmp7         [V18    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V19 tmp8         [V19    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V20 tmp9         [V20    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V21 tmp10        [V21    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;* V22 tmp11        [V22    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V23 tmp12        [V23    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V24 tmp13        [V24    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V25 tmp14        [V25    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V26 tmp15        [V26    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V27 tmp16        [V27    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;  V28 cse0         [V28,T02] (  3,  8.50)    long  ->  rsi         "CSE #02: aggressive"
-;  V29 cse1         [V29,T12] (  5,  7   )  simd16  ->  mm1         "CSE #01: aggressive"
+;* V23 tmp12        [V23    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;  V24 cse0         [V24,T02] (  3,  8.50)    long  ->  rsi         "CSE #02: aggressive"
+;  V25 cse1         [V25,T13] (  5,  7   )  simd16  ->  mm1         "CSE #01: aggressive"
+;  V26 rat0         [V26,T09] (  3, 24   )  simd16  ->  mm3         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M11650_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M11650_IG02:
        vmovups  xmm0, xmmword ptr [rdi]
        vmovups  xmm1, xmmword ptr [reloc @RWD00]      ; xmm1 = RWD00 mask: 0xFF80 in every 16-bit lane (bits 7..15)
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        je       SHORT G_M11650_IG05      ; ZF=1: no masked bit set in the first 16 bytes at [rdi]
 						;; size=19 bbWeight=1 PerfScore 11.00
 G_M11650_IG03:
        xor      eax, eax      ; rax = 0, then fall through to the IG04 return
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M11650_IG04:
        pop      rbp
        ret      
 						;; size=2 bbWeight=0.50 PerfScore 0.75
 G_M11650_IG05:
        mov      rcx, rsi
        vpackuswb xmm0, xmm0, xmm0      ; pack 8 words to bytes; low qword holds the 8 packed bytes
        vmovsd   qword ptr [rcx], xmm0      ; 8-byte store
        mov      eax, 8
        test     sil, 8      ; bit 3 of the arg1 address decides whether a second 8-byte store is attempted
        jne      SHORT G_M11650_IG06
        vmovups  xmm0, xmmword ptr [rdi+0x10]
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        jne      SHORT G_M11650_IG08      ; masked bit found: return with rax = 8
        vpackuswb xmm0, xmm0, xmm0
        vmovsd   qword ptr [rcx+0x08], xmm0
 						;; size=43 bbWeight=0.50 PerfScore 7.88
 G_M11650_IG06:
        ; rax = 16 - (arg1 & 15); rdx = arg2 - 16 becomes the loop bound.
        and      rsi, 15
        mov      rax, rsi
        neg      rax
        add      rax, 16
        add      rdx, -16
        align    [0 bytes for IG07]
 						;; size=18 bbWeight=0.50 PerfScore 0.62
 G_M11650_IG07:
        ; Hot loop: reads 2 x 16 bytes from [rdi+2*rax], writes 16 bytes to [rcx+rax].
        vmovups  xmm0, xmmword ptr [rdi+2*rax]
        lea      rsi, [rax+0x08]      ; rsi = rax + 8 (CSE #02), reused by IG09 as the updated index
        vmovups  xmm2, xmmword ptr [rdi+2*rsi]
-       vpor     xmm3, xmm0, xmm2
-       vptest   xmm3, xmm1
+       vmovaps  xmm3, xmm0      ; extra copy: vpternlogd overwrites its first operand
+       vpternlogd xmm3, xmm2, xmm1, -88      ; imm -88 = 0xA8: xmm3 = (xmm3 | xmm2) & xmm1
+       vptest   xmm3, xmm3      ; ZF = (xmm3 == 0), same condition the base vpor/vptest pair tested
        jne      SHORT G_M11650_IG09
        vpackuswb xmm0, xmm0, xmm2
        vmovups  xmmword ptr [rcx+rax], xmm0
        add      rax, 16
        cmp      rax, rdx
        jbe      SHORT G_M11650_IG07      ; unsigned compare: continue while rax <= rdx
-						;; size=43 bbWeight=4 PerfScore 69.33
+						;; size=50 bbWeight=4 PerfScore 71.00
 G_M11650_IG08:
        pop      rbp
        ret      
 						;; size=2 bbWeight=0.50 PerfScore 0.75
 G_M11650_IG09:
        ; Loop exit when the combined test failed: re-test only the first vector (xmm0).
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        jne      SHORT G_M11650_IG08
        vpackuswb xmm0, xmm0, xmm0
        vmovsd   qword ptr [rcx+rax], xmm0
        mov      rax, rsi      ; rax += 8 via the value computed in IG07
        jmp      SHORT G_M11650_IG08
 						;; size=21 bbWeight=0.50 PerfScore 4.62
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 154, prolog size 4, PerfScore 96.33, instruction count 45, allocated bytes for code 154 (MethodHash=8c3ed27d) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 161, prolog size 4, PerfScore 98.00, instruction count 46, allocated bytes for code 161 (MethodHash=8c3ed27d) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified(ulong,ulong,ulong):ulong (FullOpts)
7 (1.77 % of base) - System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong
 ; Assembly listing for method System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong (FullOpts)
 ;
 ; Review notes on this diff ('-' lines = base JIT output, '+' lines = diff JIT output):
 ;   * The only instruction change is in the IG06 loop: vpor + vptest against the
 ;     mask becomes vmovaps + vpternlogd + vptest of the result with itself,
 ;     using the new rat0 temp (mm4). IG06 grows 33 -> 40 bytes and the method
 ;     395 -> 402 bytes.
 ;   * The single-vector vptest instructions in IG07, IG09 and IG16 are identical
 ;     in both versions.
 ;   * NOTE(review): the repeated 0xD1FFAB1E immediates look like the JIT's
 ;     diffable-disassembly placeholder rather than real constants - confirm
 ;     before reasoning about the scalar paths in IG10..IG14.
 ;
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 2 single block inlinees; 0 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] ( 32, 34.50)    long  ->  rbx        
 ;  V01 arg1         [V01,T01] ( 17, 10   )    long  ->  rsi        
 ;* V02 loc0         [V02,T08] (  0,  0   )     int  ->  zero-ref   
 ;* V03 loc1         [V03,T09] (  0,  0   )     int  ->  zero-ref   
-;  V04 loc2         [V04,T10] (  9, 11.50)  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V05 loc3         [V05,T11] (  3,  8.50)  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V04 loc2         [V04,T11] (  9, 11.50)  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V05 loc3         [V05,T12] (  3,  8.50)  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;  V06 loc4         [V06,T04] (  4,  2   )     int  ->  r14        
 ;  V07 loc5         [V07,T03] (  8,  4   )    long  ->  r15        
-;  V08 loc6         [V08,T12] (  5,  6   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V09 loc7         [V09,T13] (  3,  1.50)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V08 loc6         [V08,T13] (  5,  6   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V09 loc7         [V09,T14] (  3,  1.50)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;  V10 loc8         [V10,T05] (  3,  1.50)     int  ->  rdi        
 ;  V11 loc9         [V11,T02] (  2,  4.50)    long  ->  rdi        
 ;* V12 loc10        [V12    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;  V13 loc11        [V13,T07] (  2,  1   )    long  ->  rdi        
 ;* V14 loc12        [V14    ] (  0,  0   )     int  ->  zero-ref   
 ;# V15 OutArgs      [V15    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;  V16 cse0         [V16,T06] (  3,  1.50)    long  ->  rdi         "CSE #01: moderate"
+;  V17 rat0         [V17,T10] (  3, 24   )  simd16  ->  mm4         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 8
 
 G_M38868_IG01:
        push     rbp
        push     r15
        push     r14
        push     rbx
        push     rax
        lea      rbp, [rsp+0x20]
        mov      rbx, rdi
 						;; size=15 bbWeight=1 PerfScore 5.75
 G_M38868_IG02:
        test     rsi, rsi
        jne      SHORT G_M38868_IG05      ; arg1 == 0 falls through and returns 0
 						;; size=5 bbWeight=1 PerfScore 1.25
 G_M38868_IG03:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M38868_IG04:
        add      rsp, 8
        pop      rbx
        pop      r14
        pop      r15
        pop      rbp
        ret      
 						;; size=11 bbWeight=0.50 PerfScore 1.62
 G_M38868_IG05:
        mov      r15, rbx      ; r15 keeps the original arg0 pointer for the final subtraction in IG19
        cmp      rsi, 8
        jb       G_M38868_IG10      ; fewer than 8 elements: scalar paths
        vmovups  xmm0, xmmword ptr [reloc @RWD00]      ; xmm0 = 0xFF00 in every 16-bit lane (bits 8..15)
        vmovups  xmm1, xmmword ptr [reloc @RWD16]      ; xmm1 = 0x7F00 in every 16-bit lane
        ; Saturating add of 0x7F00 sets bit 15 of a word exactly when the word is
        ; >= 0x0100; vpmovmskb then exposes it as the odd bits selected by 0xAAAA.
        vpaddusw xmm2, xmm1, xmmword ptr [r15]
        vpmovmskb r14d, xmm2
        test     r14d, 0xAAAA
        jne      G_M38868_IG18
        add      rsi, rsi      ; rsi doubled: element count -> byte count
        cmp      rsi, 32
        jb       SHORT G_M38868_IG08
        lea      rbx, [r15+0x10]
        and      rbx, -16      ; round up to the next 16-byte boundary, enabling vmovdqa below
        add      rsi, r15
        sub      rsi, rbx
        cmp      rsi, 32
        jb       SHORT G_M38868_IG07
        lea      rdi, [rbx+rsi-0x20]
        align    [0 bytes for IG06]
 						;; size=85 bbWeight=0.50 PerfScore 9.38
 G_M38868_IG06:
        ; Hot loop: tests 2 x 16 aligned bytes per iteration, advancing rbx by 32.
        vmovdqa  xmm2, xmmword ptr [rbx]
        vmovdqa  xmm3, xmmword ptr [rbx+0x10]
-       vpor     xmm4, xmm2, xmm3
-       vptest   xmm4, xmm0
+       vmovaps  xmm4, xmm2      ; extra copy: vpternlogd overwrites its first operand
+       vpternlogd xmm4, xmm3, xmm0, -88      ; imm -88 = 0xA8: xmm4 = (xmm4 | xmm3) & xmm0
+       vptest   xmm4, xmm4      ; ZF = (xmm4 == 0), same condition the base vpor/vptest pair tested
        jne      G_M38868_IG16
        add      rbx, 32
        cmp      rbx, rdi
        jbe      SHORT G_M38868_IG06      ; unsigned compare: continue while rbx <= rdi
-						;; size=33 bbWeight=4 PerfScore 55.33
+						;; size=40 bbWeight=4 PerfScore 57.00
 G_M38868_IG07:
        test     sil, 16
        je       SHORT G_M38868_IG09
        vmovdqa  xmm2, xmmword ptr [rbx]
        vptest   xmm2, xmm0
        jne      G_M38868_IG17
 						;; size=21 bbWeight=0.50 PerfScore 4.62
 G_M38868_IG08:
        add      rbx, 16
 						;; size=4 bbWeight=0.50 PerfScore 0.12
 G_M38868_IG09:
        movzx    rdi, sil
        test     dil, 15
        je       G_M38868_IG19
        ; Tail: re-test the last 16 bytes, ending at rbx + (rsi & 15), with an unaligned load.
        and      rsi, 15
        add      rsi, rbx
        mov      rbx, rsi
        sub      rbx, 16
        vmovups  xmm2, xmmword ptr [rbx]
        vptest   xmm2, xmm0
        jne      G_M38868_IG17
        add      rbx, 16
        jmp      G_M38868_IG19
 						;; size=52 bbWeight=0.50 PerfScore 6.38
 G_M38868_IG10:
        test     sil, 4
        je       SHORT G_M38868_IG12
        mov      rdi, qword ptr [r15]
        mov      rax, 0xD1FFAB1E
        and      rdi, rax
        je       SHORT G_M38868_IG11
        xor      ebx, ebx
        tzcnt    rbx, rdi
        shr      rbx, 3
        and      rbx, -2
        add      rbx, r15
        jmp      SHORT G_M38868_IG19
 						;; size=44 bbWeight=0.50 PerfScore 5.00
 G_M38868_IG11:
        lea      rbx, [r15+0x08]
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M38868_IG12:
        test     sil, 2
        je       SHORT G_M38868_IG13
        mov      edi, dword ptr [rbx]
        test     edi, 0xD1FFAB1E
        jne      SHORT G_M38868_IG14
        add      rbx, 4
 						;; size=20 bbWeight=0.50 PerfScore 2.38
 G_M38868_IG13:
        test     sil, 1
        je       SHORT G_M38868_IG19
        cmp      word  ptr [rbx], 255
        ja       SHORT G_M38868_IG19
        jmp      SHORT G_M38868_IG15
 						;; size=15 bbWeight=0.50 PerfScore 3.62
 G_M38868_IG14:
        mov      rax, 0xD1FFAB1E      ; code for System.Text.Latin1Utility:FirstCharInUInt32IsLatin1(uint):ubyte
        call     [rax]System.Text.Latin1Utility:FirstCharInUInt32IsLatin1(uint):ubyte
        test     eax, eax
        je       SHORT G_M38868_IG19
 						;; size=16 bbWeight=0.50 PerfScore 2.25
 G_M38868_IG15:
        add      rbx, 2
        jmp      SHORT G_M38868_IG19
 						;; size=6 bbWeight=0.50 PerfScore 1.12
 G_M38868_IG16:
        ; Loop exit when the combined test failed: decide which of the two vectors matched.
        vptest   xmm2, xmm0
        jne      SHORT G_M38868_IG17
        add      rbx, 16
        vmovaps  xmm2, xmm3
 						;; size=15 bbWeight=0.50 PerfScore 2.25
 G_M38868_IG17:
        vpaddusw xmm0, xmm2, xmm1
        vpmovmskb r14d, xmm0
 						;; size=8 bbWeight=0.50 PerfScore 1.17
 G_M38868_IG18:
        ; The first set odd bit marks the high byte of the first matching word;
        ; subtracting 1 gives that word's byte offset from rbx.
        and      r14d, 0xAAAA
        xor      eax, eax
        tzcnt    eax, r14d
        lea      rbx, [rbx+rax-0x01]
 						;; size=19 bbWeight=0.50 PerfScore 1.75
 G_M38868_IG19:
        ; Return value: (rbx - r15) >> 1, i.e. the byte distance halved.
        mov      rax, rbx
        sub      rax, r15
        shr      rax, 1
 						;; size=9 bbWeight=0.50 PerfScore 0.50
 G_M38868_IG20:
        add      rsp, 8
        pop      rbx
        pop      r14
        pop      r15
        pop      rbp
        ret      
 						;; size=11 bbWeight=0.50 PerfScore 1.62
 RWD00  	dq	FF00FF00FF00FF00h, FF00FF00FF00FF00h
 RWD16  	dq	7F007F007F007F00h, 7F007F007F007F00h
 
 
-; Total bytes of code 395, prolog size 15, PerfScore 106.50, instruction count 111, allocated bytes for code 395 (MethodHash=0f68682b) for method System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 402, prolog size 15, PerfScore 108.17, instruction count 112, allocated bytes for code 402 (MethodHash=0f68682b) for method System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong (FullOpts)
7 (4.70 % of base) - System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong
 ; Assembly listing for method System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong (FullOpts)
 ;
 ; Review notes on this diff ('-' lines = base JIT output, '+' lines = diff JIT output):
 ;   * The only instruction change is in the IG07 loop: vpor + vptest against the
 ;     mask becomes vmovaps + vpternlogd + vptest of the result with itself,
 ;     using the new rat0 temp (mm3). IG07 grows 40 -> 47 bytes and the method
 ;     149 -> 156 bytes.
 ;   * The single-vector vptest instructions (IG02, IG05, IG09) are identical in
 ;     both versions.
 ;
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T01] (  6, 11.50)    long  ->  rdi         single-def
 ;  V01 arg1         [V01,T02] (  8,  8.50)    long  ->  rsi         single-def
 ;  V02 arg2         [V02,T03] (  3,  2.50)    long  ->  rdx         single-def
 ;* V03 loc0         [V03,T05] (  0,  0   )     int  ->  zero-ref   
 ;* V04 loc1         [V04    ] (  0,  0   )    long  ->  zero-ref   
-;  V05 loc2         [V05,T08] (  5,  7   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[short]>
+;  V05 loc2         [V05,T09] (  5,  7   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V06 loc3         [V06    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V07 loc4         [V07,T06] ( 14, 18.50)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[short]>
+;  V07 loc4         [V07,T07] ( 14, 18.50)  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V08 loc5         [V08    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V09 loc6         [V09,T00] ( 12, 27   )    long  ->  rax        
 ;  V10 loc7         [V10,T04] (  2,  4.50)    long  ->  rdx        
-;  V11 loc8         [V11,T07] (  3, 12   )  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[short]>
+;  V11 loc8         [V11,T08] (  3, 12   )  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V12 loc9         [V12    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;# V13 OutArgs      [V13    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+;  V14 rat0         [V14,T06] (  3, 24   )  simd16  ->  mm3         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M23879_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M23879_IG02:
        vmovups  xmm0, xmmword ptr [reloc @RWD00]      ; xmm0 = RWD00 mask: 0xFF00 in every 16-bit lane (bits 8..15)
        vmovups  xmm1, xmmword ptr [rdi]
        vptest   xmm1, xmm0
        je       SHORT G_M23879_IG05      ; ZF=1: no masked bit set in the first 16 bytes at [rdi]
 						;; size=19 bbWeight=1 PerfScore 11.00
 G_M23879_IG03:
        xor      eax, eax      ; rax = 0, then fall through to the IG04 return
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M23879_IG04:
        pop      rbp
        ret      
 						;; size=2 bbWeight=0.50 PerfScore 0.75
 G_M23879_IG05:
        vpackuswb xmm1, xmm1, xmm1      ; pack 8 words to bytes; low qword holds the 8 packed bytes
        vmovq    qword ptr [rsi], xmm1      ; 8-byte store
        mov      eax, 8
        test     sil, 8      ; bit 3 of the arg1 address decides whether a second 8-byte store is attempted
        jne      SHORT G_M23879_IG06
        vmovups  xmm1, xmmword ptr [rdi+0x10]
        vptest   xmm1, xmm0
        jne      SHORT G_M23879_IG08      ; masked bit found: return with rax = 8
        vpackuswb xmm1, xmm1, xmm1
        vmovq    qword ptr [rsi+0x08], xmm1
 						;; size=40 bbWeight=0.50 PerfScore 7.75
 G_M23879_IG06:
        ; rax = 16 - (arg1 & 15), which makes rsi + rax 16-byte aligned for the
        ; vmovdqa store in the loop; rdx = arg2 - 16 becomes the loop bound.
        mov      rax, rsi
        and      rax, 15
        neg      rax
        add      rax, 16
        add      rdx, -16
        align    [0 bytes for IG07]
 						;; size=18 bbWeight=0.50 PerfScore 0.62
 G_M23879_IG07:
        ; Hot loop: reads 2 x 16 bytes from [rdi+2*rax], writes 16 bytes to [rsi+rax].
        vmovups  xmm1, xmmword ptr [rdi+2*rax]
        vmovups  xmm2, xmmword ptr [rdi+2*rax+0x10]
-       vpor     xmm3, xmm1, xmm2
-       vptest   xmm3, xmm0
+       vmovaps  xmm3, xmm1      ; extra copy: vpternlogd overwrites its first operand
+       vpternlogd xmm3, xmm2, xmm0, -88      ; imm -88 = 0xA8: xmm3 = (xmm3 | xmm2) & xmm0
+       vptest   xmm3, xmm3      ; ZF = (xmm3 == 0), same condition the base vpor/vptest pair tested
        jne      SHORT G_M23879_IG09
        vpackuswb xmm1, xmm1, xmm2
        vmovdqa  xmmword ptr [rsi+rax], xmm1
        add      rax, 16
        cmp      rax, rdx
        jbe      SHORT G_M23879_IG07      ; unsigned compare: continue while rax <= rdx
-						;; size=40 bbWeight=4 PerfScore 67.33
+						;; size=47 bbWeight=4 PerfScore 69.00
 G_M23879_IG08:
        pop      rbp
        ret      
 						;; size=2 bbWeight=0.50 PerfScore 0.75
 G_M23879_IG09:
        ; Loop exit when the combined test failed: re-test only the first vector (xmm1).
        vptest   xmm1, xmm0
        jne      SHORT G_M23879_IG08
        vpackuswb xmm0, xmm1, xmm1
        vmovq    qword ptr [rsi+rax], xmm0
        add      rax, 8
        jmp      SHORT G_M23879_IG08
 						;; size=22 bbWeight=0.50 PerfScore 4.62
 RWD00  	dq	FF00FF00FF00FF00h, FF00FF00FF00FF00h
 
 
-; Total bytes of code 149, prolog size 4, PerfScore 94.21, instruction count 43, allocated bytes for code 149 (MethodHash=f65ba2b8) for method System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 156, prolog size 4, PerfScore 95.88, instruction count 44, allocated bytes for code 156 (MethodHash=f65ba2b8) for method System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong (FullOpts)
4 (1.08 % of base) - System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong
 ; Assembly listing for method System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong (FullOpts)
 ;
 ; Review notes on this diff ('-' lines = base JIT output, '+' lines = diff JIT output):
 ;   * In the IG08 loop, the single "vptest xmm0, [rdi]" with a memory operand is
 ;     split into "vpand xmm1, xmm0, [rdi]" + "vptest xmm1, xmm1", using the new
 ;     rat0 temp (mm1). IG08 grows 16 -> 20 bytes; its PerfScore stays 34.00.
 ;   * The same test outside the loop (IG07) still uses the memory-operand vptest.
 ;   * In IG05, 4 bytes of NOP compensation are added while the alignment
 ;     padding for IG06 drops 6 -> 2 bytes, so IG05 stays at 37 bytes.
 ;   * Net effect: method size 371 -> 375 bytes, instruction count 104 -> 105.
 ;   * NOTE(review): the 0xD1FFAB1E immediates in IG11..IG16 look like the JIT's
 ;     diffable-disassembly placeholder rather than real constants - confirm.
 ;
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 8 single block inlinees; 2 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] ( 38, 77   )    long  ->  rdi        
 ;  V01 arg1         [V01,T01] ( 17, 21   )    long  ->  rsi        
 ;  V02 loc0         [V02,T04] ( 12,  7   )    long  ->  rax        
 ;  V03 loc1         [V03,T02] (  9, 11.50)     int  ->  rcx        
 ;  V04 loc2         [V04,T05] (  2,  4.50)    long  ->  rcx        
 ;  V05 loc3         [V05,T06] (  2,  4.50)    long  ->  rcx        
 ;  V06 loc4         [V06,T07] (  2,  4.50)    long  ->  rcx        
 ;  V07 loc5         [V07,T03] (  3,  8.50)     int  ->  rdx        
 ;# V08 OutArgs      [V08    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V09 tmp1         [V09    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
 ;* V10 tmp2         [V10    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;* V11 tmp3         [V11    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
 ;* V12 tmp4         [V12    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V13 tmp5         [V13    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V14 tmp6         [V14    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V15 tmp7         [V15    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
-;  V16 cse0         [V16,T08] (  3,  5   )  simd16  ->  mm0         "CSE #02: aggressive"
+;  V16 cse0         [V16,T09] (  3,  5   )  simd16  ->  mm0         "CSE #02: aggressive"
+;  V17 rat0         [V17,T08] (  3, 24   )  simd16  ->  mm1         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M50024_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M50024_IG02:
        mov      rax, rdi      ; rax keeps the original arg0 pointer for the final subtraction in IG18
        cmp      rsi, 128
        jb       SHORT G_M50024_IG05      ; arg1 < 128: skip the 64-byte (zmm) path
 						;; size=12 bbWeight=1 PerfScore 1.50
 G_M50024_IG03:
        vmovups  zmm0, zmmword ptr [rax]
        vpmovb2m k1, zmm0      ; k1 = top bit of each of the 64 bytes
        kmovq    rcx, k1
 		  ;; NOP compensation instructions of 3 bytes.
        test     rcx, rcx
        jne      G_M50024_IG10
        lea      rcx, [rax+rsi-0x40]
        lea      rdi, [rax+0x40]
        and      rdi, -64      ; round up to the next 64-byte boundary for the aligned loads in IG04
        align    [6 bytes for IG04]
 						;; size=48 bbWeight=0.50 PerfScore 5.62
 G_M50024_IG04:
        vmovdqa32 zmm0, zmmword ptr [rdi]
        vpmovb2m k1, zmm0
        kmovq    rdx, k1
 		  ;; NOP compensation instructions of 3 bytes.
        test     rdx, rdx
        jne      G_M50024_IG09
        add      rdi, 64
        cmp      rdi, rcx
        jbe      SHORT G_M50024_IG04
        jmp      SHORT G_M50024_IG09
 		  ;; NOP compensation instructions of 3 bytes.
 						;; size=43 bbWeight=4 PerfScore 51.00
 G_M50024_IG05:
        cmp      rsi, 64
        jb       SHORT G_M50024_IG07      ; arg1 < 64: skip the 32-byte (ymm) path
        vmovups  ymm0, ymmword ptr [rax]
        vpmovmskb ecx, ymm0      ; ecx = top bit of each of the 32 bytes
        test     ecx, ecx
        jne      SHORT G_M50024_IG10
+		  ;; NOP compensation instructions of 4 bytes.
        lea      rcx, [rax+rsi-0x20]
        lea      rdi, [rax+0x20]
        and      rdi, -32      ; round up to the next 32-byte boundary for the aligned loads in IG06
-       align    [6 bytes for IG06]
+       align    [2 bytes for IG06]
 						;; size=37 bbWeight=0.50 PerfScore 6.25
 G_M50024_IG06:
        vmovdqa  ymm0, ymmword ptr [rdi]
        vpmovmskb edx, ymm0
        test     edx, edx
        jne      SHORT G_M50024_IG09
        add      rdi, 32
        cmp      rdi, rcx
        jbe      SHORT G_M50024_IG06
        jmp      SHORT G_M50024_IG09
 						;; size=23 bbWeight=4 PerfScore 51.00
 G_M50024_IG07:
        cmp      rsi, 32
        jb       SHORT G_M50024_IG10      ; arg1 < 32: skip the 16-byte (xmm) path
        vmovups  xmm0, xmmword ptr [reloc @RWD00]      ; xmm0 = 0x80 in every byte
        vptest   xmm0, xmmword ptr [rax]      ; unchanged by this diff: still the memory-operand form
        jne      SHORT G_M50024_IG10
        lea      rcx, [rax+rsi-0x10]
        lea      rdi, [rax+0x10]
        and      rdi, -16      ; round up to the next 16-byte boundary
        align    [0 bytes for IG08]
 						;; size=34 bbWeight=0.50 PerfScore 6.50
 G_M50024_IG08:
        ; Hot loop: both versions branch on ((xmm0 AND [rdi]) == 0); the diff
        ; materialises the AND into xmm1 first and then tests xmm1 against itself.
-       vptest   xmm0, xmmword ptr [rdi]
+       vpand    xmm1, xmm0, xmmword ptr [rdi]
+       vptest   xmm1, xmm1
        jne      SHORT G_M50024_IG09
        add      rdi, 16
        cmp      rdi, rcx
        jbe      SHORT G_M50024_IG08      ; unsigned compare: continue while rdi <= rcx
-						;; size=16 bbWeight=4 PerfScore 34.00
+						;; size=20 bbWeight=4 PerfScore 34.00
 G_M50024_IG09:
        ; rsi = rsi - (rdi - rax): subtract the distance already scanned.
        sub      rsi, rdi
        add      rsi, rax
 						;; size=6 bbWeight=0.50 PerfScore 0.25
 G_M50024_IG10:
        cmp      rsi, 8
        jb       SHORT G_M50024_IG15
        align    [0 bytes for IG11]
 						;; size=6 bbWeight=1 PerfScore 1.25
 G_M50024_IG11:
        ; Scalar loop: two dword loads per iteration, OR-ed together for one test.
        mov      ecx, dword ptr [rdi]
        mov      edx, dword ptr [rdi+0x04]
        mov      r8d, ecx
        or       r8d, edx
        test     r8d, 0xD1FFAB1E
        je       SHORT G_M50024_IG14
 						;; size=20 bbWeight=4 PerfScore 23.00
 G_M50024_IG12:
        test     ecx, 0xD1FFAB1E
        jne      SHORT G_M50024_IG13
        mov      ecx, edx
        add      rdi, 4
 						;; size=14 bbWeight=0.50 PerfScore 0.88
 G_M50024_IG13:
        ; rdi += tzcnt(masked ecx) >> 3, i.e. the bit index converted to a byte index.
        and      ecx, 0xD1FFAB1E
        xor      esi, esi
        tzcnt    esi, ecx
        shr      esi, 3
        mov      ecx, esi
        add      rdi, rcx
        jmp      SHORT G_M50024_IG18
 						;; size=22 bbWeight=0.50 PerfScore 2.75
 G_M50024_IG14:
        add      rdi, 8
        add      rsi, -8
        cmp      rsi, 8
        jae      SHORT G_M50024_IG11
 						;; size=14 bbWeight=4 PerfScore 7.00
 G_M50024_IG15:
        test     sil, 4
        je       SHORT G_M50024_IG16
        mov      ecx, dword ptr [rdi]
        test     ecx, 0xD1FFAB1E
        jne      SHORT G_M50024_IG13
        add      rdi, 4
 						;; size=20 bbWeight=0.50 PerfScore 2.38
 G_M50024_IG16:
        test     sil, 2
        je       SHORT G_M50024_IG17
        movzx    rcx, word  ptr [rdi]
        test     ecx, 0xD1FFAB1E
        jne      SHORT G_M50024_IG13
        add      rdi, 2
 						;; size=21 bbWeight=0.50 PerfScore 2.38
 G_M50024_IG17:
        test     sil, 1
        je       SHORT G_M50024_IG18
        lea      rcx, [rdi+0x01]
        cmp      byte  ptr [rdi], 0
        cmovge   rdi, rcx      ; signed >= 0 means the byte's top bit is clear: step past it
 						;; size=17 bbWeight=0.50 PerfScore 2.50
 G_M50024_IG18:
        ; Return value: rdi - rax, the byte distance from the original pointer.
        mov      rcx, rdi
        sub      rcx, rax
        mov      rax, rcx
 						;; size=9 bbWeight=1 PerfScore 0.75
 G_M50024_IG19:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=1 PerfScore 2.50
 RWD00  	dq	8080808080808080h, 8080808080808080h
 
 
-; Total bytes of code 371, prolog size 4, PerfScore 202.75, instruction count 104, allocated bytes for code 371 (MethodHash=58923c97) for method System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 375, prolog size 4, PerfScore 202.75, instruction count 105, allocated bytes for code 375 (MethodHash=58923c97) for method System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong (FullOpts)
4 (1.19 % of base) - System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong
 ; Assembly listing for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 8 single block inlinees; 5 inlinees without PGO data
+; 0 inlinees with PGO data; 10 single block inlinees; 3 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] ( 32, 74   )    long  ->  rdi        
 ;  V01 arg1         [V01,T01] ( 16, 20.50)    long  ->  rsi        
 ;  V02 loc0         [V02,T04] ( 12,  7   )    long  ->  rax        
 ;  V03 loc1         [V03,T02] (  7, 10.50)     int  ->  rcx        
 ;  V04 loc2         [V04,T05] (  2,  4.50)    long  ->  rcx        
 ;  V05 loc3         [V05,T06] (  2,  4.50)    long  ->  rcx        
 ;  V06 loc4         [V06,T07] (  2,  4.50)    long  ->  rcx        
 ;  V07 loc5         [V07,T03] (  3,  8.50)     int  ->  rdx        
 ;# V08 OutArgs      [V08    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V09 tmp1         [V09    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
 ;* V10 tmp2         [V10    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;* V11 tmp3         [V11    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
 ;* V12 tmp4         [V12    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
-;* V13 tmp5         [V13    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V14 tmp6         [V14    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V13 tmp5         [V13    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V14 tmp6         [V14    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V15 tmp7         [V15    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V16 tmp8         [V16    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V17 tmp9         [V17    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V18 tmp10        [V18    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V19 tmp11        [V19    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V20 tmp12        [V20    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V21 tmp13        [V21    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V22 tmp14        [V22    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V23 tmp15        [V23    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
-;* V24 tmp16        [V24,T08] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V25 cse0         [V25,T09] (  3,  5   )  simd64  ->  mm0         "CSE #01: aggressive"
-;  V26 cse1         [V26,T10] (  3,  5   )  simd32  ->  mm0         "CSE #04: aggressive"
-;  V27 cse2         [V27,T11] (  3,  5   )  simd16  ->  mm0         "CSE #05: aggressive"
+;* V16 tmp8         [V16    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V17 tmp9         [V17    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
+;* V18 tmp10        [V18,T08] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V19 cse0         [V19,T11] (  3,  5   )  simd64  ->  mm0         "CSE #01: aggressive"
+;  V20 cse1         [V20,T12] (  3,  5   )  simd32  ->  mm0         "CSE #04: aggressive"
+;  V21 cse2         [V21,T13] (  3,  5   )  simd16  ->  mm0         "CSE #05: aggressive"
+;  V22 rat0         [V22,T09] (  3, 24   )  simd32  ->  mm1         "ReplaceWithLclVar is creating a new local variable"
+;  V23 rat1         [V23,T10] (  3, 24   )  simd16  ->  mm1         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 ; ---------------------------------------------------------------------------
 ; Review notes for the G_M42618 body (GetIndexOfFirstNonAsciiChar_Vector).
 ; Register roles as used below: rdi = current read pointer (arg0), rsi =
 ; remaining count (arg1), rax = copy of the initial rdi. The value returned
 ; in rax is (rdi - initial rdi) >> 1.
 ; Lines prefixed '-' / '+' are the two sides of the diff being reported;
 ; unprefixed lines are common to both.
 ; ---------------------------------------------------------------------------
 G_M42618_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 ; IG02: remember the starting pointer in rax; counts below 64 skip the
 ; 64-byte (zmm) path.
 G_M42618_IG02:
        mov      rax, rdi
        cmp      rsi, 64
        jb       SHORT G_M42618_IG05
 						;; size=9 bbWeight=1 PerfScore 1.50
 ; IG03: test the first (unaligned) 64 bytes at [rax] against the RWD00 mask
 ; (0xFF80 in every 16-bit lane). Any hit goes to the scalar code at IG10 with
 ; rdi/rsi untouched. Otherwise rcx = rax + 2*rsi - 0x40 becomes the loop
 ; bound and rdi = (rax + 0x40) rounded down to a multiple of 64.
 G_M42618_IG03:
        vmovups  zmm0, zmmword ptr [reloc @RWD00]
        vptestmw k1, zmm0, zmmword ptr [rax]
        kortestd k1, k1
 		  ;; NOP compensation instructions of 3 bytes.
        jne      G_M42618_IG10
        lea      rcx, [rax+2*rsi-0x40]
        lea      rdi, [rax+0x40]
        and      rdi, -64
        align    [0 bytes for IG04]
 						;; size=43 bbWeight=0.50 PerfScore 6.38
 ; IG04: 64-byte loop. Leaves for IG09 on the first block with a mask hit, or
 ; once rdi has passed rcx.
 G_M42618_IG04:
        vptestmw k1, zmm0, zmmword ptr [rdi]
        kortestd k1, k1
 		  ;; NOP compensation instructions of 3 bytes.
        jne      G_M42618_IG09
        add      rdi, 64
        cmp      rdi, rcx
        jbe      SHORT G_M42618_IG04
        jmp      SHORT G_M42618_IG09
 		  ;; NOP compensation instructions of 3 bytes.
 						;; size=34 bbWeight=4 PerfScore 46.00
 ; IG05: same shape as IG03 with 32-byte (ymm) vectors; counts below 32 go to
 ; IG07. vptest sets ZF when (mask AND data) == 0, so jne means a mask bit
 ; was found.
 G_M42618_IG05:
        cmp      rsi, 32
        jb       SHORT G_M42618_IG07
        vmovups  ymm0, ymmword ptr [reloc @RWD00]
        vptest   ymm0, ymmword ptr [rax]
        jne      SHORT G_M42618_IG10
+		  ;; NOP compensation instructions of 4 bytes.
        lea      rcx, [rax+2*rsi-0x20]
        lea      rdi, [rax+0x20]
        and      rdi, -32
-       align    [4 bytes for IG06]
-						;; size=38 bbWeight=0.50 PerfScore 8.12
+       align    [0 bytes for IG06]
+						;; size=38 bbWeight=0.50 PerfScore 8.00
 ; IG06: 32-byte loop. This is one of the regressed sites: the '-' side tests
 ; memory directly with a single vptest, while the '+' side first ANDs into
 ; ymm1 and then tests ymm1 against itself (one extra instruction, block size
 ; 18 -> 22 bytes per the size lines below).
 ; NOTE(review): ymm1 presumably is the new "rat0" simd32 local listed in the
 ; variable table of this listing -- confirm against the JIT dump.
 G_M42618_IG06:
-       vptest   ymm0, ymmword ptr [rdi]
+       vpand    ymm1, ymm0, ymmword ptr [rdi]
+       vptest   ymm1, ymm1
        jne      SHORT G_M42618_IG09
        add      rdi, 32
        cmp      rdi, rcx
        jbe      SHORT G_M42618_IG06
        jmp      SHORT G_M42618_IG09
-						;; size=18 bbWeight=4 PerfScore 50.00
+						;; size=22 bbWeight=4 PerfScore 50.00
 ; IG07: same shape again with 16-byte (xmm) vectors; counts below 16 go
 ; straight to the scalar code at IG10.
 G_M42618_IG07:
        cmp      rsi, 16
        jb       SHORT G_M42618_IG10
        vmovups  xmm0, xmmword ptr [reloc @RWD00]
        vptest   xmm0, xmmword ptr [rax]
        jne      SHORT G_M42618_IG10
        lea      rcx, [rax+2*rsi-0x10]
        lea      rdi, [rax+0x10]
        and      rdi, -16
-       align    [12 bytes for IG08]
-						;; size=46 bbWeight=0.50 PerfScore 6.62
+       align    [8 bytes for IG08]
+						;; size=42 bbWeight=0.50 PerfScore 6.62
 ; IG08: 16-byte loop with the same vptest -> vpand + vptest change as IG06
 ; (block size 16 -> 20 bytes). Falls through into IG09 when the loop ends.
 ; NOTE(review): xmm1 presumably is the new "rat1" simd16 local -- confirm.
 G_M42618_IG08:
-       vptest   xmm0, xmmword ptr [rdi]
+       vpand    xmm1, xmm0, xmmword ptr [rdi]
+       vptest   xmm1, xmm1
        jne      SHORT G_M42618_IG09
        add      rdi, 16
        cmp      rdi, rcx
        jbe      SHORT G_M42618_IG08
-						;; size=16 bbWeight=4 PerfScore 34.00
+						;; size=20 bbWeight=4 PerfScore 34.00
 ; IG09: common exit of all vector loops. Subtracts the distance already
 ; covered, (rdi - rax) / 2, from the remaining count in rsi.
 G_M42618_IG09:
        mov      rcx, rdi
        sub      rcx, rax
        shr      rcx, 1
        sub      rsi, rcx
 						;; size=12 bbWeight=0.50 PerfScore 0.62
 ; IG10: scalar tail. Fewer than 4 remaining skips the 8-byte loop.
 G_M42618_IG10:
        cmp      rsi, 4
        jb       SHORT G_M42618_IG15
        align    [0 bytes for IG11]
 						;; size=6 bbWeight=1 PerfScore 1.25
 ; IG11: load two dwords (8 bytes), OR them together and test the result
 ; against one constant; a zero result means all 8 bytes pass, so go to IG14.
 ; NOTE(review): 0xD1FFAB1E looks like the disassembler's placeholder for a
 ; masked constant rather than the real immediate; the actual value is not
 ; visible in this listing -- confirm.
 G_M42618_IG11:
        mov      ecx, dword ptr [rdi]
        mov      edx, dword ptr [rdi+0x04]
        mov      r8d, ecx
        or       r8d, edx
        test     r8d, 0xD1FFAB1E
        je       SHORT G_M42618_IG14
 						;; size=20 bbWeight=4 PerfScore 23.00
 ; IG12: a hit was found in the 8 bytes. If the first dword (ecx) is clean,
 ; the hit is in the second one: move it into ecx and advance rdi by 4.
 G_M42618_IG12:
        test     ecx, 0xD1FFAB1E
        jne      SHORT G_M42618_IG13
        mov      ecx, edx
        add      rdi, 4
 						;; size=14 bbWeight=0.50 PerfScore 0.88
 ; IG13: ecx holds a dword known to contain a hit. If the low 16 bits have any
 ; of bits 7..15 set, rdi already points at the offending element (IG18);
 ; otherwise it is the upper half, so step rdi by 2 via IG17.
 G_M42618_IG13:
        test     ecx, 0xFF80
        jne      SHORT G_M42618_IG18
        jmp      SHORT G_M42618_IG17
 						;; size=10 bbWeight=0.50 PerfScore 1.62
 ; IG14: 8 clean bytes consumed: advance rdi by 8, reduce the count by 4, and
 ; loop while at least 4 remain.
 G_M42618_IG14:
        add      rdi, 8
        add      rsi, -4
        cmp      rsi, 4
        jae      SHORT G_M42618_IG11
 						;; size=14 bbWeight=4 PerfScore 7.00
 ; IG15: if bit 1 of the remaining count is set, check one more dword
 ; (4 bytes); a hit is resolved by IG13.
 G_M42618_IG15:
        test     sil, 2
        je       SHORT G_M42618_IG16
        mov      ecx, dword ptr [rdi]
        test     ecx, 0xD1FFAB1E
        jne      SHORT G_M42618_IG13
        add      rdi, 4
 						;; size=20 bbWeight=0.50 PerfScore 2.38
 ; IG16: if bit 0 of the remaining count is set, check the final 16-bit
 ; element; a value above 127 leaves rdi pointing at it.
 G_M42618_IG16:
        test     sil, 1
        je       SHORT G_M42618_IG18
        cmp      word  ptr [rdi], 127
        ja       SHORT G_M42618_IG18
 						;; size=12 bbWeight=0.50 PerfScore 2.62
 ; IG17: step past one clean 16-bit element.
 G_M42618_IG17:
        add      rdi, 2
 						;; size=4 bbWeight=0.50 PerfScore 0.12
 ; IG18: result = (rdi - starting pointer) >> 1, i.e. byte distance converted
 ; to a count of 2-byte elements, returned in rax.
 G_M42618_IG18:
        mov      rcx, rdi
        sub      rcx, rax
        mov      rax, rcx
        shr      rax, 1
 						;; size=12 bbWeight=1 PerfScore 1.25
 ; IG19: epilog. vzeroupper clears the upper vector-register state before
 ; returning.
 G_M42618_IG19:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=1 PerfScore 2.50
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 337, prolog size 4, PerfScore 197.12, instruction count 91, allocated bytes for code 337 (MethodHash=bc9a5985) for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 341, prolog size 4, PerfScore 197.00, instruction count 93, allocated bytes for code 341 (MethodHash=bc9a5985) for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong (FullOpts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment