Skip to content

Instantly share code, notes, and snippets.

@MihuBot
Created July 14, 2024 23:49
Show Gist options
  • Save MihuBot/81f068513146abad0226099bd9fab2d6 to your computer and use it in GitHub Desktop.
Save MihuBot/81f068513146abad0226099bd9fab2d6 to your computer and use it in GitHub Desktop.

Top method improvements

-8 (-2.42 % of base) - System.HexConverter:TryDecodeFromUtf16_Vector128(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte],byref):ubyte
 ; Assembly listing for method System.HexConverter:TryDecodeFromUtf16_Vector128(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte],byref):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 6 single block inlinees; 7 inlinees without PGO data
+; 0 inlinees with PGO data; 6 single block inlinees; 10 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;* V00 arg0         [V00    ] (  0,  0   )  struct (16) zero-ref    multireg-arg ld-addr-op single-def <System.ReadOnlySpan`1[ushort]>
 ;* V01 arg1         [V01    ] (  0,  0   )  struct (16) zero-ref    multireg-arg ld-addr-op single-def <System.Span`1[ubyte]>
-;  V02 arg2         [V02,T06] (  4,  3   )   byref  ->  rbx         single-def
+;  V02 arg2         [V02,T07] (  4,  3   )   byref  ->  rbx         single-def
 ;  V03 loc0         [V03,T00] ( 12, 42.50)    long  ->  r15        
 ;  V04 loc1         [V04,T02] (  3,  9   )    long  ->  r13        
-;* V05 loc2         [V05,T19] (  0,  0   )   byref  ->  zero-ref    single-def
-;* V06 loc3         [V06,T20] (  0,  0   )   byref  ->  zero-ref    single-def
+;* V05 loc2         [V05,T20] (  0,  0   )   byref  ->  zero-ref    single-def
+;* V06 loc3         [V06,T21] (  0,  0   )   byref  ->  zero-ref    single-def
 ;  V07 loc4         [V07    ] (  2,  1   )     int  ->  [rbp-0x28]  do-not-enreg[X] addr-exposed ld-addr-op
-;  V08 loc5         [V08,T23] (  3, 24   )  simd16  ->  mm7         <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V09 loc6         [V09,T24] (  3, 24   )  simd16  ->  mm8         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V08 loc5         [V08,T24] (  3, 24   )  simd16  ->  mm6         <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V09 loc6         [V09,T25] (  3, 24   )  simd16  ->  mm7         <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;* V10 loc7         [V10    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V11 loc8         [V11    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;  V12 loc9         [V12,T25] (  3, 16   )  simd16  ->  mm9         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;  V12 loc9         [V12,T26] (  3, 16   )  simd16  ->  mm8         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V13 loc10        [V13    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V14 loc11        [V14    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V15 loc12        [V15    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;# V16 OutArgs      [V16    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V17 tmp1         [V17,T21] (  3, 48   )  simd16  ->  mm9         "dup spill"
+;  V17 tmp1         [V17,T22] (  3, 48   )  simd16  ->  mm8         "dup spill"
 ;* V18 tmp2         [V18    ] (  0,  0   )  struct (16) zero-ref    "impAppendStmt" <System.ReadOnlySpan`1[ushort]>
 ;* V19 tmp3         [V19    ] (  0,  0   )  struct (16) zero-ref    "spilled call-like call argument" <System.Span`1[ubyte]>
-;  V20 tmp4         [V20,T12] (  2,  2   )     int  ->  rax         "impAppendStmt"
+;  V20 tmp4         [V20,T13] (  2,  2   )     int  ->  rax         "impAppendStmt"
 ;* V21 tmp5         [V21    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
 ;* V22 tmp6         [V22    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[ushort]>
 ;* V23 tmp7         [V23    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg" <System.Span`1[ubyte]>
 ;* V24 tmp8         [V24    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V25 tmp9         [V25    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V26 tmp10        [V26    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V27 tmp11        [V27    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V28 tmp12        [V28    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V29 tmp13        [V29    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V29 tmp13        [V29,T23] (  2, 32   )  simd16  ->  mm6         "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;* V30 tmp14        [V30    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
 ;* V31 tmp15        [V31    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V32 tmp16        [V32    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V33 tmp17        [V33    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;  V34 tmp18        [V34,T07] (  4,  4   )     int  ->   r8         "Inlining Arg"
-;* V35 tmp19        [V35    ] (  0,  0   )  struct (16) zero-ref    multireg-arg ld-addr-op "NewObj constructor temp" <System.ReadOnlySpan`1[ushort]>
-;  V36 tmp20        [V36,T10] (  2,  2   )   byref  ->  rdi         single-def "Inlining Arg"
-;  V37 tmp21        [V37,T13] (  2,  2   )     int  ->  rsi         "Inlining Arg"
-;  V38 tmp22        [V38,T08] (  4,  4   )     int  ->   r8         "Inlining Arg"
-;* V39 tmp23        [V39    ] (  0,  0   )  struct (16) zero-ref    multireg-arg ld-addr-op "NewObj constructor temp" <System.Span`1[ubyte]>
-;  V40 tmp24        [V40,T11] (  2,  2   )   byref  ->  rdx         single-def "Inlining Arg"
-;  V41 tmp25        [V41,T14] (  2,  2   )     int  ->  rcx         "Inlining Arg"
-;  V42 tmp26        [V42,T01] (  4, 17.50)   byref  ->  rdi         single-def "field V00._reference (fldOffset=0x0)" P-INDEP
-;  V43 tmp27        [V43,T05] (  5,  3.50)     int  ->  rsi         single-def "field V00._length (fldOffset=0x8)" P-INDEP
-;  V44 tmp28        [V44,T03] (  3,  5.50)   byref  ->  rdx         single-def "field V01._reference (fldOffset=0x0)" P-INDEP
-;  V45 tmp29        [V45,T09] (  3,  2   )     int  ->  rcx         single-def "field V01._length (fldOffset=0x8)" P-INDEP
-;* V46 tmp30        [V46    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V18._reference (fldOffset=0x0)" P-INDEP
-;* V47 tmp31        [V47    ] (  0,  0   )     int  ->  zero-ref    "field V18._length (fldOffset=0x8)" P-INDEP
-;* V48 tmp32        [V48    ] (  0,  0   )   byref  ->  zero-ref    "field V19._reference (fldOffset=0x0)" P-INDEP
-;* V49 tmp33        [V49    ] (  0,  0   )     int  ->  zero-ref    "field V19._length (fldOffset=0x8)" P-INDEP
-;* V50 tmp34        [V50    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V22._reference (fldOffset=0x0)" P-INDEP
-;* V51 tmp35        [V51    ] (  0,  0   )     int  ->  zero-ref    "field V22._length (fldOffset=0x8)" P-INDEP
-;* V52 tmp36        [V52    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V23._reference (fldOffset=0x0)" P-INDEP
-;* V53 tmp37        [V53    ] (  0,  0   )     int  ->  zero-ref    "field V23._length (fldOffset=0x8)" P-INDEP
-;  V54 tmp38        [V54,T15] (  2,  1   )   byref  ->  rdi         single-def "field V35._reference (fldOffset=0x0)" P-INDEP
-;  V55 tmp39        [V55,T17] (  2,  1   )     int  ->  rsi         "field V35._length (fldOffset=0x8)" P-INDEP
-;  V56 tmp40        [V56,T16] (  2,  1   )   byref  ->  rdx         single-def "field V39._reference (fldOffset=0x0)" P-INDEP
-;  V57 tmp41        [V57,T18] (  2,  1   )     int  ->  rcx         "field V39._length (fldOffset=0x8)" P-INDEP
-;  V58 cse0         [V58,T26] (  2,  9   )  simd16  ->  mm0         hoist "CSE #01: aggressive"
-;  V59 cse1         [V59,T27] (  2,  9   )  simd16  ->  mm1         hoist "CSE #02: aggressive"
-;  V60 cse2         [V60,T28] (  2,  9   )  simd16  ->  mm2         hoist "CSE #03: aggressive"
-;  V61 cse3         [V61,T29] (  2,  9   )  simd16  ->  mm3         hoist "CSE #04: aggressive"
-;  V62 cse4         [V62,T30] (  2,  9   )  simd16  ->  mm4         hoist "CSE #05: aggressive"
-;  V63 cse5         [V63,T31] (  2,  9   )  simd16  ->  mm5         hoist "CSE #06: aggressive"
-;  V64 cse6         [V64,T32] (  2,  9   )  simd16  ->  mm6         hoist "CSE #07: aggressive"
-;  V65 cse7         [V65,T04] (  3,  6   )    long  ->  r14         "CSE #08: aggressive"
-;  V66 rat0         [V66,T22] (  3, 48   )  simd16  ->  mm7         "ReplaceWithLclVar is creating a new local variable"
+;* V32 tmp16        [V32,T04] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V33 tmp17        [V33    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V34 tmp18        [V34    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V35 tmp19        [V35    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V36 tmp20        [V36    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V37 tmp21        [V37    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V38 tmp22        [V38    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;  V39 tmp23        [V39,T08] (  4,  4   )     int  ->   r8         "Inlining Arg"
+;* V40 tmp24        [V40    ] (  0,  0   )  struct (16) zero-ref    multireg-arg ld-addr-op "NewObj constructor temp" <System.ReadOnlySpan`1[ushort]>
+;  V41 tmp25        [V41,T11] (  2,  2   )   byref  ->  rdi         single-def "Inlining Arg"
+;  V42 tmp26        [V42,T14] (  2,  2   )     int  ->  rsi         "Inlining Arg"
+;  V43 tmp27        [V43,T09] (  4,  4   )     int  ->   r8         "Inlining Arg"
+;* V44 tmp28        [V44    ] (  0,  0   )  struct (16) zero-ref    multireg-arg ld-addr-op "NewObj constructor temp" <System.Span`1[ubyte]>
+;  V45 tmp29        [V45,T12] (  2,  2   )   byref  ->  rdx         single-def "Inlining Arg"
+;  V46 tmp30        [V46,T15] (  2,  2   )     int  ->  rcx         "Inlining Arg"
+;  V47 tmp31        [V47,T01] (  4, 17.50)   byref  ->  rdi         single-def "field V00._reference (fldOffset=0x0)" P-INDEP
+;  V48 tmp32        [V48,T06] (  5,  3.50)     int  ->  rsi         single-def "field V00._length (fldOffset=0x8)" P-INDEP
+;  V49 tmp33        [V49,T03] (  3,  5.50)   byref  ->  rdx         single-def "field V01._reference (fldOffset=0x0)" P-INDEP
+;  V50 tmp34        [V50,T10] (  3,  2   )     int  ->  rcx         single-def "field V01._length (fldOffset=0x8)" P-INDEP
+;* V51 tmp35        [V51    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V18._reference (fldOffset=0x0)" P-INDEP
+;* V52 tmp36        [V52    ] (  0,  0   )     int  ->  zero-ref    "field V18._length (fldOffset=0x8)" P-INDEP
+;* V53 tmp37        [V53    ] (  0,  0   )   byref  ->  zero-ref    "field V19._reference (fldOffset=0x0)" P-INDEP
+;* V54 tmp38        [V54    ] (  0,  0   )     int  ->  zero-ref    "field V19._length (fldOffset=0x8)" P-INDEP
+;* V55 tmp39        [V55    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V22._reference (fldOffset=0x0)" P-INDEP
+;* V56 tmp40        [V56    ] (  0,  0   )     int  ->  zero-ref    "field V22._length (fldOffset=0x8)" P-INDEP
+;* V57 tmp41        [V57    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V23._reference (fldOffset=0x0)" P-INDEP
+;* V58 tmp42        [V58    ] (  0,  0   )     int  ->  zero-ref    "field V23._length (fldOffset=0x8)" P-INDEP
+;  V59 tmp43        [V59,T16] (  2,  1   )   byref  ->  rdi         single-def "field V40._reference (fldOffset=0x0)" P-INDEP
+;  V60 tmp44        [V60,T18] (  2,  1   )     int  ->  rsi         "field V40._length (fldOffset=0x8)" P-INDEP
+;  V61 tmp45        [V61,T17] (  2,  1   )   byref  ->  rdx         single-def "field V44._reference (fldOffset=0x0)" P-INDEP
+;  V62 tmp46        [V62,T19] (  2,  1   )     int  ->  rcx         "field V44._length (fldOffset=0x8)" P-INDEP
+;  V63 cse0         [V63,T27] (  2,  9   )  simd16  ->  mm0         hoist "CSE #01: aggressive"
+;  V64 cse1         [V64,T28] (  2,  9   )  simd16  ->  mm1         hoist "CSE #02: aggressive"
+;  V65 cse2         [V65,T29] (  2,  9   )  simd16  ->  mm2         hoist "CSE #03: aggressive"
+;  V66 cse3         [V66,T30] (  2,  9   )  simd16  ->  mm3         hoist "CSE #04: aggressive"
+;  V67 cse4         [V67,T31] (  2,  9   )  simd16  ->  mm4         hoist "CSE #05: aggressive"
+;  V68 cse5         [V68,T32] (  2,  9   )  simd16  ->  mm5         hoist "CSE #06: aggressive"
+;  V69 cse6         [V69,T05] (  3,  6   )    long  ->  r14         "CSE #07: aggressive"
 ;
 ; Lcl frame size = 16
 
 G_M6966_IG01:
        push     rbp
        push     r15
        push     r14
        push     r13
        push     rbx
        sub      rsp, 16
        lea      rbp, [rsp+0x30]
        mov      rbx, r8
 						;; size=20 bbWeight=1 PerfScore 6.00
 G_M6966_IG02:
        xor      r15d, r15d
        mov      r14d, esi
        lea      r13, [r14-0x10]
        vmovups  xmm0, xmmword ptr [reloc @RWD00]
        vmovups  xmm1, xmmword ptr [reloc @RWD16]
        vmovups  xmm2, xmmword ptr [reloc @RWD32]
        vmovups  xmm3, xmmword ptr [reloc @RWD48]
        vmovups  xmm4, xmmword ptr [reloc @RWD64]
        vmovups  xmm5, xmmword ptr [reloc @RWD80]
-       vmovups  xmm6, xmmword ptr [reloc @RWD96]
        jmp      SHORT G_M6966_IG04
        align    [0 bytes for IG03]
-						;; size=68 bbWeight=1 PerfScore 24.00
+						;; size=60 bbWeight=1 PerfScore 21.00
 G_M6966_IG03:
        mov      r15, r13
 						;; size=3 bbWeight=4 PerfScore 1.00
 G_M6966_IG04:
-       vmovups  xmm7, xmmword ptr [rdi+2*r15]
-       vmovups  xmm8, xmmword ptr [rdi+2*r15+0x10]
-       vpackuswb xmm9, xmm7, xmm8
-       vpaddb   xmm10, xmm0, xmm9
-       vpsubusb xmm10, xmm10, xmm1
-       vpsubb   xmm10, xmm10, xmm2
-       vpand    xmm9, xmm3, xmm9
-       vpsubb   xmm9, xmm9, xmm4
-       vpaddusb xmm9, xmm9, xmm5
-       vpminub  xmm9, xmm9, xmm10
-       vpternlogd xmm7, xmm6, xmm8, -56
-       vptest   xmm7, xmm7
+       vmovups  xmm6, xmmword ptr [rdi+2*r15]
+       vmovups  xmm7, xmmword ptr [rdi+2*r15+0x10]
+       vpackuswb xmm8, xmm6, xmm7
+       vpaddb   xmm9, xmm0, xmm8
+       vpsubusb xmm9, xmm9, xmm1
+       vpsubb   xmm9, xmm9, xmm2
+       vpand    xmm8, xmm3, xmm8
+       vpsubb   xmm8, xmm8, xmm4
+       vpaddusb xmm8, xmm8, xmm5
+       vpminub  xmm8, xmm8, xmm9
+       vpor     xmm6, xmm6, xmm7
+       vptest   xmm6, xmmword ptr [reloc @RWD96]
        jne      SHORT G_M6966_IG08
-						;; size=63 bbWeight=8 PerfScore 128.00
+						;; size=63 bbWeight=8 PerfScore 142.67
 G_M6966_IG05:
-       vpaddusb xmm7, xmm9, xmmword ptr [reloc @RWD112]
-       vpmovmskb eax, xmm7
+       vpaddusb xmm6, xmm8, xmmword ptr [reloc @RWD112]
+       vpmovmskb eax, xmm6
        test     eax, eax
        jne      SHORT G_M6966_IG08
-       vpmaddubsw xmm7, xmm9, xmmword ptr [reloc @RWD128]
-       vpshufb  xmm7, xmm7, xmmword ptr [reloc @RWD144]
+       vpmaddubsw xmm6, xmm8, xmmword ptr [reloc @RWD128]
+       vpshufb  xmm6, xmm6, xmmword ptr [reloc @RWD144]
        mov      rax, r15
        shr      rax, 1
-       vmovd    qword ptr [rdx+rax], xmm7
+       vmovd    qword ptr [rdx+rax], xmm6
        add      r15, 16
        cmp      r15, r14
        jne      SHORT G_M6966_IG10
 						;; size=55 bbWeight=4 PerfScore 70.00
 G_M6966_IG06:
        mov      dword ptr [rbx], esi
        mov      eax, 1
 						;; size=7 bbWeight=0.50 PerfScore 0.62
 G_M6966_IG07:
        add      rsp, 16
        pop      rbx
        pop      r13
        pop      r14
        pop      r15
        pop      rbp
        ret      
 						;; size=13 bbWeight=0.50 PerfScore 1.88
 G_M6966_IG08:
        mov      r8d, r15d
        cmp      r8d, esi
        ja       SHORT G_M6966_IG11
        mov      eax, r8d
        lea      rdi, bword ptr [rdi+2*rax]
        sub      esi, r8d
        mov      r8, r15
        shr      r8, 1
        cmp      r8d, ecx
        ja       SHORT G_M6966_IG11
        mov      eax, r8d
        add      rdx, rax
        sub      ecx, r8d
        lea      r8, [rbp-0x28]
        mov      rax, 0xD1FFAB1E      ; code for System.HexConverter:TryDecodeFromUtf16_Scalar(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte],byref):ubyte
        call     [rax]System.HexConverter:TryDecodeFromUtf16_Scalar(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte],byref):ubyte
        mov      ecx, r15d
        add      ecx, dword ptr [rbp-0x28]
        mov      dword ptr [rbx], ecx
 						;; size=62 bbWeight=0.50 PerfScore 6.12
 G_M6966_IG09:
        add      rsp, 16
        pop      rbx
        pop      r13
        pop      r14
        pop      r15
        pop      rbp
        ret      
 						;; size=13 bbWeight=0.50 PerfScore 1.88
 G_M6966_IG10:
        cmp      r15, r13
        jbe      G_M6966_IG04
        jmp      G_M6966_IG03
 						;; size=14 bbWeight=4 PerfScore 13.00
 G_M6966_IG11:
        mov      rax, 0xD1FFAB1E      ; code for System.ThrowHelper:ThrowArgumentOutOfRangeException()
        call     [rax]System.ThrowHelper:ThrowArgumentOutOfRangeException()
        int3     
 						;; size=13 bbWeight=0 PerfScore 0.00
 RWD00  	dq	C6C6C6C6C6C6C6C6h, C6C6C6C6C6C6C6C6h
 RWD16  	dq	0606060606060606h, 0606060606060606h
 RWD32  	dq	F0F0F0F0F0F0F0F0h, F0F0F0F0F0F0F0F0h
 RWD48  	dq	DFDFDFDFDFDFDFDFh, DFDFDFDFDFDFDFDFh
 RWD64  	dq	4141414141414141h, 4141414141414141h
 RWD80  	dq	0A0A0A0A0A0A0A0Ah, 0A0A0A0A0A0A0A0Ah
 RWD96  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h
 RWD112 	dq	7070707070707070h, 7070707070707070h
 RWD128 	dq	0110011001100110h, 0110011001100110h
 RWD144 	dq	0E0C0A0806040200h, 0000000000000000h
 
 
-; Total bytes of code 331, prolog size 20, PerfScore 252.50, instruction count 87, allocated bytes for code 331 (MethodHash=bb7ae4c9) for method System.HexConverter:TryDecodeFromUtf16_Vector128(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte],byref):ubyte (FullOpts)
+; Total bytes of code 323, prolog size 20, PerfScore 264.17, instruction count 86, allocated bytes for code 323 (MethodHash=bb7ae4c9) for method System.HexConverter:TryDecodeFromUtf16_Vector128(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte],byref):ubyte (FullOpts)
-7 (-2.86 % of base) - System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong
 ; Assembly listing for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 8 single block inlinees; 4 inlinees without PGO data
+; 0 inlinees with PGO data; 12 single block inlinees; 20 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T04] (  3,  3   )    long  ->  rdi         single-def
 ;  V01 arg1         [V01,T03] (  5,  3.50)    long  ->  rsi         single-def
 ;  V02 arg2         [V02,T05] (  3,  2.50)    long  ->  rdx         single-def
 ;  V03 loc0         [V03,T01] (  5, 10.50)   byref  ->  rdi         single-def
-;  V04 loc1         [V04,T08] ( 14, 18.50)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V04 loc1         [V04,T11] ( 14, 18.50)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;  V05 loc2         [V05,T02] (  5,  6   )   byref  ->  rcx         single-def
 ;  V06 loc3         [V06,T00] ( 12, 27   )    long  ->  rax        
 ;  V07 loc4         [V07,T06] (  2,  4.50)    long  ->  rdx        
-;  V08 loc5         [V08,T10] (  3, 12   )  simd64  ->  mm3         <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V08 loc5         [V08,T14] (  3, 12   )  simd64  ->  mm3         <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;# V09 OutArgs      [V09    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V10 tmp1         [V10    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
 ;* V11 tmp2         [V11    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
-;  V12 tmp3         [V12,T09] (  2, 16   )  simd64  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
+;  V12 tmp3         [V12,T12] (  2, 16   )  simd64  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
 ;* V13 tmp4         [V13    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
-;* V14 tmp5         [V14    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V15 tmp6         [V15    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V16 tmp7         [V16    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V17 tmp8         [V17    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V18 tmp9         [V18    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
-;* V19 tmp10        [V19    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V20 tmp11        [V20    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V21 tmp12        [V21    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;  V22 cse0         [V22,T11] (  5,  7   )  simd64  ->  mm1         "CSE #01: moderate"
-;  V23 cse1         [V23,T12] (  5,  6   )  simd64  ->  mm2         "CSE #02: moderate"
-;  V24 rat0         [V24,T07] (  3, 24   )  simd64  ->  mm4         "ReplaceWithLclVar is creating a new local variable"
+;* V14 tmp5         [V14    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V15 tmp6         [V15    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V16 tmp7         [V16,T08] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V17 tmp8         [V17    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V18 tmp9         [V18    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V19 tmp10        [V19    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V20 tmp11        [V20    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V21 tmp12        [V21    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V22 tmp13        [V22    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V23 tmp14        [V23    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V24 tmp15        [V24    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V25 tmp16        [V25    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V26 tmp17        [V26,T09] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V27 tmp18        [V27    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V28 tmp19        [V28    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V29 tmp20        [V29    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V30 tmp21        [V30    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V31 tmp22        [V31    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V32 tmp23        [V32    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V33 tmp24        [V33    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;  V34 tmp25        [V34,T13] (  2, 16   )  simd64  ->  mm4         "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V35 tmp26        [V35    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V36 tmp27        [V36    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V37 tmp28        [V37,T07] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V38 tmp29        [V38    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V39 tmp30        [V39    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V40 tmp31        [V40    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V41 tmp32        [V41    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V42 tmp33        [V42    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V43 tmp34        [V43    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V44 tmp35        [V44    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V45 tmp36        [V45    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V46 tmp37        [V46,T10] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V47 tmp38        [V47    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V48 tmp39        [V48    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V49 tmp40        [V49    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V50 tmp41        [V50    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V51 tmp42        [V51    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V52 tmp43        [V52    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V53 tmp44        [V53    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;  V54 cse0         [V54,T16] (  5,  6   )  simd64  ->  mm2         "CSE #02: aggressive"
+;  V55 cse1         [V55,T15] (  5,  7   )  simd64  ->  mm1         "CSE #01: aggressive"
 ;
 ; Lcl frame size = 0
 
 G_M60939_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M60939_IG02:
        vmovups  zmm0, zmmword ptr [rdi]
        vmovups  zmm1, zmmword ptr [reloc @RWD00]
        vptestmw k1, zmm1, zmm0
        kortestd k1, k1
        je       SHORT G_M60939_IG05
 						;; size=29 bbWeight=1 PerfScore 12.00
 G_M60939_IG03:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M60939_IG04:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M60939_IG05:
        mov      rcx, rsi
        vpackuswb zmm0, zmm0, zmm0
        vmovups  zmm2, zmmword ptr [reloc @RWD64]
        vpermq   zmm0, zmm2, zmm0
        vmovups  ymmword ptr [rcx], ymm0
        mov      eax, 32
        test     sil, 32
        jne      SHORT G_M60939_IG06
        vmovups  zmm0, zmmword ptr [rdi+0x40]
        vptestmw k1, zmm1, zmm0
        kortestd k1, k1
        jne      SHORT G_M60939_IG08
        vpackuswb zmm0, zmm0, zmm0
        vpermq   zmm0, zmm2, zmm0
        vmovups  ymmword ptr [rcx+0x20], ymm0
 						;; size=77 bbWeight=0.50 PerfScore 11.88
 G_M60939_IG06:
        and      rsi, 63
        mov      rax, rsi
        neg      rax
        add      rax, 64
        add      rdx, -64
        align    [0 bytes for IG07]
 						;; size=18 bbWeight=0.50 PerfScore 0.62
 G_M60939_IG07:
        vmovups  zmm0, zmmword ptr [rdi+2*rax]
        vmovups  zmm3, zmmword ptr [rdi+2*rax+0x40]
-       vmovaps  zmm4, zmm0
-       vpternlogd zmm4, zmm1, zmm3, -56
-       vptestmw k1, zmm4, zmm4
+       vpord    zmm4, zmm0, zmm3
+       vptestmw k1, zmm1, zmm4
        kortestd k1, k1
        jne      SHORT G_M60939_IG09
        vpackuswb zmm0, zmm0, zmm3
        vpermq   zmm0, zmm2, zmm0
        vmovups  zmmword ptr [rcx+rax], zmm0
        add      rax, 64
        cmp      rax, rdx
        jbe      SHORT G_M60939_IG07
-						;; size=69 bbWeight=4 PerfScore 81.00
+						;; size=62 bbWeight=4 PerfScore 79.33
 G_M60939_IG08:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M60939_IG09:
        vptestmw k1, zmm1, zmm0
        kortestd k1, k1
        jne      SHORT G_M60939_IG08
        vpackuswb zmm0, zmm0, zmm0
        vpermq   zmm0, zmm2, zmm0
        vmovups  ymmword ptr [rcx+rax], ymm0
        add      rax, 32
        jmp      SHORT G_M60939_IG08
 						;; size=36 bbWeight=0.50 PerfScore 6.12
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 RWD64  	dq	0000000000000000h, 0000000000000002h, 0000000000000004h, 0000000000000006h, 0000000000000001h, 0000000000000003h, 0000000000000005h, 0000000000000007h
 
 
-; Total bytes of code 245, prolog size 4, PerfScore 115.50, instruction count 56, allocated bytes for code 257 (MethodHash=483911f4) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 238, prolog size 4, PerfScore 113.83, instruction count 55, allocated bytes for code 250 (MethodHash=483911f4) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (FullOpts)
-7 (-0.82 % of base) - System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong
 ; Assembly listing for method System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 21 single block inlinees; 25 inlinees without PGO data
+; 0 inlinees with PGO data; 41 single block inlinees; 65 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T06] (  9,  9   )    long  ->  rdi         single-def
 ;  V01 arg1         [V01,T04] ( 15, 12   )    long  ->  rsi         single-def
 ;  V02 arg2         [V02,T11] (  9,  6   )    long  ->  rdx         single-def
 ;  V03 loc0         [V03,T00] ( 23, 30   )    long  ->  rax        
 ;  V04 loc1         [V04,T12] ( 13,  6.50)     int  ->   r8        
 ;* V05 loc2         [V05    ] (  0,  0   )     int  ->  zero-ref   
 ;  V06 loc3         [V06,T05] (  7, 14   )    long  ->  rcx        
-;  V07 loc4         [V07,T22] (  5,  2.50)    long  ->  rdx        
+;  V07 loc4         [V07,T25] (  5,  2.50)    long  ->  rdx        
 ;  V08 loc5         [V08,T16] (  2,  4.50)    long  ->   r8        
 ;# V09 OutArgs      [V09    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V10 tmp1         [V10,T23] (  3,  1.50)    long  ->  rax         "Inline return value spill temp"
+;  V10 tmp1         [V10,T26] (  3,  1.50)    long  ->  rax         "Inline return value spill temp"
 ;  V11 tmp2         [V11,T07] (  5,  9.50)   byref  ->  rcx         single-def "Inline stloc first use temp"
-;  V12 tmp3         [V12,T30] ( 14, 17.50)  simd64  ->  mm0         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V12 tmp3         [V12,T41] ( 14, 17.50)  simd64  ->  mm0         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;  V13 tmp4         [V13,T13] (  5,  6   )   byref  ->  rax         single-def "Inline stloc first use temp"
 ;* V14 tmp5         [V14    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
 ;  V15 tmp6         [V15,T01] ( 12, 27   )    long  ->   r8         "Inline stloc first use temp"
 ;  V16 tmp7         [V16,T17] (  2,  4.50)    long  ->   r9         "Inline stloc first use temp"
-;  V17 tmp8         [V17,T36] (  3, 12   )  simd64  ->  mm3         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;  V17 tmp8         [V17,T50] (  3, 12   )  simd64  ->  mm3         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V18 tmp9         [V18    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
-;  V19 tmp10        [V19,T33] (  2, 16   )  simd64  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
+;  V19 tmp10        [V19,T44] (  2, 16   )  simd64  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
 ;* V20 tmp11        [V20    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
-;* V21 tmp12        [V21    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V22 tmp13        [V22    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V23 tmp14        [V23    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V24 tmp15        [V24    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V25 tmp16        [V25    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
-;* V26 tmp17        [V26    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V27 tmp18        [V27    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V28 tmp19        [V28    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;  V29 tmp20        [V29,T24] (  3,  1.50)    long  ->  rax         "Inline return value spill temp"
-;  V30 tmp21        [V30,T08] (  5,  9.50)   byref  ->  rcx         single-def "Inline stloc first use temp"
-;  V31 tmp22        [V31,T31] ( 14, 17.50)  simd32  ->  mm0         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;  V32 tmp23        [V32,T14] (  5,  6   )   byref  ->  rax         single-def "Inline stloc first use temp"
-;* V33 tmp24        [V33    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
-;  V34 tmp25        [V34,T02] ( 12, 27   )    long  ->   r8         "Inline stloc first use temp"
-;  V35 tmp26        [V35,T18] (  2,  4.50)    long  ->   r9         "Inline stloc first use temp"
-;  V36 tmp27        [V36,T37] (  3, 12   )  simd32  ->  mm2         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V37 tmp28        [V37    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
-;  V38 tmp29        [V38,T34] (  2, 16   )  simd32  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
-;* V39 tmp30        [V39    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
-;* V40 tmp31        [V40    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V41 tmp32        [V41    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V42 tmp33        [V42    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V43 tmp34        [V43    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V44 tmp35        [V44    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V45 tmp36        [V45    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V46 tmp37        [V46    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V47 tmp38        [V47    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V48 tmp39        [V48    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V49 tmp40        [V49    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V50 tmp41        [V50    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V51 tmp42        [V51    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V52 tmp43        [V52    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V53 tmp44        [V53    ] (  0,  0   )  simd32  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V54 tmp45        [V54    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V55 tmp46        [V55    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;  V56 tmp47        [V56,T25] (  3,  1.50)    long  ->  rax         "Inline return value spill temp"
-;* V57 tmp48        [V57,T27] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
-;* V58 tmp49        [V58    ] (  0,  0   )    long  ->  zero-ref    "Inline stloc first use temp"
-;  V59 tmp50        [V59,T09] (  5,  9.50)   byref  ->  rcx         single-def "Inline stloc first use temp"
-;  V60 tmp51        [V60,T32] ( 14, 17.50)  simd16  ->  mm0         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;  V61 tmp52        [V61,T15] (  5,  6   )   byref  ->   r8         single-def "Inline stloc first use temp"
-;* V62 tmp53        [V62    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
-;  V63 tmp54        [V63,T03] ( 11, 26.50)    long  ->  rax         "Inline stloc first use temp"
-;  V64 tmp55        [V64,T19] (  2,  4.50)    long  ->   r9         "Inline stloc first use temp"
-;  V65 tmp56        [V65,T38] (  3, 12   )  simd16  ->  mm2         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V66 tmp57        [V66    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
-;  V67 tmp58        [V67,T35] (  2, 16   )  simd16  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
-;* V68 tmp59        [V68    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
-;* V69 tmp60        [V69    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V70 tmp61        [V70    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V71 tmp62        [V71    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V72 tmp63        [V72    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V73 tmp64        [V73    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V74 tmp65        [V74    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V21 tmp12        [V21    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V22 tmp13        [V22    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V23 tmp14        [V23,T30] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V24 tmp15        [V24    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V25 tmp16        [V25    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V26 tmp17        [V26    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V27 tmp18        [V27    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V28 tmp19        [V28    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V29 tmp20        [V29    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V30 tmp21        [V30    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V31 tmp22        [V31    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V32 tmp23        [V32    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V33 tmp24        [V33,T31] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V34 tmp25        [V34    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V35 tmp26        [V35    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V36 tmp27        [V36    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V37 tmp28        [V37    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V38 tmp29        [V38    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V39 tmp30        [V39    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V40 tmp31        [V40    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;  V41 tmp32        [V41,T45] (  2, 16   )  simd64  ->  mm4         "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V42 tmp33        [V42    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V43 tmp34        [V43    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V44 tmp35        [V44,T20] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V45 tmp36        [V45    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V46 tmp37        [V46    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V47 tmp38        [V47    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V48 tmp39        [V48    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V49 tmp40        [V49    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V50 tmp41        [V50    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V51 tmp42        [V51    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V52 tmp43        [V52    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V53 tmp44        [V53,T32] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V54 tmp45        [V54    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V55 tmp46        [V55    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V56 tmp47        [V56    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V57 tmp48        [V57    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V58 tmp49        [V58    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V59 tmp50        [V59    ] (  0,  0   )  simd64  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V60 tmp51        [V60    ] (  0,  0   )  simd64  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;  V61 tmp52        [V61,T27] (  3,  1.50)    long  ->  rax         "Inline return value spill temp"
+;  V62 tmp53        [V62,T08] (  5,  9.50)   byref  ->  rcx         single-def "Inline stloc first use temp"
+;  V63 tmp54        [V63,T42] ( 14, 17.50)  simd32  ->  mm0         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;  V64 tmp55        [V64,T14] (  5,  6   )   byref  ->  rax         single-def "Inline stloc first use temp"
+;* V65 tmp56        [V65    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;  V66 tmp57        [V66,T02] ( 12, 27   )    long  ->   r8         "Inline stloc first use temp"
+;  V67 tmp58        [V67,T18] (  2,  4.50)    long  ->   r9         "Inline stloc first use temp"
+;  V68 tmp59        [V68,T51] (  3, 12   )  simd32  ->  mm2         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V69 tmp60        [V69    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;  V70 tmp61        [V70,T46] (  2, 16   )  simd32  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
+;* V71 tmp62        [V71    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V72 tmp63        [V72    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V73 tmp64        [V73    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V74 tmp65        [V74,T33] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
 ;* V75 tmp66        [V75    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V76 tmp67        [V76    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V77 tmp68        [V77    ] (  0,  0   )  simd16  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V78 tmp69        [V78    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V79 tmp70        [V79    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V80 tmp71        [V80    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V81 tmp72        [V81    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
-;  V82 tmp73        [V82,T28] (  3, 24   )  simd16  ->  mm1         "dup spill"
-;* V83 tmp74        [V83    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[uint]>
-;* V84 tmp75        [V84    ] (  0,  0   )   byref  ->  zero-ref    "Inlining Arg"
-;  V85 tmp76        [V85,T20] (  3,  3   )   byref  ->  rcx         single-def "Inlining Arg"
-;  V86 tmp77        [V86,T21] (  3,  3   )   byref  ->  rdx         "Inlining Arg"
-;* V87 tmp78        [V87,T26] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V88 cse0         [V88,T10] (  3,  8.50)    long  ->  r10         "CSE #05: conservative"
-;  V89 cse1         [V89,T39] (  5,  6   )  simd64  ->  mm1         "CSE #01: conservative"
-;  V90 cse2         [V90,T40] (  5,  6   )  simd32  ->  mm1         "CSE #03: conservative"
-;  V91 cse3         [V91,T41] (  5,  6   )  simd16  ->  mm1         "CSE #04: conservative"
-;  V92 cse4         [V92,T42] (  5,  6   )  simd64  ->  mm2         "CSE #02: conservative"
-;  V93 rat0         [V93,T29] (  3, 24   )  simd64  ->  mm4         "ReplaceWithLclVar is creating a new local variable"
+;* V76 tmp67        [V76    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V77 tmp68        [V77    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V78 tmp69        [V78    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V79 tmp70        [V79    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V80 tmp71        [V80    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;* V81 tmp72        [V81    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;* V82 tmp73        [V82    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V83 tmp74        [V83    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V84 tmp75        [V84,T34] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V85 tmp76        [V85    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V86 tmp77        [V86    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V87 tmp78        [V87    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V88 tmp79        [V88    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V89 tmp80        [V89    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V90 tmp81        [V90    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;* V91 tmp82        [V91    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;  V92 tmp83        [V92,T47] (  2, 16   )  simd32  ->  mm3         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V93 tmp84        [V93    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V94 tmp85        [V94    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V95 tmp86        [V95,T21] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V96 tmp87        [V96    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V97 tmp88        [V97    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V98 tmp89        [V98    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V99 tmp90        [V99    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V100 tmp91       [V100    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V101 tmp92       [V101    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;* V102 tmp93       [V102    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
+;* V103 tmp94       [V103    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V104 tmp95       [V104,T35] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V105 tmp96       [V105    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V106 tmp97       [V106    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V107 tmp98       [V107    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V108 tmp99       [V108    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V109 tmp100      [V109    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V110 tmp101      [V110    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;* V111 tmp102      [V111    ] (  0,  0   )  simd32  ->  zero-ref    "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;  V112 tmp103      [V112,T28] (  3,  1.50)    long  ->  rax         "Inline return value spill temp"
+;* V113 tmp104      [V113,T36] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;* V114 tmp105      [V114    ] (  0,  0   )    long  ->  zero-ref    "Inline stloc first use temp"
+;  V115 tmp106      [V115,T09] (  5,  9.50)   byref  ->  rcx         single-def "Inline stloc first use temp"
+;  V116 tmp107      [V116,T43] ( 14, 17.50)  simd16  ->  mm0         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;  V117 tmp108      [V117,T15] (  5,  6   )   byref  ->   r8         single-def "Inline stloc first use temp"
+;* V118 tmp109      [V118    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;  V119 tmp110      [V119,T03] ( 11, 26.50)    long  ->  rax         "Inline stloc first use temp"
+;  V120 tmp111      [V120,T19] (  2,  4.50)    long  ->   r9         "Inline stloc first use temp"
+;  V121 tmp112      [V121,T52] (  3, 12   )  simd16  ->  mm2         "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V122 tmp113      [V122    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;  V123 tmp114      [V123,T48] (  2, 16   )  simd16  ->  mm0         "Spilling op1 side effects for HWIntrinsic"
+;* V124 tmp115      [V124    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V125 tmp116      [V125    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V126 tmp117      [V126    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V127 tmp118      [V127,T37] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V128 tmp119      [V128    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V129 tmp120      [V129    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V130 tmp121      [V130    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V131 tmp122      [V131    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V132 tmp123      [V132    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V133 tmp124      [V133    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V134 tmp125      [V134    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V135 tmp126      [V135    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V136 tmp127      [V136,T38] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V137 tmp128      [V137    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V138 tmp129      [V138    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V139 tmp130      [V139    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V140 tmp131      [V140    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V141 tmp132      [V141    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V142 tmp133      [V142    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;  V143 tmp134      [V143,T49] (  2, 16   )  simd16  ->  mm3         "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V144 tmp135      [V144    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V145 tmp136      [V145    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V146 tmp137      [V146,T22] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V147 tmp138      [V147    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V148 tmp139      [V148    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V149 tmp140      [V149    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V150 tmp141      [V150    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V151 tmp142      [V151    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V152 tmp143      [V152    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V153 tmp144      [V153    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V154 tmp145      [V154,T39] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V155 tmp146      [V155    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V156 tmp147      [V156    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V157 tmp148      [V157    ] (  0,  0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V158 tmp149      [V158    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V159 tmp150      [V159    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V160 tmp151      [V160    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V161 tmp152      [V161    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
+;  V162 tmp153      [V162,T40] (  3, 24   )  simd16  ->  mm1         "dup spill"
+;* V163 tmp154      [V163    ] (  0,  0   )  simd16  ->  zero-ref    "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[uint]>
+;* V164 tmp155      [V164    ] (  0,  0   )   byref  ->  zero-ref    "Inlining Arg"
+;  V165 tmp156      [V165,T23] (  3,  3   )   byref  ->  rcx         single-def "Inlining Arg"
+;  V166 tmp157      [V166,T24] (  3,  3   )   byref  ->  rdx         "Inlining Arg"
+;* V167 tmp158      [V167,T29] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V168 cse0        [V168,T10] (  3,  8.50)    long  ->  r10         "CSE #05: moderate"
+;  V169 cse1        [V169,T53] (  5,  6   )  simd64  ->  mm1         "CSE #01: moderate"
+;  V170 cse2        [V170,T54] (  5,  6   )  simd32  ->  mm1         "CSE #03: moderate"
+;  V171 cse3        [V171,T55] (  5,  6   )  simd16  ->  mm1         "CSE #04: moderate"
+;  V172 cse4        [V172,T56] (  5,  6   )  simd64  ->  mm2         "CSE #02: moderate"
 ;
 ; Lcl frame size = 0
 
 G_M6063_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M6063_IG02:
        xor      eax, eax
        cmp      rdx, 32
        jb       G_M6063_IG21
 						;; size=12 bbWeight=1 PerfScore 1.50
 G_M6063_IG03:
        mov      rcx, qword ptr [rdi]
        mov      r8, 0xD1FFAB1E
        test     rcx, r8
        jne      G_M6063_IG27
        cmp      rdx, 128
        jb       SHORT G_M6063_IG04
        mov      rcx, rdi
        vmovups  zmm0, zmmword ptr [rcx]
        vmovups  zmm1, zmmword ptr [reloc @RWD00]
        vptestmw k1, zmm1, zmm0
        kortestd k1, k1
 		  ;; NOP compensation instructions of 3 bytes.
        je       G_M6063_IG17
        xor      eax, eax
        jmp      G_M6063_IG21
        align    [3 bytes for IG08]
 						;; size=80 bbWeight=0.50 PerfScore 9.62
 G_M6063_IG04:
        cmp      rdx, 64
        jb       SHORT G_M6063_IG05
        mov      rcx, rdi
        vmovups  ymm0, ymmword ptr [rcx]
        vmovups  ymm1, ymmword ptr [reloc @RWD00]
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        je       G_M6063_IG11
        xor      eax, eax
        jmp      G_M6063_IG15
 						;; size=39 bbWeight=0.50 PerfScore 9.38
 G_M6063_IG05:
        mov      rcx, rdi
        vmovups  xmm0, xmmword ptr [rcx]
        vmovups  xmm1, xmmword ptr [reloc @RWD00]
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        je       SHORT G_M6063_IG06
        xor      eax, eax
        jmp      SHORT G_M6063_IG09
 						;; size=26 bbWeight=0.50 PerfScore 6.75
 G_M6063_IG06:
        mov      r8, rsi
        vpackuswb xmm0, xmm0, xmm0
        vmovsd   qword ptr [r8], xmm0
        mov      eax, 8
        test     sil, 8
        jne      SHORT G_M6063_IG07
        vmovups  xmm0, xmmword ptr [rcx+0x10]
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        jne      SHORT G_M6063_IG09
        vpackuswb xmm0, xmm0, xmm0
        vmovsd   qword ptr [r8+0x08], xmm0
 						;; size=45 bbWeight=0.50 PerfScore 7.88
 G_M6063_IG07:
        mov      rax, rsi
        and      rax, 15
        neg      rax
        add      rax, 16
        lea      r9, [rdx-0x10]
 						;; size=18 bbWeight=0.50 PerfScore 0.75
 G_M6063_IG08:
        vmovups  xmm0, xmmword ptr [rcx+2*rax]
        lea      r10, [rax+0x08]
        vmovups  xmm2, xmmword ptr [rcx+2*r10]
        vpor     xmm3, xmm0, xmm2
-       vptest   xmm3, xmm1
+       vptest   xmm1, xmm3
        jne      SHORT G_M6063_IG10
        vpackuswb xmm0, xmm0, xmm2
        vmovups  xmmword ptr [r8+rax], xmm0
        add      rax, 16
        cmp      rax, r9
        jbe      SHORT G_M6063_IG08
 						;; size=45 bbWeight=4 PerfScore 69.33
 G_M6063_IG09:
        jmp      G_M6063_IG21
        align    [0 bytes for IG13]
 						;; size=5 bbWeight=0.50 PerfScore 1.00
 G_M6063_IG10:
-       vptest   xmm0, xmm1
+       vptest   xmm1, xmm0
        jne      SHORT G_M6063_IG09
        vpackuswb xmm0, xmm0, xmm0
        vmovsd   qword ptr [r8+rax], xmm0
        mov      rax, r10
        jmp      SHORT G_M6063_IG09
 						;; size=22 bbWeight=0.50 PerfScore 4.62
 G_M6063_IG11:
        mov      rax, rsi
        vpackuswb ymm0, ymm0, ymm0
        vpermq   ymm0, ymm0, -40
        vmovups  xmmword ptr [rax], xmm0
        mov      r8d, 16
        test     sil, 16
        jne      SHORT G_M6063_IG12
        vmovups  ymm0, ymmword ptr [rcx+0x20]
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        jne      SHORT G_M6063_IG14
        vpackuswb ymm0, ymm0, ymm0
        vpermq   ymm0, ymm0, -40
        vmovups  xmmword ptr [rax+0x10], xmm0
 						;; size=56 bbWeight=0.50 PerfScore 11.38
 G_M6063_IG12:
        mov      r8, rsi
        and      r8, 31
        neg      r8
        add      r8, 32
        lea      r9, [rdx-0x20]
 						;; size=18 bbWeight=0.50 PerfScore 0.75
 G_M6063_IG13:
        vmovups  ymm0, ymmword ptr [rcx+2*r8]
        vmovups  ymm2, ymmword ptr [rcx+2*r8+0x20]
        vpor     ymm3, ymm0, ymm2
-       vptest   ymm3, ymm1
+       vptest   ymm1, ymm3
        jne      SHORT G_M6063_IG16
        vpackuswb ymm0, ymm0, ymm2
        vpermq   ymm0, ymm0, -40
        vmovups  ymmword ptr [rax+r8], ymm0
        add      r8, 32
        cmp      r8, r9
        jbe      SHORT G_M6063_IG13
 						;; size=49 bbWeight=4 PerfScore 91.33
 G_M6063_IG14:
        mov      rax, r8
 						;; size=3 bbWeight=0.50 PerfScore 0.12
 G_M6063_IG15:
        jmp      G_M6063_IG21
        align    [0 bytes for IG19]
 						;; size=5 bbWeight=0.50 PerfScore 1.00
 G_M6063_IG16:
-       vptest   ymm0, ymm1
+       vptest   ymm1, ymm0
        jne      SHORT G_M6063_IG14
        vpackuswb ymm0, ymm0, ymm0
        vpermq   ymm0, ymm0, -40
        vmovups  xmmword ptr [rax+r8], xmm0
        add      r8, 16
        jmp      SHORT G_M6063_IG14
 						;; size=29 bbWeight=0.50 PerfScore 6.62
 G_M6063_IG17:
        mov      rax, rsi
        vpackuswb zmm0, zmm0, zmm0
        vmovups  zmm2, zmmword ptr [reloc @RWD64]
        vpermq   zmm0, zmm2, zmm0
        vmovups  ymmword ptr [rax], ymm0
        mov      r8d, 32
        test     sil, 32
        jne      SHORT G_M6063_IG18
        vmovups  zmm0, zmmword ptr [rcx+0x40]
        vptestmw k1, zmm1, zmm0
        kortestd k1, k1
        jne      SHORT G_M6063_IG20
        vpackuswb zmm0, zmm0, zmm0
        vpermq   zmm0, zmm2, zmm0
        vmovups  ymmword ptr [rax+0x20], ymm0
 						;; size=78 bbWeight=0.50 PerfScore 11.88
 G_M6063_IG18:
        mov      r8, rsi
        and      r8, 63
        neg      r8
        add      r8, 64
        lea      r9, [rdx-0x40]
 						;; size=18 bbWeight=0.50 PerfScore 0.75
 G_M6063_IG19:
        vmovups  zmm0, zmmword ptr [rcx+2*r8]
        vmovups  zmm3, zmmword ptr [rcx+2*r8+0x40]
-       vmovaps  zmm4, zmm0
-       vpternlogd zmm4, zmm1, zmm3, -56
-       vptestmw k1, zmm4, zmm4
+       vpord    zmm4, zmm0, zmm3
+       vptestmw k1, zmm1, zmm4
        kortestd k1, k1
        jne      G_M6063_IG26
        vpackuswb zmm0, zmm0, zmm3
        vpermq   zmm0, zmm2, zmm0
        vmovups  zmmword ptr [rax+r8], zmm0
        add      r8, 64
        cmp      r8, r9
        jbe      SHORT G_M6063_IG19
-						;; size=73 bbWeight=4 PerfScore 81.00
+						;; size=66 bbWeight=4 PerfScore 79.33
 G_M6063_IG20:
        mov      rax, r8
 						;; size=3 bbWeight=0.50 PerfScore 0.12
 G_M6063_IG21:
        sub      rdx, rax
        cmp      rdx, 4
        jb       SHORT G_M6063_IG23
        lea      r8, [rax+rdx-0x04]
        align    [0 bytes for IG22]
 						;; size=14 bbWeight=0.50 PerfScore 1.25
 G_M6063_IG22:
        mov      rcx, qword ptr [rdi+2*rax]
        mov      r9, 0xD1FFAB1E
        test     rcx, r9
        jne      G_M6063_IG27
        vmovd    xmm1, rcx
        vpackuswb xmm2, xmm1, xmm1
        vmovd    dword ptr [rsi+rax], xmm2
        add      rax, 4
        cmp      rax, r8
        jbe      SHORT G_M6063_IG22
 						;; size=46 bbWeight=4 PerfScore 40.00
 G_M6063_IG23:
        test     dl, 2
        je       SHORT G_M6063_IG24
        mov      r8d, dword ptr [rdi+2*rax]
        test     r8d, 0xD1FFAB1E
        jne      G_M6063_IG28
        lea      rcx, [rsi+rax]
        mov      byte  ptr [rcx], r8b
        shr      r8d, 16
        mov      byte  ptr [rcx+0x01], r8b
        add      rax, 2
 						;; size=41 bbWeight=0.50 PerfScore 3.88
 G_M6063_IG24:
        test     dl, 1
        je       SHORT G_M6063_IG29
        movzx    r8, word  ptr [rdi+2*rax]
        cmp      r8d, 127
        ja       SHORT G_M6063_IG29
 						;; size=16 bbWeight=0.50 PerfScore 2.25
 G_M6063_IG25:
        mov      byte  ptr [rsi+rax], r8b
        inc      rax
        jmp      SHORT G_M6063_IG29
 						;; size=9 bbWeight=0.50 PerfScore 1.62
 G_M6063_IG26:
        vptestmw k1, zmm1, zmm0
        kortestd k1, k1
        jne      G_M6063_IG20
        vpackuswb zmm0, zmm0, zmm0
        vpermq   zmm0, zmm2, zmm0
        vmovups  ymmword ptr [rax+r8], ymm0
        add      r8, 32
        jmp      G_M6063_IG20
 						;; size=44 bbWeight=0.50 PerfScore 6.12
 G_M6063_IG27:
        mov      r8d, ecx
        test     r8d, 0xD1FFAB1E
        jne      SHORT G_M6063_IG28
        lea      rdx, [rsi+rax]
        mov      byte  ptr [rdx], r8b
        shr      r8d, 16
        mov      byte  ptr [rdx+0x01], r8b
        shr      rcx, 32
        mov      r8d, ecx
        add      rax, 2
 						;; size=38 bbWeight=0.50 PerfScore 2.75
 G_M6063_IG28:
        test     r8d, 0xFF80
        je       SHORT G_M6063_IG25
 						;; size=9 bbWeight=0.50 PerfScore 0.62
 G_M6063_IG29:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=1 PerfScore 2.50
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
 RWD64  	dq	0000000000000000h, 0000000000000002h, 0000000000000004h, 0000000000000006h, 0000000000000001h, 0000000000000003h, 0000000000000005h, 0000000000000007h
 
 
-; Total bytes of code 850, prolog size 4, PerfScore 378.04, instruction count 200, allocated bytes for code 859 (MethodHash=53fae850) for method System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 843, prolog size 4, PerfScore 376.38, instruction count 199, allocated bytes for code 852 (MethodHash=53fae850) for method System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong (FullOpts)
-1 (-1.75 % of base) - System.Text.Ascii+PlainLoader`1[int]:EqualAndAscii512(byref,byref):ubyte
 ; Assembly listing for method System.Text.Ascii+PlainLoader`1[int]:EqualAndAscii512(byref,byref):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; partially interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 0 single block inlinees; 1 inlinees without PGO data
+; 0 inlinees with PGO data; 2 single block inlinees; 2 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] (  3,  3   )   byref  ->  rdi         single-def
 ;  V01 arg1         [V01,T01] (  3,  3   )   byref  ->  rsi         single-def
 ;  V02 loc0         [V02,T02] (  3,  2.50)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[int]>
 ;* V03 loc1         [V03    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[int]>
 ;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V05 tmp1         [V05    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V05 tmp1         [V05    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V06 tmp2         [V06    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[int]>
+;* V07 tmp3         [V07    ] (  0,  0   )     int  ->  zero-ref    ld-addr-op "Inline stloc first use temp"
+;* V08 tmp4         [V08    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
+;* V09 tmp5         [V09    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;
 ; Lcl frame size = 0
 
 ; Code body of EqualAndAscii512 for Vector512<int>: rdi = arg0, rsi = arg1 (both byrefs).
 ; eax is set to 1 only when the 64 bytes at [rdi] equal the 64 bytes at [rsi] AND the
 ; mask test in IG03 finds no set bits; otherwise eax is set to 0.
 G_M10181_IG01:
        ; rbp-based frame setup; no locals are allocated (Lcl frame size = 0).
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M10181_IG02:
        ; Load 16 dwords from [rdi] and compare them lane-by-lane with [rsi].
        ; Predicate 4 is "not equal", so k1 gets one bit per differing dword lane.
        vmovups  zmm0, zmmword ptr [rdi]
        vpcmpd   k1, zmm0, zmmword ptr [rsi], 4
        ; kortestw sets ZF when the 16-bit mask is all zero; any mismatch jumps to the "return 0" path.
        kortestw k1, k1
        jne      SHORT G_M10181_IG04
 						;; size=19 bbWeight=1 PerfScore 12.00
 G_M10181_IG03:
        ; Base: AND each of 32 word lanes with 0xFF80 and check a 32-bit mask (kortestd).
        ; Diff: AND each of 16 dword lanes with a broadcast 0xFFFFFF80 ({1to16}) and check a 16-bit mask (kortestw).
        ; NOTE(review): the two masks are not bit-equivalent - the base mask ignores bits 16..22 of each dword,
        ; the new mask does not, so the new test is stricter. Confirm this is intended for the int instantiation.
-       vptestmw k1, zmm0, zmmword ptr [reloc @RWD00]
-       kortestd k1, k1
+       vptestmd k1, zmm0, dword ptr [reloc @RWD00] {1to16}
+       kortestw k1, k1
        ; No lane has a bit in common with the mask -> take the "return 1" path.
        je       SHORT G_M10181_IG06
-						;; size=17 bbWeight=0.50 PerfScore 3.50
+						;; size=16 bbWeight=0.50 PerfScore 3.50
 G_M10181_IG04:
        ; Failure path: result = 0.
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M10181_IG05:
        ; Epilog for the failure path; vzeroupper is emitted because zmm registers were used.
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M10181_IG06:
        ; Success path: result = 1.
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M10181_IG07:
        ; Epilog for the success path (same sequence as IG05).
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 ; Read-only data: the 64-byte constant is replaced by a single dword that the instruction broadcasts.
-RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
+RWD00  	dd	FFFFFF80h
 
 
-; Total bytes of code 57, prolog size 4, PerfScore 19.50, instruction count 17, allocated bytes for code 62 (MethodHash=aa5ad83a) for method System.Text.Ascii+PlainLoader`1[int]:EqualAndAscii512(byref,byref):ubyte (FullOpts)
+; Total bytes of code 56, prolog size 4, PerfScore 19.50, instruction count 17, allocated bytes for code 60 (MethodHash=aa5ad83a) for method System.Text.Ascii+PlainLoader`1[int]:EqualAndAscii512(byref,byref):ubyte (FullOpts)
-1 (-1.75 % of base) - System.Text.Ascii+PlainLoader`1[long]:EqualAndAscii512(byref,byref):ubyte
 ; Assembly listing for method System.Text.Ascii+PlainLoader`1[long]:EqualAndAscii512(byref,byref):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; partially interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 0 single block inlinees; 1 inlinees without PGO data
+; 0 inlinees with PGO data; 2 single block inlinees; 4 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T00] (  3,  3   )   byref  ->  rdi         single-def
 ;  V01 arg1         [V01,T01] (  3,  3   )   byref  ->  rsi         single-def
 ;  V02 loc0         [V02,T02] (  3,  2.50)  simd64  ->  mm0         <System.Runtime.Intrinsics.Vector512`1[long]>
 ;* V03 loc1         [V03    ] (  0,  0   )  simd64  ->  zero-ref    <System.Runtime.Intrinsics.Vector512`1[long]>
 ;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V05 tmp1         [V05    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V05 tmp1         [V05    ] (  0,  0   )  simd64  ->  zero-ref    "spilled call-like call argument"
+;* V06 tmp2         [V06    ] (  0,  0   )  simd64  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[long]>
+;* V07 tmp3         [V07    ] (  0,  0   )    long  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V08 tmp4         [V08    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V09 tmp5         [V09    ] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;* V10 tmp6         [V10    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V11 tmp7         [V11    ] (  0,  0   )    long  ->  zero-ref    "Inlining Arg"
+;* V12 tmp8         [V12    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;
 ; Lcl frame size = 0
 
 ; Code body of EqualAndAscii512 for Vector512<long>: rdi = arg0, rsi = arg1 (both byrefs).
 ; eax is set to 1 only when the 64 bytes at [rdi] equal the 64 bytes at [rsi] AND the
 ; mask test in IG03 finds no set bits; otherwise eax is set to 0.
 G_M46940_IG01:
        ; rbp-based frame setup; no locals are allocated (Lcl frame size = 0).
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M46940_IG02:
        ; Load 8 qwords from [rdi] and compare them lane-by-lane with [rsi].
        ; Predicate 4 is "not equal", so k1 gets one bit per differing qword lane.
        vmovups  zmm0, zmmword ptr [rdi]
        vpcmpq   k1, zmm0, zmmword ptr [rsi], 4
        ; kortestb sets ZF when the 8-bit mask is all zero; any mismatch jumps to the "return 0" path.
        kortestb k1, k1
        jne      SHORT G_M46940_IG04
 						;; size=19 bbWeight=1 PerfScore 12.00
 G_M46940_IG03:
        ; Base: AND each of 32 word lanes with 0xFF80 and check a 32-bit mask (kortestd).
        ; Diff: AND each of 8 qword lanes with a broadcast 0xFFFFFFFFFFFFFF80 ({1to8}) and check an 8-bit mask (kortestb).
        ; NOTE(review): the two masks are not bit-equivalent - the base mask ignores bits 0..6 of every 16-bit word,
        ; the new mask only ignores bits 0..6 of each qword, so the new test is stricter. Confirm this is intended
        ; for the long instantiation.
-       vptestmw k1, zmm0, zmmword ptr [reloc @RWD00]
-       kortestd k1, k1
+       vptestmq k1, zmm0, qword ptr [reloc @RWD00] {1to8}
+       kortestb k1, k1
        ; No lane has a bit in common with the mask -> take the "return 1" path.
        je       SHORT G_M46940_IG06
-						;; size=17 bbWeight=0.50 PerfScore 3.50
+						;; size=16 bbWeight=0.50 PerfScore 3.50
 G_M46940_IG04:
        ; Failure path: result = 0.
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M46940_IG05:
        ; Epilog for the failure path; vzeroupper is emitted because zmm registers were used.
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M46940_IG06:
        ; Success path: result = 1.
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M46940_IG07:
        ; Epilog for the success path (same sequence as IG05).
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 ; Read-only data: the 64-byte constant is replaced by a single qword that the instruction broadcasts.
-RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
+RWD00  	dq	FFFFFFFFFFFFFF80h
 
 
-; Total bytes of code 57, prolog size 4, PerfScore 19.50, instruction count 17, allocated bytes for code 62 (MethodHash=e19948a3) for method System.Text.Ascii+PlainLoader`1[long]:EqualAndAscii512(byref,byref):ubyte (FullOpts)
+; Total bytes of code 56, prolog size 4, PerfScore 19.50, instruction count 17, allocated bytes for code 60 (MethodHash=e19948a3) for method System.Text.Ascii+PlainLoader`1[long]:EqualAndAscii512(byref,byref):ubyte (FullOpts)
-1 (-0.07 % of base) - System.Text.Unicode.Utf8Utility:TranscodeToUtf8(ulong,int,ulong,int,byref,byref):int
 ; Assembly listing for method System.Text.Unicode.Utf8Utility:TranscodeToUtf8(ulong,int,ulong,int,byref,byref):int (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 12 single block inlinees; 14 inlinees without PGO data
+; 0 inlinees with PGO data; 14 single block inlinees; 18 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T01] ( 61,30623.50)    long  ->  rbx        
-;  V01 arg1         [V01,T25] ( 12,    8.50)     int  ->  r13        
+;  V01 arg1         [V01,T26] ( 12,    8.50)     int  ->  r13        
 ;  V02 arg2         [V02,T02] ( 60,29211   )    long  ->  r15        
 ;  V03 arg3         [V03,T03] ( 45,25458.50)     int  ->  r14        
-;  V04 arg4         [V04,T28] (  4,    3   )   byref  ->  r12         single-def
-;  V05 arg5         [V05,T29] (  4,    3   )   byref  ->  [rbp-0x30]  single-def
+;  V04 arg4         [V04,T29] (  4,    3   )   byref  ->  r12         single-def
+;  V05 arg5         [V05,T30] (  4,    3   )   byref  ->  [rbp-0x30]  single-def
 ;  V06 loc0         [V06,T05] ( 10, 6357   )    long  ->  rax        
-;  V07 loc1         [V07,T35] (  2,  256.50)  simd16  ->  mm0         ld-addr-op <System.Runtime.Intrinsics.Vector128`1[short]>
+;* V07 loc1         [V07    ] (  0,    0   )  simd16  ->  zero-ref    ld-addr-op <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V08 loc2         [V08,T00] ( 45,45146.50)     int  ->  registers  
-;  V09 loc3         [V09,T26] ( 12,    6   )     int  ->  rax        
-;  V10 loc4         [V10,T31] (  5,    2.50)     int  ->  rax        
-;  V11 loc5         [V11,T27] (  6,    5   )    long  ->  rax        
+;  V09 loc3         [V09,T27] ( 12,    6   )     int  ->  rax        
+;  V10 loc4         [V10,T32] (  5,    2.50)     int  ->  rax        
+;  V11 loc5         [V11,T28] (  6,    5   )    long  ->  rax        
 ;* V12 loc6         [V12    ] (  0,    0   )     int  ->  zero-ref   
-;  V13 loc7         [V13,T21] (  3,   96   )     int  ->  rdx        
+;  V13 loc7         [V13,T22] (  3,   96   )     int  ->  rdx        
 ;  V14 loc8         [V14,T14] (  3,  320   )     int  ->  rcx        
-;  V15 loc9         [V15,T17] (  8,  226   )    long  ->  rcx        
+;  V15 loc9         [V15,T18] (  8,  226   )    long  ->  rcx        
 ;  V16 loc10        [V16,T13] (  6,  864   )     int  ->  rdi        
-;  V17 loc11        [V17,T34] ( 11, 1158   )  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[short]>
+;  V17 loc11        [V17,T35] ( 11, 1158   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V18 loc12        [V18    ] (  0,    0   )     ref  ->  zero-ref    class-hnd <System.Object>
 ;* V19 loc13        [V19    ] (  0,    0   )     ref  ->  zero-ref    class-hnd <System.Object>
 ;* V20 loc14        [V20    ] (  0,    0   )     ref  ->  zero-ref    class-hnd <System.Object>
 ;* V21 loc15        [V21    ] (  0,    0   )     int  ->  zero-ref   
 ;* V22 loc16        [V22    ] (  0,    0   )     int  ->  zero-ref   
 ;* V23 loc17        [V23    ] (  0,    0   )     int  ->  zero-ref   
 ;# V24 OutArgs      [V24    ] (  1,    1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V25 tmp1         [V25,T33] (  2,    2   )     int  ->  rdx         "Inline return value spill temp"
-;  V26 tmp2         [V26,T22] (  3,   96   )    long  ->  rdx         "Inline return value spill temp"
-;  V27 tmp3         [V27,T18] (  3,  192   )    long  ->  rcx         "Inlining Arg"
-;  V28 tmp4         [V28,T19] (  3,  192   )    long  ->  rdx         "Inlining Arg"
-;* V29 tmp5         [V29,T15] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V30 tmp6         [V30,T16] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V31 tmp7         [V31,T06] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V32 tmp8         [V32,T04] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V33 tmp9         [V33    ] (  0,    0   )     int  ->  zero-ref    "Inlining Arg"
-;* V34 tmp10        [V34    ] (  0,    0   )     int  ->  zero-ref    "impSpillLclRefs"
-;* V35 tmp11        [V35,T23] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V36 tmp12        [V36,T07] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V37 tmp13        [V37,T10] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V38 tmp14        [V38,T11] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V39 tmp15        [V39    ] (  0,    0   )     int  ->  zero-ref    "Inline stloc first use temp"
-;* V40 tmp16        [V40    ] (  0,    0   )     int  ->  zero-ref    "Inline stloc first use temp"
-;* V41 tmp17        [V41,T08] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V42 tmp18        [V42    ] (  0,    0   )     int  ->  zero-ref    "Inline stloc first use temp"
-;* V43 tmp19        [V43    ] (  0,    0   )     int  ->  zero-ref    "Inline stloc first use temp"
-;* V44 tmp20        [V44,T12] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V45 tmp21        [V45,T09] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;* V46 tmp22        [V46,T32] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
-;  V47 tmp23        [V47,T24] (  5,   20   )     int  ->  rcx         "Inlining Arg"
+;  V25 tmp1         [V25,T34] (  2,    2   )     int  ->  rdx         "Inline return value spill temp"
+;  V26 tmp2         [V26,T23] (  3,   96   )    long  ->  rdx         "Inline return value spill temp"
+;  V27 tmp3         [V27,T19] (  3,  192   )    long  ->  rcx         "Inlining Arg"
+;  V28 tmp4         [V28,T20] (  3,  192   )    long  ->  rdx         "Inlining Arg"
+;* V29 tmp5         [V29    ] (  0,    0   )  simd16  ->  zero-ref    "spilled call-like call argument"
+;* V30 tmp6         [V30    ] (  0,    0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V31 tmp7         [V31,T17] (  0,    0   )  ushort  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp"
+;* V32 tmp8         [V32    ] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V33 tmp9         [V33    ] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V34 tmp10        [V34    ] (  0,    0   )  ushort  ->  zero-ref    "Inline stloc first use temp"
+;* V35 tmp11        [V35    ] (  0,    0   )  ushort  ->  zero-ref    "Inlining Arg"
+;* V36 tmp12        [V36    ] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V37 tmp13        [V37,T15] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V38 tmp14        [V38,T16] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V39 tmp15        [V39,T06] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V40 tmp16        [V40,T04] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V41 tmp17        [V41    ] (  0,    0   )     int  ->  zero-ref    "Inlining Arg"
+;* V42 tmp18        [V42    ] (  0,    0   )     int  ->  zero-ref    "impSpillLclRefs"
+;* V43 tmp19        [V43,T24] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V44 tmp20        [V44,T07] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V45 tmp21        [V45,T10] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V46 tmp22        [V46,T11] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V47 tmp23        [V47    ] (  0,    0   )     int  ->  zero-ref    "Inline stloc first use temp"
 ;* V48 tmp24        [V48    ] (  0,    0   )     int  ->  zero-ref    "Inline stloc first use temp"
-;* V49 tmp25        [V49    ] (  0,    0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;* V49 tmp25        [V49,T08] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V50 tmp26        [V50    ] (  0,    0   )     int  ->  zero-ref    "Inline stloc first use temp"
-;  V51 rat0         [V51,T20] (  3,  192   )    long  ->  rcx         "ReplaceWithLclVar is creating a new local variable"
-;  V52 rat1         [V52,T30] (  3,    3   )    long  ->  rax         "ReplaceWithLclVar is creating a new local variable"
+;* V51 tmp27        [V51    ] (  0,    0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;* V52 tmp28        [V52,T12] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V53 tmp29        [V53,T09] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;* V54 tmp30        [V54,T33] (  0,    0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
+;  V55 tmp31        [V55,T25] (  5,   20   )     int  ->  rcx         "Inlining Arg"
+;* V56 tmp32        [V56    ] (  0,    0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;* V57 tmp33        [V57    ] (  0,    0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;* V58 tmp34        [V58    ] (  0,    0   )     int  ->  zero-ref    "Inline stloc first use temp"
+;  V59 rat0         [V59,T21] (  3,  192   )    long  ->  rcx         "ReplaceWithLclVar is creating a new local variable"
+;  V60 rat1         [V60,T31] (  3,    3   )    long  ->  rax         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 8
 
 G_M33313_IG01:
        push     rbp
        push     r15
        push     r14
        push     r13
        push     r12
        push     rbx
        push     rax
        lea      rbp, [rsp+0x30]
        mov      bword ptr [rbp-0x30], r9
        mov      rbx, rdi
        mov      r13d, esi
        mov      r15, rdx
        mov      r14d, ecx
        mov      r12, r8
 						;; size=35 bbWeight=1 PerfScore 9.75
 G_M33313_IG02:
        cmp      r13d, r14d
        mov      edx, r14d
        cmovle   edx, r13d
        mov      rdi, rbx
        mov      rsi, r15
        mov      rcx, 0xD1FFAB1E      ; code for System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong
        call     [rcx]System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong
        lea      rbx, [rbx+2*rax]
        add      r15, rax
        cmp      eax, r13d
        jne      SHORT G_M33313_IG05
 						;; size=40 bbWeight=1 PerfScore 6.50
 G_M33313_IG03:
        mov      qword ptr [r12], rbx
        mov      r12, bword ptr [rbp-0x30]
        mov      qword ptr [r12], r15
        xor      eax, eax
 						;; size=14 bbWeight=0.50 PerfScore 1.62
 G_M33313_IG04:
        add      rsp, 8
        pop      rbx
        pop      r12
        pop      r13
        pop      r14
        pop      r15
        pop      rbp
        ret      
 						;; size=15 bbWeight=0.50 PerfScore 2.12
 G_M33313_IG05:
        sub      r13d, eax
        sub      r14d, eax
        cmp      r13d, 2
        jl       G_M33313_IG17
        mov      eax, r13d
        lea      rax, [rbx+2*rax-0x04]
-       vmovups  xmm0, xmmword ptr [reloc @RWD00]
-						;; size=32 bbWeight=0.50 PerfScore 3.00
+						;; size=24 bbWeight=0.50 PerfScore 1.50
 G_M33313_IG06:
        mov      ecx, dword ptr [rbx]
 						;; size=2 bbWeight=4 PerfScore 8.00
 G_M33313_IG07:
        test     ecx, 0xD1FFAB1E
        jne      G_M33313_IG21
 						;; size=12 bbWeight=256 PerfScore 320.00
 G_M33313_IG08:
        cmp      r14d, 2
        jl       G_M33313_IG33
        mov      edx, ecx
        shr      edx, 8
        or       edx, ecx
        mov      word  ptr [r15], dx
        add      rbx, 4
        add      r15, 2
        add      r14d, -2
        mov      rcx, rax
        sub      rcx, rbx
        mov      rdx, rcx
        shr      rdx, 63
        add      rcx, rdx
        sar      rcx, 1
        add      ecx, 2
        movsxd   rdx, r14d
        cmp      rcx, rdx
        jle      SHORT G_M33313_IG09
        jmp      SHORT G_M33313_IG10
-       align    [0 bytes for IG11]
-						;; size=65 bbWeight=32 PerfScore 312.00
+       align    [3 bytes for IG11]
+						;; size=68 bbWeight=32 PerfScore 312.00
 G_M33313_IG09:
        mov      rdx, rcx
 						;; size=3 bbWeight=32 PerfScore 8.00
 G_M33313_IG10:
        mov      ecx, edx
        shr      ecx, 3
        xor      edi, edi
        test     ecx, ecx
        je       SHORT G_M33313_IG12
 						;; size=11 bbWeight=32 PerfScore 72.00
 G_M33313_IG11:
-       vmovups  xmm1, xmmword ptr [rbx]
-       vptest   xmm1, xmm0
+       vmovups  xmm0, xmmword ptr [rbx]
+       vptest   xmm0, xmmword ptr [reloc @RWD00]
        jne      G_M33313_IG19
-       vpackuswb xmm1, xmm1, xmm1
-       vmovq    qword ptr [r15], xmm1
+       vpackuswb xmm0, xmm0, xmm0
+       vmovq    qword ptr [r15], xmm0
        add      rbx, 16
        add      r15, 8
        inc      edi
        cmp      edi, ecx
        jb       SHORT G_M33313_IG11
-						;; size=38 bbWeight=256 PerfScore 3328.00
+						;; size=42 bbWeight=256 PerfScore 3840.00
 G_M33313_IG12:
        shl      edi, 3
        sub      r14d, edi
        test     dl, 4
        je       SHORT G_M33313_IG15
        mov      rcx, qword ptr [rbx]
        mov      rdx, 0xD1FFAB1E
        test     rcx, rdx
        jne      G_M33313_IG20
 						;; size=33 bbWeight=32 PerfScore 176.00
 G_M33313_IG13:
-       vmovd    xmm1, rcx
-       vpackuswb xmm1, xmm1, xmm1
-       vmovd    dword ptr [r15], xmm1
+       vmovd    xmm0, rcx
+       vpackuswb xmm0, xmm0, xmm0
+       vmovd    dword ptr [r15], xmm0
        add      rbx, 8
 						;; size=18 bbWeight=2 PerfScore 10.50
 G_M33313_IG14:
        add      r15, 4
        add      r14d, -4
 						;; size=8 bbWeight=2 PerfScore 1.00
 G_M33313_IG15:
        cmp      rbx, rax
        jbe      G_M33313_IG06
 						;; size=9 bbWeight=4 PerfScore 5.00
 G_M33313_IG16:
        sub      rax, rbx
        mov      r13, rax
        shr      r13, 63
        add      r13, rax
        sar      r13, 1
        add      r13d, 2
 						;; size=20 bbWeight=0.50 PerfScore 1.00
 G_M33313_IG17:
        test     r13d, r13d
        je       G_M33313_IG45
        movzx    rax, word  ptr [rbx]
 						;; size=12 bbWeight=0.50 PerfScore 1.62
 G_M33313_IG18:
        cmp      eax, 127
        ja       G_M33313_IG42
        test     r14d, r14d
        je       G_M33313_IG36
        mov      byte  ptr [r15], al
        add      rbx, 2
        inc      r15
        jmp      G_M33313_IG44
        align    [0 bytes for IG24]
 						;; size=33 bbWeight=0.50 PerfScore 3.00
 G_M33313_IG19:
        lea      ecx, [8*rdi]
        sub      r14d, ecx
-       vmovd    rcx, xmm1
+       vmovd    rcx, xmm0
        mov      rdx, 0xD1FFAB1E
        test     rcx, rdx
        jne      SHORT G_M33313_IG20
-       vpackuswb xmm2, xmm1, xmm1
-       vmovd    dword ptr [r15], xmm2
+       vpackuswb xmm1, xmm0, xmm0
+       vmovd    dword ptr [r15], xmm1
        add      rbx, 8
        add      r15, 4
        add      r14d, -4
-       vpextrq  rcx, xmm1, 1
+       vpextrq  rcx, xmm0, 1
 						;; size=57 bbWeight=32 PerfScore 352.00
 G_M33313_IG20:
        mov      edx, ecx
        test     edx, 0xD1FFAB1E
        jne      G_M33313_IG31
        mov      edi, edx
        shr      edi, 8
        or       edi, edx
        mov      word  ptr [r15], di
        add      rbx, 4
        add      r15, 2
        add      r14d, -2
        shr      rcx, 32
        mov      edx, ecx
        mov      ecx, edx
 						;; size=45 bbWeight=32 PerfScore 168.00
 G_M33313_IG21:
        test     ecx, 0xFF80
        jne      SHORT G_M33313_IG23
 						;; size=8 bbWeight=1024 PerfScore 1280.00
 G_M33313_IG22:
        test     r14d, r14d
        je       G_M33313_IG36
        mov      byte  ptr [r15], cl
        add      rbx, 2
        inc      r15
        dec      r14d
        cmp      rbx, rax
        ja       G_M33313_IG16
        mov      ecx, dword ptr [rbx]
 						;; size=33 bbWeight=128 PerfScore 800.00
 G_M33313_IG23:
        test     ecx, 0xF800
        jne      SHORT G_M33313_IG27
 						;; size=8 bbWeight=1024 PerfScore 1280.00
 G_M33313_IG24:
        lea      edx, [rcx+D1FFAB1EH]
        cmp      edx, 0xD1FFAB1E
        jbe      G_M33313_IG32
 						;; size=18 bbWeight=2048 PerfScore 3584.00
 G_M33313_IG25:
        cmp      r14d, 2
        jl       G_M33313_IG36
        lea      edx, [4*rcx]
        and      edx, 0x1F00
        mov      edi, ecx
        and      edi, 63
        lea      edx, [rdx+rdi+0xC080]
        movzx    rdx, dx
        movbe    word  ptr [r15], dx
        cmp      ecx, 0xD1FFAB1E
        jb       G_M33313_IG34
        add      rbx, 2
        add      r15, 2
        add      r14d, -2
        cmp      rbx, rax
        ja       G_M33313_IG16
 						;; size=77 bbWeight=32 PerfScore 256.00
 G_M33313_IG26:
        mov      ecx, dword ptr [rbx]
        align    [0 bytes for IG27]
 						;; size=2 bbWeight=256 PerfScore 512.00
 G_M33313_IG27:
        lea      edx, [rcx-0xD800]
        test     edx, 0xF800
        je       G_M33313_IG37
 						;; size=18 bbWeight=1024 PerfScore 1792.00
 G_M33313_IG28:
        test     ecx, 0xD1FFAB1E
        jne      G_M33313_IG40
 						;; size=12 bbWeight=2048 PerfScore 2560.00
 G_M33313_IG29:
        cmp      r14d, 3
        jl       G_M33313_IG36
        lea      edx, [4*rcx]
        and      edx, 0x3F00
        movzx    rdi, cx
        shr      edi, 12
        add      edx, edi
        add      edx, 0x80E0
        mov      word  ptr [r15], dx
        mov      edx, ecx
        and      edx, 63
        or       edx, -128
        mov      byte  ptr [r15+0x02], dl
        add      rbx, 2
        add      r15, 3
        add      r14d, -3
        cmp      ecx, 0xD1FFAB1E
        jb       G_M33313_IG41
 						;; size=77 bbWeight=2048 PerfScore 16384.00
 G_M33313_IG30:
        cmp      rbx, rax
        ja       G_M33313_IG16
        mov      ecx, dword ptr [rbx]
        jmp      G_M33313_IG21
 						;; size=16 bbWeight=16 PerfScore 84.00
 G_M33313_IG31:
        mov      ecx, edx
        jmp      G_M33313_IG21
 						;; size=7 bbWeight=16 PerfScore 36.00
 G_M33313_IG32:
        cmp      r14d, 4
        jl       SHORT G_M33313_IG33
        mov      edx, ecx
        shr      edx, 6
        and      edx, 0xD1FFAB1E
        shl      ecx, 8
        and      ecx, 0xD1FFAB1E
        add      ecx, edx
        add      ecx, 0xD1FFAB1E
        mov      dword ptr [r15], ecx
        add      rbx, 4
        add      r15, 4
        add      r14d, -4
        cmp      rbx, rax
        ja       G_M33313_IG16
        mov      ecx, dword ptr [rbx]
        lea      edx, [rcx-0x80]
        movzx    rdx, dx
        cmp      edx, 0x780
        jb       G_M33313_IG24
        jmp      G_M33313_IG07
 						;; size=83 bbWeight=4096 PerfScore 51200.00
 G_M33313_IG33:
        movzx    rax, cx
        jmp      G_M33313_IG18
 						;; size=8 bbWeight=0.50 PerfScore 1.12
 G_M33313_IG34:
        cmp      r14d, 3
        jl       SHORT G_M33313_IG35
        shr      ecx, 16
        mov      byte  ptr [r15+0x02], cl
        add      rbx, 4
        add      r15, 3
        add      r14d, -3
        jmp      G_M33313_IG15
 						;; size=30 bbWeight=2 PerfScore 11.00
 G_M33313_IG35:
        add      rbx, 2
        add      r15, 2
 						;; size=8 bbWeight=0.50 PerfScore 0.25
 G_M33313_IG36:
        mov      eax, 1
        jmp      G_M33313_IG47
 						;; size=10 bbWeight=0.50 PerfScore 1.12
 G_M33313_IG37:
        lea      edx, [rcx+D1FFAB1EH]
        test     edx, 0xD1FFAB1E
        je       SHORT G_M33313_IG39
 						;; size=14 bbWeight=2 PerfScore 3.50
 G_M33313_IG38:
        mov      eax, 3
        jmp      G_M33313_IG47
 						;; size=10 bbWeight=0.50 PerfScore 1.12
 G_M33313_IG39:
        cmp      r14d, 4
        jl       SHORT G_M33313_IG36
        add      ecx, 64
        mov      edx, ecx
        and      edx, 3
        shl      edx, 20
        or       edx, 0xD1FFAB1E
        mov      edi, ecx
        and      edi, 0xD1FFAB1E
        bswap    edi
        rol      edi, 16
        or       edx, edi
        mov      edi, ecx
        shr      edi, 6
        and      edi, 0xD1FFAB1E
        or       edx, edi
        and      ecx, 252
        shl      ecx, 6
        or       ecx, edx
        mov      dword ptr [r15], ecx
        add      rbx, 4
        jmp      G_M33313_IG14
 						;; size=74 bbWeight=2 PerfScore 20.00
 G_M33313_IG40:
        lea      edx, [rcx+D1FFAB1EH]
        cmp      edx, 0xD1FFAB1E
        jb       G_M33313_IG29
        cmp      r14d, 6
        jl       G_M33313_IG29
        lea      edx, [4*rcx]
        and      edx, 0x3F00
        mov      edi, ecx
        and      edi, 63
        shl      edi, 16
        or       edx, edi
        mov      edi, ecx
        shr      edi, 4
        and      edi, 0xD1FFAB1E
        mov      esi, ecx
        shr      esi, 12
        and      esi, 15
        or       edi, esi
        add      edx, edi
        add      edx, 0xD1FFAB1E
        mov      dword ptr [r15], edx
        mov      edx, ecx
        shr      edx, 22
        and      edx, 63
        shr      ecx, 8
        and      ecx, 0x3F00
        add      ecx, edx
        add      ecx, 0x8080
        mov      word  ptr [r15+0x04], cx
        add      rbx, 4
        add      r15, 6
        add      r14d, -6
        cmp      rbx, rax
        ja       G_M33313_IG16
        mov      ecx, dword ptr [rbx]
        test     ecx, 0xF800
        jne      G_M33313_IG27
        jmp      G_M33313_IG07
 						;; size=153 bbWeight=1024 PerfScore 19712.00
 G_M33313_IG41:
        test     r14d, r14d
        je       G_M33313_IG36
        shr      ecx, 16
        mov      byte  ptr [r15], cl
        add      rbx, 2
        inc      r15
        dec      r14d
        cmp      rbx, rax
        ja       G_M33313_IG16
        mov      ecx, dword ptr [rbx]
        test     ecx, 0xF800
        jne      G_M33313_IG27
        jmp      G_M33313_IG07
 						;; size=53 bbWeight=1024 PerfScore 10240.00
 G_M33313_IG42:
        cmp      eax, 0x800
        jae      SHORT G_M33313_IG43
        cmp      r14d, 2
        jl       G_M33313_IG36
        mov      ecx, eax
        and      ecx, 63
        or       ecx, -128
        mov      byte  ptr [r15+0x01], cl
        shr      eax, 6
        or       eax, -64
        mov      byte  ptr [r15], al
        add      rbx, 2
        add      r15, 2
        jmp      SHORT G_M33313_IG44
 						;; size=48 bbWeight=0.50 PerfScore 4.25
 G_M33313_IG43:
        lea      ecx, [rax-0xD800]
        cmp      ecx, 0x7FF
        jbe      SHORT G_M33313_IG46
        cmp      r14d, 3
        jl       G_M33313_IG36
        mov      ecx, eax
        and      ecx, 63
        or       ecx, -128
        mov      byte  ptr [r15+0x02], cl
        mov      ecx, eax
        shr      ecx, 6
        and      ecx, 63
        or       ecx, -128
        mov      byte  ptr [r15+0x01], cl
        shr      eax, 12
        or       eax, -32
        mov      byte  ptr [r15], al
        add      rbx, 2
        add      r15, 3
 						;; size=68 bbWeight=0.50 PerfScore 4.62
 G_M33313_IG44:
        cmp      r13d, 1
        jg       G_M33313_IG36
 						;; size=10 bbWeight=0.50 PerfScore 0.62
 G_M33313_IG45:
        xor      eax, eax
        jmp      SHORT G_M33313_IG47
 						;; size=4 bbWeight=0.50 PerfScore 1.12
 G_M33313_IG46:
        cmp      eax, 0xDBFF
        ja       G_M33313_IG38
        mov      eax, 2
 						;; size=16 bbWeight=0.50 PerfScore 0.75
 G_M33313_IG47:
        mov      qword ptr [r12], rbx
        mov      r12, bword ptr [rbp-0x30]
        mov      qword ptr [r12], r15
 						;; size=12 bbWeight=0.50 PerfScore 1.50
 G_M33313_IG48:
        add      rsp, 8
        pop      rbx
        pop      r12
        pop      r13
        pop      r14
        pop      r15
        pop      rbp
        ret      
 						;; size=15 bbWeight=0.50 PerfScore 2.12
 RWD00  	dq	FF80FF80FF80FF80h, FF80FF80FF80FF80h
 
 
-; Total bytes of code 1394, prolog size 35, PerfScore 114562.25, instruction count 370, allocated bytes for code 1394 (MethodHash=85587dde) for method System.Text.Unicode.Utf8Utility:TranscodeToUtf8(ulong,int,ulong,int,byref,byref):int (FullOpts)
+; Total bytes of code 1393, prolog size 35, PerfScore 115072.75, instruction count 369, allocated bytes for code 1393 (MethodHash=85587dde) for method System.Text.Unicode.Utf8Utility:TranscodeToUtf8(ulong,int,ulong,int,byref,byref):int (FullOpts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment