forked from MihaZupan/runtime-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[JitDiff X64] [xtqqczze] Replace use of target dependent TestZ intrinsic
#501
Comments
Top method regressions
30 (3.53 % of base) - System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong
; Assembly listing for method System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 21 single block inlinees; 25 inlinees without PGO data
+; 0 inlinees with PGO data; 25 single block inlinees; 21 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T06] ( 9, 9 ) long -> rdi single-def
; V01 arg1 [V01,T04] ( 15, 12 ) long -> rsi single-def
; V02 arg2 [V02,T11] ( 9, 6 ) long -> rdx single-def
; V03 loc0 [V03,T00] ( 23, 30 ) long -> rax
; V04 loc1 [V04,T12] ( 13, 6.50) int -> r8
;* V05 loc2 [V05 ] ( 0, 0 ) int -> zero-ref
; V06 loc3 [V06,T05] ( 7, 14 ) long -> rcx
; V07 loc4 [V07,T22] ( 5, 2.50) long -> rdx
; V08 loc5 [V08,T16] ( 2, 4.50) long -> r8
;# V09 OutArgs [V09 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V10 tmp1 [V10,T23] ( 3, 1.50) long -> rax "Inline return value spill temp"
; V11 tmp2 [V11,T07] ( 5, 9.50) byref -> rcx single-def "Inline stloc first use temp"
-; V12 tmp3 [V12,T30] ( 14, 17.50) simd64 -> mm0 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+; V12 tmp3 [V12,T32] ( 14, 17.50) simd64 -> mm0 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
; V13 tmp4 [V13,T13] ( 5, 6 ) byref -> rax single-def "Inline stloc first use temp"
;* V14 tmp5 [V14 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
; V15 tmp6 [V15,T01] ( 12, 27 ) long -> r8 "Inline stloc first use temp"
; V16 tmp7 [V16,T17] ( 2, 4.50) long -> r9 "Inline stloc first use temp"
-; V17 tmp8 [V17,T36] ( 3, 12 ) simd64 -> mm3 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+; V17 tmp8 [V17,T38] ( 3, 12 ) simd64 -> mm3 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V18 tmp9 [V18 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
-; V19 tmp10 [V19,T33] ( 2, 16 ) simd64 -> mm0 "Spilling op1 side effects for HWIntrinsic"
+; V19 tmp10 [V19,T35] ( 2, 16 ) simd64 -> mm0 "Spilling op1 side effects for HWIntrinsic"
;* V20 tmp11 [V20 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
;* V21 tmp12 [V21 ] ( 0, 0 ) simd64 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V22 tmp13 [V22 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V23 tmp14 [V23 ] ( 0, 0 ) simd64 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V24 tmp15 [V24 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V25 tmp16 [V25 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V26 tmp17 [V26 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V27 tmp18 [V27 ] ( 0, 0 ) simd64 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V28 tmp19 [V28 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V29 tmp20 [V29,T24] ( 3, 1.50) long -> rax "Inline return value spill temp"
; V30 tmp21 [V30,T08] ( 5, 9.50) byref -> rcx single-def "Inline stloc first use temp"
-; V31 tmp22 [V31,T31] ( 14, 17.50) simd32 -> mm0 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V31 tmp22 [V31,T33] ( 14, 17.50) simd32 -> mm0 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
; V32 tmp23 [V32,T14] ( 5, 6 ) byref -> rax single-def "Inline stloc first use temp"
;* V33 tmp24 [V33 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V34 tmp25 [V34,T02] ( 12, 27 ) long -> r8 "Inline stloc first use temp"
; V35 tmp26 [V35,T18] ( 2, 4.50) long -> r9 "Inline stloc first use temp"
-; V36 tmp27 [V36,T37] ( 3, 12 ) simd32 -> mm2 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V36 tmp27 [V36,T39] ( 3, 12 ) simd32 -> mm2 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
;* V37 tmp28 [V37 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
-; V38 tmp29 [V38,T34] ( 2, 16 ) simd32 -> mm0 "Spilling op1 side effects for HWIntrinsic"
+; V38 tmp29 [V38,T36] ( 2, 16 ) simd32 -> mm0 "Spilling op1 side effects for HWIntrinsic"
;* V39 tmp30 [V39 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
-;* V40 tmp31 [V40 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V41 tmp32 [V41 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V40 tmp31 [V40 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+;* V41 tmp32 [V41 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V42 tmp33 [V42 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V43 tmp34 [V43 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V44 tmp35 [V44 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V45 tmp36 [V45 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V44 tmp35 [V44 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V45 tmp36 [V45 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V46 tmp37 [V46 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V47 tmp38 [V47 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V48 tmp39 [V48 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V49 tmp40 [V49 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V50 tmp41 [V50 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V51 tmp42 [V51 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V52 tmp43 [V52 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V53 tmp44 [V53 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V54 tmp45 [V54 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V55 tmp46 [V55 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-; V56 tmp47 [V56,T25] ( 3, 1.50) long -> rax "Inline return value spill temp"
-;* V57 tmp48 [V57,T27] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
-;* V58 tmp49 [V58 ] ( 0, 0 ) long -> zero-ref "Inline stloc first use temp"
-; V59 tmp50 [V59,T09] ( 5, 9.50) byref -> rcx single-def "Inline stloc first use temp"
-; V60 tmp51 [V60,T32] ( 14, 17.50) simd16 -> mm0 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-; V61 tmp52 [V61,T15] ( 5, 6 ) byref -> r8 single-def "Inline stloc first use temp"
-;* V62 tmp53 [V62 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
-; V63 tmp54 [V63,T03] ( 11, 26.50) long -> rax "Inline stloc first use temp"
-; V64 tmp55 [V64,T19] ( 2, 4.50) long -> r9 "Inline stloc first use temp"
-; V65 tmp56 [V65,T38] ( 3, 12 ) simd16 -> mm2 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V66 tmp57 [V66 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
-; V67 tmp58 [V67,T35] ( 2, 16 ) simd16 -> mm0 "Spilling op1 side effects for HWIntrinsic"
-;* V68 tmp59 [V68 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
-;* V69 tmp60 [V69 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V70 tmp61 [V70 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V71 tmp62 [V71 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V72 tmp63 [V72 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V73 tmp64 [V73 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V74 tmp65 [V74 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V75 tmp66 [V75 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V76 tmp67 [V76 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V77 tmp68 [V77 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V78 tmp69 [V78 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V79 tmp70 [V79 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V80 tmp71 [V80 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V81 tmp72 [V81 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
-; V82 tmp73 [V82,T28] ( 3, 24 ) simd16 -> mm1 "dup spill"
-;* V83 tmp74 [V83 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[uint]>
-;* V84 tmp75 [V84 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
-; V85 tmp76 [V85,T20] ( 3, 3 ) byref -> rcx single-def "Inlining Arg"
-; V86 tmp77 [V86,T21] ( 3, 3 ) byref -> rdx "Inlining Arg"
-;* V87 tmp78 [V87,T26] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V88 cse0 [V88,T10] ( 3, 8.50) long -> r10 "CSE #05: conservative"
-; V89 cse1 [V89,T39] ( 5, 6 ) simd64 -> mm1 "CSE #01: conservative"
-; V90 cse2 [V90,T40] ( 5, 6 ) simd32 -> mm1 "CSE #03: conservative"
-; V91 cse3 [V91,T41] ( 5, 6 ) simd16 -> mm1 "CSE #04: conservative"
-; V92 cse4 [V92,T42] ( 5, 6 ) simd64 -> mm2 "CSE #02: conservative"
-; V93 rat0 [V93,T29] ( 3, 24 ) simd64 -> mm4 "ReplaceWithLclVar is creating a new local variable"
+; V48 tmp39 [V48,T25] ( 3, 1.50) long -> rax "Inline return value spill temp"
+;* V49 tmp40 [V49,T27] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
+;* V50 tmp41 [V50 ] ( 0, 0 ) long -> zero-ref "Inline stloc first use temp"
+; V51 tmp42 [V51,T09] ( 5, 9.50) byref -> rcx single-def "Inline stloc first use temp"
+; V52 tmp43 [V52,T34] ( 14, 17.50) simd16 -> mm0 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V53 tmp44 [V53,T15] ( 5, 6 ) byref -> r8 single-def "Inline stloc first use temp"
+;* V54 tmp45 [V54 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+; V55 tmp46 [V55,T03] ( 11, 26.50) long -> rax "Inline stloc first use temp"
+; V56 tmp47 [V56,T19] ( 2, 4.50) long -> r9 "Inline stloc first use temp"
+; V57 tmp48 [V57,T40] ( 3, 12 ) simd16 -> mm2 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V58 tmp49 [V58 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+; V59 tmp50 [V59,T37] ( 2, 16 ) simd16 -> mm0 "Spilling op1 side effects for HWIntrinsic"
+;* V60 tmp51 [V60 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+;* V61 tmp52 [V61 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V62 tmp53 [V62 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V63 tmp54 [V63 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V64 tmp55 [V64 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V65 tmp56 [V65 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V66 tmp57 [V66 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V67 tmp58 [V67 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V68 tmp59 [V68 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V69 tmp60 [V69 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
+; V70 tmp61 [V70,T28] ( 3, 24 ) simd16 -> mm1 "dup spill"
+;* V71 tmp62 [V71 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[uint]>
+;* V72 tmp63 [V72 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
+; V73 tmp64 [V73,T20] ( 3, 3 ) byref -> rcx single-def "Inlining Arg"
+; V74 tmp65 [V74,T21] ( 3, 3 ) byref -> rdx "Inlining Arg"
+;* V75 tmp66 [V75,T26] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V76 cse0 [V76,T10] ( 3, 8.50) long -> r10 "CSE #05: conservative"
+; V77 cse1 [V77,T41] ( 5, 6 ) simd64 -> mm1 "CSE #01: conservative"
+; V78 cse2 [V78,T42] ( 5, 6 ) simd32 -> mm1 "CSE #03: conservative"
+; V79 cse3 [V79,T43] ( 5, 6 ) simd16 -> mm1 "CSE #04: conservative"
+; V80 cse4 [V80,T44] ( 5, 6 ) simd64 -> mm2 "CSE #02: conservative"
+; V81 rat0 [V81,T29] ( 3, 24 ) simd16 -> mm3 "ReplaceWithLclVar is creating a new local variable"
+; V82 rat1 [V82,T30] ( 3, 24 ) simd32 -> mm3 "ReplaceWithLclVar is creating a new local variable"
+; V83 rat2 [V83,T31] ( 3, 24 ) simd64 -> mm4 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M6063_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M6063_IG02:
xor eax, eax
cmp rdx, 32
jb G_M6063_IG21
;; size=12 bbWeight=1 PerfScore 1.50
G_M6063_IG03:
mov rcx, qword ptr [rdi]
mov r8, 0xD1FFAB1E
test rcx, r8
jne G_M6063_IG27
cmp rdx, 128
jb SHORT G_M6063_IG04
mov rcx, rdi
vmovups zmm0, zmmword ptr [rcx]
vmovups zmm1, zmmword ptr [reloc @RWD00]
vptestmw k1, zmm1, zmm0
kortestd k1, k1
;; NOP compensation instructions of 3 bytes.
je G_M6063_IG17
xor eax, eax
jmp G_M6063_IG21
align [3 bytes for IG08]
;; size=80 bbWeight=0.50 PerfScore 9.62
G_M6063_IG04:
cmp rdx, 64
jb SHORT G_M6063_IG05
mov rcx, rdi
vmovups ymm0, ymmword ptr [rcx]
vmovups ymm1, ymmword ptr [reloc @RWD00]
- vptest ymm0, ymm1
+ vptest ymm1, ymm0
je G_M6063_IG11
xor eax, eax
jmp G_M6063_IG15
;; size=39 bbWeight=0.50 PerfScore 9.38
G_M6063_IG05:
mov rcx, rdi
vmovups xmm0, xmmword ptr [rcx]
vmovups xmm1, xmmword ptr [reloc @RWD00]
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
je SHORT G_M6063_IG06
xor eax, eax
jmp SHORT G_M6063_IG09
;; size=26 bbWeight=0.50 PerfScore 6.75
G_M6063_IG06:
mov r8, rsi
vpackuswb xmm0, xmm0, xmm0
vmovsd qword ptr [r8], xmm0
mov eax, 8
test sil, 8
jne SHORT G_M6063_IG07
vmovups xmm0, xmmword ptr [rcx+0x10]
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
jne SHORT G_M6063_IG09
vpackuswb xmm0, xmm0, xmm0
vmovsd qword ptr [r8+0x08], xmm0
;; size=45 bbWeight=0.50 PerfScore 7.88
G_M6063_IG07:
mov rax, rsi
and rax, 15
neg rax
add rax, 16
lea r9, [rdx-0x10]
;; size=18 bbWeight=0.50 PerfScore 0.75
G_M6063_IG08:
vmovups xmm0, xmmword ptr [rcx+2*rax]
lea r10, [rax+0x08]
vmovups xmm2, xmmword ptr [rcx+2*r10]
- vpor xmm3, xmm0, xmm2
- vptest xmm3, xmm1
+ vmovaps xmm3, xmm0
+ vpternlogd xmm3, xmm2, xmm1, -88
+ vptest xmm3, xmm3
jne SHORT G_M6063_IG10
vpackuswb xmm0, xmm0, xmm2
vmovups xmmword ptr [r8+rax], xmm0
add rax, 16
cmp rax, r9
jbe SHORT G_M6063_IG08
- ;; size=45 bbWeight=4 PerfScore 69.33
+ ;; size=52 bbWeight=4 PerfScore 71.00
G_M6063_IG09:
jmp G_M6063_IG21
- align [0 bytes for IG13]
- ;; size=5 bbWeight=0.50 PerfScore 1.00
+ align [7 bytes for IG13]
+ ;; size=12 bbWeight=0.50 PerfScore 1.00
G_M6063_IG10:
- vptest xmm0, xmm1
+ vptest xmm1, xmm0
jne SHORT G_M6063_IG09
vpackuswb xmm0, xmm0, xmm0
vmovsd qword ptr [r8+rax], xmm0
mov rax, r10
jmp SHORT G_M6063_IG09
;; size=22 bbWeight=0.50 PerfScore 4.62
G_M6063_IG11:
mov rax, rsi
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, -40
vmovups xmmword ptr [rax], xmm0
mov r8d, 16
test sil, 16
jne SHORT G_M6063_IG12
vmovups ymm0, ymmword ptr [rcx+0x20]
- vptest ymm0, ymm1
+ vptest ymm1, ymm0
jne SHORT G_M6063_IG14
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, -40
vmovups xmmword ptr [rax+0x10], xmm0
;; size=56 bbWeight=0.50 PerfScore 11.38
G_M6063_IG12:
mov r8, rsi
and r8, 31
neg r8
add r8, 32
lea r9, [rdx-0x20]
;; size=18 bbWeight=0.50 PerfScore 0.75
G_M6063_IG13:
vmovups ymm0, ymmword ptr [rcx+2*r8]
vmovups ymm2, ymmword ptr [rcx+2*r8+0x20]
- vpor ymm3, ymm0, ymm2
- vptest ymm3, ymm1
+ vmovaps ymm3, ymm0
+ vpternlogd ymm3, ymm2, ymm1, -88
+ vptest ymm3, ymm3
jne SHORT G_M6063_IG16
vpackuswb ymm0, ymm0, ymm2
vpermq ymm0, ymm0, -40
vmovups ymmword ptr [rax+r8], ymm0
add r8, 32
cmp r8, r9
jbe SHORT G_M6063_IG13
- ;; size=49 bbWeight=4 PerfScore 91.33
+ ;; size=56 bbWeight=4 PerfScore 93.00
G_M6063_IG14:
mov rax, r8
;; size=3 bbWeight=0.50 PerfScore 0.12
G_M6063_IG15:
jmp G_M6063_IG21
align [0 bytes for IG19]
;; size=5 bbWeight=0.50 PerfScore 1.00
G_M6063_IG16:
- vptest ymm0, ymm1
+ vptest ymm1, ymm0
jne SHORT G_M6063_IG14
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, -40
vmovups xmmword ptr [rax+r8], xmm0
add r8, 16
jmp SHORT G_M6063_IG14
;; size=29 bbWeight=0.50 PerfScore 6.62
G_M6063_IG17:
mov rax, rsi
vpackuswb zmm0, zmm0, zmm0
vmovups zmm2, zmmword ptr [reloc @RWD64]
vpermq zmm0, zmm2, zmm0
vmovups ymmword ptr [rax], ymm0
mov r8d, 32
test sil, 32
jne SHORT G_M6063_IG18
vmovups zmm0, zmmword ptr [rcx+0x40]
vptestmw k1, zmm1, zmm0
kortestd k1, k1
+ ;; NOP compensation instructions of 3 bytes.
jne SHORT G_M6063_IG20
vpackuswb zmm0, zmm0, zmm0
vpermq zmm0, zmm2, zmm0
vmovups ymmword ptr [rax+0x20], ymm0
- ;; size=78 bbWeight=0.50 PerfScore 11.88
+ ;; size=81 bbWeight=0.50 PerfScore 11.88
G_M6063_IG18:
mov r8, rsi
and r8, 63
neg r8
add r8, 64
lea r9, [rdx-0x40]
;; size=18 bbWeight=0.50 PerfScore 0.75
G_M6063_IG19:
vmovups zmm0, zmmword ptr [rcx+2*r8]
vmovups zmm3, zmmword ptr [rcx+2*r8+0x40]
vmovaps zmm4, zmm0
vpternlogd zmm4, zmm3, zmm1, -88
vptestmw k1, zmm4, zmm4
kortestd k1, k1
+ ;; NOP compensation instructions of 3 bytes.
jne G_M6063_IG26
vpackuswb zmm0, zmm0, zmm3
vpermq zmm0, zmm2, zmm0
vmovups zmmword ptr [rax+r8], zmm0
add r8, 64
cmp r8, r9
jbe SHORT G_M6063_IG19
- ;; size=73 bbWeight=4 PerfScore 81.00
+ ;; size=76 bbWeight=4 PerfScore 81.00
G_M6063_IG20:
mov rax, r8
;; size=3 bbWeight=0.50 PerfScore 0.12
G_M6063_IG21:
sub rdx, rax
cmp rdx, 4
jb SHORT G_M6063_IG23
lea r8, [rax+rdx-0x04]
- align [0 bytes for IG22]
- ;; size=14 bbWeight=0.50 PerfScore 1.25
+ align [3 bytes for IG22]
+ ;; size=17 bbWeight=0.50 PerfScore 1.38
G_M6063_IG22:
mov rcx, qword ptr [rdi+2*rax]
mov r9, 0xD1FFAB1E
test rcx, r9
jne G_M6063_IG27
vmovd xmm1, rcx
vpackuswb xmm2, xmm1, xmm1
vmovd dword ptr [rsi+rax], xmm2
add rax, 4
cmp rax, r8
jbe SHORT G_M6063_IG22
;; size=46 bbWeight=4 PerfScore 40.00
G_M6063_IG23:
test dl, 2
je SHORT G_M6063_IG24
mov r8d, dword ptr [rdi+2*rax]
test r8d, 0xD1FFAB1E
jne G_M6063_IG28
lea rcx, [rsi+rax]
mov byte ptr [rcx], r8b
shr r8d, 16
mov byte ptr [rcx+0x01], r8b
add rax, 2
;; size=41 bbWeight=0.50 PerfScore 3.88
G_M6063_IG24:
test dl, 1
je SHORT G_M6063_IG29
movzx r8, word ptr [rdi+2*rax]
cmp r8d, 127
ja SHORT G_M6063_IG29
;; size=16 bbWeight=0.50 PerfScore 2.25
G_M6063_IG25:
mov byte ptr [rsi+rax], r8b
inc rax
jmp SHORT G_M6063_IG29
;; size=9 bbWeight=0.50 PerfScore 1.62
G_M6063_IG26:
vptestmw k1, zmm1, zmm0
kortestd k1, k1
jne G_M6063_IG20
vpackuswb zmm0, zmm0, zmm0
vpermq zmm0, zmm2, zmm0
vmovups ymmword ptr [rax+r8], ymm0
add r8, 32
jmp G_M6063_IG20
;; size=44 bbWeight=0.50 PerfScore 6.12
G_M6063_IG27:
mov r8d, ecx
test r8d, 0xD1FFAB1E
jne SHORT G_M6063_IG28
lea rdx, [rsi+rax]
mov byte ptr [rdx], r8b
shr r8d, 16
mov byte ptr [rdx+0x01], r8b
shr rcx, 32
mov r8d, ecx
add rax, 2
;; size=38 bbWeight=0.50 PerfScore 2.75
G_M6063_IG28:
test r8d, 0xFF80
je SHORT G_M6063_IG25
;; size=9 bbWeight=0.50 PerfScore 0.62
G_M6063_IG29:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=1 PerfScore 2.50
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
RWD64 dq 0000000000000000h, 0000000000000002h, 0000000000000004h, 0000000000000006h, 0000000000000001h, 0000000000000003h, 0000000000000005h, 0000000000000007h
-; Total bytes of code 850, prolog size 4, PerfScore 378.04, instruction count 200, allocated bytes for code 859 (MethodHash=53fae850) for method System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 880, prolog size 4, PerfScore 381.50, instruction count 202, allocated bytes for code 883 (MethodHash=53fae850) for method System.Text.Ascii:NarrowUtf16ToAscii(ulong,ulong,ulong):ulong (FullOpts)
14 (4.02 % of base) - System.Text.Ascii:IsValidCore[short](byref,int):ubyte
; Assembly listing for method System.Text.Ascii:IsValidCore[short](byref,int):ubyte (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 2 single block inlinees; 6 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T01] ( 18, 17 ) byref -> rdi
; V01 arg1 [V01,T04] ( 11, 7 ) int -> rsi single-def
; V02 loc0 [V02,T08] ( 5, 2.50) byref -> rcx single-def
;* V03 loc1 [V03 ] ( 0, 0 ) int -> zero-ref
;* V04 loc2 [V04 ] ( 0, 0 ) long -> zero-ref
; V05 loc3 [V05,T03] ( 5, 16.50) long -> rax
;* V06 loc4 [V06 ] ( 0, 0 ) long -> zero-ref
; V07 loc5 [V07,T02] ( 6, 17 ) long -> rdx
;* V08 loc6 [V08 ] ( 0, 0 ) long -> zero-ref
; V09 loc7 [V09,T05] ( 4, 5.50) long -> rsi
; V10 loc8 [V10,T00] ( 5, 20 ) byref -> rax
;* V11 loc9 [V11 ] ( 0, 0 ) long -> zero-ref
;* V12 loc10 [V12 ] ( 0, 0 ) long -> zero-ref
;* V13 loc11 [V13 ] ( 0, 0 ) long -> zero-ref
;* V14 loc12 [V14 ] ( 0, 0 ) byref -> zero-ref
;# V15 OutArgs [V15 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V16 tmp1 [V16 ] ( 0, 0 ) int -> zero-ref
; V17 tmp2 [V17,T10] ( 2, 1 ) ubyte -> rdx "Inline return value spill temp"
;* V18 tmp3 [V18 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
;* V19 tmp4 [V19 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V20 tmp5 [V20 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[short]>
;* V21 tmp6 [V21 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V22 tmp7 [V22 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
;* V23 tmp8 [V23 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V24 tmp9 [V24,T13] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
+; V24 tmp9 [V24,T15] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
;* V25 tmp10 [V25 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
; V26 tmp11 [V26,T11] ( 2, 16 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
;* V27 tmp12 [V27 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V28 tmp13 [V28,T14] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
+; V28 tmp13 [V28,T16] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
; V29 tmp14 [V29,T06] ( 5, 5 ) int -> rdx "Single return block return value"
; V30 tmp15 [V30,T09] ( 2, 2 ) long -> rax "Cast away GC"
-; V31 cse0 [V31,T12] ( 7, 7 ) simd32 -> mm1 multi-def "CSE #04: aggressive"
+; V31 cse0 [V31,T12] ( 7, 7 ) simd32 -> mm2 multi-def "CSE #04: aggressive"
; V32 cse1 [V32,T07] ( 3, 5 ) long -> rsi "CSE #01: aggressive"
+; V33 rat0 [V33,T13] ( 3, 3 ) simd16 -> mm0 "ReplaceWithLclVar is creating a new local variable"
+; V34 rat1 [V34,T14] ( 3, 3 ) simd32 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M12635_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M12635_IG02:
cmp esi, 8
jge SHORT G_M12635_IG07
;; size=5 bbWeight=1 PerfScore 1.25
G_M12635_IG03:
movsxd rax, esi
cmp rax, 4
jge G_M12635_IG16
xor eax, eax
mov esi, esi
test rsi, rsi
je SHORT G_M12635_IG05
align [1 bytes for IG04]
;; size=23 bbWeight=0.50 PerfScore 1.75
G_M12635_IG04:
cmp word ptr [rdi+2*rax], 127
ja G_M12635_IG11
inc rax
cmp rax, rsi
jb SHORT G_M12635_IG04
;; size=19 bbWeight=4 PerfScore 22.00
G_M12635_IG05:
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M12635_IG06:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M12635_IG07:
movsxd rax, esi
lea rcx, bword ptr [rdi+2*rax]
cmp esi, 16
jg SHORT G_M12635_IG08
vmovups xmm0, xmmword ptr [rdi]
- vpor xmm0, xmm0, xmmword ptr [rcx-0x10]
- vptest xmm0, xmmword ptr [reloc @RWD00]
+ vmovups xmm1, xmmword ptr [rcx-0x10]
+ vpternlogd xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+ vptest xmm0, xmm0
sete dl
movzx rdx, dl
jmp G_M12635_IG17
align [0 bytes for IG10]
- ;; size=41 bbWeight=0.50 PerfScore 8.62
+ ;; size=48 bbWeight=0.50 PerfScore 9.12
G_M12635_IG08:
cmp esi, 32
jg SHORT G_M12635_IG09
vmovups ymm0, ymmword ptr [rdi]
- vpor ymm0, ymm0, ymmword ptr [rcx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vmovups ymm1, ymmword ptr [rcx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vpternlogd ymm0, ymm1, ymm2, -88
+ vptest ymm0, ymm0
sete dl
movzx rdx, dl
jmp G_M12635_IG17
- ;; size=38 bbWeight=0.50 PerfScore 10.75
+ ;; size=45 bbWeight=0.50 PerfScore 12.00
G_M12635_IG09:
cmp esi, 64
jle SHORT G_M12635_IG15
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rdi+0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rdi+0x60]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rdi+0x60]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
jne SHORT G_M12635_IG11
mov rax, rdi
and rax, 31
shr rax, 1
mov rdx, rax
neg rdx
add rdx, 64
movsxd rsi, esi
add rsi, -64
cmp rdx, rsi
jae SHORT G_M12635_IG14
;; size=74 bbWeight=0.50 PerfScore 15.38
G_M12635_IG10:
lea rax, bword ptr [rdi+2*rdx]
vmovups ymm0, ymmword ptr [rax]
- vmovups ymm2, ymmword ptr [rax+0x20]
- vpternlogd ymm0, ymm2, ymmword ptr [rax+0x40], -2
+ vmovups ymm1, ymmword ptr [rax+0x20]
+ vpternlogd ymm0, ymm1, ymmword ptr [rax+0x40], -2
vpor ymm0, ymm0, ymmword ptr [rax+0x60]
- vptest ymm0, ymm1
+ vptest ymm2, ymm0
je SHORT G_M12635_IG13
;; size=33 bbWeight=4 PerfScore 90.00
G_M12635_IG11:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M12635_IG12:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M12635_IG13:
add rdx, 64
cmp rdx, rsi
jb SHORT G_M12635_IG10
;; size=9 bbWeight=4 PerfScore 6.00
G_M12635_IG14:
lea rdi, bword ptr [rdi+2*rsi]
;; size=4 bbWeight=0.50 PerfScore 0.25
G_M12635_IG15:
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rcx-0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rcx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rcx-0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rcx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
sete dl
movzx rdx, dl
jmp SHORT G_M12635_IG17
;; size=43 bbWeight=0.50 PerfScore 14.12
G_M12635_IG16:
mov rdx, qword ptr [rdi]
movsxd rax, esi
or rdx, qword ptr [rdi+2*rax-0x08]
mov rax, 0xD1FFAB1E
test rdx, rax
sete dl
movzx rdx, dl
;; size=30 bbWeight=0.50 PerfScore 3.50
G_M12635_IG17:
movzx rax, dl
;; size=3 bbWeight=0.50 PerfScore 0.12
G_M12635_IG18:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
RWD16 dd 00000000h, 00000000h, 00000000h, 00000000h
RWD32 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 348, prolog size 4, PerfScore 179.00, instruction count 95, allocated bytes for code 348 (MethodHash=fb38cea4) for method System.Text.Ascii:IsValidCore[short](byref,int):ubyte (FullOpts)
+; Total bytes of code 362, prolog size 4, PerfScore 180.75, instruction count 97, allocated bytes for code 362 (MethodHash=fb38cea4) for method System.Text.Ascii:IsValidCore[short](byref,int):ubyte (FullOpts)
14 (3.65 % of base) - System.Text.Ascii:IsValidCore[ubyte](byref,int):ubyte
; Assembly listing for method System.Text.Ascii:IsValidCore[ubyte](byref,int):ubyte (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 3 single block inlinees; 6 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 20, 18 ) byref -> rdi
; V01 arg1 [V01,T04] ( 13, 8 ) int -> rsi single-def
; V02 loc0 [V02,T08] ( 5, 2.50) byref -> rax single-def
;* V03 loc1 [V03 ] ( 0, 0 ) int -> zero-ref
;* V04 loc2 [V04 ] ( 0, 0 ) long -> zero-ref
; V05 loc3 [V05,T03] ( 5, 16.50) long -> rax
;* V06 loc4 [V06 ] ( 0, 0 ) long -> zero-ref
; V07 loc5 [V07,T02] ( 6, 17 ) long -> rcx
;* V08 loc6 [V08 ] ( 0, 0 ) long -> zero-ref
; V09 loc7 [V09,T06] ( 4, 5.50) long -> rsi
; V10 loc8 [V10,T01] ( 5, 20 ) byref -> rdx
;* V11 loc9 [V11 ] ( 0, 0 ) long -> zero-ref
;* V12 loc10 [V12 ] ( 0, 0 ) long -> zero-ref
;* V13 loc11 [V13 ] ( 0, 0 ) long -> zero-ref
;* V14 loc12 [V14 ] ( 0, 0 ) byref -> zero-ref
;# V15 OutArgs [V15 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V16 tmp1 [V16 ] ( 0, 0 ) int -> zero-ref
;* V17 tmp2 [V17 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V18 tmp3 [V18 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V19 tmp4 [V19 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
;* V20 tmp5 [V20 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V21 tmp6 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V22 tmp7 [V22 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V23 tmp8 [V23 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V24 tmp9 [V24 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V25 tmp10 [V25,T12] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+; V25 tmp10 [V25,T14] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V26 tmp11 [V26 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
; V27 tmp12 [V27,T10] ( 2, 16 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V28 tmp13 [V28 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V29 tmp14 [V29,T13] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+; V29 tmp14 [V29,T15] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V30 tmp15 [V30,T05] ( 6, 6 ) int -> rcx "Single return block return value"
; V31 tmp16 [V31,T09] ( 2, 2 ) long -> rcx "Cast away GC"
-; V32 cse0 [V32,T11] ( 7, 7 ) simd32 -> mm1 multi-def "CSE #04: aggressive"
+; V32 cse0 [V32,T11] ( 7, 7 ) simd32 -> mm2 multi-def "CSE #04: aggressive"
; V33 cse1 [V33,T07] ( 3, 5 ) long -> rcx "CSE #01: aggressive"
+; V34 rat0 [V34,T12] ( 3, 3 ) simd16 -> mm0 "ReplaceWithLclVar is creating a new local variable"
+; V35 rat1 [V35,T13] ( 3, 3 ) simd32 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M58774_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M58774_IG02:
cmp esi, 16
jge SHORT G_M58774_IG06
;; size=5 bbWeight=1 PerfScore 1.25
G_M58774_IG03:
movsxd rax, esi
cmp rax, 8
jge G_M58774_IG12
cmp esi, 4
jl G_M58774_IG13
mov eax, dword ptr [rdi]
add esi, -4
movsxd rcx, esi
or eax, dword ptr [rdi+rcx]
test eax, 0xD1FFAB1E
sete cl
movzx rcx, cl
;; size=44 bbWeight=0.50 PerfScore 4.88
G_M58774_IG04:
movzx rax, cl
;; size=3 bbWeight=0.50 PerfScore 0.12
G_M58774_IG05:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M58774_IG06:
movsxd rax, esi
add rax, rdi
cmp esi, 32
jg SHORT G_M58774_IG07
vmovups xmm0, xmmword ptr [rdi]
- vpor xmm0, xmm0, xmmword ptr [rax-0x10]
- vptest xmm0, xmmword ptr [reloc @RWD00]
+ vmovups xmm1, xmmword ptr [rax-0x10]
+ vpternlogd xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+ vptest xmm0, xmm0
sete cl
movzx rcx, cl
jmp SHORT G_M58774_IG04
align [0 bytes for IG09]
- ;; size=37 bbWeight=0.50 PerfScore 8.50
+ ;; size=44 bbWeight=0.50 PerfScore 9.00
G_M58774_IG07:
cmp esi, 64
jg SHORT G_M58774_IG08
vmovups ymm0, ymmword ptr [rdi]
- vpor ymm0, ymm0, ymmword ptr [rax-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vmovups ymm1, ymmword ptr [rax-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vpternlogd ymm0, ymm1, ymm2, -88
+ vptest ymm0, ymm0
sete cl
movzx rcx, cl
jmp SHORT G_M58774_IG04
- ;; size=35 bbWeight=0.50 PerfScore 10.75
+ ;; size=42 bbWeight=0.50 PerfScore 12.00
G_M58774_IG08:
cmp esi, 128
jle SHORT G_M58774_IG11
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rdi+0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rdi+0x60]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rdi+0x60]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
jne G_M58774_IG17
mov rcx, rdi
and rcx, 31
neg rcx
add rcx, 128
movsxd rsi, esi
add rsi, -128
cmp rcx, rsi
jae SHORT G_M58774_IG10
;; size=78 bbWeight=0.50 PerfScore 15.00
G_M58774_IG09:
lea rdx, bword ptr [rdi+rcx]
vmovups ymm0, ymmword ptr [rdx]
- vmovups ymm2, ymmword ptr [rdx+0x20]
- vpternlogd ymm0, ymm2, ymmword ptr [rdx+0x40], -2
+ vmovups ymm1, ymmword ptr [rdx+0x20]
+ vpternlogd ymm0, ymm1, ymmword ptr [rdx+0x40], -2
vpor ymm0, ymm0, ymmword ptr [rdx+0x60]
- vptest ymm0, ymm1
+ vptest ymm2, ymm0
jne G_M58774_IG17
add rcx, 128
cmp rcx, rsi
jb SHORT G_M58774_IG09
;; size=49 bbWeight=4 PerfScore 96.00
G_M58774_IG10:
add rdi, rsi
;; size=3 bbWeight=0.50 PerfScore 0.12
G_M58774_IG11:
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rax-0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rax-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rax-0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rax-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
sete cl
movzx rcx, cl
jmp G_M58774_IG04
align [0 bytes for IG14]
;; size=46 bbWeight=0.50 PerfScore 14.12
G_M58774_IG12:
mov rcx, qword ptr [rdi]
movsxd rsi, esi
or rcx, qword ptr [rdi+rsi-0x08]
mov rdi, 0xD1FFAB1E
test rcx, rdi
sete cl
movzx rcx, cl
jmp G_M58774_IG04
;; size=35 bbWeight=0.50 PerfScore 4.50
G_M58774_IG13:
xor eax, eax
mov ecx, esi
test rcx, rcx
je SHORT G_M58774_IG15
;; size=9 bbWeight=0.50 PerfScore 0.88
G_M58774_IG14:
cmp byte ptr [rdi+rax], 127
ja SHORT G_M58774_IG17
inc rax
cmp rax, rcx
jb SHORT G_M58774_IG14
;; size=14 bbWeight=4 PerfScore 22.00
G_M58774_IG15:
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M58774_IG16:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M58774_IG17:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M58774_IG18:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
RWD00 dq 8080808080808080h, 8080808080808080h
RWD16 dd 00000000h, 00000000h, 00000000h, 00000000h
RWD32 dq 8080808080808080h, 8080808080808080h, 8080808080808080h, 8080808080808080h
-; Total bytes of code 384, prolog size 4, PerfScore 183.38, instruction count 103, allocated bytes for code 384 (MethodHash=d69a1a69) for method System.Text.Ascii:IsValidCore[ubyte](byref,int):ubyte (FullOpts)
+; Total bytes of code 398, prolog size 4, PerfScore 185.12, instruction count 105, allocated bytes for code 398 (MethodHash=d69a1a69) for method System.Text.Ascii:IsValidCore[ubyte](byref,int):ubyte (FullOpts)
12 (3.40 % of base) - System.Text.Ascii:IsValidCore[int](byref,int):ubyte
; Assembly listing for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 2 single block inlinees; 7 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 18, 13 ) byref -> rdi
; V01 arg1 [V01,T03] ( 9, 6 ) int -> rsi single-def
; V02 loc0 [V02,T07] ( 5, 2.50) byref -> rdx single-def
;* V03 loc1 [V03 ] ( 0, 0 ) int -> zero-ref
;* V04 loc2 [V04 ] ( 0, 0 ) long -> zero-ref
;* V05 loc3 [V05,T10] ( 0, 0 ) long -> zero-ref
;* V06 loc4 [V06 ] ( 0, 0 ) long -> zero-ref
; V07 loc5 [V07,T01] ( 6, 17 ) long -> r8
;* V08 loc6 [V08 ] ( 0, 0 ) long -> zero-ref
; V09 loc7 [V09,T04] ( 4, 5.50) long -> rcx
; V10 loc8 [V10,T00] ( 5, 20 ) byref -> rax
;* V11 loc9 [V11 ] ( 0, 0 ) long -> zero-ref
;* V12 loc10 [V12 ] ( 0, 0 ) long -> zero-ref
;* V13 loc11 [V13 ] ( 0, 0 ) long -> zero-ref
;* V14 loc12 [V14 ] ( 0, 0 ) byref -> zero-ref
;# V15 OutArgs [V15 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V16 tmp1 [V16 ] ( 0, 0 ) int -> zero-ref
;* V17 tmp2 [V17 ] ( 0, 0 ) int -> zero-ref ld-addr-op "Inlining Arg"
; V18 tmp3 [V18,T09] ( 2, 1 ) ubyte -> r8 "Inline return value spill temp"
;* V19 tmp4 [V19 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
;* V20 tmp5 [V20 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V21 tmp6 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[int]>
;* V22 tmp7 [V22 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V23 tmp8 [V23 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
;* V24 tmp9 [V24 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V25 tmp10 [V25,T13] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+; V25 tmp10 [V25,T15] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
;* V26 tmp11 [V26 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
; V27 tmp12 [V27,T11] ( 2, 16 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
;* V28 tmp13 [V28 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V29 tmp14 [V29,T14] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+; V29 tmp14 [V29,T16] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
; V30 tmp15 [V30,T05] ( 5, 5 ) int -> r8 "Single return block return value"
; V31 tmp16 [V31,T08] ( 2, 2 ) long -> rax "Cast away GC"
-; V32 cse0 [V32,T12] ( 7, 7 ) simd32 -> mm1 multi-def "CSE #03: aggressive"
+; V32 cse0 [V32,T12] ( 7, 7 ) simd32 -> mm2 multi-def "CSE #03: aggressive"
; V33 cse1 [V33,T06] ( 6, 3 ) long -> rcx multi-def "CSE #01: aggressive"
+; V34 rat0 [V34,T13] ( 3, 3 ) simd16 -> mm0 "ReplaceWithLclVar is creating a new local variable"
+; V35 rat1 [V35,T14] ( 3, 3 ) simd32 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M8346_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M8346_IG02:
cmp esi, 4
jge SHORT G_M8346_IG04
;; size=5 bbWeight=1 PerfScore 1.25
G_M8346_IG03:
movsxd rcx, esi
cmp rcx, 2
jge G_M8346_IG13
test esi, esi
je G_M8346_IG16
jmp G_M8346_IG18
;; size=26 bbWeight=0.50 PerfScore 2.38
G_M8346_IG04:
movsxd rcx, esi
lea rdx, bword ptr [rdi+4*rcx]
cmp esi, 8
jg SHORT G_M8346_IG05
vmovups xmm0, xmmword ptr [rdi]
- vpor xmm0, xmm0, xmmword ptr [rdx-0x10]
- vptest xmm0, xmmword ptr [reloc @RWD00]
+ vmovups xmm1, xmmword ptr [rdx-0x10]
+ vpternlogd xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+ vptest xmm0, xmm0
sete r8b
movzx r8, r8b
jmp G_M8346_IG14
- align [2 bytes for IG07]
- ;; size=45 bbWeight=0.50 PerfScore 8.62
+ align [0 bytes for IG07]
+ ;; size=50 bbWeight=0.50 PerfScore 9.12
G_M8346_IG05:
cmp esi, 16
jg SHORT G_M8346_IG06
vmovups ymm0, ymmword ptr [rdi]
- vpor ymm0, ymm0, ymmword ptr [rdx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vmovups ymm1, ymmword ptr [rdx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vpternlogd ymm0, ymm1, ymm2, -88
+ vptest ymm0, ymm0
sete r8b
movzx r8, r8b
jmp G_M8346_IG14
- ;; size=40 bbWeight=0.50 PerfScore 10.75
+ ;; size=47 bbWeight=0.50 PerfScore 12.00
G_M8346_IG06:
cmp esi, 32
jle SHORT G_M8346_IG12
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rdi+0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rdi+0x60]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rdi+0x60]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
jne SHORT G_M8346_IG08
mov rax, rdi
and rax, 31
shr rax, 2
mov r8, rax
neg r8
add r8, 32
add rcx, -32
cmp r8, rcx
jae SHORT G_M8346_IG11
;; size=72 bbWeight=0.50 PerfScore 15.25
G_M8346_IG07:
lea rax, bword ptr [rdi+4*r8]
vmovups ymm0, ymmword ptr [rax]
- vmovups ymm2, ymmword ptr [rax+0x20]
- vpternlogd ymm0, ymm2, ymmword ptr [rax+0x40], -2
+ vmovups ymm1, ymmword ptr [rax+0x20]
+ vpternlogd ymm0, ymm1, ymmword ptr [rax+0x40], -2
vpor ymm0, ymm0, ymmword ptr [rax+0x60]
- vptest ymm0, ymm1
+ vptest ymm2, ymm0
je SHORT G_M8346_IG10
;; size=33 bbWeight=4 PerfScore 90.00
G_M8346_IG08:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M8346_IG09:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M8346_IG10:
add r8, 32
cmp r8, rcx
jb SHORT G_M8346_IG07
;; size=9 bbWeight=4 PerfScore 6.00
G_M8346_IG11:
lea rdi, bword ptr [rdi+4*rcx]
;; size=4 bbWeight=0.50 PerfScore 0.25
G_M8346_IG12:
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rdx-0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rdx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rdx-0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rdx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
sete r8b
movzx r8, r8b
jmp SHORT G_M8346_IG14
;; size=45 bbWeight=0.50 PerfScore 14.12
G_M8346_IG13:
mov r8, qword ptr [rdi]
or r8, qword ptr [rdi+4*rcx-0x08]
mov rax, 0xD1FFAB1E
test r8, rax
sete r8b
movzx r8, r8b
;; size=29 bbWeight=0.50 PerfScore 3.38
G_M8346_IG14:
movzx rax, r8b
;; size=4 bbWeight=0.50 PerfScore 0.12
G_M8346_IG15:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M8346_IG16:
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M8346_IG17:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M8346_IG18:
cmp dword ptr [rdi], edi
mov rax, 0xD1FFAB1E ; code for System.ThrowHelper:ThrowNotSupportedException()
call [rax]System.ThrowHelper:ThrowNotSupportedException()
int3
;; size=15 bbWeight=0 PerfScore 0.00
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
RWD16 dd 00000000h, 00000000h, 00000000h, 00000000h
RWD32 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 353, prolog size 4, PerfScore 157.38, instruction count 90, allocated bytes for code 353 (MethodHash=10a8df65) for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
+; Total bytes of code 365, prolog size 4, PerfScore 159.12, instruction count 92, allocated bytes for code 365 (MethodHash=10a8df65) for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
Larger list of diffs: https://gist.github.com/MihuBot/6be1c4e16f21c742c3995e6ba8009dec
Top method improvements
-4 (-15.38 % of base) - System.Text.Ascii:VectorContainsNonAsciiChar(System.Runtime.Intrinsics.Vector128`1[ushort]):ubyte
; Assembly listing for method System.Text.Ascii:VectorContainsNonAsciiChar(System.Runtime.Intrinsics.Vector128`1[ushort]):ubyte (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
-; rbp based frame
+; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
-; V00 arg0 [V00,T00] ( 1, 1 ) simd16 -> [rbp+0x10] single-def <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V00 arg0 [V00,T00] ( 1, 1 ) simd16 -> [rsp+0x08] single-def <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V01 loc0 [V01 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V02 loc1 [V02 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+;# V02 OutArgs [V02 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M20248_IG01:
- push rbp
- mov rbp, rsp
- ;; size=4 bbWeight=1 PerfScore 1.25
+ ;; size=0 bbWeight=1 PerfScore 0.00
G_M20248_IG02:
- vmovaps xmm0, xmmword ptr [rbp+0x10]
+ vmovaps xmm0, xmmword ptr [rsp+0x08]
vptest xmm0, xmmword ptr [reloc @RWD00]
setne al
movzx rax, al
- ;; size=20 bbWeight=1 PerfScore 9.25
+ ;; size=21 bbWeight=1 PerfScore 9.25
G_M20248_IG03:
- pop rbp
ret
- ;; size=2 bbWeight=1 PerfScore 1.50
+ ;; size=1 bbWeight=1 PerfScore 1.00
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 26, prolog size 4, PerfScore 12.00, instruction count 8, allocated bytes for code 26 (MethodHash=17a9b0e7) for method System.Text.Ascii:VectorContainsNonAsciiChar(System.Runtime.Intrinsics.Vector128`1[ushort]):ubyte (FullOpts)
+; Total bytes of code 22, prolog size 0, PerfScore 10.25, instruction count 5, allocated bytes for code 22 (MethodHash=17a9b0e7) for method System.Text.Ascii:VectorContainsNonAsciiChar(System.Runtime.Intrinsics.Vector128`1[ushort]):ubyte (FullOpts)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Job completed in 14 minutes.
dotnet/runtime#104488
Diffs
Artifacts:
The text was updated successfully, but these errors were encountered: