forked from MihaZupan/runtime-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[JitDiff X64] xtqqczze/dotnet-runtime/IsAsciiTVector #512
Comments
Top method regressions

36 (10.68 % of base) - System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong
; Assembly listing for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 8 single block inlinees; 5 inlinees without PGO data
+; 0 inlinees with PGO data; 18 single block inlinees; 25 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 32, 74 ) long -> rdi
; V01 arg1 [V01,T01] ( 16, 20.50) long -> rsi
; V02 loc0 [V02,T04] ( 12, 7 ) long -> rax
; V03 loc1 [V03,T02] ( 7, 10.50) int -> rcx
; V04 loc2 [V04,T05] ( 2, 4.50) long -> rcx
; V05 loc3 [V05,T06] ( 2, 4.50) long -> rcx
; V06 loc4 [V06,T07] ( 2, 4.50) long -> rcx
; V07 loc5 [V07,T03] ( 3, 8.50) int -> rdx
;# V08 OutArgs [V08 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V09 tmp1 [V09 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
-;* V10 tmp2 [V10 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
-;* V11 tmp3 [V11 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
-;* V12 tmp4 [V12 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
-;* V13 tmp5 [V13 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V14 tmp6 [V14 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V09 tmp1 [V09,T21] ( 2, 2 ) simd16 -> mm0 "spilled call-like call argument"
+; V10 tmp2 [V10,T22] ( 2, 2 ) simd32 -> mm0 "spilled call-like call argument"
+; V11 tmp3 [V11,T23] ( 2, 2 ) simd64 -> mm0 "spilled call-like call argument"
+;* V12 tmp4 [V12 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
+;* V13 tmp5 [V13 ] ( 0, 0 ) simd64 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V14 tmp6 [V14,T12] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
;* V15 tmp7 [V15 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V16 tmp8 [V16 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V17 tmp9 [V17 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V18 tmp10 [V18 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V19 tmp11 [V19 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V20 tmp12 [V20 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V21 tmp13 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V22 tmp14 [V22 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V23 tmp15 [V23 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
-;* V24 tmp16 [V24,T08] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V25 cse0 [V25,T09] ( 3, 5 ) simd64 -> mm0 "CSE #01: aggressive"
-; V26 cse1 [V26,T10] ( 3, 5 ) simd32 -> mm0 "CSE #04: aggressive"
-; V27 cse2 [V27,T11] ( 3, 5 ) simd16 -> mm0 "CSE #05: aggressive"
+;* V16 tmp8 [V16 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V17 tmp9 [V17 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V18 tmp10 [V18 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V19 tmp11 [V19 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V20 tmp12 [V20,T15] ( 2, 16 ) simd64 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V21 tmp13 [V21 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
+;* V22 tmp14 [V22 ] ( 0, 0 ) simd64 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V23 tmp15 [V23,T08] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V24 tmp16 [V24 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V25 tmp17 [V25 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V26 tmp18 [V26 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V27 tmp19 [V27 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V28 tmp20 [V28 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V29 tmp21 [V29 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V30 tmp22 [V30 ] ( 0, 0 ) simd32 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V31 tmp23 [V31,T13] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V32 tmp24 [V32 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V33 tmp25 [V33 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V34 tmp26 [V34 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V35 tmp27 [V35 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V36 tmp28 [V36 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V37 tmp29 [V37,T16] ( 2, 16 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V38 tmp30 [V38 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V39 tmp31 [V39 ] ( 0, 0 ) simd32 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V40 tmp32 [V40,T09] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V41 tmp33 [V41 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V42 tmp34 [V42 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V43 tmp35 [V43 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V44 tmp36 [V44 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V45 tmp37 [V45 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V46 tmp38 [V46 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+;* V47 tmp39 [V47 ] ( 0, 0 ) simd16 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V48 tmp40 [V48,T14] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V49 tmp41 [V49 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V50 tmp42 [V50 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V51 tmp43 [V51 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V52 tmp44 [V52 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V53 tmp45 [V53 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V54 tmp46 [V54,T17] ( 2, 16 ) simd16 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V55 tmp47 [V55 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+;* V56 tmp48 [V56 ] ( 0, 0 ) simd16 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V57 tmp49 [V57,T10] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V58 tmp50 [V58 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V59 tmp51 [V59 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V60 tmp52 [V60 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V61 tmp53 [V61 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V62 tmp54 [V62 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V63 tmp55 [V63 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
+;* V64 tmp56 [V64,T11] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V65 cse0 [V65,T18] ( 3, 5 ) simd64 -> mm1 "CSE #01: moderate"
+; V66 cse1 [V66,T19] ( 3, 5 ) simd32 -> mm1 "CSE #04: moderate"
+; V67 cse2 [V67,T20] ( 3, 5 ) simd16 -> mm1 "CSE #05: moderate"
;
; Lcl frame size = 0
G_M42618_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M42618_IG02:
mov rax, rdi
cmp rsi, 64
jb SHORT G_M42618_IG05
;; size=9 bbWeight=1 PerfScore 1.50
G_M42618_IG03:
- vmovups zmm0, zmmword ptr [reloc @RWD00]
- vptestmw k1, zmm0, zmmword ptr [rax]
+ vmovups zmm0, zmmword ptr [rax]
+ vmovups zmm1, zmmword ptr [reloc @RWD00]
+ vptestmw k1, zmm1, zmm0
kortestd k1, k1
;; NOP compensation instructions of 3 bytes.
jne G_M42618_IG10
lea rcx, [rax+2*rsi-0x40]
lea rdi, [rax+0x40]
and rdi, -64
- align [0 bytes for IG04]
- ;; size=43 bbWeight=0.50 PerfScore 6.38
+ align [2 bytes for IG04]
+ ;; size=51 bbWeight=0.50 PerfScore 7.00
G_M42618_IG04:
- vptestmw k1, zmm0, zmmword ptr [rdi]
+ vmovdqa32 zmm0, zmmword ptr [rdi]
+ vptestmw k1, zmm1, zmm0
kortestd k1, k1
;; NOP compensation instructions of 3 bytes.
jne G_M42618_IG09
add rdi, 64
cmp rdi, rcx
jbe SHORT G_M42618_IG04
- jmp SHORT G_M42618_IG09
- ;; NOP compensation instructions of 3 bytes.
- ;; size=34 bbWeight=4 PerfScore 46.00
+ jmp G_M42618_IG09
+ ;; size=40 bbWeight=4 PerfScore 50.00
G_M42618_IG05:
cmp rsi, 32
jb SHORT G_M42618_IG07
- vmovups ymm0, ymmword ptr [reloc @RWD00]
- vptest ymm0, ymmword ptr [rax]
+ vmovups ymm0, ymmword ptr [rax]
+ vmovups ymm1, ymmword ptr [reloc @RWD00]
+ vptest ymm1, ymm0
jne SHORT G_M42618_IG10
+ ;; NOP compensation instructions of 4 bytes.
lea rcx, [rax+2*rsi-0x20]
lea rdi, [rax+0x20]
and rdi, -32
- align [4 bytes for IG06]
- ;; size=38 bbWeight=0.50 PerfScore 8.12
+ align [14 bytes for IG06]
+ ;; size=56 bbWeight=0.50 PerfScore 9.12
G_M42618_IG06:
- vptest ymm0, ymmword ptr [rdi]
+ vmovdqa ymm0, ymmword ptr [rdi]
+ vptest ymm1, ymm0
jne SHORT G_M42618_IG09
add rdi, 32
cmp rdi, rcx
jbe SHORT G_M42618_IG06
jmp SHORT G_M42618_IG09
- ;; size=18 bbWeight=4 PerfScore 50.00
+ ;; size=22 bbWeight=4 PerfScore 58.00
G_M42618_IG07:
cmp rsi, 16
jb SHORT G_M42618_IG10
- vmovups xmm0, xmmword ptr [reloc @RWD00]
- vptest xmm0, xmmword ptr [rax]
+ vmovups xmm0, xmmword ptr [rax]
+ vmovups xmm1, xmmword ptr [reloc @RWD00]
+ vptest xmm1, xmm0
jne SHORT G_M42618_IG10
lea rcx, [rax+2*rsi-0x10]
lea rdi, [rax+0x10]
and rdi, -16
- align [12 bytes for IG08]
- ;; size=46 bbWeight=0.50 PerfScore 6.62
+ align [4 bytes for IG08]
+ ;; size=42 bbWeight=0.50 PerfScore 7.12
G_M42618_IG08:
- vptest xmm0, xmmword ptr [rdi]
+ vmovdqa xmm0, xmmword ptr [rdi]
+ vptest xmm1, xmm0
jne SHORT G_M42618_IG09
add rdi, 16
cmp rdi, rcx
jbe SHORT G_M42618_IG08
- ;; size=16 bbWeight=4 PerfScore 34.00
+ ;; size=20 bbWeight=4 PerfScore 38.00
G_M42618_IG09:
mov rcx, rdi
sub rcx, rax
shr rcx, 1
sub rsi, rcx
;; size=12 bbWeight=0.50 PerfScore 0.62
G_M42618_IG10:
cmp rsi, 4
jb SHORT G_M42618_IG15
align [0 bytes for IG11]
;; size=6 bbWeight=1 PerfScore 1.25
G_M42618_IG11:
mov ecx, dword ptr [rdi]
mov edx, dword ptr [rdi+0x04]
mov r8d, ecx
or r8d, edx
test r8d, 0xD1FFAB1E
je SHORT G_M42618_IG14
;; size=20 bbWeight=4 PerfScore 23.00
G_M42618_IG12:
test ecx, 0xD1FFAB1E
jne SHORT G_M42618_IG13
mov ecx, edx
add rdi, 4
;; size=14 bbWeight=0.50 PerfScore 0.88
G_M42618_IG13:
test ecx, 0xFF80
jne SHORT G_M42618_IG18
jmp SHORT G_M42618_IG17
;; size=10 bbWeight=0.50 PerfScore 1.62
G_M42618_IG14:
add rdi, 8
add rsi, -4
cmp rsi, 4
jae SHORT G_M42618_IG11
;; size=14 bbWeight=4 PerfScore 7.00
G_M42618_IG15:
test sil, 2
je SHORT G_M42618_IG16
mov ecx, dword ptr [rdi]
test ecx, 0xD1FFAB1E
jne SHORT G_M42618_IG13
add rdi, 4
;; size=20 bbWeight=0.50 PerfScore 2.38
G_M42618_IG16:
test sil, 1
je SHORT G_M42618_IG18
cmp word ptr [rdi], 127
ja SHORT G_M42618_IG18
;; size=12 bbWeight=0.50 PerfScore 2.62
G_M42618_IG17:
add rdi, 2
;; size=4 bbWeight=0.50 PerfScore 0.12
G_M42618_IG18:
mov rcx, rdi
sub rcx, rax
mov rax, rcx
shr rax, 1
;; size=12 bbWeight=1 PerfScore 1.25
G_M42618_IG19:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=1 PerfScore 2.50
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 337, prolog size 4, PerfScore 197.12, instruction count 91, allocated bytes for code 337 (MethodHash=bc9a5985) for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 373, prolog size 4, PerfScore 215.25, instruction count 97, allocated bytes for code 373 (MethodHash=bc9a5985) for method System.Text.Ascii:GetIndexOfFirstNonAsciiChar_Vector(ulong,ulong):ulong (FullOpts)

12 (3.40 % of base) - System.Text.Ascii:IsValidCore[int](byref,int):ubyte
; Assembly listing for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 2 single block inlinees; 7 inlinees without PGO data
+; 0 inlinees with PGO data; 12 single block inlinees; 12 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 18, 13 ) byref -> rdi
; V01 arg1 [V01,T03] ( 9, 6 ) int -> rsi single-def
; V02 loc0 [V02,T07] ( 5, 2.50) byref -> rdx single-def
;* V03 loc1 [V03 ] ( 0, 0 ) int -> zero-ref
;* V04 loc2 [V04 ] ( 0, 0 ) long -> zero-ref
;* V05 loc3 [V05,T10] ( 0, 0 ) long -> zero-ref
;* V06 loc4 [V06 ] ( 0, 0 ) long -> zero-ref
; V07 loc5 [V07,T01] ( 6, 17 ) long -> r8
;* V08 loc6 [V08 ] ( 0, 0 ) long -> zero-ref
; V09 loc7 [V09,T04] ( 4, 5.50) long -> rcx
; V10 loc8 [V10,T00] ( 5, 20 ) byref -> rax
;* V11 loc9 [V11 ] ( 0, 0 ) long -> zero-ref
;* V12 loc10 [V12 ] ( 0, 0 ) long -> zero-ref
;* V13 loc11 [V13 ] ( 0, 0 ) long -> zero-ref
;* V14 loc12 [V14 ] ( 0, 0 ) byref -> zero-ref
;# V15 OutArgs [V15 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V16 tmp1 [V16 ] ( 0, 0 ) int -> zero-ref
;* V17 tmp2 [V17 ] ( 0, 0 ) int -> zero-ref ld-addr-op "Inlining Arg"
; V18 tmp3 [V18,T09] ( 2, 1 ) ubyte -> r8 "Inline return value spill temp"
;* V19 tmp4 [V19 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
-;* V20 tmp5 [V20 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V21 tmp6 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[int]>
-;* V22 tmp7 [V22 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V23 tmp8 [V23 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
-;* V24 tmp9 [V24 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V25 tmp10 [V25,T13] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
-;* V26 tmp11 [V26 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V27 tmp12 [V27,T11] ( 2, 16 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
-;* V28 tmp13 [V28 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V29 tmp14 [V29,T14] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
-; V30 tmp15 [V30,T05] ( 5, 5 ) int -> r8 "Single return block return value"
-; V31 tmp16 [V31,T08] ( 2, 2 ) long -> rax "Cast away GC"
-; V32 cse0 [V32,T12] ( 7, 7 ) simd32 -> mm1 multi-def "CSE #03: aggressive"
-; V33 cse1 [V33,T06] ( 6, 3 ) long -> rcx multi-def "CSE #01: aggressive"
+;* V20 tmp5 [V20 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[int]>
+;* V21 tmp6 [V21 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+;* V22 tmp7 [V22 ] ( 0, 0 ) simd16 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[int]>
+;* V23 tmp8 [V23 ] ( 0, 0 ) int -> zero-ref ld-addr-op "Inline stloc first use temp"
+;* V24 tmp9 [V24 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
+;* V25 tmp10 [V25 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V26 tmp11 [V26 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V27 tmp12 [V27 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V28 tmp13 [V28 ] ( 0, 0 ) simd32 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V29 tmp14 [V29 ] ( 0, 0 ) int -> zero-ref ld-addr-op "Inline stloc first use temp"
+;* V30 tmp15 [V30 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
+;* V31 tmp16 [V31 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V32 tmp17 [V32,T15] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V33 tmp18 [V33 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V34 tmp19 [V34 ] ( 0, 0 ) simd32 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V35 tmp20 [V35 ] ( 0, 0 ) int -> zero-ref ld-addr-op "Inline stloc first use temp"
+;* V36 tmp21 [V36 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
+;* V37 tmp22 [V37 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V38 tmp23 [V38,T11] ( 2, 16 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V39 tmp24 [V39 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V40 tmp25 [V40 ] ( 0, 0 ) simd32 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V41 tmp26 [V41 ] ( 0, 0 ) int -> zero-ref ld-addr-op "Inline stloc first use temp"
+;* V42 tmp27 [V42 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
+;* V43 tmp28 [V43 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V44 tmp29 [V44,T16] ( 2, 2 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V45 tmp30 [V45 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V46 tmp31 [V46 ] ( 0, 0 ) simd32 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[int]>
+;* V47 tmp32 [V47 ] ( 0, 0 ) int -> zero-ref ld-addr-op "Inline stloc first use temp"
+;* V48 tmp33 [V48 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
+;* V49 tmp34 [V49 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V50 tmp35 [V50,T05] ( 5, 5 ) int -> r8 "Single return block return value"
+; V51 tmp36 [V51,T08] ( 2, 2 ) long -> rax "Cast away GC"
+; V52 cse0 [V52,T12] ( 7, 7 ) simd32 -> mm2 multi-def "CSE #03: aggressive"
+; V53 cse1 [V53,T06] ( 6, 3 ) long -> rcx multi-def "CSE #01: aggressive"
+; V54 rat0 [V54,T13] ( 3, 3 ) simd16 -> mm0 "ReplaceWithLclVar is creating a new local variable"
+; V55 rat1 [V55,T14] ( 3, 3 ) simd32 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M8346_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M8346_IG02:
cmp esi, 4
jge SHORT G_M8346_IG04
;; size=5 bbWeight=1 PerfScore 1.25
G_M8346_IG03:
movsxd rcx, esi
cmp rcx, 2
jge G_M8346_IG13
test esi, esi
je G_M8346_IG16
jmp G_M8346_IG18
;; size=26 bbWeight=0.50 PerfScore 2.38
G_M8346_IG04:
movsxd rcx, esi
lea rdx, bword ptr [rdi+4*rcx]
cmp esi, 8
jg SHORT G_M8346_IG05
vmovups xmm0, xmmword ptr [rdi]
- vpor xmm0, xmm0, xmmword ptr [rdx-0x10]
- vptest xmm0, xmmword ptr [reloc @RWD00]
+ vmovups xmm1, xmmword ptr [rdx-0x10]
+ vpternlogd xmm0, xmm1, xmmword ptr [reloc @RWD00], -88
+ vptest xmm0, xmm0
sete r8b
movzx r8, r8b
jmp G_M8346_IG14
- align [2 bytes for IG07]
- ;; size=45 bbWeight=0.50 PerfScore 8.62
+ align [0 bytes for IG07]
+ ;; size=50 bbWeight=0.50 PerfScore 9.12
G_M8346_IG05:
cmp esi, 16
jg SHORT G_M8346_IG06
vmovups ymm0, ymmword ptr [rdi]
- vpor ymm0, ymm0, ymmword ptr [rdx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vmovups ymm1, ymmword ptr [rdx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vpternlogd ymm0, ymm1, ymm2, -88
+ vptest ymm0, ymm0
sete r8b
movzx r8, r8b
jmp G_M8346_IG14
- ;; size=40 bbWeight=0.50 PerfScore 10.75
+ ;; size=47 bbWeight=0.50 PerfScore 12.00
G_M8346_IG06:
cmp esi, 32
jle SHORT G_M8346_IG12
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rdi+0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rdi+0x60]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rdi+0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rdi+0x60]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
jne SHORT G_M8346_IG08
mov rax, rdi
and rax, 31
shr rax, 2
mov r8, rax
neg r8
add r8, 32
add rcx, -32
cmp r8, rcx
jae SHORT G_M8346_IG11
;; size=72 bbWeight=0.50 PerfScore 15.25
G_M8346_IG07:
lea rax, bword ptr [rdi+4*r8]
vmovups ymm0, ymmword ptr [rax]
- vmovups ymm2, ymmword ptr [rax+0x20]
- vpternlogd ymm0, ymm2, ymmword ptr [rax+0x40], -2
+ vmovups ymm1, ymmword ptr [rax+0x20]
+ vpternlogd ymm0, ymm1, ymmword ptr [rax+0x40], -2
vpor ymm0, ymm0, ymmword ptr [rax+0x60]
- vptest ymm0, ymm1
+ vptest ymm2, ymm0
je SHORT G_M8346_IG10
;; size=33 bbWeight=4 PerfScore 90.00
G_M8346_IG08:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M8346_IG09:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M8346_IG10:
add r8, 32
cmp r8, rcx
jb SHORT G_M8346_IG07
;; size=9 bbWeight=4 PerfScore 6.00
G_M8346_IG11:
lea rdi, bword ptr [rdi+4*rcx]
;; size=4 bbWeight=0.50 PerfScore 0.25
G_M8346_IG12:
- vmovups ymm1, ymmword ptr [rdi]
+ vmovups ymm2, ymmword ptr [rdi]
vmovups ymm0, ymmword ptr [rdi+0x20]
- vpternlogd ymm1, ymm0, ymmword ptr [rdx-0x40], -2
- vpor ymm0, ymm1, ymmword ptr [rdx-0x20]
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vptest ymm0, ymm1
+ vpternlogd ymm2, ymm0, ymmword ptr [rdx-0x40], -2
+ vpor ymm0, ymm2, ymmword ptr [rdx-0x20]
+ vmovups ymm2, ymmword ptr [reloc @RWD32]
+ vptest ymm2, ymm0
sete r8b
movzx r8, r8b
jmp SHORT G_M8346_IG14
;; size=45 bbWeight=0.50 PerfScore 14.12
G_M8346_IG13:
mov r8, qword ptr [rdi]
or r8, qword ptr [rdi+4*rcx-0x08]
mov rax, 0xD1FFAB1E
test r8, rax
sete r8b
movzx r8, r8b
;; size=29 bbWeight=0.50 PerfScore 3.38
G_M8346_IG14:
movzx rax, r8b
;; size=4 bbWeight=0.50 PerfScore 0.12
G_M8346_IG15:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M8346_IG16:
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M8346_IG17:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M8346_IG18:
cmp dword ptr [rdi], edi
mov rax, 0xD1FFAB1E ; code for System.ThrowHelper:ThrowNotSupportedException()
call [rax]System.ThrowHelper:ThrowNotSupportedException()
int3
;; size=15 bbWeight=0 PerfScore 0.00
-RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
+RWD00 dq FFFFFF80FFFFFF80h, FFFFFF80FFFFFF80h
RWD16 dd 00000000h, 00000000h, 00000000h, 00000000h
-RWD32 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
+RWD32 dq FFFFFF80FFFFFF80h, FFFFFF80FFFFFF80h, FFFFFF80FFFFFF80h, FFFFFF80FFFFFF80h
-; Total bytes of code 353, prolog size 4, PerfScore 157.38, instruction count 90, allocated bytes for code 353 (MethodHash=10a8df65) for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)
+; Total bytes of code 365, prolog size 4, PerfScore 159.12, instruction count 92, allocated bytes for code 365 (MethodHash=10a8df65) for method System.Text.Ascii:IsValidCore[int](byref,int):ubyte (FullOpts)

11 (2.96 % of base) - System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong
; Assembly listing for method System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 8 single block inlinees; 2 inlinees without PGO data
+; 0 inlinees with PGO data; 12 single block inlinees; 8 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 38, 77 ) long -> rdi
; V01 arg1 [V01,T01] ( 17, 21 ) long -> rsi
; V02 loc0 [V02,T04] ( 12, 7 ) long -> rax
; V03 loc1 [V03,T02] ( 9, 11.50) int -> rcx
; V04 loc2 [V04,T05] ( 2, 4.50) long -> rcx
; V05 loc3 [V05,T06] ( 2, 4.50) long -> rcx
; V06 loc4 [V06,T07] ( 2, 4.50) long -> rcx
; V07 loc5 [V07,T03] ( 3, 8.50) int -> rdx
;# V08 OutArgs [V08 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V09 tmp1 [V09 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+; V09 tmp1 [V09,T12] ( 2, 2 ) simd16 -> mm0 "spilled call-like call argument"
;* V10 tmp2 [V10 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V11 tmp3 [V11 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
-;* V12 tmp4 [V12 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V13 tmp5 [V13 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V14 tmp6 [V14 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V15 tmp7 [V15 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
-; V16 cse0 [V16,T08] ( 3, 5 ) simd16 -> mm0 "CSE #02: aggressive"
+;* V12 tmp4 [V12 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+;* V13 tmp5 [V13 ] ( 0, 0 ) simd16 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V14 tmp6 [V14,T09] ( 0, 0 ) ubyte -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V15 tmp7 [V15 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V16 tmp8 [V16 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V17 tmp9 [V17 ] ( 0, 0 ) ubyte -> zero-ref "Inline stloc first use temp"
+;* V18 tmp10 [V18 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
+;* V19 tmp11 [V19 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V20 tmp12 [V20,T10] ( 2, 16 ) simd16 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V21 tmp13 [V21 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+;* V22 tmp14 [V22 ] ( 0, 0 ) simd16 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+;* V23 tmp15 [V23,T08] ( 0, 0 ) ubyte -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V24 tmp16 [V24 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V25 tmp17 [V25 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V26 tmp18 [V26 ] ( 0, 0 ) ubyte -> zero-ref "Inline stloc first use temp"
+;* V27 tmp19 [V27 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
+;* V28 tmp20 [V28 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V29 tmp21 [V29 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
+; V30 cse0 [V30,T11] ( 3, 5 ) simd16 -> mm1 "CSE #02: moderate"
;
; Lcl frame size = 0
G_M50024_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M50024_IG02:
mov rax, rdi
cmp rsi, 128
jb SHORT G_M50024_IG05
;; size=12 bbWeight=1 PerfScore 1.50
G_M50024_IG03:
vmovups zmm0, zmmword ptr [rax]
vpmovb2m k1, zmm0
kmovq rcx, k1
;; NOP compensation instructions of 3 bytes.
test rcx, rcx
jne G_M50024_IG10
lea rcx, [rax+rsi-0x40]
lea rdi, [rax+0x40]
and rdi, -64
align [6 bytes for IG04]
;; size=48 bbWeight=0.50 PerfScore 5.62
G_M50024_IG04:
vmovdqa32 zmm0, zmmword ptr [rdi]
vpmovb2m k1, zmm0
kmovq rdx, k1
;; NOP compensation instructions of 3 bytes.
test rdx, rdx
jne G_M50024_IG09
add rdi, 64
cmp rdi, rcx
jbe SHORT G_M50024_IG04
jmp SHORT G_M50024_IG09
;; NOP compensation instructions of 3 bytes.
;; size=43 bbWeight=4 PerfScore 51.00
G_M50024_IG05:
cmp rsi, 64
jb SHORT G_M50024_IG07
vmovups ymm0, ymmword ptr [rax]
vpmovmskb ecx, ymm0
test ecx, ecx
jne SHORT G_M50024_IG10
+ ;; NOP compensation instructions of 4 bytes.
lea rcx, [rax+rsi-0x20]
lea rdi, [rax+0x20]
and rdi, -32
- align [6 bytes for IG06]
+ align [2 bytes for IG06]
;; size=37 bbWeight=0.50 PerfScore 6.25
G_M50024_IG06:
vmovdqa ymm0, ymmword ptr [rdi]
vpmovmskb edx, ymm0
test edx, edx
jne SHORT G_M50024_IG09
add rdi, 32
cmp rdi, rcx
jbe SHORT G_M50024_IG06
jmp SHORT G_M50024_IG09
;; size=23 bbWeight=4 PerfScore 51.00
G_M50024_IG07:
cmp rsi, 32
jb SHORT G_M50024_IG10
- vmovups xmm0, xmmword ptr [reloc @RWD00]
- vptest xmm0, xmmword ptr [rax]
+ vmovups xmm0, xmmword ptr [rax]
+ vmovups xmm1, xmmword ptr [reloc @RWD00]
+ vptest xmm1, xmm0
jne SHORT G_M50024_IG10
lea rcx, [rax+rsi-0x10]
lea rdi, [rax+0x10]
and rdi, -16
- align [0 bytes for IG08]
- ;; size=34 bbWeight=0.50 PerfScore 6.50
+ align [3 bytes for IG08]
+ ;; size=41 bbWeight=0.50 PerfScore 7.12
G_M50024_IG08:
- vptest xmm0, xmmword ptr [rdi]
+ vmovdqa xmm0, xmmword ptr [rdi]
+ vptest xmm1, xmm0
jne SHORT G_M50024_IG09
add rdi, 16
cmp rdi, rcx
jbe SHORT G_M50024_IG08
- ;; size=16 bbWeight=4 PerfScore 34.00
+ ;; size=20 bbWeight=4 PerfScore 38.00
G_M50024_IG09:
sub rsi, rdi
add rsi, rax
;; size=6 bbWeight=0.50 PerfScore 0.25
G_M50024_IG10:
cmp rsi, 8
jb SHORT G_M50024_IG15
align [0 bytes for IG11]
;; size=6 bbWeight=1 PerfScore 1.25
G_M50024_IG11:
mov ecx, dword ptr [rdi]
mov edx, dword ptr [rdi+0x04]
mov r8d, ecx
or r8d, edx
test r8d, 0xD1FFAB1E
je SHORT G_M50024_IG14
;; size=20 bbWeight=4 PerfScore 23.00
G_M50024_IG12:
test ecx, 0xD1FFAB1E
jne SHORT G_M50024_IG13
mov ecx, edx
add rdi, 4
;; size=14 bbWeight=0.50 PerfScore 0.88
G_M50024_IG13:
and ecx, 0xD1FFAB1E
xor esi, esi
tzcnt esi, ecx
shr esi, 3
mov ecx, esi
add rdi, rcx
jmp SHORT G_M50024_IG18
;; size=22 bbWeight=0.50 PerfScore 2.75
G_M50024_IG14:
add rdi, 8
add rsi, -8
cmp rsi, 8
jae SHORT G_M50024_IG11
;; size=14 bbWeight=4 PerfScore 7.00
G_M50024_IG15:
test sil, 4
je SHORT G_M50024_IG16
mov ecx, dword ptr [rdi]
test ecx, 0xD1FFAB1E
jne SHORT G_M50024_IG13
add rdi, 4
;; size=20 bbWeight=0.50 PerfScore 2.38
G_M50024_IG16:
test sil, 2
je SHORT G_M50024_IG17
movzx rcx, word ptr [rdi]
test ecx, 0xD1FFAB1E
jne SHORT G_M50024_IG13
add rdi, 2
;; size=21 bbWeight=0.50 PerfScore 2.38
G_M50024_IG17:
test sil, 1
je SHORT G_M50024_IG18
lea rcx, [rdi+0x01]
cmp byte ptr [rdi], 0
cmovge rdi, rcx
;; size=17 bbWeight=0.50 PerfScore 2.50
G_M50024_IG18:
mov rcx, rdi
sub rcx, rax
mov rax, rcx
;; size=9 bbWeight=1 PerfScore 0.75
G_M50024_IG19:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=1 PerfScore 2.50
RWD00 dq 8080808080808080h, 8080808080808080h
-; Total bytes of code 371, prolog size 4, PerfScore 202.75, instruction count 104, allocated bytes for code 371 (MethodHash=58923c97) for method System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 382, prolog size 4, PerfScore 207.38, instruction count 106, allocated bytes for code 382 (MethodHash=58923c97) for method System.Text.Ascii:GetIndexOfFirstNonAsciiByte_Vector(ulong,ulong):ulong (FullOpts)

9 (12.68 % of base) - System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this
; Assembly listing for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
-; 0 inlinees with PGO data; 0 single block inlinees; 1 inlinees without PGO data
+; 0 inlinees with PGO data; 2 single block inlinees; 4 inlinees without PGO data
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) byref -> zero-ref this single-def
; V01 arg1 [V01,T00] ( 4, 4 ) long -> rsi single-def
;* V02 arg2 [V02 ] ( 0, 0 ) long -> zero-ref single-def
;* V03 arg3 [V03 ] ( 0, 0 ) int -> zero-ref single-def
; V04 arg4 [V04,T01] ( 4, 3 ) byref -> r8 single-def
-; V05 loc0 [V05,T02] ( 3, 2.50) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ushort]>
-; V06 loc1 [V06,T03] ( 3, 2.50) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V05 loc0 [V05,T04] ( 3, 2.50) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V06 loc1 [V06,T05] ( 3, 2.50) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[ushort]>
;# V07 OutArgs [V07 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V08 tmp1 [V08 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V09 tmp2 [V09 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
-;* V10 tmp3 [V10 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+; V08 tmp1 [V08,T03] ( 2, 4 ) simd32 -> mm2 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V09 tmp2 [V09 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V10 tmp3 [V10 ] ( 0, 0 ) simd32 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ushort]>
+;* V11 tmp4 [V11,T02] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V12 tmp5 [V12 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V13 tmp6 [V13 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V14 tmp7 [V14 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V15 tmp8 [V15 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V16 tmp9 [V16 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;
-; Lcl frame size = 0
+; Lcl frame size = 8
G_M46395_IG01:
- ;; size=0 bbWeight=1 PerfScore 0.00
+ push rax
+ ;; size=1 bbWeight=1 PerfScore 1.00
G_M46395_IG02:
vmovups ymm0, ymmword ptr [rsi]
vmovups ymm1, ymmword ptr [rsi+0x20]
vpor ymm2, ymm0, ymm1
vptest ymm2, ymmword ptr [reloc @RWD00]
je SHORT G_M46395_IG05
;; size=24 bbWeight=1 PerfScore 18.33
G_M46395_IG03:
vxorps ymm0, ymm0, ymm0
vmovups ymmword ptr [r8], ymm0
xor eax, eax
;; size=11 bbWeight=0.50 PerfScore 1.29
G_M46395_IG04:
vzeroupper
+ add rsp, 8
ret
- ;; size=4 bbWeight=0.50 PerfScore 1.00
+ ;; size=8 bbWeight=0.50 PerfScore 1.12
G_M46395_IG05:
vpmovwb ymm0, ymm0
vpmovwb ymm1, ymm1
vinserti128 ymm0, ymm0, xmm1, 1
vmovups ymmword ptr [r8], ymm0
mov eax, 1
;; size=28 bbWeight=0.50 PerfScore 5.12
G_M46395_IG06:
vzeroupper
+ add rsp, 8
ret
- ;; size=4 bbWeight=0.50 PerfScore 1.00
+ ;; size=8 bbWeight=0.50 PerfScore 1.12
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 71, prolog size 0, PerfScore 26.75, instruction count 17, allocated bytes for code 71 (MethodHash=fb454ac4) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this (FullOpts)
+; Total bytes of code 80, prolog size 1, PerfScore 28.00, instruction count 20, allocated bytes for code 80 (MethodHash=fb454ac4) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadAvxVector256(ulong,ulong,int,byref):ubyte:this (FullOpts)
9 (17.65 % of base) - System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this
; Assembly listing for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
-; 0 inlinees with PGO data; 0 single block inlinees; 2 inlinees without PGO data
+; 0 inlinees with PGO data; 2 single block inlinees; 5 inlinees without PGO data
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) byref -> zero-ref this single-def
; V01 arg1 [V01,T00] ( 4, 4 ) long -> rsi single-def
;* V02 arg2 [V02 ] ( 0, 0 ) long -> zero-ref single-def
;* V03 arg3 [V03 ] ( 0, 0 ) int -> zero-ref single-def
; V04 arg4 [V04,T01] ( 4, 3 ) byref -> r8 single-def
-; V05 loc0 [V05,T02] ( 3, 2.50) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ushort]>
-; V06 loc1 [V06,T03] ( 3, 2.50) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V05 loc0 [V05,T04] ( 3, 2.50) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V06 loc1 [V06,T05] ( 3, 2.50) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[ushort]>
;# V07 OutArgs [V07 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V08 tmp1 [V08 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V09 tmp2 [V09 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V10 tmp3 [V10 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V11 tmp4 [V11 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+; V08 tmp1 [V08,T03] ( 2, 4 ) simd16 -> mm2 "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V09 tmp2 [V09 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
+;* V10 tmp3 [V10 ] ( 0, 0 ) simd16 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+;* V11 tmp4 [V11,T02] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V12 tmp5 [V12 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V13 tmp6 [V13 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V14 tmp7 [V14 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V15 tmp8 [V15 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V16 tmp9 [V16 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V17 tmp10 [V17 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;
-; Lcl frame size = 0
+; Lcl frame size = 8
G_M11006_IG01:
- ;; size=0 bbWeight=1 PerfScore 0.00
+ push rax
+ ;; size=1 bbWeight=1 PerfScore 1.00
G_M11006_IG02:
vmovups xmm0, xmmword ptr [rsi]
vmovups xmm1, xmmword ptr [rsi+0x10]
vpor xmm2, xmm0, xmm1
vptest xmm2, xmmword ptr [reloc @RWD00]
je SHORT G_M11006_IG05
;; size=24 bbWeight=1 PerfScore 14.33
G_M11006_IG03:
vxorps xmm0, xmm0, xmm0
vmovups xmmword ptr [r8], xmm0
xor eax, eax
;; size=11 bbWeight=0.50 PerfScore 1.29
G_M11006_IG04:
+ add rsp, 8
ret
- ;; size=1 bbWeight=0.50 PerfScore 0.50
+ ;; size=5 bbWeight=0.50 PerfScore 0.62
G_M11006_IG05:
vpackuswb xmm0, xmm0, xmm1
vmovups xmmword ptr [r8], xmm0
mov eax, 1
;; size=14 bbWeight=0.50 PerfScore 1.62
G_M11006_IG06:
+ add rsp, 8
ret
- ;; size=1 bbWeight=0.50 PerfScore 0.50
+ ;; size=5 bbWeight=0.50 PerfScore 0.62
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
-; Total bytes of code 51, prolog size 0, PerfScore 18.25, instruction count 13, allocated bytes for code 51 (MethodHash=0badd501) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this (FullOpts)
+; Total bytes of code 60, prolog size 1, PerfScore 19.50, instruction count 16, allocated bytes for code 60 (MethodHash=0badd501) for method System.Buffers.Text.Base64Url+Base64UrlDecoderChar:TryLoadVector128(ulong,ulong,int,byref):ubyte:this (FullOpts)
7 (1.77 % of base) - System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong
; Assembly listing for method System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 2 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 32, 34.50) long -> rbx
; V01 arg1 [V01,T01] ( 17, 10 ) long -> rsi
;* V02 loc0 [V02,T08] ( 0, 0 ) int -> zero-ref
;* V03 loc1 [V03,T09] ( 0, 0 ) int -> zero-ref
-; V04 loc2 [V04,T10] ( 9, 11.50) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[ushort]>
-; V05 loc3 [V05,T11] ( 3, 8.50) simd16 -> mm3 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V04 loc2 [V04,T11] ( 9, 11.50) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V05 loc3 [V05,T12] ( 3, 8.50) simd16 -> mm3 <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V06 loc4 [V06,T04] ( 4, 2 ) int -> r14
; V07 loc5 [V07,T03] ( 8, 4 ) long -> r15
-; V08 loc6 [V08,T12] ( 5, 6 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ushort]>
-; V09 loc7 [V09,T13] ( 3, 1.50) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V08 loc6 [V08,T13] ( 5, 6 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V09 loc7 [V09,T14] ( 3, 1.50) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V10 loc8 [V10,T05] ( 3, 1.50) int -> rdi
; V11 loc9 [V11,T02] ( 2, 4.50) long -> rdi
;* V12 loc10 [V12 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V13 loc11 [V13,T07] ( 2, 1 ) long -> rdi
;* V14 loc12 [V14 ] ( 0, 0 ) int -> zero-ref
;# V15 OutArgs [V15 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V16 cse0 [V16,T06] ( 3, 1.50) long -> rdi "CSE #01: moderate"
+; V17 rat0 [V17,T10] ( 3, 24 ) simd16 -> mm4 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 8
G_M38868_IG01:
push rbp
push r15
push r14
push rbx
push rax
lea rbp, [rsp+0x20]
mov rbx, rdi
;; size=15 bbWeight=1 PerfScore 5.75
G_M38868_IG02:
test rsi, rsi
jne SHORT G_M38868_IG05
;; size=5 bbWeight=1 PerfScore 1.25
G_M38868_IG03:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M38868_IG04:
add rsp, 8
pop rbx
pop r14
pop r15
pop rbp
ret
;; size=11 bbWeight=0.50 PerfScore 1.62
G_M38868_IG05:
mov r15, rbx
cmp rsi, 8
jb G_M38868_IG10
vmovups xmm0, xmmword ptr [reloc @RWD00]
vmovups xmm1, xmmword ptr [reloc @RWD16]
vpaddusw xmm2, xmm1, xmmword ptr [r15]
vpmovmskb r14d, xmm2
test r14d, 0xAAAA
jne G_M38868_IG18
add rsi, rsi
cmp rsi, 32
jb SHORT G_M38868_IG08
lea rbx, [r15+0x10]
and rbx, -16
add rsi, r15
sub rsi, rbx
cmp rsi, 32
jb SHORT G_M38868_IG07
lea rdi, [rbx+rsi-0x20]
align [0 bytes for IG06]
;; size=85 bbWeight=0.50 PerfScore 9.38
G_M38868_IG06:
vmovdqa xmm2, xmmword ptr [rbx]
vmovdqa xmm3, xmmword ptr [rbx+0x10]
- vpor xmm4, xmm2, xmm3
- vptest xmm4, xmm0
+ vmovaps xmm4, xmm2
+ vpternlogd xmm4, xmm3, xmm0, -88
+ vptest xmm4, xmm4
jne G_M38868_IG16
add rbx, 32
cmp rbx, rdi
jbe SHORT G_M38868_IG06
- ;; size=33 bbWeight=4 PerfScore 55.33
+ ;; size=40 bbWeight=4 PerfScore 57.00
G_M38868_IG07:
test sil, 16
je SHORT G_M38868_IG09
vmovdqa xmm2, xmmword ptr [rbx]
vptest xmm2, xmm0
jne G_M38868_IG17
;; size=21 bbWeight=0.50 PerfScore 4.62
G_M38868_IG08:
add rbx, 16
;; size=4 bbWeight=0.50 PerfScore 0.12
G_M38868_IG09:
movzx rdi, sil
test dil, 15
je G_M38868_IG19
and rsi, 15
add rsi, rbx
mov rbx, rsi
sub rbx, 16
vmovups xmm2, xmmword ptr [rbx]
vptest xmm2, xmm0
jne G_M38868_IG17
add rbx, 16
jmp G_M38868_IG19
;; size=52 bbWeight=0.50 PerfScore 6.38
G_M38868_IG10:
test sil, 4
je SHORT G_M38868_IG12
mov rdi, qword ptr [r15]
mov rax, 0xD1FFAB1E
and rdi, rax
je SHORT G_M38868_IG11
xor ebx, ebx
tzcnt rbx, rdi
shr rbx, 3
and rbx, -2
add rbx, r15
jmp SHORT G_M38868_IG19
;; size=44 bbWeight=0.50 PerfScore 5.00
G_M38868_IG11:
lea rbx, [r15+0x08]
;; size=4 bbWeight=0.50 PerfScore 0.25
G_M38868_IG12:
test sil, 2
je SHORT G_M38868_IG13
mov edi, dword ptr [rbx]
test edi, 0xD1FFAB1E
jne SHORT G_M38868_IG14
add rbx, 4
;; size=20 bbWeight=0.50 PerfScore 2.38
G_M38868_IG13:
test sil, 1
je SHORT G_M38868_IG19
cmp word ptr [rbx], 255
ja SHORT G_M38868_IG19
jmp SHORT G_M38868_IG15
;; size=15 bbWeight=0.50 PerfScore 3.62
G_M38868_IG14:
mov rax, 0xD1FFAB1E ; code for System.Text.Latin1Utility:FirstCharInUInt32IsLatin1(uint):ubyte
call [rax]System.Text.Latin1Utility:FirstCharInUInt32IsLatin1(uint):ubyte
test eax, eax
je SHORT G_M38868_IG19
;; size=16 bbWeight=0.50 PerfScore 2.25
G_M38868_IG15:
add rbx, 2
jmp SHORT G_M38868_IG19
;; size=6 bbWeight=0.50 PerfScore 1.12
G_M38868_IG16:
vptest xmm2, xmm0
jne SHORT G_M38868_IG17
add rbx, 16
vmovaps xmm2, xmm3
;; size=15 bbWeight=0.50 PerfScore 2.25
G_M38868_IG17:
vpaddusw xmm0, xmm2, xmm1
vpmovmskb r14d, xmm0
;; size=8 bbWeight=0.50 PerfScore 1.17
G_M38868_IG18:
and r14d, 0xAAAA
xor eax, eax
tzcnt eax, r14d
lea rbx, [rbx+rax-0x01]
;; size=19 bbWeight=0.50 PerfScore 1.75
G_M38868_IG19:
mov rax, rbx
sub rax, r15
shr rax, 1
;; size=9 bbWeight=0.50 PerfScore 0.50
G_M38868_IG20:
add rsp, 8
pop rbx
pop r14
pop r15
pop rbp
ret
;; size=11 bbWeight=0.50 PerfScore 1.62
RWD00 dq FF00FF00FF00FF00h, FF00FF00FF00FF00h
RWD16 dq 7F007F007F007F00h, 7F007F007F007F00h
-; Total bytes of code 395, prolog size 15, PerfScore 106.50, instruction count 111, allocated bytes for code 395 (MethodHash=0f68682b) for method System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong (FullOpts)
+; Total bytes of code 402, prolog size 15, PerfScore 108.17, instruction count 112, allocated bytes for code 402 (MethodHash=0f68682b) for method System.Text.Latin1Utility:GetIndexOfFirstNonLatin1Char_Sse2(ulong,ulong):ulong (FullOpts)
7 (4.70 % of base) - System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong
; Assembly listing for method System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T01] ( 6, 11.50) long -> rdi single-def
; V01 arg1 [V01,T02] ( 8, 8.50) long -> rsi single-def
; V02 arg2 [V02,T03] ( 3, 2.50) long -> rdx single-def
;* V03 loc0 [V03,T05] ( 0, 0 ) int -> zero-ref
;* V04 loc1 [V04 ] ( 0, 0 ) long -> zero-ref
-; V05 loc2 [V05,T08] ( 5, 7 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[short]>
+; V05 loc2 [V05,T09] ( 5, 7 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[short]>
;* V06 loc3 [V06 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
-; V07 loc4 [V07,T06] ( 14, 18.50) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[short]>
+; V07 loc4 [V07,T07] ( 14, 18.50) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[short]>
;* V08 loc5 [V08 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V09 loc6 [V09,T00] ( 12, 27 ) long -> rax
; V10 loc7 [V10,T04] ( 2, 4.50) long -> rdx
-; V11 loc8 [V11,T07] ( 3, 12 ) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[short]>
+; V11 loc8 [V11,T08] ( 3, 12 ) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[short]>
;* V12 loc9 [V12 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;# V13 OutArgs [V13 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V14 rat0 [V14,T06] ( 3, 24 ) simd16 -> mm3 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M23879_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M23879_IG02:
vmovups xmm0, xmmword ptr [reloc @RWD00]
vmovups xmm1, xmmword ptr [rdi]
vptest xmm1, xmm0
je SHORT G_M23879_IG05
;; size=19 bbWeight=1 PerfScore 11.00
G_M23879_IG03:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M23879_IG04:
pop rbp
ret
;; size=2 bbWeight=0.50 PerfScore 0.75
G_M23879_IG05:
vpackuswb xmm1, xmm1, xmm1
vmovq qword ptr [rsi], xmm1
mov eax, 8
test sil, 8
jne SHORT G_M23879_IG06
vmovups xmm1, xmmword ptr [rdi+0x10]
vptest xmm1, xmm0
jne SHORT G_M23879_IG08
vpackuswb xmm1, xmm1, xmm1
vmovq qword ptr [rsi+0x08], xmm1
;; size=40 bbWeight=0.50 PerfScore 7.75
G_M23879_IG06:
mov rax, rsi
and rax, 15
neg rax
add rax, 16
add rdx, -16
align [0 bytes for IG07]
;; size=18 bbWeight=0.50 PerfScore 0.62
G_M23879_IG07:
vmovups xmm1, xmmword ptr [rdi+2*rax]
vmovups xmm2, xmmword ptr [rdi+2*rax+0x10]
- vpor xmm3, xmm1, xmm2
- vptest xmm3, xmm0
+ vmovaps xmm3, xmm1
+ vpternlogd xmm3, xmm2, xmm0, -88
+ vptest xmm3, xmm3
jne SHORT G_M23879_IG09
vpackuswb xmm1, xmm1, xmm2
vmovdqa xmmword ptr [rsi+rax], xmm1
add rax, 16
cmp rax, rdx
jbe SHORT G_M23879_IG07
- ;; size=40 bbWeight=4 PerfScore 67.33
+ ;; size=47 bbWeight=4 PerfScore 69.00
G_M23879_IG08:
pop rbp
ret
;; size=2 bbWeight=0.50 PerfScore 0.75
G_M23879_IG09:
vptest xmm1, xmm0
jne SHORT G_M23879_IG08
vpackuswb xmm0, xmm1, xmm1
vmovq qword ptr [rsi+rax], xmm0
add rax, 8
jmp SHORT G_M23879_IG08
;; size=22 bbWeight=0.50 PerfScore 4.62
RWD00 dq FF00FF00FF00FF00h, FF00FF00FF00FF00h
-; Total bytes of code 149, prolog size 4, PerfScore 94.21, instruction count 43, allocated bytes for code 149 (MethodHash=f65ba2b8) for method System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 156, prolog size 4, PerfScore 95.88, instruction count 44, allocated bytes for code 156 (MethodHash=f65ba2b8) for method System.Text.Latin1Utility:NarrowUtf16ToLatin1_Sse2(ulong,ulong,ulong):ulong (FullOpts)
Larger list of diffs: https://gist.github.com/MihuBot/7a2eed39af0a5c637a756b96ef7699c8
Top method improvements
-8 (-2.42 % of base) - System.HexConverter:TryDecodeFromUtf16_Vector128(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte],byref):ubyte
; Assembly listing for method System.HexConverter:TryDecodeFromUtf16_Vector128(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte],byref):ubyte (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 6 single block inlinees; 7 inlinees without PGO data
+; 0 inlinees with PGO data; 6 single block inlinees; 10 inlinees without PGO data
; Final local variable assignments
;
;* V00 arg0 [V00 ] ( 0, 0 ) struct (16) zero-ref multireg-arg ld-addr-op single-def <System.ReadOnlySpan`1[ushort]>
;* V01 arg1 [V01 ] ( 0, 0 ) struct (16) zero-ref multireg-arg ld-addr-op single-def <System.Span`1[ubyte]>
-; V02 arg2 [V02,T06] ( 4, 3 ) byref -> rbx single-def
+; V02 arg2 [V02,T07] ( 4, 3 ) byref -> rbx single-def
; V03 loc0 [V03,T00] ( 12, 42.50) long -> r15
; V04 loc1 [V04,T02] ( 3, 9 ) long -> r13
-;* V05 loc2 [V05,T19] ( 0, 0 ) byref -> zero-ref single-def
-;* V06 loc3 [V06,T20] ( 0, 0 ) byref -> zero-ref single-def
+;* V05 loc2 [V05,T20] ( 0, 0 ) byref -> zero-ref single-def
+;* V06 loc3 [V06,T21] ( 0, 0 ) byref -> zero-ref single-def
; V07 loc4 [V07 ] ( 2, 1 ) int -> [rbp-0x28] do-not-enreg[X] addr-exposed ld-addr-op
-; V08 loc5 [V08,T23] ( 3, 24 ) simd16 -> mm7 <System.Runtime.Intrinsics.Vector128`1[ushort]>
-; V09 loc6 [V09,T24] ( 3, 24 ) simd16 -> mm8 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V08 loc5 [V08,T24] ( 3, 24 ) simd16 -> mm6 <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V09 loc6 [V09,T25] ( 3, 24 ) simd16 -> mm7 <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V11 loc8 [V11 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-; V12 loc9 [V12,T25] ( 3, 16 ) simd16 -> mm9 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+; V12 loc9 [V12,T26] ( 3, 16 ) simd16 -> mm8 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V13 loc10 [V13 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V14 loc11 [V14 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;* V15 loc12 [V15 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;# V16 OutArgs [V16 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V17 tmp1 [V17,T21] ( 3, 48 ) simd16 -> mm9 "dup spill"
+; V17 tmp1 [V17,T22] ( 3, 48 ) simd16 -> mm8 "dup spill"
;* V18 tmp2 [V18 ] ( 0, 0 ) struct (16) zero-ref "impAppendStmt" <System.ReadOnlySpan`1[ushort]>
;* V19 tmp3 [V19 ] ( 0, 0 ) struct (16) zero-ref "spilled call-like call argument" <System.Span`1[ubyte]>
-; V20 tmp4 [V20,T12] ( 2, 2 ) int -> rax "impAppendStmt"
+; V20 tmp4 [V20,T13] ( 2, 2 ) int -> rax "impAppendStmt"
;* V21 tmp5 [V21 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
;* V22 tmp6 [V22 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[ushort]>
;* V23 tmp7 [V23 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inlining Arg" <System.Span`1[ubyte]>
;* V24 tmp8 [V24 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V25 tmp9 [V25 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V26 tmp10 [V26 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V27 tmp11 [V27 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V28 tmp12 [V28 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V29 tmp13 [V29 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
+; V29 tmp13 [V29,T23] ( 2, 32 ) simd16 -> mm6 "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V30 tmp14 [V30 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
;* V31 tmp15 [V31 ] ( 0, 0 ) simd16 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ushort]>
-;* V32 tmp16 [V32 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V33 tmp17 [V33 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-; V34 tmp18 [V34,T07] ( 4, 4 ) int -> r8 "Inlining Arg"
-;* V35 tmp19 [V35 ] ( 0, 0 ) struct (16) zero-ref multireg-arg ld-addr-op "NewObj constructor temp" <System.ReadOnlySpan`1[ushort]>
-; V36 tmp20 [V36,T10] ( 2, 2 ) byref -> rdi single-def "Inlining Arg"
-; V37 tmp21 [V37,T13] ( 2, 2 ) int -> rsi "Inlining Arg"
-; V38 tmp22 [V38,T08] ( 4, 4 ) int -> r8 "Inlining Arg"
-;* V39 tmp23 [V39 ] ( 0, 0 ) struct (16) zero-ref multireg-arg ld-addr-op "NewObj constructor temp" <System.Span`1[ubyte]>
-; V40 tmp24 [V40,T11] ( 2, 2 ) byref -> rdx single-def "Inlining Arg"
-; V41 tmp25 [V41,T14] ( 2, 2 ) int -> rcx "Inlining Arg"
-; V42 tmp26 [V42,T01] ( 4, 17.50) byref -> rdi single-def "field V00._reference (fldOffset=0x0)" P-INDEP
-; V43 tmp27 [V43,T05] ( 5, 3.50) int -> rsi single-def "field V00._length (fldOffset=0x8)" P-INDEP
-; V44 tmp28 [V44,T03] ( 3, 5.50) byref -> rdx single-def "field V01._reference (fldOffset=0x0)" P-INDEP
-; V45 tmp29 [V45,T09] ( 3, 2 ) int -> rcx single-def "field V01._length (fldOffset=0x8)" P-INDEP
-;* V46 tmp30 [V46 ] ( 0, 0 ) byref -> zero-ref single-def "field V18._reference (fldOffset=0x0)" P-INDEP
-;* V47 tmp31 [V47 ] ( 0, 0 ) int -> zero-ref "field V18._length (fldOffset=0x8)" P-INDEP
-;* V48 tmp32 [V48 ] ( 0, 0 ) byref -> zero-ref "field V19._reference (fldOffset=0x0)" P-INDEP
-;* V49 tmp33 [V49 ] ( 0, 0 ) int -> zero-ref "field V19._length (fldOffset=0x8)" P-INDEP
-;* V50 tmp34 [V50 ] ( 0, 0 ) byref -> zero-ref single-def "field V22._reference (fldOffset=0x0)" P-INDEP
-;* V51 tmp35 [V51 ] ( 0, 0 ) int -> zero-ref "field V22._length (fldOffset=0x8)" P-INDEP
-;* V52 tmp36 [V52 ] ( 0, 0 ) byref -> zero-ref single-def "field V23._reference (fldOffset=0x0)" P-INDEP
-;* V53 tmp37 [V53 ] ( 0, 0 ) int -> zero-ref "field V23._length (fldOffset=0x8)" P-INDEP
-; V54 tmp38 [V54,T15] ( 2, 1 ) byref -> rdi single-def "field V35._reference (fldOffset=0x0)" P-INDEP
-; V55 tmp39 [V55,T17] ( 2, 1 ) int -> rsi "field V35._length (fldOffset=0x8)" P-INDEP
-; V56 tmp40 [V56,T16] ( 2, 1 ) byref -> rdx single-def "field V39._reference (fldOffset=0x0)" P-INDEP
-; V57 tmp41 [V57,T18] ( 2, 1 ) int -> rcx "field V39._length (fldOffset=0x8)" P-INDEP
-; V58 cse0 [V58,T26] ( 2, 9 ) simd16 -> mm0 hoist "CSE #01: aggressive"
-; V59 cse1 [V59,T27] ( 2, 9 ) simd16 -> mm1 hoist "CSE #02: aggressive"
-; V60 cse2 [V60,T28] ( 2, 9 ) simd16 -> mm2 hoist "CSE #03: aggressive"
-; V61 cse3 [V61,T29] ( 2, 9 ) simd16 -> mm3 hoist "CSE #04: aggressive"
-; V62 cse4 [V62,T30] ( 2, 9 ) simd16 -> mm4 hoist "CSE #05: aggressive"
-; V63 cse5 [V63,T31] ( 2, 9 ) simd16 -> mm5 hoist "CSE #06: aggressive"
-; V64 cse6 [V64,T32] ( 2, 9 ) simd16 -> mm6 hoist "CSE #07: aggressive"
-; V65 cse7 [V65,T04] ( 3, 6 ) long -> r14 "CSE #08: aggressive"
-; V66 rat0 [V66,T22] ( 3, 48 ) simd16 -> mm7 "ReplaceWithLclVar is creating a new local variable"
+;* V32 tmp16 [V32,T04] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V33 tmp17 [V33 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V34 tmp18 [V34 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V35 tmp19 [V35 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V36 tmp20 [V36 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V37 tmp21 [V37 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V38 tmp22 [V38 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
+; V39 tmp23 [V39,T08] ( 4, 4 ) int -> r8 "Inlining Arg"
+;* V40 tmp24 [V40 ] ( 0, 0 ) struct (16) zero-ref multireg-arg ld-addr-op "NewObj constructor temp" <System.ReadOnlySpan`1[ushort]>
+; V41 tmp25 [V41,T11] ( 2, 2 ) byref -> rdi single-def "Inlining Arg"
+; V42 tmp26 [V42,T14] ( 2, 2 ) int -> rsi "Inlining Arg"
+; V43 tmp27 [V43,T09] ( 4, 4 ) int -> r8 "Inlining Arg"
+;* V44 tmp28 [V44 ] ( 0, 0 ) struct (16) zero-ref multireg-arg ld-addr-op "NewObj constructor temp" <System.Span`1[ubyte]>
+; V45 tmp29 [V45,T12] ( 2, 2 ) byref -> rdx single-def "Inlining Arg"
+; V46 tmp30 [V46,T15] ( 2, 2 ) int -> rcx "Inlining Arg"
+; V47 tmp31 [V47,T01] ( 4, 17.50) byref -> rdi single-def "field V00._reference (fldOffset=0x0)" P-INDEP
+; V48 tmp32 [V48,T06] ( 5, 3.50) int -> rsi single-def "field V00._length (fldOffset=0x8)" P-INDEP
+; V49 tmp33 [V49,T03] ( 3, 5.50) byref -> rdx single-def "field V01._reference (fldOffset=0x0)" P-INDEP
+; V50 tmp34 [V50,T10] ( 3, 2 ) int -> rcx single-def "field V01._length (fldOffset=0x8)" P-INDEP
+;* V51 tmp35 [V51 ] ( 0, 0 ) byref -> zero-ref single-def "field V18._reference (fldOffset=0x0)" P-INDEP
+;* V52 tmp36 [V52 ] ( 0, 0 ) int -> zero-ref "field V18._length (fldOffset=0x8)" P-INDEP
+;* V53 tmp37 [V53 ] ( 0, 0 ) byref -> zero-ref "field V19._reference (fldOffset=0x0)" P-INDEP
+;* V54 tmp38 [V54 ] ( 0, 0 ) int -> zero-ref "field V19._length (fldOffset=0x8)" P-INDEP
+;* V55 tmp39 [V55 ] ( 0, 0 ) byref -> zero-ref single-def "field V22._reference (fldOffset=0x0)" P-INDEP
+;* V56 tmp40 [V56 ] ( 0, 0 ) int -> zero-ref "field V22._length (fldOffset=0x8)" P-INDEP
+;* V57 tmp41 [V57 ] ( 0, 0 ) byref -> zero-ref single-def "field V23._reference (fldOffset=0x0)" P-INDEP
+;* V58 tmp42 [V58 ] ( 0, 0 ) int -> zero-ref "field V23._length (fldOffset=0x8)" P-INDEP
+; V59 tmp43 [V59,T16] ( 2, 1 ) byref -> rdi single-def "field V40._reference (fldOffset=0x0)" P-INDEP
+; V60 tmp44 [V60,T18] ( 2, 1 ) int -> rsi "field V40._length (fldOffset=0x8)" P-INDEP
+; V61 tmp45 [V61,T17] ( 2, 1 ) byref -> rdx single-def "field V44._reference (fldOffset=0x0)" P-INDEP
+; V62 tmp46 [V62,T19] ( 2, 1 ) int -> rcx "field V44._length (fldOffset=0x8)" P-INDEP
+; V63 cse0 [V63,T27] ( 2, 9 ) simd16 -> mm0 hoist "CSE #01: aggressive"
+; V64 cse1 [V64,T28] ( 2, 9 ) simd16 -> mm1 hoist "CSE #02: aggressive"
+; V65 cse2 [V65,T29] ( 2, 9 ) simd16 -> mm2 hoist "CSE #03: aggressive"
+; V66 cse3 [V66,T30] ( 2, 9 ) simd16 -> mm3 hoist "CSE #04: aggressive"
+; V67 cse4 [V67,T31] ( 2, 9 ) simd16 -> mm4 hoist "CSE #05: aggressive"
+; V68 cse5 [V68,T32] ( 2, 9 ) simd16 -> mm5 hoist "CSE #06: aggressive"
+; V69 cse6 [V69,T05] ( 3, 6 ) long -> r14 "CSE #07: aggressive"
;
; Lcl frame size = 16
G_M6966_IG01:
push rbp
push r15
push r14
push r13
push rbx
sub rsp, 16
lea rbp, [rsp+0x30]
mov rbx, r8
;; size=20 bbWeight=1 PerfScore 6.00
G_M6966_IG02:
xor r15d, r15d
mov r14d, esi
lea r13, [r14-0x10]
vmovups xmm0, xmmword ptr [reloc @RWD00]
vmovups xmm1, xmmword ptr [reloc @RWD16]
vmovups xmm2, xmmword ptr [reloc @RWD32]
vmovups xmm3, xmmword ptr [reloc @RWD48]
vmovups xmm4, xmmword ptr [reloc @RWD64]
vmovups xmm5, xmmword ptr [reloc @RWD80]
- vmovups xmm6, xmmword ptr [reloc @RWD96]
jmp SHORT G_M6966_IG04
align [0 bytes for IG03]
- ;; size=68 bbWeight=1 PerfScore 24.00
+ ;; size=60 bbWeight=1 PerfScore 21.00
G_M6966_IG03:
mov r15, r13
;; size=3 bbWeight=4 PerfScore 1.00
G_M6966_IG04:
- vmovups xmm7, xmmword ptr [rdi+2*r15]
- vmovups xmm8, xmmword ptr [rdi+2*r15+0x10]
- vpackuswb xmm9, xmm7, xmm8
- vpaddb xmm10, xmm0, xmm9
- vpsubusb xmm10, xmm10, xmm1
- vpsubb xmm10, xmm10, xmm2
- vpand xmm9, xmm3, xmm9
- vpsubb xmm9, xmm9, xmm4
- vpaddusb xmm9, xmm9, xmm5
- vpminub xmm9, xmm9, xmm10
- vpternlogd xmm7, xmm8, xmm6, -88
- vptest xmm7, xmm7
+ vmovups xmm6, xmmword ptr [rdi+2*r15]
+ vmovups xmm7, xmmword ptr [rdi+2*r15+0x10]
+ vpackuswb xmm8, xmm6, xmm7
+ vpaddb xmm9, xmm0, xmm8
+ vpsubusb xmm9, xmm9, xmm1
+ vpsubb xmm9, xmm9, xmm2
+ vpand xmm8, xmm3, xmm8
+ vpsubb xmm8, xmm8, xmm4
+ vpaddusb xmm8, xmm8, xmm5
+ vpminub xmm8, xmm8, xmm9
+ vpor xmm6, xmm6, xmm7
+ vptest xmm6, xmmword ptr [reloc @RWD96]
jne SHORT G_M6966_IG08
- ;; size=63 bbWeight=8 PerfScore 128.00
+ ;; size=63 bbWeight=8 PerfScore 142.67
G_M6966_IG05:
- vpaddusb xmm7, xmm9, xmmword ptr [reloc @RWD112]
- vpmovmskb eax, xmm7
+ vpaddusb xmm6, xmm8, xmmword ptr [reloc @RWD112]
+ vpmovmskb eax, xmm6
test eax, eax
jne SHORT G_M6966_IG08
- vpmaddubsw xmm7, xmm9, xmmword ptr [reloc @RWD128]
- vpshufb xmm7, xmm7, xmmword ptr [reloc @RWD144]
+ vpmaddubsw xmm6, xmm8, xmmword ptr [reloc @RWD128]
+ vpshufb xmm6, xmm6, xmmword ptr [reloc @RWD144]
mov rax, r15
shr rax, 1
- vmovd qword ptr [rdx+rax], xmm7
+ vmovd qword ptr [rdx+rax], xmm6
add r15, 16
cmp r15, r14
jne SHORT G_M6966_IG10
;; size=55 bbWeight=4 PerfScore 70.00
G_M6966_IG06:
mov dword ptr [rbx], esi
mov eax, 1
;; size=7 bbWeight=0.50 PerfScore 0.62
G_M6966_IG07:
add rsp, 16
pop rbx
pop r13
pop r14
pop r15
pop rbp
ret
;; size=13 bbWeight=0.50 PerfScore 1.88
G_M6966_IG08:
mov r8d, r15d
cmp r8d, esi
ja SHORT G_M6966_IG11
mov eax, r8d
lea rdi, bword ptr [rdi+2*rax]
sub esi, r8d
mov r8, r15
shr r8, 1
cmp r8d, ecx
ja SHORT G_M6966_IG11
mov eax, r8d
add rdx, rax
sub ecx, r8d
lea r8, [rbp-0x28]
mov rax, 0xD1FFAB1E ; code for System.HexConverter:TryDecodeFromUtf16_Scalar(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte],byref):ubyte
call [rax]System.HexConverter:TryDecodeFromUtf16_Scalar(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte],byref):ubyte
mov ecx, r15d
add ecx, dword ptr [rbp-0x28]
mov dword ptr [rbx], ecx
;; size=62 bbWeight=0.50 PerfScore 6.12
G_M6966_IG09:
add rsp, 16
pop rbx
pop r13
pop r14
pop r15
pop rbp
ret
;; size=13 bbWeight=0.50 PerfScore 1.88
G_M6966_IG10:
cmp r15, r13
jbe G_M6966_IG04
jmp G_M6966_IG03
;; size=14 bbWeight=4 PerfScore 13.00
G_M6966_IG11:
mov rax, 0xD1FFAB1E ; code for System.ThrowHelper:ThrowArgumentOutOfRangeException()
call [rax]System.ThrowHelper:ThrowArgumentOutOfRangeException()
int3
;; size=13 bbWeight=0 PerfScore 0.00
RWD00 dq C6C6C6C6C6C6C6C6h, C6C6C6C6C6C6C6C6h
RWD16 dq 0606060606060606h, 0606060606060606h
RWD32 dq F0F0F0F0F0F0F0F0h, F0F0F0F0F0F0F0F0h
RWD48 dq DFDFDFDFDFDFDFDFh, DFDFDFDFDFDFDFDFh
RWD64 dq 4141414141414141h, 4141414141414141h
RWD80 dq 0A0A0A0A0A0A0A0Ah, 0A0A0A0A0A0A0A0Ah
RWD96 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h
RWD112 dq 7070707070707070h, 7070707070707070h
RWD128 dq 0110011001100110h, 0110011001100110h
RWD144 dq 0E0C0A0806040200h, 0000000000000000h
-; Total bytes of code 331, prolog size 20, PerfScore 252.50, instruction count 87, allocated bytes for code 331 (MethodHash=bb7ae4c9) for method System.HexConverter:TryDecodeFromUtf16_Vector128(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte],byref):ubyte (FullOpts)
+; Total bytes of code 323, prolog size 20, PerfScore 264.17, instruction count 86, allocated bytes for code 323 (MethodHash=bb7ae4c9) for method System.HexConverter:TryDecodeFromUtf16_Vector128(System.ReadOnlySpan`1[ushort],System.Span`1[ubyte],byref):ubyte (FullOpts)
-7 (-2.86 % of base) - System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong
; Assembly listing for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 8 single block inlinees; 4 inlinees without PGO data
+; 0 inlinees with PGO data; 12 single block inlinees; 20 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T04] ( 3, 3 ) long -> rdi single-def
; V01 arg1 [V01,T03] ( 5, 3.50) long -> rsi single-def
; V02 arg2 [V02,T05] ( 3, 2.50) long -> rdx single-def
; V03 loc0 [V03,T01] ( 5, 10.50) byref -> rdi single-def
-; V04 loc1 [V04,T08] ( 14, 18.50) simd64 -> mm0 <System.Runtime.Intrinsics.Vector512`1[ushort]>
+; V04 loc1 [V04,T11] ( 14, 18.50) simd64 -> mm0 <System.Runtime.Intrinsics.Vector512`1[ushort]>
; V05 loc2 [V05,T02] ( 5, 6 ) byref -> rcx single-def
; V06 loc3 [V06,T00] ( 12, 27 ) long -> rax
; V07 loc4 [V07,T06] ( 2, 4.50) long -> rdx
-; V08 loc5 [V08,T10] ( 3, 12 ) simd64 -> mm3 <System.Runtime.Intrinsics.Vector512`1[ushort]>
+; V08 loc5 [V08,T14] ( 3, 12 ) simd64 -> mm3 <System.Runtime.Intrinsics.Vector512`1[ushort]>
;# V09 OutArgs [V09 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V10 tmp1 [V10 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
;* V11 tmp2 [V11 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
-; V12 tmp3 [V12,T09] ( 2, 16 ) simd64 -> mm0 "Spilling op1 side effects for HWIntrinsic"
+; V12 tmp3 [V12,T12] ( 2, 16 ) simd64 -> mm0 "Spilling op1 side effects for HWIntrinsic"
;* V13 tmp4 [V13 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
-;* V14 tmp5 [V14 ] ( 0, 0 ) simd64 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V15 tmp6 [V15 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V16 tmp7 [V16 ] ( 0, 0 ) simd64 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V17 tmp8 [V17 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V18 tmp9 [V18 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
-;* V19 tmp10 [V19 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V20 tmp11 [V20 ] ( 0, 0 ) simd64 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V21 tmp12 [V21 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-; V22 cse0 [V22,T11] ( 5, 7 ) simd64 -> mm1 "CSE #01: moderate"
-; V23 cse1 [V23,T12] ( 5, 6 ) simd64 -> mm2 "CSE #02: moderate"
-; V24 rat0 [V24,T07] ( 3, 24 ) simd64 -> mm4 "ReplaceWithLclVar is creating a new local variable"
+;* V14 tmp5 [V14 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
+;* V15 tmp6 [V15 ] ( 0, 0 ) simd64 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V16 tmp7 [V16,T08] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V17 tmp8 [V17 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V18 tmp9 [V18 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V19 tmp10 [V19 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V20 tmp11 [V20 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V21 tmp12 [V21 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V22 tmp13 [V22 ] ( 0, 0 ) simd64 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V23 tmp14 [V23 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V24 tmp15 [V24 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
+;* V25 tmp16 [V25 ] ( 0, 0 ) simd64 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V26 tmp17 [V26,T09] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V27 tmp18 [V27 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V28 tmp19 [V28 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V29 tmp20 [V29 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V30 tmp21 [V30 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V31 tmp22 [V31 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V32 tmp23 [V32 ] ( 0, 0 ) simd64 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V33 tmp24 [V33 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+; V34 tmp25 [V34,T13] ( 2, 16 ) simd64 -> mm4 "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V35 tmp26 [V35 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
+;* V36 tmp27 [V36 ] ( 0, 0 ) simd64 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V37 tmp28 [V37,T07] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V38 tmp29 [V38 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V39 tmp30 [V39 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V40 tmp31 [V40 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V41 tmp32 [V41 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V42 tmp33 [V42 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V43 tmp34 [V43 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V44 tmp35 [V44 ] ( 0, 0 ) simd64 -> zero-ref "spilled call-like call argument"
+;* V45 tmp36 [V45 ] ( 0, 0 ) simd64 -> zero-ref ld-addr-op "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ushort]>
+;* V46 tmp37 [V46,T10] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V47 tmp38 [V47 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V48 tmp39 [V48 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V49 tmp40 [V49 ] ( 0, 0 ) ushort -> zero-ref "Inline stloc first use temp"
+;* V50 tmp41 [V50 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V51 tmp42 [V51 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V52 tmp43 [V52 ] ( 0, 0 ) simd64 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+;* V53 tmp44 [V53 ] ( 0, 0 ) simd64 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+; V54 cse0 [V54,T16] ( 5, 6 ) simd64 -> mm2 "CSE #02: aggressive"
+; V55 cse1 [V55,T15] ( 5, 7 ) simd64 -> mm1 "CSE #01: aggressive"
;
; Lcl frame size = 0
G_M60939_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M60939_IG02:
vmovups zmm0, zmmword ptr [rdi]
vmovups zmm1, zmmword ptr [reloc @RWD00]
vptestmw k1, zmm1, zmm0
kortestd k1, k1
je SHORT G_M60939_IG05
;; size=29 bbWeight=1 PerfScore 12.00
G_M60939_IG03:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M60939_IG04:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M60939_IG05:
mov rcx, rsi
vpackuswb zmm0, zmm0, zmm0
vmovups zmm2, zmmword ptr [reloc @RWD64]
vpermq zmm0, zmm2, zmm0
vmovups ymmword ptr [rcx], ymm0
mov eax, 32
test sil, 32
jne SHORT G_M60939_IG06
vmovups zmm0, zmmword ptr [rdi+0x40]
vptestmw k1, zmm1, zmm0
kortestd k1, k1
jne SHORT G_M60939_IG08
vpackuswb zmm0, zmm0, zmm0
vpermq zmm0, zmm2, zmm0
vmovups ymmword ptr [rcx+0x20], ymm0
;; size=77 bbWeight=0.50 PerfScore 11.88
G_M60939_IG06:
and rsi, 63
mov rax, rsi
neg rax
add rax, 64
add rdx, -64
align [0 bytes for IG07]
;; size=18 bbWeight=0.50 PerfScore 0.62
G_M60939_IG07:
vmovups zmm0, zmmword ptr [rdi+2*rax]
vmovups zmm3, zmmword ptr [rdi+2*rax+0x40]
- vmovaps zmm4, zmm0
- vpternlogd zmm4, zmm3, zmm1, -88
- vptestmw k1, zmm4, zmm4
+ vpord zmm4, zmm0, zmm3
+ vptestmw k1, zmm1, zmm4
kortestd k1, k1
jne SHORT G_M60939_IG09
vpackuswb zmm0, zmm0, zmm3
vpermq zmm0, zmm2, zmm0
vmovups zmmword ptr [rcx+rax], zmm0
add rax, 64
cmp rax, rdx
jbe SHORT G_M60939_IG07
- ;; size=69 bbWeight=4 PerfScore 81.00
+ ;; size=62 bbWeight=4 PerfScore 79.33
G_M60939_IG08:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M60939_IG09:
vptestmw k1, zmm1, zmm0
kortestd k1, k1
jne SHORT G_M60939_IG08
vpackuswb zmm0, zmm0, zmm0
vpermq zmm0, zmm2, zmm0
vmovups ymmword ptr [rcx+rax], ymm0
add rax, 32
jmp SHORT G_M60939_IG08
;; size=36 bbWeight=0.50 PerfScore 6.12
RWD00 dq FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h, FF80FF80FF80FF80h
RWD64 dq 0000000000000000h, 0000000000000002h, 0000000000000004h, 0000000000000006h, 0000000000000001h, 0000000000000003h, 0000000000000005h, 0000000000000007h
-; Total bytes of code 245, prolog size 4, PerfScore 115.50, instruction count 56, allocated bytes for code 257 (MethodHash=483911f4) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (FullOpts)
+; Total bytes of code 238, prolog size 4, PerfScore 113.83, instruction count 55, allocated bytes for code 250 (MethodHash=483911f4) for method System.Text.Ascii:NarrowUtf16ToAscii_Intrinsified_512(ulong,ulong,ulong):ulong (FullOpts)

Larger list of diffs: https://gist.github.com/MihuBot/687c48544de5bea090c5eef155f96737
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Job completed in 15 minutes.
Diffs
Artifacts:
The text was updated successfully, but these errors were encountered: