-
Notifications
You must be signed in to change notification settings - Fork 12.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Handle cvt_scale F32/F16->F4/F8 gfx950 hazard #117844
base: main
Are you sure you want to change the base?
Conversation
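@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
Changes: gfx950 SP changes doc says: No 4 clk forwarding on opcodes that convert from F32/F16->F8 or F32/F16->F4. Must insert a NOP or instruction writing some other destination VREG after a conversion to F4/F8 since it writes either low/high half or bytes.
Co-authored-by: Pravin Jagtap <[email protected]>
Patch is 28.17 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/117844.diff
8 Files Affected: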
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 4c37ef8855a5ba..ecf03b14143ee3 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -909,8 +909,9 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
// There are three different types of instructions
// which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
- // which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
- // CVT_SR_BF8_F32 with op_sel[3:2]
+ // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
+ // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
+ // op_sel[3:2]
// != 0
if (SIInstrInfo::isSDWA(MI)) {
// Type 1: SDWA with dst_sel != DWORD
@@ -918,8 +919,8 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
return nullptr;
} else {
- // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
- // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
+ // Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
+ // with op_sel[3:2] != 0)
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
SISrcMods::DST_OP_SEL ||
@@ -983,7 +984,7 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
}
- if (ST.hasDstSelForwardingHazard()) {
+ if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
const int Shift16DefWaitstates = 1;
auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
@@ -1094,7 +1095,8 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
// problematic thus far.
// see checkVALUHazards()
- if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
+ if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
+ !ST.hasCvtScaleForwardingHazard())
return 0;
const MachineRegisterInfo &MRI = MF.getRegInfo();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ea5e159fdd8363..5cecaf6349c883 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1264,6 +1264,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
+ bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
+
bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
bool requiresCodeObjectV6() const { return RequiresCOV6; }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ab5f0694c07f95..5a0e812748fbb7 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -378,6 +378,14 @@ struct VOPTrue16Info {
bool IsTrue16;
};
+#define GET_FP8DstByteSelTable_DECL
+#define GET_FP8DstByteSelTable_IMPL
+
+struct DPMACCInstructionInfo {
+ uint16_t Opcode;
+ bool IsDPMACCInstruction;
+};
+
struct FP8DstByteSelInfo {
uint16_t Opcode;
bool HasFP8DstByteSel;
@@ -418,6 +426,8 @@ struct FP8DstByteSelInfo {
#define GET_getMFMA_F8F6F4_WithSize_DECL
#define GET_getMFMA_F8F6F4_WithSize_IMPL
#define GET_isMFMA_F8F6F4Table_IMPL
+#define GET_isCvtScaleF32_F32F16ToF8F4Table_IMPL
+
#include "AMDGPUGenSearchableTables.inc"
int getMTBUFBaseOpcode(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 9f7fbec6a542f7..ea497d7b239d7e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -103,6 +103,10 @@ struct MFMA_F8F6F4_Info {
uint8_t NumRegsSrcB;
};
+struct CvtScaleF32_F32F16ToF8F4_Info {
+ unsigned Opcode;
+};
+
#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
@@ -112,6 +116,7 @@ struct MFMA_F8F6F4_Info {
#define GET_MAIInstInfoTable_DECL
#define GET_MAIInstInfoTable_DECL
#define GET_isMFMA_F8F6F4Table_DECL
+#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index c8c36714909adf..1160975f3302a9 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -970,11 +970,16 @@ class VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profil
let HasOMod = 0;
}
+class VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
+ let HasFP8DstByteSel = 1;
+}
+
class VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
Int32InputMods:$src1_modifiers, Src1RC64:$src1,
FP32InputMods:$src2_modifiers, Src2RC64:$src2,
VGPR_32:$vdst_in, op_sel0:$op_sel);
+ let HasFP8DstByteSel = 1;
}
@@ -992,6 +997,7 @@ class VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOPProfile P> : VOP3_Profile<
HasSrc0FloatMods, HasSrc1FloatMods,
HasSrc2FloatMods>.ret);
let HasExtVOP3DPP = 0;
+ let HasFP8DstByteSel = 1;
}
class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
@@ -1004,6 +1010,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
+ let HasFP8DstByteSel = 1;
}
def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
@@ -1015,6 +1022,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
+ let HasFP8DstByteSel = 1;
}
class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
@@ -1090,7 +1098,7 @@ let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in
let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in {
defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
- defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
+ defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
let Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
@@ -2047,6 +2055,7 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0>
}
}
}
+
} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
index 75834316750951..6a25e346c89447 100644
--- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
@@ -255,3 +255,399 @@ body: |
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
...
+
+
+---
+# GCN-LABEL: name: test_cvt_scalef32_sr_fp8_bf16_hazard
+# GCN: V_CVT_SCALEF32_SR_FP8_BF16_e64
+# GCN: GLOBAL_STORE_DWORD
+name: test_cvt_scalef32_sr_fp8_bf16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr5, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+# GCN-LABEL: name: test_cvt_scalef32_sr_fp8_f16_hazard
+# GCN: V_CVT_SCALEF32_SR_FP8_F16_e64
+# GCN: GLOBAL_STORE_DWORD
+name: test_cvt_scalef32_sr_fp8_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr5, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+# GCN-LABEL: name: test_cvt_scalef32_sr_fp8_f32_hazard
+# GCN: V_CVT_SCALEF32_SR_FP8_F32_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_sr_fp8_f32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_F32_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec
+ renamable $vgpr2 = V_ADD_U32_e32 4, killed $vgpr5, implicit $exec
+ GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_hazard
+# GCN: V_CVT_SCALEF32_PK_FP8_F32_e64
+# GCN: S_NOP 0
+# GCN: V_PK_ADD_U16
+name: test_cvt_scalef32_pk_fp8_f32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ S_WAITCNT 0
+ renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec
+ renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_FP8_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_hazard
+# GCN: V_CVT_SCALEF32_PK_FP8_F16_e64
+# GCN: S_NOP 0
+# GCN: V_PK_ADD_U16
+name: test_cvt_scalef32_pk_fp8_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ S_WAITCNT 0
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_FP8_F16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_hazard
+# GCN: V_CVT_SCALEF32_SR_BF8_BF16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_pk_fp8_bf16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_sr_bf8_f16_hazard
+# GCN: V_CVT_SCALEF32_SR_BF8_F16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_sr_bf8_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_sr_bf8_f32_hazard
+# GCN: V_CVT_SCALEF32_SR_BF8_F32_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_sr_bf8_f32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F32_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_hazard
+# GCN: V_CVT_SCALEF32_PK_BF8_F32_e64
+# GCN: S_NOP 0
+# GCN: V_PK_ADD_U16
+name: test_cvt_scalef32_pk_bf8_f32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ S_WAITCNT 0
+ renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec
+ renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_hazard
+# GCN: V_CVT_SCALEF32_PK_BF8_F16_e64
+# GCN: S_NOP 0
+# GCN: V_PK_ADD_U16
+name: test_cvt_scalef32_pk_bf8_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ S_WAITCNT 0
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_F16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_hazard
+# GCN: V_CVT_SCALEF32_PK_BF8_BF16_e64
+# GCN: S_NOP 0
+# GCN: V_PK_ADD_U16
+name: test_cvt_scalef32_pk_bf8_bf16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ S_WAITCNT 0
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_BF16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scale_fp4_f32_hazard
+# GCN: V_CVT_SCALEF32_PK_FP4_F32_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scale_fp4_f32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ S_WAITCNT 0
+ renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec
+ renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec
+ renamable $vgpr0 = V_CVT_SCALEF32_PK_FP4_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 4, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_scalef32_sr_pk_fp4_f16_hazard
+# GCN: V_CVT_SCALEF32_SR_PK_FP4_F16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_scalef32_sr_pk_fp4_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_scalef32_sr_pk_fp4_bf16_hazard
+# GCN: V_CVT_SCALEF32_SR_PK_FP4_BF16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_scalef32_sr_pk_fp4_bf16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_scalef32_sr_pk_fp4_f32_hazard
+# GCN: V_CVT_SCALEF32_SR_PK_FP4_F32_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_scalef32_sr_pk_fp4_f32_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 8, killed $vgpr2_vgpr3, 0, killed $vgpr4, 4, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_fp4_f16_hazard
+# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_fp4_f16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ S_WAITCNT 0
+ renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_fp4_bf16_hazard
+# GCN: V_CVT_SCALEF32_PK_FP4_BF16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_fp4_bf16_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ S_WAITCNT 0
+ renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_BF16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
+---
+# GCN-LABEL: test_cvt_scalef32_hazard_skipping_over_meta_instr
+# GCN: V_CVT_SCALEF32_SR_BF8_F16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_cvt_scalef32_hazard_skipping_over_meta_instr
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0...
[truncated]
|
2758ec2
to
ffac062
Compare
10d9189
to
986bb31
Compare
ffac062
to
c609205
Compare
gfx950 SP changes doc says: No 4 clk forwarding on opcodes that convert from F32/F16->F8 or F32/F16->F4. Must insert a NOP or instruction writing some other destination VREG after a conversion to F4/F8 since it writes either low/high half or bytes. Co-authored-by: Pravin Jagtap <[email protected]> Co-authored-by: Jeffrey Byrnes <[email protected]>
986bb31
to
e3cf2d7
Compare
gfx950 SP changes doc says:
No 4 clk forwarding on opcodes that convert from
F32/F16->F8 or F32/F16->F4. Must insert a NOP or
instruction writing some other destination VREG
after a conversion to F4/F8 since it writes either
low/high half or bytes.
Co-authored-by: Pravin Jagtap [email protected]
Co-authored-by: Jeffrey Byrnes [email protected]