Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU: Add v_mfma_i32_16x16x64_i8 for gfx950 #116728

Merged
merged 1 commit into from
Nov 21, 2024

Conversation

arsenm
Copy link
Contributor

@arsenm arsenm commented Nov 19, 2024

No description provided.

@llvmbot
Copy link
Member

llvmbot commented Nov 19, 2024

@llvm/pr-subscribers-mc
@llvm/pr-subscribers-clang
@llvm/pr-subscribers-llvm-analysis

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Patch is 33.44 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116728.diff

13 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+1-1)
  • (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl (+6)
  • (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl (+7)
  • (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl (+4-1)
  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+2-1)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1)
  • (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+7-1)
  • (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+8)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll (+285)
  • (modified) llvm/test/MC/AMDGPU/mai-gfx950.s (+55)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt (+34)
  • (modified) llvm/test/tools/llvm-mca/AMDGPU/gfx950.s (+9-3)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 7ce8f2c1669d67..be5e7928b48fcb 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -436,7 +436,7 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion-
 //===----------------------------------------------------------------------===//
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts")
-
+TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x64_i8, "V4iV4iV4iV4iIiIiIi", "nc", "gfx950-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index 841d8fcad0fee0..77d0e464ae5d2e 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -431,4 +431,10 @@ v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
   return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
 }
 
+// CHECK-GFX950-LABEL: @test_mfma_i32_16x16x64_i8(
+// CHECK-GFX950: tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 1, i32 2, i32 3)
+v4i test_mfma_i32_16x16x64_i8(v4i a, v4i b, v4i c) {
+  return __builtin_amdgcn_mfma_i32_16x16x64_i8(a, b, c, 1, 2, 3);
+}
+
 #endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 4af67763c40dd2..bb805b56e10e3a 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -5,6 +5,7 @@ typedef float float4 __attribute__((ext_vector_type(4)));
 typedef float float16 __attribute__((ext_vector_type(16)));
 typedef half half8 __attribute__((ext_vector_type(8)));
 typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
+typedef int int4 __attribute__((ext_vector_type(4)));
 
 
 void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {
@@ -26,3 +27,9 @@ void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, fl
   *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0);  // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
   *out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X);  // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
 }
+
+void test_mfma_i32_16x16x64_i8(__global int4* out, int4 a, int4 b, int4 c, int X) {
+  *out = __builtin_amdgcn_mfma_i32_16x16x64_i8(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x64_i8' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_i32_16x16x64_i8(a, b, c, 0, X, 0);  // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x64_i8' must be a constant integer}}
+  *out = __builtin_amdgcn_mfma_i32_16x16x64_i8(a, b, c, 0, 0, X);  // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x64_i8' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index e0fd2aa5c58a02..3553fc3e4a49c6 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -5,11 +5,14 @@ typedef float float4 __attribute__((ext_vector_type(4)));
 typedef float float16 __attribute__((ext_vector_type(16)));
 typedef half half8 __attribute__((ext_vector_type(8)));
 typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
+typedef int int4 __attribute__((ext_vector_type(4)));
 
 void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
           __global float16* out1, half8 a1, half8 b1, float16 c1,
-          __global float16* out2, bfloat8 a2, bfloat8 b2, float16 c2) {
+          __global float16* out2, bfloat8 a2, bfloat8 b2, float16 c2,
+          __global int4* out3, int4 a3, int4 b3, int4 c3) {
   *out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}}
   *out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
   *out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
+  *out3 = __builtin_amdgcn_mfma_i32_16x16x64_i8(a3, b3, c3, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_16x16x64_i8' needs target feature gfx950-insts}}
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 596dc3c9656244..98781ed55dc47c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3138,6 +3138,7 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
 defset list<Intrinsic> AMDGPUMFMAIntrinsics950 = {
 def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty>;
 def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
+def int_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty>;
 
 def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
 def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty, llvm_v8i32_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 0467d26bd3093d..9bb4e5be6299b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4749,7 +4749,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
     case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
     case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
-    case Intrinsic::amdgcn_mfma_f32_32x32x16_f16: {
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
+    case Intrinsic::amdgcn_mfma_i32_16x16x64_i8: {
       // Default for MAI intrinsics.
       // srcC can also be an immediate which can be folded later.
       // FIXME: Should we eventually add an alternative mapping with AGPR src
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index cedbfc97d2c6be..ef5b14f90c7983 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2880,6 +2880,7 @@ def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
 def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
 def VOP_V4F32_V8I32_V8I32_V4F32   : VOPProfile <[v4f32,  v8i32, v8i32, v4f32]>;
 def VOP_V16F32_V8I32_V8I32_V16F32   : VOPProfile <[v16f32, v8i32, v8i32, v16f32]>;
+def VOP_V4I32_V4I32_V4I32_V4I32     : VOPProfile <[v4i32,  v4i32, v4i32, v4i32]>;
 
 
 class Commutable_REV <string revOp, bit isOrig> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index f38d3f35cc8429..d1427a45f0e83d 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -647,6 +647,11 @@ def VOPProfileMAI_F32_V8I32_X128_VCD : VOPProfileMAI<VOP_V4F32_V8I32_V8I32_V4F32
 def VOPProfileMAI_F32_V8I32_X512     : VOPProfileMAI<VOP_V16F32_V8I32_V8I32_V16F32,    AISrc_512_f32,  ADst_512, AVSrc_256>;
 def VOPProfileMAI_F32_V8I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V8I32_V8I32_V16F32,    VISrc_512_f32,  VDst_512, AVSrc_256>;
 
+// For i32_16x16x64_i8
+def VOPProfileMAI_I32_V4I32_X128     : VOPProfileMAI<VOP_V4I32_V4I32_V4I32_V4I32,    AISrc_128_f32,  ADst_128, AVSrc_128>;
+def VOPProfileMAI_I32_V4I32_X128_VCD : VOPProfileMAI<VOP_V4I32_V4I32_V4I32_V4I32,    VISrc_128_f32,  VDst_128, AVSrc_128>;
+
+
 class MFMATable <bit is_mac, string Name> {
   bit IsMac = is_mac;
   string FMAOp = Name;
@@ -815,6 +820,7 @@ defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16",
 let SubtargetPredicate = HasGFX950Insts, is_gfx940_xdl = 1 in {
 defm V_MFMA_F32_16X16X32_F16   : MAIInst<"v_mfma_f32_16x16x32f16",    "F32_V8F16_X32", int_amdgcn_mfma_f32_16x16x32_f16>;
 defm V_MFMA_F32_32X32X16_F16   : MAIInst<"v_mfma_f32_32x32x16f16",    "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
+defm V_MFMA_I32_16X16X64_I8    : MAIInst<"v_mfma_i32_16x16x64i8",     "I32_V4I32_X128", int_amdgcn_mfma_i32_16x16x64_i8>;
 defm V_MFMA_F32_32X32X16_BF16  : MAIInst<"v_mfma_f32_32x32x16bf16",   "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
 
 defm V_MFMA_F32_16X16X128_F8F6F4 : MAIInst<"v_mfma_f32_16x16x128f8f6f4",
@@ -946,7 +952,6 @@ def VOP_V4F32_V16F16_V16F16_V4F32 : VOPProfile <[v4f32, v16f16, v16f16, v4f32]>;
 def VOP_V4F32_V16I16_V16I16_V4F32 : VOPProfile <[v4f32, v16i16, v16i16, v4f32]>;
 def VOP_V8F16_V16F16_V16F16_V8F16 : VOPProfile <[v8f16, v16f16, v16f16, v8f16]>;
 def VOP_V8I16_V16I16_V16I16_V8I16 : VOPProfile <[v8i16, v16i16, v16i16, v8i16]>;
-def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>;
 def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>;
 
 
@@ -1888,6 +1893,7 @@ defm V_MFMA_F64_4X4X4F64        : VOP3P_Real_MFMA_gfx90a <0x6f>;
 
 defm V_MFMA_F32_16X16X32_F16     : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x16x32_f16">;
 defm V_MFMA_F32_32X32X16_F16     : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
+defm V_MFMA_I32_16X16X64_I8      : VOP3P_Real_MFMA_gfx950 <0x36, "v_mfma_i32_16x16x64_i8">;
 defm V_MFMA_F32_32X32X16_BF16    : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
 
 defm V_MFMA_F32_16X16X128_F8F6F4 : VOP3P_Real_MFMA_gfx950 <0x2d, "v_mfma_f32_16x16x128_f8f6f4">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index a8684a55ab5c2d..9a6eb32e7e8780 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -305,6 +305,14 @@ define amdgpu_kernel void @mfma_f32_scale_32x32x64_f8f6f4(<8 x i32> %arg0, <8 x
   ret void
 }
 
+declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32>, <4 x i32>, <4 x i32>, i32 immarg, i32 immarg, i32 immarg)
+
+; CHECK: DIVERGENT: %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, ptr addrspace(1) %out) {
+  %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+  store <4 x i32> %result, ptr addrspace(1) %out
+  ret void
+}
 
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 88d04e9fb428a2..555756d3fdf3f8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -272,3 +272,288 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__flags__mac(<16 x float> %arg2,
   %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1)
   ret <16 x float> %result
 }
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
+; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_load_dwordx8 s[20:27], s[0:1], 0x24
+; SDAG-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[24:25]
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[26:27]
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[16:17]
+; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[18:19]
+; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xa4
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_load_dwordx8 s[20:27], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[26:27]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[18:19]
+; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xa4
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT:    s_endpgm
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
+  store <16 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) #0 {
+; SDAG-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_load_dwordx8 s[20:27], s[0:1], 0x24
+; SDAG-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[24:25]
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[26:27]
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[16:17]
+; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[18:19]
+; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xa4
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_load_dwordx8 s[20:27], s[0:1], 0x24
+; GISEL-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[26:27]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[18:19]
+; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xa4
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT:    s_endpgm
+  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1)
+  store <16 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.i32.16x16x64.i8
+; --------------------------------------------------------------------
+
+declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32>, <4 x i32>, <4 x i32>, i32 immarg, i32 immarg, i32 immarg)
+
+define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) {
+; GCN-LABEL: test_mfma_i32_16x16x64_i8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0)
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) {
+; GCN-LABEL: test_mfma_i32_16x16x64_i8__flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 1, i32 1, i32 1)
+  ret <4 x i32...
[truncated]

Copy link
Contributor Author

arsenm commented Nov 19, 2024

This stack of pull requests is managed by Graphite. Learn more about stacking.

This was referenced Nov 25, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" clang Clang issues not falling into any other category llvm:analysis llvm:ir mc Machine (object) code
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants