; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32

; Declarations of the WMMA intrinsics exercised by the tests in this file.
; The i1 operands are all marked immarg, so every call below passes them as
; literal constants.
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)

; The tests demonstrate that the following WMMA register constraints are satisfied.
;
; v_wmma D, A, B, C
; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case).
;
; In each test,
; - first wmma instruction: the dest register D is different from all the sources
; - second wmma instruction: the dest register D and src2 (C) are the same
;
; Shape shared by every test function below:
; - %res  = wmma(%A, %B, %C), stored to %out
; - %res2 = wmma(%B, %B, %C), stored to %out2
; In the expected assembly the first instruction writes a register range that
; none of its sources use, and the second instruction reuses the range holding
; %C as its destination.
;
; The expected-output lines are produced by the script named on the first line
; of this file; regenerate them with that script instead of editing by hand.


; @llvm.amdgcn.wmma.f32.16x16x16.f16

; <16 x half> A/B operands, <8 x float> C operand and result.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_f32_16x16x16_f16:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f32_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
  %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x float> %C)
  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
  store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
  ret void
}

; @llvm.amdgcn.wmma.f32.16x16x16.bf16

; <16 x i16> A/B operands, <8 x float> C operand and result.
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_f32_16x16x16_bf16:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
  %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x float> %C)
  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
  store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
  ret void
}

; @llvm.amdgcn.wmma.f16.16x16x16.f16

; Trailing i1 immarg is 0: the expected instructions carry no op_sel modifier.
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
  %res2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <16 x half> %C, i1 0)
  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
  store <16 x half> %res2, <16 x half> addrspace(1)* %out2, align 32
  ret void
}

; Trailing i1 immarg is 1: the expected instructions carry op_sel:[0,0,1].
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
  %res2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <16 x half> %C, i1 1)
  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
  store <16 x half> %res2, <16 x half> addrspace(1)* %out2, align 32
  ret void
}

; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

; Trailing i1 immarg is 0: the expected instructions carry no op_sel modifier.
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
  %res2 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <16 x i16> %C, i1 0)
  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
  store <16 x i16> %res2, <16 x i16> addrspace(1)* %out2, align 32
  ret void
}

; Trailing i1 immarg is 1: the expected instructions carry op_sel:[0,0,1].
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
  %res2 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <16 x i16> %C, i1 1)
  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
  store <16 x i16> %res2, <16 x i16> addrspace(1)* %out2, align 32
  ret void
}

; @llvm.amdgcn.wmma.i32.16x16x16.iu8
;
; How the three i1 immediates show up in the expected assembly of the tests in
; this section:
; - i1 before the A operand set to 1 -> first element of neg_lo is 1
; - i1 before the B operand set to 1 -> second element of neg_lo is 1
; - trailing i1 set to 1             -> "clamp" is appended
; The test names call an operand "signed" when its i1 flag is 1 and "unsigned"
; when it is 0 (A first, then B).
; NOTE(review): the function names spell "ui8" while the intrinsic is "iu8";
; the names are left untouched because the labels are matched by the checks.

; A flag 0, B flag 0, clamp 0: no modifiers expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15]
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 0, B flag 1, clamp 0: neg_lo:[0,1,0] expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 1, B flag 0, clamp 0: neg_lo:[1,0,0] expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 1, B flag 1, clamp 0: neg_lo:[1,1,0] expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 0, B flag 0, clamp 1: only "clamp" expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 0, B flag 1, clamp 1: neg_lo:[0,1,0] clamp expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 1, B flag 0, clamp 1: neg_lo:[1,0,0] clamp expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 1, B flag 1, clamp 1: neg_lo:[1,1,0] clamp expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
; W32-NEXT: global_store_b128 v[16:17], v[20:23], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; @llvm.amdgcn.wmma.i32.16x16x16.iu4
;
; Same eight flag combinations as the iu8 section, with <2 x i32> A/B operands.
; In the expected assembly, an i1 of 1 before A or B sets the first or second
; neg_lo element respectively, and a trailing i1 of 1 appends "clamp".
; NOTE(review): the function names spell "ui4" while the intrinsic is "iu4";
; the names are left untouched because the labels are matched by the checks.

; A flag 0, B flag 0, clamp 0: no modifiers expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11]
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 0, B flag 1, clamp 0: neg_lo:[0,1,0] expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 1, B flag 0, clamp 0: neg_lo:[1,0,0] expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 1, B flag 1, clamp 0: neg_lo:[1,1,0] expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0]
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}


; A flag 0, B flag 0, clamp 1: only "clamp" expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 0, B flag 1, clamp 1: neg_lo:[0,1,0] clamp expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 1, B flag 0, clamp 1: neg_lo:[1,0,0] clamp expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}

; A flag 1, B flag 1, clamp 1: neg_lo:[1,1,0] clamp expected.
define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
; W32: ; %bb.0: ; %bb
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[12:13], v[16:19], off
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
  ret void
}