1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64 3 4declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>) 5declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>) 6declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg) 7declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg) 8declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg) 9declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg) 10 11; The tests demonstrate that the following WMMA register constraints are satisfied. 12; 13; v_wmma D, A, B, C 14; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case). 15; 16; In each test, 17; - first wmma instruction: the dest register D is different than all the sources 18; - second wmma instruction: the dest register D and src2 (C) are the same 19 20 21; @llvm.amdgcn.wmma.f32.16x16x16.f16 22 23define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) { 24; W64-LABEL: test_wmma_f32_16x16x16_f16: 25; W64: ; %bb.0: ; %bb 26; W64-NEXT: v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] 27; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] 28; W64-NEXT: global_store_b128 v[20:21], v[24:27], off 29; W64-NEXT: global_store_b128 v[22:23], v[16:19], off 30; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 31; W64-NEXT: s_endpgm 32bb: 33 %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C) 34 %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %B, <16 x half> %B, <4 x float> %C) 35 store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 36 store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16 37 ret void 38} 39 40; @llvm.amdgcn.wmma.f32.16x16x16.bf16 41 42define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) { 43; W64-LABEL: test_wmma_f32_16x16x16_bf16: 44; W64: ; %bb.0: ; %bb 45; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] 46; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] 47; W64-NEXT: global_store_b128 v[20:21], v[24:27], off 48; W64-NEXT: global_store_b128 v[22:23], v[16:19], off 49; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 50; W64-NEXT: s_endpgm 51bb: 52 %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C) 53 %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <4 x float> %C) 54 store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16 55 store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16 56 ret void 57} 58 59; @llvm.amdgcn.wmma.f16.16x16x16.f16 60 61define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %out2) { 62; W64-LABEL: test_wmma_f16_16x16x16_f16_lo: 63; W64: ; %bb.0: ; %bb 64; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] 65; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] 66; W64-NEXT: global_store_b128 v[20:21], v[24:27], off 67; W64-NEXT: global_store_b128 v[22:23], v[16:19], off 68; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 69; W64-NEXT: s_endpgm 70bb: 71 %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0) 72 %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 0) 73 store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16 74 store <8 x half> %res2, <8 x half> addrspace(1)* %out2, align 16 75 ret void 76} 77 78define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %out2) { 79; W64-LABEL: test_wmma_f16_16x16x16_f16_hi: 80; W64: ; %bb.0: ; %bb 81; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] 82; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1] 83; W64-NEXT: global_store_b128 v[20:21], v[24:27], off 84; W64-NEXT: global_store_b128 v[22:23], v[16:19], off 85; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 86; W64-NEXT: s_endpgm 87bb: 88 %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1) 89 %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 1) 90 store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16 91 store <8 x half> %res2, <8 x half> addrspace(1)* %out2, align 16 92 ret void 93} 94 95; @llvm.amdgcn.wmma.bf16.16x16x16.bf16 96 97define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %out2) { 98; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo: 99; W64: ; %bb.0: ; %bb 100; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] 101; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] 102; W64-NEXT: global_store_b128 v[20:21], v[24:27], off 103; W64-NEXT: global_store_b128 v[22:23], v[16:19], off 104; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 105; W64-NEXT: s_endpgm 106bb: 107 %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0) 108 %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 0) 109 store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16 110 store <8 x i16> %res2, <8 x i16> addrspace(1)* %out2, align 16 111 ret void 112} 113 114define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %out2) { 115; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi: 116; W64: ; %bb.0: ; %bb 117; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] 118; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1] 119; W64-NEXT: global_store_b128 v[20:21], v[24:27], off 120; W64-NEXT: global_store_b128 v[22:23], v[16:19], off 121; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 122; W64-NEXT: s_endpgm 123bb: 124 %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1) 125 %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 1) 126 store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16 127 store <8 x i16> %res2, <8 x i16> addrspace(1)* %out2, align 16 128 ret void 129} 130 131; @llvm.amdgcn.wmma.i32.16x16x16.iu8 132 133define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 134; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned: 135; W64: ; %bb.0: ; %bb 136; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] 137; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] 138; W64-NEXT: global_store_b128 v[12:13], v[16:19], off 139; W64-NEXT: global_store_b128 v[14:15], v[8:11], off 140; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 141; W64-NEXT: s_endpgm 142bb: 143 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) 144 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) 145 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 146 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 147 ret void 148} 149 150 151define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 152; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed: 153; W64: ; %bb.0: ; %bb 154; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] 155; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] 156; W64-NEXT: global_store_b128 v[12:13], v[16:19], off 157; W64-NEXT: global_store_b128 v[14:15], v[8:11], off 158; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 159; W64-NEXT: s_endpgm 160bb: 161 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) 162 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) 163 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 164 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 165 ret void 166} 167 168define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 169; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned: 170; W64: ; %bb.0: ; %bb 171; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] 172; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] 173; W64-NEXT: global_store_b128 v[12:13], v[16:19], off 174; W64-NEXT: global_store_b128 v[14:15], v[8:11], off 175; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 176; W64-NEXT: s_endpgm 177bb: 178 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) 179 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0) 180 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 181 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 182 ret void 183} 184 185define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 186; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed: 187; W64: ; %bb.0: ; %bb 188; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] 189; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] 190; W64-NEXT: global_store_b128 v[12:13], v[16:19], off 191; W64-NEXT: global_store_b128 v[14:15], v[8:11], off 192; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 193; W64-NEXT: s_endpgm 194bb: 195 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) 196 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0) 197 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 198 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 199 ret void 200} 201 202define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 203; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp: 204; W64: ; %bb.0: ; %bb 205; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] clamp 206; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp 207; W64-NEXT: global_store_b128 v[12:13], v[16:19], off 208; W64-NEXT: global_store_b128 v[14:15], v[8:11], off 209; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 210; W64-NEXT: s_endpgm 211bb: 212 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) 213 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) 214 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 215 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 216 ret void 217} 218 219define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 220; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp: 221; W64: ; %bb.0: ; %bb 222; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp 223; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp 224; W64-NEXT: global_store_b128 v[12:13], v[16:19], off 225; W64-NEXT: global_store_b128 v[14:15], v[8:11], off 226; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 227; W64-NEXT: s_endpgm 228bb: 229 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) 230 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) 231 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 232 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 233 ret void 234} 235 236define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 237; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp: 238; W64: ; %bb.0: ; %bb 239; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp 240; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp 241; W64-NEXT: global_store_b128 v[12:13], v[16:19], off 242; W64-NEXT: global_store_b128 v[14:15], v[8:11], off 243; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 244; W64-NEXT: s_endpgm 245bb: 246 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) 247 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1) 248 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 249 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 250 ret void 251} 252 253define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 254; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp: 255; W64: ; %bb.0: ; %bb 256; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp 257; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp 258; W64-NEXT: global_store_b128 v[12:13], v[16:19], off 259; W64-NEXT: global_store_b128 v[14:15], v[8:11], off 260; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 261; W64-NEXT: s_endpgm 262bb: 263 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) 264 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1) 265 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 266 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 267 ret void 268} 269 270; @llvm.amdgcn.wmma.i32.16x16x16.iu4 271 272define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 273; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned: 274; W64: ; %bb.0: ; %bb 275; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] 276; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] 277; W64-NEXT: global_store_b128 v[8:9], v[12:15], off 278; W64-NEXT: global_store_b128 v[10:11], v[4:7], off 279; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 280; W64-NEXT: s_endpgm 281bb: 282 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) 283 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) 284 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 285 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 286 ret void 287} 288 289define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 290; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed: 291; W64: ; %bb.0: ; %bb 292; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] 293; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] 294; W64-NEXT: global_store_b128 v[8:9], v[12:15], off 295; W64-NEXT: global_store_b128 v[10:11], v[4:7], off 296; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 297; W64-NEXT: s_endpgm 298bb: 299 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) 300 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) 301 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 302 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 303 ret void 304} 305 306define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 307; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned: 308; W64: ; %bb.0: ; %bb 309; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] 310; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] 311; W64-NEXT: global_store_b128 v[8:9], v[12:15], off 312; W64-NEXT: global_store_b128 v[10:11], v[4:7], off 313; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 314; W64-NEXT: s_endpgm 315bb: 316 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) 317 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0) 318 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 319 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 320 ret void 321} 322 323define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 324; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed: 325; W64: ; %bb.0: ; %bb 326; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] 327; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] 328; W64-NEXT: global_store_b128 v[8:9], v[12:15], off 329; W64-NEXT: global_store_b128 v[10:11], v[4:7], off 330; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 331; W64-NEXT: s_endpgm 332bb: 333 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) 334 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0) 335 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 336 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 337 ret void 338} 339 340define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 341; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp: 342; W64: ; %bb.0: ; %bb 343; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] clamp 344; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp 345; W64-NEXT: global_store_b128 v[8:9], v[12:15], off 346; W64-NEXT: global_store_b128 v[10:11], v[4:7], off 347; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 348; W64-NEXT: s_endpgm 349bb: 350 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) 351 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) 352 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 353 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 354 ret void 355} 356 357define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 358; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp: 359; W64: ; %bb.0: ; %bb 360; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp 361; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp 362; W64-NEXT: global_store_b128 v[8:9], v[12:15], off 363; W64-NEXT: global_store_b128 v[10:11], v[4:7], off 364; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 365; W64-NEXT: s_endpgm 366bb: 367 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) 368 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) 369 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 370 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 371 ret void 372} 373 374define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 375; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp: 376; W64: ; %bb.0: ; %bb 377; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp 378; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp 379; W64-NEXT: global_store_b128 v[8:9], v[12:15], off 380; W64-NEXT: global_store_b128 v[10:11], v[4:7], off 381; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 382; W64-NEXT: s_endpgm 383bb: 384 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) 385 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1) 386 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 387 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 388 ret void 389} 390 391define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) { 392; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp: 393; W64: ; %bb.0: ; %bb 394; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp 395; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp 396; W64-NEXT: global_store_b128 v[8:9], v[12:15], off 397; W64-NEXT: global_store_b128 v[10:11], v[4:7], off 398; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 399; W64-NEXT: s_endpgm 400bb: 401 %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) 402 %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1) 403 store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16 404 store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16 405 ret void 406} 407 408