; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64

declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
; The tests demonstrate that the following WMMA register constraints are satisfied.
;
; v_wmma D, A, B, C
; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be identical (the typical case).
;
; In each test,
;   - first wmma instruction: the dest register D differs from all the sources
;   - second wmma instruction: the dest register D and src2 (C) are the same
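;
; For example, in the f32.16x16x16.f16 test below, the first wmma writes
; v[24:27] while its sources live in v[0:7], v[8:15] and v[16:19], so D
; overlaps none of them; the second wmma reuses v[16:19] as both C and D.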


; @llvm.amdgcn.wmma.f32.16x16x16.f16

define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_f32_16x16x16_f16:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %B, <16 x half> %B, <4 x float> %C)
  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
  store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.f32.16x16x16.bf16

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_f32_16x16x16_bf16:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <4 x float> %C)
  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
  store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.f16.16x16x16.f16
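;
; The trailing i1 immarg selects which half of the packed <8 x half>
; accumulator registers is used; in the _hi test it is expected to lower
; to the op_sel:[0,0,1] modifier shown in the checks.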

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
  %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 0)
  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
  store <8 x half> %res2, <8 x half> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
  %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 1)
  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
  store <8 x half> %res2, <8 x half> addrspace(1)* %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
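;
; As with the f16 variant, the trailing i1 immarg picks the lo/hi half of
; the packed accumulator and appears as op_sel:[0,0,1] in the _hi test.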

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
  %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 0)
  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
  store <8 x i16> %res2, <8 x i16> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
  %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 1)
  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
  store <8 x i16> %res2, <8 x i16> addrspace(1)* %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.i32.16x16x16.iu8
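;
; The i1 immargs preceding A and B mark each operand as signed (appearing
; as the corresponding neg_lo:[x,y,0] bits in the checks), and the final
; i1 requests clamping of the i32 result (the clamp modifier).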

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11]
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11]
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

; @llvm.amdgcn.wmma.i32.16x16x16.iu4
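;
; Same immarg layout as the iu8 variant, with the i4 inputs packed into
; <2 x i32> operands.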

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7]
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7]
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
; W64:       ; %bb.0: ; %bb
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT:    s_endpgm
bb:
  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
  ret void
}