1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
3
4declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
5declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
6declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
7declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
8declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
9declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
10
11; The tests demonstrate that the following WMMA register constraints are satisfied.
12;
13; v_wmma D, A, B, C
14; A and B cannot overlap with D. C cannot partially overlap with D, but it is OK for them to be the same (which is a typical case).
15;
16; In each test,
17;   - first wmma instruction: the dest register D is different from all the sources
18;   - second wmma instruction: the dest register D and src2 (C) are the same (%B is passed as both A and B)
19
20
21; @llvm.amdgcn.wmma.f32.16x16x16.f16
22
; Exercises the f32.16x16x16.f16 WMMA intrinsic twice: once with (%A, %B, %C)
; and once with %B used for both matrix inputs. In the expected output the
; first result lands in a fresh range (v[28:35]) while the second is written
; in place over the accumulator registers (v[16:23]).
23define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
24; W32-LABEL: test_wmma_f32_16x16x16_f16:
25; W32:       ; %bb.0: ; %bb
26; W32-NEXT:    v_wmma_f32_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
27; W32-NEXT:    v_wmma_f32_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
28; W32-NEXT:    s_clause 0x1
29; W32-NEXT:    global_store_b128 v[24:25], v[32:35], off offset:16
30; W32-NEXT:    global_store_b128 v[24:25], v[28:31], off
31; W32-NEXT:    s_clause 0x1
32; W32-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
33; W32-NEXT:    global_store_b128 v[26:27], v[16:19], off
34; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
35; W32-NEXT:    s_endpgm
36bb:
37  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
38  %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x float> %C)
; Both results are stored so neither call is dead; %C is live across the first call.
39  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
40  store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
41  ret void
42}
43
44; @llvm.amdgcn.wmma.f32.16x16x16.bf16
45
; Exercises the f32.16x16x16.bf16 WMMA intrinsic (matrix inputs are passed as
; <16 x i16>) twice: with (%A, %B, %C) and with (%B, %B, %C). The expected
; output puts the first result in v[28:35] and writes the second in place over
; the accumulator registers v[16:23].
46define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
47; W32-LABEL: test_wmma_f32_16x16x16_bf16:
48; W32:       ; %bb.0: ; %bb
49; W32-NEXT:    v_wmma_f32_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
50; W32-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]
51; W32-NEXT:    s_clause 0x1
52; W32-NEXT:    global_store_b128 v[24:25], v[32:35], off offset:16
53; W32-NEXT:    global_store_b128 v[24:25], v[28:31], off
54; W32-NEXT:    s_clause 0x1
55; W32-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
56; W32-NEXT:    global_store_b128 v[26:27], v[16:19], off
57; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
58; W32-NEXT:    s_endpgm
59bb:
60  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
61  %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x float> %C)
; Both results are stored so neither call is dead; %C is live across the first call.
62  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
63  store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
64  ret void
65}
66
67; @llvm.amdgcn.wmma.f16.16x16x16.f16
68
; Exercises the f16.16x16x16.f16 WMMA intrinsic with the trailing i1 immediate
; set to 0; the expected instructions carry no op_sel modifier. The calls use
; (%A, %B, %C) and (%B, %B, %C); the first result is expected in v[28:35], the
; second in place over the accumulator registers v[16:23].
; NOTE(review): the "_lo" suffix presumably refers to which half of each
; 32-bit lane holds the f16 result - confirm against the intrinsic docs.
69define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %out2) {
70; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
71; W32:       ; %bb.0: ; %bb
72; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
73; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
74; W32-NEXT:    s_clause 0x1
75; W32-NEXT:    global_store_b128 v[24:25], v[32:35], off offset:16
76; W32-NEXT:    global_store_b128 v[24:25], v[28:31], off
77; W32-NEXT:    s_clause 0x1
78; W32-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
79; W32-NEXT:    global_store_b128 v[26:27], v[16:19], off
80; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
81; W32-NEXT:    s_endpgm
82bb:
83  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
84  %res2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <16 x half> %C, i1 0)
85  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
86  store <16 x half> %res2, <16 x half> addrspace(1)* %out2, align 32
87  ret void
88}
89
; Same shape as the "_lo" f16 test but with the trailing i1 immediate set to 1,
; which shows up as op_sel:[0,0,1] on both expected instructions. Register
; expectations are unchanged: first result in v[28:35], second in place over
; the accumulator registers v[16:23].
90define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %out2) {
91; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
92; W32:       ; %bb.0: ; %bb
93; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
94; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
95; W32-NEXT:    s_clause 0x1
96; W32-NEXT:    global_store_b128 v[24:25], v[32:35], off offset:16
97; W32-NEXT:    global_store_b128 v[24:25], v[28:31], off
98; W32-NEXT:    s_clause 0x1
99; W32-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
100; W32-NEXT:    global_store_b128 v[26:27], v[16:19], off
101; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
102; W32-NEXT:    s_endpgm
103bb:
104  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
105  %res2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <16 x half> %C, i1 1)
106  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
107  store <16 x half> %res2, <16 x half> addrspace(1)* %out2, align 32
108  ret void
109}
110
111; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
112
; Exercises the bf16.16x16x16.bf16 WMMA intrinsic (all matrices passed as
; <16 x i16>) with the trailing i1 immediate set to 0; the expected
; instructions carry no op_sel modifier. Calls use (%A, %B, %C) and
; (%B, %B, %C); the first result is expected in v[28:35], the second in place
; over the accumulator registers v[16:23].
113define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %out2) {
114; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
115; W32:       ; %bb.0: ; %bb
116; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
117; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23]
118; W32-NEXT:    s_clause 0x1
119; W32-NEXT:    global_store_b128 v[24:25], v[32:35], off offset:16
120; W32-NEXT:    global_store_b128 v[24:25], v[28:31], off
121; W32-NEXT:    s_clause 0x1
122; W32-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
123; W32-NEXT:    global_store_b128 v[26:27], v[16:19], off
124; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
125; W32-NEXT:    s_endpgm
126bb:
127  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
128  %res2 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <16 x i16> %C, i1 0)
129  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
130  store <16 x i16> %res2, <16 x i16> addrspace(1)* %out2, align 32
131  ret void
132}
133
; Same shape as the "_lo" bf16 test but with the trailing i1 immediate set to
; 1, which shows up as op_sel:[0,0,1] on both expected instructions. Register
; expectations are unchanged: first result in v[28:35], second in place over
; the accumulator registers v[16:23].
134define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %out2) {
135; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
136; W32:       ; %bb.0: ; %bb
137; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
138; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
139; W32-NEXT:    s_clause 0x1
140; W32-NEXT:    global_store_b128 v[24:25], v[32:35], off offset:16
141; W32-NEXT:    global_store_b128 v[24:25], v[28:31], off
142; W32-NEXT:    s_clause 0x1
143; W32-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
144; W32-NEXT:    global_store_b128 v[26:27], v[16:19], off
145; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
146; W32-NEXT:    s_endpgm
147bb:
148  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
149  %res2 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <16 x i16> %C, i1 1)
150  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
151  store <16 x i16> %res2, <16 x i16> addrspace(1)* %out2, align 32
152  ret void
153}
154
155; @llvm.amdgcn.wmma.i32.16x16x16.iu8
156
; iu8 WMMA with all three i1 immediates (before %A, before %B, and trailing)
; set to 0; the expected instructions carry no neg_lo or clamp modifier.
; Calls use (%A, %B, %C) and (%B, %B, %C); the first result is expected in
; v[20:27], the second in place over the accumulator registers v[8:15].
; NOTE(review): per the test name the first two flags select signed vs.
; unsigned inputs - confirm against the intrinsic definition.
157define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
158; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
159; W32:       ; %bb.0: ; %bb
160; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15]
161; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15]
162; W32-NEXT:    s_clause 0x1
163; W32-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
164; W32-NEXT:    global_store_b128 v[16:17], v[20:23], off
165; W32-NEXT:    s_clause 0x1
166; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
167; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
168; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
169; W32-NEXT:    s_endpgm
170bb:
171  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
172  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
173  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
174  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
175  ret void
176}
177
; iu8 WMMA with the i1 before %A = 0, the i1 before %B = 1, trailing i1 = 0.
; The B-side flag appears as neg_lo:[0,1,0] on both expected instructions.
; First result is expected in v[20:27], the second in place over v[8:15].
178define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
179; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
180; W32:       ; %bb.0: ; %bb
181; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
182; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0]
183; W32-NEXT:    s_clause 0x1
184; W32-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
185; W32-NEXT:    global_store_b128 v[16:17], v[20:23], off
186; W32-NEXT:    s_clause 0x1
187; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
188; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
189; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
190; W32-NEXT:    s_endpgm
191bb:
192  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
193  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
194  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
195  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
196  ret void
197}
198
; iu8 WMMA with the i1 before %A = 1, the i1 before %B = 0, trailing i1 = 0.
; The A-side flag appears as neg_lo:[1,0,0] on both expected instructions.
; First result is expected in v[20:27], the second in place over v[8:15].
199define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
200; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
201; W32:       ; %bb.0: ; %bb
202; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
203; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0]
204; W32-NEXT:    s_clause 0x1
205; W32-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
206; W32-NEXT:    global_store_b128 v[16:17], v[20:23], off
207; W32-NEXT:    s_clause 0x1
208; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
209; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
210; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
211; W32-NEXT:    s_endpgm
212bb:
213  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
214  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
215  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
216  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
217  ret void
218}
219
; iu8 WMMA with the i1 flags before %A and before %B both 1, trailing i1 = 0.
; Both flags appear as neg_lo:[1,1,0] on the expected instructions.
; First result is expected in v[20:27], the second in place over v[8:15].
220define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
221; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
222; W32:       ; %bb.0: ; %bb
223; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0]
224; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0]
225; W32-NEXT:    s_clause 0x1
226; W32-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
227; W32-NEXT:    global_store_b128 v[16:17], v[20:23], off
228; W32-NEXT:    s_clause 0x1
229; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
230; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
231; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
232; W32-NEXT:    s_endpgm
233bb:
234  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
235  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
236  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
237  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
238  ret void
239}
240
; iu8 WMMA with the i1 flags before %A and %B both 0 and the trailing i1 = 1.
; The trailing flag appears as the "clamp" modifier on both expected
; instructions; no neg_lo modifier is expected.
; First result is expected in v[20:27], the second in place over v[8:15].
241define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
242; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
243; W32:       ; %bb.0: ; %bb
244; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] clamp
245; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] clamp
246; W32-NEXT:    s_clause 0x1
247; W32-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
248; W32-NEXT:    global_store_b128 v[16:17], v[20:23], off
249; W32-NEXT:    s_clause 0x1
250; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
251; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
252; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
253; W32-NEXT:    s_endpgm
254bb:
255  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
256  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
257  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
258  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
259  ret void
260}
261
; iu8 WMMA with the i1 before %A = 0, the i1 before %B = 1, trailing i1 = 1.
; Expected modifiers on both instructions: neg_lo:[0,1,0] followed by clamp.
; First result is expected in v[20:27], the second in place over v[8:15].
262define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
263; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
264; W32:       ; %bb.0: ; %bb
265; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
266; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
267; W32-NEXT:    s_clause 0x1
268; W32-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
269; W32-NEXT:    global_store_b128 v[16:17], v[20:23], off
270; W32-NEXT:    s_clause 0x1
271; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
272; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
273; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
274; W32-NEXT:    s_endpgm
275bb:
276  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
277  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
278  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
279  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
280  ret void
281}
282
; iu8 WMMA with the i1 before %A = 1, the i1 before %B = 0, trailing i1 = 1.
; Expected modifiers on both instructions: neg_lo:[1,0,0] followed by clamp.
; First result is expected in v[20:27], the second in place over v[8:15].
283define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
284; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
285; W32:       ; %bb.0: ; %bb
286; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
287; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
288; W32-NEXT:    s_clause 0x1
289; W32-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
290; W32-NEXT:    global_store_b128 v[16:17], v[20:23], off
291; W32-NEXT:    s_clause 0x1
292; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
293; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
294; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
295; W32-NEXT:    s_endpgm
296bb:
297  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
298  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
299  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
300  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
301  ret void
302}
303
; iu8 WMMA with all three i1 immediates (before %A, before %B, trailing) = 1.
; Expected modifiers on both instructions: neg_lo:[1,1,0] followed by clamp.
; First result is expected in v[20:27], the second in place over v[8:15].
304define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
305; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
306; W32:       ; %bb.0: ; %bb
307; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
308; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
309; W32-NEXT:    s_clause 0x1
310; W32-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
311; W32-NEXT:    global_store_b128 v[16:17], v[20:23], off
312; W32-NEXT:    s_clause 0x1
313; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
314; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
315; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
316; W32-NEXT:    s_endpgm
317bb:
318  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
319  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
320  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
321  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
322  ret void
323}
324
325; @llvm.amdgcn.wmma.i32.16x16x16.iu4
326
; iu4 WMMA (matrix inputs are <2 x i32>) with all three i1 immediates set to
; 0; the expected instructions carry no neg_lo or clamp modifier.
; Calls use (%A, %B, %C) and (%B, %B, %C); the first result is expected in
; v[16:23], the second in place over the accumulator registers v[4:11].
; NOTE(review): per the test name the first two flags select signed vs.
; unsigned inputs - confirm against the intrinsic definition.
327define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
328; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
329; W32:       ; %bb.0: ; %bb
330; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11]
331; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11]
332; W32-NEXT:    s_clause 0x1
333; W32-NEXT:    global_store_b128 v[12:13], v[20:23], off offset:16
334; W32-NEXT:    global_store_b128 v[12:13], v[16:19], off
335; W32-NEXT:    s_clause 0x1
336; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
337; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
338; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
339; W32-NEXT:    s_endpgm
340bb:
341  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
342  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
343  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
344  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
345  ret void
346}
347
; iu4 WMMA with the i1 before %A = 0, the i1 before %B = 1, trailing i1 = 0.
; The B-side flag appears as neg_lo:[0,1,0] on both expected instructions.
; First result is expected in v[16:23], the second in place over v[4:11].
348define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
349; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
350; W32:       ; %bb.0: ; %bb
351; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
352; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0]
353; W32-NEXT:    s_clause 0x1
354; W32-NEXT:    global_store_b128 v[12:13], v[20:23], off offset:16
355; W32-NEXT:    global_store_b128 v[12:13], v[16:19], off
356; W32-NEXT:    s_clause 0x1
357; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
358; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
359; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
360; W32-NEXT:    s_endpgm
361bb:
362  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
363  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
364  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
365  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
366  ret void
367}
368
; iu4 WMMA with the i1 before %A = 1, the i1 before %B = 0, trailing i1 = 0.
; The A-side flag appears as neg_lo:[1,0,0] on both expected instructions.
; First result is expected in v[16:23], the second in place over v[4:11].
369define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
370; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
371; W32:       ; %bb.0: ; %bb
372; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
373; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0]
374; W32-NEXT:    s_clause 0x1
375; W32-NEXT:    global_store_b128 v[12:13], v[20:23], off offset:16
376; W32-NEXT:    global_store_b128 v[12:13], v[16:19], off
377; W32-NEXT:    s_clause 0x1
378; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
379; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
380; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
381; W32-NEXT:    s_endpgm
382bb:
383  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
384  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
385  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
386  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
387  ret void
388}
389
; iu4 WMMA with the i1 flags before %A and before %B both 1, trailing i1 = 0.
; Both flags appear as neg_lo:[1,1,0] on the expected instructions.
; First result is expected in v[16:23], the second in place over v[4:11].
390define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
391; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
392; W32:       ; %bb.0: ; %bb
393; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0]
394; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0]
395; W32-NEXT:    s_clause 0x1
396; W32-NEXT:    global_store_b128 v[12:13], v[20:23], off offset:16
397; W32-NEXT:    global_store_b128 v[12:13], v[16:19], off
398; W32-NEXT:    s_clause 0x1
399; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
400; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
401; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
402; W32-NEXT:    s_endpgm
403bb:
404  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
405  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
406  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
407  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
408  ret void
409}
410
411
; iu4 WMMA with the i1 flags before %A and %B both 0 and the trailing i1 = 1.
; The trailing flag appears as the "clamp" modifier on both expected
; instructions; no neg_lo modifier is expected.
; First result is expected in v[16:23], the second in place over v[4:11].
412define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
413; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
414; W32:       ; %bb.0: ; %bb
415; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] clamp
416; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] clamp
417; W32-NEXT:    s_clause 0x1
418; W32-NEXT:    global_store_b128 v[12:13], v[20:23], off offset:16
419; W32-NEXT:    global_store_b128 v[12:13], v[16:19], off
420; W32-NEXT:    s_clause 0x1
421; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
422; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
423; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
424; W32-NEXT:    s_endpgm
425bb:
426  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
427  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
428  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
429  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
430  ret void
431}
432
; iu4 WMMA with the i1 before %A = 0, the i1 before %B = 1, trailing i1 = 1.
; Expected modifiers on both instructions: neg_lo:[0,1,0] followed by clamp.
; First result is expected in v[16:23], the second in place over v[4:11].
433define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
434; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
435; W32:       ; %bb.0: ; %bb
436; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
437; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
438; W32-NEXT:    s_clause 0x1
439; W32-NEXT:    global_store_b128 v[12:13], v[20:23], off offset:16
440; W32-NEXT:    global_store_b128 v[12:13], v[16:19], off
441; W32-NEXT:    s_clause 0x1
442; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
443; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
444; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
445; W32-NEXT:    s_endpgm
446bb:
447  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
448  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
449  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
450  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
451  ret void
452}
453
; iu4 WMMA with the i1 before %A = 1, the i1 before %B = 0, trailing i1 = 1.
; Expected modifiers on both instructions: neg_lo:[1,0,0] followed by clamp.
; First result is expected in v[16:23], the second in place over v[4:11].
454define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
455; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
456; W32:       ; %bb.0: ; %bb
457; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
458; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
459; W32-NEXT:    s_clause 0x1
460; W32-NEXT:    global_store_b128 v[12:13], v[20:23], off offset:16
461; W32-NEXT:    global_store_b128 v[12:13], v[16:19], off
462; W32-NEXT:    s_clause 0x1
463; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
464; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
465; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
466; W32-NEXT:    s_endpgm
467bb:
468  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
469  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
470  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
471  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
472  ret void
473}
474
; iu4 WMMA with all three i1 immediates (before %A, before %B, trailing) = 1.
; Expected modifiers on both instructions: neg_lo:[1,1,0] followed by clamp.
; First result is expected in v[16:23], the second in place over v[4:11].
475define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
476; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
477; W32:       ; %bb.0: ; %bb
478; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
479; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
480; W32-NEXT:    s_clause 0x1
481; W32-NEXT:    global_store_b128 v[12:13], v[20:23], off offset:16
482; W32-NEXT:    global_store_b128 v[12:13], v[16:19], off
483; W32-NEXT:    s_clause 0x1
484; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
485; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
486; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
487; W32-NEXT:    s_endpgm
488bb:
489  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
490  %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
491  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
492  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
493  ret void
494}
495
496