1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3
; Mask <2,3,undef,undef>: result elements 0-1 are elements 2-3 of %val0, the
; rest are undef. %val1 is loaded but not referenced by the mask; the checks
; expect a single 64-bit load of %arg0 followed by a copy of v1 into v0.
4define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
5; GFX9-LABEL: shuffle_v4f16_23uu:
6; GFX9:       ; %bb.0:
7; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
9; GFX9-NEXT:    s_waitcnt vmcnt(0)
10; GFX9-NEXT:    v_mov_b32_e32 v0, v1
11; GFX9-NEXT:    s_setpc_b64 s[30:31]
12  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
13  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
14  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
15  ret <4 x half> %shuffle
16}
17
; Mask <2,3,4,undef>: elements 2-3 of %val0, then element 0 of %val1, last
; element undef. The checks expect both 64-bit loads and two register copies
; (v1 into v0, v2 into v1).
18define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
19; GFX9-LABEL: shuffle_v4f16_234u:
20; GFX9:       ; %bb.0:
21; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
23; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
24; GFX9-NEXT:    s_waitcnt vmcnt(0)
25; GFX9-NEXT:    v_mov_b32_e32 v0, v1
26; GFX9-NEXT:    v_mov_b32_e32 v1, v2
27; GFX9-NEXT:    s_setpc_b64 s[30:31]
28  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
29  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
30  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
31  ret <4 x half> %shuffle
32}
33
; Mask <undef,1,undef,3>: the defined elements (1 and 3 of %val0) stay in their
; original positions. The checks expect the loaded value of %arg0 to be
; returned with no shuffle instructions at all.
34define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
35; GFX9-LABEL: shuffle_v4f16_u1u3:
36; GFX9:       ; %bb.0:
37; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
39; GFX9-NEXT:    s_waitcnt vmcnt(0)
40; GFX9-NEXT:    s_setpc_b64 s[30:31]
41  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
42  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
43  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
44  ret <4 x half> %shuffle
45}
46
; Mask <undef,3,undef,1>: result element 1 is element 3 of %val0 and result
; element 3 is element 1 of %val0. The checks expect each source dword to be
; shifted right by 16 and then left by 16, landing in the swapped register.
47define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
48; GFX9-LABEL: shuffle_v4f16_u3u1:
49; GFX9:       ; %bb.0:
50; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
52; GFX9-NEXT:    s_waitcnt vmcnt(0)
53; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
54; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
55; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
56; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
57; GFX9-NEXT:    s_setpc_b64 s[30:31]
58  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
59  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
60  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
61  ret <4 x half> %shuffle
62}
63
; Mask <undef,3,undef,undef>: only result element 1 is defined (element 3 of
; %val0). The checks expect the load to be narrowed to one dword at offset 4,
; followed by a shift right and a shift left by 16.
64define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
65; GFX9-LABEL: shuffle_v4f16_u3uu:
66; GFX9:       ; %bb.0:
67; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
69; GFX9-NEXT:    s_waitcnt vmcnt(0)
70; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
71; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
72; GFX9-NEXT:    s_setpc_b64 s[30:31]
73  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
74  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
75  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
76  ret <4 x half> %shuffle
77}
78
; Mask <3,undef,6,undef>: result element 0 is element 3 of %val0 and result
; element 2 is element 2 of %val1. The checks expect one dword load at offset 4
; from each pointer, with only the %arg0 dword shifted right by 16.
79define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
80; GFX9-LABEL: shuffle_v4f16_3u6u:
81; GFX9:       ; %bb.0:
82; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
84; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
85; GFX9-NEXT:    s_waitcnt vmcnt(1)
86; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
87; GFX9-NEXT:    s_waitcnt vmcnt(0)
88; GFX9-NEXT:    s_setpc_b64 s[30:31]
89  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
90  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
91  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
92  ret <4 x half> %shuffle
93}
94
; Mask <3,undef,undef,7>: result element 0 is element 3 of %val0 and result
; element 3 is element 3 of %val1. The checks expect one dword load at offset 4
; from each pointer, then shifts that place each half in its result position.
95define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
96; GFX9-LABEL: shuffle_v4f16_3uu7:
97; GFX9:       ; %bb.0:
98; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
99; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
100; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
101; GFX9-NEXT:    s_waitcnt vmcnt(1)
102; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
103; GFX9-NEXT:    s_waitcnt vmcnt(0)
104; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
105; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
106; GFX9-NEXT:    s_setpc_b64 s[30:31]
107  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
108  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
109  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
110  ret <4 x half> %shuffle
111}
112
; Mask <3,5,undef,5>: element 3 of %val0, then element 1 of %val1 used twice
; (result elements 1 and 3), with result element 2 undef. The checks expect two
; single-dword loads and an SDWA and / v_lshl_or_b32 sequence to repack halves.
113define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
114; GFX9-LABEL: shuffle_v4f16_35u5:
115; GFX9:       ; %bb.0:
116; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
117; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
118; GFX9-NEXT:    global_load_dword v1, v[2:3], off
119; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
120; GFX9-NEXT:    s_waitcnt vmcnt(1)
121; GFX9-NEXT:    v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
122; GFX9-NEXT:    s_waitcnt vmcnt(0)
123; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
124; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
125; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
126; GFX9-NEXT:    s_setpc_b64 s[30:31]
127  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
128  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
129  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
130  ret <4 x half> %shuffle
131}
132
; Mask <3,5,7,undef>: element 3 of %val0, then elements 1 and 3 of %val1, last
; element undef. Every selected source is the high half of a dword; the checks
; expect shifts plus an SDWA and / v_lshl_or_b32 to repack them.
133define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
134; GFX9-LABEL: shuffle_v4f16_357u:
135; GFX9:       ; %bb.0:
136; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
138; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
139; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
140; GFX9-NEXT:    s_waitcnt vmcnt(1)
141; GFX9-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
142; GFX9-NEXT:    s_waitcnt vmcnt(0)
143; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
144; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
145; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
146; GFX9-NEXT:    s_setpc_b64 s[30:31]
147  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
148  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
149  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
150  ret <4 x half> %shuffle
151}
152
; Mask <0,1,0,1>: the low pair (elements 0-1) of %val0 is duplicated into both
; halves of the result. The checks expect a single load of %arg0 and one copy
; of v0 into v1.
153define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
154; GFX9-LABEL: shuffle_v4f16_0101:
155; GFX9:       ; %bb.0:
156; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
158; GFX9-NEXT:    s_waitcnt vmcnt(0)
159; GFX9-NEXT:    v_mov_b32_e32 v1, v0
160; GFX9-NEXT:    s_setpc_b64 s[30:31]
161  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
162  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
163  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
164  ret <4 x half> %shuffle
165}
166
; Mask <0,1,2,3>: identity shuffle of %val0. The checks expect only the load of
; %arg0 and the return, with no shuffle instructions.
167define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
168; GFX9-LABEL: shuffle_v4f16_0123:
169; GFX9:       ; %bb.0:
170; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
172; GFX9-NEXT:    s_waitcnt vmcnt(0)
173; GFX9-NEXT:    s_setpc_b64 s[30:31]
174  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
175  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
176  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
177  ret <4 x half> %shuffle
178}
179
; Mask <0,1,4,5>: low pair of %val0 followed by low pair of %val1. Both pairs
; are dword-aligned; the checks expect a single copy of v2 into v1.
180define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
181; GFX9-LABEL: shuffle_v4f16_0145:
182; GFX9:       ; %bb.0:
183; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
184; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
185; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
186; GFX9-NEXT:    s_waitcnt vmcnt(0)
187; GFX9-NEXT:    v_mov_b32_e32 v1, v2
188; GFX9-NEXT:    s_setpc_b64 s[30:31]
189  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
190  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
191  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
192  ret <4 x half> %shuffle
193}
194
; Mask <0,1,6,7>: low pair of %val0 followed by high pair of %val1. The checks
; expect a single copy of v3 into v1.
195define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
196; GFX9-LABEL: shuffle_v4f16_0167:
197; GFX9:       ; %bb.0:
198; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
199; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
200; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
201; GFX9-NEXT:    s_waitcnt vmcnt(0)
202; GFX9-NEXT:    v_mov_b32_e32 v1, v3
203; GFX9-NEXT:    s_setpc_b64 s[30:31]
204  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
205  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
206  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
207  ret <4 x half> %shuffle
208}
209
; Mask <2,3,0,1>: swaps the two dword-sized pairs of %val0. The checks expect
; the load to target v[1:2] so that only one copy (v2 into v0) is needed.
210define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
211; GFX9-LABEL: shuffle_v4f16_2301:
212; GFX9:       ; %bb.0:
213; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
214; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
215; GFX9-NEXT:    s_waitcnt vmcnt(0)
216; GFX9-NEXT:    v_mov_b32_e32 v0, v2
217; GFX9-NEXT:    s_setpc_b64 s[30:31]
218  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
219  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
220  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
221  ret <4 x half> %shuffle
222}
223
; Mask <2,3,2,3>: the high pair (elements 2-3) of %val0 is duplicated into both
; halves of the result. The checks expect one load and a copy of v1 into v0.
224define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
225; GFX9-LABEL: shuffle_v4f16_2323:
226; GFX9:       ; %bb.0:
227; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
229; GFX9-NEXT:    s_waitcnt vmcnt(0)
230; GFX9-NEXT:    v_mov_b32_e32 v0, v1
231; GFX9-NEXT:    s_setpc_b64 s[30:31]
232  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
233  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
234  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
235  ret <4 x half> %shuffle
236}
237
; Mask <2,3,4,5>: high pair of %val0 followed by low pair of %val1. The checks
; expect two copies: v1 into v0, then v2 into v1.
238define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
239; GFX9-LABEL: shuffle_v4f16_2345:
240; GFX9:       ; %bb.0:
241; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
243; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
244; GFX9-NEXT:    s_waitcnt vmcnt(0)
245; GFX9-NEXT:    v_mov_b32_e32 v0, v1
246; GFX9-NEXT:    v_mov_b32_e32 v1, v2
247; GFX9-NEXT:    s_setpc_b64 s[30:31]
248  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
249  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
250  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
251  ret <4 x half> %shuffle
252}
253
; Mask <2,3,6,7>: high pair of %val0 followed by high pair of %val1. The checks
; expect two copies: v1 into v0, then v3 into v1.
254define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
255; GFX9-LABEL: shuffle_v4f16_2367:
256; GFX9:       ; %bb.0:
257; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
258; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
259; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
260; GFX9-NEXT:    s_waitcnt vmcnt(0)
261; GFX9-NEXT:    v_mov_b32_e32 v0, v1
262; GFX9-NEXT:    v_mov_b32_e32 v1, v3
263; GFX9-NEXT:    s_setpc_b64 s[30:31]
264  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
265  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
266  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
267  ret <4 x half> %shuffle
268}
269
; Mask <4,5,0,1>: low pair of %val1 followed by low pair of %val0. The checks
; expect %arg0 to be loaded into v[1:2] so that the only copy needed is v3
; into v0.
270define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
271; GFX9-LABEL: shuffle_v4f16_4501:
272; GFX9:       ; %bb.0:
273; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
274; GFX9-NEXT:    global_load_dwordx2 v[3:4], v[2:3], off
275; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
276; GFX9-NEXT:    s_waitcnt vmcnt(1)
277; GFX9-NEXT:    v_mov_b32_e32 v0, v3
278; GFX9-NEXT:    s_waitcnt vmcnt(0)
279; GFX9-NEXT:    s_setpc_b64 s[30:31]
280  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
281  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
282  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
283  ret <4 x half> %shuffle
284}
285
; Mask <4,5,2,3>: low pair of %val1 followed by high pair of %val0. The high
; pair of %val0 is already in v1; the checks expect one copy of v2 into v0.
286define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
287; GFX9-LABEL: shuffle_v4f16_4523:
288; GFX9:       ; %bb.0:
289; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
290; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
291; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
292; GFX9-NEXT:    s_waitcnt vmcnt(0)
293; GFX9-NEXT:    v_mov_b32_e32 v0, v2
294; GFX9-NEXT:    s_setpc_b64 s[30:31]
295  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
296  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
297  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
298  ret <4 x half> %shuffle
299}
300
; Mask <4,5,4,5>: the low pair of %val1 is duplicated; %val0 is not referenced
; by the mask. The checks expect only %arg1 (v[2:3]) to be loaded, followed by
; a copy of v0 into v1.
301define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
302; GFX9-LABEL: shuffle_v4f16_4545:
303; GFX9:       ; %bb.0:
304; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
306; GFX9-NEXT:    s_waitcnt vmcnt(0)
307; GFX9-NEXT:    v_mov_b32_e32 v1, v0
308; GFX9-NEXT:    s_setpc_b64 s[30:31]
309  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
310  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
311  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
312  ret <4 x half> %shuffle
313}
314
; Mask <4,5,6,7>: identity shuffle of %val1. The checks expect only the load of
; %arg1 directly into the return registers v[0:1].
315define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
316; GFX9-LABEL: shuffle_v4f16_4567:
317; GFX9:       ; %bb.0:
318; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
319; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
320; GFX9-NEXT:    s_waitcnt vmcnt(0)
321; GFX9-NEXT:    s_setpc_b64 s[30:31]
322  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
323  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
324  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
325  ret <4 x half> %shuffle
326}
327
; Mask <6,7,0,1>: high pair of %val1 followed by low pair of %val0. The checks
; expect a wait between the two loads (the second load's destination v[1:2]
; overlaps the first load's destination v[2:3]) and a copy of v3 into v0.
328define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
329; GFX9-LABEL: shuffle_v4f16_6701:
330; GFX9:       ; %bb.0:
331; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
333; GFX9-NEXT:    s_waitcnt vmcnt(0)
334; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
335; GFX9-NEXT:    v_mov_b32_e32 v0, v3
336; GFX9-NEXT:    s_waitcnt vmcnt(0)
337; GFX9-NEXT:    s_setpc_b64 s[30:31]
338  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
339  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
340  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
341  ret <4 x half> %shuffle
342}
343
; Mask <6,7,2,3>: high pair of %val1 followed by high pair of %val0. The high
; pair of %val0 is already in v1; the checks expect one copy of v3 into v0.
344define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
345; GFX9-LABEL: shuffle_v4f16_6723:
346; GFX9:       ; %bb.0:
347; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
348; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
349; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
350; GFX9-NEXT:    s_waitcnt vmcnt(0)
351; GFX9-NEXT:    v_mov_b32_e32 v0, v3
352; GFX9-NEXT:    s_setpc_b64 s[30:31]
353  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
354  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
355  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
356  ret <4 x half> %shuffle
357}
358
; Mask <6,7,4,5>: swaps the two dword-sized pairs of %val1; %val0 is not
; referenced by the mask. The checks expect %arg1 to be loaded into v[1:2] and
; a single copy of v2 into v0.
359define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
360; GFX9-LABEL: shuffle_v4f16_6745:
361; GFX9:       ; %bb.0:
362; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
364; GFX9-NEXT:    s_waitcnt vmcnt(0)
365; GFX9-NEXT:    v_mov_b32_e32 v0, v2
366; GFX9-NEXT:    s_setpc_b64 s[30:31]
367  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
368  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
369  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
370  ret <4 x half> %shuffle
371}
372
; Mask <6,7,6,7>: the high pair of %val1 is duplicated; %val0 is not referenced
; by the mask. The checks expect only %arg1 to be loaded, followed by a copy
; of v1 into v0.
373define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
374; GFX9-LABEL: shuffle_v4f16_6767:
375; GFX9:       ; %bb.0:
376; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
378; GFX9-NEXT:    s_waitcnt vmcnt(0)
379; GFX9-NEXT:    v_mov_b32_e32 v0, v1
380; GFX9-NEXT:    s_setpc_b64 s[30:31]
381  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
382  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
383  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
384  ret <4 x half> %shuffle
385}
386
; Mask <2,3,5,6>: a dword-aligned pair from %val0 (elements 2-3) followed by
; elements 1-2 of %val1, which straddle a dword boundary. The checks expect a
; plain copy for the first half and an SDWA and / v_lshl_or_b32 repack for the
; second half.
387define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
388; GFX9-LABEL: shuffle_v4f16_2356:
389; GFX9:       ; %bb.0:
390; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
392; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
393; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
394; GFX9-NEXT:    s_waitcnt vmcnt(1)
395; GFX9-NEXT:    v_and_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
396; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v0
397; GFX9-NEXT:    s_waitcnt vmcnt(0)
398; GFX9-NEXT:    v_mov_b32_e32 v0, v5
399; GFX9-NEXT:    s_setpc_b64 s[30:31]
400  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
401  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
402  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
403  ret <4 x half> %shuffle
404}
405
; Mask <5,6,2,3>: elements 1-2 of %val1 (straddling a dword boundary) followed
; by the dword-aligned high pair of %val0. The checks expect the first half to
; be repacked with an SDWA and / v_lshl_or_b32 while v1 is left as loaded.
406define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
407; GFX9-LABEL: shuffle_v4f16_5623:
408; GFX9:       ; %bb.0:
409; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
411; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
412; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
413; GFX9-NEXT:    s_waitcnt vmcnt(0)
414; GFX9-NEXT:    v_and_b32_sdwa v0, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
415; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
416; GFX9-NEXT:    s_setpc_b64 s[30:31]
417  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
418  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
419  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
420  ret <4 x half> %shuffle
421}
422
; Mask <3,4,5,6>: element 3 of %val0 followed by elements 0-2 of %val1, so
; neither result dword matches a source dword. The checks expect both result
; dwords to be built with SDWA and / v_lshl_or_b32.
423define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
424; GFX9-LABEL: shuffle_v4f16_3456:
425; GFX9:       ; %bb.0:
426; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
427; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
428; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
429; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
430; GFX9-NEXT:    s_waitcnt vmcnt(1)
431; GFX9-NEXT:    v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
432; GFX9-NEXT:    s_waitcnt vmcnt(0)
433; GFX9-NEXT:    v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
434; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
435; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
436; GFX9-NEXT:    s_setpc_b64 s[30:31]
437  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
438  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
439  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
440  ret <4 x half> %shuffle
441}
442
; Mask <5,6,3,4>: elements 1-2 of %val1, then element 3 of %val0 and element 0
; of %val1. The same two unaligned pairs as shuffle_v4f16_3456 but with the
; result halves swapped; the checks expect both dwords to be repacked.
443define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
444; GFX9-LABEL: shuffle_v4f16_5634:
445; GFX9:       ; %bb.0:
446; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
447; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
448; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
449; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
450; GFX9-NEXT:    s_waitcnt vmcnt(1)
451; GFX9-NEXT:    v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
452; GFX9-NEXT:    s_waitcnt vmcnt(0)
453; GFX9-NEXT:    v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
454; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
455; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v3
456; GFX9-NEXT:    s_setpc_b64 s[30:31]
457  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
458  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
459  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
460  ret <4 x half> %shuffle
461}
462
; Mask <5,7,3,4>: elements 1 and 3 of %val1 (both high halves of their dwords),
; then element 3 of %val0 and element 0 of %val1. The checks expect an extra
; shift right by 16 to extract element 3 of %val1 before repacking.
463define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
464; GFX9-LABEL: shuffle_v4f16_5734:
465; GFX9:       ; %bb.0:
466; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
468; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
469; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
470; GFX9-NEXT:    s_waitcnt vmcnt(1)
471; GFX9-NEXT:    v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
472; GFX9-NEXT:    s_waitcnt vmcnt(0)
473; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
474; GFX9-NEXT:    v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
475; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v3
476; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v2
477; GFX9-NEXT:    s_setpc_b64 s[30:31]
478  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
479  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
480  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
481  ret <4 x half> %shuffle
482}
483
; Integer (<4 x i16>) variant of the <2,3,5,6> mask. The expected instruction
; sequence is the same as the one asserted for shuffle_v4f16_2356.
484define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
485; GFX9-LABEL: shuffle_v4i16_2356:
486; GFX9:       ; %bb.0:
487; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
488; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
489; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
490; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
491; GFX9-NEXT:    s_waitcnt vmcnt(1)
492; GFX9-NEXT:    v_and_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
493; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v0
494; GFX9-NEXT:    s_waitcnt vmcnt(0)
495; GFX9-NEXT:    v_mov_b32_e32 v0, v5
496; GFX9-NEXT:    s_setpc_b64 s[30:31]
497  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
498  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
499  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
500  ret <4 x i16> %shuffle
501}
502
; Integer (<4 x i16>) variant of the <0,1,6,7> mask: low pair of %val0 followed
; by high pair of %val1. The checks expect a single copy of v3 into v1, the
; same as for shuffle_v4f16_0167.
503define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
504; GFX9-LABEL: shuffle_v4i16_0167:
505; GFX9:       ; %bb.0:
506; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
507; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
508; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
509; GFX9-NEXT:    s_waitcnt vmcnt(0)
510; GFX9-NEXT:    v_mov_b32_e32 v1, v3
511; GFX9-NEXT:    s_setpc_b64 s[30:31]
512  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
513  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
514  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
515  ret <4 x i16> %shuffle
516}
517
; Mask zeroinitializer (<0,0,0,0>): splat of element 0 of %val0. The checks
; expect the low half to be masked with 0xffff, or-ed with itself shifted left
; by 16, and the resulting dword copied into v1.
518define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
519; GFX9-LABEL: shuffle_v4f16_0000:
520; GFX9:       ; %bb.0:
521; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
522; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
523; GFX9-NEXT:    s_waitcnt vmcnt(0)
524; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
525; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
526; GFX9-NEXT:    v_mov_b32_e32 v1, v0
527; GFX9-NEXT:    s_setpc_b64 s[30:31]
528  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
529  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
530  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
531  ret <4 x half> %shuffle
532}
533
; Mask <1,0,1,0>: the two halves of the low dword of %val0 are swapped and the
; swapped dword is used for both result dwords. The checks expect an SDWA and /
; v_lshl_or_b32 swap followed by a copy of v0 into v1.
534define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
535; GFX9-LABEL: shuffle_v4f16_1010:
536; GFX9:       ; %bb.0:
537; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
539; GFX9-NEXT:    s_waitcnt vmcnt(0)
540; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
541; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
542; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
543; GFX9-NEXT:    v_mov_b32_e32 v1, v0
544; GFX9-NEXT:    s_setpc_b64 s[30:31]
545  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
546  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
547  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
548  ret <4 x half> %shuffle
549}
550
; Mask <1,1,0,0>: element 1 of %val0 splatted into the first result dword and
; element 0 splatted into the second. The checks expect each splat to be built
; with an and against 0xffff plus v_lshl_or_b32.
551define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
552; GFX9-LABEL: shuffle_v4f16_1100:
553; GFX9:       ; %bb.0:
554; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
556; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
557; GFX9-NEXT:    s_waitcnt vmcnt(0)
558; GFX9-NEXT:    v_and_b32_e32 v1, v2, v0
559; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
560; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
561; GFX9-NEXT:    v_and_b32_e32 v0, v2, v3
562; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
563; GFX9-NEXT:    s_setpc_b64 s[30:31]
564  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
565  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
566  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
567  ret <4 x half> %shuffle
568}
569
; Mask <6,1,6,1>: element 2 of %val1 paired with element 1 of %val0, and that
; pair repeated. The checks expect two narrowed dword loads, one packed dword
; built with v_lshl_or_b32, and a copy of it into v1.
570define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
571; GFX9-LABEL: shuffle_v4f16_6161:
572; GFX9:       ; %bb.0:
573; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
574; GFX9-NEXT:    global_load_dword v0, v[0:1], off
575; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
576; GFX9-NEXT:    s_waitcnt vmcnt(1)
577; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
578; GFX9-NEXT:    s_waitcnt vmcnt(0)
579; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
580; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
581; GFX9-NEXT:    v_mov_b32_e32 v1, v0
582; GFX9-NEXT:    s_setpc_b64 s[30:31]
583  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
584  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
585  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
586  ret <4 x half> %shuffle
587}
588
; Mask <2,3,3,3>: the high pair of %val0 followed by a splat of element 3 of
; %val0. The checks expect the splat dword to be built from v2 shifted right by
; 16 and the first result dword to be a plain copy of v2.
589define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
590; GFX9-LABEL: shuffle_v4f16_2333:
591; GFX9:       ; %bb.0:
592; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
593; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
594; GFX9-NEXT:    s_waitcnt vmcnt(0)
595; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
596; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
597; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
598; GFX9-NEXT:    v_mov_b32_e32 v0, v2
599; GFX9-NEXT:    s_setpc_b64 s[30:31]
600  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
601  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
602  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
603  ret <4 x half> %shuffle
604}
605
; NOTE(review): the function name implies mask <6,6,6,7> (elements of %val1),
; but the shufflevector below uses <2,3,3,3>, which is identical to
; shuffle_v4f16_2333 and reads only %val0. The asserted code accordingly loads
; only %arg0 and matches shuffle_v4f16_2333. This looks like a copy-paste
; slip; if the mask is corrected, the assertions must be regenerated with
; utils/update_llc_test_checks.py.
606define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
607; GFX9-LABEL: shuffle_v4f16_6667:
608; GFX9:       ; %bb.0:
609; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
610; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
611; GFX9-NEXT:    s_waitcnt vmcnt(0)
612; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
613; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
614; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
615; GFX9-NEXT:    v_mov_b32_e32 v0, v2
616; GFX9-NEXT:    s_setpc_b64 s[30:31]
617  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
618  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
619  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
620  ret <4 x half> %shuffle
621}
622
; Narrowing shuffle: two <8 x half> inputs, <4 x half> result. Mask <0,1,0,1>
; duplicates the low pair of %val0. The checks expect the 128-bit load to be
; narrowed to a single dword, followed by a copy of v0 into v1.
623define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
624; GFX9-LABEL: shuffle_v8f16_0101:
625; GFX9:       ; %bb.0:
626; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
627; GFX9-NEXT:    global_load_dword v0, v[0:1], off
628; GFX9-NEXT:    s_waitcnt vmcnt(0)
629; GFX9-NEXT:    v_mov_b32_e32 v1, v0
630; GFX9-NEXT:    s_setpc_b64 s[30:31]
631  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
632  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
633  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
634  ret <4 x half> %shuffle
635}
636
; Narrowing shuffle: mask <0,1,2,3> extracts the first four elements of the
; <8 x half> %val0. The checks currently expect a full dwordx4 load into v[0:3]
; even though only v[0:1] carry the <4 x half> result.
637define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
638; GFX9-LABEL: shuffle_v8f16_0123:
639; GFX9:       ; %bb.0:
640; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
641; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
642; GFX9-NEXT:    s_waitcnt vmcnt(0)
643; GFX9-NEXT:    s_setpc_b64 s[30:31]
644  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
645  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
646  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
647  ret <4 x half> %shuffle
648}
649
; Narrowing shuffle mixing both <8 x half> inputs with mask <4, 5, 8, 9>:
; result lanes 0-1 are %val0[4..5] and lanes 2-3 are %val1[0..1] (indices 8 and
; above address the second operand). The expected code issues two single-dword
; loads, one of them at byte offset 8, and moves v2 into v1 once the first
; load has completed (vmcnt(1)).
650define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
651; GFX9-LABEL: shuffle_v8f16_4589:
652; GFX9:       ; %bb.0:
653; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
654; GFX9-NEXT:    global_load_dword v2, v[2:3], off
655; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:8
656; GFX9-NEXT:    s_waitcnt vmcnt(1)
657; GFX9-NEXT:    v_mov_b32_e32 v1, v2
658; GFX9-NEXT:    s_waitcnt vmcnt(0)
659; GFX9-NEXT:    s_setpc_b64 s[30:31]
660  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
661  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
662  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
663  ret <4 x half> %shuffle
664}
665
; Narrowing shuffle with mask <10, 11, 2, 3>: result lanes 0-1 are %val1[2..3]
; (indices 8 and above address the second operand) and lanes 2-3 are
; %val0[2..3]. The expected code issues two single-dword loads, both at byte
; offset 4, and moves v2 into v0 once the first load has completed.
666define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
667; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
668; GFX9:       ; %bb.0:
669; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
670; GFX9-NEXT:    global_load_dword v2, v[2:3], off offset:4
671; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
672; GFX9-NEXT:    s_waitcnt vmcnt(1)
673; GFX9-NEXT:    v_mov_b32_e32 v0, v2
674; GFX9-NEXT:    s_waitcnt vmcnt(0)
675; GFX9-NEXT:    s_setpc_b64 s[30:31]
676  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
677  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
678  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
679  ret <4 x half> %shuffle
680}
681
; Narrowing shuffle with mask <13, 14, 2, 3>: result lanes 0-1 are %val1[5] and
; %val1[6], which straddle a dword boundary, and lanes 2-3 are %val0[2..3].
; The expected code loads a full dwordx4 plus a single dword at byte offset 4,
; then packs v0 from the upper 16 bits of v4 (SDWA WORD_1 select) and v5
; shifted left by 16.
682define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
683; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
684; GFX9:       ; %bb.0:
685; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
686; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
687; GFX9-NEXT:    global_load_dword v1, v[0:1], off offset:4
688; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
689; GFX9-NEXT:    s_waitcnt vmcnt(1)
690; GFX9-NEXT:    v_and_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
691; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
692; GFX9-NEXT:    s_waitcnt vmcnt(0)
693; GFX9-NEXT:    s_setpc_b64 s[30:31]
694  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
695  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
696  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
697  ret <4 x half> %shuffle
698}
699
; Widening shuffle from <3 x half> inputs to a <4 x half> result with mask
; <0, 1, 2, 2>: the first three lanes copy %val0 and the last lane repeats
; %val0[2]. %val1 does not contribute. The expected code loads one dwordx2 and
; rebuilds v1 with its low 16 bits present in both halves.
700define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
701; GFX9-LABEL: shuffle_v3f16_0122:
702; GFX9:       ; %bb.0:
703; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
704; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
705; GFX9-NEXT:    s_waitcnt vmcnt(0)
706; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
707; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
708; GFX9-NEXT:    s_setpc_b64 s[30:31]
709  %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
710  %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
711  %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
712  ret <4 x half> %shuffle
713}
714
; Widening shuffle from <2 x half> inputs to a <4 x half> result with mask
; <0, 1, 1, 0>: lanes 0-1 copy %val0 and lanes 2-3 are %val0 with its two
; halves swapped. %val1 does not contribute. The expected code loads one dword
; and builds v1 from the upper 16 bits of v0 (SDWA WORD_1 select) combined
; with v0 shifted left by 16.
; NOTE(review): the function name says 0122, but the mask below is
; <0, 1, 1, 0>. With 2-element inputs, index 2 would select %val1[0]. Confirm
; which is intended; changing the mask requires regenerating the assertions
; with utils/update_llc_test_checks.py.
715define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
716; GFX9-LABEL: shuffle_v2f16_0122:
717; GFX9:       ; %bb.0:
718; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
719; GFX9-NEXT:    global_load_dword v0, v[0:1], off
720; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
721; GFX9-NEXT:    s_waitcnt vmcnt(0)
722; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
723; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
724; GFX9-NEXT:    s_setpc_b64 s[30:31]
725  %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
726  %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
727  %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
728  ret <4 x half> %shuffle
729}
730
; Shuffle of two <6 x half> inputs with mask <4, 5, 2, 3, 6, 7>: result lanes
; 0-1 are %val0[4..5], lanes 2-3 are %val0[2..3], and lanes 4-5 are
; %val1[0..1] (indices 6 and above address the second operand). The expected
; code loads one dword and one dwordx3, then uses two register moves (v2 into
; v0, v3 into v2) to put the dword-aligned pairs in place; v1 is left as
; loaded.
731define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
732; GFX9-LABEL: shuffle_v6f16_452367:
733; GFX9:       ; %bb.0:
734; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
735; GFX9-NEXT:    global_load_dword v3, v[2:3], off
736; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
737; GFX9-NEXT:    s_waitcnt vmcnt(0)
738; GFX9-NEXT:    v_mov_b32_e32 v0, v2
739; GFX9-NEXT:    v_mov_b32_e32 v2, v3
740; GFX9-NEXT:    s_setpc_b64 s[30:31]
741  %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
742  %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
743  %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
744  ret <4 x half> %shuffle
745}
746
; Kernel that combines shufflevector with packed-half fma. Each work item
; indexes %A, %B and %C by its workitem id, loads one <4 x half> from each,
; and computes, with a, b, c naming the loaded %A, %B, %C elements:
;   lo = fma(splat(a[1]), b[2..3], fma(splat(a[0]), b[0..1], c[0..1]))
;   hi = fma(splat(a[3]), b[2..3], fma(splat(a[2]), b[0..1], c[2..3]))
; and stores <lo, hi> back to the same element of %C.
; The expected code performs the whole computation with four v_pk_fma_f16
; instructions using op_sel / op_sel_hi modifiers; no other vector ALU
; instructions appear between the final load wait and the store.
747define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C)  {
748; GFX9-LABEL: fma_shuffle:
749; GFX9:       ; %bb.0: ; %entry
750; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
751; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
752; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
753; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
754; GFX9-NEXT:    v_mov_b32_e32 v1, s1
755; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v4
756; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
757; GFX9-NEXT:    v_mov_b32_e32 v3, s3
758; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v4
759; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
760; GFX9-NEXT:    v_mov_b32_e32 v5, s5
761; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
762; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
763; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off
764; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
765; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
766; GFX9-NEXT:    s_waitcnt vmcnt(0)
767; GFX9-NEXT:    v_pk_fma_f16 v6, v0, v2, v6 op_sel_hi:[0,1,1]
768; GFX9-NEXT:    v_pk_fma_f16 v2, v1, v2, v7 op_sel_hi:[0,1,1]
769; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v3, v6 op_sel:[1,0,0]
770; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
771; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
772; GFX9-NEXT:    s_endpgm
773entry:
  ; Per-work-item element addresses in %A, %B and %C, and the three loads.
774  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
775  %tmp12 = zext i32 %tmp1 to i64
776  %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12
777  %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8
778  %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12
779  %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8
780  %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12
781  %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8
  ; Low half of the result: splat(a[0]) * b[0..1] + c[0..1], then
  ; splat(a[1]) * b[2..3] accumulated on top.
782  %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
783  %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
784  %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
785  %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19)
786  %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1>
787  %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3>
788  %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20)
  ; %tmp25 = <lo[0], lo[1], c[2], c[3]>; its upper two lanes (%tmp27) seed the
  ; accumulator for the high half.
789  %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
790  %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
791  %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2>
792  %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3>
793  %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27)
794  %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
795  %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
  ; Concatenate the low half (%tmp25 lanes 0-1) with the high half (%tmp30) and
  ; write the result back to the %C element.
796  %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
797  %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
798  store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8
799  ret void
800}
801
; Shuffle of two <4 x half> inputs with mask <0, 4, 5, 6>: result lane 0 is
; %val0[0] and lanes 1-3 are %val1[0..2] (indices 4 and above address the
; second operand), so no result dword is a plain copy of a loaded dword. The
; expected code loads both dwordx2 values and assembles each result dword with
; a 0xffff mask (one via an SDWA WORD_1 select) followed by v_lshl_or_b32.
802define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
803; GFX9-LABEL: shuffle_v4f16_0456:
804; GFX9:       ; %bb.0:
805; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
806; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
807; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
808; GFX9-NEXT:    s_waitcnt vmcnt(0)
809; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
810; GFX9-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
811; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
812; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
813; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v4
814; GFX9-NEXT:    s_setpc_b64 s[30:31]
815  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
816  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
817  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
818  ret <4 x half> %shuffle
819}
820
; Intrinsics called by @fma_shuffle: the packed-half fused multiply-add and
; the work-item id query. Both carry attribute group #0.
821declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
822declare i32 @llvm.amdgcn.workitem.id.x() #0
823
; Attribute group shared by the two intrinsic declarations.
824attributes #0 = { nounwind readnone speculatable }
825