1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
5
; Shuffle mask <2, 3, undef, undef>: result lanes 0-1 are elements 2-3 of
; %val0; lanes 2-3 are undef. The expected output on all three targets is a
; single 32-bit load at offset 4 addressed through v[0:1].
6define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
7; GFX9-LABEL: shuffle_v4f16_23uu:
8; GFX9:       ; %bb.0:
9; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
11; GFX9-NEXT:    s_waitcnt vmcnt(0)
12; GFX9-NEXT:    s_setpc_b64 s[30:31]
13;
14; GFX10-LABEL: shuffle_v4f16_23uu:
15; GFX10:       ; %bb.0:
16; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
18; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
19; GFX10-NEXT:    s_waitcnt vmcnt(0)
20; GFX10-NEXT:    s_setpc_b64 s[30:31]
21;
22; GFX11-LABEL: shuffle_v4f16_23uu:
23; GFX11:       ; %bb.0:
24; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
26; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
27; GFX11-NEXT:    s_waitcnt vmcnt(0)
28; GFX11-NEXT:    s_setpc_b64 s[30:31]
29  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
30  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
31  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
32  ret <4 x half> %shuffle
33}
34
; Shuffle mask <2, 3, 4, undef>: lanes 0-1 are elements 2-3 of %val0, lane 2 is
; element 0 of %val1 (mask index 4), lane 3 is undef. The expected output has a
; 32-bit load at offset 4 through v[0:1] plus a 64-bit load through v[2:3]; the
; gfx900/gfx1010 output copies the results into v0/v1 with v_mov, while the
; gfx1100 output loads straight into v0 and v[1:2].
35define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
36; GFX9-LABEL: shuffle_v4f16_234u:
37; GFX9:       ; %bb.0:
38; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
40; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
41; GFX9-NEXT:    s_waitcnt vmcnt(1)
42; GFX9-NEXT:    v_mov_b32_e32 v0, v6
43; GFX9-NEXT:    s_waitcnt vmcnt(0)
44; GFX9-NEXT:    v_mov_b32_e32 v1, v4
45; GFX9-NEXT:    s_setpc_b64 s[30:31]
46;
47; GFX10-LABEL: shuffle_v4f16_234u:
48; GFX10:       ; %bb.0:
49; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
51; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
52; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
53; GFX10-NEXT:    s_waitcnt vmcnt(1)
54; GFX10-NEXT:    v_mov_b32_e32 v0, v6
55; GFX10-NEXT:    s_waitcnt vmcnt(0)
56; GFX10-NEXT:    v_mov_b32_e32 v1, v4
57; GFX10-NEXT:    s_setpc_b64 s[30:31]
58;
59; GFX11-LABEL: shuffle_v4f16_234u:
60; GFX11:       ; %bb.0:
61; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
63; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
64; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
65; GFX11-NEXT:    s_waitcnt vmcnt(0)
66; GFX11-NEXT:    s_setpc_b64 s[30:31]
67  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
68  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
69  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
70  ret <4 x half> %shuffle
71}
72
; Shuffle mask <undef, 1, undef, 3>: lanes 1 and 3 keep elements 1 and 3 of
; %val0 in place; lanes 0 and 2 are undef. The expected output is a single
; 64-bit load through v[0:1] directly into v[0:1] with no further data movement.
73define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
74; GFX9-LABEL: shuffle_v4f16_u1u3:
75; GFX9:       ; %bb.0:
76; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
77; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
78; GFX9-NEXT:    s_waitcnt vmcnt(0)
79; GFX9-NEXT:    s_setpc_b64 s[30:31]
80;
81; GFX10-LABEL: shuffle_v4f16_u1u3:
82; GFX10:       ; %bb.0:
83; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
84; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
85; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
86; GFX10-NEXT:    s_waitcnt vmcnt(0)
87; GFX10-NEXT:    s_setpc_b64 s[30:31]
88;
89; GFX11-LABEL: shuffle_v4f16_u1u3:
90; GFX11:       ; %bb.0:
91; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
93; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
94; GFX11-NEXT:    s_waitcnt vmcnt(0)
95; GFX11-NEXT:    s_setpc_b64 s[30:31]
96  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
97  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
98  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
99  ret <4 x half> %shuffle
100}
101
; Shuffle mask <undef, 3, undef, 1>: lane 1 is element 3 of %val0 and lane 3 is
; element 1 of %val0; lanes 0 and 2 are undef. The expected output loads 64 bits
; through v[0:1] into v[1:2] and then copies v2 into v0, i.e. the two loaded
; dwords are returned in swapped order.
102define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
103; GFX9-LABEL: shuffle_v4f16_u3u1:
104; GFX9:       ; %bb.0:
105; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
107; GFX9-NEXT:    s_waitcnt vmcnt(0)
108; GFX9-NEXT:    v_mov_b32_e32 v0, v2
109; GFX9-NEXT:    s_setpc_b64 s[30:31]
110;
111; GFX10-LABEL: shuffle_v4f16_u3u1:
112; GFX10:       ; %bb.0:
113; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
115; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
116; GFX10-NEXT:    s_waitcnt vmcnt(0)
117; GFX10-NEXT:    v_mov_b32_e32 v0, v2
118; GFX10-NEXT:    s_setpc_b64 s[30:31]
119;
120; GFX11-LABEL: shuffle_v4f16_u3u1:
121; GFX11:       ; %bb.0:
122; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
124; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
125; GFX11-NEXT:    s_waitcnt vmcnt(0)
126; GFX11-NEXT:    v_mov_b32_e32 v0, v2
127; GFX11-NEXT:    s_setpc_b64 s[30:31]
128  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
129  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
130  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
131  ret <4 x half> %shuffle
132}
133
; Shuffle mask <undef, 3, undef, undef>: only lane 1 is defined, taking element
; 3 of %val0. The expected output is a single 32-bit load at offset 4 through
; v[0:1] into v0, with no other data movement.
134define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
135; GFX9-LABEL: shuffle_v4f16_u3uu:
136; GFX9:       ; %bb.0:
137; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
139; GFX9-NEXT:    s_waitcnt vmcnt(0)
140; GFX9-NEXT:    s_setpc_b64 s[30:31]
141;
142; GFX10-LABEL: shuffle_v4f16_u3uu:
143; GFX10:       ; %bb.0:
144; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
145; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
146; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
147; GFX10-NEXT:    s_waitcnt vmcnt(0)
148; GFX10-NEXT:    s_setpc_b64 s[30:31]
149;
150; GFX11-LABEL: shuffle_v4f16_u3uu:
151; GFX11:       ; %bb.0:
152; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
154; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
155; GFX11-NEXT:    s_waitcnt vmcnt(0)
156; GFX11-NEXT:    s_setpc_b64 s[30:31]
157  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
158  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
159  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
160  ret <4 x half> %shuffle
161}
162
; Shuffle mask <3, undef, 6, undef>: lane 0 is element 3 of %val0 and lane 2 is
; element 2 of %val1 (mask index 6); lanes 1 and 3 are undef. The expected
; output has two 32-bit loads at offset 4 (one through v[0:1], one through
; v[2:3]); the first is shifted right by 16 to form v0 and the second is
; returned unchanged in v1.
163define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
164; GFX9-LABEL: shuffle_v4f16_3u6u:
165; GFX9:       ; %bb.0:
166; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
167; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
168; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
169; GFX9-NEXT:    s_waitcnt vmcnt(1)
170; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
171; GFX9-NEXT:    s_waitcnt vmcnt(0)
172; GFX9-NEXT:    v_mov_b32_e32 v1, v4
173; GFX9-NEXT:    s_setpc_b64 s[30:31]
174;
175; GFX10-LABEL: shuffle_v4f16_3u6u:
176; GFX10:       ; %bb.0:
177; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
178; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
179; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
180; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
181; GFX10-NEXT:    s_waitcnt vmcnt(1)
182; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
183; GFX10-NEXT:    s_waitcnt vmcnt(0)
184; GFX10-NEXT:    v_mov_b32_e32 v1, v4
185; GFX10-NEXT:    s_setpc_b64 s[30:31]
186;
187; GFX11-LABEL: shuffle_v4f16_3u6u:
188; GFX11:       ; %bb.0:
189; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
191; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
192; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
193; GFX11-NEXT:    s_waitcnt vmcnt(1)
194; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
195; GFX11-NEXT:    s_waitcnt vmcnt(0)
196; GFX11-NEXT:    s_setpc_b64 s[30:31]
197  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
198  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
199  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
200  ret <4 x half> %shuffle
201}
202
; Shuffle mask <3, undef, undef, 7>: lane 0 is element 3 of %val0 and lane 3 is
; element 3 of %val1 (mask index 7); lanes 1 and 2 are undef. The expected
; output matches shuffle_v4f16_3u6u: two 32-bit loads at offset 4, the first
; shifted right by 16 to form v0, the second returned unchanged in v1.
203define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
204; GFX9-LABEL: shuffle_v4f16_3uu7:
205; GFX9:       ; %bb.0:
206; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
208; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
209; GFX9-NEXT:    s_waitcnt vmcnt(1)
210; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
211; GFX9-NEXT:    s_waitcnt vmcnt(0)
212; GFX9-NEXT:    v_mov_b32_e32 v1, v4
213; GFX9-NEXT:    s_setpc_b64 s[30:31]
214;
215; GFX10-LABEL: shuffle_v4f16_3uu7:
216; GFX10:       ; %bb.0:
217; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
219; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
220; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
221; GFX10-NEXT:    s_waitcnt vmcnt(1)
222; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
223; GFX10-NEXT:    s_waitcnt vmcnt(0)
224; GFX10-NEXT:    v_mov_b32_e32 v1, v4
225; GFX10-NEXT:    s_setpc_b64 s[30:31]
226;
227; GFX11-LABEL: shuffle_v4f16_3uu7:
228; GFX11:       ; %bb.0:
229; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
231; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
232; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
233; GFX11-NEXT:    s_waitcnt vmcnt(1)
234; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
235; GFX11-NEXT:    s_waitcnt vmcnt(0)
236; GFX11-NEXT:    s_setpc_b64 s[30:31]
237  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
238  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
239  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
240  ret <4 x half> %shuffle
241}
242
; Shuffle mask <3, 5, undef, 5>: lane 0 is element 3 of %val0, lanes 1 and 3 are
; both element 1 of %val1 (mask index 5), lane 2 is undef. The expected output
; loads 32 bits at offset 4 through v[0:1] and 32 bits at offset 0 through
; v[2:3], then packs the upper 16 bits of each into v0 (mask/shift followed by
; v_lshl_or_b32); v1 holds the second loaded dword unchanged.
243define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
244; GFX9-LABEL: shuffle_v4f16_35u5:
245; GFX9:       ; %bb.0:
246; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
247; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
248; GFX9-NEXT:    global_load_dword v4, v[2:3], off
249; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
250; GFX9-NEXT:    s_waitcnt vmcnt(1)
251; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
252; GFX9-NEXT:    s_waitcnt vmcnt(0)
253; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
254; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
255; GFX9-NEXT:    v_mov_b32_e32 v1, v4
256; GFX9-NEXT:    s_setpc_b64 s[30:31]
257;
258; GFX10-LABEL: shuffle_v4f16_35u5:
259; GFX10:       ; %bb.0:
260; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
261; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
262; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
263; GFX10-NEXT:    global_load_dword v4, v[2:3], off
264; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
265; GFX10-NEXT:    s_waitcnt vmcnt(1)
266; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
267; GFX10-NEXT:    s_waitcnt vmcnt(0)
268; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
269; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
270; GFX10-NEXT:    v_mov_b32_e32 v1, v4
271; GFX10-NEXT:    s_setpc_b64 s[30:31]
272;
273; GFX11-LABEL: shuffle_v4f16_35u5:
274; GFX11:       ; %bb.0:
275; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
277; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
278; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
279; GFX11-NEXT:    s_waitcnt vmcnt(1)
280; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
281; GFX11-NEXT:    s_waitcnt vmcnt(0)
282; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
283; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
284; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
285; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
286; GFX11-NEXT:    s_setpc_b64 s[30:31]
287  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
288  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
289  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
290  ret <4 x half> %shuffle
291}
292
; Shuffle mask <3, 5, 7, undef>: lane 0 is element 3 of %val0, lanes 1 and 2 are
; elements 1 and 3 of %val1 (mask indices 5 and 7), lane 3 is undef. The
; expected output loads 32 bits at offset 4 through v[0:1] and 64 bits through
; v[2:3], packs the upper 16 bits of the first two source dwords into v0 with
; v_lshl_or_b32, and forms v1 by shifting the last loaded dword right by 16.
293define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
294; GFX9-LABEL: shuffle_v4f16_357u:
295; GFX9:       ; %bb.0:
296; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
297; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
298; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
299; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
300; GFX9-NEXT:    s_waitcnt vmcnt(1)
301; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
302; GFX9-NEXT:    s_waitcnt vmcnt(0)
303; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
304; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
305; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
306; GFX9-NEXT:    s_setpc_b64 s[30:31]
307;
308; GFX10-LABEL: shuffle_v4f16_357u:
309; GFX10:       ; %bb.0:
310; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
311; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
312; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
313; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
314; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
315; GFX10-NEXT:    s_waitcnt vmcnt(1)
316; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
317; GFX10-NEXT:    s_waitcnt vmcnt(0)
318; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
319; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
320; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
321; GFX10-NEXT:    s_setpc_b64 s[30:31]
322;
323; GFX11-LABEL: shuffle_v4f16_357u:
324; GFX11:       ; %bb.0:
325; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
326; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
327; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
328; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
329; GFX11-NEXT:    s_waitcnt vmcnt(1)
330; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
331; GFX11-NEXT:    s_waitcnt vmcnt(0)
332; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
333; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
334; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
335; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
336; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
337; GFX11-NEXT:    s_setpc_b64 s[30:31]
338  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
339  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
340  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
341  ret <4 x half> %shuffle
342}
343
; Shuffle mask <0, 1, 0, 1>: elements 0-1 of %val0 are duplicated into both
; halves of the result. The expected output is one 32-bit load (no offset)
; through v[0:1] into v0, followed by a copy of v0 into v1.
344define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
345; GFX9-LABEL: shuffle_v4f16_0101:
346; GFX9:       ; %bb.0:
347; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
348; GFX9-NEXT:    global_load_dword v0, v[0:1], off
349; GFX9-NEXT:    s_waitcnt vmcnt(0)
350; GFX9-NEXT:    v_mov_b32_e32 v1, v0
351; GFX9-NEXT:    s_setpc_b64 s[30:31]
352;
353; GFX10-LABEL: shuffle_v4f16_0101:
354; GFX10:       ; %bb.0:
355; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
356; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
357; GFX10-NEXT:    global_load_dword v0, v[0:1], off
358; GFX10-NEXT:    s_waitcnt vmcnt(0)
359; GFX10-NEXT:    v_mov_b32_e32 v1, v0
360; GFX10-NEXT:    s_setpc_b64 s[30:31]
361;
362; GFX11-LABEL: shuffle_v4f16_0101:
363; GFX11:       ; %bb.0:
364; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
365; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
366; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
367; GFX11-NEXT:    s_waitcnt vmcnt(0)
368; GFX11-NEXT:    v_mov_b32_e32 v1, v0
369; GFX11-NEXT:    s_setpc_b64 s[30:31]
370  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
371  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
372  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
373  ret <4 x half> %shuffle
374}
375
; Shuffle mask <0, 1, 2, 3>: identity selection of all four elements of %val0.
; The expected output is a single 64-bit load through v[0:1] into v[0:1] and
; nothing else.
376define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
377; GFX9-LABEL: shuffle_v4f16_0123:
378; GFX9:       ; %bb.0:
379; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
380; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
381; GFX9-NEXT:    s_waitcnt vmcnt(0)
382; GFX9-NEXT:    s_setpc_b64 s[30:31]
383;
384; GFX10-LABEL: shuffle_v4f16_0123:
385; GFX10:       ; %bb.0:
386; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
387; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
388; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
389; GFX10-NEXT:    s_waitcnt vmcnt(0)
390; GFX10-NEXT:    s_setpc_b64 s[30:31]
391;
392; GFX11-LABEL: shuffle_v4f16_0123:
393; GFX11:       ; %bb.0:
394; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
395; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
396; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
397; GFX11-NEXT:    s_waitcnt vmcnt(0)
398; GFX11-NEXT:    s_setpc_b64 s[30:31]
399  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
400  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
401  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
402  ret <4 x half> %shuffle
403}
404
; Shuffle mask <0, 1, 4, 5>: lanes 0-1 are elements 0-1 of %val0 and lanes 2-3
; are elements 0-1 of %val1. The expected output has two 32-bit loads without
; an offset, one through v[0:1] and one through v[2:3]; the gfx900/gfx1010
; output copies them into v0/v1, the gfx1100 output loads into v0/v1 directly.
405define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
406; GFX9-LABEL: shuffle_v4f16_0145:
407; GFX9:       ; %bb.0:
408; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
409; GFX9-NEXT:    global_load_dword v4, v[0:1], off
410; GFX9-NEXT:    global_load_dword v5, v[2:3], off
411; GFX9-NEXT:    s_waitcnt vmcnt(1)
412; GFX9-NEXT:    v_mov_b32_e32 v0, v4
413; GFX9-NEXT:    s_waitcnt vmcnt(0)
414; GFX9-NEXT:    v_mov_b32_e32 v1, v5
415; GFX9-NEXT:    s_setpc_b64 s[30:31]
416;
417; GFX10-LABEL: shuffle_v4f16_0145:
418; GFX10:       ; %bb.0:
419; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
420; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
421; GFX10-NEXT:    global_load_dword v4, v[0:1], off
422; GFX10-NEXT:    global_load_dword v5, v[2:3], off
423; GFX10-NEXT:    s_waitcnt vmcnt(1)
424; GFX10-NEXT:    v_mov_b32_e32 v0, v4
425; GFX10-NEXT:    s_waitcnt vmcnt(0)
426; GFX10-NEXT:    v_mov_b32_e32 v1, v5
427; GFX10-NEXT:    s_setpc_b64 s[30:31]
428;
429; GFX11-LABEL: shuffle_v4f16_0145:
430; GFX11:       ; %bb.0:
431; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
432; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
433; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
434; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
435; GFX11-NEXT:    s_waitcnt vmcnt(0)
436; GFX11-NEXT:    s_setpc_b64 s[30:31]
437  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
438  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
439  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
440  ret <4 x half> %shuffle
441}
442
; Shuffle mask <0, 1, 6, 7>: lanes 0-1 are elements 0-1 of %val0 and lanes 2-3
; are elements 2-3 of %val1. The expected output has a 32-bit load without an
; offset through v[0:1] and a 32-bit load at offset 4 through v[2:3].
443define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
444; GFX9-LABEL: shuffle_v4f16_0167:
445; GFX9:       ; %bb.0:
446; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
447; GFX9-NEXT:    global_load_dword v4, v[0:1], off
448; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
449; GFX9-NEXT:    s_waitcnt vmcnt(1)
450; GFX9-NEXT:    v_mov_b32_e32 v0, v4
451; GFX9-NEXT:    s_waitcnt vmcnt(0)
452; GFX9-NEXT:    v_mov_b32_e32 v1, v5
453; GFX9-NEXT:    s_setpc_b64 s[30:31]
454;
455; GFX10-LABEL: shuffle_v4f16_0167:
456; GFX10:       ; %bb.0:
457; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
459; GFX10-NEXT:    global_load_dword v4, v[0:1], off
460; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
461; GFX10-NEXT:    s_waitcnt vmcnt(1)
462; GFX10-NEXT:    v_mov_b32_e32 v0, v4
463; GFX10-NEXT:    s_waitcnt vmcnt(0)
464; GFX10-NEXT:    v_mov_b32_e32 v1, v5
465; GFX10-NEXT:    s_setpc_b64 s[30:31]
466;
467; GFX11-LABEL: shuffle_v4f16_0167:
468; GFX11:       ; %bb.0:
469; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
471; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
472; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
473; GFX11-NEXT:    s_waitcnt vmcnt(0)
474; GFX11-NEXT:    s_setpc_b64 s[30:31]
475  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
476  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
477  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
478  ret <4 x half> %shuffle
479}
480
; Shuffle mask <2, 3, 0, 1>: the two halves of %val0 are swapped. The expected
; output loads 64 bits through v[0:1] into v[1:2] and then copies v2 into v0,
; so the returned v[0:1] pair holds the loaded dwords in reversed order.
481define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
482; GFX9-LABEL: shuffle_v4f16_2301:
483; GFX9:       ; %bb.0:
484; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
485; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
486; GFX9-NEXT:    s_waitcnt vmcnt(0)
487; GFX9-NEXT:    v_mov_b32_e32 v0, v2
488; GFX9-NEXT:    s_setpc_b64 s[30:31]
489;
490; GFX10-LABEL: shuffle_v4f16_2301:
491; GFX10:       ; %bb.0:
492; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
493; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
494; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
495; GFX10-NEXT:    s_waitcnt vmcnt(0)
496; GFX10-NEXT:    v_mov_b32_e32 v0, v2
497; GFX10-NEXT:    s_setpc_b64 s[30:31]
498;
499; GFX11-LABEL: shuffle_v4f16_2301:
500; GFX11:       ; %bb.0:
501; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
502; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
503; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
504; GFX11-NEXT:    s_waitcnt vmcnt(0)
505; GFX11-NEXT:    v_mov_b32_e32 v0, v2
506; GFX11-NEXT:    s_setpc_b64 s[30:31]
507  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
508  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
509  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
510  ret <4 x half> %shuffle
511}
512
; Shuffle mask <2, 3, 2, 3>: elements 2-3 of %val0 are duplicated into both
; halves of the result. The expected output is one 32-bit load at offset 4
; through v[0:1] into v0, followed by a copy of v0 into v1.
513define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
514; GFX9-LABEL: shuffle_v4f16_2323:
515; GFX9:       ; %bb.0:
516; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
517; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
518; GFX9-NEXT:    s_waitcnt vmcnt(0)
519; GFX9-NEXT:    v_mov_b32_e32 v1, v0
520; GFX9-NEXT:    s_setpc_b64 s[30:31]
521;
522; GFX10-LABEL: shuffle_v4f16_2323:
523; GFX10:       ; %bb.0:
524; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
525; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
526; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
527; GFX10-NEXT:    s_waitcnt vmcnt(0)
528; GFX10-NEXT:    v_mov_b32_e32 v1, v0
529; GFX10-NEXT:    s_setpc_b64 s[30:31]
530;
531; GFX11-LABEL: shuffle_v4f16_2323:
532; GFX11:       ; %bb.0:
533; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
535; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
536; GFX11-NEXT:    s_waitcnt vmcnt(0)
537; GFX11-NEXT:    v_mov_b32_e32 v1, v0
538; GFX11-NEXT:    s_setpc_b64 s[30:31]
539  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
540  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
541  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
542  ret <4 x half> %shuffle
543}
544
; Shuffle mask <2, 3, 4, 5>: lanes 0-1 are elements 2-3 of %val0 and lanes 2-3
; are elements 0-1 of %val1. The expected output has a 32-bit load at offset 4
; through v[0:1] and a 32-bit load without an offset through v[2:3].
545define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
546; GFX9-LABEL: shuffle_v4f16_2345:
547; GFX9:       ; %bb.0:
548; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
549; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
550; GFX9-NEXT:    global_load_dword v5, v[2:3], off
551; GFX9-NEXT:    s_waitcnt vmcnt(1)
552; GFX9-NEXT:    v_mov_b32_e32 v0, v4
553; GFX9-NEXT:    s_waitcnt vmcnt(0)
554; GFX9-NEXT:    v_mov_b32_e32 v1, v5
555; GFX9-NEXT:    s_setpc_b64 s[30:31]
556;
557; GFX10-LABEL: shuffle_v4f16_2345:
558; GFX10:       ; %bb.0:
559; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
560; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
561; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
562; GFX10-NEXT:    global_load_dword v5, v[2:3], off
563; GFX10-NEXT:    s_waitcnt vmcnt(1)
564; GFX10-NEXT:    v_mov_b32_e32 v0, v4
565; GFX10-NEXT:    s_waitcnt vmcnt(0)
566; GFX10-NEXT:    v_mov_b32_e32 v1, v5
567; GFX10-NEXT:    s_setpc_b64 s[30:31]
568;
569; GFX11-LABEL: shuffle_v4f16_2345:
570; GFX11:       ; %bb.0:
571; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
572; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
573; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
574; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
575; GFX11-NEXT:    s_waitcnt vmcnt(0)
576; GFX11-NEXT:    s_setpc_b64 s[30:31]
577  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
578  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
579  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
580  ret <4 x half> %shuffle
581}
582
; Shuffle mask <2, 3, 6, 7>: lanes 0-1 are elements 2-3 of %val0 and lanes 2-3
; are elements 2-3 of %val1. The expected output has two 32-bit loads, both at
; offset 4, one through v[0:1] and one through v[2:3].
583define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
584; GFX9-LABEL: shuffle_v4f16_2367:
585; GFX9:       ; %bb.0:
586; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
587; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
588; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
589; GFX9-NEXT:    s_waitcnt vmcnt(1)
590; GFX9-NEXT:    v_mov_b32_e32 v0, v4
591; GFX9-NEXT:    s_waitcnt vmcnt(0)
592; GFX9-NEXT:    v_mov_b32_e32 v1, v5
593; GFX9-NEXT:    s_setpc_b64 s[30:31]
594;
595; GFX10-LABEL: shuffle_v4f16_2367:
596; GFX10:       ; %bb.0:
597; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
599; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
600; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
601; GFX10-NEXT:    s_waitcnt vmcnt(1)
602; GFX10-NEXT:    v_mov_b32_e32 v0, v4
603; GFX10-NEXT:    s_waitcnt vmcnt(0)
604; GFX10-NEXT:    v_mov_b32_e32 v1, v5
605; GFX10-NEXT:    s_setpc_b64 s[30:31]
606;
607; GFX11-LABEL: shuffle_v4f16_2367:
608; GFX11:       ; %bb.0:
609; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
610; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
611; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
612; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
613; GFX11-NEXT:    s_waitcnt vmcnt(0)
614; GFX11-NEXT:    s_setpc_b64 s[30:31]
615  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
616  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
617  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
618  ret <4 x half> %shuffle
619}
620
; Shuffle mask <4, 5, 0, 1>: lanes 0-1 are elements 0-1 of %val1 and lanes 2-3
; are elements 0-1 of %val0. The expected output has two 32-bit loads without
; an offset, the first through v[2:3] (ending up in v0) and the second through
; v[0:1] (ending up in v1).
621define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
622; GFX9-LABEL: shuffle_v4f16_4501:
623; GFX9:       ; %bb.0:
624; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
625; GFX9-NEXT:    global_load_dword v4, v[2:3], off
626; GFX9-NEXT:    global_load_dword v5, v[0:1], off
627; GFX9-NEXT:    s_waitcnt vmcnt(1)
628; GFX9-NEXT:    v_mov_b32_e32 v0, v4
629; GFX9-NEXT:    s_waitcnt vmcnt(0)
630; GFX9-NEXT:    v_mov_b32_e32 v1, v5
631; GFX9-NEXT:    s_setpc_b64 s[30:31]
632;
633; GFX10-LABEL: shuffle_v4f16_4501:
634; GFX10:       ; %bb.0:
635; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
636; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
637; GFX10-NEXT:    global_load_dword v4, v[2:3], off
638; GFX10-NEXT:    global_load_dword v5, v[0:1], off
639; GFX10-NEXT:    s_waitcnt vmcnt(1)
640; GFX10-NEXT:    v_mov_b32_e32 v0, v4
641; GFX10-NEXT:    s_waitcnt vmcnt(0)
642; GFX10-NEXT:    v_mov_b32_e32 v1, v5
643; GFX10-NEXT:    s_setpc_b64 s[30:31]
644;
645; GFX11-LABEL: shuffle_v4f16_4501:
646; GFX11:       ; %bb.0:
647; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
648; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
649; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
650; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
651; GFX11-NEXT:    s_waitcnt vmcnt(1)
652; GFX11-NEXT:    v_mov_b32_e32 v0, v2
653; GFX11-NEXT:    s_waitcnt vmcnt(0)
654; GFX11-NEXT:    s_setpc_b64 s[30:31]
655  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
656  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
657  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
658  ret <4 x half> %shuffle
659}
660
; Shuffle mask <4, 5, 2, 3>: lanes 0-1 are elements 0-1 of %val1 and lanes 2-3
; are elements 2-3 of %val0. The expected output has a 32-bit load without an
; offset through v[2:3] (ending up in v0) and a 32-bit load at offset 4 through
; v[0:1] (ending up in v1).
661define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
662; GFX9-LABEL: shuffle_v4f16_4523:
663; GFX9:       ; %bb.0:
664; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
665; GFX9-NEXT:    global_load_dword v4, v[2:3], off
666; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
667; GFX9-NEXT:    s_waitcnt vmcnt(1)
668; GFX9-NEXT:    v_mov_b32_e32 v0, v4
669; GFX9-NEXT:    s_waitcnt vmcnt(0)
670; GFX9-NEXT:    v_mov_b32_e32 v1, v5
671; GFX9-NEXT:    s_setpc_b64 s[30:31]
672;
673; GFX10-LABEL: shuffle_v4f16_4523:
674; GFX10:       ; %bb.0:
675; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
676; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
677; GFX10-NEXT:    global_load_dword v4, v[2:3], off
678; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
679; GFX10-NEXT:    s_waitcnt vmcnt(1)
680; GFX10-NEXT:    v_mov_b32_e32 v0, v4
681; GFX10-NEXT:    s_waitcnt vmcnt(0)
682; GFX10-NEXT:    v_mov_b32_e32 v1, v5
683; GFX10-NEXT:    s_setpc_b64 s[30:31]
684;
685; GFX11-LABEL: shuffle_v4f16_4523:
686; GFX11:       ; %bb.0:
687; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
688; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
689; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
690; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
691; GFX11-NEXT:    s_waitcnt vmcnt(1)
692; GFX11-NEXT:    v_mov_b32_e32 v0, v2
693; GFX11-NEXT:    s_waitcnt vmcnt(0)
694; GFX11-NEXT:    s_setpc_b64 s[30:31]
695  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
696  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
697  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
698  ret <4 x half> %shuffle
699}
700
; Shuffle mask <4, 5, 4, 5>: elements 0-1 of %val1 are duplicated into both
; halves of the result. The expected output is one 32-bit load without an
; offset through v[2:3] into v0, followed by a copy of v0 into v1.
701define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
702; GFX9-LABEL: shuffle_v4f16_4545:
703; GFX9:       ; %bb.0:
704; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
705; GFX9-NEXT:    global_load_dword v0, v[2:3], off
706; GFX9-NEXT:    s_waitcnt vmcnt(0)
707; GFX9-NEXT:    v_mov_b32_e32 v1, v0
708; GFX9-NEXT:    s_setpc_b64 s[30:31]
709;
710; GFX10-LABEL: shuffle_v4f16_4545:
711; GFX10:       ; %bb.0:
712; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
713; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
714; GFX10-NEXT:    global_load_dword v0, v[2:3], off
715; GFX10-NEXT:    s_waitcnt vmcnt(0)
716; GFX10-NEXT:    v_mov_b32_e32 v1, v0
717; GFX10-NEXT:    s_setpc_b64 s[30:31]
718;
719; GFX11-LABEL: shuffle_v4f16_4545:
720; GFX11:       ; %bb.0:
721; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
722; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
723; GFX11-NEXT:    global_load_b32 v0, v[2:3], off
724; GFX11-NEXT:    s_waitcnt vmcnt(0)
725; GFX11-NEXT:    v_mov_b32_e32 v1, v0
726; GFX11-NEXT:    s_setpc_b64 s[30:31]
727  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
728  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
729  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
730  ret <4 x half> %shuffle
731}
732
; Mask <4,5,6,7> selects all four elements of %val1 in order, i.e. the result
; is %val1 itself and %val0 is never referenced. The checks expect only one
; 64-bit load followed directly by the return.
733define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
734; GFX9-LABEL: shuffle_v4f16_4567:
735; GFX9:       ; %bb.0:
736; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
737; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
738; GFX9-NEXT:    s_waitcnt vmcnt(0)
739; GFX9-NEXT:    s_setpc_b64 s[30:31]
740;
741; GFX10-LABEL: shuffle_v4f16_4567:
742; GFX10:       ; %bb.0:
743; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
744; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
745; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
746; GFX10-NEXT:    s_waitcnt vmcnt(0)
747; GFX10-NEXT:    s_setpc_b64 s[30:31]
748;
749; GFX11-LABEL: shuffle_v4f16_4567:
750; GFX11:       ; %bb.0:
751; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
753; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
754; GFX11-NEXT:    s_waitcnt vmcnt(0)
755; GFX11-NEXT:    s_setpc_b64 s[30:31]
756  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
757  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
758  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
759  ret <4 x half> %shuffle
760}
761
; Mask <6,7,0,1>: result[0..1] = val1[2..3] and result[2..3] = val0[0..1].
; Each result half is a whole 32-bit word of its source, so the checks expect
; two 32-bit loads and, at most, register moves (no shift or mask
; instructions).
762define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
763; GFX9-LABEL: shuffle_v4f16_6701:
764; GFX9:       ; %bb.0:
765; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
766; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
767; GFX9-NEXT:    global_load_dword v5, v[0:1], off
768; GFX9-NEXT:    s_waitcnt vmcnt(1)
769; GFX9-NEXT:    v_mov_b32_e32 v0, v4
770; GFX9-NEXT:    s_waitcnt vmcnt(0)
771; GFX9-NEXT:    v_mov_b32_e32 v1, v5
772; GFX9-NEXT:    s_setpc_b64 s[30:31]
773;
774; GFX10-LABEL: shuffle_v4f16_6701:
775; GFX10:       ; %bb.0:
776; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
777; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
778; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
779; GFX10-NEXT:    global_load_dword v5, v[0:1], off
780; GFX10-NEXT:    s_waitcnt vmcnt(1)
781; GFX10-NEXT:    v_mov_b32_e32 v0, v4
782; GFX10-NEXT:    s_waitcnt vmcnt(0)
783; GFX10-NEXT:    v_mov_b32_e32 v1, v5
784; GFX10-NEXT:    s_setpc_b64 s[30:31]
785;
786; GFX11-LABEL: shuffle_v4f16_6701:
787; GFX11:       ; %bb.0:
788; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
789; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
790; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
791; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
792; GFX11-NEXT:    s_waitcnt vmcnt(1)
793; GFX11-NEXT:    v_mov_b32_e32 v0, v2
794; GFX11-NEXT:    s_waitcnt vmcnt(0)
795; GFX11-NEXT:    s_setpc_b64 s[30:31]
796  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
797  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
798  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
799  ret <4 x half> %shuffle
800}
801
; Mask <6,7,2,3>: result[0..1] = val1[2..3] and result[2..3] = val0[2..3],
; i.e. the upper 32-bit word of each input. The checks expect two 32-bit loads
; at offset 4 and, at most, register moves (no shift or mask instructions).
802define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
803; GFX9-LABEL: shuffle_v4f16_6723:
804; GFX9:       ; %bb.0:
805; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
806; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
807; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
808; GFX9-NEXT:    s_waitcnt vmcnt(1)
809; GFX9-NEXT:    v_mov_b32_e32 v0, v4
810; GFX9-NEXT:    s_waitcnt vmcnt(0)
811; GFX9-NEXT:    v_mov_b32_e32 v1, v5
812; GFX9-NEXT:    s_setpc_b64 s[30:31]
813;
814; GFX10-LABEL: shuffle_v4f16_6723:
815; GFX10:       ; %bb.0:
816; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
817; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
818; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
819; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
820; GFX10-NEXT:    s_waitcnt vmcnt(1)
821; GFX10-NEXT:    v_mov_b32_e32 v0, v4
822; GFX10-NEXT:    s_waitcnt vmcnt(0)
823; GFX10-NEXT:    v_mov_b32_e32 v1, v5
824; GFX10-NEXT:    s_setpc_b64 s[30:31]
825;
826; GFX11-LABEL: shuffle_v4f16_6723:
827; GFX11:       ; %bb.0:
828; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
829; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
830; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
831; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
832; GFX11-NEXT:    s_waitcnt vmcnt(1)
833; GFX11-NEXT:    v_mov_b32_e32 v0, v2
834; GFX11-NEXT:    s_waitcnt vmcnt(0)
835; GFX11-NEXT:    s_setpc_b64 s[30:31]
836  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
837  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
838  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
839  ret <4 x half> %shuffle
840}
841
; Mask <6,7,4,5> swaps the two 32-bit words of %val1 (result[0..1] =
; val1[2..3], result[2..3] = val1[0..1]); %val0 is never referenced. The checks
; expect one 64-bit load into v[1:2] followed by a move of v2 into v0.
842define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
843; GFX9-LABEL: shuffle_v4f16_6745:
844; GFX9:       ; %bb.0:
845; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
846; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
847; GFX9-NEXT:    s_waitcnt vmcnt(0)
848; GFX9-NEXT:    v_mov_b32_e32 v0, v2
849; GFX9-NEXT:    s_setpc_b64 s[30:31]
850;
851; GFX10-LABEL: shuffle_v4f16_6745:
852; GFX10:       ; %bb.0:
853; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
854; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
855; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
856; GFX10-NEXT:    s_waitcnt vmcnt(0)
857; GFX10-NEXT:    v_mov_b32_e32 v0, v2
858; GFX10-NEXT:    s_setpc_b64 s[30:31]
859;
860; GFX11-LABEL: shuffle_v4f16_6745:
861; GFX11:       ; %bb.0:
862; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
863; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
864; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
865; GFX11-NEXT:    s_waitcnt vmcnt(0)
866; GFX11-NEXT:    v_mov_b32_e32 v0, v2
867; GFX11-NEXT:    s_setpc_b64 s[30:31]
868  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
869  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
870  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
871  ret <4 x half> %shuffle
872}
873
; Mask <6,7,6,7>: the result is val1[2..3] repeated in both halves; %val0 is
; never referenced. The checks expect a single 32-bit load at offset 4 whose
; result in v0 is then copied into v1.
874define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
875; GFX9-LABEL: shuffle_v4f16_6767:
876; GFX9:       ; %bb.0:
877; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
878; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
879; GFX9-NEXT:    s_waitcnt vmcnt(0)
880; GFX9-NEXT:    v_mov_b32_e32 v1, v0
881; GFX9-NEXT:    s_setpc_b64 s[30:31]
882;
883; GFX10-LABEL: shuffle_v4f16_6767:
884; GFX10:       ; %bb.0:
885; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
886; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
887; GFX10-NEXT:    global_load_dword v0, v[2:3], off offset:4
888; GFX10-NEXT:    s_waitcnt vmcnt(0)
889; GFX10-NEXT:    v_mov_b32_e32 v1, v0
890; GFX10-NEXT:    s_setpc_b64 s[30:31]
891;
892; GFX11-LABEL: shuffle_v4f16_6767:
893; GFX11:       ; %bb.0:
894; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
895; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
896; GFX11-NEXT:    global_load_b32 v0, v[2:3], off offset:4
897; GFX11-NEXT:    s_waitcnt vmcnt(0)
898; GFX11-NEXT:    v_mov_b32_e32 v1, v0
899; GFX11-NEXT:    s_setpc_b64 s[30:31]
900  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
901  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
902  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
903  ret <4 x half> %shuffle
904}
905
; Mask <2,3,5,6>: result[0..1] = val0[2..3] is a whole 32-bit word, but
; result[2..3] = val1[1], val1[2] straddles the two 32-bit words of %val1. The
; checks therefore expect the upper 16 bits of the first %val1 word to be
; extracted and merged with the second word via v_lshl_or_b32.
906define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
907; GFX9-LABEL: shuffle_v4f16_2356:
908; GFX9:       ; %bb.0:
909; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
910; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
911; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
912; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
913; GFX9-NEXT:    s_waitcnt vmcnt(1)
914; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
915; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
916; GFX9-NEXT:    s_waitcnt vmcnt(0)
917; GFX9-NEXT:    v_mov_b32_e32 v0, v4
918; GFX9-NEXT:    s_setpc_b64 s[30:31]
919;
920; GFX10-LABEL: shuffle_v4f16_2356:
921; GFX10:       ; %bb.0:
922; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
923; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
924; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
925; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
926; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
927; GFX10-NEXT:    s_waitcnt vmcnt(1)
928; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
929; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
930; GFX10-NEXT:    s_waitcnt vmcnt(0)
931; GFX10-NEXT:    v_mov_b32_e32 v0, v4
932; GFX10-NEXT:    s_setpc_b64 s[30:31]
933;
934; GFX11-LABEL: shuffle_v4f16_2356:
935; GFX11:       ; %bb.0:
936; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
937; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
938; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
939; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
940; GFX11-NEXT:    s_waitcnt vmcnt(1)
941; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
942; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
943; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
944; GFX11-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
945; GFX11-NEXT:    s_waitcnt vmcnt(0)
946; GFX11-NEXT:    s_setpc_b64 s[30:31]
947  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
948  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
949  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
950  ret <4 x half> %shuffle
951}
952
; Mask <5,6,2,3>: result[0..1] = val1[1], val1[2] straddles the two 32-bit
; words of %val1 and is assembled with an extract of the upper 16 bits plus
; v_lshl_or_b32; result[2..3] = val0[2..3] is a whole 32-bit word loaded from
; offset 4.
953define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
954; GFX9-LABEL: shuffle_v4f16_5623:
955; GFX9:       ; %bb.0:
956; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
957; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
958; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
959; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
960; GFX9-NEXT:    s_waitcnt vmcnt(1)
961; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
962; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
963; GFX9-NEXT:    s_waitcnt vmcnt(0)
964; GFX9-NEXT:    v_mov_b32_e32 v1, v4
965; GFX9-NEXT:    s_setpc_b64 s[30:31]
966;
967; GFX10-LABEL: shuffle_v4f16_5623:
968; GFX10:       ; %bb.0:
969; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
970; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
971; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
972; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
973; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
974; GFX10-NEXT:    s_waitcnt vmcnt(1)
975; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
976; GFX10-NEXT:    s_waitcnt vmcnt(0)
977; GFX10-NEXT:    v_mov_b32_e32 v1, v4
978; GFX10-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
979; GFX10-NEXT:    s_setpc_b64 s[30:31]
980;
981; GFX11-LABEL: shuffle_v4f16_5623:
982; GFX11:       ; %bb.0:
983; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
984; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
985; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
986; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
987; GFX11-NEXT:    s_waitcnt vmcnt(1)
988; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
989; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
990; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
991; GFX11-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
992; GFX11-NEXT:    s_waitcnt vmcnt(0)
993; GFX11-NEXT:    s_setpc_b64 s[30:31]
994  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
995  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
996  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
997  ret <4 x half> %shuffle
998}
999
; Mask <3,4,5,6>: result[0..1] = val0[3], val1[0] and result[2..3] = val1[1],
; val1[2]. Neither result word matches a whole 32-bit source word, so the
; checks expect both to be built from an extracted upper half combined with
; the next word via v_lshl_or_b32.
1000define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1001; GFX9-LABEL: shuffle_v4f16_3456:
1002; GFX9:       ; %bb.0:
1003; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1004; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
1005; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1006; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
1007; GFX9-NEXT:    s_waitcnt vmcnt(1)
1008; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1009; GFX9-NEXT:    s_waitcnt vmcnt(0)
1010; GFX9-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1011; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v1
1012; GFX9-NEXT:    v_lshl_or_b32 v1, v5, 16, v2
1013; GFX9-NEXT:    s_setpc_b64 s[30:31]
1014;
1015; GFX10-LABEL: shuffle_v4f16_3456:
1016; GFX10:       ; %bb.0:
1017; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1018; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1019; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
1020; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1021; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
1022; GFX10-NEXT:    s_waitcnt vmcnt(1)
1023; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1024; GFX10-NEXT:    s_waitcnt vmcnt(0)
1025; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1026; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v1
1027; GFX10-NEXT:    v_lshl_or_b32 v1, v5, 16, v2
1028; GFX10-NEXT:    s_setpc_b64 s[30:31]
1029;
1030; GFX11-LABEL: shuffle_v4f16_3456:
1031; GFX11:       ; %bb.0:
1032; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1033; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1034; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
1035; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
1036; GFX11-NEXT:    s_waitcnt vmcnt(1)
1037; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
1038; GFX11-NEXT:    s_waitcnt vmcnt(0)
1039; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1040; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1041; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1042; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1043; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1044; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
1045; GFX11-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
1046; GFX11-NEXT:    s_setpc_b64 s[30:31]
1047  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1048  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1049  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1050  ret <4 x half> %shuffle
1051}
1052
; Mask <5,6,3,4>: result[0..1] = val1[1], val1[2] and result[2..3] = val0[3],
; val1[0]. These are the same two straddling pairs as the <3,4,5,6> mask, but
; placed in the opposite result words; both are assembled with
; v_lshl_or_b32.
1053define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1054; GFX9-LABEL: shuffle_v4f16_5634:
1055; GFX9:       ; %bb.0:
1056; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1057; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
1058; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1059; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
1060; GFX9-NEXT:    s_waitcnt vmcnt(1)
1061; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1062; GFX9-NEXT:    s_waitcnt vmcnt(0)
1063; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1064; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
1065; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
1066; GFX9-NEXT:    s_setpc_b64 s[30:31]
1067;
1068; GFX10-LABEL: shuffle_v4f16_5634:
1069; GFX10:       ; %bb.0:
1070; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1071; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1072; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1073; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
1074; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
1075; GFX10-NEXT:    s_waitcnt vmcnt(1)
1076; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1077; GFX10-NEXT:    s_waitcnt vmcnt(0)
1078; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1079; GFX10-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
1080; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v2
1081; GFX10-NEXT:    s_setpc_b64 s[30:31]
1082;
1083; GFX11-LABEL: shuffle_v4f16_5634:
1084; GFX11:       ; %bb.0:
1085; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1086; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1087; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
1088; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
1089; GFX11-NEXT:    s_waitcnt vmcnt(1)
1090; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1091; GFX11-NEXT:    s_waitcnt vmcnt(0)
1092; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1093; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1094; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1095; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v0
1096; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1097; GFX11-NEXT:    v_lshl_or_b32 v0, v3, 16, v1
1098; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 16, v4
1099; GFX11-NEXT:    s_setpc_b64 s[30:31]
1100  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1101  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1102  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
1103  ret <4 x half> %shuffle
1104}
1105
; Mask <5,7,3,4>: result[0..1] = val1[1], val1[3] (the upper 16 bits of each
; %val1 word) and result[2..3] = val0[3], val1[0]. Compared with the <5,6,3,4>
; mask, the checks expect one extra 16-bit right shift to extract val1[3].
1106define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1107; GFX9-LABEL: shuffle_v4f16_5734:
1108; GFX9:       ; %bb.0:
1109; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1110; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
1111; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1112; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
1113; GFX9-NEXT:    s_waitcnt vmcnt(1)
1114; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1115; GFX9-NEXT:    s_waitcnt vmcnt(0)
1116; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1117; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
1118; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
1119; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
1120; GFX9-NEXT:    s_setpc_b64 s[30:31]
1121;
1122; GFX10-LABEL: shuffle_v4f16_5734:
1123; GFX10:       ; %bb.0:
1124; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1125; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1126; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1127; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
1128; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
1129; GFX10-NEXT:    s_waitcnt vmcnt(1)
1130; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1131; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
1132; GFX10-NEXT:    s_waitcnt vmcnt(0)
1133; GFX10-NEXT:    v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1134; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
1135; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
1136; GFX10-NEXT:    s_setpc_b64 s[30:31]
1137;
1138; GFX11-LABEL: shuffle_v4f16_5734:
1139; GFX11:       ; %bb.0:
1140; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1141; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1142; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
1143; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
1144; GFX11-NEXT:    s_waitcnt vmcnt(1)
1145; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1146; GFX11-NEXT:    s_waitcnt vmcnt(0)
1147; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1148; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1149; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1150; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1151; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v0
1152; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1153; GFX11-NEXT:    v_lshl_or_b32 v0, v3, 16, v1
1154; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 16, v4
1155; GFX11-NEXT:    s_setpc_b64 s[30:31]
1156  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1157  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1158  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
1159  ret <4 x half> %shuffle
1160}
1161
; Integer (<4 x i16>) counterpart of the <2,3,5,6> half-precision test:
; result[0..1] = val0[2..3] is a whole 32-bit word, while result[2..3] =
; val1[1], val1[2] straddles the two words of %val1 and is assembled with a
; 16-bit extract plus v_lshl_or_b32.
1162define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
1163; GFX9-LABEL: shuffle_v4i16_2356:
1164; GFX9:       ; %bb.0:
1165; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1166; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
1167; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
1168; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
1169; GFX9-NEXT:    s_waitcnt vmcnt(1)
1170; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1171; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
1172; GFX9-NEXT:    s_waitcnt vmcnt(0)
1173; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1174; GFX9-NEXT:    s_setpc_b64 s[30:31]
1175;
1176; GFX10-LABEL: shuffle_v4i16_2356:
1177; GFX10:       ; %bb.0:
1178; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1179; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1180; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
1181; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
1182; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
1183; GFX10-NEXT:    s_waitcnt vmcnt(1)
1184; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1185; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
1186; GFX10-NEXT:    s_waitcnt vmcnt(0)
1187; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1188; GFX10-NEXT:    s_setpc_b64 s[30:31]
1189;
1190; GFX11-LABEL: shuffle_v4i16_2356:
1191; GFX11:       ; %bb.0:
1192; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1193; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1194; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
1195; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
1196; GFX11-NEXT:    s_waitcnt vmcnt(1)
1197; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1198; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1199; GFX11-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
1200; GFX11-NEXT:    s_waitcnt vmcnt(0)
1201; GFX11-NEXT:    s_setpc_b64 s[30:31]
1202  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
1203  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
1204  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
1205  ret <4 x i16> %shuffle
1206}
1207
; Integer (<4 x i16>) test with mask <0,1,6,7>: result[0..1] = val0[0..1] and
; result[2..3] = val1[2..3]. Both halves are whole 32-bit source words, so the
; checks expect two 32-bit loads and no shift or mask instructions.
1208define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
1209; GFX9-LABEL: shuffle_v4i16_0167:
1210; GFX9:       ; %bb.0:
1211; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1212; GFX9-NEXT:    global_load_dword v4, v[0:1], off
1213; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
1214; GFX9-NEXT:    s_waitcnt vmcnt(1)
1215; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1216; GFX9-NEXT:    s_waitcnt vmcnt(0)
1217; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1218; GFX9-NEXT:    s_setpc_b64 s[30:31]
1219;
1220; GFX10-LABEL: shuffle_v4i16_0167:
1221; GFX10:       ; %bb.0:
1222; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1223; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1224; GFX10-NEXT:    global_load_dword v4, v[0:1], off
1225; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
1226; GFX10-NEXT:    s_waitcnt vmcnt(1)
1227; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1228; GFX10-NEXT:    s_waitcnt vmcnt(0)
1229; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1230; GFX10-NEXT:    s_setpc_b64 s[30:31]
1231;
1232; GFX11-LABEL: shuffle_v4i16_0167:
1233; GFX11:       ; %bb.0:
1234; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1235; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1236; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
1237; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
1238; GFX11-NEXT:    s_waitcnt vmcnt(0)
1239; GFX11-NEXT:    s_setpc_b64 s[30:31]
1240  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
1241  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
1242  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
1243  ret <4 x i16> %shuffle
1244}
1245
; zeroinitializer mask (<0,0,0,0>): every result element is val0[0], i.e. a
; splat of the first element; %val1 is never referenced. The checks expect the
; low 16 bits to be duplicated into both halves of one word (and + lshl_or),
; which is then copied from v0 into v1.
1246define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1247; GFX9-LABEL: shuffle_v4f16_0000:
1248; GFX9:       ; %bb.0:
1249; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1250; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1251; GFX9-NEXT:    s_waitcnt vmcnt(0)
1252; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
1253; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1254; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1255; GFX9-NEXT:    s_setpc_b64 s[30:31]
1256;
1257; GFX10-LABEL: shuffle_v4f16_0000:
1258; GFX10:       ; %bb.0:
1259; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1260; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1261; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1262; GFX10-NEXT:    s_waitcnt vmcnt(0)
1263; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v0
1264; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1265; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1266; GFX10-NEXT:    s_setpc_b64 s[30:31]
1267;
1268; GFX11-LABEL: shuffle_v4f16_0000:
1269; GFX11:       ; %bb.0:
1270; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1271; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1272; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1273; GFX11-NEXT:    s_waitcnt vmcnt(0)
1274; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v0
1275; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1276; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1277; GFX11-NEXT:    v_mov_b32_e32 v1, v0
1278; GFX11-NEXT:    s_setpc_b64 s[30:31]
1279  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1280  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1281  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
1282  ret <4 x half> %shuffle
1283}
1284
; Mask <1,0,1,0>: each result word is val0[1] in the low half and val0[0] in
; the high half, i.e. the first word of %val0 with its halves swapped, repeated
; twice; %val1 is never referenced. The checks expect the swapped word to be
; built once and then copied from v0 into v1.
1285define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1286; GFX9-LABEL: shuffle_v4f16_1010:
1287; GFX9:       ; %bb.0:
1288; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1289; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1290; GFX9-NEXT:    s_waitcnt vmcnt(0)
1291; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
1292; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1293; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1294; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1295; GFX9-NEXT:    s_setpc_b64 s[30:31]
1296;
1297; GFX10-LABEL: shuffle_v4f16_1010:
1298; GFX10:       ; %bb.0:
1299; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1300; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1301; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1302; GFX10-NEXT:    s_waitcnt vmcnt(0)
1303; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffff
1304; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1305; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1306; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1307; GFX10-NEXT:    s_setpc_b64 s[30:31]
1308;
1309; GFX11-LABEL: shuffle_v4f16_1010:
1310; GFX11:       ; %bb.0:
1311; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1312; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1313; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1314; GFX11-NEXT:    s_waitcnt vmcnt(0)
1315; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1316; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1317; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1318; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1319; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1320; GFX11-NEXT:    v_mov_b32_e32 v1, v0
1321; GFX11-NEXT:    s_setpc_b64 s[30:31]
1322  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1323  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1324  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
1325  ret <4 x half> %shuffle
1326}
1327
; Mask <1,1,0,0>: result[0..1] is a splat of val0[1] and result[2..3] is a
; splat of val0[0]; %val1 is never referenced. The checks expect each result
; word to be built separately with a 16-bit mask plus v_lshl_or_b32.
1328define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1329; GFX9-LABEL: shuffle_v4f16_1100:
1330; GFX9:       ; %bb.0:
1331; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1332; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1333; GFX9-NEXT:    s_waitcnt vmcnt(0)
1334; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
1335; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1336; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
1337; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
1338; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
1339; GFX9-NEXT:    s_setpc_b64 s[30:31]
1340;
1341; GFX10-LABEL: shuffle_v4f16_1100:
1342; GFX10:       ; %bb.0:
1343; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1344; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1345; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
1346; GFX10-NEXT:    s_waitcnt vmcnt(0)
1347; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1348; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v1
1349; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v0
1350; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
1351; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
1352; GFX10-NEXT:    s_setpc_b64 s[30:31]
1353;
1354; GFX11-LABEL: shuffle_v4f16_1100:
1355; GFX11:       ; %bb.0:
1356; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1357; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1358; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
1359; GFX11-NEXT:    s_waitcnt vmcnt(0)
1360; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1361; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v1
1362; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1363; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
1364; GFX11-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
1365; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1366; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
1367; GFX11-NEXT:    s_setpc_b64 s[30:31]
1368  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1369  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1370  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
1371  ret <4 x half> %shuffle
1372}
1373
; Mask <6,1,6,1>: each result word is val1[2] in the low half and val0[1] in
; the high half, repeated twice, so elements from both inputs are interleaved
; within one word. The checks expect that word to be built once with
; v_lshl_or_b32 and then copied from v0 into v1.
1374define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1375; GFX9-LABEL: shuffle_v4f16_6161:
1376; GFX9:       ; %bb.0:
1377; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1378; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
1379; GFX9-NEXT:    global_load_dword v5, v[0:1], off
1380; GFX9-NEXT:    s_waitcnt vmcnt(1)
1381; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
1382; GFX9-NEXT:    s_waitcnt vmcnt(0)
1383; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
1384; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1385; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1386; GFX9-NEXT:    s_setpc_b64 s[30:31]
1387;
1388; GFX10-LABEL: shuffle_v4f16_6161:
1389; GFX10:       ; %bb.0:
1390; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1391; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1392; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
1393; GFX10-NEXT:    global_load_dword v5, v[0:1], off
1394; GFX10-NEXT:    s_waitcnt vmcnt(1)
1395; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
1396; GFX10-NEXT:    s_waitcnt vmcnt(0)
1397; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
1398; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1399; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1400; GFX10-NEXT:    s_setpc_b64 s[30:31]
1401;
1402; GFX11-LABEL: shuffle_v4f16_6161:
1403; GFX11:       ; %bb.0:
1404; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1405; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1406; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
1407; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
1408; GFX11-NEXT:    s_waitcnt vmcnt(1)
1409; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v2
1410; GFX11-NEXT:    s_waitcnt vmcnt(0)
1411; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1412; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1413; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1414; GFX11-NEXT:    v_mov_b32_e32 v1, v0
1415; GFX11-NEXT:    s_setpc_b64 s[30:31]
1416  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1417  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1418  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
1419  ret <4 x half> %shuffle
1420}
1421
1422define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; Mask <2, 3, 3, 3> reads only %val0 (elements 2, 3, 3, 3). The recorded
; output contains a single dword load at offset 4 from %arg0; the load of
; %arg1 does not appear.
1423; GFX9-LABEL: shuffle_v4f16_2333:
1424; GFX9:       ; %bb.0:
1425; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1426; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
1427; GFX9-NEXT:    s_waitcnt vmcnt(0)
1428; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1429; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1430; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1431; GFX9-NEXT:    s_setpc_b64 s[30:31]
1432;
1433; GFX10-LABEL: shuffle_v4f16_2333:
1434; GFX10:       ; %bb.0:
1435; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1436; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1437; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
1438; GFX10-NEXT:    s_waitcnt vmcnt(0)
1439; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1440; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1441; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1442; GFX10-NEXT:    s_setpc_b64 s[30:31]
1443;
1444; GFX11-LABEL: shuffle_v4f16_2333:
1445; GFX11:       ; %bb.0:
1446; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1447; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1448; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
1449; GFX11-NEXT:    s_waitcnt vmcnt(0)
1450; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1451; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1452; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1453; GFX11-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1454; GFX11-NEXT:    s_setpc_b64 s[30:31]
1455  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1456  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1457  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
1458  ret <4 x half> %shuffle
1459}
1460
1461define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; NOTE(review): the function name implies mask <6, 6, 6, 7> (elements 2, 2,
; 2, 3 of %val1), but the shufflevector below uses <2, 3, 3, 3>, which is
; identical to shuffle_v4f16_2333 and reads only %val0. This looks like a
; copy-paste slip, so the test currently adds no coverage beyond _2333.
; Correcting the mask requires regenerating the assertions with
; utils/update_llc_test_checks.py; the recorded output matches the IR as
; written.
1462; GFX9-LABEL: shuffle_v4f16_6667:
1463; GFX9:       ; %bb.0:
1464; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1465; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
1466; GFX9-NEXT:    s_waitcnt vmcnt(0)
1467; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1468; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1469; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1470; GFX9-NEXT:    s_setpc_b64 s[30:31]
1471;
1472; GFX10-LABEL: shuffle_v4f16_6667:
1473; GFX10:       ; %bb.0:
1474; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1475; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1476; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
1477; GFX10-NEXT:    s_waitcnt vmcnt(0)
1478; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1479; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1480; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1481; GFX10-NEXT:    s_setpc_b64 s[30:31]
1482;
1483; GFX11-LABEL: shuffle_v4f16_6667:
1484; GFX11:       ; %bb.0:
1485; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1486; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1487; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
1488; GFX11-NEXT:    s_waitcnt vmcnt(0)
1489; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1490; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1491; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1492; GFX11-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1493; GFX11-NEXT:    s_setpc_b64 s[30:31]
1494  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1495  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1496  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
1497  ret <4 x half> %shuffle
1498}
1499
1500define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; Narrowing shuffle: <8 x half> inputs, <4 x half> result. Mask <0, 1, 0, 1>
; duplicates the first two elements of %val0. The recorded output is one
; dword load plus a register copy.
1501; GFX9-LABEL: shuffle_v8f16_0101:
1502; GFX9:       ; %bb.0:
1503; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1504; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1505; GFX9-NEXT:    s_waitcnt vmcnt(0)
1506; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1507; GFX9-NEXT:    s_setpc_b64 s[30:31]
1508;
1509; GFX10-LABEL: shuffle_v8f16_0101:
1510; GFX10:       ; %bb.0:
1511; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1512; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1513; GFX10-NEXT:    global_load_dword v0, v[0:1], off
1514; GFX10-NEXT:    s_waitcnt vmcnt(0)
1515; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1516; GFX10-NEXT:    s_setpc_b64 s[30:31]
1517;
1518; GFX11-LABEL: shuffle_v8f16_0101:
1519; GFX11:       ; %bb.0:
1520; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1521; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1522; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
1523; GFX11-NEXT:    s_waitcnt vmcnt(0)
1524; GFX11-NEXT:    v_mov_b32_e32 v1, v0
1525; GFX11-NEXT:    s_setpc_b64 s[30:31]
1526  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1527  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1528  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1529  ret <4 x half> %shuffle
1530}
1531
1532define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; Mask <0, 1, 2, 3> extracts the low four elements of %val0 unchanged. The
; recorded output is a single 64-bit load from %arg0 with no ALU work.
1533; GFX9-LABEL: shuffle_v8f16_0123:
1534; GFX9:       ; %bb.0:
1535; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1536; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1537; GFX9-NEXT:    s_waitcnt vmcnt(0)
1538; GFX9-NEXT:    s_setpc_b64 s[30:31]
1539;
1540; GFX10-LABEL: shuffle_v8f16_0123:
1541; GFX10:       ; %bb.0:
1542; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1543; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1544; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1545; GFX10-NEXT:    s_waitcnt vmcnt(0)
1546; GFX10-NEXT:    s_setpc_b64 s[30:31]
1547;
1548; GFX11-LABEL: shuffle_v8f16_0123:
1549; GFX11:       ; %bb.0:
1550; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1551; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1552; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1553; GFX11-NEXT:    s_waitcnt vmcnt(0)
1554; GFX11-NEXT:    s_setpc_b64 s[30:31]
1555  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1556  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1557  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1558  ret <4 x half> %shuffle
1559}
1560
1561define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; Mask <4, 5, 8, 9> selects %val0[4], %val0[5], %val1[0], %val1[1]: one
; dword-aligned pair from each input. The recorded output is two dword loads
; (offset 8 from %arg0, offset 0 from %arg1); on GFX11 they land directly in
; the return registers, so no copies appear there.
1562; GFX9-LABEL: shuffle_v8f16_4589:
1563; GFX9:       ; %bb.0:
1564; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1565; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:8
1566; GFX9-NEXT:    global_load_dword v5, v[2:3], off
1567; GFX9-NEXT:    s_waitcnt vmcnt(1)
1568; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1569; GFX9-NEXT:    s_waitcnt vmcnt(0)
1570; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1571; GFX9-NEXT:    s_setpc_b64 s[30:31]
1572;
1573; GFX10-LABEL: shuffle_v8f16_4589:
1574; GFX10:       ; %bb.0:
1575; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1576; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1577; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:8
1578; GFX10-NEXT:    global_load_dword v5, v[2:3], off
1579; GFX10-NEXT:    s_waitcnt vmcnt(1)
1580; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1581; GFX10-NEXT:    s_waitcnt vmcnt(0)
1582; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1583; GFX10-NEXT:    s_setpc_b64 s[30:31]
1584;
1585; GFX11-LABEL: shuffle_v8f16_4589:
1586; GFX11:       ; %bb.0:
1587; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1588; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1589; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:8
1590; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
1591; GFX11-NEXT:    s_waitcnt vmcnt(0)
1592; GFX11-NEXT:    s_setpc_b64 s[30:31]
1593  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1594  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1595  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
1596  ret <4 x half> %shuffle
1597}
1598
1599define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; Mask <10, 11, 2, 3> selects %val1[2], %val1[3], %val0[2], %val0[3]: the
; second dword of each input, with %val1's pair placed first. The recorded
; output is two dword loads at offset 4 plus register copies.
1600; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
1601; GFX9:       ; %bb.0:
1602; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1603; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
1604; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
1605; GFX9-NEXT:    s_waitcnt vmcnt(1)
1606; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1607; GFX9-NEXT:    s_waitcnt vmcnt(0)
1608; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1609; GFX9-NEXT:    s_setpc_b64 s[30:31]
1610;
1611; GFX10-LABEL: shuffle_v8f16_10_11_2_3:
1612; GFX10:       ; %bb.0:
1613; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1614; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1615; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
1616; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
1617; GFX10-NEXT:    s_waitcnt vmcnt(1)
1618; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1619; GFX10-NEXT:    s_waitcnt vmcnt(0)
1620; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1621; GFX10-NEXT:    s_setpc_b64 s[30:31]
1622;
1623; GFX11-LABEL: shuffle_v8f16_10_11_2_3:
1624; GFX11:       ; %bb.0:
1625; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1626; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1627; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
1628; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
1629; GFX11-NEXT:    s_waitcnt vmcnt(1)
1630; GFX11-NEXT:    v_mov_b32_e32 v0, v2
1631; GFX11-NEXT:    s_waitcnt vmcnt(0)
1632; GFX11-NEXT:    s_setpc_b64 s[30:31]
1633  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1634  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1635  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
1636  ret <4 x half> %shuffle
1637}
1638
1639define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; Mask <13, 14, 2, 3> selects %val1[5], %val1[6], %val0[2], %val0[3]. The
; first pair is not dword-aligned: element 5 is the high half of one dword
; and element 6 the low half of the next. The recorded output therefore
; loads 64 bits at offset 8 from %arg1 and repacks them with a
; mask/shift-or sequence, while the %val0 pair is a plain dword load at
; offset 4.
1640; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
1641; GFX9:       ; %bb.0:
1642; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1643; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
1644; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
1645; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
1646; GFX9-NEXT:    s_waitcnt vmcnt(1)
1647; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1648; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
1649; GFX9-NEXT:    s_waitcnt vmcnt(0)
1650; GFX9-NEXT:    v_mov_b32_e32 v1, v4
1651; GFX9-NEXT:    s_setpc_b64 s[30:31]
1652;
1653; GFX10-LABEL: shuffle_v8f16_13_14_2_3:
1654; GFX10:       ; %bb.0:
1655; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1656; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1657; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
1658; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
1659; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
1660; GFX10-NEXT:    s_waitcnt vmcnt(1)
1661; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1662; GFX10-NEXT:    s_waitcnt vmcnt(0)
1663; GFX10-NEXT:    v_mov_b32_e32 v1, v4
1664; GFX10-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
1665; GFX10-NEXT:    s_setpc_b64 s[30:31]
1666;
1667; GFX11-LABEL: shuffle_v8f16_13_14_2_3:
1668; GFX11:       ; %bb.0:
1669; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1670; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1671; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:8
1672; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
1673; GFX11-NEXT:    s_waitcnt vmcnt(1)
1674; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
1675; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1676; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1677; GFX11-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
1678; GFX11-NEXT:    s_waitcnt vmcnt(0)
1679; GFX11-NEXT:    s_setpc_b64 s[30:31]
1680  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1681  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1682  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
1683  ret <4 x half> %shuffle
1684}
1685
1686define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
; Widening shuffle: <3 x half> inputs, <4 x half> result. Mask <0, 1, 2, 2>
; keeps %val0's three elements and repeats element 2 in the last lane; %val1
; is not referenced. The recorded output loads 64 bits from %arg0 and
; rebuilds only the second result dword.
1687; GFX9-LABEL: shuffle_v3f16_0122:
1688; GFX9:       ; %bb.0:
1689; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1690; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1691; GFX9-NEXT:    s_waitcnt vmcnt(0)
1692; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1693; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1694; GFX9-NEXT:    s_setpc_b64 s[30:31]
1695;
1696; GFX10-LABEL: shuffle_v3f16_0122:
1697; GFX10:       ; %bb.0:
1698; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1699; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1700; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1701; GFX10-NEXT:    s_waitcnt vmcnt(0)
1702; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1703; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1704; GFX10-NEXT:    s_setpc_b64 s[30:31]
1705;
1706; GFX11-LABEL: shuffle_v3f16_0122:
1707; GFX11:       ; %bb.0:
1708; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1709; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1710; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1711; GFX11-NEXT:    s_waitcnt vmcnt(0)
1712; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1713; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1714; GFX11-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1715; GFX11-NEXT:    s_setpc_b64 s[30:31]
1716  %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
1717  %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
1718  %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1719  ret <4 x half> %shuffle
1720}
1721
1722define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
; Widening shuffle: <2 x half> inputs, <4 x half> result. The mask in the IR
; is <0, 1, 1, 0>, i.e. %val0 followed by %val0 with its halves swapped;
; %val1 is not referenced.
; NOTE(review): the function name implies mask <0, 1, 2, 2>, which for
; 2-element inputs would take lanes 2 and 3 from %val1[0]. The name and the
; mask disagree. Either rename the test or change the mask and regenerate
; the assertions with utils/update_llc_test_checks.py; the recorded output
; matches the IR as written.
1723; GFX9-LABEL: shuffle_v2f16_0122:
1724; GFX9:       ; %bb.0:
1725; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1726; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1727; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
1728; GFX9-NEXT:    s_waitcnt vmcnt(0)
1729; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1730; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
1731; GFX9-NEXT:    s_setpc_b64 s[30:31]
1732;
1733; GFX10-LABEL: shuffle_v2f16_0122:
1734; GFX10:       ; %bb.0:
1735; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1736; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1737; GFX10-NEXT:    global_load_dword v0, v[0:1], off
1738; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffff
1739; GFX10-NEXT:    s_waitcnt vmcnt(0)
1740; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1741; GFX10-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
1742; GFX10-NEXT:    s_setpc_b64 s[30:31]
1743;
1744; GFX11-LABEL: shuffle_v2f16_0122:
1745; GFX11:       ; %bb.0:
1746; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1747; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1748; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
1749; GFX11-NEXT:    s_waitcnt vmcnt(0)
1750; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1751; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1752; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1753; GFX11-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
1754; GFX11-NEXT:    s_setpc_b64 s[30:31]
1755  %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
1756  %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
1757  %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
1758  ret <4 x half> %shuffle
1759}
1760
1761define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
; <6 x half> shuffle with mask <4, 5, 2, 3, 6, 7>: %val0[4], %val0[5],
; %val0[2], %val0[3], %val1[0], %val1[1]. Every pair is dword-aligned, so
; the GFX9/GFX10 output is loads plus register copies. The recorded GFX11
; output instead round-trips both loaded vectors through scratch memory
; (scratch_store_b96 / scratch_load_*).
1762; GFX9-LABEL: shuffle_v6f16_452367:
1763; GFX9:       ; %bb.0:
1764; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1765; GFX9-NEXT:    v_mov_b32_e32 v6, v1
1766; GFX9-NEXT:    v_mov_b32_e32 v5, v0
1767; GFX9-NEXT:    v_mov_b32_e32 v4, v3
1768; GFX9-NEXT:    v_mov_b32_e32 v3, v2
1769; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
1770; GFX9-NEXT:    global_load_dword v7, v[3:4], off
1771; GFX9-NEXT:    s_waitcnt vmcnt(1)
1772; GFX9-NEXT:    v_mov_b32_e32 v0, v2
1773; GFX9-NEXT:    s_waitcnt vmcnt(0)
1774; GFX9-NEXT:    v_mov_b32_e32 v2, v7
1775; GFX9-NEXT:    s_setpc_b64 s[30:31]
1776;
1777; GFX10-LABEL: shuffle_v6f16_452367:
1778; GFX10:       ; %bb.0:
1779; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1780; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1781; GFX10-NEXT:    v_mov_b32_e32 v6, v1
1782; GFX10-NEXT:    v_mov_b32_e32 v5, v0
1783; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1784; GFX10-NEXT:    v_mov_b32_e32 v3, v2
1785; GFX10-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
1786; GFX10-NEXT:    global_load_dword v7, v[3:4], off
1787; GFX10-NEXT:    s_waitcnt vmcnt(1)
1788; GFX10-NEXT:    v_mov_b32_e32 v0, v2
1789; GFX10-NEXT:    s_waitcnt vmcnt(0)
1790; GFX10-NEXT:    v_mov_b32_e32 v2, v7
1791; GFX10-NEXT:    s_setpc_b64 s[30:31]
1792;
1793; GFX11-LABEL: shuffle_v6f16_452367:
1794; GFX11:       ; %bb.0:
1795; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1796; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1797; GFX11-NEXT:    global_load_b96 v[4:6], v[0:1], off
1798; GFX11-NEXT:    s_waitcnt vmcnt(0)
1799; GFX11-NEXT:    scratch_store_b96 off, v[4:6], s32
1800; GFX11-NEXT:    global_load_b96 v[4:6], v[2:3], off
1801; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s32
1802; GFX11-NEXT:    s_waitcnt vmcnt(1)
1803; GFX11-NEXT:    scratch_store_b96 off, v[4:6], s32 offset:16
1804; GFX11-NEXT:    s_waitcnt vmcnt(0)
1805; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:16
1806; GFX11-NEXT:    v_mov_b32_e32 v0, v2
1807; GFX11-NEXT:    s_waitcnt vmcnt(0)
1808; GFX11-NEXT:    v_mov_b32_e32 v2, v3
1809; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1810; GFX11-NEXT:    s_setpc_b64 s[30:31]
1811  %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
1812  %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
1813  %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
1814  ret <6 x half> %shuffle
1815}
1816
1817define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C)  {
; Kernel mixing shuffles with packed FMA. Each work-item loads one
; <4 x half> from %A, %B and %C at index workitem.id.x, then computes
;   lo = fma(splat(A[3]), B[2:3], fma(splat(A[2]), B[0:1], C[2:3]))   -> lanes 2,3
;   hi = fma(splat(A[1]), B[2:3], fma(splat(A[0]), B[0:1], C[0:1]))   -> lanes 0,1
; and stores the recombined <4 x half> back to %C. In the recorded output
; the splats and sub-vector extracts appear only as op_sel/op_sel_hi
; modifiers on four v_pk_fma_f16 instructions; no separate shuffle
; instructions are emitted.
1818; GFX9-LABEL: fma_shuffle:
1819; GFX9:       ; %bb.0: ; %entry
1820; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1821; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1822; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
1823; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1824; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
1825; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
1826; GFX9-NEXT:    global_load_dwordx2 v[4:5], v6, s[6:7]
1827; GFX9-NEXT:    s_waitcnt vmcnt(0)
1828; GFX9-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1829; GFX9-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1830; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1831; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1832; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
1833; GFX9-NEXT:    s_endpgm
1834;
1835; GFX10-LABEL: fma_shuffle:
1836; GFX10:       ; %bb.0: ; %entry
1837; GFX10-NEXT:    s_clause 0x1
1838; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1839; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1840; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
1841; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1842; GFX10-NEXT:    s_clause 0x2
1843; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
1844; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
1845; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[6:7]
1846; GFX10-NEXT:    s_waitcnt vmcnt(0)
1847; GFX10-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1848; GFX10-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1849; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1850; GFX10-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1851; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
1852; GFX10-NEXT:    s_endpgm
1853;
1854; GFX11-LABEL: fma_shuffle:
1855; GFX11:       ; %bb.0: ; %entry
1856; GFX11-NEXT:    s_clause 0x1
1857; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
1858; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x10
1859; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
1860; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1861; GFX11-NEXT:    s_clause 0x2
1862; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[4:5]
1863; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[6:7]
1864; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[0:1]
1865; GFX11-NEXT:    s_waitcnt vmcnt(0)
1866; GFX11-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1867; GFX11-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1868; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1869; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1870; GFX11-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1871; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
1872; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1873; GFX11-NEXT:    s_endpgm
1874entry:
  ; Per-work-item element addresses and loads.
1875  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
1876  %tmp12 = zext i32 %tmp1 to i64
1877  %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12
1878  %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8
1879  %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12
1880  %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8
1881  %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12
1882  %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8
  ; Lanes 0,1: C[0:1] + splat(A[0]) * B[0:1], then + splat(A[1]) * B[2:3].
1883  %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
1884  %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
1885  %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
1886  %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19)
1887  %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1>
1888  %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3>
1889  %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20)
  ; Widen the first result and splice it with the untouched C[2:3].
1890  %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1891  %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ; Lanes 2,3: C[2:3] + splat(A[2]) * B[0:1], then + splat(A[3]) * B[2:3].
1892  %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2>
1893  %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3>
1894  %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27)
1895  %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
1896  %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
  ; Recombine both 2-lane results and write them back to C.
1897  %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1898  %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1899  store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8
1900  ret void
1901}
1902
1903define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; Mask <0, 4, 5, 6> selects %val0[0], %val1[0], %val1[1], %val1[2]. Neither
; result dword corresponds to an aligned source dword, so the recorded
; output repacks 16-bit halves with mask and shift-or instructions after
; loading 64 bits from each pointer.
1904; GFX9-LABEL: shuffle_v4f16_0456:
1905; GFX9:       ; %bb.0:
1906; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1907; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
1908; GFX9-NEXT:    s_waitcnt vmcnt(0)
1909; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
1910; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
1911; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
1912; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
1913; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v4
1914; GFX9-NEXT:    s_waitcnt vmcnt(0)
1915; GFX9-NEXT:    v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1916; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
1917; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v2
1918; GFX9-NEXT:    s_setpc_b64 s[30:31]
1919;
1920; GFX10-LABEL: shuffle_v4f16_0456:
1921; GFX10:       ; %bb.0:
1922; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1923; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1924; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
1925; GFX10-NEXT:    s_waitcnt vmcnt(0)
1926; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
1927; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
1928; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
1929; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
1930; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v4
1931; GFX10-NEXT:    s_waitcnt vmcnt(0)
1932; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1933; GFX10-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
1934; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v2
1935; GFX10-NEXT:    s_setpc_b64 s[30:31]
1936;
1937; GFX11-LABEL: shuffle_v4f16_0456:
1938; GFX11:       ; %bb.0:
1939; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1940; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1941; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
1942; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1943; GFX11-NEXT:    s_waitcnt vmcnt(0)
1944; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1945; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1946; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1947; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1948; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
1949; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1950; GFX11-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
1951; GFX11-NEXT:    s_setpc_b64 s[30:31]
1952  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1953  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1954  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
1955  ret <4 x half> %shuffle
1956}
1957
1958define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out)  {
; Kernel that loads <8 x i32> from an addrspace(4) pointer, keeps only
; elements 0-3 via an identity-prefix shuffle, and stores the resulting
; <4 x i32> to %out. The recorded output shows the data load narrowed to a
; single 128-bit scalar load (s_load_dwordx4 / s_load_b128) from the input
; pointer; the upper four elements are never loaded.
1959; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
1960; GFX9:       ; %bb.0:
1961; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1962; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1963; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1964; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
1965; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1966; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1967; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1968; GFX9-NEXT:    v_mov_b32_e32 v2, s6
1969; GFX9-NEXT:    v_mov_b32_e32 v3, s7
1970; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
1971; GFX9-NEXT:    s_endpgm
1972;
1973; GFX10-LABEL: shuffle_scalar_load_v8i32_0123:
1974; GFX10:       ; %bb.0:
1975; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1976; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1977; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1978; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
1979; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1980; GFX10-NEXT:    v_mov_b32_e32 v0, s4
1981; GFX10-NEXT:    v_mov_b32_e32 v1, s5
1982; GFX10-NEXT:    v_mov_b32_e32 v2, s6
1983; GFX10-NEXT:    v_mov_b32_e32 v3, s7
1984; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
1985; GFX10-NEXT:    s_endpgm
1986;
1987; GFX11-LABEL: shuffle_scalar_load_v8i32_0123:
1988; GFX11:       ; %bb.0:
1989; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
1990; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1991; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
1992; GFX11-NEXT:    v_mov_b32_e32 v4, 0
1993; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1994; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
1995; GFX11-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
1996; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[2:3]
1997; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1998; GFX11-NEXT:    s_endpgm
1999  %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16
2000  %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2001  store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8
2002  ret void
2003}
2004
2005declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
2006declare i32 @llvm.amdgcn.workitem.id.x() #0
2007
2008attributes #0 = { nounwind readnone speculatable }
2009