; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s

; Mask <2,3,u,u>: result lanes 0-1 are %val0 lanes 2-3; lanes 2-3 are undef.
; The checks expect one dword load at offset 4 and no extra ALU instructions.
define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_23uu:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_23uu:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ret <4 x half> %shuffle
}

; Mask <2,3,4,u>: lanes 0-1 are %val0 lanes 2-3, lane 2 is %val1 lane 0,
; lane 3 is undef. The checks expect a dword load, a dwordx2 load and two
; register moves.
define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_234u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_234u:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v6
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
  ret <4 x half> %shuffle
}

; Mask <u,1,u,3>: the two defined lanes keep their %val0 positions.
; The checks expect only a dwordx2 load, with no moves or shifts.
define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_u1u3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_u1u3:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
  ret <4 x half> %shuffle
}

; Mask <u,3,u,1>: lane 1 is %val0 lane 3 and lane 3 is %val0 lane 1.
; The checks expect a dwordx2 load into v[1:2] followed by one register move.
define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_u3u1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_u3u1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
  ret <4 x half> %shuffle
}

; Mask <u,3,u,u>: only lane 1 is defined, taken from %val0 lane 3.
; The checks expect a single dword load at offset 4.
define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_u3uu:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_u3uu:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  ret <4 x half> %shuffle
}

; Mask <3,u,6,u>: lane 0 is %val0 lane 3 and lane 2 is %val1 lane 2.
; The checks expect two dword loads at offset 4, a 16-bit right shift that
; produces lane 0, and a register move for the second result dword.
define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_3u6u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_3u6u:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
  ret <4 x half> %shuffle
}

; Mask <3,u,u,7>: lane 0 is %val0 lane 3 and lane 3 is %val1 lane 3.
; The checks expect two dword loads at offset 4, a 16-bit right shift that
; produces lane 0, and a register move for the second result dword.
define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_3uu7:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_3uu7:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
  ret <4 x half> %shuffle
}

; Mask <3,5,u,5>: lane 0 is %val0 lane 3; lanes 1 and 3 are both %val1 lane 1.
; The checks expect two dword loads, then a shift, an SDWA and, and
; v_lshl_or_b32 to pack the first result dword; the second is a register move.
define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_35u5:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_35u5:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
  ret <4 x half> %shuffle
}

; Mask <3,5,7,u>: lane 0 is %val0 lane 3, lane 1 is %val1 lane 1, lane 2 is
; %val1 lane 3, lane 3 is undef. The checks expect a dwordx2 load and a dword
; load, then shifts, an SDWA and, and v_lshl_or_b32 to pack the result. The
; two targets schedule the final shift and the pack in a different order.
define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_357u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_357u:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
  ret <4 x half> %shuffle
}

; Mask <0,1,0,1>: both result dwords are %val0 lanes 0-1.
; The checks expect one dword load and one register move that duplicates it.
define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0101:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_0101:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x half> %shuffle
}

; Mask <0,1,2,3>: identity shuffle of %val0.
; The checks expect only a dwordx2 load straight into the return registers.
define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0123:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_0123:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Mask <0,1,4,5>: lanes 0-1 of %val0 followed by lanes 0-1 of %val1.
; The checks expect two dword loads without offset and two register moves.
define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0145:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v5, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_0145:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v5, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x half> %shuffle
}

; Mask <0,1,6,7>: lanes 0-1 of %val0 followed by lanes 2-3 of %val1.
; The checks expect a dword load without offset, a dword load at offset 4,
; and two register moves.
define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0167:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_0167:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x half> %shuffle
}

; Mask <2,3,0,1>: swaps the two dword halves of %val0.
; The checks expect a dwordx2 load into v[1:2] followed by one register move.
define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2301:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_2301:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  ret <4 x half> %shuffle
}

; Mask <2,3,2,3>: both result dwords are %val0 lanes 2-3.
; The checks expect one dword load at offset 4 and one register move that
; duplicates it.
define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2323:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_2323:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Mask <2,3,4,5>: lanes 2-3 of %val0 followed by lanes 0-1 of %val1.
; The checks expect a dword load at offset 4, a dword load without offset,
; and two register moves.
define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2345:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v5, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_2345:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT:    global_load_dword v5, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x half> %shuffle
}

; Mask <2,3,6,7>: lanes 2-3 of %val0 followed by lanes 2-3 of %val1.
; The checks expect two dword loads at offset 4 and two register moves.
define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2367:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_2367:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x half> %shuffle
}

; Mask <4,5,0,1>: lanes 0-1 of %val1 followed by lanes 0-1 of %val0.
; The checks expect two dword loads without offset and two register moves.
define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4501:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    global_load_dword v5, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_4501:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    global_load_dword v5, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x half> %shuffle
}

; Mask <4,5,2,3>: lanes 0-1 of %val1 followed by lanes 2-3 of %val0.
; The checks expect a dword load without offset, a dword load at offset 4,
; and two register moves.
define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4523:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_4523:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Mask <4,5,4,5>: both result dwords are %val1 lanes 0-1.
; The checks expect one dword load through v[2:3] and one register move that
; duplicates it.
define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4545:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_4545:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
  ret <4 x half> %shuffle
}

; Mask <4,5,6,7>: identity shuffle of %val1.
; The checks expect only a dwordx2 load through v[2:3] into the return
; registers.
define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4567:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_4567:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x half> %shuffle
}

; Mask <6,7,0,1>: lanes 2-3 of %val1 followed by lanes 0-1 of %val0.
; The checks expect a dword load at offset 4, a dword load without offset,
; and two register moves.
define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6701:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v5, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6701:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT:    global_load_dword v5, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x half> %shuffle
}

; Mask <6,7,2,3>: lanes 2-3 of %val1 followed by lanes 2-3 of %val0.
; The checks expect two dword loads at offset 4 and two register moves.
define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6723:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6723:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Mask <6,7,4,5>: swaps the two dword halves of %val1.
; The checks expect a dwordx2 load through v[2:3] into v[1:2] followed by one
; register move.
define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6745:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6745:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
  ret <4 x half> %shuffle
}

; Mask <6,7,6,7>: both result dwords are %val1 lanes 2-3.
; The checks expect one dword load at offset 4 through v[2:3] and one
; register move that duplicates it.
define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6767:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6767:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
  ret <4 x half> %shuffle
}

; Mask <2,3,5,6>: lanes 0-1 are %val0 lanes 2-3; lanes 2-3 are %val1 lanes 1-2,
; which straddle the two dwords of %val1. The checks expect a dwordx2 load and
; a dword load, an SDWA and plus v_lshl_or_b32 to build the second result
; dword, and a register move for the first.
define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2356:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_2356:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
  ret <4 x half> %shuffle
}

; Mask <5,6,2,3>: lanes 0-1 are %val1 lanes 1-2, which straddle the two dwords
; of %val1; lanes 2-3 are %val0 lanes 2-3. The checks expect a dwordx2 load and
; a dword load, an SDWA and plus v_lshl_or_b32 to build the first result
; dword, and a register move for the second. The two targets order the move
; and the pack differently.
define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_5623:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_5623:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Mask <3,4,5,6>: neither result dword is an aligned source dword. The low
; dword is %val0[3] followed by %val1[0]; the high dword is %val1[1..2]. Both
; halves are expected to be rebuilt with a 16-bit mask plus shift-or.
728define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
729; GFX9-LABEL: shuffle_v4f16_3456:
730; GFX9:       ; %bb.0:
731; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
732; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
733; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
734; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
735; GFX9-NEXT:    s_waitcnt vmcnt(1)
736; GFX9-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
737; GFX9-NEXT:    s_waitcnt vmcnt(0)
738; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
739; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v1
740; GFX9-NEXT:    v_lshl_or_b32 v1, v5, 16, v2
741; GFX9-NEXT:    s_setpc_b64 s[30:31]
742;
743; GFX10-LABEL: shuffle_v4f16_3456:
744; GFX10:       ; %bb.0:
745; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
746; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
747; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
748; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
749; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
750; GFX10-NEXT:    s_waitcnt vmcnt(1)
751; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
752; GFX10-NEXT:    s_waitcnt vmcnt(0)
753; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
754; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v1
755; GFX10-NEXT:    v_lshl_or_b32 v1, v5, 16, v2
756; GFX10-NEXT:    s_setpc_b64 s[30:31]
757  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
758  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
759  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
760  ret <4 x half> %shuffle
761}
762
; Mask <5,6,3,4>: the dword-swapped counterpart of shuffle_v4f16_3456. The low
; dword is %val1[1..2] and the high dword is %val0[3] followed by %val1[0];
; both still need a mask plus shift-or repack.
763define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
764; GFX9-LABEL: shuffle_v4f16_5634:
765; GFX9:       ; %bb.0:
766; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
767; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
768; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
769; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
770; GFX9-NEXT:    s_waitcnt vmcnt(0)
771; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
772; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
773; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
774; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
775; GFX9-NEXT:    s_setpc_b64 s[30:31]
776;
777; GFX10-LABEL: shuffle_v4f16_5634:
778; GFX10:       ; %bb.0:
779; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
780; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
781; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
782; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
783; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
784; GFX10-NEXT:    s_waitcnt vmcnt(1)
785; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
786; GFX10-NEXT:    s_waitcnt vmcnt(0)
787; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
788; GFX10-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
789; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v2
790; GFX10-NEXT:    s_setpc_b64 s[30:31]
791  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
792  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
793  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
794  ret <4 x half> %shuffle
795}
796
; Mask <5,7,3,4>: the low result dword pairs the two odd elements of %val1
; (elements 1 and 3), which is why the checks contain an extra 16-bit right
; shift; the high result dword is %val0[3] followed by %val1[0].
797define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
798; GFX9-LABEL: shuffle_v4f16_5734:
799; GFX9:       ; %bb.0:
800; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
801; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
802; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
803; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
804; GFX9-NEXT:    s_waitcnt vmcnt(1)
805; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
806; GFX9-NEXT:    s_waitcnt vmcnt(0)
807; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
808; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
809; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
810; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
811; GFX9-NEXT:    s_setpc_b64 s[30:31]
812;
813; GFX10-LABEL: shuffle_v4f16_5734:
814; GFX10:       ; %bb.0:
815; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
816; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
817; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
818; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
819; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
820; GFX10-NEXT:    s_waitcnt vmcnt(1)
821; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
822; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
823; GFX10-NEXT:    s_waitcnt vmcnt(0)
824; GFX10-NEXT:    v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
825; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v2
826; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
827; GFX10-NEXT:    s_setpc_b64 s[30:31]
828  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
829  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
830  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
831  ret <4 x half> %shuffle
832}
833
; Integer (<4 x i16>) variant of the <2,3,5,6> mask. The expected instruction
; sequence is the same as for the <4 x half> version, shuffle_v4f16_2356.
834define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
835; GFX9-LABEL: shuffle_v4i16_2356:
836; GFX9:       ; %bb.0:
837; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
838; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
839; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
840; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
841; GFX9-NEXT:    s_waitcnt vmcnt(1)
842; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
843; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
844; GFX9-NEXT:    s_waitcnt vmcnt(0)
845; GFX9-NEXT:    v_mov_b32_e32 v0, v4
846; GFX9-NEXT:    s_setpc_b64 s[30:31]
847;
848; GFX10-LABEL: shuffle_v4i16_2356:
849; GFX10:       ; %bb.0:
850; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
851; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
852; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
853; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
854; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
855; GFX10-NEXT:    s_waitcnt vmcnt(1)
856; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
857; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
858; GFX10-NEXT:    s_waitcnt vmcnt(0)
859; GFX10-NEXT:    v_mov_b32_e32 v0, v4
860; GFX10-NEXT:    s_setpc_b64 s[30:31]
861  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
862  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
863  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
864  ret <4 x i16> %shuffle
865}
866
; Mask <0,1,6,7>: both result dwords are aligned source dwords (%val0[0..1]
; and %val1[2..3]), so the checks expect two dword loads followed by register
; copies and no repacking arithmetic.
867define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
868; GFX9-LABEL: shuffle_v4i16_0167:
869; GFX9:       ; %bb.0:
870; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
871; GFX9-NEXT:    global_load_dword v4, v[0:1], off
872; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
873; GFX9-NEXT:    s_waitcnt vmcnt(1)
874; GFX9-NEXT:    v_mov_b32_e32 v0, v4
875; GFX9-NEXT:    s_waitcnt vmcnt(0)
876; GFX9-NEXT:    v_mov_b32_e32 v1, v5
877; GFX9-NEXT:    s_setpc_b64 s[30:31]
878;
879; GFX10-LABEL: shuffle_v4i16_0167:
880; GFX10:       ; %bb.0:
881; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
882; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
883; GFX10-NEXT:    global_load_dword v4, v[0:1], off
884; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
885; GFX10-NEXT:    s_waitcnt vmcnt(1)
886; GFX10-NEXT:    v_mov_b32_e32 v0, v4
887; GFX10-NEXT:    s_waitcnt vmcnt(0)
888; GFX10-NEXT:    v_mov_b32_e32 v1, v5
889; GFX10-NEXT:    s_setpc_b64 s[30:31]
890  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
891  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
892  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
893  ret <4 x i16> %shuffle
894}
895
; Mask zeroinitializer, i.e. <0,0,0,0>: splat of %val0[0] into all four lanes.
; Only one load is expected; the second result register is a copy of the
; first after the low half has been duplicated into the high half.
896define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
897; GFX9-LABEL: shuffle_v4f16_0000:
898; GFX9:       ; %bb.0:
899; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
900; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
901; GFX9-NEXT:    s_waitcnt vmcnt(0)
902; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
903; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
904; GFX9-NEXT:    v_mov_b32_e32 v1, v0
905; GFX9-NEXT:    s_setpc_b64 s[30:31]
906;
907; GFX10-LABEL: shuffle_v4f16_0000:
908; GFX10:       ; %bb.0:
909; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
910; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
911; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
912; GFX10-NEXT:    s_waitcnt vmcnt(0)
913; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v0
914; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
915; GFX10-NEXT:    v_mov_b32_e32 v1, v0
916; GFX10-NEXT:    s_setpc_b64 s[30:31]
917  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
918  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
919  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
920  ret <4 x half> %shuffle
921}
922
; Mask <1,0,1,0>: the first dword of %val0 with its two halves swapped,
; duplicated into both result dwords. Only one load is expected.
923define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
924; GFX9-LABEL: shuffle_v4f16_1010:
925; GFX9:       ; %bb.0:
926; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
927; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
928; GFX9-NEXT:    s_waitcnt vmcnt(0)
929; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
930; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
931; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
932; GFX9-NEXT:    v_mov_b32_e32 v1, v0
933; GFX9-NEXT:    s_setpc_b64 s[30:31]
934;
935; GFX10-LABEL: shuffle_v4f16_1010:
936; GFX10:       ; %bb.0:
937; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
938; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
939; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
940; GFX10-NEXT:    s_waitcnt vmcnt(0)
941; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffff
942; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
943; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
944; GFX10-NEXT:    v_mov_b32_e32 v1, v0
945; GFX10-NEXT:    s_setpc_b64 s[30:31]
946  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
947  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
948  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
949  ret <4 x half> %shuffle
950}
951
; Mask <1,1,0,0>: the low result dword is a splat of %val0[1] and the high
; result dword a splat of %val0[0], so each half is built separately from the
; same loaded dword.
952define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
953; GFX9-LABEL: shuffle_v4f16_1100:
954; GFX9:       ; %bb.0:
955; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
956; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
957; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
958; GFX9-NEXT:    s_waitcnt vmcnt(0)
959; GFX9-NEXT:    v_and_b32_e32 v1, v2, v0
960; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
961; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
962; GFX9-NEXT:    v_and_b32_e32 v0, v2, v3
963; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
964; GFX9-NEXT:    s_setpc_b64 s[30:31]
965;
966; GFX10-LABEL: shuffle_v4f16_1100:
967; GFX10:       ; %bb.0:
968; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
969; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
970; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
971; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
972; GFX10-NEXT:    s_waitcnt vmcnt(0)
973; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
974; GFX10-NEXT:    v_and_b32_e32 v4, v0, v1
975; GFX10-NEXT:    v_and_b32_e32 v3, v0, v2
976; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
977; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v3
978; GFX10-NEXT:    s_setpc_b64 s[30:31]
979  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
980  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
981  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
982  ret <4 x half> %shuffle
983}
984
; Mask <6,1,6,1>: each result dword is %val1[2] in the low half and %val0[1]
; in the high half. The dword is built once and the second result register is
; a copy of the first.
985define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
986; GFX9-LABEL: shuffle_v4f16_6161:
987; GFX9:       ; %bb.0:
988; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
989; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
990; GFX9-NEXT:    global_load_dword v5, v[0:1], off
991; GFX9-NEXT:    s_waitcnt vmcnt(1)
992; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v4
993; GFX9-NEXT:    s_waitcnt vmcnt(0)
994; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
995; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
996; GFX9-NEXT:    v_mov_b32_e32 v1, v0
997; GFX9-NEXT:    s_setpc_b64 s[30:31]
998;
999; GFX10-LABEL: shuffle_v4f16_6161:
1000; GFX10:       ; %bb.0:
1001; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1002; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1003; GFX10-NEXT:    global_load_dword v4, v[0:1], off
1004; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
1005; GFX10-NEXT:    s_waitcnt vmcnt(1)
1006; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
1007; GFX10-NEXT:    s_waitcnt vmcnt(0)
1008; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v5
1009; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1010; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1011; GFX10-NEXT:    s_setpc_b64 s[30:31]
1012  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1013  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1014  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
1015  ret <4 x half> %shuffle
1016}
1017
; Mask <2,3,3,3>: the low result dword is the aligned pair %val0[2..3] and the
; high result dword is a splat of %val0[3]. Only a single dword load is
; expected.
1018define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1019; GFX9-LABEL: shuffle_v4f16_2333:
1020; GFX9:       ; %bb.0:
1021; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1022; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
1023; GFX9-NEXT:    s_waitcnt vmcnt(0)
1024; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1025; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1026; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1027; GFX9-NEXT:    s_setpc_b64 s[30:31]
1028;
1029; GFX10-LABEL: shuffle_v4f16_2333:
1030; GFX10:       ; %bb.0:
1031; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1032; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1033; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
1034; GFX10-NEXT:    s_waitcnt vmcnt(0)
1035; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1036; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1037; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1038; GFX10-NEXT:    s_setpc_b64 s[30:31]
1039  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1040  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1041  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
1042  ret <4 x half> %shuffle
1043}
1044
; FIXME: The function name implies mask <6,6,6,7> (elements taken from %val1),
; but the mask actually used below is <2,3,3,3>, identical to
; shuffle_v4f16_2333. As written, this test selects only from %val0 and its
; expected output matches that test instruction for instruction, so the
; %val1-only case is not exercised. If the mask is corrected, regenerate the
; assertions with utils/update_llc_test_checks.py instead of editing them by
; hand.
1045define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1046; GFX9-LABEL: shuffle_v4f16_6667:
1047; GFX9:       ; %bb.0:
1048; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1049; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
1050; GFX9-NEXT:    s_waitcnt vmcnt(0)
1051; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1052; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1053; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1054; GFX9-NEXT:    s_setpc_b64 s[30:31]
1055;
1056; GFX10-LABEL: shuffle_v4f16_6667:
1057; GFX10:       ; %bb.0:
1058; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1059; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1060; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
1061; GFX10-NEXT:    s_waitcnt vmcnt(0)
1062; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1063; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1064; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1065; GFX10-NEXT:    s_setpc_b64 s[30:31]
1066  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1067  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1068  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
1069  ret <4 x half> %shuffle
1070}
1071
; <8 x half> sources narrowed to a <4 x half> result. Mask <0,1,0,1> repeats
; the first dword of %val0, so one dword load plus a register copy is
; expected.
1072define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1073; GFX9-LABEL: shuffle_v8f16_0101:
1074; GFX9:       ; %bb.0:
1075; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1076; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1077; GFX9-NEXT:    s_waitcnt vmcnt(0)
1078; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1079; GFX9-NEXT:    s_setpc_b64 s[30:31]
1080;
1081; GFX10-LABEL: shuffle_v8f16_0101:
1082; GFX10:       ; %bb.0:
1083; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1084; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1085; GFX10-NEXT:    global_load_dword v0, v[0:1], off
1086; GFX10-NEXT:    s_waitcnt vmcnt(0)
1087; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1088; GFX10-NEXT:    s_setpc_b64 s[30:31]
1089  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1090  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1091  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1092  ret <4 x half> %shuffle
1093}
1094
; Mask <0,1,2,3> extracts the first four elements of the <8 x half> %val0. The
; checks expect a single two-dword load straight into the return registers
; with no ALU instructions.
1095define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1096; GFX9-LABEL: shuffle_v8f16_0123:
1097; GFX9:       ; %bb.0:
1098; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1099; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1100; GFX9-NEXT:    s_waitcnt vmcnt(0)
1101; GFX9-NEXT:    s_setpc_b64 s[30:31]
1102;
1103; GFX10-LABEL: shuffle_v8f16_0123:
1104; GFX10:       ; %bb.0:
1105; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1106; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1107; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1108; GFX10-NEXT:    s_waitcnt vmcnt(0)
1109; GFX10-NEXT:    s_setpc_b64 s[30:31]
1110  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1111  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1112  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1113  ret <4 x half> %shuffle
1114}
1115
; Mask <4,5,8,9>: %val0[4..5] followed by %val1[0..1]. Both pairs are aligned
; source dwords, so the checks expect two dword loads (one at byte offset 8)
; and register copies only.
1116define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1117; GFX9-LABEL: shuffle_v8f16_4589:
1118; GFX9:       ; %bb.0:
1119; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1120; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:8
1121; GFX9-NEXT:    global_load_dword v5, v[2:3], off
1122; GFX9-NEXT:    s_waitcnt vmcnt(1)
1123; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1124; GFX9-NEXT:    s_waitcnt vmcnt(0)
1125; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1126; GFX9-NEXT:    s_setpc_b64 s[30:31]
1127;
1128; GFX10-LABEL: shuffle_v8f16_4589:
1129; GFX10:       ; %bb.0:
1130; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1131; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1132; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:8
1133; GFX10-NEXT:    global_load_dword v5, v[2:3], off
1134; GFX10-NEXT:    s_waitcnt vmcnt(1)
1135; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1136; GFX10-NEXT:    s_waitcnt vmcnt(0)
1137; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1138; GFX10-NEXT:    s_setpc_b64 s[30:31]
1139  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1140  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1141  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
1142  ret <4 x half> %shuffle
1143}
1144
; Mask <10,11,2,3>: %val1[2..3] followed by %val0[2..3]. Both pairs are
; aligned source dwords at byte offset 4, so two dword loads and register
; copies are expected.
1145define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1146; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
1147; GFX9:       ; %bb.0:
1148; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1149; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
1150; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
1151; GFX9-NEXT:    s_waitcnt vmcnt(1)
1152; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1153; GFX9-NEXT:    s_waitcnt vmcnt(0)
1154; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1155; GFX9-NEXT:    s_setpc_b64 s[30:31]
1156;
1157; GFX10-LABEL: shuffle_v8f16_10_11_2_3:
1158; GFX10:       ; %bb.0:
1159; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1160; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1161; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
1162; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
1163; GFX10-NEXT:    s_waitcnt vmcnt(1)
1164; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1165; GFX10-NEXT:    s_waitcnt vmcnt(0)
1166; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1167; GFX10-NEXT:    s_setpc_b64 s[30:31]
1168  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1169  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1170  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
1171  ret <4 x half> %shuffle
1172}
1173
; Mask <13,14,2,3>: the low result dword is %val1[5..6], which spans two
; source dwords and is expected to be repacked with a mask plus shift-or; the
; high result dword is the aligned pair %val0[2..3].
1174define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1175; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
1176; GFX9:       ; %bb.0:
1177; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1178; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
1179; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
1180; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
1181; GFX9-NEXT:    s_waitcnt vmcnt(1)
1182; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1183; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
1184; GFX9-NEXT:    s_waitcnt vmcnt(0)
1185; GFX9-NEXT:    v_mov_b32_e32 v1, v4
1186; GFX9-NEXT:    s_setpc_b64 s[30:31]
1187;
1188; GFX10-LABEL: shuffle_v8f16_13_14_2_3:
1189; GFX10:       ; %bb.0:
1190; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1191; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1192; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
1193; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
1194; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
1195; GFX10-NEXT:    s_waitcnt vmcnt(1)
1196; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1197; GFX10-NEXT:    s_waitcnt vmcnt(0)
1198; GFX10-NEXT:    v_mov_b32_e32 v1, v4
1199; GFX10-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
1200; GFX10-NEXT:    s_setpc_b64 s[30:31]
1201  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1202  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1203  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
1204  ret <4 x half> %shuffle
1205}
1206
; <3 x half> sources widened to a <4 x half> result. Mask <0,1,2,2> keeps
; %val0[0..1] as the low dword and splats %val0[2] into both halves of the
; high dword.
1207define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
1208; GFX9-LABEL: shuffle_v3f16_0122:
1209; GFX9:       ; %bb.0:
1210; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1211; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1212; GFX9-NEXT:    s_waitcnt vmcnt(0)
1213; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1214; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1215; GFX9-NEXT:    s_setpc_b64 s[30:31]
1216;
1217; GFX10-LABEL: shuffle_v3f16_0122:
1218; GFX10:       ; %bb.0:
1219; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1220; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1221; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1222; GFX10-NEXT:    s_waitcnt vmcnt(0)
1223; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1224; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1225; GFX10-NEXT:    s_setpc_b64 s[30:31]
1226  %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
1227  %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
1228  %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1229  ret <4 x half> %shuffle
1230}
1231
; <2 x half> sources widened to a <4 x half> result.
; FIXME: The function name implies mask <0,1,2,2>, but the mask actually used
; below is <0,1,1,0>: the low dword is %val0 unchanged and the high dword is
; %val0 with its halves swapped, so %val1 is never selected. The assertions
; match the mask as written. If the mask or the name is changed, regenerate
; the assertions with utils/update_llc_test_checks.py.
1232define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
1233; GFX9-LABEL: shuffle_v2f16_0122:
1234; GFX9:       ; %bb.0:
1235; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1236; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1237; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
1238; GFX9-NEXT:    s_waitcnt vmcnt(0)
1239; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1240; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
1241; GFX9-NEXT:    s_setpc_b64 s[30:31]
1242;
1243; GFX10-LABEL: shuffle_v2f16_0122:
1244; GFX10:       ; %bb.0:
1245; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1246; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1247; GFX10-NEXT:    global_load_dword v0, v[0:1], off
1248; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffff
1249; GFX10-NEXT:    s_waitcnt vmcnt(0)
1250; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1251; GFX10-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
1252; GFX10-NEXT:    s_setpc_b64 s[30:31]
1253  %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
1254  %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
1255  %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
1256  ret <4 x half> %shuffle
1257}
1258
; Six-element case. Mask <4,5,2,3,6,7>: %val0[4..5], %val0[2..3], then
; %val1[0..1]. Every result dword is an aligned source dword, so the checks
; expect only loads and register copies.
1259define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
1260; GFX9-LABEL: shuffle_v6f16_452367:
1261; GFX9:       ; %bb.0:
1262; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1263; GFX9-NEXT:    v_mov_b32_e32 v6, v1
1264; GFX9-NEXT:    v_mov_b32_e32 v5, v0
1265; GFX9-NEXT:    v_mov_b32_e32 v4, v3
1266; GFX9-NEXT:    v_mov_b32_e32 v3, v2
1267; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
1268; GFX9-NEXT:    global_load_dword v7, v[3:4], off
1269; GFX9-NEXT:    s_waitcnt vmcnt(1)
1270; GFX9-NEXT:    v_mov_b32_e32 v0, v2
1271; GFX9-NEXT:    s_waitcnt vmcnt(0)
1272; GFX9-NEXT:    v_mov_b32_e32 v2, v7
1273; GFX9-NEXT:    s_setpc_b64 s[30:31]
1274;
1275; GFX10-LABEL: shuffle_v6f16_452367:
1276; GFX10:       ; %bb.0:
1277; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1278; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1279; GFX10-NEXT:    v_mov_b32_e32 v6, v1
1280; GFX10-NEXT:    v_mov_b32_e32 v5, v0
1281; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1282; GFX10-NEXT:    v_mov_b32_e32 v3, v2
1283; GFX10-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
1284; GFX10-NEXT:    global_load_dword v7, v[3:4], off
1285; GFX10-NEXT:    s_waitcnt vmcnt(1)
1286; GFX10-NEXT:    v_mov_b32_e32 v0, v2
1287; GFX10-NEXT:    s_waitcnt vmcnt(0)
1288; GFX10-NEXT:    v_mov_b32_e32 v2, v7
1289; GFX10-NEXT:    s_setpc_b64 s[30:31]
1290  %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
1291  %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
1292  %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
1293  ret <6 x half> %shuffle
1294}
1295
; Kernel combining shuffles with packed FMA. Each work item loads one
; <4 x half> row from %A, %B and %C, then chains four llvm.fma.v2f16 calls whose
; first operand is a splat of one element of the %A row (masks <0,0>, <1,1>,
; <2,2>, <3,3>) and whose second operand is the low or high pair of the %B row.
; The two <2 x half> results are stitched back into a <4 x half> and stored
; to %C. The checks expect exactly four v_pk_fma_f16 instructions, with the
; splats expressed through op_sel/op_sel_hi modifiers and no separate
; shuffle instructions.
1296define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C)  {
1297; GFX9-LABEL: fma_shuffle:
1298; GFX9:       ; %bb.0: ; %entry
1299; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1300; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1301; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
1302; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1303; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
1304; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
1305; GFX9-NEXT:    global_load_dwordx2 v[4:5], v6, s[6:7]
1306; GFX9-NEXT:    s_waitcnt vmcnt(0)
1307; GFX9-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1308; GFX9-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1309; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1310; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1311; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
1312; GFX9-NEXT:    s_endpgm
1313;
1314; GFX10-LABEL: fma_shuffle:
1315; GFX10:       ; %bb.0: ; %entry
1316; GFX10-NEXT:    s_clause 0x1
1317; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1318; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1319; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
1320; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1321; GFX10-NEXT:    s_clause 0x2
1322; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
1323; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
1324; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[6:7]
1325; GFX10-NEXT:    s_waitcnt vmcnt(0)
1326; GFX10-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1327; GFX10-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1328; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1329; GFX10-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1330; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
1331; GFX10-NEXT:    s_endpgm
1332entry:
1333  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
1334  %tmp12 = zext i32 %tmp1 to i64
1335  %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12
1336  %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8
1337  %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12
1338  %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8
1339  %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12
1340  %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8
1341  %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
1342  %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
1343  %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
1344  %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19)
1345  %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1>
1346  %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3>
1347  %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20)
1348  %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1349  %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
1350  %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2>
1351  %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3>
1352  %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27)
1353  %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
1354  %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
1355  %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1356  %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1357  store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8
1358  ret void
1359}
1360
; Two-input <4 x half> shuffle with mask <0, 4, 5, 6>: the result is element 0
; of the vector loaded from %arg0 followed by elements 0, 1 and 2 of the vector
; loaded from %arg1 (mask indices 4..7 address the second shuffle operand).
; The expected code for both run lines masks the low half of the first load
; with 0xffff and combines the 16-bit pieces with v_and_b32 / v_lshl_or_b32;
; the gfx1010 output differs only by the extra s_waitcnt_vscnt at entry.
1361define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1362; GFX9-LABEL: shuffle_v4f16_0456:
1363; GFX9:       ; %bb.0:
1364; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1365; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
1366; GFX9-NEXT:    s_waitcnt vmcnt(0)
1367; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
1368; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
1369; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
1370; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
1371; GFX9-NEXT:    v_and_b32_e32 v1, v0, v4
1372; GFX9-NEXT:    s_waitcnt vmcnt(0)
1373; GFX9-NEXT:    v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1374; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
1375; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v2
1376; GFX9-NEXT:    s_setpc_b64 s[30:31]
1377;
1378; GFX10-LABEL: shuffle_v4f16_0456:
1379; GFX10:       ; %bb.0:
1380; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1381; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1382; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
1383; GFX10-NEXT:    s_waitcnt vmcnt(0)
1384; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
1385; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
1386; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
1387; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
1388; GFX10-NEXT:    v_and_b32_e32 v1, v0, v4
1389; GFX10-NEXT:    s_waitcnt vmcnt(0)
1390; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1391; GFX10-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
1392; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v2
1393; GFX10-NEXT:    s_setpc_b64 s[30:31]
1394  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1395  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
; Lane 0 comes from %val0; lanes 1..3 come from %val1 elements 0..2.
1396  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
1397  ret <4 x half> %shuffle
1398}
1399
; Kernel that loads an <8 x i32> from the addrspace(4) pointer %in, keeps only
; elements 0..3 via a single-input shuffle, and stores the resulting <4 x i32>
; to %out. The expected output for both run lines contains no 8-dword load:
; only s_load_dwordx4 instructions appear, followed by four v_mov_b32 copies
; and one global_store_dwordx4.
; NOTE(review): the first s_load_dwordx4 (from s[4:5]) presumably fetches the
; two kernel pointer arguments and the second one reads through %in - confirm
; against the AMDGPU kernel calling convention.
1400define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out)  {
1401; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
1402; GFX9:       ; %bb.0:
1403; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1404; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1405; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1406; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
1407; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1408; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1409; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1410; GFX9-NEXT:    v_mov_b32_e32 v2, s6
1411; GFX9-NEXT:    v_mov_b32_e32 v3, s7
1412; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
1413; GFX9-NEXT:    s_endpgm
1414;
1415; GFX10-LABEL: shuffle_scalar_load_v8i32_0123:
1416; GFX10:       ; %bb.0:
1417; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1418; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1419; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1420; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
1421; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1422; GFX10-NEXT:    v_mov_b32_e32 v0, s4
1423; GFX10-NEXT:    v_mov_b32_e32 v1, s5
1424; GFX10-NEXT:    v_mov_b32_e32 v2, s6
1425; GFX10-NEXT:    v_mov_b32_e32 v3, s7
1426; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
1427; GFX10-NEXT:    s_endpgm
1428  %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16
; Identity mask over the low half: only elements 0..3 of %ld8 are used.
1429  %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1430  store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8
1431  ret void
1432}
1433
; Intrinsics called by the packed-FMA kernel defined earlier in this file
; (@llvm.fma.v2f16 for the <2 x half> fused multiply-adds and
; @llvm.amdgcn.workitem.id.x for the per-lane array index).
1434declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
1435declare i32 @llvm.amdgcn.workitem.id.x() #0
1436
; Attribute group shared by both intrinsic declarations above.
1437attributes #0 = { nounwind readnone speculatable }
1438