1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
4
5define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
6; GFX9-LABEL: shuffle_v4f16_23uu:
7; GFX9:       ; %bb.0:
8; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
10; GFX9-NEXT:    s_waitcnt vmcnt(0)
11; GFX9-NEXT:    s_setpc_b64 s[30:31]
12;
13; GFX10-LABEL: shuffle_v4f16_23uu:
14; GFX10:       ; %bb.0:
15; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
17; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
18; GFX10-NEXT:    s_waitcnt vmcnt(0)
19; GFX10-NEXT:    s_setpc_b64 s[30:31]
20  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
21  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
22  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
23  ret <4 x half> %shuffle
24}
25
26define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
27; GFX9-LABEL: shuffle_v4f16_234u:
28; GFX9:       ; %bb.0:
29; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
31; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
32; GFX9-NEXT:    s_waitcnt vmcnt(1)
33; GFX9-NEXT:    v_mov_b32_e32 v0, v6
34; GFX9-NEXT:    s_waitcnt vmcnt(0)
35; GFX9-NEXT:    v_mov_b32_e32 v1, v4
36; GFX9-NEXT:    s_setpc_b64 s[30:31]
37;
38; GFX10-LABEL: shuffle_v4f16_234u:
39; GFX10:       ; %bb.0:
40; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
42; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
43; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
44; GFX10-NEXT:    s_waitcnt vmcnt(1)
45; GFX10-NEXT:    v_mov_b32_e32 v0, v6
46; GFX10-NEXT:    s_waitcnt vmcnt(0)
47; GFX10-NEXT:    v_mov_b32_e32 v1, v4
48; GFX10-NEXT:    s_setpc_b64 s[30:31]
49  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
50  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
51  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
52  ret <4 x half> %shuffle
53}
54
55define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
56; GFX9-LABEL: shuffle_v4f16_u1u3:
57; GFX9:       ; %bb.0:
58; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
60; GFX9-NEXT:    s_waitcnt vmcnt(0)
61; GFX9-NEXT:    s_setpc_b64 s[30:31]
62;
63; GFX10-LABEL: shuffle_v4f16_u1u3:
64; GFX10:       ; %bb.0:
65; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
67; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
68; GFX10-NEXT:    s_waitcnt vmcnt(0)
69; GFX10-NEXT:    s_setpc_b64 s[30:31]
70  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
71  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
72  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
73  ret <4 x half> %shuffle
74}
75
76define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
77; GFX9-LABEL: shuffle_v4f16_u3u1:
78; GFX9:       ; %bb.0:
79; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
81; GFX9-NEXT:    s_waitcnt vmcnt(0)
82; GFX9-NEXT:    v_mov_b32_e32 v0, v2
83; GFX9-NEXT:    s_setpc_b64 s[30:31]
84;
85; GFX10-LABEL: shuffle_v4f16_u3u1:
86; GFX10:       ; %bb.0:
87; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
88; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
89; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
90; GFX10-NEXT:    s_waitcnt vmcnt(0)
91; GFX10-NEXT:    v_mov_b32_e32 v0, v2
92; GFX10-NEXT:    s_setpc_b64 s[30:31]
93  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
94  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
95  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
96  ret <4 x half> %shuffle
97}
98
99define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
100; GFX9-LABEL: shuffle_v4f16_u3uu:
101; GFX9:       ; %bb.0:
102; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
103; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
104; GFX9-NEXT:    s_waitcnt vmcnt(0)
105; GFX9-NEXT:    s_setpc_b64 s[30:31]
106;
107; GFX10-LABEL: shuffle_v4f16_u3uu:
108; GFX10:       ; %bb.0:
109; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
110; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
111; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
112; GFX10-NEXT:    s_waitcnt vmcnt(0)
113; GFX10-NEXT:    s_setpc_b64 s[30:31]
114  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
115  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
116  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
117  ret <4 x half> %shuffle
118}
119
120define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
121; GFX9-LABEL: shuffle_v4f16_3u6u:
122; GFX9:       ; %bb.0:
123; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
124; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
125; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
126; GFX9-NEXT:    s_waitcnt vmcnt(1)
127; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
128; GFX9-NEXT:    s_waitcnt vmcnt(0)
129; GFX9-NEXT:    v_mov_b32_e32 v1, v4
130; GFX9-NEXT:    s_setpc_b64 s[30:31]
131;
132; GFX10-LABEL: shuffle_v4f16_3u6u:
133; GFX10:       ; %bb.0:
134; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
136; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
137; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
138; GFX10-NEXT:    s_waitcnt vmcnt(1)
139; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
140; GFX10-NEXT:    s_waitcnt vmcnt(0)
141; GFX10-NEXT:    v_mov_b32_e32 v1, v4
142; GFX10-NEXT:    s_setpc_b64 s[30:31]
143  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
144  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
145  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
146  ret <4 x half> %shuffle
147}
148
149define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
150; GFX9-LABEL: shuffle_v4f16_3uu7:
151; GFX9:       ; %bb.0:
152; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
154; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
155; GFX9-NEXT:    s_waitcnt vmcnt(1)
156; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
157; GFX9-NEXT:    s_waitcnt vmcnt(0)
158; GFX9-NEXT:    v_mov_b32_e32 v1, v4
159; GFX9-NEXT:    s_setpc_b64 s[30:31]
160;
161; GFX10-LABEL: shuffle_v4f16_3uu7:
162; GFX10:       ; %bb.0:
163; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
165; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
166; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
167; GFX10-NEXT:    s_waitcnt vmcnt(1)
168; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
169; GFX10-NEXT:    s_waitcnt vmcnt(0)
170; GFX10-NEXT:    v_mov_b32_e32 v1, v4
171; GFX10-NEXT:    s_setpc_b64 s[30:31]
172  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
173  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
174  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
175  ret <4 x half> %shuffle
176}
177
178define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
179; GFX9-LABEL: shuffle_v4f16_35u5:
180; GFX9:       ; %bb.0:
181; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
183; GFX9-NEXT:    global_load_dword v4, v[2:3], off
184; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
185; GFX9-NEXT:    s_waitcnt vmcnt(1)
186; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
187; GFX9-NEXT:    s_waitcnt vmcnt(0)
188; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
189; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
190; GFX9-NEXT:    v_mov_b32_e32 v1, v4
191; GFX9-NEXT:    s_setpc_b64 s[30:31]
192;
193; GFX10-LABEL: shuffle_v4f16_35u5:
194; GFX10:       ; %bb.0:
195; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
196; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
197; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
198; GFX10-NEXT:    global_load_dword v4, v[2:3], off
199; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
200; GFX10-NEXT:    s_waitcnt vmcnt(1)
201; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
202; GFX10-NEXT:    s_waitcnt vmcnt(0)
203; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
204; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
205; GFX10-NEXT:    v_mov_b32_e32 v1, v4
206; GFX10-NEXT:    s_setpc_b64 s[30:31]
207  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
208  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
209  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
210  ret <4 x half> %shuffle
211}
212
213define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
214; GFX9-LABEL: shuffle_v4f16_357u:
215; GFX9:       ; %bb.0:
216; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
218; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
219; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
220; GFX9-NEXT:    s_waitcnt vmcnt(1)
221; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
222; GFX9-NEXT:    s_waitcnt vmcnt(0)
223; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
224; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
225; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
226; GFX9-NEXT:    s_setpc_b64 s[30:31]
227;
228; GFX10-LABEL: shuffle_v4f16_357u:
229; GFX10:       ; %bb.0:
230; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
231; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
232; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
233; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
234; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
235; GFX10-NEXT:    s_waitcnt vmcnt(1)
236; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
237; GFX10-NEXT:    s_waitcnt vmcnt(0)
238; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
239; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
240; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
241; GFX10-NEXT:    s_setpc_b64 s[30:31]
242  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
243  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
244  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
245  ret <4 x half> %shuffle
246}
247
248define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
249; GFX9-LABEL: shuffle_v4f16_0101:
250; GFX9:       ; %bb.0:
251; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252; GFX9-NEXT:    global_load_dword v0, v[0:1], off
253; GFX9-NEXT:    s_waitcnt vmcnt(0)
254; GFX9-NEXT:    v_mov_b32_e32 v1, v0
255; GFX9-NEXT:    s_setpc_b64 s[30:31]
256;
257; GFX10-LABEL: shuffle_v4f16_0101:
258; GFX10:       ; %bb.0:
259; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
260; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
261; GFX10-NEXT:    global_load_dword v0, v[0:1], off
262; GFX10-NEXT:    s_waitcnt vmcnt(0)
263; GFX10-NEXT:    v_mov_b32_e32 v1, v0
264; GFX10-NEXT:    s_setpc_b64 s[30:31]
265  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
266  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
267  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
268  ret <4 x half> %shuffle
269}
270
271define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
272; GFX9-LABEL: shuffle_v4f16_0123:
273; GFX9:       ; %bb.0:
274; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
275; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
276; GFX9-NEXT:    s_waitcnt vmcnt(0)
277; GFX9-NEXT:    s_setpc_b64 s[30:31]
278;
279; GFX10-LABEL: shuffle_v4f16_0123:
280; GFX10:       ; %bb.0:
281; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
282; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
283; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
284; GFX10-NEXT:    s_waitcnt vmcnt(0)
285; GFX10-NEXT:    s_setpc_b64 s[30:31]
286  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
287  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
288  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
289  ret <4 x half> %shuffle
290}
291
292define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
293; GFX9-LABEL: shuffle_v4f16_0145:
294; GFX9:       ; %bb.0:
295; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
296; GFX9-NEXT:    global_load_dword v4, v[0:1], off
297; GFX9-NEXT:    global_load_dword v5, v[2:3], off
298; GFX9-NEXT:    s_waitcnt vmcnt(1)
299; GFX9-NEXT:    v_mov_b32_e32 v0, v4
300; GFX9-NEXT:    s_waitcnt vmcnt(0)
301; GFX9-NEXT:    v_mov_b32_e32 v1, v5
302; GFX9-NEXT:    s_setpc_b64 s[30:31]
303;
304; GFX10-LABEL: shuffle_v4f16_0145:
305; GFX10:       ; %bb.0:
306; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
308; GFX10-NEXT:    global_load_dword v4, v[0:1], off
309; GFX10-NEXT:    global_load_dword v5, v[2:3], off
310; GFX10-NEXT:    s_waitcnt vmcnt(1)
311; GFX10-NEXT:    v_mov_b32_e32 v0, v4
312; GFX10-NEXT:    s_waitcnt vmcnt(0)
313; GFX10-NEXT:    v_mov_b32_e32 v1, v5
314; GFX10-NEXT:    s_setpc_b64 s[30:31]
315  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
316  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
317  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
318  ret <4 x half> %shuffle
319}
320
321define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
322; GFX9-LABEL: shuffle_v4f16_0167:
323; GFX9:       ; %bb.0:
324; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325; GFX9-NEXT:    global_load_dword v4, v[0:1], off
326; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
327; GFX9-NEXT:    s_waitcnt vmcnt(1)
328; GFX9-NEXT:    v_mov_b32_e32 v0, v4
329; GFX9-NEXT:    s_waitcnt vmcnt(0)
330; GFX9-NEXT:    v_mov_b32_e32 v1, v5
331; GFX9-NEXT:    s_setpc_b64 s[30:31]
332;
333; GFX10-LABEL: shuffle_v4f16_0167:
334; GFX10:       ; %bb.0:
335; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
336; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
337; GFX10-NEXT:    global_load_dword v4, v[0:1], off
338; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
339; GFX10-NEXT:    s_waitcnt vmcnt(1)
340; GFX10-NEXT:    v_mov_b32_e32 v0, v4
341; GFX10-NEXT:    s_waitcnt vmcnt(0)
342; GFX10-NEXT:    v_mov_b32_e32 v1, v5
343; GFX10-NEXT:    s_setpc_b64 s[30:31]
344  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
345  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
346  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
347  ret <4 x half> %shuffle
348}
349
350define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
351; GFX9-LABEL: shuffle_v4f16_2301:
352; GFX9:       ; %bb.0:
353; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
354; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
355; GFX9-NEXT:    s_waitcnt vmcnt(0)
356; GFX9-NEXT:    v_mov_b32_e32 v0, v2
357; GFX9-NEXT:    s_setpc_b64 s[30:31]
358;
359; GFX10-LABEL: shuffle_v4f16_2301:
360; GFX10:       ; %bb.0:
361; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
362; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
363; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
364; GFX10-NEXT:    s_waitcnt vmcnt(0)
365; GFX10-NEXT:    v_mov_b32_e32 v0, v2
366; GFX10-NEXT:    s_setpc_b64 s[30:31]
367  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
368  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
369  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
370  ret <4 x half> %shuffle
371}
372
373define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
374; GFX9-LABEL: shuffle_v4f16_2323:
375; GFX9:       ; %bb.0:
376; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
378; GFX9-NEXT:    s_waitcnt vmcnt(0)
379; GFX9-NEXT:    v_mov_b32_e32 v1, v0
380; GFX9-NEXT:    s_setpc_b64 s[30:31]
381;
382; GFX10-LABEL: shuffle_v4f16_2323:
383; GFX10:       ; %bb.0:
384; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
385; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
386; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
387; GFX10-NEXT:    s_waitcnt vmcnt(0)
388; GFX10-NEXT:    v_mov_b32_e32 v1, v0
389; GFX10-NEXT:    s_setpc_b64 s[30:31]
390  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
391  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
392  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
393  ret <4 x half> %shuffle
394}
395
396define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
397; GFX9-LABEL: shuffle_v4f16_2345:
398; GFX9:       ; %bb.0:
399; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
400; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
401; GFX9-NEXT:    global_load_dword v5, v[2:3], off
402; GFX9-NEXT:    s_waitcnt vmcnt(1)
403; GFX9-NEXT:    v_mov_b32_e32 v0, v4
404; GFX9-NEXT:    s_waitcnt vmcnt(0)
405; GFX9-NEXT:    v_mov_b32_e32 v1, v5
406; GFX9-NEXT:    s_setpc_b64 s[30:31]
407;
408; GFX10-LABEL: shuffle_v4f16_2345:
409; GFX10:       ; %bb.0:
410; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
411; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
412; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
413; GFX10-NEXT:    global_load_dword v5, v[2:3], off
414; GFX10-NEXT:    s_waitcnt vmcnt(1)
415; GFX10-NEXT:    v_mov_b32_e32 v0, v4
416; GFX10-NEXT:    s_waitcnt vmcnt(0)
417; GFX10-NEXT:    v_mov_b32_e32 v1, v5
418; GFX10-NEXT:    s_setpc_b64 s[30:31]
419  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
420  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
421  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
422  ret <4 x half> %shuffle
423}
424
425define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
426; GFX9-LABEL: shuffle_v4f16_2367:
427; GFX9:       ; %bb.0:
428; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
429; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
430; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
431; GFX9-NEXT:    s_waitcnt vmcnt(1)
432; GFX9-NEXT:    v_mov_b32_e32 v0, v4
433; GFX9-NEXT:    s_waitcnt vmcnt(0)
434; GFX9-NEXT:    v_mov_b32_e32 v1, v5
435; GFX9-NEXT:    s_setpc_b64 s[30:31]
436;
437; GFX10-LABEL: shuffle_v4f16_2367:
438; GFX10:       ; %bb.0:
439; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
440; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
441; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
442; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
443; GFX10-NEXT:    s_waitcnt vmcnt(1)
444; GFX10-NEXT:    v_mov_b32_e32 v0, v4
445; GFX10-NEXT:    s_waitcnt vmcnt(0)
446; GFX10-NEXT:    v_mov_b32_e32 v1, v5
447; GFX10-NEXT:    s_setpc_b64 s[30:31]
448  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
449  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
450  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
451  ret <4 x half> %shuffle
452}
453
454define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
455; GFX9-LABEL: shuffle_v4f16_4501:
456; GFX9:       ; %bb.0:
457; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458; GFX9-NEXT:    global_load_dword v4, v[2:3], off
459; GFX9-NEXT:    global_load_dword v5, v[0:1], off
460; GFX9-NEXT:    s_waitcnt vmcnt(1)
461; GFX9-NEXT:    v_mov_b32_e32 v0, v4
462; GFX9-NEXT:    s_waitcnt vmcnt(0)
463; GFX9-NEXT:    v_mov_b32_e32 v1, v5
464; GFX9-NEXT:    s_setpc_b64 s[30:31]
465;
466; GFX10-LABEL: shuffle_v4f16_4501:
467; GFX10:       ; %bb.0:
468; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
469; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
470; GFX10-NEXT:    global_load_dword v4, v[2:3], off
471; GFX10-NEXT:    global_load_dword v5, v[0:1], off
472; GFX10-NEXT:    s_waitcnt vmcnt(1)
473; GFX10-NEXT:    v_mov_b32_e32 v0, v4
474; GFX10-NEXT:    s_waitcnt vmcnt(0)
475; GFX10-NEXT:    v_mov_b32_e32 v1, v5
476; GFX10-NEXT:    s_setpc_b64 s[30:31]
477  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
478  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
479  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
480  ret <4 x half> %shuffle
481}
482
483define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
484; GFX9-LABEL: shuffle_v4f16_4523:
485; GFX9:       ; %bb.0:
486; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
487; GFX9-NEXT:    global_load_dword v4, v[2:3], off
488; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
489; GFX9-NEXT:    s_waitcnt vmcnt(1)
490; GFX9-NEXT:    v_mov_b32_e32 v0, v4
491; GFX9-NEXT:    s_waitcnt vmcnt(0)
492; GFX9-NEXT:    v_mov_b32_e32 v1, v5
493; GFX9-NEXT:    s_setpc_b64 s[30:31]
494;
495; GFX10-LABEL: shuffle_v4f16_4523:
496; GFX10:       ; %bb.0:
497; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
498; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
499; GFX10-NEXT:    global_load_dword v4, v[2:3], off
500; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
501; GFX10-NEXT:    s_waitcnt vmcnt(1)
502; GFX10-NEXT:    v_mov_b32_e32 v0, v4
503; GFX10-NEXT:    s_waitcnt vmcnt(0)
504; GFX10-NEXT:    v_mov_b32_e32 v1, v5
505; GFX10-NEXT:    s_setpc_b64 s[30:31]
506  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
507  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
508  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
509  ret <4 x half> %shuffle
510}
511
512define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
513; GFX9-LABEL: shuffle_v4f16_4545:
514; GFX9:       ; %bb.0:
515; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
516; GFX9-NEXT:    global_load_dword v0, v[2:3], off
517; GFX9-NEXT:    s_waitcnt vmcnt(0)
518; GFX9-NEXT:    v_mov_b32_e32 v1, v0
519; GFX9-NEXT:    s_setpc_b64 s[30:31]
520;
521; GFX10-LABEL: shuffle_v4f16_4545:
522; GFX10:       ; %bb.0:
523; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
524; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
525; GFX10-NEXT:    global_load_dword v0, v[2:3], off
526; GFX10-NEXT:    s_waitcnt vmcnt(0)
527; GFX10-NEXT:    v_mov_b32_e32 v1, v0
528; GFX10-NEXT:    s_setpc_b64 s[30:31]
529  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
530  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
531  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
532  ret <4 x half> %shuffle
533}
534
535define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
536; GFX9-LABEL: shuffle_v4f16_4567:
537; GFX9:       ; %bb.0:
538; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
539; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
540; GFX9-NEXT:    s_waitcnt vmcnt(0)
541; GFX9-NEXT:    s_setpc_b64 s[30:31]
542;
543; GFX10-LABEL: shuffle_v4f16_4567:
544; GFX10:       ; %bb.0:
545; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
546; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
547; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
548; GFX10-NEXT:    s_waitcnt vmcnt(0)
549; GFX10-NEXT:    s_setpc_b64 s[30:31]
550  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
551  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
552  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
553  ret <4 x half> %shuffle
554}
555
556define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
557; GFX9-LABEL: shuffle_v4f16_6701:
558; GFX9:       ; %bb.0:
559; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
560; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
561; GFX9-NEXT:    global_load_dword v5, v[0:1], off
562; GFX9-NEXT:    s_waitcnt vmcnt(1)
563; GFX9-NEXT:    v_mov_b32_e32 v0, v4
564; GFX9-NEXT:    s_waitcnt vmcnt(0)
565; GFX9-NEXT:    v_mov_b32_e32 v1, v5
566; GFX9-NEXT:    s_setpc_b64 s[30:31]
567;
568; GFX10-LABEL: shuffle_v4f16_6701:
569; GFX10:       ; %bb.0:
570; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
571; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
572; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
573; GFX10-NEXT:    global_load_dword v5, v[0:1], off
574; GFX10-NEXT:    s_waitcnt vmcnt(1)
575; GFX10-NEXT:    v_mov_b32_e32 v0, v4
576; GFX10-NEXT:    s_waitcnt vmcnt(0)
577; GFX10-NEXT:    v_mov_b32_e32 v1, v5
578; GFX10-NEXT:    s_setpc_b64 s[30:31]
579  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
580  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
581  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
582  ret <4 x half> %shuffle
583}
584
585define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
586; GFX9-LABEL: shuffle_v4f16_6723:
587; GFX9:       ; %bb.0:
588; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
589; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
590; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
591; GFX9-NEXT:    s_waitcnt vmcnt(1)
592; GFX9-NEXT:    v_mov_b32_e32 v0, v4
593; GFX9-NEXT:    s_waitcnt vmcnt(0)
594; GFX9-NEXT:    v_mov_b32_e32 v1, v5
595; GFX9-NEXT:    s_setpc_b64 s[30:31]
596;
597; GFX10-LABEL: shuffle_v4f16_6723:
598; GFX10:       ; %bb.0:
599; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
600; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
601; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
602; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
603; GFX10-NEXT:    s_waitcnt vmcnt(1)
604; GFX10-NEXT:    v_mov_b32_e32 v0, v4
605; GFX10-NEXT:    s_waitcnt vmcnt(0)
606; GFX10-NEXT:    v_mov_b32_e32 v1, v5
607; GFX10-NEXT:    s_setpc_b64 s[30:31]
608  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
609  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
610  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
611  ret <4 x half> %shuffle
612}
613
614define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
615; GFX9-LABEL: shuffle_v4f16_6745:
616; GFX9:       ; %bb.0:
617; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
618; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
619; GFX9-NEXT:    s_waitcnt vmcnt(0)
620; GFX9-NEXT:    v_mov_b32_e32 v0, v2
621; GFX9-NEXT:    s_setpc_b64 s[30:31]
622;
623; GFX10-LABEL: shuffle_v4f16_6745:
624; GFX10:       ; %bb.0:
625; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
626; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
627; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
628; GFX10-NEXT:    s_waitcnt vmcnt(0)
629; GFX10-NEXT:    v_mov_b32_e32 v0, v2
630; GFX10-NEXT:    s_setpc_b64 s[30:31]
631  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
632  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
633  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
634  ret <4 x half> %shuffle
635}
636
637define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
638; GFX9-LABEL: shuffle_v4f16_6767:
639; GFX9:       ; %bb.0:
640; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
641; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
642; GFX9-NEXT:    s_waitcnt vmcnt(0)
643; GFX9-NEXT:    v_mov_b32_e32 v1, v0
644; GFX9-NEXT:    s_setpc_b64 s[30:31]
645;
646; GFX10-LABEL: shuffle_v4f16_6767:
647; GFX10:       ; %bb.0:
648; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
649; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
650; GFX10-NEXT:    global_load_dword v0, v[2:3], off offset:4
651; GFX10-NEXT:    s_waitcnt vmcnt(0)
652; GFX10-NEXT:    v_mov_b32_e32 v1, v0
653; GFX10-NEXT:    s_setpc_b64 s[30:31]
654  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
655  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
656  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
657  ret <4 x half> %shuffle
658}
659
660define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
661; GFX9-LABEL: shuffle_v4f16_2356:
662; GFX9:       ; %bb.0:
663; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
664; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
665; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
666; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
667; GFX9-NEXT:    s_waitcnt vmcnt(1)
668; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
669; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
670; GFX9-NEXT:    s_waitcnt vmcnt(0)
671; GFX9-NEXT:    v_mov_b32_e32 v0, v4
672; GFX9-NEXT:    s_setpc_b64 s[30:31]
673;
674; GFX10-LABEL: shuffle_v4f16_2356:
675; GFX10:       ; %bb.0:
676; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
677; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
678; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
679; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
680; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
681; GFX10-NEXT:    s_waitcnt vmcnt(1)
682; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
683; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
684; GFX10-NEXT:    s_waitcnt vmcnt(0)
685; GFX10-NEXT:    v_mov_b32_e32 v0, v4
686; GFX10-NEXT:    s_setpc_b64 s[30:31]
687  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
688  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
689  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
690  ret <4 x half> %shuffle
691}
692
693define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
694; GFX9-LABEL: shuffle_v4f16_5623:
695; GFX9:       ; %bb.0:
696; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
698; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
699; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
700; GFX9-NEXT:    s_waitcnt vmcnt(1)
701; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
702; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
703; GFX9-NEXT:    s_waitcnt vmcnt(0)
704; GFX9-NEXT:    v_mov_b32_e32 v1, v4
705; GFX9-NEXT:    s_setpc_b64 s[30:31]
706;
707; GFX10-LABEL: shuffle_v4f16_5623:
708; GFX10:       ; %bb.0:
709; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
710; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
711; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
712; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
713; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
714; GFX10-NEXT:    s_waitcnt vmcnt(1)
715; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
716; GFX10-NEXT:    s_waitcnt vmcnt(0)
717; GFX10-NEXT:    v_mov_b32_e32 v1, v4
718; GFX10-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
719; GFX10-NEXT:    s_setpc_b64 s[30:31]
720  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
721  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
722  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
723  ret <4 x half> %shuffle
724}
725
726define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
727; GFX9-LABEL: shuffle_v4f16_3456:
728; GFX9:       ; %bb.0:
729; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
731; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
732; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
733; GFX9-NEXT:    s_waitcnt vmcnt(1)
734; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
735; GFX9-NEXT:    s_waitcnt vmcnt(0)
736; GFX9-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
737; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v1
738; GFX9-NEXT:    v_lshl_or_b32 v1, v5, 16, v2
739; GFX9-NEXT:    s_setpc_b64 s[30:31]
740;
741; GFX10-LABEL: shuffle_v4f16_3456:
742; GFX10:       ; %bb.0:
743; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
744; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
745; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
746; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
747; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
748; GFX10-NEXT:    s_waitcnt vmcnt(1)
749; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
750; GFX10-NEXT:    s_waitcnt vmcnt(0)
751; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
752; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v1
753; GFX10-NEXT:    v_lshl_or_b32 v1, v5, 16, v2
754; GFX10-NEXT:    s_setpc_b64 s[30:31]
755  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
756  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
757  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
758  ret <4 x half> %shuffle
759}
760
761define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
762; GFX9-LABEL: shuffle_v4f16_5634:
763; GFX9:       ; %bb.0:
764; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
765; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
766; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
767; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
768; GFX9-NEXT:    s_waitcnt vmcnt(1)
769; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
770; GFX9-NEXT:    s_waitcnt vmcnt(0)
771; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
772; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
773; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
774; GFX9-NEXT:    s_setpc_b64 s[30:31]
775;
776; GFX10-LABEL: shuffle_v4f16_5634:
777; GFX10:       ; %bb.0:
778; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
779; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
780; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
781; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
782; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
783; GFX10-NEXT:    s_waitcnt vmcnt(1)
784; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
785; GFX10-NEXT:    s_waitcnt vmcnt(0)
786; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
787; GFX10-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
788; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v2
789; GFX10-NEXT:    s_setpc_b64 s[30:31]
790  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
791  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
792  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
793  ret <4 x half> %shuffle
794}
795
796define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
797; GFX9-LABEL: shuffle_v4f16_5734:
798; GFX9:       ; %bb.0:
799; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
800; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
801; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
802; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
803; GFX9-NEXT:    s_waitcnt vmcnt(1)
804; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
805; GFX9-NEXT:    s_waitcnt vmcnt(0)
806; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
807; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
808; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
809; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
810; GFX9-NEXT:    s_setpc_b64 s[30:31]
811;
812; GFX10-LABEL: shuffle_v4f16_5734:
813; GFX10:       ; %bb.0:
814; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
815; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
816; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
817; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
818; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
819; GFX10-NEXT:    s_waitcnt vmcnt(1)
820; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
821; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
822; GFX10-NEXT:    s_waitcnt vmcnt(0)
823; GFX10-NEXT:    v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
824; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
825; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
826; GFX10-NEXT:    s_setpc_b64 s[30:31]
827  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
828  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
829  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
830  ret <4 x half> %shuffle
831}
832
833define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
834; GFX9-LABEL: shuffle_v4i16_2356:
835; GFX9:       ; %bb.0:
836; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
837; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
838; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
839; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
840; GFX9-NEXT:    s_waitcnt vmcnt(1)
841; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
842; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
843; GFX9-NEXT:    s_waitcnt vmcnt(0)
844; GFX9-NEXT:    v_mov_b32_e32 v0, v4
845; GFX9-NEXT:    s_setpc_b64 s[30:31]
846;
847; GFX10-LABEL: shuffle_v4i16_2356:
848; GFX10:       ; %bb.0:
849; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
850; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
851; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
852; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
853; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
854; GFX10-NEXT:    s_waitcnt vmcnt(1)
855; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
856; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
857; GFX10-NEXT:    s_waitcnt vmcnt(0)
858; GFX10-NEXT:    v_mov_b32_e32 v0, v4
859; GFX10-NEXT:    s_setpc_b64 s[30:31]
860  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
861  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
862  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
863  ret <4 x i16> %shuffle
864}
865
866define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
867; GFX9-LABEL: shuffle_v4i16_0167:
868; GFX9:       ; %bb.0:
869; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870; GFX9-NEXT:    global_load_dword v4, v[0:1], off
871; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
872; GFX9-NEXT:    s_waitcnt vmcnt(1)
873; GFX9-NEXT:    v_mov_b32_e32 v0, v4
874; GFX9-NEXT:    s_waitcnt vmcnt(0)
875; GFX9-NEXT:    v_mov_b32_e32 v1, v5
876; GFX9-NEXT:    s_setpc_b64 s[30:31]
877;
878; GFX10-LABEL: shuffle_v4i16_0167:
879; GFX10:       ; %bb.0:
880; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
881; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
882; GFX10-NEXT:    global_load_dword v4, v[0:1], off
883; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
884; GFX10-NEXT:    s_waitcnt vmcnt(1)
885; GFX10-NEXT:    v_mov_b32_e32 v0, v4
886; GFX10-NEXT:    s_waitcnt vmcnt(0)
887; GFX10-NEXT:    v_mov_b32_e32 v1, v5
888; GFX10-NEXT:    s_setpc_b64 s[30:31]
889  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
890  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
891  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
892  ret <4 x i16> %shuffle
893}
894
895define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
896; GFX9-LABEL: shuffle_v4f16_0000:
897; GFX9:       ; %bb.0:
898; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
899; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
900; GFX9-NEXT:    s_waitcnt vmcnt(0)
901; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
902; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
903; GFX9-NEXT:    v_mov_b32_e32 v1, v0
904; GFX9-NEXT:    s_setpc_b64 s[30:31]
905;
906; GFX10-LABEL: shuffle_v4f16_0000:
907; GFX10:       ; %bb.0:
908; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
909; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
910; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
911; GFX10-NEXT:    s_waitcnt vmcnt(0)
912; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v0
913; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
914; GFX10-NEXT:    v_mov_b32_e32 v1, v0
915; GFX10-NEXT:    s_setpc_b64 s[30:31]
916  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
917  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
918  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
919  ret <4 x half> %shuffle
920}
921
922define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
923; GFX9-LABEL: shuffle_v4f16_1010:
924; GFX9:       ; %bb.0:
925; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
926; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
927; GFX9-NEXT:    s_waitcnt vmcnt(0)
928; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
929; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
930; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
931; GFX9-NEXT:    v_mov_b32_e32 v1, v0
932; GFX9-NEXT:    s_setpc_b64 s[30:31]
933;
934; GFX10-LABEL: shuffle_v4f16_1010:
935; GFX10:       ; %bb.0:
936; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
937; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
938; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
939; GFX10-NEXT:    s_waitcnt vmcnt(0)
940; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffff
941; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
942; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
943; GFX10-NEXT:    v_mov_b32_e32 v1, v0
944; GFX10-NEXT:    s_setpc_b64 s[30:31]
945  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
946  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
947  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
948  ret <4 x half> %shuffle
949}
950
951define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
952; GFX9-LABEL: shuffle_v4f16_1100:
953; GFX9:       ; %bb.0:
954; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
955; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
956; GFX9-NEXT:    s_waitcnt vmcnt(0)
957; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
958; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
959; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
960; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
961; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
962; GFX9-NEXT:    s_setpc_b64 s[30:31]
963;
964; GFX10-LABEL: shuffle_v4f16_1100:
965; GFX10:       ; %bb.0:
966; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
967; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
968; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
969; GFX10-NEXT:    s_waitcnt vmcnt(0)
970; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
971; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v1
972; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v0
973; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
974; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
975; GFX10-NEXT:    s_setpc_b64 s[30:31]
976  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
977  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
978  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
979  ret <4 x half> %shuffle
980}
981
982define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
983; GFX9-LABEL: shuffle_v4f16_6161:
984; GFX9:       ; %bb.0:
985; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
986; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
987; GFX9-NEXT:    global_load_dword v5, v[0:1], off
988; GFX9-NEXT:    s_waitcnt vmcnt(1)
989; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
990; GFX9-NEXT:    s_waitcnt vmcnt(0)
991; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
992; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
993; GFX9-NEXT:    v_mov_b32_e32 v1, v0
994; GFX9-NEXT:    s_setpc_b64 s[30:31]
995;
996; GFX10-LABEL: shuffle_v4f16_6161:
997; GFX10:       ; %bb.0:
998; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
999; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1000; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
1001; GFX10-NEXT:    global_load_dword v5, v[0:1], off
1002; GFX10-NEXT:    s_waitcnt vmcnt(1)
1003; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
1004; GFX10-NEXT:    s_waitcnt vmcnt(0)
1005; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
1006; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1007; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1008; GFX10-NEXT:    s_setpc_b64 s[30:31]
1009  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1010  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1011  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
1012  ret <4 x half> %shuffle
1013}
1014
1015define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1016; GFX9-LABEL: shuffle_v4f16_2333:
1017; GFX9:       ; %bb.0:
1018; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1019; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
1020; GFX9-NEXT:    s_waitcnt vmcnt(0)
1021; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1022; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1023; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1024; GFX9-NEXT:    s_setpc_b64 s[30:31]
1025;
1026; GFX10-LABEL: shuffle_v4f16_2333:
1027; GFX10:       ; %bb.0:
1028; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1029; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1030; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
1031; GFX10-NEXT:    s_waitcnt vmcnt(0)
1032; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1033; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1034; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1035; GFX10-NEXT:    s_setpc_b64 s[30:31]
1036  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1037  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1038  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
1039  ret <4 x half> %shuffle
1040}
1041
1042define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1043; GFX9-LABEL: shuffle_v4f16_6667:
1044; GFX9:       ; %bb.0:
1045; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1046; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
1047; GFX9-NEXT:    s_waitcnt vmcnt(0)
1048; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1049; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1050; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1051; GFX9-NEXT:    s_setpc_b64 s[30:31]
1052;
1053; GFX10-LABEL: shuffle_v4f16_6667:
1054; GFX10:       ; %bb.0:
1055; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1056; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1057; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
1058; GFX10-NEXT:    s_waitcnt vmcnt(0)
1059; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1060; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1061; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1062; GFX10-NEXT:    s_setpc_b64 s[30:31]
1063  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1064  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1065  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
1066  ret <4 x half> %shuffle
1067}
1068
1069define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1070; GFX9-LABEL: shuffle_v8f16_0101:
1071; GFX9:       ; %bb.0:
1072; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1073; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1074; GFX9-NEXT:    s_waitcnt vmcnt(0)
1075; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1076; GFX9-NEXT:    s_setpc_b64 s[30:31]
1077;
1078; GFX10-LABEL: shuffle_v8f16_0101:
1079; GFX10:       ; %bb.0:
1080; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1081; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1082; GFX10-NEXT:    global_load_dword v0, v[0:1], off
1083; GFX10-NEXT:    s_waitcnt vmcnt(0)
1084; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1085; GFX10-NEXT:    s_setpc_b64 s[30:31]
1086  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1087  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1088  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1089  ret <4 x half> %shuffle
1090}
1091
1092define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1093; GFX9-LABEL: shuffle_v8f16_0123:
1094; GFX9:       ; %bb.0:
1095; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1096; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1097; GFX9-NEXT:    s_waitcnt vmcnt(0)
1098; GFX9-NEXT:    s_setpc_b64 s[30:31]
1099;
1100; GFX10-LABEL: shuffle_v8f16_0123:
1101; GFX10:       ; %bb.0:
1102; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1103; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1104; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1105; GFX10-NEXT:    s_waitcnt vmcnt(0)
1106; GFX10-NEXT:    s_setpc_b64 s[30:31]
1107  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1108  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1109  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1110  ret <4 x half> %shuffle
1111}
1112
1113define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1114; GFX9-LABEL: shuffle_v8f16_4589:
1115; GFX9:       ; %bb.0:
1116; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1117; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:8
1118; GFX9-NEXT:    global_load_dword v5, v[2:3], off
1119; GFX9-NEXT:    s_waitcnt vmcnt(1)
1120; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1121; GFX9-NEXT:    s_waitcnt vmcnt(0)
1122; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1123; GFX9-NEXT:    s_setpc_b64 s[30:31]
1124;
1125; GFX10-LABEL: shuffle_v8f16_4589:
1126; GFX10:       ; %bb.0:
1127; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1128; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1129; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:8
1130; GFX10-NEXT:    global_load_dword v5, v[2:3], off
1131; GFX10-NEXT:    s_waitcnt vmcnt(1)
1132; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1133; GFX10-NEXT:    s_waitcnt vmcnt(0)
1134; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1135; GFX10-NEXT:    s_setpc_b64 s[30:31]
1136  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1137  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1138  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
1139  ret <4 x half> %shuffle
1140}
1141
1142define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1143; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
1144; GFX9:       ; %bb.0:
1145; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1146; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
1147; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
1148; GFX9-NEXT:    s_waitcnt vmcnt(1)
1149; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1150; GFX9-NEXT:    s_waitcnt vmcnt(0)
1151; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1152; GFX9-NEXT:    s_setpc_b64 s[30:31]
1153;
1154; GFX10-LABEL: shuffle_v8f16_10_11_2_3:
1155; GFX10:       ; %bb.0:
1156; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1157; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1158; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
1159; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
1160; GFX10-NEXT:    s_waitcnt vmcnt(1)
1161; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1162; GFX10-NEXT:    s_waitcnt vmcnt(0)
1163; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1164; GFX10-NEXT:    s_setpc_b64 s[30:31]
1165  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1166  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1167  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
1168  ret <4 x half> %shuffle
1169}
1170
1171define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1172; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
1173; GFX9:       ; %bb.0:
1174; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1175; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
1176; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
1177; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
1178; GFX9-NEXT:    s_waitcnt vmcnt(1)
1179; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1180; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
1181; GFX9-NEXT:    s_waitcnt vmcnt(0)
1182; GFX9-NEXT:    v_mov_b32_e32 v1, v4
1183; GFX9-NEXT:    s_setpc_b64 s[30:31]
1184;
1185; GFX10-LABEL: shuffle_v8f16_13_14_2_3:
1186; GFX10:       ; %bb.0:
1187; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1188; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1189; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
1190; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
1191; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
1192; GFX10-NEXT:    s_waitcnt vmcnt(1)
1193; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1194; GFX10-NEXT:    s_waitcnt vmcnt(0)
1195; GFX10-NEXT:    v_mov_b32_e32 v1, v4
1196; GFX10-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
1197; GFX10-NEXT:    s_setpc_b64 s[30:31]
1198  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1199  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1200  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
1201  ret <4 x half> %shuffle
1202}
1203
1204define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
1205; GFX9-LABEL: shuffle_v3f16_0122:
1206; GFX9:       ; %bb.0:
1207; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1208; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1209; GFX9-NEXT:    s_waitcnt vmcnt(0)
1210; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1211; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1212; GFX9-NEXT:    s_setpc_b64 s[30:31]
1213;
1214; GFX10-LABEL: shuffle_v3f16_0122:
1215; GFX10:       ; %bb.0:
1216; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1217; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1218; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1219; GFX10-NEXT:    s_waitcnt vmcnt(0)
1220; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1221; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
1222; GFX10-NEXT:    s_setpc_b64 s[30:31]
1223  %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
1224  %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
1225  %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1226  ret <4 x half> %shuffle
1227}
1228
1229define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
1230; GFX9-LABEL: shuffle_v2f16_0122:
1231; GFX9:       ; %bb.0:
1232; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1233; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1234; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
1235; GFX9-NEXT:    s_waitcnt vmcnt(0)
1236; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1237; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
1238; GFX9-NEXT:    s_setpc_b64 s[30:31]
1239;
1240; GFX10-LABEL: shuffle_v2f16_0122:
1241; GFX10:       ; %bb.0:
1242; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1243; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1244; GFX10-NEXT:    global_load_dword v0, v[0:1], off
1245; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffff
1246; GFX10-NEXT:    s_waitcnt vmcnt(0)
1247; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1248; GFX10-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
1249; GFX10-NEXT:    s_setpc_b64 s[30:31]
1250  %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
1251  %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
1252  %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
1253  ret <4 x half> %shuffle
1254}
1255
1256define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
1257; GFX9-LABEL: shuffle_v6f16_452367:
1258; GFX9:       ; %bb.0:
1259; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1260; GFX9-NEXT:    v_mov_b32_e32 v6, v1
1261; GFX9-NEXT:    v_mov_b32_e32 v5, v0
1262; GFX9-NEXT:    v_mov_b32_e32 v4, v3
1263; GFX9-NEXT:    v_mov_b32_e32 v3, v2
1264; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
1265; GFX9-NEXT:    global_load_dword v7, v[3:4], off
1266; GFX9-NEXT:    s_waitcnt vmcnt(1)
1267; GFX9-NEXT:    v_mov_b32_e32 v0, v2
1268; GFX9-NEXT:    s_waitcnt vmcnt(0)
1269; GFX9-NEXT:    v_mov_b32_e32 v2, v7
1270; GFX9-NEXT:    s_setpc_b64 s[30:31]
1271;
1272; GFX10-LABEL: shuffle_v6f16_452367:
1273; GFX10:       ; %bb.0:
1274; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1275; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1276; GFX10-NEXT:    v_mov_b32_e32 v6, v1
1277; GFX10-NEXT:    v_mov_b32_e32 v5, v0
1278; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1279; GFX10-NEXT:    v_mov_b32_e32 v3, v2
1280; GFX10-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
1281; GFX10-NEXT:    global_load_dword v7, v[3:4], off
1282; GFX10-NEXT:    s_waitcnt vmcnt(1)
1283; GFX10-NEXT:    v_mov_b32_e32 v0, v2
1284; GFX10-NEXT:    s_waitcnt vmcnt(0)
1285; GFX10-NEXT:    v_mov_b32_e32 v2, v7
1286; GFX10-NEXT:    s_setpc_b64 s[30:31]
1287  %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
1288  %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
1289  %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
1290  ret <6 x half> %shuffle
1291}
1292
1293define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C)  {
1294; GFX9-LABEL: fma_shuffle:
1295; GFX9:       ; %bb.0: ; %entry
1296; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1297; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1298; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
1299; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1300; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
1301; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
1302; GFX9-NEXT:    global_load_dwordx2 v[4:5], v6, s[6:7]
1303; GFX9-NEXT:    s_waitcnt vmcnt(0)
1304; GFX9-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1305; GFX9-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1306; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1307; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1308; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
1309; GFX9-NEXT:    s_endpgm
1310;
1311; GFX10-LABEL: fma_shuffle:
1312; GFX10:       ; %bb.0: ; %entry
1313; GFX10-NEXT:    s_clause 0x1
1314; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1315; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1316; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
1317; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1318; GFX10-NEXT:    s_clause 0x2
1319; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
1320; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
1321; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[6:7]
1322; GFX10-NEXT:    s_waitcnt vmcnt(0)
1323; GFX10-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1324; GFX10-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1325; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1326; GFX10-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1327; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
1328; GFX10-NEXT:    s_endpgm
1329entry:
1330  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
1331  %tmp12 = zext i32 %tmp1 to i64
1332  %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12
1333  %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8
1334  %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12
1335  %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8
1336  %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12
1337  %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8
1338  %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
1339  %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
1340  %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
1341  %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19)
1342  %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1>
1343  %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3>
1344  %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20)
1345  %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1346  %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
1347  %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2>
1348  %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3>
1349  %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27)
1350  %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
1351  %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
1352  %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1353  %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1354  store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8
1355  ret void
1356}
1357
1358define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1359; GFX9-LABEL: shuffle_v4f16_0456:
1360; GFX9:       ; %bb.0:
1361; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1362; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
1363; GFX9-NEXT:    s_waitcnt vmcnt(0)
1364; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
1365; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
1366; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
1367; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
1368; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v4
1369; GFX9-NEXT:    s_waitcnt vmcnt(0)
1370; GFX9-NEXT:    v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1371; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
1372; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v2
1373; GFX9-NEXT:    s_setpc_b64 s[30:31]
1374;
1375; GFX10-LABEL: shuffle_v4f16_0456:
1376; GFX10:       ; %bb.0:
1377; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1378; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1379; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
1380; GFX10-NEXT:    s_waitcnt vmcnt(0)
1381; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
1382; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
1383; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffff
1384; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
1385; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v4
1386; GFX10-NEXT:    s_waitcnt vmcnt(0)
1387; GFX10-NEXT:    v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1388; GFX10-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
1389; GFX10-NEXT:    v_lshl_or_b32 v1, v6, 16, v2
1390; GFX10-NEXT:    s_setpc_b64 s[30:31]
1391  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1392  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1393  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
1394  ret <4 x half> %shuffle
1395}
1396
1397define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out)  {
1398; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
1399; GFX9:       ; %bb.0:
1400; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1401; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1402; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1403; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
1404; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1405; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1406; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1407; GFX9-NEXT:    v_mov_b32_e32 v2, s6
1408; GFX9-NEXT:    v_mov_b32_e32 v3, s7
1409; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
1410; GFX9-NEXT:    s_endpgm
1411;
1412; GFX10-LABEL: shuffle_scalar_load_v8i32_0123:
1413; GFX10:       ; %bb.0:
1414; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1415; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1416; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1417; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
1418; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1419; GFX10-NEXT:    v_mov_b32_e32 v0, s4
1420; GFX10-NEXT:    v_mov_b32_e32 v1, s5
1421; GFX10-NEXT:    v_mov_b32_e32 v2, s6
1422; GFX10-NEXT:    v_mov_b32_e32 v3, s7
1423; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
1424; GFX10-NEXT:    s_endpgm
1425  %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16
1426  %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1427  store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8
1428  ret void
1429}
1430
1431declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
1432declare i32 @llvm.amdgcn.workitem.id.x() #0
1433
1434attributes #0 = { nounwind readnone speculatable }
1435