1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3
; Mask <2,3,undef,undef>: lanes 0-1 take %val0 elements 2-3; lanes 2-3 are undef.
; The assertions expect a single global_load_dword at offset:4 and no extra moves.
4define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
5; GFX9-LABEL: shuffle_v4f16_23uu:
6; GFX9:       ; %bb.0:
7; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
9; GFX9-NEXT:    s_waitcnt vmcnt(0)
10; GFX9-NEXT:    s_setpc_b64 s[30:31]
11  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
12  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
13  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
14  ret <4 x half> %shuffle
15}
16
; Mask <2,3,4,undef>: lanes 0-1 are %val0 elements 2-3, lane 2 is %val1 element 0, lane 3 is undef.
; The assertions expect a dwordx2 load through v[2:3], a dword load at offset:4 through v[0:1], and two v_mov copies.
17define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
18; GFX9-LABEL: shuffle_v4f16_234u:
19; GFX9:       ; %bb.0:
20; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
22; GFX9-NEXT:    s_waitcnt vmcnt(0)
23; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
24; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
25; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
26; GFX9-NEXT:    v_mov_b32_e32 v1, v4
27; GFX9-NEXT:    s_waitcnt vmcnt(0)
28; GFX9-NEXT:    v_mov_b32_e32 v0, v5
29; GFX9-NEXT:    s_setpc_b64 s[30:31]
30  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
31  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
32  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
33  ret <4 x half> %shuffle
34}
35
; Mask <undef,1,undef,3>: the defined lanes 1 and 3 keep %val0 elements 1 and 3 in place.
; The assertions expect a plain dwordx2 load with no shuffling instructions.
36define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
37; GFX9-LABEL: shuffle_v4f16_u1u3:
38; GFX9:       ; %bb.0:
39; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
41; GFX9-NEXT:    s_waitcnt vmcnt(0)
42; GFX9-NEXT:    s_setpc_b64 s[30:31]
43  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
44  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
45  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
46  ret <4 x half> %shuffle
47}
48
; Mask <undef,3,undef,1>: lane 1 takes %val0 element 3 and lane 3 takes %val0 element 1.
; The assertions expect a dwordx2 load into v[1:2] and one v_mov that places v2 in v0.
49define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
50; GFX9-LABEL: shuffle_v4f16_u3u1:
51; GFX9:       ; %bb.0:
52; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
53; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
54; GFX9-NEXT:    s_waitcnt vmcnt(0)
55; GFX9-NEXT:    v_mov_b32_e32 v0, v2
56; GFX9-NEXT:    s_setpc_b64 s[30:31]
57  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
58  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
59  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
60  ret <4 x half> %shuffle
61}
62
; Mask <undef,3,undef,undef>: only lane 1 is defined, taking %val0 element 3.
; The assertions expect a single global_load_dword at offset:4.
63define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
64; GFX9-LABEL: shuffle_v4f16_u3uu:
65; GFX9:       ; %bb.0:
66; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
68; GFX9-NEXT:    s_waitcnt vmcnt(0)
69; GFX9-NEXT:    s_setpc_b64 s[30:31]
70  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
71  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
72  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
73  ret <4 x half> %shuffle
74}
75
; Mask <3,undef,6,undef>: lane 0 is %val0 element 3 and lane 2 is %val1 element 2.
; The assertions expect two dword loads at offset:4, a 16-bit right shift of the first, and a v_mov of the second.
76define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
77; GFX9-LABEL: shuffle_v4f16_3u6u:
78; GFX9:       ; %bb.0:
79; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
81; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
82; GFX9-NEXT:    s_waitcnt vmcnt(1)
83; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
84; GFX9-NEXT:    s_waitcnt vmcnt(0)
85; GFX9-NEXT:    v_mov_b32_e32 v1, v4
86; GFX9-NEXT:    s_setpc_b64 s[30:31]
87  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
88  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
89  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
90  ret <4 x half> %shuffle
91}
92
; Mask <3,undef,undef,7>: lane 0 is %val0 element 3 and lane 3 is %val1 element 3.
; The assertions expect two dword loads at offset:4, a 16-bit right shift of the first, and a v_mov of the second.
93define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
94; GFX9-LABEL: shuffle_v4f16_3uu7:
95; GFX9:       ; %bb.0:
96; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
98; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
99; GFX9-NEXT:    s_waitcnt vmcnt(1)
100; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
101; GFX9-NEXT:    s_waitcnt vmcnt(0)
102; GFX9-NEXT:    v_mov_b32_e32 v1, v4
103; GFX9-NEXT:    s_setpc_b64 s[30:31]
104  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
105  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
106  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
107  ret <4 x half> %shuffle
108}
109
; Mask <3,5,undef,5>: lane 0 is %val0 element 3; lanes 1 and 3 are %val1 element 1; lane 2 is undef.
; The assertions expect v0 to be packed with v_and_b32_sdwa + v_lshl_or_b32 and v1 to be a copy of the dword loaded through v[2:3].
110define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
111; GFX9-LABEL: shuffle_v4f16_35u5:
112; GFX9:       ; %bb.0:
113; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114; GFX9-NEXT:    global_load_dword v4, v[2:3], off
115; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
116; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
117; GFX9-NEXT:    s_waitcnt vmcnt(1)
118; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
119; GFX9-NEXT:    s_waitcnt vmcnt(0)
120; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
121; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
122; GFX9-NEXT:    v_mov_b32_e32 v1, v4
123; GFX9-NEXT:    s_setpc_b64 s[30:31]
124  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
125  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
126  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
127  ret <4 x half> %shuffle
128}
129
; Mask <3,5,7,undef>: lane 0 is %val0 element 3; lanes 1-2 are %val1 elements 1 and 3; lane 3 is undef.
; The assertions expect v_lshrrev_b32 on both dwords of the dwordx2 load, with v_and_b32_sdwa + v_lshl_or_b32 packing v0.
130define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
131; GFX9-LABEL: shuffle_v4f16_357u:
132; GFX9:       ; %bb.0:
133; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
135; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
136; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
137; GFX9-NEXT:    s_waitcnt vmcnt(1)
138; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
139; GFX9-NEXT:    s_waitcnt vmcnt(0)
140; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
141; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
142; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
143; GFX9-NEXT:    s_setpc_b64 s[30:31]
144  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
145  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
146  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
147  ret <4 x half> %shuffle
148}
149
; Mask <0,1,0,1>: both result halves repeat %val0 elements 0-1.
; The assertions expect one dword load whose result is duplicated into v1 with a v_mov.
150define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
151; GFX9-LABEL: shuffle_v4f16_0101:
152; GFX9:       ; %bb.0:
153; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154; GFX9-NEXT:    global_load_dword v0, v[0:1], off
155; GFX9-NEXT:    s_waitcnt vmcnt(0)
156; GFX9-NEXT:    v_mov_b32_e32 v1, v0
157; GFX9-NEXT:    s_setpc_b64 s[30:31]
158  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
159  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
160  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
161  ret <4 x half> %shuffle
162}
163
; Mask <0,1,2,3>: identity shuffle of %val0.
; The assertions expect only a dwordx2 load, with no shuffling instructions.
164define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
165; GFX9-LABEL: shuffle_v4f16_0123:
166; GFX9:       ; %bb.0:
167; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
169; GFX9-NEXT:    s_waitcnt vmcnt(0)
170; GFX9-NEXT:    s_setpc_b64 s[30:31]
171  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
172  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
173  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
174  ret <4 x half> %shuffle
175}
176
; Mask <0,1,4,5>: lanes 0-1 are %val0 elements 0-1; lanes 2-3 are %val1 elements 0-1.
; The assertions expect one dword load through each pointer (no offset) and two v_mov copies.
177define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
178; GFX9-LABEL: shuffle_v4f16_0145:
179; GFX9:       ; %bb.0:
180; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
181; GFX9-NEXT:    global_load_dword v4, v[0:1], off
182; GFX9-NEXT:    global_load_dword v5, v[2:3], off
183; GFX9-NEXT:    s_waitcnt vmcnt(1)
184; GFX9-NEXT:    v_mov_b32_e32 v0, v4
185; GFX9-NEXT:    s_waitcnt vmcnt(0)
186; GFX9-NEXT:    v_mov_b32_e32 v1, v5
187; GFX9-NEXT:    s_setpc_b64 s[30:31]
188  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
189  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
190  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
191  ret <4 x half> %shuffle
192}
193
; Mask <0,1,6,7>: lanes 0-1 are %val0 elements 0-1; lanes 2-3 are %val1 elements 2-3.
; The assertions expect a dword load through v[0:1], a dword load at offset:4 through v[2:3], and two v_mov copies.
194define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
195; GFX9-LABEL: shuffle_v4f16_0167:
196; GFX9:       ; %bb.0:
197; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
198; GFX9-NEXT:    global_load_dword v4, v[0:1], off
199; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
200; GFX9-NEXT:    s_waitcnt vmcnt(1)
201; GFX9-NEXT:    v_mov_b32_e32 v0, v4
202; GFX9-NEXT:    s_waitcnt vmcnt(0)
203; GFX9-NEXT:    v_mov_b32_e32 v1, v5
204; GFX9-NEXT:    s_setpc_b64 s[30:31]
205  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
206  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
207  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
208  ret <4 x half> %shuffle
209}
210
; Mask <2,3,0,1>: swaps the two element pairs of %val0.
; The assertions expect a dwordx2 load into v[1:2] and one v_mov that places v2 in v0.
211define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
212; GFX9-LABEL: shuffle_v4f16_2301:
213; GFX9:       ; %bb.0:
214; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
215; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
216; GFX9-NEXT:    s_waitcnt vmcnt(0)
217; GFX9-NEXT:    v_mov_b32_e32 v0, v2
218; GFX9-NEXT:    s_setpc_b64 s[30:31]
219  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
220  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
221  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
222  ret <4 x half> %shuffle
223}
224
; Mask <2,3,2,3>: both result halves repeat %val0 elements 2-3.
; The assertions expect one dword load at offset:4 whose result is duplicated into v1.
225define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
226; GFX9-LABEL: shuffle_v4f16_2323:
227; GFX9:       ; %bb.0:
228; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
230; GFX9-NEXT:    s_waitcnt vmcnt(0)
231; GFX9-NEXT:    v_mov_b32_e32 v1, v0
232; GFX9-NEXT:    s_setpc_b64 s[30:31]
233  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
234  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
235  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
236  ret <4 x half> %shuffle
237}
238
; Mask <2,3,4,5>: lanes 0-1 are %val0 elements 2-3; lanes 2-3 are %val1 elements 0-1.
; The assertions expect a dword load at offset:4 through v[0:1], a dword load through v[2:3], and two v_mov copies.
239define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
240; GFX9-LABEL: shuffle_v4f16_2345:
241; GFX9:       ; %bb.0:
242; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
244; GFX9-NEXT:    global_load_dword v5, v[2:3], off
245; GFX9-NEXT:    s_waitcnt vmcnt(1)
246; GFX9-NEXT:    v_mov_b32_e32 v0, v4
247; GFX9-NEXT:    s_waitcnt vmcnt(0)
248; GFX9-NEXT:    v_mov_b32_e32 v1, v5
249; GFX9-NEXT:    s_setpc_b64 s[30:31]
250  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
251  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
252  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
253  ret <4 x half> %shuffle
254}
255
; Mask <2,3,6,7>: lanes 0-1 are %val0 elements 2-3; lanes 2-3 are %val1 elements 2-3.
; The assertions expect a dword load at offset:4 through each pointer and two v_mov copies.
256define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
257; GFX9-LABEL: shuffle_v4f16_2367:
258; GFX9:       ; %bb.0:
259; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
260; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
261; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
262; GFX9-NEXT:    s_waitcnt vmcnt(1)
263; GFX9-NEXT:    v_mov_b32_e32 v0, v4
264; GFX9-NEXT:    s_waitcnt vmcnt(0)
265; GFX9-NEXT:    v_mov_b32_e32 v1, v5
266; GFX9-NEXT:    s_setpc_b64 s[30:31]
267  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
268  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
269  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
270  ret <4 x half> %shuffle
271}
272
; Mask <4,5,0,1>: lanes 0-1 are %val1 elements 0-1; lanes 2-3 are %val0 elements 0-1.
; The assertions expect a dword load through v[2:3], then one through v[0:1] (no offsets), and two v_mov copies.
273define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
274; GFX9-LABEL: shuffle_v4f16_4501:
275; GFX9:       ; %bb.0:
276; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
277; GFX9-NEXT:    global_load_dword v4, v[2:3], off
278; GFX9-NEXT:    global_load_dword v5, v[0:1], off
279; GFX9-NEXT:    s_waitcnt vmcnt(1)
280; GFX9-NEXT:    v_mov_b32_e32 v0, v4
281; GFX9-NEXT:    s_waitcnt vmcnt(0)
282; GFX9-NEXT:    v_mov_b32_e32 v1, v5
283; GFX9-NEXT:    s_setpc_b64 s[30:31]
284  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
285  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
286  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
287  ret <4 x half> %shuffle
288}
289
; Mask <4,5,2,3>: lanes 0-1 are %val1 elements 0-1; lanes 2-3 are %val0 elements 2-3.
; The assertions expect a dword load through v[2:3], a dword load at offset:4 through v[0:1], and two v_mov copies.
290define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
291; GFX9-LABEL: shuffle_v4f16_4523:
292; GFX9:       ; %bb.0:
293; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
294; GFX9-NEXT:    global_load_dword v4, v[2:3], off
295; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
296; GFX9-NEXT:    s_waitcnt vmcnt(1)
297; GFX9-NEXT:    v_mov_b32_e32 v0, v4
298; GFX9-NEXT:    s_waitcnt vmcnt(0)
299; GFX9-NEXT:    v_mov_b32_e32 v1, v5
300; GFX9-NEXT:    s_setpc_b64 s[30:31]
301  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
302  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
303  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
304  ret <4 x half> %shuffle
305}
306
; Mask <4,5,4,5>: both result halves repeat %val1 elements 0-1.
; The assertions expect one dword load through v[2:3] whose result is duplicated into v1.
307define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
308; GFX9-LABEL: shuffle_v4f16_4545:
309; GFX9:       ; %bb.0:
310; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
311; GFX9-NEXT:    global_load_dword v0, v[2:3], off
312; GFX9-NEXT:    s_waitcnt vmcnt(0)
313; GFX9-NEXT:    v_mov_b32_e32 v1, v0
314; GFX9-NEXT:    s_setpc_b64 s[30:31]
315  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
316  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
317  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
318  ret <4 x half> %shuffle
319}
320
; Mask <4,5,6,7>: selects all of %val1 unchanged.
; The assertions expect only a dwordx2 load through v[2:3] directly into v[0:1].
321define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
322; GFX9-LABEL: shuffle_v4f16_4567:
323; GFX9:       ; %bb.0:
324; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
326; GFX9-NEXT:    s_waitcnt vmcnt(0)
327; GFX9-NEXT:    s_setpc_b64 s[30:31]
328  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
329  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
330  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
331  ret <4 x half> %shuffle
332}
333
; Mask <6,7,0,1>: lanes 0-1 are %val1 elements 2-3; lanes 2-3 are %val0 elements 0-1.
; The assertions expect a dword load at offset:4 through v[2:3], a dword load through v[0:1], and two v_mov copies.
334define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
335; GFX9-LABEL: shuffle_v4f16_6701:
336; GFX9:       ; %bb.0:
337; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
338; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
339; GFX9-NEXT:    global_load_dword v5, v[0:1], off
340; GFX9-NEXT:    s_waitcnt vmcnt(1)
341; GFX9-NEXT:    v_mov_b32_e32 v0, v4
342; GFX9-NEXT:    s_waitcnt vmcnt(0)
343; GFX9-NEXT:    v_mov_b32_e32 v1, v5
344; GFX9-NEXT:    s_setpc_b64 s[30:31]
345  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
346  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
347  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
348  ret <4 x half> %shuffle
349}
350
; Mask <6,7,2,3>: lanes 0-1 are %val1 elements 2-3; lanes 2-3 are %val0 elements 2-3.
; The assertions expect a dword load at offset:4 through v[2:3], another through v[0:1], and two v_mov copies.
351define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
352; GFX9-LABEL: shuffle_v4f16_6723:
353; GFX9:       ; %bb.0:
354; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
355; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
356; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
357; GFX9-NEXT:    s_waitcnt vmcnt(1)
358; GFX9-NEXT:    v_mov_b32_e32 v0, v4
359; GFX9-NEXT:    s_waitcnt vmcnt(0)
360; GFX9-NEXT:    v_mov_b32_e32 v1, v5
361; GFX9-NEXT:    s_setpc_b64 s[30:31]
362  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
363  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
364  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
365  ret <4 x half> %shuffle
366}
367
; Mask <6,7,4,5>: swaps the two element pairs of %val1.
; The assertions expect a dwordx2 load through v[2:3] into v[1:2] and one v_mov that places v2 in v0.
368define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
369; GFX9-LABEL: shuffle_v4f16_6745:
370; GFX9:       ; %bb.0:
371; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
372; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
373; GFX9-NEXT:    s_waitcnt vmcnt(0)
374; GFX9-NEXT:    v_mov_b32_e32 v0, v2
375; GFX9-NEXT:    s_setpc_b64 s[30:31]
376  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
377  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
378  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
379  ret <4 x half> %shuffle
380}
381
; Mask <6,7,6,7>: both result halves repeat %val1 elements 2-3.
; The assertions expect one dword load at offset:4 through v[2:3] whose result is duplicated into v1.
382define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
383; GFX9-LABEL: shuffle_v4f16_6767:
384; GFX9:       ; %bb.0:
385; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
386; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
387; GFX9-NEXT:    s_waitcnt vmcnt(0)
388; GFX9-NEXT:    v_mov_b32_e32 v1, v0
389; GFX9-NEXT:    s_setpc_b64 s[30:31]
390  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
391  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
392  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
393  ret <4 x half> %shuffle
394}
395
; Mask <2,3,5,6>: lanes 0-1 are %val0 elements 2-3; lanes 2-3 are %val1 elements 1-2.
; The assertions expect v1 to be built from v5 and v6 with v_and_b32_sdwa + v_lshl_or_b32, and v0 to be a v_mov of the dword loaded at offset:4.
396define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
397; GFX9-LABEL: shuffle_v4f16_2356:
398; GFX9:       ; %bb.0:
399; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
400; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
401; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
402; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
403; GFX9-NEXT:    s_waitcnt vmcnt(1)
404; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
405; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
406; GFX9-NEXT:    s_waitcnt vmcnt(0)
407; GFX9-NEXT:    v_mov_b32_e32 v0, v4
408; GFX9-NEXT:    s_setpc_b64 s[30:31]
409  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
410  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
411  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
412  ret <4 x half> %shuffle
413}
414
; Mask <5,6,2,3>: lanes 0-1 are %val1 elements 1-2; lanes 2-3 are %val0 elements 2-3.
; The assertions expect v0 to be built from v5 and v6 with v_and_b32_sdwa + v_lshl_or_b32, and v1 to be a v_mov of the dword loaded at offset:4.
415define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
416; GFX9-LABEL: shuffle_v4f16_5623:
417; GFX9:       ; %bb.0:
418; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
420; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
421; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
422; GFX9-NEXT:    s_waitcnt vmcnt(1)
423; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
424; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
425; GFX9-NEXT:    s_waitcnt vmcnt(0)
426; GFX9-NEXT:    v_mov_b32_e32 v1, v4
427; GFX9-NEXT:    s_setpc_b64 s[30:31]
428  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
429  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
430  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
431  ret <4 x half> %shuffle
432}
433
; Mask <3,4,5,6>: lane 0 is %val0 element 3; lanes 1-3 are %val1 elements 0-2.
; The assertions expect two v_and_b32_sdwa (WORD_1 source selects) and two v_lshl_or_b32 to build both result dwords.
434define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
435; GFX9-LABEL: shuffle_v4f16_3456:
436; GFX9:       ; %bb.0:
437; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
438; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
439; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
440; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
441; GFX9-NEXT:    s_waitcnt vmcnt(1)
442; GFX9-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
443; GFX9-NEXT:    s_waitcnt vmcnt(0)
444; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
445; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v1
446; GFX9-NEXT:    v_lshl_or_b32 v1, v5, 16, v2
447; GFX9-NEXT:    s_setpc_b64 s[30:31]
448  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
449  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
450  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
451  ret <4 x half> %shuffle
452}
453
; Mask <5,6,3,4>: lanes 0-1 are %val1 elements 1-2; lane 2 is %val0 element 3; lane 3 is %val1 element 0.
; The assertions expect a single s_waitcnt vmcnt(0) after both loads, then two v_and_b32_sdwa and two v_lshl_or_b32.
454define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
455; GFX9-LABEL: shuffle_v4f16_5634:
456; GFX9:       ; %bb.0:
457; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
459; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
460; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
461; GFX9-NEXT:    s_waitcnt vmcnt(0)
462; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
463; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
464; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
465; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
466; GFX9-NEXT:    s_setpc_b64 s[30:31]
467  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
468  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
469  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
470  ret <4 x half> %shuffle
471}
472
; Mask <5,7,3,4>: lanes 0-1 are %val1 elements 1 and 3; lane 2 is %val0 element 3; lane 3 is %val1 element 0.
; The assertions expect a v_lshrrev_b32 of v5 plus two v_and_b32_sdwa and two v_lshl_or_b32 to build both result dwords.
473define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
474; GFX9-LABEL: shuffle_v4f16_5734:
475; GFX9:       ; %bb.0:
476; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
477; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
478; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
479; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
480; GFX9-NEXT:    s_waitcnt vmcnt(1)
481; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
482; GFX9-NEXT:    s_waitcnt vmcnt(0)
483; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
484; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
485; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
486; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
487; GFX9-NEXT:    s_setpc_b64 s[30:31]
488  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
489  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
490  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
491  ret <4 x half> %shuffle
492}
493
; <4 x i16> case with mask <2,3,5,6>: lanes 0-1 are %val0 elements 2-3; lanes 2-3 are %val1 elements 1-2.
; The assertions expect v1 to be built with v_and_b32_sdwa + v_lshl_or_b32 and v0 to be a v_mov of the dword loaded at offset:4.
494define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
495; GFX9-LABEL: shuffle_v4i16_2356:
496; GFX9:       ; %bb.0:
497; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
498; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
499; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
500; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
501; GFX9-NEXT:    s_waitcnt vmcnt(1)
502; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
503; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v0
504; GFX9-NEXT:    s_waitcnt vmcnt(0)
505; GFX9-NEXT:    v_mov_b32_e32 v0, v4
506; GFX9-NEXT:    s_setpc_b64 s[30:31]
507  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
508  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
509  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
510  ret <4 x i16> %shuffle
511}
512
; <4 x i16> case with mask <0,1,6,7>: lanes 0-1 are %val0 elements 0-1; lanes 2-3 are %val1 elements 2-3.
; The assertions expect a dword load through v[0:1], a dword load at offset:4 through v[2:3], and two v_mov copies.
513define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
514; GFX9-LABEL: shuffle_v4i16_0167:
515; GFX9:       ; %bb.0:
516; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
517; GFX9-NEXT:    global_load_dword v4, v[0:1], off
518; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
519; GFX9-NEXT:    s_waitcnt vmcnt(1)
520; GFX9-NEXT:    v_mov_b32_e32 v0, v4
521; GFX9-NEXT:    s_waitcnt vmcnt(0)
522; GFX9-NEXT:    v_mov_b32_e32 v1, v5
523; GFX9-NEXT:    s_setpc_b64 s[30:31]
524  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
525  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
526  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
527  ret <4 x i16> %shuffle
528}
529
; Mask zeroinitializer (<0,0,0,0>): every result lane is %val0 element 0.
; The assertions expect a dwordx2 load, v_and_b32 with 0xffff + v_lshl_or_b32 to build the dword, and a v_mov to duplicate it into v1.
530define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
531; GFX9-LABEL: shuffle_v4f16_0000:
532; GFX9:       ; %bb.0:
533; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
535; GFX9-NEXT:    s_waitcnt vmcnt(0)
536; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
537; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
538; GFX9-NEXT:    v_mov_b32_e32 v1, v0
539; GFX9-NEXT:    s_setpc_b64 s[30:31]
540  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
541  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
542  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
543  ret <4 x half> %shuffle
544}
545
; Mask <1,0,1,0>: result lanes alternate %val0 element 1 and element 0.
; The assertions expect v_and_b32_sdwa (WORD_1 source select) + v_lshl_or_b32 to build the dword, and a v_mov to duplicate it into v1.
546define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
547; GFX9-LABEL: shuffle_v4f16_1010:
548; GFX9:       ; %bb.0:
549; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
550; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
551; GFX9-NEXT:    s_waitcnt vmcnt(0)
552; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
553; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
554; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
555; GFX9-NEXT:    v_mov_b32_e32 v1, v0
556; GFX9-NEXT:    s_setpc_b64 s[30:31]
557  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
558  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
559  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
560  ret <4 x half> %shuffle
561}
562
; Mask <1,1,0,0>: lanes 0-1 are both %val0 element 1; lanes 2-3 are both %val0 element 0.
; The assertions expect v1 to be built from v0 directly and v0 from its 16-bit right shift, each via v_and_b32 + v_lshl_or_b32.
563define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
564; GFX9-LABEL: shuffle_v4f16_1100:
565; GFX9:       ; %bb.0:
566; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
567; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
568; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
569; GFX9-NEXT:    s_waitcnt vmcnt(0)
570; GFX9-NEXT:    v_and_b32_e32 v1, v2, v0
571; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
572; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
573; GFX9-NEXT:    v_and_b32_e32 v0, v2, v3
574; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
575; GFX9-NEXT:    s_setpc_b64 s[30:31]
576  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
577  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
578  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
579  ret <4 x half> %shuffle
580}
581
; Mask <6,1,6,1>: result lanes alternate %val1 element 2 and %val0 element 1.
; The assertions expect a dword load at offset:4 through v[2:3], a dword load through v[0:1], v_and_b32 / v_lshrrev_b32 / v_lshl_or_b32 to pack v0, and a v_mov to duplicate it into v1.
582define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
583; GFX9-LABEL: shuffle_v4f16_6161:
584; GFX9:       ; %bb.0:
585; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
586; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
587; GFX9-NEXT:    global_load_dword v5, v[0:1], off
588; GFX9-NEXT:    s_waitcnt vmcnt(1)
589; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v4
590; GFX9-NEXT:    s_waitcnt vmcnt(0)
591; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
592; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
593; GFX9-NEXT:    v_mov_b32_e32 v1, v0
594; GFX9-NEXT:    s_setpc_b64 s[30:31]
595  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
596  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
597  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
598  ret <4 x half> %shuffle
599}
600
; Mask <2,3,3,3>: lanes 0-1 are %val0 elements 2-3; lanes 2-3 both repeat %val0 element 3.
; The assertions expect the dword loaded at offset:4 to stay in v0, with v1 built by v_lshrrev_b32 + v_and_b32 + v_lshl_or_b32.
601define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
602; GFX9-LABEL: shuffle_v4f16_2333:
603; GFX9:       ; %bb.0:
604; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
605; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
606; GFX9-NEXT:    s_waitcnt vmcnt(0)
607; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
608; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
609; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
610; GFX9-NEXT:    s_setpc_b64 s[30:31]
611  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
612  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
613  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
614  ret <4 x half> %shuffle
615}
616
; NOTE(review): the function name implies mask <6,6,6,7> (all lanes from %val1), but the
; body below uses mask <2,3,3,3>, the same mask as shuffle_v4f16_2333, so the 6667 case is
; not actually exercised. TODO: change the mask to <i32 6, i32 6, i32 6, i32 7> and
; regenerate the assertions with utils/update_llc_test_checks.py.
617define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
618; GFX9-LABEL: shuffle_v4f16_6667:
619; GFX9:       ; %bb.0:
620; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
621; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
622; GFX9-NEXT:    s_waitcnt vmcnt(0)
623; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
624; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
625; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
626; GFX9-NEXT:    s_setpc_b64 s[30:31]
627  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
628  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
629  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
630  ret <4 x half> %shuffle
631}
632
; <8 x half> inputs narrowed to a <4 x half> result with mask <0,1,0,1>: both halves repeat %val0 elements 0-1.
; The assertions expect one dword load whose result is duplicated into v1 with a v_mov.
633define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
634; GFX9-LABEL: shuffle_v8f16_0101:
635; GFX9:       ; %bb.0:
636; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
637; GFX9-NEXT:    global_load_dword v0, v[0:1], off
638; GFX9-NEXT:    s_waitcnt vmcnt(0)
639; GFX9-NEXT:    v_mov_b32_e32 v1, v0
640; GFX9-NEXT:    s_setpc_b64 s[30:31]
641  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
642  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
643  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
644  ret <4 x half> %shuffle
645}
646
; <8 x half> inputs narrowed to a <4 x half> result with mask <0,1,2,3>: the first four elements of %val0.
; The assertions expect only a dwordx2 load, with no shuffling instructions.
647define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
648; GFX9-LABEL: shuffle_v8f16_0123:
649; GFX9:       ; %bb.0:
650; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
651; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
652; GFX9-NEXT:    s_waitcnt vmcnt(0)
653; GFX9-NEXT:    s_setpc_b64 s[30:31]
654  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
655  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
656  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
657  ret <4 x half> %shuffle
658}
659
; Narrowing shuffle with mask <4, 5, 8, 9>: elements 4 and 5 of %val0 followed
; by elements 0 and 1 of %val1 (mask indices 8 and above select from the
; second operand). The expected code is one dword load per input (offset 8 and
; offset 0) followed by two register moves.
660define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
661; GFX9-LABEL: shuffle_v8f16_4589:
662; GFX9:       ; %bb.0:
663; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
664; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:8
665; GFX9-NEXT:    global_load_dword v5, v[2:3], off
666; GFX9-NEXT:    s_waitcnt vmcnt(1)
667; GFX9-NEXT:    v_mov_b32_e32 v0, v4
668; GFX9-NEXT:    s_waitcnt vmcnt(0)
669; GFX9-NEXT:    v_mov_b32_e32 v1, v5
670; GFX9-NEXT:    s_setpc_b64 s[30:31]
671  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
672  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
673  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
674  ret <4 x half> %shuffle
675}
676
; Narrowing shuffle with mask <10, 11, 2, 3>: elements 2 and 3 of %val1
; followed by elements 2 and 3 of %val0. Both pairs are dword-aligned, so the
; expected code is one dword load at offset 4 per input plus two register
; moves.
677define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
678; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
679; GFX9:       ; %bb.0:
680; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
681; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
682; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
683; GFX9-NEXT:    s_waitcnt vmcnt(1)
684; GFX9-NEXT:    v_mov_b32_e32 v0, v4
685; GFX9-NEXT:    s_waitcnt vmcnt(0)
686; GFX9-NEXT:    v_mov_b32_e32 v1, v5
687; GFX9-NEXT:    s_setpc_b64 s[30:31]
688  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
689  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
690  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
691  ret <4 x half> %shuffle
692}
693
; Narrowing shuffle with mask <13, 14, 2, 3>: elements 5 and 6 of %val1
; followed by elements 2 and 3 of %val0. Elements 5 and 6 sit in different
; dwords, so the expected code loads two dwords at offset 8 and repacks them
; (the high half of the first with the low half of the second) using an SDWA
; AND and v_lshl_or_b32; the %val0 pair is a plain dword load at offset 4.
694define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
695; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
696; GFX9:       ; %bb.0:
697; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
698; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
699; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
700; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
701; GFX9-NEXT:    s_waitcnt vmcnt(1)
702; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
703; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
704; GFX9-NEXT:    s_waitcnt vmcnt(0)
705; GFX9-NEXT:    v_mov_b32_e32 v1, v4
706; GFX9-NEXT:    s_setpc_b64 s[30:31]
707  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
708  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
709  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
710  ret <4 x half> %shuffle
711}
712
; Widening shuffle: two <3 x half> inputs, <4 x half> result, mask
; <0, 1, 2, 2> (all of %val0 with its last element duplicated). The expected
; code loads two dwords and rebuilds the second one so that both halves hold
; its low 16 bits.
713define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
714; GFX9-LABEL: shuffle_v3f16_0122:
715; GFX9:       ; %bb.0:
716; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
717; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
718; GFX9-NEXT:    s_waitcnt vmcnt(0)
719; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
720; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
721; GFX9-NEXT:    s_setpc_b64 s[30:31]
722  %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
723  %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
724  %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
725  ret <4 x half> %shuffle
726}
727
; Widening shuffle: two <2 x half> inputs, <4 x half> result, mask
; <0, 1, 1, 0> (%val0 followed by %val0 with its halves swapped). The expected
; code loads one dword and builds the second result dword by swapping its two
; 16-bit halves.
; NOTE(review): the function name suggests mask <0, 1, 2, 2>, but the mask
; below is <0, 1, 1, 0> and never selects from %val1. The checks match the
; mask as written; if the mask is changed, regenerate them with
; utils/update_llc_test_checks.py.
728define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
729; GFX9-LABEL: shuffle_v2f16_0122:
730; GFX9:       ; %bb.0:
731; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
732; GFX9-NEXT:    global_load_dword v0, v[0:1], off
733; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
734; GFX9-NEXT:    s_waitcnt vmcnt(0)
735; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
736; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
737; GFX9-NEXT:    s_setpc_b64 s[30:31]
738  %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
739  %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
740  %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
741  ret <4 x half> %shuffle
742}
743
; <6 x half> shuffle with mask <4, 5, 2, 3, 6, 7>: elements 4,5 and 2,3 of
; %val0 followed by elements 0,1 of %val1. Every selected pair is
; dword-aligned, so the expected code contains only a three-dword load, a
; one-dword load and whole-register moves (no 16-bit repacking).
744define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
745; GFX9-LABEL: shuffle_v6f16_452367:
746; GFX9:       ; %bb.0:
747; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
748; GFX9-NEXT:    v_mov_b32_e32 v6, v1
749; GFX9-NEXT:    v_mov_b32_e32 v5, v0
750; GFX9-NEXT:    v_mov_b32_e32 v4, v3
751; GFX9-NEXT:    v_mov_b32_e32 v3, v2
752; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
753; GFX9-NEXT:    global_load_dword v7, v[3:4], off
754; GFX9-NEXT:    s_waitcnt vmcnt(1)
755; GFX9-NEXT:    v_mov_b32_e32 v0, v2
756; GFX9-NEXT:    s_waitcnt vmcnt(0)
757; GFX9-NEXT:    v_mov_b32_e32 v2, v7
758; GFX9-NEXT:    s_setpc_b64 s[30:31]
759  %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
760  %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
761  %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
762  ret <6 x half> %shuffle
763}
764
; Kernel mixing shufflevector with llvm.fma.v2f16. Each work-item loads one
; <4 x half> from A, B and C (indexed by workitem.id.x) and computes
;   lo = fma(splat(A[1]), B[2..3], fma(splat(A[0]), B[0..1], C[0..1]))
;   hi = fma(splat(A[3]), B[2..3], fma(splat(A[2]), B[0..1], C[2..3]))
; then stores <lo, hi> back to the same C slot. The expected code has exactly
; four v_pk_fma_f16 instructions between the loads and the store; the splats
; and sub-vector extracts appear only as op_sel / op_sel_hi modifiers and
; register choices, not as separate shuffle instructions.
765define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C)  {
766; GFX9-LABEL: fma_shuffle:
767; GFX9:       ; %bb.0: ; %entry
768; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
769; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
770; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
771; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
772; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
773; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
774; GFX9-NEXT:    global_load_dwordx2 v[4:5], v6, s[8:9]
775; GFX9-NEXT:    s_waitcnt vmcnt(0)
776; GFX9-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
777; GFX9-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
778; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
779; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
780; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[8:9]
781; GFX9-NEXT:    s_endpgm
782entry:
  ; Per-work-item element addresses in A, B and C.
783  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
784  %tmp12 = zext i32 %tmp1 to i64
785  %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12
786  %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8
787  %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12
788  %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8
789  %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12
790  %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8
  ; Low half of the result: splat(A[0]) * B[0..1] + C[0..1], then
  ; splat(A[1]) * B[2..3] added on top.
791  %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
792  %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
793  %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
794  %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19)
795  %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1>
796  %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3>
797  %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20)
  ; Widen the low result to four lanes and refill lanes 2..3 with C[2..3].
798  %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
799  %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ; High half of the result: same pattern with A[2], A[3] and C[2..3]
  ; (%tmp27 extracts lanes 2..3 of %tmp25, i.e. C[2..3]).
800  %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2>
801  %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3>
802  %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27)
803  %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
804  %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
  ; Concatenate the low and high results and store them over the C element.
805  %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
806  %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
807  store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8
808  ret void
809}
810
; <4 x half> shuffle with mask <0, 4, 5, 6>: element 0 of %val0 followed by
; elements 0, 1 and 2 of %val1. No selected pair is dword-aligned in the
; result, so the expected code loads two dwords from each input and assembles
; both result dwords with 0xffff masking and v_lshl_or_b32.
811define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
812; GFX9-LABEL: shuffle_v4f16_0456:
813; GFX9:       ; %bb.0:
814; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
815; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
816; GFX9-NEXT:    s_waitcnt vmcnt(0)
817; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
818; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
819; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
820; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
821; GFX9-NEXT:    v_and_b32_e32 v1, v0, v4
822; GFX9-NEXT:    s_waitcnt vmcnt(0)
823; GFX9-NEXT:    v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
824; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v1
825; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v2
826; GFX9-NEXT:    s_setpc_b64 s[30:31]
827  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
828  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
829  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
830  ret <4 x half> %shuffle
831}
832
; Kernel that loads an <8 x i32> from an addrspace(4) pointer, keeps only
; elements 0..3 via an identity-prefix shuffle, and stores the <4 x i32>
; result to %out. The expected code reads only four dwords from %in
; (s_load_dwordx4 rather than an eight-dword load) before the store.
833define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out)  {
834; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
835; GFX9:       ; %bb.0:
836; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
837; GFX9-NEXT:    v_mov_b32_e32 v4, 0
838; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
839; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
840; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
841; GFX9-NEXT:    v_mov_b32_e32 v0, s4
842; GFX9-NEXT:    v_mov_b32_e32 v1, s5
843; GFX9-NEXT:    v_mov_b32_e32 v2, s6
844; GFX9-NEXT:    v_mov_b32_e32 v3, s7
845; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
846; GFX9-NEXT:    s_endpgm
847  %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16
848  %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
849  store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8
850  ret void
851}
852
; Intrinsics called by @fma_shuffle; both use attribute group #0 below.
853declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
854declare i32 @llvm.amdgcn.workitem.id.x() #0
855
856attributes #0 = { nounwind readnone speculatable }
857