; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,VI
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,GFX9
; NOTE(review): the r600 run reuses the GCN prefix and the tahiti (SI) run sits
; in the GFX89 group; both appear intentional so identical output can share one
; check block, but confirm no GCN-/GFX89-prefixed checks exist that only match
; the other targets.
; RUN: llc < %s -march=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,R600

; Funnel-shift-right intrinsic declarations exercised by the tests below.
declare i32 @llvm.fshr.i32(i32, i32, i32)
declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare i16 @llvm.fshr.i16(i16, i16, i16)
declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
declare i64 @llvm.fshr.i64(i64, i64, i64)
declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare i24 @llvm.fshr.i24(i24, i24, i24)
declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
; Scalar i32 fshr with a variable amount: GCN selects a single v_alignbit_b32;
; R600 selects BIT_ALIGN_INT. No zero-amount select is needed for i32.
define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
; SI-LABEL: fshr_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_alignbit_b32 v2, s0, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
entry:
  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
  store i32 %0, i32 addrspace(1)* %in
  ret void
}
75
; i32 fshr with a constant amount: the literal 7 is folded directly into the
; v_alignbit_b32 / BIT_ALIGN_INT shift operand.
define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; SI-LABEL: fshr_i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_alignbit_b32 v0, s0, v0, 7
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_alignbit_b32 v2, s0, v0, 7
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, 7
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
entry:
  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
  store i32 %0, i32 addrspace(1)* %in
  ret void
}
129
; <2 x i32> fshr with variable amounts: each lane masks the amount with 31,
; does an alignbit, and selects the unshifted second operand when the masked
; amount is 0 (cndmask / CNDE_INT after SETE_INT).
define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshr_v2i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    s_and_b32 s1, s1, 31
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    s_cmp_eq_u32 s1, 0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_and_b32 s0, s0, 31
; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT:    s_cmp_eq_u32 s0, 0
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    v_alignbit_b32 v2, s2, v0, v2
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v2i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    s_and_b32 s1, s1, 31
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_cmp_eq_u32 s1, 0
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_and_b32 s0, s0, 31
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
; VI-NEXT:    s_cmp_eq_u32 s0, 0
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_alignbit_b32 v2, s4, v0, v2
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v2i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    s_and_b32 s1, s1, 31
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_cmp_eq_u32 s1, 0
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_and_b32 s0, s0, 31
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
; GFX9-NEXT:    s_cmp_eq_u32 s0, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    v_alignbit_b32 v2, s4, v0, v2
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v2i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     AND_INT * T0.W, KC0[4].X, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PV.W,
; R600-NEXT:     SETE_INT * T0.W, PV.W, 0.0,
; R600-NEXT:     CNDE_INT T0.Y, PS, PV.W, KC0[3].Z,
; R600-NEXT:     AND_INT * T0.W, KC0[3].W, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT T1.W, KC0[2].W, KC0[3].Y, PV.W,
; R600-NEXT:     SETE_INT * T0.W, PV.W, 0.0,
; R600-NEXT:     CNDE_INT T0.X, PS, PV.W, KC0[3].Y,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
  ret void
}
233
; <2 x i32> fshr with constant amounts <7, 9>: both literals are folded into
; the per-lane alignbit/BIT_ALIGN_INT; no zero-amount select is emitted.
define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshr_v2i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_alignbit_b32 v1, s3, v0, 9
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 7
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v2i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v2i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v2, 7
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v2i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
  ret void
}
298
; <4 x i32> fshr with variable amounts: same per-lane pattern as fshr_v2i32
; (mask amount with 31, alignbit, select unshifted value when amount is 0),
; repeated for four lanes.
define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; SI-LABEL: fshr_v4i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x11
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x15
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s15
; SI-NEXT:    s_and_b32 s3, s3, 31
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    s_cmp_eq_u32 s3, 0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_and_b32 s2, s2, 31
; SI-NEXT:    v_alignbit_b32 v1, s11, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
; SI-NEXT:    s_cmp_eq_u32 s2, 0
; SI-NEXT:    v_mov_b32_e32 v0, s14
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_and_b32 s1, s1, 31
; SI-NEXT:    v_alignbit_b32 v1, s10, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
; SI-NEXT:    s_cmp_eq_u32 s1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s13
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_and_b32 s0, s0, 31
; SI-NEXT:    v_alignbit_b32 v1, s9, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT:    s_cmp_eq_u32 s0, 0
; SI-NEXT:    v_mov_b32_e32 v0, s12
; SI-NEXT:    v_mov_b32_e32 v4, s0
; SI-NEXT:    v_alignbit_b32 v4, s8, v0, v4
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v4i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    s_and_b32 s3, s3, 31
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    s_cmp_eq_u32 s3, 0
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_and_b32 s2, s2, 31
; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
; VI-NEXT:    s_cmp_eq_u32 s2, 0
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_and_b32 s1, s1, 31
; VI-NEXT:    v_alignbit_b32 v1, s6, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
; VI-NEXT:    s_cmp_eq_u32 s1, 0
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_and_b32 s0, s0, 31
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
; VI-NEXT:    s_cmp_eq_u32 s0, 0
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_alignbit_b32 v4, s4, v0, v4
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v4i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s11
; GFX9-NEXT:    s_and_b32 s3, s3, 31
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_cmp_eq_u32 s3, 0
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_and_b32 s2, s2, 31
; GFX9-NEXT:    v_alignbit_b32 v1, s7, v0, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
; GFX9-NEXT:    s_cmp_eq_u32 s2, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, s10
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_and_b32 s1, s1, 31
; GFX9-NEXT:    v_alignbit_b32 v1, s6, v0, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
; GFX9-NEXT:    s_cmp_eq_u32 s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, s9
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_and_b32 s0, s0, 31
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
; GFX9-NEXT:    s_cmp_eq_u32 s0, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, s8
; GFX9-NEXT:    v_mov_b32_e32 v4, s0
; GFX9-NEXT:    v_alignbit_b32 v4, s4, v0, v4
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT:    v_mov_b32_e32 v4, s12
; GFX9-NEXT:    v_mov_b32_e32 v5, s13
; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v4i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 20, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     AND_INT T0.W, KC0[5].Z, literal.x,
; R600-NEXT:     AND_INT * T1.W, KC0[6].X, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:     SETE_INT T0.Z, PS, 0.0,
; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[4].X, KC0[5].X, PS,
; R600-NEXT:     AND_INT * T2.W, KC0[5].W, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:     SETE_INT T1.Z, PV.W, 0.0,
; R600-NEXT:     BIT_ALIGN_INT * T2.W, KC0[3].W, KC0[4].W, PV.W,
; R600-NEXT:     CNDE_INT * T1.W, T0.Z, T1.W, KC0[5].X,
; R600-NEXT:     CNDE_INT T1.Z, T1.Z, T2.W, KC0[4].W,
; R600-NEXT:     BIT_ALIGN_INT T2.W, KC0[3].Z, KC0[4].Z, T0.W,
; R600-NEXT:     SETE_INT * T0.W, T0.W, 0.0,
; R600-NEXT:     CNDE_INT T1.Y, PS, PV.W, KC0[4].Z,
; R600-NEXT:     AND_INT * T0.W, KC0[5].Y, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT T2.W, KC0[3].Y, KC0[4].Y, PV.W,
; R600-NEXT:     SETE_INT * T0.W, PV.W, 0.0,
; R600-NEXT:     CNDE_INT T1.X, PS, PV.W, KC0[4].Y,
; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
  ret void
}
453
; <4 x i32> fshr with constant amounts <1, 7, 9, 33>: literals are folded into
; the per-lane alignbit; the out-of-range amount 33 is reduced modulo 32 to 1
; (visible in the lane-3 checks below).
define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
; SI-LABEL: fshr_v4i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 9
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v4i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v4, s8
; VI-NEXT:    v_mov_b32_e32 v5, s9
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 7
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v4i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s8
; GFX9-NEXT:    v_mov_b32_e32 v5, s9
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 9
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 7
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v4i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
  ret void
}
532
; Non-kernel (callable) i32 fshr: all GCN targets share one GFX89 check block
; (a single v_alignbit_b32). The R600 checks show only CF_END/PAD.
define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
; GFX89-LABEL: v_fshr_i32:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT:    v_alignbit_b32 v0, v0, v1, v2
; GFX89-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i32:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
  %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
  ret i32 %ret
}
547
; Callable <2 x i32> fshr: per lane, mask amount with 31, alignbit, then
; cndmask back to the unshifted value when the masked amount is 0.
define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
; GFX89-LABEL: v_fshr_v2i32:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT:    v_and_b32_e32 v4, 31, v4
; GFX89-NEXT:    v_alignbit_b32 v0, v0, v2, v4
; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
; GFX89-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX89-NEXT:    v_and_b32_e32 v2, 31, v5
; GFX89-NEXT:    v_alignbit_b32 v1, v1, v3, v2
; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GFX89-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX89-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i32:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
  %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
  ret <2 x i32> %ret
}
569
; Callable <3 x i32> fshr: same mask/alignbit/cndmask pattern as v_fshr_v2i32,
; three lanes.
define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
; GFX89-LABEL: v_fshr_v3i32:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT:    v_and_b32_e32 v6, 31, v6
; GFX89-NEXT:    v_alignbit_b32 v0, v0, v3, v6
; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
; GFX89-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX89-NEXT:    v_and_b32_e32 v3, 31, v7
; GFX89-NEXT:    v_alignbit_b32 v1, v1, v4, v3
; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX89-NEXT:    v_and_b32_e32 v3, 31, v8
; GFX89-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX89-NEXT:    v_alignbit_b32 v2, v2, v5, v3
; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX89-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX89-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v3i32:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
  %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
  ret <3 x i32> %ret
}
595
; Callable <4 x i32> fshr: same mask/alignbit/cndmask pattern as v_fshr_v2i32,
; four lanes.
define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
; GFX89-LABEL: v_fshr_v4i32:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT:    v_and_b32_e32 v8, 31, v8
; GFX89-NEXT:    v_alignbit_b32 v0, v0, v4, v8
; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
; GFX89-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX89-NEXT:    v_and_b32_e32 v4, 31, v9
; GFX89-NEXT:    v_alignbit_b32 v1, v1, v5, v4
; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
; GFX89-NEXT:    v_and_b32_e32 v4, 31, v10
; GFX89-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX89-NEXT:    v_alignbit_b32 v2, v2, v6, v4
; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
; GFX89-NEXT:    v_and_b32_e32 v4, 31, v11
; GFX89-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
; GFX89-NEXT:    v_alignbit_b32 v3, v3, v7, v4
; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
; GFX89-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX89-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v4i32:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
  %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
  ret <4 x i32> %ret
}
625
; Callable i16 fshr: no 16-bit alignbit exists, so the amount is masked with
; 15 and expanded to lshr/shl/or, with a compare-and-select back to src1 when
; the masked amount is 0. VI/GFX9 use native 16-bit shift instructions.
define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
; SI-LABEL: v_fshr_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v2, 15, v2
; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
; SI-NEXT:    v_sub_i32_e32 v4, vcc, 16, v2
; SI-NEXT:    v_lshr_b32_e32 v3, v3, v2
; SI-NEXT:    v_lshl_b32_e32 v0, v0, v4
; SI-NEXT:    v_or_b32_e32 v0, v0, v3
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v2, 15, v2
; VI-NEXT:    v_sub_u16_e32 v4, 16, v2
; VI-NEXT:    v_lshrrev_b16_e32 v3, v2, v1
; VI-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v3
; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
; GFX9-NEXT:    v_sub_u16_e32 v4, 16, v2
; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v2, v1
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v3
; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i16:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
  %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
  ret i16 %ret
}
671
; Callable <2 x i16> fshr: SI expands both halves to 32-bit shift/or/select
; and repacks; VI uses 16-bit SDWA shifts; GFX9 uses packed v_pk_* shifts with
; per-half zero-amount selects.
define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
; SI-LABEL: v_fshr_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, 0xffff
; SI-NEXT:    v_and_b32_e32 v5, 15, v5
; SI-NEXT:    v_and_b32_e32 v7, s4, v3
; SI-NEXT:    v_sub_i32_e32 v8, vcc, 16, v5
; SI-NEXT:    v_lshr_b32_e32 v7, v7, v5
; SI-NEXT:    v_lshl_b32_e32 v1, v1, v8
; SI-NEXT:    v_or_b32_e32 v1, v1, v7
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
; SI-NEXT:    v_and_b32_e32 v3, 15, v4
; SI-NEXT:    v_sub_i32_e32 v5, vcc, 16, v3
; SI-NEXT:    v_and_b32_e32 v6, s4, v2
; SI-NEXT:    v_lshr_b32_e32 v4, v6, v3
; SI-NEXT:    v_lshl_b32_e32 v0, v0, v5
; SI-NEXT:    v_or_b32_e32 v0, v0, v4
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v3, 0xf000f, v2
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    v_bfe_u32 v2, v2, 16, 4
; VI-NEXT:    v_lshrrev_b16_e32 v4, v3, v1
; VI-NEXT:    v_lshrrev_b16_sdwa v6, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v4, v4, v6
; VI-NEXT:    v_sub_u16_e32 v6, 16, v2
; VI-NEXT:    v_sub_u16_e32 v7, 16, v3
; VI-NEXT:    v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_e32 v0, v7, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v6
; VI-NEXT:    v_or_b32_e32 v0, v0, v4
; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
; GFX9-NEXT:    v_pk_sub_i16 v4, 16, v2 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v4, v0
; GFX9-NEXT:    v_pk_lshrrev_b16 v3, v2, v1
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v3
; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v2, v4 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i16:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
  %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
  ret <2 x i16> %ret
}
749
750define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
751; SI-LABEL: v_fshr_v3i16:
752; SI:       ; %bb.0:
753; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
754; SI-NEXT:    s_mov_b32 s4, 0xffff
755; SI-NEXT:    v_and_b32_e32 v7, 15, v7
756; SI-NEXT:    v_and_b32_e32 v12, s4, v4
757; SI-NEXT:    v_sub_i32_e32 v13, vcc, 16, v7
758; SI-NEXT:    v_lshr_b32_e32 v12, v12, v7
759; SI-NEXT:    v_lshl_b32_e32 v1, v1, v13
760; SI-NEXT:    v_or_b32_e32 v1, v1, v12
761; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
762; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
763; SI-NEXT:    v_and_b32_e32 v4, 15, v6
764; SI-NEXT:    v_sub_i32_e32 v7, vcc, 16, v4
765; SI-NEXT:    v_and_b32_e32 v11, s4, v3
766; SI-NEXT:    v_lshr_b32_e32 v6, v11, v4
767; SI-NEXT:    v_lshl_b32_e32 v0, v0, v7
768; SI-NEXT:    v_or_b32_e32 v0, v0, v6
769; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
770; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
771; SI-NEXT:    v_and_b32_e32 v3, 15, v8
772; SI-NEXT:    v_sub_i32_e32 v6, vcc, 16, v3
773; SI-NEXT:    v_and_b32_e32 v10, s4, v5
774; SI-NEXT:    v_lshr_b32_e32 v4, v10, v3
775; SI-NEXT:    v_lshl_b32_e32 v2, v2, v6
776; SI-NEXT:    v_mov_b32_e32 v9, 0xffff
777; SI-NEXT:    v_or_b32_e32 v2, v2, v4
778; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
779; SI-NEXT:    v_cndmask_b32_e32 v3, v2, v5, vcc
780; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
781; SI-NEXT:    v_and_b32_e32 v0, v9, v0
782; SI-NEXT:    v_or_b32_e32 v0, v0, v1
783; SI-NEXT:    v_and_b32_e32 v2, v9, v3
784; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
785; SI-NEXT:    s_setpc_b64 s[30:31]
786;
787; VI-LABEL: v_fshr_v3i16:
788; VI:       ; %bb.0:
789; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
790; VI-NEXT:    v_mov_b32_e32 v6, 15
791; VI-NEXT:    v_and_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
792; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
793; VI-NEXT:    v_lshrrev_b16_e32 v8, v6, v7
794; VI-NEXT:    v_sub_u16_e32 v6, 16, v6
795; VI-NEXT:    v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
796; VI-NEXT:    v_or_b32_e32 v6, v6, v8
797; VI-NEXT:    v_bfe_u32 v8, v4, 16, 4
798; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
799; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
800; VI-NEXT:    v_and_b32_e32 v7, 15, v5
801; VI-NEXT:    v_lshrrev_b16_e32 v8, v7, v3
802; VI-NEXT:    v_sub_u16_e32 v7, 16, v7
803; VI-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
804; VI-NEXT:    v_and_b32_e32 v5, 15, v5
805; VI-NEXT:    v_or_b32_e32 v1, v1, v8
806; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
807; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
808; VI-NEXT:    v_and_b32_e32 v3, 15, v4
809; VI-NEXT:    v_lshrrev_b16_e32 v5, v3, v2
810; VI-NEXT:    v_sub_u16_e32 v3, 16, v3
811; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
812; VI-NEXT:    v_and_b32_e32 v3, 0xf000f, v4
813; VI-NEXT:    v_or_b32_e32 v0, v0, v5
814; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
815; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
816; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
817; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
818; VI-NEXT:    s_setpc_b64 s[30:31]
819;
820; GFX9-LABEL: v_fshr_v3i16:
821; GFX9:       ; %bb.0:
822; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
823; GFX9-NEXT:    v_mov_b32_e32 v7, 15
824; GFX9-NEXT:    v_and_b32_e32 v6, 15, v4
825; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffff
826; GFX9-NEXT:    v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
827; GFX9-NEXT:    v_and_b32_e32 v6, v8, v6
828; GFX9-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
829; GFX9-NEXT:    v_pk_lshrrev_b16 v7, v6, v2
830; GFX9-NEXT:    v_pk_sub_i16 v6, 16, v6 op_sel_hi:[0,1]
831; GFX9-NEXT:    s_mov_b32 s6, 0xf000f
832; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v6, v0
833; GFX9-NEXT:    v_and_b32_e32 v4, s6, v4
834; GFX9-NEXT:    v_or_b32_e32 v0, v0, v7
835; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
836; GFX9-NEXT:    v_mov_b32_e32 v7, 0
837; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
838; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
839; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
840; GFX9-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v4, v7 src0_sel:WORD_1 src1_sel:DWORD
841; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
842; GFX9-NEXT:    v_and_b32_e32 v2, 15, v5
843; GFX9-NEXT:    v_and_b32_e32 v2, v8, v2
844; GFX9-NEXT:    v_pk_lshrrev_b16 v4, v2, v3
845; GFX9-NEXT:    v_pk_sub_i16 v2, 16, v2
846; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v2, v1
847; GFX9-NEXT:    v_and_b32_e32 v2, s6, v5
848; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
849; GFX9-NEXT:    v_or_b32_e32 v1, v1, v4
850; GFX9-NEXT:    v_and_b32_e32 v2, v8, v6
851; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
852; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
853; GFX9-NEXT:    s_setpc_b64 s[30:31]
854;
855; R600-LABEL: v_fshr_v3i16:
856; R600:       ; %bb.0:
857; R600-NEXT:    CF_END
858; R600-NEXT:    PAD
859  %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
860  ret <3 x i16> %ret
861}
862
863define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
864; SI-LABEL: v_fshr_v4i16:
865; SI:       ; %bb.0:
866; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
867; SI-NEXT:    s_mov_b32 s4, 0xffff
868; SI-NEXT:    v_and_b32_e32 v9, 15, v9
869; SI-NEXT:    v_and_b32_e32 v16, s4, v5
870; SI-NEXT:    v_sub_i32_e32 v17, vcc, 16, v9
871; SI-NEXT:    v_lshr_b32_e32 v16, v16, v9
872; SI-NEXT:    v_lshl_b32_e32 v1, v1, v17
873; SI-NEXT:    v_or_b32_e32 v1, v1, v16
874; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
875; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
876; SI-NEXT:    v_and_b32_e32 v5, 15, v8
877; SI-NEXT:    v_sub_i32_e32 v9, vcc, 16, v5
878; SI-NEXT:    v_and_b32_e32 v15, s4, v4
879; SI-NEXT:    v_lshr_b32_e32 v8, v15, v5
880; SI-NEXT:    v_lshl_b32_e32 v0, v0, v9
881; SI-NEXT:    v_or_b32_e32 v0, v0, v8
882; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
883; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
884; SI-NEXT:    v_and_b32_e32 v4, 15, v11
885; SI-NEXT:    v_sub_i32_e32 v8, vcc, 16, v4
886; SI-NEXT:    v_and_b32_e32 v14, s4, v7
887; SI-NEXT:    v_lshr_b32_e32 v5, v14, v4
888; SI-NEXT:    v_lshl_b32_e32 v3, v3, v8
889; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
890; SI-NEXT:    v_or_b32_e32 v3, v3, v5
891; SI-NEXT:    v_and_b32_e32 v4, 15, v10
892; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
893; SI-NEXT:    v_sub_i32_e32 v7, vcc, 16, v4
894; SI-NEXT:    v_and_b32_e32 v13, s4, v6
895; SI-NEXT:    v_lshr_b32_e32 v5, v13, v4
896; SI-NEXT:    v_lshl_b32_e32 v2, v2, v7
897; SI-NEXT:    v_or_b32_e32 v2, v2, v5
898; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
899; SI-NEXT:    v_mov_b32_e32 v12, 0xffff
900; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
901; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
902; SI-NEXT:    v_and_b32_e32 v2, v12, v2
903; SI-NEXT:    v_or_b32_e32 v2, v2, v3
904; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
905; SI-NEXT:    v_and_b32_e32 v0, v12, v0
906; SI-NEXT:    v_or_b32_e32 v0, v0, v1
907; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
908; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
909; SI-NEXT:    s_setpc_b64 s[30:31]
910;
911; VI-LABEL: v_fshr_v4i16:
912; VI:       ; %bb.0:
913; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
914; VI-NEXT:    v_mov_b32_e32 v6, 15
915; VI-NEXT:    v_and_b32_sdwa v7, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
916; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
917; VI-NEXT:    v_lshrrev_b16_e32 v9, v7, v8
918; VI-NEXT:    v_sub_u16_e32 v7, 16, v7
919; VI-NEXT:    v_lshlrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
920; VI-NEXT:    v_or_b32_e32 v7, v7, v9
921; VI-NEXT:    v_bfe_u32 v9, v5, 16, 4
922; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
923; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
924; VI-NEXT:    v_and_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
925; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
926; VI-NEXT:    v_lshrrev_b16_e32 v9, v6, v8
927; VI-NEXT:    v_sub_u16_e32 v6, 16, v6
928; VI-NEXT:    v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
929; VI-NEXT:    v_or_b32_e32 v6, v6, v9
930; VI-NEXT:    v_bfe_u32 v9, v4, 16, 4
931; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
932; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
933; VI-NEXT:    v_and_b32_e32 v8, 15, v5
934; VI-NEXT:    v_lshrrev_b16_e32 v9, v8, v3
935; VI-NEXT:    v_sub_u16_e32 v8, 16, v8
936; VI-NEXT:    s_mov_b32 s4, 0xf000f
937; VI-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
938; VI-NEXT:    v_and_b32_e32 v5, s4, v5
939; VI-NEXT:    v_or_b32_e32 v1, v1, v9
940; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
941; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
942; VI-NEXT:    v_and_b32_e32 v3, 15, v4
943; VI-NEXT:    v_lshrrev_b16_e32 v5, v3, v2
944; VI-NEXT:    v_sub_u16_e32 v3, 16, v3
945; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
946; VI-NEXT:    v_and_b32_e32 v3, s4, v4
947; VI-NEXT:    v_or_b32_e32 v0, v0, v5
948; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
949; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
950; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
951; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
952; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
953; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
954; VI-NEXT:    s_setpc_b64 s[30:31]
955;
956; GFX9-LABEL: v_fshr_v4i16:
957; GFX9:       ; %bb.0:
958; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
959; GFX9-NEXT:    v_mov_b32_e32 v7, 15
960; GFX9-NEXT:    v_and_b32_e32 v6, 15, v5
961; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffff
962; GFX9-NEXT:    v_and_b32_sdwa v8, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
963; GFX9-NEXT:    v_and_b32_e32 v6, v9, v6
964; GFX9-NEXT:    v_lshl_or_b32 v6, v8, 16, v6
965; GFX9-NEXT:    v_pk_lshrrev_b16 v8, v6, v3
966; GFX9-NEXT:    v_pk_sub_i16 v6, 16, v6 op_sel_hi:[0,1]
967; GFX9-NEXT:    s_mov_b32 s6, 0xf000f
968; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v6, v1
969; GFX9-NEXT:    v_and_b32_e32 v5, s6, v5
970; GFX9-NEXT:    v_or_b32_e32 v1, v1, v8
971; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
972; GFX9-NEXT:    v_mov_b32_e32 v8, 0
973; GFX9-NEXT:    v_cndmask_b32_e32 v6, v1, v3, vcc
974; GFX9-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v5, v8 src0_sel:WORD_1 src1_sel:DWORD
975; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
976; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
977; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[4:5]
978; GFX9-NEXT:    v_and_b32_e32 v3, 15, v4
979; GFX9-NEXT:    v_and_b32_sdwa v5, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
980; GFX9-NEXT:    v_and_b32_e32 v3, v9, v3
981; GFX9-NEXT:    v_lshl_or_b32 v3, v5, 16, v3
982; GFX9-NEXT:    v_pk_lshrrev_b16 v5, v3, v2
983; GFX9-NEXT:    v_pk_sub_i16 v3, 16, v3 op_sel_hi:[0,1]
984; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
985; GFX9-NEXT:    v_and_b32_e32 v3, s6, v4
986; GFX9-NEXT:    v_or_b32_e32 v0, v0, v5
987; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
988; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
989; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
990; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
991; GFX9-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v3, v8 src0_sel:WORD_1 src1_sel:DWORD
992; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
993; GFX9-NEXT:    v_and_b32_e32 v2, v9, v4
994; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
995; GFX9-NEXT:    v_and_b32_e32 v2, v9, v6
996; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
997; GFX9-NEXT:    s_setpc_b64 s[30:31]
998;
999; R600-LABEL: v_fshr_v4i16:
1000; R600:       ; %bb.0:
1001; R600-NEXT:    CF_END
1002; R600-NEXT:    PAD
1003  %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
1004  ret <4 x i16> %ret
1005}
1006
1007define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
1008; SI-LABEL: v_fshr_i64:
1009; SI:       ; %bb.0:
1010; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1011; SI-NEXT:    v_and_b32_e32 v4, 63, v4
1012; SI-NEXT:    v_sub_i32_e32 v7, vcc, 64, v4
1013; SI-NEXT:    v_lshr_b64 v[5:6], v[2:3], v4
1014; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v7
1015; SI-NEXT:    v_or_b32_e32 v0, v0, v5
1016; SI-NEXT:    v_mov_b32_e32 v5, 0
1017; SI-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
1018; SI-NEXT:    v_or_b32_e32 v1, v1, v6
1019; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1020; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1021; SI-NEXT:    s_setpc_b64 s[30:31]
1022;
1023; VI-LABEL: v_fshr_i64:
1024; VI:       ; %bb.0:
1025; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1026; VI-NEXT:    v_and_b32_e32 v4, 63, v4
1027; VI-NEXT:    v_sub_u32_e32 v7, vcc, 64, v4
1028; VI-NEXT:    v_lshrrev_b64 v[5:6], v4, v[2:3]
1029; VI-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
1030; VI-NEXT:    v_or_b32_e32 v0, v0, v5
1031; VI-NEXT:    v_mov_b32_e32 v5, 0
1032; VI-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
1033; VI-NEXT:    v_or_b32_e32 v1, v1, v6
1034; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1035; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1036; VI-NEXT:    s_setpc_b64 s[30:31]
1037;
1038; GFX9-LABEL: v_fshr_i64:
1039; GFX9:       ; %bb.0:
1040; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1041; GFX9-NEXT:    v_and_b32_e32 v4, 63, v4
1042; GFX9-NEXT:    v_sub_u32_e32 v7, 64, v4
1043; GFX9-NEXT:    v_lshrrev_b64 v[5:6], v4, v[2:3]
1044; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
1045; GFX9-NEXT:    v_or_b32_e32 v0, v0, v5
1046; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1047; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
1048; GFX9-NEXT:    v_or_b32_e32 v1, v1, v6
1049; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1050; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1051; GFX9-NEXT:    s_setpc_b64 s[30:31]
1052;
1053; R600-LABEL: v_fshr_i64:
1054; R600:       ; %bb.0:
1055; R600-NEXT:    CF_END
1056; R600-NEXT:    PAD
1057  %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
1058  ret i64 %ret
1059}
1060
1061define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
1062; SI-LABEL: v_fshr_v2i64:
1063; SI:       ; %bb.0:
1064; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1065; SI-NEXT:    v_and_b32_e32 v8, 63, v8
1066; SI-NEXT:    v_sub_i32_e32 v9, vcc, 64, v8
1067; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v9
1068; SI-NEXT:    v_lshr_b64 v[11:12], v[4:5], v8
1069; SI-NEXT:    v_mov_b32_e32 v9, 0
1070; SI-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1071; SI-NEXT:    v_or_b32_e32 v0, v0, v11
1072; SI-NEXT:    v_and_b32_e32 v8, 63, v10
1073; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1074; SI-NEXT:    v_sub_i32_e64 v4, s[4:5], 64, v8
1075; SI-NEXT:    v_or_b32_e32 v1, v1, v12
1076; SI-NEXT:    v_lshr_b64 v[10:11], v[6:7], v8
1077; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v4
1078; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1079; SI-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1080; SI-NEXT:    v_or_b32_e32 v3, v3, v11
1081; SI-NEXT:    v_or_b32_e32 v2, v2, v10
1082; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1083; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
1084; SI-NEXT:    s_setpc_b64 s[30:31]
1085;
1086; VI-LABEL: v_fshr_v2i64:
1087; VI:       ; %bb.0:
1088; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1089; VI-NEXT:    v_and_b32_e32 v8, 63, v8
1090; VI-NEXT:    v_sub_u32_e32 v9, vcc, 64, v8
1091; VI-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
1092; VI-NEXT:    v_lshrrev_b64 v[11:12], v8, v[4:5]
1093; VI-NEXT:    v_mov_b32_e32 v9, 0
1094; VI-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1095; VI-NEXT:    v_or_b32_e32 v0, v0, v11
1096; VI-NEXT:    v_and_b32_e32 v8, 63, v10
1097; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1098; VI-NEXT:    v_sub_u32_e64 v4, s[4:5], 64, v8
1099; VI-NEXT:    v_or_b32_e32 v1, v1, v12
1100; VI-NEXT:    v_lshrrev_b64 v[10:11], v8, v[6:7]
1101; VI-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
1102; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1103; VI-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1104; VI-NEXT:    v_or_b32_e32 v3, v3, v11
1105; VI-NEXT:    v_or_b32_e32 v2, v2, v10
1106; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1107; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
1108; VI-NEXT:    s_setpc_b64 s[30:31]
1109;
1110; GFX9-LABEL: v_fshr_v2i64:
1111; GFX9:       ; %bb.0:
1112; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1113; GFX9-NEXT:    v_and_b32_e32 v8, 63, v8
1114; GFX9-NEXT:    v_sub_u32_e32 v9, 64, v8
1115; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
1116; GFX9-NEXT:    v_lshrrev_b64 v[11:12], v8, v[4:5]
1117; GFX9-NEXT:    v_mov_b32_e32 v9, 0
1118; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1119; GFX9-NEXT:    v_or_b32_e32 v0, v0, v11
1120; GFX9-NEXT:    v_and_b32_e32 v8, 63, v10
1121; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1122; GFX9-NEXT:    v_sub_u32_e32 v4, 64, v8
1123; GFX9-NEXT:    v_or_b32_e32 v1, v1, v12
1124; GFX9-NEXT:    v_lshrrev_b64 v[10:11], v8, v[6:7]
1125; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
1126; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1127; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1128; GFX9-NEXT:    v_or_b32_e32 v3, v3, v11
1129; GFX9-NEXT:    v_or_b32_e32 v2, v2, v10
1130; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1131; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
1132; GFX9-NEXT:    s_setpc_b64 s[30:31]
1133;
1134; R600-LABEL: v_fshr_v2i64:
1135; R600:       ; %bb.0:
1136; R600-NEXT:    CF_END
1137; R600-NEXT:    PAD
1138  %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
1139  ret <2 x i64> %ret
1140}
1141
1142define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
1143; SI-LABEL: v_fshr_i24:
1144; SI:       ; %bb.0:
1145; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1146; SI-NEXT:    s_mov_b32 s4, 0xffffff
1147; SI-NEXT:    v_and_b32_e32 v2, s4, v2
1148; SI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
1149; SI-NEXT:    v_mul_hi_u32 v3, v2, s5
1150; SI-NEXT:    v_and_b32_e32 v4, s4, v1
1151; SI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1152; SI-NEXT:    v_mul_lo_u32 v3, v3, 24
1153; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
1154; SI-NEXT:    v_lshr_b32_e32 v3, v4, v2
1155; SI-NEXT:    v_sub_i32_e32 v4, vcc, 24, v2
1156; SI-NEXT:    v_and_b32_e32 v4, s4, v4
1157; SI-NEXT:    v_lshl_b32_e32 v0, v0, v4
1158; SI-NEXT:    v_or_b32_e32 v0, v0, v3
1159; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1160; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
1161; SI-NEXT:    s_setpc_b64 s[30:31]
1162;
1163; VI-LABEL: v_fshr_i24:
1164; VI:       ; %bb.0:
1165; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1166; VI-NEXT:    s_mov_b32 s4, 0xffffff
1167; VI-NEXT:    v_and_b32_e32 v2, s4, v2
1168; VI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
1169; VI-NEXT:    v_mul_hi_u32 v3, v2, s5
1170; VI-NEXT:    v_and_b32_e32 v4, s4, v1
1171; VI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1172; VI-NEXT:    v_mul_lo_u32 v3, v3, 24
1173; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
1174; VI-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
1175; VI-NEXT:    v_sub_u32_e32 v4, vcc, 24, v2
1176; VI-NEXT:    v_and_b32_e32 v4, s4, v4
1177; VI-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
1178; VI-NEXT:    v_or_b32_e32 v0, v0, v3
1179; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1180; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
1181; VI-NEXT:    s_setpc_b64 s[30:31]
1182;
1183; GFX9-LABEL: v_fshr_i24:
1184; GFX9:       ; %bb.0:
1185; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1186; GFX9-NEXT:    s_mov_b32 s4, 0xffffff
1187; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
1188; GFX9-NEXT:    s_mov_b32 s5, 0xaaaaaaab
1189; GFX9-NEXT:    v_mul_hi_u32 v3, v2, s5
1190; GFX9-NEXT:    v_and_b32_e32 v4, s4, v1
1191; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1192; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
1193; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
1194; GFX9-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
1195; GFX9-NEXT:    v_sub_u32_e32 v4, 24, v2
1196; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
1197; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v4, v3
1198; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1199; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
1200; GFX9-NEXT:    s_setpc_b64 s[30:31]
1201;
1202; R600-LABEL: v_fshr_i24:
1203; R600:       ; %bb.0:
1204; R600-NEXT:    CF_END
1205; R600-NEXT:    PAD
1206  %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
1207  ret i24 %ret
1208}
1209
1210define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
1211; SI-LABEL: v_fshr_v2i24:
1212; SI:       ; %bb.0:
1213; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1214; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
1215; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
1216; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
1217; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4
1218; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
1219; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32
1220; SI-NEXT:    s_mov_b32 s4, 0xffffff
1221; SI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
1222; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v0
1223; SI-NEXT:    v_add_i32_e32 v8, vcc, 4, v0
1224; SI-NEXT:    v_add_i32_e32 v9, vcc, 5, v0
1225; SI-NEXT:    v_add_i32_e32 v10, vcc, 2, v0
1226; SI-NEXT:    s_waitcnt vmcnt(5)
1227; SI-NEXT:    v_and_b32_e32 v14, s4, v1
1228; SI-NEXT:    s_waitcnt vmcnt(4)
1229; SI-NEXT:    v_and_b32_e32 v2, s4, v2
1230; SI-NEXT:    v_mul_hi_u32 v12, v2, s5
1231; SI-NEXT:    s_waitcnt vmcnt(3)
1232; SI-NEXT:    v_and_b32_e32 v3, s4, v3
1233; SI-NEXT:    v_mul_hi_u32 v13, v3, s5
1234; SI-NEXT:    s_waitcnt vmcnt(1)
1235; SI-NEXT:    v_and_b32_e32 v11, s4, v5
1236; SI-NEXT:    v_lshrrev_b32_e32 v12, 4, v12
1237; SI-NEXT:    v_mul_lo_u32 v12, v12, 24
1238; SI-NEXT:    v_lshrrev_b32_e32 v13, 4, v13
1239; SI-NEXT:    v_mul_lo_u32 v13, v13, 24
1240; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v12
1241; SI-NEXT:    v_lshr_b32_e32 v12, v14, v2
1242; SI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v13
1243; SI-NEXT:    v_sub_i32_e32 v13, vcc, 24, v2
1244; SI-NEXT:    v_sub_i32_e32 v14, vcc, 24, v3
1245; SI-NEXT:    v_and_b32_e32 v13, s4, v13
1246; SI-NEXT:    s_waitcnt vmcnt(0)
1247; SI-NEXT:    v_lshl_b32_e32 v6, v6, v13
1248; SI-NEXT:    v_and_b32_e32 v14, 0xffffff, v14
1249; SI-NEXT:    v_lshr_b32_e32 v11, v11, v3
1250; SI-NEXT:    v_lshl_b32_e32 v4, v4, v14
1251; SI-NEXT:    v_or_b32_e32 v6, v6, v12
1252; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1253; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
1254; SI-NEXT:    v_or_b32_e32 v4, v4, v11
1255; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1256; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
1257; SI-NEXT:    buffer_store_byte v2, v7, s[0:3], 0 offen
1258; SI-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
1259; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
1260; SI-NEXT:    s_waitcnt expcnt(1)
1261; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1262; SI-NEXT:    s_waitcnt expcnt(0)
1263; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1264; SI-NEXT:    buffer_store_byte v0, v8, s[0:3], 0 offen
1265; SI-NEXT:    buffer_store_byte v2, v9, s[0:3], 0 offen
1266; SI-NEXT:    buffer_store_byte v1, v10, s[0:3], 0 offen
1267; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1268; SI-NEXT:    s_setpc_b64 s[30:31]
1269;
1270; VI-LABEL: v_fshr_v2i24:
1271; VI:       ; %bb.0:
1272; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1273; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
1274; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
1275; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
1276; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4
1277; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
1278; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32
1279; VI-NEXT:    s_mov_b32 s4, 0xffffff
1280; VI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
1281; VI-NEXT:    v_add_u32_e32 v7, vcc, 3, v0
1282; VI-NEXT:    v_add_u32_e32 v8, vcc, 4, v0
1283; VI-NEXT:    v_add_u32_e32 v9, vcc, 5, v0
1284; VI-NEXT:    v_add_u32_e32 v10, vcc, 2, v0
1285; VI-NEXT:    s_waitcnt vmcnt(5)
1286; VI-NEXT:    v_and_b32_e32 v14, s4, v1
1287; VI-NEXT:    s_waitcnt vmcnt(4)
1288; VI-NEXT:    v_and_b32_e32 v2, s4, v2
1289; VI-NEXT:    v_mul_hi_u32 v12, v2, s5
1290; VI-NEXT:    s_waitcnt vmcnt(3)
1291; VI-NEXT:    v_and_b32_e32 v3, s4, v3
1292; VI-NEXT:    v_mul_hi_u32 v13, v3, s5
1293; VI-NEXT:    s_waitcnt vmcnt(1)
1294; VI-NEXT:    v_and_b32_e32 v11, s4, v5
1295; VI-NEXT:    v_lshrrev_b32_e32 v12, 4, v12
1296; VI-NEXT:    v_mul_lo_u32 v12, v12, 24
1297; VI-NEXT:    v_lshrrev_b32_e32 v13, 4, v13
1298; VI-NEXT:    v_mul_lo_u32 v13, v13, 24
1299; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v12
1300; VI-NEXT:    v_lshrrev_b32_e32 v12, v2, v14
1301; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v13
1302; VI-NEXT:    v_sub_u32_e32 v13, vcc, 24, v2
1303; VI-NEXT:    v_sub_u32_e32 v14, vcc, 24, v3
1304; VI-NEXT:    v_and_b32_e32 v13, s4, v13
1305; VI-NEXT:    s_waitcnt vmcnt(0)
1306; VI-NEXT:    v_lshlrev_b32_e32 v6, v13, v6
1307; VI-NEXT:    v_and_b32_e32 v14, 0xffffff, v14
1308; VI-NEXT:    v_lshrrev_b32_e32 v11, v3, v11
1309; VI-NEXT:    v_lshlrev_b32_e32 v4, v14, v4
1310; VI-NEXT:    v_or_b32_e32 v6, v6, v12
1311; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1312; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
1313; VI-NEXT:    v_or_b32_e32 v4, v4, v11
1314; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1315; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
1316; VI-NEXT:    buffer_store_byte v2, v7, s[0:3], 0 offen
1317; VI-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
1318; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
1319; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1320; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1321; VI-NEXT:    buffer_store_byte v0, v8, s[0:3], 0 offen
1322; VI-NEXT:    buffer_store_byte v2, v9, s[0:3], 0 offen
1323; VI-NEXT:    buffer_store_byte v1, v10, s[0:3], 0 offen
1324; VI-NEXT:    s_waitcnt vmcnt(0)
1325; VI-NEXT:    s_setpc_b64 s[30:31]
1326;
1327; GFX9-LABEL: v_fshr_v2i24:
1328; GFX9:       ; %bb.0:
1329; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1330; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16
1331; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:20
1332; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
1333; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
1334; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32
1335; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:8
1336; GFX9-NEXT:    s_mov_b32 s4, 0xffffff
1337; GFX9-NEXT:    s_mov_b32 s5, 0xaaaaaaab
1338; GFX9-NEXT:    s_waitcnt vmcnt(5)
1339; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
1340; GFX9-NEXT:    v_mul_hi_u32 v6, v1, s5
1341; GFX9-NEXT:    s_waitcnt vmcnt(4)
1342; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
1343; GFX9-NEXT:    v_mul_hi_u32 v7, v2, s5
1344; GFX9-NEXT:    s_waitcnt vmcnt(2)
1345; GFX9-NEXT:    v_and_b32_e32 v9, s4, v4
1346; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1347; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
1348; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
1349; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
1350; GFX9-NEXT:    s_waitcnt vmcnt(0)
1351; GFX9-NEXT:    v_and_b32_e32 v10, s4, v8
1352; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v6
1353; GFX9-NEXT:    v_lshrrev_b32_e32 v6, v1, v10
1354; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v7
1355; GFX9-NEXT:    v_sub_u32_e32 v7, 24, v1
1356; GFX9-NEXT:    v_sub_u32_e32 v10, 24, v2
1357; GFX9-NEXT:    v_and_b32_e32 v7, s4, v7
1358; GFX9-NEXT:    v_lshrrev_b32_e32 v9, v2, v9
1359; GFX9-NEXT:    v_and_b32_e32 v10, 0xffffff, v10
1360; GFX9-NEXT:    v_lshl_or_b32 v5, v5, v7, v6
1361; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1362; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc
1363; GFX9-NEXT:    v_lshl_or_b32 v3, v3, v10, v9
1364; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1365; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
1366; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
1367; GFX9-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
1368; GFX9-NEXT:    buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
1369; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
1370; GFX9-NEXT:    buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
1371; GFX9-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
1372; GFX9-NEXT:    s_waitcnt vmcnt(0)
1373; GFX9-NEXT:    s_setpc_b64 s[30:31]
1374;
1375; R600-LABEL: v_fshr_v2i24:
1376; R600:       ; %bb.0:
1377; R600-NEXT:    CF_END
1378; R600-NEXT:    PAD
1379  %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
1380  ret <2 x i24> %ret
1381}
1382