1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,VI
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,GFX9
5; RUN: llc < %s -march=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,R600
6
; Declarations of the llvm.fshr (funnel shift right) intrinsic overloads used
; by this test: scalar and vector forms over i32, i16, i64 and i24 elements.
; NOTE(review): the i64, i24 and v4i16 overloads are not called in the part of
; the file reviewed here - presumably they are exercised further down.
7declare i32 @llvm.fshr.i32(i32, i32, i32)
8declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
9declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
10declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
11declare i16 @llvm.fshr.i16(i16, i16, i16)
12declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
13declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
14declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
15declare i64 @llvm.fshr.i64(i64, i64, i64)
16declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
17declare i24 @llvm.fshr.i24(i24, i24, i24)
18declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
19
; Kernel: scalar i32 funnel shift right with a variable shift amount.
; Computes fshr(%x, %y, %z) from the kernel arguments and stores the result
; through %in. The SI, VI and GFX9 checks expect one v_alignbit_b32 fed by the
; three loaded arguments; the R600 checks expect one BIT_ALIGN_INT.
20define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
21; SI-LABEL: fshr_i32:
22; SI:       ; %bb.0: ; %entry
23; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
24; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
25; SI-NEXT:    s_mov_b32 s7, 0xf000
26; SI-NEXT:    s_mov_b32 s6, -1
27; SI-NEXT:    s_waitcnt lgkmcnt(0)
28; SI-NEXT:    v_mov_b32_e32 v0, s1
29; SI-NEXT:    v_mov_b32_e32 v1, s2
30; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v1
31; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
32; SI-NEXT:    s_endpgm
33;
34; VI-LABEL: fshr_i32:
35; VI:       ; %bb.0: ; %entry
36; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
37; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
38; VI-NEXT:    s_waitcnt lgkmcnt(0)
39; VI-NEXT:    v_mov_b32_e32 v0, s1
40; VI-NEXT:    v_mov_b32_e32 v1, s2
41; VI-NEXT:    v_alignbit_b32 v2, s0, v0, v1
42; VI-NEXT:    v_mov_b32_e32 v0, s4
43; VI-NEXT:    v_mov_b32_e32 v1, s5
44; VI-NEXT:    flat_store_dword v[0:1], v2
45; VI-NEXT:    s_endpgm
46;
47; GFX9-LABEL: fshr_i32:
48; GFX9:       ; %bb.0: ; %entry
49; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
50; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
51; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
52; GFX9-NEXT:    v_mov_b32_e32 v0, s1
53; GFX9-NEXT:    v_mov_b32_e32 v1, s2
54; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, v1
55; GFX9-NEXT:    v_mov_b32_e32 v0, s4
56; GFX9-NEXT:    v_mov_b32_e32 v1, s5
57; GFX9-NEXT:    global_store_dword v[0:1], v2, off
58; GFX9-NEXT:    s_endpgm
59;
60; R600-LABEL: fshr_i32:
61; R600:       ; %bb.0: ; %entry
62; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
63; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
64; R600-NEXT:    CF_END
65; R600-NEXT:    PAD
66; R600-NEXT:    ALU clause starting at 4:
67; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
68; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
69; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
70entry:
71  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
72  store i32 %0, i32 addrspace(1)* %in
73  ret void
74}
75
; Kernel: scalar i32 funnel shift right with the constant shift amount 7.
; Computes fshr(%x, %y, 7) and stores the result through %in. The SI, VI and
; GFX9 checks expect v_alignbit_b32 with 7 as an inline operand; the R600
; checks expect BIT_ALIGN_INT taking 7 from a literal slot.
76define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
77; SI-LABEL: fshr_i32_imm:
78; SI:       ; %bb.0: ; %entry
79; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
80; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
81; SI-NEXT:    s_mov_b32 s7, 0xf000
82; SI-NEXT:    s_mov_b32 s6, -1
83; SI-NEXT:    s_waitcnt lgkmcnt(0)
84; SI-NEXT:    v_mov_b32_e32 v0, s1
85; SI-NEXT:    v_alignbit_b32 v0, s0, v0, 7
86; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
87; SI-NEXT:    s_endpgm
88;
89; VI-LABEL: fshr_i32_imm:
90; VI:       ; %bb.0: ; %entry
91; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
92; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
93; VI-NEXT:    s_waitcnt lgkmcnt(0)
94; VI-NEXT:    v_mov_b32_e32 v0, s1
95; VI-NEXT:    v_alignbit_b32 v2, s0, v0, 7
96; VI-NEXT:    v_mov_b32_e32 v0, s2
97; VI-NEXT:    v_mov_b32_e32 v1, s3
98; VI-NEXT:    flat_store_dword v[0:1], v2
99; VI-NEXT:    s_endpgm
100;
101; GFX9-LABEL: fshr_i32_imm:
102; GFX9:       ; %bb.0: ; %entry
103; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
104; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
105; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
106; GFX9-NEXT:    v_mov_b32_e32 v0, s1
107; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, 7
108; GFX9-NEXT:    v_mov_b32_e32 v0, s2
109; GFX9-NEXT:    v_mov_b32_e32 v1, s3
110; GFX9-NEXT:    global_store_dword v[0:1], v2, off
111; GFX9-NEXT:    s_endpgm
112;
113; R600-LABEL: fshr_i32_imm:
114; R600:       ; %bb.0: ; %entry
115; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
116; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
117; R600-NEXT:    CF_END
118; R600-NEXT:    PAD
119; R600-NEXT:    ALU clause starting at 4:
120; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
121; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
122; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
123; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
124entry:
125  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
126  store i32 %0, i32 addrspace(1)* %in
127  ret void
128}
129
; Kernel: <2 x i32> funnel shift right with variable per-lane shift amounts.
; Computes fshr(%x, %y, %z) and stores the vector through %in. Unlike the
; scalar case, the checks expect each lane to be expanded: the amount is
; masked with 31, fed to an alignbit (BIT_ALIGN_INT on R600), compared with
; zero, and a conditional select then picks between the alignbit result and
; the matching %y lane.
130define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
131; SI-LABEL: fshr_v2i32:
132; SI:       ; %bb.0: ; %entry
133; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
134; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
135; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
136; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
137; SI-NEXT:    s_mov_b32 s7, 0xf000
138; SI-NEXT:    s_mov_b32 s6, -1
139; SI-NEXT:    s_waitcnt lgkmcnt(0)
140; SI-NEXT:    v_mov_b32_e32 v0, s9
141; SI-NEXT:    s_and_b32 s1, s1, 31
142; SI-NEXT:    v_mov_b32_e32 v1, s1
143; SI-NEXT:    s_and_b32 s0, s0, 31
144; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
145; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
146; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
147; SI-NEXT:    v_mov_b32_e32 v0, s8
148; SI-NEXT:    v_mov_b32_e32 v2, s0
149; SI-NEXT:    v_alignbit_b32 v2, s2, v0, v2
150; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
151; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
152; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
153; SI-NEXT:    s_endpgm
154;
155; VI-LABEL: fshr_v2i32:
156; VI:       ; %bb.0: ; %entry
157; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
158; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
159; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
160; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
161; VI-NEXT:    s_waitcnt lgkmcnt(0)
162; VI-NEXT:    v_mov_b32_e32 v0, s7
163; VI-NEXT:    s_and_b32 s1, s1, 31
164; VI-NEXT:    v_mov_b32_e32 v1, s1
165; VI-NEXT:    s_and_b32 s0, s0, 31
166; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
167; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
168; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
169; VI-NEXT:    v_mov_b32_e32 v0, s6
170; VI-NEXT:    v_mov_b32_e32 v2, s0
171; VI-NEXT:    v_alignbit_b32 v2, s4, v0, v2
172; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
173; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
174; VI-NEXT:    v_mov_b32_e32 v2, s2
175; VI-NEXT:    v_mov_b32_e32 v3, s3
176; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
177; VI-NEXT:    s_endpgm
178;
179; GFX9-LABEL: fshr_v2i32:
180; GFX9:       ; %bb.0: ; %entry
181; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
182; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
183; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
184; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
185; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
186; GFX9-NEXT:    v_mov_b32_e32 v0, s7
187; GFX9-NEXT:    s_and_b32 s1, s1, 31
188; GFX9-NEXT:    v_mov_b32_e32 v1, s1
189; GFX9-NEXT:    s_and_b32 s0, s0, 31
190; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
191; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
192; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
193; GFX9-NEXT:    v_mov_b32_e32 v0, s6
194; GFX9-NEXT:    v_mov_b32_e32 v2, s0
195; GFX9-NEXT:    v_alignbit_b32 v2, s4, v0, v2
196; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
197; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
198; GFX9-NEXT:    v_mov_b32_e32 v2, s2
199; GFX9-NEXT:    v_mov_b32_e32 v3, s3
200; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
201; GFX9-NEXT:    s_endpgm
202;
203; R600-LABEL: fshr_v2i32:
204; R600:       ; %bb.0: ; %entry
205; R600-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
206; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
207; R600-NEXT:    CF_END
208; R600-NEXT:    PAD
209; R600-NEXT:    ALU clause starting at 4:
210; R600-NEXT:     AND_INT * T0.W, KC0[4].X, literal.x,
211; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
212; R600-NEXT:     BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PV.W,
213; R600-NEXT:     SETE_INT * T0.W, PV.W, 0.0,
214; R600-NEXT:     CNDE_INT T0.Y, PS, PV.W, KC0[3].Z,
215; R600-NEXT:     AND_INT * T0.W, KC0[3].W, literal.x,
216; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
217; R600-NEXT:     BIT_ALIGN_INT T1.W, KC0[2].W, KC0[3].Y, PV.W,
218; R600-NEXT:     SETE_INT * T0.W, PV.W, 0.0,
219; R600-NEXT:     CNDE_INT T0.X, PS, PV.W, KC0[3].Y,
220; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
221; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
222entry:
223  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
224  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
225  ret void
226}
227
; Kernel: <2 x i32> funnel shift right with constant shift amounts <7, 9>.
; Computes fshr(%x, %y, <7, 9>) and stores the vector through %in. With
; constant amounts the checks expect one alignbit (BIT_ALIGN_INT on R600) per
; lane, using 7 and 9 directly, with no masking, compare or select.
228define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
229; SI-LABEL: fshr_v2i32_imm:
230; SI:       ; %bb.0: ; %entry
231; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
232; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
233; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
234; SI-NEXT:    s_mov_b32 s7, 0xf000
235; SI-NEXT:    s_mov_b32 s6, -1
236; SI-NEXT:    s_waitcnt lgkmcnt(0)
237; SI-NEXT:    v_mov_b32_e32 v0, s1
238; SI-NEXT:    v_alignbit_b32 v1, s3, v0, 9
239; SI-NEXT:    v_mov_b32_e32 v0, s0
240; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 7
241; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
242; SI-NEXT:    s_endpgm
243;
244; VI-LABEL: fshr_v2i32_imm:
245; VI:       ; %bb.0: ; %entry
246; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
247; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
248; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
249; VI-NEXT:    s_waitcnt lgkmcnt(0)
250; VI-NEXT:    v_mov_b32_e32 v0, s1
251; VI-NEXT:    v_mov_b32_e32 v2, s0
252; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
253; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
254; VI-NEXT:    v_mov_b32_e32 v2, s2
255; VI-NEXT:    v_mov_b32_e32 v3, s3
256; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
257; VI-NEXT:    s_endpgm
258;
259; GFX9-LABEL: fshr_v2i32_imm:
260; GFX9:       ; %bb.0: ; %entry
261; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
262; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
263; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
264; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
265; GFX9-NEXT:    v_mov_b32_e32 v0, s1
266; GFX9-NEXT:    v_mov_b32_e32 v2, s0
267; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
268; GFX9-NEXT:    v_alignbit_b32 v0, s4, v2, 7
269; GFX9-NEXT:    v_mov_b32_e32 v2, s2
270; GFX9-NEXT:    v_mov_b32_e32 v3, s3
271; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
272; GFX9-NEXT:    s_endpgm
273;
274; R600-LABEL: fshr_v2i32_imm:
275; R600:       ; %bb.0: ; %entry
276; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
277; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
278; R600-NEXT:    CF_END
279; R600-NEXT:    PAD
280; R600-NEXT:    ALU clause starting at 4:
281; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
282; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
283; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
284; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
285; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
286; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
287entry:
288  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
289  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
290  ret void
291}
292
; Kernel: <4 x i32> funnel shift right with variable per-lane shift amounts.
; Computes fshr(%x, %y, %z) and stores the vector through %in. The checks
; expect the same per-lane expansion as the two-element variable case, four
; times over: mask the amount with 31, alignbit (BIT_ALIGN_INT on R600),
; compare the masked amount with zero, then a conditional select between the
; alignbit result and the matching %y lane.
293define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
294; SI-LABEL: fshr_v4i32:
295; SI:       ; %bb.0: ; %entry
296; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
297; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
298; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x11
299; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x15
300; SI-NEXT:    s_mov_b32 s7, 0xf000
301; SI-NEXT:    s_mov_b32 s6, -1
302; SI-NEXT:    s_waitcnt lgkmcnt(0)
303; SI-NEXT:    v_mov_b32_e32 v0, s15
304; SI-NEXT:    s_and_b32 s3, s3, 31
305; SI-NEXT:    v_mov_b32_e32 v1, s3
306; SI-NEXT:    v_alignbit_b32 v1, s11, v0, v1
307; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
308; SI-NEXT:    s_and_b32 s2, s2, 31
309; SI-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
310; SI-NEXT:    v_mov_b32_e32 v0, s14
311; SI-NEXT:    v_mov_b32_e32 v1, s2
312; SI-NEXT:    v_alignbit_b32 v1, s10, v0, v1
313; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
314; SI-NEXT:    s_and_b32 s1, s1, 31
315; SI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
316; SI-NEXT:    v_mov_b32_e32 v0, s13
317; SI-NEXT:    v_mov_b32_e32 v1, s1
318; SI-NEXT:    s_and_b32 s0, s0, 31
319; SI-NEXT:    v_alignbit_b32 v1, s9, v0, v1
320; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
321; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
322; SI-NEXT:    v_mov_b32_e32 v0, s12
323; SI-NEXT:    v_mov_b32_e32 v4, s0
324; SI-NEXT:    v_alignbit_b32 v4, s8, v0, v4
325; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
326; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
327; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
328; SI-NEXT:    s_endpgm
329;
330; VI-LABEL: fshr_v4i32:
331; VI:       ; %bb.0: ; %entry
332; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
333; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
334; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
335; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
336; VI-NEXT:    s_waitcnt lgkmcnt(0)
337; VI-NEXT:    v_mov_b32_e32 v0, s11
338; VI-NEXT:    s_and_b32 s3, s3, 31
339; VI-NEXT:    v_mov_b32_e32 v1, s3
340; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
341; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
342; VI-NEXT:    s_and_b32 s2, s2, 31
343; VI-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
344; VI-NEXT:    v_mov_b32_e32 v0, s10
345; VI-NEXT:    v_mov_b32_e32 v1, s2
346; VI-NEXT:    v_alignbit_b32 v1, s6, v0, v1
347; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
348; VI-NEXT:    s_and_b32 s1, s1, 31
349; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
350; VI-NEXT:    v_mov_b32_e32 v0, s9
351; VI-NEXT:    v_mov_b32_e32 v1, s1
352; VI-NEXT:    s_and_b32 s0, s0, 31
353; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
354; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
355; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
356; VI-NEXT:    v_mov_b32_e32 v0, s8
357; VI-NEXT:    v_mov_b32_e32 v4, s0
358; VI-NEXT:    v_alignbit_b32 v4, s4, v0, v4
359; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
360; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
361; VI-NEXT:    v_mov_b32_e32 v4, s12
362; VI-NEXT:    v_mov_b32_e32 v5, s13
363; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
364; VI-NEXT:    s_endpgm
365;
366; GFX9-LABEL: fshr_v4i32:
367; GFX9:       ; %bb.0: ; %entry
368; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
369; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
370; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
371; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
372; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
373; GFX9-NEXT:    v_mov_b32_e32 v0, s11
374; GFX9-NEXT:    s_and_b32 s3, s3, 31
375; GFX9-NEXT:    v_mov_b32_e32 v1, s3
376; GFX9-NEXT:    v_alignbit_b32 v1, s7, v0, v1
377; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
378; GFX9-NEXT:    s_and_b32 s2, s2, 31
379; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
380; GFX9-NEXT:    v_mov_b32_e32 v0, s10
381; GFX9-NEXT:    v_mov_b32_e32 v1, s2
382; GFX9-NEXT:    v_alignbit_b32 v1, s6, v0, v1
383; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
384; GFX9-NEXT:    s_and_b32 s1, s1, 31
385; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
386; GFX9-NEXT:    v_mov_b32_e32 v0, s9
387; GFX9-NEXT:    v_mov_b32_e32 v1, s1
388; GFX9-NEXT:    s_and_b32 s0, s0, 31
389; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
390; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
391; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
392; GFX9-NEXT:    v_mov_b32_e32 v0, s8
393; GFX9-NEXT:    v_mov_b32_e32 v4, s0
394; GFX9-NEXT:    v_alignbit_b32 v4, s4, v0, v4
395; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
396; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
397; GFX9-NEXT:    v_mov_b32_e32 v4, s12
398; GFX9-NEXT:    v_mov_b32_e32 v5, s13
399; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
400; GFX9-NEXT:    s_endpgm
401;
402; R600-LABEL: fshr_v4i32:
403; R600:       ; %bb.0: ; %entry
404; R600-NEXT:    ALU 20, @4, KC0[CB0:0-32], KC1[]
405; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
406; R600-NEXT:    CF_END
407; R600-NEXT:    PAD
408; R600-NEXT:    ALU clause starting at 4:
409; R600-NEXT:     AND_INT T0.W, KC0[5].Z, literal.x,
410; R600-NEXT:     AND_INT * T1.W, KC0[6].X, literal.x,
411; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
412; R600-NEXT:     SETE_INT T0.Z, PS, 0.0,
413; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[4].X, KC0[5].X, PS,
414; R600-NEXT:     AND_INT * T2.W, KC0[5].W, literal.x,
415; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
416; R600-NEXT:     SETE_INT T1.Z, PV.W, 0.0,
417; R600-NEXT:     BIT_ALIGN_INT * T2.W, KC0[3].W, KC0[4].W, PV.W,
418; R600-NEXT:     CNDE_INT * T1.W, T0.Z, T1.W, KC0[5].X,
419; R600-NEXT:     CNDE_INT T1.Z, T1.Z, T2.W, KC0[4].W,
420; R600-NEXT:     BIT_ALIGN_INT T2.W, KC0[3].Z, KC0[4].Z, T0.W,
421; R600-NEXT:     SETE_INT * T0.W, T0.W, 0.0,
422; R600-NEXT:     CNDE_INT T1.Y, PS, PV.W, KC0[4].Z,
423; R600-NEXT:     AND_INT * T0.W, KC0[5].Y, literal.x,
424; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
425; R600-NEXT:     BIT_ALIGN_INT T2.W, KC0[3].Y, KC0[4].Y, PV.W,
426; R600-NEXT:     SETE_INT * T0.W, PV.W, 0.0,
427; R600-NEXT:     CNDE_INT T1.X, PS, PV.W, KC0[4].Y,
428; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
429; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
430entry:
431  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
432  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
433  ret void
434}
435
; Kernel: <4 x i32> funnel shift right with constant amounts <1, 7, 9, 33>.
; Computes fshr(%x, %y, <1, 7, 9, 33>) and stores the vector through %in.
; The checks expect one alignbit (BIT_ALIGN_INT on R600) per lane with
; amounts 1, 7, 9 and 1: the out-of-range amount 33 in the last lane shows up
; as 1, i.e. reduced modulo the 32-bit element width.
436define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
437; SI-LABEL: fshr_v4i32_imm:
438; SI:       ; %bb.0: ; %entry
439; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
440; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
441; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
442; SI-NEXT:    s_mov_b32 s7, 0xf000
443; SI-NEXT:    s_mov_b32 s6, -1
444; SI-NEXT:    s_waitcnt lgkmcnt(0)
445; SI-NEXT:    v_mov_b32_e32 v0, s3
446; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
447; SI-NEXT:    v_mov_b32_e32 v0, s2
448; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 9
449; SI-NEXT:    v_mov_b32_e32 v0, s1
450; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
451; SI-NEXT:    v_mov_b32_e32 v0, s0
452; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
453; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
454; SI-NEXT:    s_endpgm
455;
456; VI-LABEL: fshr_v4i32_imm:
457; VI:       ; %bb.0: ; %entry
458; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
459; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
460; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
461; VI-NEXT:    s_waitcnt lgkmcnt(0)
462; VI-NEXT:    v_mov_b32_e32 v4, s8
463; VI-NEXT:    v_mov_b32_e32 v5, s9
464; VI-NEXT:    v_mov_b32_e32 v0, s3
465; VI-NEXT:    v_mov_b32_e32 v1, s2
466; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
467; VI-NEXT:    v_mov_b32_e32 v0, s1
468; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
469; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 7
470; VI-NEXT:    v_mov_b32_e32 v0, s0
471; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
472; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
473; VI-NEXT:    s_endpgm
474;
475; GFX9-LABEL: fshr_v4i32_imm:
476; GFX9:       ; %bb.0: ; %entry
477; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
478; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
479; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
480; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
481; GFX9-NEXT:    v_mov_b32_e32 v4, s8
482; GFX9-NEXT:    v_mov_b32_e32 v5, s9
483; GFX9-NEXT:    v_mov_b32_e32 v0, s3
484; GFX9-NEXT:    v_mov_b32_e32 v1, s2
485; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 1
486; GFX9-NEXT:    v_mov_b32_e32 v0, s1
487; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 9
488; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 7
489; GFX9-NEXT:    v_mov_b32_e32 v0, s0
490; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
491; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
492; GFX9-NEXT:    s_endpgm
493;
494; R600-LABEL: fshr_v4i32_imm:
495; R600:       ; %bb.0: ; %entry
496; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
497; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
498; R600-NEXT:    CF_END
499; R600-NEXT:    PAD
500; R600-NEXT:    ALU clause starting at 4:
501; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
502; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
503; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
504; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
505; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
506; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
507; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
508; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
509entry:
510  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
511  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
512  ret void
513}
514
; Non-kernel function: returns fshr(%src0, %src1, %src2) for scalar i32.
; The shared GFX89 checks (used by the SI, VI and GFX9 runs) expect one
; v_alignbit_b32 on the incoming argument registers followed by the return.
; The R600 checks for this function contain only CF_END and PAD.
515define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
516; GFX89-LABEL: v_fshr_i32:
517; GFX89:       ; %bb.0:
518; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
519; GFX89-NEXT:    v_alignbit_b32 v0, v0, v1, v2
520; GFX89-NEXT:    s_setpc_b64 s[30:31]
521;
522; R600-LABEL: v_fshr_i32:
523; R600:       ; %bb.0:
524; R600-NEXT:    CF_END
525; R600-NEXT:    PAD
526  %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
527  ret i32 %ret
528}
529
; Non-kernel function: returns fshr(%src0, %src1, %src2) for <2 x i32>.
; The shared GFX89 checks expect, per lane, the amount masked with 31, a
; v_alignbit_b32, a compare of the masked amount with zero, and a
; v_cndmask_b32 between the alignbit result and the %src1 lane.
; The R600 checks for this function contain only CF_END and PAD.
530define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
531; GFX89-LABEL: v_fshr_v2i32:
532; GFX89:       ; %bb.0:
533; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534; GFX89-NEXT:    v_and_b32_e32 v4, 31, v4
535; GFX89-NEXT:    v_alignbit_b32 v0, v0, v2, v4
536; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
537; GFX89-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
538; GFX89-NEXT:    v_and_b32_e32 v2, 31, v5
539; GFX89-NEXT:    v_alignbit_b32 v1, v1, v3, v2
540; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
541; GFX89-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
542; GFX89-NEXT:    s_setpc_b64 s[30:31]
543;
544; R600-LABEL: v_fshr_v2i32:
545; R600:       ; %bb.0:
546; R600-NEXT:    CF_END
547; R600-NEXT:    PAD
548  %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
549  ret <2 x i32> %ret
550}
551
; Non-kernel function: returns fshr(%src0, %src1, %src2) for <3 x i32>.
; The shared GFX89 checks expect the per-lane sequence of mask with 31,
; v_alignbit_b32, compare with zero and v_cndmask_b32, once for each of the
; three lanes.
; The R600 checks for this function contain only CF_END and PAD.
552define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
553; GFX89-LABEL: v_fshr_v3i32:
554; GFX89:       ; %bb.0:
555; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
556; GFX89-NEXT:    v_and_b32_e32 v6, 31, v6
557; GFX89-NEXT:    v_alignbit_b32 v0, v0, v3, v6
558; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
559; GFX89-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
560; GFX89-NEXT:    v_and_b32_e32 v3, 31, v7
561; GFX89-NEXT:    v_alignbit_b32 v1, v1, v4, v3
562; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
563; GFX89-NEXT:    v_and_b32_e32 v3, 31, v8
564; GFX89-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
565; GFX89-NEXT:    v_alignbit_b32 v2, v2, v5, v3
566; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
567; GFX89-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
568; GFX89-NEXT:    s_setpc_b64 s[30:31]
569;
570; R600-LABEL: v_fshr_v3i32:
571; R600:       ; %bb.0:
572; R600-NEXT:    CF_END
573; R600-NEXT:    PAD
574  %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
575  ret <3 x i32> %ret
576}
577
; Non-kernel function: returns fshr(%src0, %src1, %src2) for <4 x i32>.
; The shared GFX89 checks expect the per-lane sequence of mask with 31,
; v_alignbit_b32, compare with zero and v_cndmask_b32, once for each of the
; four lanes.
; The R600 checks for this function contain only CF_END and PAD.
578define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
579; GFX89-LABEL: v_fshr_v4i32:
580; GFX89:       ; %bb.0:
581; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
582; GFX89-NEXT:    v_and_b32_e32 v8, 31, v8
583; GFX89-NEXT:    v_alignbit_b32 v0, v0, v4, v8
584; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
585; GFX89-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
586; GFX89-NEXT:    v_and_b32_e32 v4, 31, v9
587; GFX89-NEXT:    v_alignbit_b32 v1, v1, v5, v4
588; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
589; GFX89-NEXT:    v_and_b32_e32 v4, 31, v10
590; GFX89-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
591; GFX89-NEXT:    v_alignbit_b32 v2, v2, v6, v4
592; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
593; GFX89-NEXT:    v_and_b32_e32 v4, 31, v11
594; GFX89-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
595; GFX89-NEXT:    v_alignbit_b32 v3, v3, v7, v4
596; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
597; GFX89-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
598; GFX89-NEXT:    s_setpc_b64 s[30:31]
599;
600; R600-LABEL: v_fshr_v4i32:
601; R600:       ; %bb.0:
602; R600-NEXT:    CF_END
603; R600-NEXT:    PAD
604  %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
605  ret <4 x i32> %ret
606}
607
; Non-kernel function: returns fshr(%src0, %src1, %src2) for scalar i16.
; No alignbit appears in any of the checks; the funnel shift is expanded.
; The SI checks mask the amount with 15, zero-extend %src1 with 0xffff, and
; combine a 32-bit right shift with a left shift by (16 - amount), selecting
; %src1 when the masked amount is zero. The VI and GFX9 checks follow the
; same shape using 16-bit shift, subtract and compare instructions.
; The R600 checks for this function contain only CF_END and PAD.
608define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
609; SI-LABEL: v_fshr_i16:
610; SI:       ; %bb.0:
611; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
612; SI-NEXT:    v_and_b32_e32 v2, 15, v2
613; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
614; SI-NEXT:    v_sub_i32_e32 v4, vcc, 16, v2
615; SI-NEXT:    v_lshr_b32_e32 v3, v3, v2
616; SI-NEXT:    v_lshl_b32_e32 v0, v0, v4
617; SI-NEXT:    v_or_b32_e32 v0, v0, v3
618; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
619; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
620; SI-NEXT:    s_setpc_b64 s[30:31]
621;
622; VI-LABEL: v_fshr_i16:
623; VI:       ; %bb.0:
624; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
625; VI-NEXT:    v_and_b32_e32 v2, 15, v2
626; VI-NEXT:    v_sub_u16_e32 v4, 16, v2
627; VI-NEXT:    v_lshrrev_b16_e32 v3, v2, v1
628; VI-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
629; VI-NEXT:    v_or_b32_e32 v0, v0, v3
630; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
631; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
632; VI-NEXT:    s_setpc_b64 s[30:31]
633;
634; GFX9-LABEL: v_fshr_i16:
635; GFX9:       ; %bb.0:
636; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
637; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
638; GFX9-NEXT:    v_sub_u16_e32 v4, 16, v2
639; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v2, v1
640; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
641; GFX9-NEXT:    v_or_b32_e32 v0, v0, v3
642; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
643; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
644; GFX9-NEXT:    s_setpc_b64 s[30:31]
645;
646; R600-LABEL: v_fshr_i16:
647; R600:       ; %bb.0:
648; R600-NEXT:    CF_END
649; R600-NEXT:    PAD
650  %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
651  ret i16 %ret
652}
653
; Non-kernel function: returns fshr(%src0, %src1, %src2) for <2 x i16>.
; Each target's checks show a different expansion. The SI checks handle the
; two elements separately with 32-bit and/sub/shift/or/select sequences and
; then repack them. The VI checks use 16-bit shifts, some in SDWA form. The
; GFX9 checks use packed v_pk_sub_i16, v_pk_lshlrev_b16 and v_pk_lshrrev_b16,
; followed by per-half compares and selects for a zero shift amount.
; The R600 checks for this function contain only CF_END and PAD.
654define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
655; SI-LABEL: v_fshr_v2i16:
656; SI:       ; %bb.0:
657; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
658; SI-NEXT:    s_mov_b32 s4, 0xffff
659; SI-NEXT:    v_and_b32_e32 v5, 15, v5
660; SI-NEXT:    v_and_b32_e32 v7, s4, v3
661; SI-NEXT:    v_sub_i32_e32 v8, vcc, 16, v5
662; SI-NEXT:    v_lshr_b32_e32 v7, v7, v5
663; SI-NEXT:    v_lshl_b32_e32 v1, v1, v8
664; SI-NEXT:    v_or_b32_e32 v1, v1, v7
665; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
666; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
667; SI-NEXT:    v_and_b32_e32 v3, 15, v4
668; SI-NEXT:    v_sub_i32_e32 v5, vcc, 16, v3
669; SI-NEXT:    v_and_b32_e32 v6, s4, v2
670; SI-NEXT:    v_lshr_b32_e32 v4, v6, v3
671; SI-NEXT:    v_lshl_b32_e32 v0, v0, v5
672; SI-NEXT:    v_or_b32_e32 v0, v0, v4
673; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
674; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
675; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
676; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
677; SI-NEXT:    v_or_b32_e32 v0, v0, v1
678; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
679; SI-NEXT:    s_setpc_b64 s[30:31]
680;
681; VI-LABEL: v_fshr_v2i16:
682; VI:       ; %bb.0:
683; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
684; VI-NEXT:    v_and_b32_e32 v3, 0xf000f, v2
685; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
686; VI-NEXT:    v_bfe_u32 v2, v2, 16, 4
687; VI-NEXT:    v_lshrrev_b16_e32 v4, v3, v1
688; VI-NEXT:    v_lshrrev_b16_sdwa v6, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
689; VI-NEXT:    v_or_b32_e32 v4, v4, v6
690; VI-NEXT:    v_sub_u16_e32 v6, 16, v2
691; VI-NEXT:    v_sub_u16_e32 v7, 16, v3
692; VI-NEXT:    v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
693; VI-NEXT:    v_lshlrev_b16_e32 v0, v7, v0
694; VI-NEXT:    v_or_b32_e32 v0, v0, v6
695; VI-NEXT:    v_or_b32_e32 v0, v0, v4
696; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
697; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
698; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
699; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
700; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
701; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
702; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
703; VI-NEXT:    s_setpc_b64 s[30:31]
704;
705; GFX9-LABEL: v_fshr_v2i16:
706; GFX9:       ; %bb.0:
707; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
708; GFX9-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
709; GFX9-NEXT:    v_pk_sub_i16 v4, 16, v2 op_sel_hi:[0,1]
710; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v4, v0
711; GFX9-NEXT:    v_pk_lshrrev_b16 v3, v2, v1
712; GFX9-NEXT:    v_or_b32_e32 v0, v0, v3
713; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
714; GFX9-NEXT:    v_mov_b32_e32 v4, 0
715; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
716; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
717; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
718; GFX9-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v2, v4 src0_sel:WORD_1 src1_sel:DWORD
719; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
720; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v3
721; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
722; GFX9-NEXT:    s_setpc_b64 s[30:31]
723;
724; R600-LABEL: v_fshr_v2i16:
725; R600:       ; %bb.0:
726; R600-NEXT:    CF_END
727; R600-NEXT:    PAD
728  %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
729  ret <2 x i16> %ret
730}
731
; v_fshr_v3i16: funnel shift right on <3 x i16> vectors. The IR body is a
; single call to @llvm.fshr.v3i16(%src0, %src1, %src2) whose result is returned.
;
; The SI / VI / GFX9 / R600 check blocks below hold the expected llc output for
; the tahiti, tonga, gfx900 and r600/redwood RUN lines at the top of this file.
; They are autogenerated by utils/update_llc_test_checks.py; regenerate them
; with that script instead of editing them by hand.
;
; What the checked code shows: every shift amount is masked with 15, the two
; partial shifts are combined with v_or, and a v_cmp_eq against 0 followed by
; v_cndmask selects a different register when the masked amount is zero.
; NOTE(review): the register picked for a zero amount appears to be the
; matching %src1 lane -- confirm against the calling convention.
; The R600 block only checks CF_END / PAD.
732define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
733; SI-LABEL: v_fshr_v3i16:
734; SI:       ; %bb.0:
735; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
736; SI-NEXT:    s_mov_b32 s4, 0xffff
737; SI-NEXT:    v_and_b32_e32 v7, 15, v7
738; SI-NEXT:    v_and_b32_e32 v12, s4, v4
739; SI-NEXT:    v_sub_i32_e32 v13, vcc, 16, v7
740; SI-NEXT:    v_lshr_b32_e32 v12, v12, v7
741; SI-NEXT:    v_lshl_b32_e32 v1, v1, v13
742; SI-NEXT:    v_or_b32_e32 v1, v1, v12
743; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
744; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
745; SI-NEXT:    v_and_b32_e32 v4, 15, v6
746; SI-NEXT:    v_sub_i32_e32 v7, vcc, 16, v4
747; SI-NEXT:    v_and_b32_e32 v11, s4, v3
748; SI-NEXT:    v_lshr_b32_e32 v6, v11, v4
749; SI-NEXT:    v_lshl_b32_e32 v0, v0, v7
750; SI-NEXT:    v_or_b32_e32 v0, v0, v6
751; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
752; SI-NEXT:    v_mov_b32_e32 v9, 0xffff
753; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
754; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
755; SI-NEXT:    v_and_b32_e32 v0, v9, v0
756; SI-NEXT:    v_or_b32_e32 v0, v0, v1
757; SI-NEXT:    v_and_b32_e32 v1, 15, v8
758; SI-NEXT:    v_sub_i32_e32 v4, vcc, 16, v1
759; SI-NEXT:    v_and_b32_e32 v10, s4, v5
760; SI-NEXT:    v_lshr_b32_e32 v3, v10, v1
761; SI-NEXT:    v_lshl_b32_e32 v2, v2, v4
762; SI-NEXT:    v_or_b32_e32 v2, v2, v3
763; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
764; SI-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
765; SI-NEXT:    v_and_b32_e32 v2, v9, v1
766; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
767; SI-NEXT:    s_setpc_b64 s[30:31]
768;
769; VI-LABEL: v_fshr_v3i16:
770; VI:       ; %bb.0:
771; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
772; VI-NEXT:    v_mov_b32_e32 v6, 15
773; VI-NEXT:    v_and_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
774; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
775; VI-NEXT:    v_lshrrev_b16_e32 v8, v6, v7
776; VI-NEXT:    v_sub_u16_e32 v6, 16, v6
777; VI-NEXT:    v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
778; VI-NEXT:    v_or_b32_e32 v6, v6, v8
779; VI-NEXT:    v_bfe_u32 v8, v4, 16, 4
780; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v8
781; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
782; VI-NEXT:    v_and_b32_e32 v7, 15, v5
783; VI-NEXT:    v_lshrrev_b16_e32 v8, v7, v3
784; VI-NEXT:    v_sub_u16_e32 v7, 16, v7
785; VI-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
786; VI-NEXT:    v_and_b32_e32 v5, 15, v5
787; VI-NEXT:    v_or_b32_e32 v1, v1, v8
788; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
789; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
790; VI-NEXT:    v_and_b32_e32 v3, 15, v4
791; VI-NEXT:    v_lshrrev_b16_e32 v5, v3, v2
792; VI-NEXT:    v_sub_u16_e32 v3, 16, v3
793; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
794; VI-NEXT:    v_and_b32_e32 v3, 0xf000f, v4
795; VI-NEXT:    v_or_b32_e32 v0, v0, v5
796; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
797; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
798; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
799; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
800; VI-NEXT:    s_setpc_b64 s[30:31]
801;
802; GFX9-LABEL: v_fshr_v3i16:
803; GFX9:       ; %bb.0:
804; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
805; GFX9-NEXT:    v_mov_b32_e32 v7, 15
806; GFX9-NEXT:    v_and_b32_e32 v6, 15, v4
807; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffff
808; GFX9-NEXT:    v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
809; GFX9-NEXT:    v_and_b32_e32 v6, v8, v6
810; GFX9-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
811; GFX9-NEXT:    v_pk_lshrrev_b16 v7, v6, v2
812; GFX9-NEXT:    v_pk_sub_i16 v6, 16, v6 op_sel_hi:[0,1]
813; GFX9-NEXT:    s_mov_b32 s6, 0xf000f
814; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v6, v0
815; GFX9-NEXT:    v_and_b32_e32 v4, s6, v4
816; GFX9-NEXT:    v_or_b32_e32 v0, v0, v7
817; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v4
818; GFX9-NEXT:    v_mov_b32_e32 v7, 0
819; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
820; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
821; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
822; GFX9-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v4, v7 src0_sel:WORD_1 src1_sel:DWORD
823; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
824; GFX9-NEXT:    v_and_b32_e32 v2, 15, v5
825; GFX9-NEXT:    v_and_b32_e32 v2, v8, v2
826; GFX9-NEXT:    v_pk_lshrrev_b16 v4, v2, v3
827; GFX9-NEXT:    v_pk_sub_i16 v2, 16, v2
828; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v2, v1
829; GFX9-NEXT:    v_and_b32_e32 v2, s6, v5
830; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v2
831; GFX9-NEXT:    v_or_b32_e32 v1, v1, v4
832; GFX9-NEXT:    v_and_b32_e32 v2, v8, v6
833; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
834; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
835; GFX9-NEXT:    s_setpc_b64 s[30:31]
836;
837; R600-LABEL: v_fshr_v3i16:
838; R600:       ; %bb.0:
839; R600-NEXT:    CF_END
840; R600-NEXT:    PAD
841  %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
842  ret <3 x i16> %ret
843}
844
; v_fshr_v4i16: funnel shift right on <4 x i16> vectors. The IR body is a
; single call to @llvm.fshr.v4i16(%src0, %src1, %src2) whose result is returned.
;
; The SI / VI / GFX9 / R600 check blocks below hold the expected llc output for
; the tahiti, tonga, gfx900 and r600/redwood RUN lines at the top of this file.
; They are autogenerated by utils/update_llc_test_checks.py; regenerate them
; with that script instead of editing them by hand.
;
; What the checked code shows: every shift amount is masked with 15 (VI and
; GFX9 also use the packed mask 0xf000f for the zero test), the left and right
; partial shifts are or'ed together, and v_cmp_eq against 0 plus v_cndmask
; selects a different register when the masked amount is zero. GFX9 uses the
; packed v_pk_* 16-bit shifts; SI works on one 32-bit register per element.
; The R600 block only checks CF_END / PAD.
845define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
846; SI-LABEL: v_fshr_v4i16:
847; SI:       ; %bb.0:
848; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
849; SI-NEXT:    s_mov_b32 s4, 0xffff
850; SI-NEXT:    v_and_b32_e32 v11, 15, v11
851; SI-NEXT:    v_and_b32_e32 v16, s4, v7
852; SI-NEXT:    v_sub_i32_e32 v17, vcc, 16, v11
853; SI-NEXT:    v_lshr_b32_e32 v16, v16, v11
854; SI-NEXT:    v_lshl_b32_e32 v3, v3, v17
855; SI-NEXT:    v_or_b32_e32 v3, v3, v16
856; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v11
857; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
858; SI-NEXT:    v_and_b32_e32 v7, 15, v10
859; SI-NEXT:    v_sub_i32_e32 v11, vcc, 16, v7
860; SI-NEXT:    v_and_b32_e32 v15, s4, v6
861; SI-NEXT:    v_lshr_b32_e32 v10, v15, v7
862; SI-NEXT:    v_lshl_b32_e32 v2, v2, v11
863; SI-NEXT:    v_or_b32_e32 v2, v2, v10
864; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
865; SI-NEXT:    v_mov_b32_e32 v12, 0xffff
866; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
867; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
868; SI-NEXT:    v_and_b32_e32 v2, v12, v2
869; SI-NEXT:    v_or_b32_e32 v2, v2, v3
870; SI-NEXT:    v_and_b32_e32 v3, 15, v9
871; SI-NEXT:    v_sub_i32_e32 v7, vcc, 16, v3
872; SI-NEXT:    v_and_b32_e32 v14, s4, v5
873; SI-NEXT:    v_lshr_b32_e32 v6, v14, v3
874; SI-NEXT:    v_lshl_b32_e32 v1, v1, v7
875; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
876; SI-NEXT:    v_or_b32_e32 v1, v1, v6
877; SI-NEXT:    v_and_b32_e32 v3, 15, v8
878; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
879; SI-NEXT:    v_sub_i32_e32 v6, vcc, 16, v3
880; SI-NEXT:    v_and_b32_e32 v13, s4, v4
881; SI-NEXT:    v_lshr_b32_e32 v5, v13, v3
882; SI-NEXT:    v_lshl_b32_e32 v0, v0, v6
883; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
884; SI-NEXT:    v_or_b32_e32 v0, v0, v5
885; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
886; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
887; SI-NEXT:    v_and_b32_e32 v0, v12, v0
888; SI-NEXT:    v_or_b32_e32 v0, v0, v1
889; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
890; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
891; SI-NEXT:    s_setpc_b64 s[30:31]
892;
893; VI-LABEL: v_fshr_v4i16:
894; VI:       ; %bb.0:
895; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
896; VI-NEXT:    v_mov_b32_e32 v6, 15
897; VI-NEXT:    v_and_b32_sdwa v7, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
898; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
899; VI-NEXT:    v_lshrrev_b16_e32 v9, v7, v8
900; VI-NEXT:    v_sub_u16_e32 v7, 16, v7
901; VI-NEXT:    v_lshlrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
902; VI-NEXT:    v_or_b32_e32 v7, v7, v9
903; VI-NEXT:    v_bfe_u32 v9, v5, 16, 4
904; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
905; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
906; VI-NEXT:    v_and_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
907; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
908; VI-NEXT:    v_lshrrev_b16_e32 v9, v6, v8
909; VI-NEXT:    v_sub_u16_e32 v6, 16, v6
910; VI-NEXT:    v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
911; VI-NEXT:    v_or_b32_e32 v6, v6, v9
912; VI-NEXT:    v_bfe_u32 v9, v4, 16, 4
913; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v9
914; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
915; VI-NEXT:    v_and_b32_e32 v8, 15, v5
916; VI-NEXT:    v_lshrrev_b16_e32 v9, v8, v3
917; VI-NEXT:    v_sub_u16_e32 v8, 16, v8
918; VI-NEXT:    s_mov_b32 s4, 0xf000f
919; VI-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
920; VI-NEXT:    v_and_b32_e32 v5, s4, v5
921; VI-NEXT:    v_or_b32_e32 v1, v1, v9
922; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
923; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
924; VI-NEXT:    v_and_b32_e32 v3, 15, v4
925; VI-NEXT:    v_lshrrev_b16_e32 v5, v3, v2
926; VI-NEXT:    v_sub_u16_e32 v3, 16, v3
927; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
928; VI-NEXT:    v_and_b32_e32 v3, s4, v4
929; VI-NEXT:    v_or_b32_e32 v0, v0, v5
930; VI-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
931; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
932; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
933; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
934; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
935; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
936; VI-NEXT:    s_setpc_b64 s[30:31]
937;
938; GFX9-LABEL: v_fshr_v4i16:
939; GFX9:       ; %bb.0:
940; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
941; GFX9-NEXT:    v_mov_b32_e32 v7, 15
942; GFX9-NEXT:    v_and_b32_e32 v6, 15, v5
943; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffff
944; GFX9-NEXT:    v_and_b32_sdwa v8, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
945; GFX9-NEXT:    v_and_b32_e32 v6, v9, v6
946; GFX9-NEXT:    v_lshl_or_b32 v6, v8, 16, v6
947; GFX9-NEXT:    v_pk_lshrrev_b16 v8, v6, v3
948; GFX9-NEXT:    v_pk_sub_i16 v6, 16, v6 op_sel_hi:[0,1]
949; GFX9-NEXT:    s_mov_b32 s6, 0xf000f
950; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v6, v1
951; GFX9-NEXT:    v_and_b32_e32 v5, s6, v5
952; GFX9-NEXT:    v_or_b32_e32 v1, v1, v8
953; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v5
954; GFX9-NEXT:    v_mov_b32_e32 v8, 0
955; GFX9-NEXT:    v_cndmask_b32_e32 v6, v1, v3, vcc
956; GFX9-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v5, v8 src0_sel:WORD_1 src1_sel:DWORD
957; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
958; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
959; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[4:5]
960; GFX9-NEXT:    v_and_b32_e32 v3, 15, v4
961; GFX9-NEXT:    v_and_b32_sdwa v5, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
962; GFX9-NEXT:    v_and_b32_e32 v3, v9, v3
963; GFX9-NEXT:    v_lshl_or_b32 v3, v5, 16, v3
964; GFX9-NEXT:    v_pk_lshrrev_b16 v5, v3, v2
965; GFX9-NEXT:    v_pk_sub_i16 v3, 16, v3 op_sel_hi:[0,1]
966; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
967; GFX9-NEXT:    v_and_b32_e32 v3, s6, v4
968; GFX9-NEXT:    v_or_b32_e32 v0, v0, v5
969; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
970; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
971; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
972; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
973; GFX9-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v3, v8 src0_sel:WORD_1 src1_sel:DWORD
974; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
975; GFX9-NEXT:    v_and_b32_e32 v2, v9, v4
976; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
977; GFX9-NEXT:    v_and_b32_e32 v2, v9, v6
978; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
979; GFX9-NEXT:    s_setpc_b64 s[30:31]
980;
981; R600-LABEL: v_fshr_v4i16:
982; R600:       ; %bb.0:
983; R600-NEXT:    CF_END
984; R600-NEXT:    PAD
985  %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
986  ret <4 x i16> %ret
987}
988
; v_fshr_i64: funnel shift right on scalar i64. The IR body is a single call to
; @llvm.fshr.i64(%src0, %src1, %src2) whose result is returned.
;
; The SI / VI / GFX9 / R600 check blocks below hold the expected llc output for
; the tahiti, tonga, gfx900 and r600/redwood RUN lines at the top of this file.
; They are autogenerated by utils/update_llc_test_checks.py; regenerate them
; with that script instead of editing them by hand.
;
; What the checked code shows (same shape on SI, VI and GFX9): the shift amount
; in v4 is masked with 63, v[2:3] is shifted right by it and v[0:1] is shifted
; left by (64 - amount), the halves are or'ed, and a 64-bit compare of the
; masked amount against 0 (v5 is zeroed to form v[4:5]) drives two v_cndmask
; instructions that pick v2/v3 instead when the amount is zero.
; The R600 block only checks CF_END / PAD.
989define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
990; SI-LABEL: v_fshr_i64:
991; SI:       ; %bb.0:
992; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
993; SI-NEXT:    v_and_b32_e32 v4, 63, v4
994; SI-NEXT:    v_sub_i32_e32 v7, vcc, 64, v4
995; SI-NEXT:    v_lshr_b64 v[5:6], v[2:3], v4
996; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v7
997; SI-NEXT:    v_or_b32_e32 v0, v0, v5
998; SI-NEXT:    v_mov_b32_e32 v5, 0
999; SI-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
1000; SI-NEXT:    v_or_b32_e32 v1, v1, v6
1001; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1002; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1003; SI-NEXT:    s_setpc_b64 s[30:31]
1004;
1005; VI-LABEL: v_fshr_i64:
1006; VI:       ; %bb.0:
1007; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1008; VI-NEXT:    v_and_b32_e32 v4, 63, v4
1009; VI-NEXT:    v_sub_u32_e32 v7, vcc, 64, v4
1010; VI-NEXT:    v_lshrrev_b64 v[5:6], v4, v[2:3]
1011; VI-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
1012; VI-NEXT:    v_or_b32_e32 v0, v0, v5
1013; VI-NEXT:    v_mov_b32_e32 v5, 0
1014; VI-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
1015; VI-NEXT:    v_or_b32_e32 v1, v1, v6
1016; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1017; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1018; VI-NEXT:    s_setpc_b64 s[30:31]
1019;
1020; GFX9-LABEL: v_fshr_i64:
1021; GFX9:       ; %bb.0:
1022; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1023; GFX9-NEXT:    v_and_b32_e32 v4, 63, v4
1024; GFX9-NEXT:    v_sub_u32_e32 v7, 64, v4
1025; GFX9-NEXT:    v_lshrrev_b64 v[5:6], v4, v[2:3]
1026; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
1027; GFX9-NEXT:    v_or_b32_e32 v0, v0, v5
1028; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1029; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
1030; GFX9-NEXT:    v_or_b32_e32 v1, v1, v6
1031; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1032; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1033; GFX9-NEXT:    s_setpc_b64 s[30:31]
1034;
1035; R600-LABEL: v_fshr_i64:
1036; R600:       ; %bb.0:
1037; R600-NEXT:    CF_END
1038; R600-NEXT:    PAD
1039  %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
1040  ret i64 %ret
1041}
1042
; v_fshr_v2i64: funnel shift right on <2 x i64> vectors. The IR body is a
; single call to @llvm.fshr.v2i64(%src0, %src1, %src2) whose result is returned.
;
; The SI / VI / GFX9 / R600 check blocks below hold the expected llc output for
; the tahiti, tonga, gfx900 and r600/redwood RUN lines at the top of this file.
; They are autogenerated by utils/update_llc_test_checks.py; regenerate them
; with that script instead of editing them by hand.
;
; What the checked code shows: the same sequence is emitted once per element
; (amounts in v8 and v10): mask the amount with 63, shift one 64-bit pair right
; by it and the other left by (64 - amount), or the halves, then use a 64-bit
; compare of the masked amount against 0 with v_cndmask to pick v4/v5 or v6/v7
; when the amount is zero. v9 is zeroed once and reused as the high half of
; both compares.
; The R600 block only checks CF_END / PAD.
1043define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
1044; SI-LABEL: v_fshr_v2i64:
1045; SI:       ; %bb.0:
1046; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1047; SI-NEXT:    v_and_b32_e32 v8, 63, v8
1048; SI-NEXT:    v_sub_i32_e32 v9, vcc, 64, v8
1049; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v9
1050; SI-NEXT:    v_lshr_b64 v[11:12], v[4:5], v8
1051; SI-NEXT:    v_mov_b32_e32 v9, 0
1052; SI-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1053; SI-NEXT:    v_or_b32_e32 v0, v0, v11
1054; SI-NEXT:    v_and_b32_e32 v8, 63, v10
1055; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1056; SI-NEXT:    v_sub_i32_e64 v4, s[4:5], 64, v8
1057; SI-NEXT:    v_or_b32_e32 v1, v1, v12
1058; SI-NEXT:    v_lshr_b64 v[10:11], v[6:7], v8
1059; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v4
1060; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1061; SI-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1062; SI-NEXT:    v_or_b32_e32 v3, v3, v11
1063; SI-NEXT:    v_or_b32_e32 v2, v2, v10
1064; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1065; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
1066; SI-NEXT:    s_setpc_b64 s[30:31]
1067;
1068; VI-LABEL: v_fshr_v2i64:
1069; VI:       ; %bb.0:
1070; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1071; VI-NEXT:    v_and_b32_e32 v8, 63, v8
1072; VI-NEXT:    v_sub_u32_e32 v9, vcc, 64, v8
1073; VI-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
1074; VI-NEXT:    v_lshrrev_b64 v[11:12], v8, v[4:5]
1075; VI-NEXT:    v_mov_b32_e32 v9, 0
1076; VI-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1077; VI-NEXT:    v_or_b32_e32 v0, v0, v11
1078; VI-NEXT:    v_and_b32_e32 v8, 63, v10
1079; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1080; VI-NEXT:    v_sub_u32_e64 v4, s[4:5], 64, v8
1081; VI-NEXT:    v_or_b32_e32 v1, v1, v12
1082; VI-NEXT:    v_lshrrev_b64 v[10:11], v8, v[6:7]
1083; VI-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
1084; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1085; VI-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1086; VI-NEXT:    v_or_b32_e32 v3, v3, v11
1087; VI-NEXT:    v_or_b32_e32 v2, v2, v10
1088; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1089; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
1090; VI-NEXT:    s_setpc_b64 s[30:31]
1091;
1092; GFX9-LABEL: v_fshr_v2i64:
1093; GFX9:       ; %bb.0:
1094; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1095; GFX9-NEXT:    v_and_b32_e32 v8, 63, v8
1096; GFX9-NEXT:    v_sub_u32_e32 v9, 64, v8
1097; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
1098; GFX9-NEXT:    v_lshrrev_b64 v[11:12], v8, v[4:5]
1099; GFX9-NEXT:    v_mov_b32_e32 v9, 0
1100; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1101; GFX9-NEXT:    v_or_b32_e32 v0, v0, v11
1102; GFX9-NEXT:    v_and_b32_e32 v8, 63, v10
1103; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1104; GFX9-NEXT:    v_sub_u32_e32 v4, 64, v8
1105; GFX9-NEXT:    v_or_b32_e32 v1, v1, v12
1106; GFX9-NEXT:    v_lshrrev_b64 v[10:11], v8, v[6:7]
1107; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
1108; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1109; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
1110; GFX9-NEXT:    v_or_b32_e32 v3, v3, v11
1111; GFX9-NEXT:    v_or_b32_e32 v2, v2, v10
1112; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1113; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
1114; GFX9-NEXT:    s_setpc_b64 s[30:31]
1115;
1116; R600-LABEL: v_fshr_v2i64:
1117; R600:       ; %bb.0:
1118; R600-NEXT:    CF_END
1119; R600-NEXT:    PAD
1120  %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
1121  ret <2 x i64> %ret
1122}
1123
; v_fshr_i24: funnel shift right on the non-power-of-two type i24. The IR body
; is a single call to @llvm.fshr.i24(%src0, %src1, %src2) whose result is
; returned.
;
; The SI / VI / GFX9 / R600 check blocks below hold the expected llc output for
; the tahiti, tonga, gfx900 and r600/redwood RUN lines at the top of this file.
; They are autogenerated by utils/update_llc_test_checks.py; regenerate them
; with that script instead of editing them by hand.
;
; What the checked code shows: the shift amount is masked to 24 bits
; (0xffffff), then v_mul_hi_u32 by 0xaaaaaaab, a right shift by 4, a multiply
; by 24 and a subtract are applied to it.
; NOTE(review): that sequence looks like an unsigned remainder by 24 computed
; with a multiply-by-reciprocal -- inferred from the constants, confirm.
; The reduced amount is used for the right shift, (24 - amount) masked to 24
; bits for the left shift, and v_cmp_eq against 0 plus v_cndmask picks v1 when
; the reduced amount is zero.
; The R600 block only checks CF_END / PAD.
1124define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
1125; SI-LABEL: v_fshr_i24:
1126; SI:       ; %bb.0:
1127; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1128; SI-NEXT:    s_mov_b32 s4, 0xffffff
1129; SI-NEXT:    v_and_b32_e32 v2, s4, v2
1130; SI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
1131; SI-NEXT:    v_mul_hi_u32 v3, v2, s5
1132; SI-NEXT:    v_and_b32_e32 v4, s4, v1
1133; SI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1134; SI-NEXT:    v_mul_lo_u32 v3, v3, 24
1135; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
1136; SI-NEXT:    v_lshr_b32_e32 v3, v4, v2
1137; SI-NEXT:    v_sub_i32_e32 v4, vcc, 24, v2
1138; SI-NEXT:    v_and_b32_e32 v4, s4, v4
1139; SI-NEXT:    v_lshl_b32_e32 v0, v0, v4
1140; SI-NEXT:    v_or_b32_e32 v0, v0, v3
1141; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1142; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
1143; SI-NEXT:    s_setpc_b64 s[30:31]
1144;
1145; VI-LABEL: v_fshr_i24:
1146; VI:       ; %bb.0:
1147; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1148; VI-NEXT:    s_mov_b32 s4, 0xffffff
1149; VI-NEXT:    v_and_b32_e32 v2, s4, v2
1150; VI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
1151; VI-NEXT:    v_mul_hi_u32 v3, v2, s5
1152; VI-NEXT:    v_and_b32_e32 v4, s4, v1
1153; VI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1154; VI-NEXT:    v_mul_lo_u32 v3, v3, 24
1155; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
1156; VI-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
1157; VI-NEXT:    v_sub_u32_e32 v4, vcc, 24, v2
1158; VI-NEXT:    v_and_b32_e32 v4, s4, v4
1159; VI-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
1160; VI-NEXT:    v_or_b32_e32 v0, v0, v3
1161; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1162; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
1163; VI-NEXT:    s_setpc_b64 s[30:31]
1164;
1165; GFX9-LABEL: v_fshr_i24:
1166; GFX9:       ; %bb.0:
1167; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1168; GFX9-NEXT:    s_mov_b32 s4, 0xffffff
1169; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
1170; GFX9-NEXT:    s_mov_b32 s5, 0xaaaaaaab
1171; GFX9-NEXT:    v_mul_hi_u32 v3, v2, s5
1172; GFX9-NEXT:    v_and_b32_e32 v4, s4, v1
1173; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1174; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
1175; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
1176; GFX9-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
1177; GFX9-NEXT:    v_sub_u32_e32 v4, 24, v2
1178; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
1179; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v4, v3
1180; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1181; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
1182; GFX9-NEXT:    s_setpc_b64 s[30:31]
1183;
1184; R600-LABEL: v_fshr_i24:
1185; R600:       ; %bb.0:
1186; R600-NEXT:    CF_END
1187; R600-NEXT:    PAD
1188  %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
1189  ret i24 %ret
1190}
1191
; v_fshr_v2i24: funnel shift right on <2 x i24> vectors. The IR body is a
; single call to @llvm.fshr.v2i24(%src0, %src1, %src2) whose result is returned.
;
; The SI / VI / GFX9 / R600 check blocks below hold the expected llc output for
; the tahiti, tonga, gfx900 and r600/redwood RUN lines at the top of this file.
; They are autogenerated by utils/update_llc_test_checks.py; regenerate them
; with that script instead of editing them by hand.
;
; What the checked code shows: six dwords are read with buffer_load_dword at
; s32-relative offsets 0..20, and the result is written with byte/short
; buffer stores addressed through v0 (offsets 0, 2, 3, 4 and 5).
; NOTE(review): this suggests the operands are passed in memory and the result
; is returned through a pointer in v0 -- confirm against the calling convention.
; Per element the arithmetic matches the scalar i24 test: mask to 0xffffff,
; v_mul_hi_u32 by 0xaaaaaaab, shift right by 4, multiply by 24 and subtract,
; then shift/or and a v_cmp_eq-against-0 / v_cndmask select.
; The R600 block only checks CF_END / PAD.
1192define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
1193; SI-LABEL: v_fshr_v2i24:
1194; SI:       ; %bb.0:
1195; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1196; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
1197; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
1198; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
1199; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
1200; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32
1201; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
1202; SI-NEXT:    s_mov_b32 s4, 0xffffff
1203; SI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
1204; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v0
1205; SI-NEXT:    v_add_i32_e32 v8, vcc, 4, v0
1206; SI-NEXT:    v_add_i32_e32 v9, vcc, 5, v0
1207; SI-NEXT:    v_add_i32_e32 v10, vcc, 2, v0
1208; SI-NEXT:    s_waitcnt vmcnt(5)
1209; SI-NEXT:    v_and_b32_e32 v14, s4, v1
1210; SI-NEXT:    s_waitcnt vmcnt(4)
1211; SI-NEXT:    v_and_b32_e32 v2, s4, v2
1212; SI-NEXT:    v_mul_hi_u32 v12, v2, s5
1213; SI-NEXT:    s_waitcnt vmcnt(3)
1214; SI-NEXT:    v_and_b32_e32 v3, s4, v3
1215; SI-NEXT:    v_mul_hi_u32 v13, v3, s5
1216; SI-NEXT:    s_waitcnt vmcnt(2)
1217; SI-NEXT:    v_and_b32_e32 v11, s4, v4
1218; SI-NEXT:    v_lshrrev_b32_e32 v12, 4, v12
1219; SI-NEXT:    v_mul_lo_u32 v12, v12, 24
1220; SI-NEXT:    v_lshrrev_b32_e32 v13, 4, v13
1221; SI-NEXT:    v_mul_lo_u32 v13, v13, 24
1222; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v12
1223; SI-NEXT:    v_lshr_b32_e32 v12, v14, v2
1224; SI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v13
1225; SI-NEXT:    v_sub_i32_e32 v13, vcc, 24, v2
1226; SI-NEXT:    v_sub_i32_e32 v14, vcc, 24, v3
1227; SI-NEXT:    v_and_b32_e32 v13, s4, v13
1228; SI-NEXT:    s_waitcnt vmcnt(1)
1229; SI-NEXT:    v_lshl_b32_e32 v5, v5, v13
1230; SI-NEXT:    v_and_b32_e32 v14, 0xffffff, v14
1231; SI-NEXT:    v_lshr_b32_e32 v11, v11, v3
1232; SI-NEXT:    s_waitcnt vmcnt(0)
1233; SI-NEXT:    v_lshl_b32_e32 v6, v6, v14
1234; SI-NEXT:    v_or_b32_e32 v5, v5, v12
1235; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1236; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
1237; SI-NEXT:    v_or_b32_e32 v6, v6, v11
1238; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1239; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
1240; SI-NEXT:    buffer_store_byte v2, v7, s[0:3], 0 offen
1241; SI-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
1242; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
1243; SI-NEXT:    s_waitcnt expcnt(1)
1244; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1245; SI-NEXT:    s_waitcnt expcnt(0)
1246; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1247; SI-NEXT:    buffer_store_byte v0, v8, s[0:3], 0 offen
1248; SI-NEXT:    buffer_store_byte v2, v9, s[0:3], 0 offen
1249; SI-NEXT:    buffer_store_byte v1, v10, s[0:3], 0 offen
1250; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1251; SI-NEXT:    s_setpc_b64 s[30:31]
1252;
1253; VI-LABEL: v_fshr_v2i24:
1254; VI:       ; %bb.0:
1255; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1256; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
1257; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
1258; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
1259; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
1260; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32
1261; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
1262; VI-NEXT:    s_mov_b32 s4, 0xffffff
1263; VI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
1264; VI-NEXT:    v_add_u32_e32 v7, vcc, 3, v0
1265; VI-NEXT:    v_add_u32_e32 v8, vcc, 4, v0
1266; VI-NEXT:    v_add_u32_e32 v9, vcc, 5, v0
1267; VI-NEXT:    v_add_u32_e32 v10, vcc, 2, v0
1268; VI-NEXT:    s_waitcnt vmcnt(5)
1269; VI-NEXT:    v_and_b32_e32 v14, s4, v1
1270; VI-NEXT:    s_waitcnt vmcnt(4)
1271; VI-NEXT:    v_and_b32_e32 v2, s4, v2
1272; VI-NEXT:    v_mul_hi_u32 v12, v2, s5
1273; VI-NEXT:    s_waitcnt vmcnt(3)
1274; VI-NEXT:    v_and_b32_e32 v3, s4, v3
1275; VI-NEXT:    v_mul_hi_u32 v13, v3, s5
1276; VI-NEXT:    s_waitcnt vmcnt(2)
1277; VI-NEXT:    v_and_b32_e32 v11, s4, v4
1278; VI-NEXT:    v_lshrrev_b32_e32 v12, 4, v12
1279; VI-NEXT:    v_mul_lo_u32 v12, v12, 24
1280; VI-NEXT:    v_lshrrev_b32_e32 v13, 4, v13
1281; VI-NEXT:    v_mul_lo_u32 v13, v13, 24
1282; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v12
1283; VI-NEXT:    v_lshrrev_b32_e32 v12, v2, v14
1284; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v13
1285; VI-NEXT:    v_sub_u32_e32 v13, vcc, 24, v2
1286; VI-NEXT:    v_sub_u32_e32 v14, vcc, 24, v3
1287; VI-NEXT:    v_and_b32_e32 v13, s4, v13
1288; VI-NEXT:    s_waitcnt vmcnt(1)
1289; VI-NEXT:    v_lshlrev_b32_e32 v5, v13, v5
1290; VI-NEXT:    v_and_b32_e32 v14, 0xffffff, v14
1291; VI-NEXT:    v_lshrrev_b32_e32 v11, v3, v11
1292; VI-NEXT:    s_waitcnt vmcnt(0)
1293; VI-NEXT:    v_lshlrev_b32_e32 v6, v14, v6
1294; VI-NEXT:    v_or_b32_e32 v5, v5, v12
1295; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1296; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
1297; VI-NEXT:    v_or_b32_e32 v6, v6, v11
1298; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1299; VI-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
1300; VI-NEXT:    buffer_store_byte v2, v7, s[0:3], 0 offen
1301; VI-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
1302; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
1303; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1304; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1305; VI-NEXT:    buffer_store_byte v0, v8, s[0:3], 0 offen
1306; VI-NEXT:    buffer_store_byte v2, v9, s[0:3], 0 offen
1307; VI-NEXT:    buffer_store_byte v1, v10, s[0:3], 0 offen
1308; VI-NEXT:    s_waitcnt vmcnt(0)
1309; VI-NEXT:    s_setpc_b64 s[30:31]
1310;
1311; GFX9-LABEL: v_fshr_v2i24:
1312; GFX9:       ; %bb.0:
1313; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1314; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
1315; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
1316; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
1317; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
1318; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32
1319; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:4
1320; GFX9-NEXT:    s_mov_b32 s4, 0xffffff
1321; GFX9-NEXT:    s_mov_b32 s5, 0xaaaaaaab
1322; GFX9-NEXT:    s_waitcnt vmcnt(5)
1323; GFX9-NEXT:    v_and_b32_e32 v10, s4, v1
1324; GFX9-NEXT:    s_waitcnt vmcnt(4)
1325; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
1326; GFX9-NEXT:    v_mul_hi_u32 v6, v2, s5
1327; GFX9-NEXT:    s_waitcnt vmcnt(3)
1328; GFX9-NEXT:    v_and_b32_e32 v3, s4, v3
1329; GFX9-NEXT:    v_mul_hi_u32 v7, v3, s5
1330; GFX9-NEXT:    s_waitcnt vmcnt(2)
1331; GFX9-NEXT:    v_and_b32_e32 v9, s4, v4
1332; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1333; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
1334; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
1335; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
1336; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v6
1337; GFX9-NEXT:    v_lshrrev_b32_e32 v6, v2, v10
1338; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v7
1339; GFX9-NEXT:    v_sub_u32_e32 v7, 24, v2
1340; GFX9-NEXT:    v_sub_u32_e32 v10, 24, v3
1341; GFX9-NEXT:    v_and_b32_e32 v7, s4, v7
1342; GFX9-NEXT:    v_lshrrev_b32_e32 v9, v3, v9
1343; GFX9-NEXT:    v_and_b32_e32 v10, 0xffffff, v10
1344; GFX9-NEXT:    s_waitcnt vmcnt(1)
1345; GFX9-NEXT:    v_lshl_or_b32 v5, v5, v7, v6
1346; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1347; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
1348; GFX9-NEXT:    s_waitcnt vmcnt(0)
1349; GFX9-NEXT:    v_lshl_or_b32 v6, v8, v10, v9
1350; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1351; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
1352; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
1353; GFX9-NEXT:    buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
1354; GFX9-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
1355; GFX9-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
1356; GFX9-NEXT:    buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
1357; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
1358; GFX9-NEXT:    s_waitcnt vmcnt(0)
1359; GFX9-NEXT:    s_setpc_b64 s[30:31]
1360;
1361; R600-LABEL: v_fshr_v2i24:
1362; R600:       ; %bb.0:
1363; R600-NEXT:    CF_END
1364; R600-NEXT:    PAD
1365  %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
1366  ret <2 x i24> %ret
1367}
1368