1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9
5; RUN: llc < %s -march=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s --check-prefix=R600
6; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10
7; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11
8
; Declarations of the llvm.fshr (funnel shift right) intrinsic overloads made
; available to the test functions in this file: scalar and vector forms over
; i32, i16, i64 and i24 element types. Each overload takes three operands of
; the same type and returns that type.
9declare i32 @llvm.fshr.i32(i32, i32, i32)
10declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
11declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
12declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
13declare i16 @llvm.fshr.i16(i16, i16, i16)
14declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
15declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
16declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
17declare i64 @llvm.fshr.i64(i64, i64, i64)
18declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
19declare i24 @llvm.fshr.i24(i24, i24, i24)
20declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
21
; Kernel test for scalar i32 llvm.fshr where %x, %y and the shift amount %z are
; all kernel arguments; the result is stored through the global pointer %in.
; The expected output below is a single v_alignbit_b32 on every amdgcn run and
; a single BIT_ALIGN_INT on the r600 run. The tahiti/tonga/gfx900 runs first
; copy %y and %z into VGPRs, while the gfx1010/gfx1100 runs pass %y as an SGPR
; operand directly.
22define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
23; SI-LABEL: fshr_i32:
24; SI:       ; %bb.0: ; %entry
25; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
26; SI-NEXT:    s_load_dword s6, s[0:1], 0xd
27; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
28; SI-NEXT:    s_mov_b32 s3, 0xf000
29; SI-NEXT:    s_mov_b32 s2, -1
30; SI-NEXT:    s_waitcnt lgkmcnt(0)
31; SI-NEXT:    v_mov_b32_e32 v0, s5
32; SI-NEXT:    v_mov_b32_e32 v1, s6
33; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v1
34; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
35; SI-NEXT:    s_endpgm
36;
37; VI-LABEL: fshr_i32:
38; VI:       ; %bb.0: ; %entry
39; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
40; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
41; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
42; VI-NEXT:    s_waitcnt lgkmcnt(0)
43; VI-NEXT:    v_mov_b32_e32 v0, s3
44; VI-NEXT:    v_mov_b32_e32 v1, s4
45; VI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
46; VI-NEXT:    v_mov_b32_e32 v0, s0
47; VI-NEXT:    v_mov_b32_e32 v1, s1
48; VI-NEXT:    flat_store_dword v[0:1], v2
49; VI-NEXT:    s_endpgm
50;
51; GFX9-LABEL: fshr_i32:
52; GFX9:       ; %bb.0: ; %entry
53; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
54; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x34
55; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
56; GFX9-NEXT:    v_mov_b32_e32 v0, 0
57; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX9-NEXT:    v_mov_b32_e32 v1, s3
59; GFX9-NEXT:    v_mov_b32_e32 v2, s6
60; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, v2
61; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
62; GFX9-NEXT:    s_endpgm
63;
64; R600-LABEL: fshr_i32:
65; R600:       ; %bb.0: ; %entry
66; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
67; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
68; R600-NEXT:    CF_END
69; R600-NEXT:    PAD
70; R600-NEXT:    ALU clause starting at 4:
71; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
72; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
73; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
74;
75; GFX10-LABEL: fshr_i32:
76; GFX10:       ; %bb.0: ; %entry
77; GFX10-NEXT:    s_clause 0x2
78; GFX10-NEXT:    s_load_dword s6, s[0:1], 0x34
79; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
80; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
81; GFX10-NEXT:    v_mov_b32_e32 v1, 0
82; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX10-NEXT:    v_mov_b32_e32 v0, s6
84; GFX10-NEXT:    v_alignbit_b32 v0, s2, s3, v0
85; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
86; GFX10-NEXT:    s_endpgm
87;
88; GFX11-LABEL: fshr_i32:
89; GFX11:       ; %bb.0: ; %entry
90; GFX11-NEXT:    s_clause 0x2
91; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x34
92; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
93; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
94; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
96; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
97; GFX11-NEXT:    v_alignbit_b32 v0, s2, s3, v0
98; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
99; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
100; GFX11-NEXT:    s_endpgm
101entry:
102  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
103  store i32 %0, i32 addrspace(1)* %in
104  ret void
105}
106
; Kernel test for scalar i32 llvm.fshr with a constant shift amount of 7.
; %x and %y are kernel arguments and the result is stored through %in.
; The expected output below encodes the 7 directly as the last operand of
; v_alignbit_b32 on the amdgcn runs, and as a literal operand of BIT_ALIGN_INT
; on the r600 run.
107define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
108; SI-LABEL: fshr_i32_imm:
109; SI:       ; %bb.0: ; %entry
110; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
111; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
112; SI-NEXT:    s_mov_b32 s3, 0xf000
113; SI-NEXT:    s_mov_b32 s2, -1
114; SI-NEXT:    s_waitcnt lgkmcnt(0)
115; SI-NEXT:    v_mov_b32_e32 v0, s5
116; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 7
117; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
118; SI-NEXT:    s_endpgm
119;
120; VI-LABEL: fshr_i32_imm:
121; VI:       ; %bb.0: ; %entry
122; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
123; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
124; VI-NEXT:    s_waitcnt lgkmcnt(0)
125; VI-NEXT:    v_mov_b32_e32 v0, s3
126; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 7
127; VI-NEXT:    v_mov_b32_e32 v0, s0
128; VI-NEXT:    v_mov_b32_e32 v1, s1
129; VI-NEXT:    flat_store_dword v[0:1], v2
130; VI-NEXT:    s_endpgm
131;
132; GFX9-LABEL: fshr_i32_imm:
133; GFX9:       ; %bb.0: ; %entry
134; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
135; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
136; GFX9-NEXT:    v_mov_b32_e32 v0, 0
137; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
138; GFX9-NEXT:    v_mov_b32_e32 v1, s3
139; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 7
140; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
141; GFX9-NEXT:    s_endpgm
142;
143; R600-LABEL: fshr_i32_imm:
144; R600:       ; %bb.0: ; %entry
145; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
146; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
147; R600-NEXT:    CF_END
148; R600-NEXT:    PAD
149; R600-NEXT:    ALU clause starting at 4:
150; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
151; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
152; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
153; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
154;
155; GFX10-LABEL: fshr_i32_imm:
156; GFX10:       ; %bb.0: ; %entry
157; GFX10-NEXT:    s_clause 0x1
158; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
159; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
160; GFX10-NEXT:    v_mov_b32_e32 v0, 0
161; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
162; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 7
163; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
164; GFX10-NEXT:    s_endpgm
165;
166; GFX11-LABEL: fshr_i32_imm:
167; GFX11:       ; %bb.0: ; %entry
168; GFX11-NEXT:    s_clause 0x1
169; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
170; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
171; GFX11-NEXT:    v_mov_b32_e32 v0, 0
172; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
173; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 7
174; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
175; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
176; GFX11-NEXT:    s_endpgm
177entry:
178  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
179  store i32 %0, i32 addrspace(1)* %in
180  ret void
181}
182
; Kernel test for <2 x i32> llvm.fshr with all three vector operands passed as
; kernel arguments; the result vector is stored through %in.
; The expected output below contains one v_alignbit_b32 per element on the
; amdgcn runs (two in total) and one BIT_ALIGN_INT per element on the r600 run.
183define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
184; SI-LABEL: fshr_v2i32:
185; SI:       ; %bb.0: ; %entry
186; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
187; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xf
188; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
189; SI-NEXT:    s_mov_b32 s3, 0xf000
190; SI-NEXT:    s_mov_b32 s2, -1
191; SI-NEXT:    s_waitcnt lgkmcnt(0)
192; SI-NEXT:    v_mov_b32_e32 v0, s7
193; SI-NEXT:    v_mov_b32_e32 v1, s9
194; SI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
195; SI-NEXT:    v_mov_b32_e32 v0, s6
196; SI-NEXT:    v_mov_b32_e32 v2, s8
197; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v2
198; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
199; SI-NEXT:    s_endpgm
200;
201; VI-LABEL: fshr_v2i32:
202; VI:       ; %bb.0: ; %entry
203; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
204; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
205; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
206; VI-NEXT:    s_waitcnt lgkmcnt(0)
207; VI-NEXT:    v_mov_b32_e32 v0, s7
208; VI-NEXT:    v_mov_b32_e32 v1, s3
209; VI-NEXT:    v_mov_b32_e32 v2, s6
210; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
211; VI-NEXT:    v_mov_b32_e32 v0, s2
212; VI-NEXT:    v_alignbit_b32 v0, s4, v2, v0
213; VI-NEXT:    v_mov_b32_e32 v3, s1
214; VI-NEXT:    v_mov_b32_e32 v2, s0
215; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
216; VI-NEXT:    s_endpgm
217;
218; GFX9-LABEL: fshr_v2i32:
219; GFX9:       ; %bb.0: ; %entry
220; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
221; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
222; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
223; GFX9-NEXT:    v_mov_b32_e32 v2, 0
224; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX9-NEXT:    v_mov_b32_e32 v0, s7
226; GFX9-NEXT:    v_mov_b32_e32 v1, s3
227; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
228; GFX9-NEXT:    v_mov_b32_e32 v0, s6
229; GFX9-NEXT:    v_mov_b32_e32 v3, s2
230; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v3
231; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
232; GFX9-NEXT:    s_endpgm
233;
234; R600-LABEL: fshr_v2i32:
235; R600:       ; %bb.0: ; %entry
236; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
237; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
238; R600-NEXT:    CF_END
239; R600-NEXT:    PAD
240; R600-NEXT:    ALU clause starting at 4:
241; R600-NEXT:     MOV * T0.W, KC0[4].X,
242; R600-NEXT:     BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
243; R600-NEXT:     MOV * T0.W, KC0[3].W,
244; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
245; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
246; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
247;
248; GFX10-LABEL: fshr_v2i32:
249; GFX10:       ; %bb.0: ; %entry
250; GFX10-NEXT:    s_clause 0x2
251; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
252; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
253; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
254; GFX10-NEXT:    v_mov_b32_e32 v3, 0
255; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX10-NEXT:    v_mov_b32_e32 v0, s3
257; GFX10-NEXT:    v_mov_b32_e32 v2, s2
258; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, v0
259; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, v2
260; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[8:9]
261; GFX10-NEXT:    s_endpgm
262;
263; GFX11-LABEL: fshr_v2i32:
264; GFX11:       ; %bb.0: ; %entry
265; GFX11-NEXT:    s_clause 0x2
266; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x3c
267; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x2c
268; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
269; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
270; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
271; GFX11-NEXT:    v_mov_b32_e32 v2, s2
272; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
273; GFX11-NEXT:    v_alignbit_b32 v1, s5, s7, v0
274; GFX11-NEXT:    v_alignbit_b32 v0, s4, s6, v2
275; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
276; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
277; GFX11-NEXT:    s_endpgm
278entry:
279  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
280  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
281  ret void
282}
283
; Kernel test for <2 x i32> llvm.fshr with the constant per-element shift
; amounts <7, 9>. %x and %y are kernel arguments and the result vector is
; stored through %in.
; The expected output below uses 7 for element 0 and 9 for element 1 as the
; last operand of each v_alignbit_b32 (amdgcn runs) or as a literal operand of
; each BIT_ALIGN_INT (r600 run).
284define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
285; SI-LABEL: fshr_v2i32_imm:
286; SI:       ; %bb.0: ; %entry
287; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
288; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
289; SI-NEXT:    s_mov_b32 s3, 0xf000
290; SI-NEXT:    s_mov_b32 s2, -1
291; SI-NEXT:    s_waitcnt lgkmcnt(0)
292; SI-NEXT:    v_mov_b32_e32 v0, s7
293; SI-NEXT:    v_mov_b32_e32 v2, s6
294; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
295; SI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
296; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
297; SI-NEXT:    s_endpgm
298;
299; VI-LABEL: fshr_v2i32_imm:
300; VI:       ; %bb.0: ; %entry
301; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
302; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
303; VI-NEXT:    s_waitcnt lgkmcnt(0)
304; VI-NEXT:    v_mov_b32_e32 v0, s7
305; VI-NEXT:    v_mov_b32_e32 v2, s6
306; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
307; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
308; VI-NEXT:    v_mov_b32_e32 v3, s1
309; VI-NEXT:    v_mov_b32_e32 v2, s0
310; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
311; VI-NEXT:    s_endpgm
312;
313; GFX9-LABEL: fshr_v2i32_imm:
314; GFX9:       ; %bb.0: ; %entry
315; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
316; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
317; GFX9-NEXT:    v_mov_b32_e32 v2, 0
318; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
319; GFX9-NEXT:    v_mov_b32_e32 v0, s7
320; GFX9-NEXT:    v_mov_b32_e32 v3, s6
321; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
322; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 7
323; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
324; GFX9-NEXT:    s_endpgm
325;
326; R600-LABEL: fshr_v2i32_imm:
327; R600:       ; %bb.0: ; %entry
328; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
329; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
330; R600-NEXT:    CF_END
331; R600-NEXT:    PAD
332; R600-NEXT:    ALU clause starting at 4:
333; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
334; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
335; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
336; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
337; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
338; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
339;
340; GFX10-LABEL: fshr_v2i32_imm:
341; GFX10:       ; %bb.0: ; %entry
342; GFX10-NEXT:    s_clause 0x1
343; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
344; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
345; GFX10-NEXT:    v_mov_b32_e32 v2, 0
346; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
347; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, 9
348; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, 7
349; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
350; GFX10-NEXT:    s_endpgm
351;
352; GFX11-LABEL: fshr_v2i32_imm:
353; GFX11:       ; %bb.0: ; %entry
354; GFX11-NEXT:    s_clause 0x1
355; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x2c
356; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
357; GFX11-NEXT:    v_mov_b32_e32 v2, 0
358; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
359; GFX11-NEXT:    v_alignbit_b32 v1, s5, s7, 9
360; GFX11-NEXT:    v_alignbit_b32 v0, s4, s6, 7
361; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
362; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
363; GFX11-NEXT:    s_endpgm
364entry:
365  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
366  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
367  ret void
368}
369
; Kernel test for <4 x i32> llvm.fshr with all three vector operands passed as
; kernel arguments; the result vector is stored through %in.
; The expected output below contains four v_alignbit_b32 instructions on the
; amdgcn runs and four BIT_ALIGN_INT instructions on the r600 run, one per
; element.
370define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
371; SI-LABEL: fshr_v4i32:
372; SI:       ; %bb.0: ; %entry
373; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
374; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x15
375; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
376; SI-NEXT:    s_mov_b32 s3, 0xf000
377; SI-NEXT:    s_mov_b32 s2, -1
378; SI-NEXT:    s_waitcnt lgkmcnt(0)
379; SI-NEXT:    v_mov_b32_e32 v0, s11
380; SI-NEXT:    v_mov_b32_e32 v1, s15
381; SI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
382; SI-NEXT:    v_mov_b32_e32 v0, s10
383; SI-NEXT:    v_mov_b32_e32 v1, s14
384; SI-NEXT:    v_alignbit_b32 v2, s6, v0, v1
385; SI-NEXT:    v_mov_b32_e32 v0, s9
386; SI-NEXT:    v_mov_b32_e32 v1, s13
387; SI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
388; SI-NEXT:    v_mov_b32_e32 v0, s8
389; SI-NEXT:    v_mov_b32_e32 v4, s12
390; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
391; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
392; SI-NEXT:    s_endpgm
393;
394; VI-LABEL: fshr_v4i32:
395; VI:       ; %bb.0: ; %entry
396; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
397; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
398; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
399; VI-NEXT:    s_waitcnt lgkmcnt(0)
400; VI-NEXT:    v_mov_b32_e32 v0, s11
401; VI-NEXT:    v_mov_b32_e32 v1, s15
402; VI-NEXT:    v_mov_b32_e32 v2, s10
403; VI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
404; VI-NEXT:    v_mov_b32_e32 v0, s14
405; VI-NEXT:    v_alignbit_b32 v2, s6, v2, v0
406; VI-NEXT:    v_mov_b32_e32 v0, s9
407; VI-NEXT:    v_mov_b32_e32 v1, s13
408; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
409; VI-NEXT:    v_mov_b32_e32 v0, s8
410; VI-NEXT:    v_mov_b32_e32 v4, s12
411; VI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
412; VI-NEXT:    v_mov_b32_e32 v5, s1
413; VI-NEXT:    v_mov_b32_e32 v4, s0
414; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
415; VI-NEXT:    s_endpgm
416;
417; GFX9-LABEL: fshr_v4i32:
418; GFX9:       ; %bb.0: ; %entry
419; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
420; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
421; GFX9-NEXT:    v_mov_b32_e32 v4, 0
422; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
423; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
424; GFX9-NEXT:    v_mov_b32_e32 v0, s11
425; GFX9-NEXT:    v_mov_b32_e32 v1, s15
426; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, v1
427; GFX9-NEXT:    v_mov_b32_e32 v0, s10
428; GFX9-NEXT:    v_mov_b32_e32 v1, s14
429; GFX9-NEXT:    v_alignbit_b32 v2, s6, v0, v1
430; GFX9-NEXT:    v_mov_b32_e32 v0, s9
431; GFX9-NEXT:    v_mov_b32_e32 v1, s13
432; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
433; GFX9-NEXT:    v_mov_b32_e32 v0, s8
434; GFX9-NEXT:    v_mov_b32_e32 v5, s12
435; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v5
436; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
437; GFX9-NEXT:    s_endpgm
438;
439; R600-LABEL: fshr_v4i32:
440; R600:       ; %bb.0: ; %entry
441; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
442; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
443; R600-NEXT:    CF_END
444; R600-NEXT:    PAD
445; R600-NEXT:    ALU clause starting at 4:
446; R600-NEXT:     MOV * T0.W, KC0[6].X,
447; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
448; R600-NEXT:     MOV * T1.W, KC0[5].W,
449; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
450; R600-NEXT:     MOV * T1.W, KC0[5].Z,
451; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
452; R600-NEXT:     MOV * T1.W, KC0[5].Y,
453; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
454; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
455; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
456;
457; GFX10-LABEL: fshr_v4i32:
458; GFX10:       ; %bb.0: ; %entry
459; GFX10-NEXT:    s_clause 0x2
460; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
461; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
462; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
463; GFX10-NEXT:    v_mov_b32_e32 v6, 0
464; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
465; GFX10-NEXT:    v_mov_b32_e32 v0, s15
466; GFX10-NEXT:    v_mov_b32_e32 v1, s14
467; GFX10-NEXT:    v_mov_b32_e32 v4, s13
468; GFX10-NEXT:    v_mov_b32_e32 v5, s12
469; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, v0
470; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, v1
471; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, v4
472; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, v5
473; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[2:3]
474; GFX10-NEXT:    s_endpgm
475;
476; GFX11-LABEL: fshr_v4i32:
477; GFX11:       ; %bb.0: ; %entry
478; GFX11-NEXT:    s_clause 0x2
479; GFX11-NEXT:    s_load_b128 s[12:15], s[0:1], 0x54
480; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x34
481; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
482; GFX11-NEXT:    v_mov_b32_e32 v6, 0
483; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
484; GFX11-NEXT:    v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14
485; GFX11-NEXT:    v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v5, s12
486; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
487; GFX11-NEXT:    v_alignbit_b32 v3, s7, s11, v0
488; GFX11-NEXT:    v_alignbit_b32 v2, s6, s10, v1
489; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
490; GFX11-NEXT:    v_alignbit_b32 v1, s5, s9, v4
491; GFX11-NEXT:    v_alignbit_b32 v0, s4, s8, v5
492; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
493; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
494; GFX11-NEXT:    s_endpgm
495entry:
496  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
497  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
498  ret void
499}
500
; Kernel test for <4 x i32> llvm.fshr with the constant per-element shift
; amounts <1, 7, 9, 33>. %x and %y are kernel arguments and the result vector
; is stored through %in.
; The last amount (33) is larger than the 32-bit element width; the expected
; output below shifts that element by 1, the same immediate used for element 0,
; so this case covers an out-of-range constant being reduced to the element
; width. Elements 1 and 2 use 7 and 9 respectively.
501define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
502; SI-LABEL: fshr_v4i32_imm:
503; SI:       ; %bb.0: ; %entry
504; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
505; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
506; SI-NEXT:    s_mov_b32 s3, 0xf000
507; SI-NEXT:    s_mov_b32 s2, -1
508; SI-NEXT:    s_waitcnt lgkmcnt(0)
509; SI-NEXT:    v_mov_b32_e32 v0, s11
510; SI-NEXT:    v_mov_b32_e32 v1, s10
511; SI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
512; SI-NEXT:    v_mov_b32_e32 v0, s9
513; SI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
514; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 7
515; SI-NEXT:    v_mov_b32_e32 v0, s8
516; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
517; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
518; SI-NEXT:    s_endpgm
519;
520; VI-LABEL: fshr_v4i32_imm:
521; VI:       ; %bb.0: ; %entry
522; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
523; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
524; VI-NEXT:    s_waitcnt lgkmcnt(0)
525; VI-NEXT:    v_mov_b32_e32 v0, s11
526; VI-NEXT:    v_mov_b32_e32 v1, s10
527; VI-NEXT:    v_mov_b32_e32 v4, s9
528; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
529; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
530; VI-NEXT:    v_alignbit_b32 v1, s5, v4, 7
531; VI-NEXT:    v_mov_b32_e32 v0, s8
532; VI-NEXT:    v_mov_b32_e32 v5, s1
533; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
534; VI-NEXT:    v_mov_b32_e32 v4, s0
535; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
536; VI-NEXT:    s_endpgm
537;
538; GFX9-LABEL: fshr_v4i32_imm:
539; GFX9:       ; %bb.0: ; %entry
540; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
541; GFX9-NEXT:    v_mov_b32_e32 v4, 0
542; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
543; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
544; GFX9-NEXT:    v_mov_b32_e32 v0, s11
545; GFX9-NEXT:    v_mov_b32_e32 v1, s10
546; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 1
547; GFX9-NEXT:    v_mov_b32_e32 v0, s9
548; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 9
549; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 7
550; GFX9-NEXT:    v_mov_b32_e32 v0, s8
551; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
552; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
553; GFX9-NEXT:    s_endpgm
554;
555; R600-LABEL: fshr_v4i32_imm:
556; R600:       ; %bb.0: ; %entry
557; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
558; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
559; R600-NEXT:    CF_END
560; R600-NEXT:    PAD
561; R600-NEXT:    ALU clause starting at 4:
562; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
563; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
564; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
565; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
566; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
567; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
568; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
569; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
570;
571; GFX10-LABEL: fshr_v4i32_imm:
572; GFX10:       ; %bb.0: ; %entry
573; GFX10-NEXT:    s_clause 0x1
574; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
575; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
576; GFX10-NEXT:    v_mov_b32_e32 v4, 0
577; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
578; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, 1
579; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, 9
580; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, 7
581; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, 1
582; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
583; GFX10-NEXT:    s_endpgm
584;
585; GFX11-LABEL: fshr_v4i32_imm:
586; GFX11:       ; %bb.0: ; %entry
587; GFX11-NEXT:    s_clause 0x1
588; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x34
589; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
590; GFX11-NEXT:    v_mov_b32_e32 v4, 0
591; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
592; GFX11-NEXT:    v_alignbit_b32 v3, s7, s11, 1
593; GFX11-NEXT:    v_alignbit_b32 v2, s6, s10, 9
594; GFX11-NEXT:    v_alignbit_b32 v1, s5, s9, 7
595; GFX11-NEXT:    v_alignbit_b32 v0, s4, s8, 1
596; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
597; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
598; GFX11-NEXT:    s_endpgm
599entry:
600  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
601  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
602  ret void
603}
604
; Non-kernel function test for scalar i32 llvm.fshr: the three operands are
; ordinary function arguments and the result is returned by value.
; The expected output below is a single v_alignbit_b32 on v0, v1, v2 for every
; amdgcn run (tahiti, tonga and gfx900 share one set of checks). For the r600
; run the checks contain only CF_END and PAD.
605define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
606; GFX89-LABEL: v_fshr_i32:
607; GFX89:       ; %bb.0:
608; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
609; GFX89-NEXT:    v_alignbit_b32 v0, v0, v1, v2
610; GFX89-NEXT:    s_setpc_b64 s[30:31]
611;
612; R600-LABEL: v_fshr_i32:
613; R600:       ; %bb.0:
614; R600-NEXT:    CF_END
615; R600-NEXT:    PAD
616;
617; GFX10-LABEL: v_fshr_i32:
618; GFX10:       ; %bb.0:
619; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
621; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
622; GFX10-NEXT:    s_setpc_b64 s[30:31]
623;
624; GFX11-LABEL: v_fshr_i32:
625; GFX11:       ; %bb.0:
626; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
627; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
628; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, v2
629; GFX11-NEXT:    s_setpc_b64 s[30:31]
630  %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
631  ret i32 %ret
632}
633
; Non-kernel function test for <2 x i32> llvm.fshr: the three vector operands
; are ordinary function arguments and the result vector is returned by value.
; The expected output below is two v_alignbit_b32 instructions (one per
; element) on every amdgcn run. For the r600 run the checks contain only
; CF_END and PAD.
634define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
635; GFX89-LABEL: v_fshr_v2i32:
636; GFX89:       ; %bb.0:
637; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
638; GFX89-NEXT:    v_alignbit_b32 v0, v0, v2, v4
639; GFX89-NEXT:    v_alignbit_b32 v1, v1, v3, v5
640; GFX89-NEXT:    s_setpc_b64 s[30:31]
641;
642; R600-LABEL: v_fshr_v2i32:
643; R600:       ; %bb.0:
644; R600-NEXT:    CF_END
645; R600-NEXT:    PAD
646;
647; GFX10-LABEL: v_fshr_v2i32:
648; GFX10:       ; %bb.0:
649; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
650; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
651; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
652; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
653; GFX10-NEXT:    s_setpc_b64 s[30:31]
654;
655; GFX11-LABEL: v_fshr_v2i32:
656; GFX11:       ; %bb.0:
657; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
658; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
659; GFX11-NEXT:    v_alignbit_b32 v0, v0, v2, v4
660; GFX11-NEXT:    v_alignbit_b32 v1, v1, v3, v5
661; GFX11-NEXT:    s_setpc_b64 s[30:31]
662  %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
663  ret <2 x i32> %ret
664}
665
; Non-kernel function test for <3 x i32> llvm.fshr (a non-power-of-two element
; count): the three vector operands are ordinary function arguments and the
; result vector is returned by value.
; The expected output below is three v_alignbit_b32 instructions (one per
; element) on every amdgcn run. For the r600 run the checks contain only
; CF_END and PAD.
666define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
667; GFX89-LABEL: v_fshr_v3i32:
668; GFX89:       ; %bb.0:
669; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
670; GFX89-NEXT:    v_alignbit_b32 v0, v0, v3, v6
671; GFX89-NEXT:    v_alignbit_b32 v1, v1, v4, v7
672; GFX89-NEXT:    v_alignbit_b32 v2, v2, v5, v8
673; GFX89-NEXT:    s_setpc_b64 s[30:31]
674;
675; R600-LABEL: v_fshr_v3i32:
676; R600:       ; %bb.0:
677; R600-NEXT:    CF_END
678; R600-NEXT:    PAD
679;
680; GFX10-LABEL: v_fshr_v3i32:
681; GFX10:       ; %bb.0:
682; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
683; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
684; GFX10-NEXT:    v_alignbit_b32 v0, v0, v3, v6
685; GFX10-NEXT:    v_alignbit_b32 v1, v1, v4, v7
686; GFX10-NEXT:    v_alignbit_b32 v2, v2, v5, v8
687; GFX10-NEXT:    s_setpc_b64 s[30:31]
688;
689; GFX11-LABEL: v_fshr_v3i32:
690; GFX11:       ; %bb.0:
691; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
692; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
693; GFX11-NEXT:    v_alignbit_b32 v0, v0, v3, v6
694; GFX11-NEXT:    v_alignbit_b32 v1, v1, v4, v7
695; GFX11-NEXT:    v_alignbit_b32 v2, v2, v5, v8
696; GFX11-NEXT:    s_setpc_b64 s[30:31]
697  %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
698  ret <3 x i32> %ret
699}
700
; Non-kernel function test for <4 x i32> llvm.fshr: the three vector operands
; are ordinary function arguments and the result vector is returned by value.
; The expected output below is four v_alignbit_b32 instructions (one per
; element) on every amdgcn run. For the r600 run the checks contain only
; CF_END and PAD.
701define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
702; GFX89-LABEL: v_fshr_v4i32:
703; GFX89:       ; %bb.0:
704; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
705; GFX89-NEXT:    v_alignbit_b32 v0, v0, v4, v8
706; GFX89-NEXT:    v_alignbit_b32 v1, v1, v5, v9
707; GFX89-NEXT:    v_alignbit_b32 v2, v2, v6, v10
708; GFX89-NEXT:    v_alignbit_b32 v3, v3, v7, v11
709; GFX89-NEXT:    s_setpc_b64 s[30:31]
710;
711; R600-LABEL: v_fshr_v4i32:
712; R600:       ; %bb.0:
713; R600-NEXT:    CF_END
714; R600-NEXT:    PAD
715;
716; GFX10-LABEL: v_fshr_v4i32:
717; GFX10:       ; %bb.0:
718; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
719; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
720; GFX10-NEXT:    v_alignbit_b32 v0, v0, v4, v8
721; GFX10-NEXT:    v_alignbit_b32 v1, v1, v5, v9
722; GFX10-NEXT:    v_alignbit_b32 v2, v2, v6, v10
723; GFX10-NEXT:    v_alignbit_b32 v3, v3, v7, v11
724; GFX10-NEXT:    s_setpc_b64 s[30:31]
725;
726; GFX11-LABEL: v_fshr_v4i32:
727; GFX11:       ; %bb.0:
728; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
729; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
730; GFX11-NEXT:    v_alignbit_b32 v0, v0, v4, v8
731; GFX11-NEXT:    v_alignbit_b32 v1, v1, v5, v9
732; GFX11-NEXT:    v_alignbit_b32 v2, v2, v6, v10
733; GFX11-NEXT:    v_alignbit_b32 v3, v3, v7, v11
734; GFX11-NEXT:    s_setpc_b64 s[30:31]
735  %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
736  ret <4 x i32> %ret
737}
738
; Non-kernel function test for scalar i16 llvm.fshr: the three operands are
; ordinary function arguments and the result is returned by value.
; The expected output below differs by target. The tahiti run ORs the shift
; amount with 16, shifts %src1 left by 16 and then uses a single 32-bit
; v_alignbit_b32. The tonga, gfx900, gfx1010 and gfx1100 runs instead use
; 16-bit shifts: %src0 is shifted left by 1 and then by the inverted amount
; (xor with -1), %src1 is shifted right by the amount, and the two halves are
; ORed together. For the r600 run the checks contain only CF_END and PAD.
739define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
740; SI-LABEL: v_fshr_i16:
741; SI:       ; %bb.0:
742; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
743; SI-NEXT:    v_or_b32_e32 v2, 16, v2
744; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
745; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
746; SI-NEXT:    s_setpc_b64 s[30:31]
747;
748; VI-LABEL: v_fshr_i16:
749; VI:       ; %bb.0:
750; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
751; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
752; VI-NEXT:    v_xor_b32_e32 v3, -1, v2
753; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
754; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
755; VI-NEXT:    v_or_b32_e32 v0, v0, v1
756; VI-NEXT:    s_setpc_b64 s[30:31]
757;
758; GFX9-LABEL: v_fshr_i16:
759; GFX9:       ; %bb.0:
760; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
761; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
762; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
763; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
764; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
765; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
766; GFX9-NEXT:    s_setpc_b64 s[30:31]
767;
768; R600-LABEL: v_fshr_i16:
769; R600:       ; %bb.0:
770; R600-NEXT:    CF_END
771; R600-NEXT:    PAD
772;
773; GFX10-LABEL: v_fshr_i16:
774; GFX10:       ; %bb.0:
775; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
776; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
777; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
778; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
779; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
780; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
781; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
782; GFX10-NEXT:    s_setpc_b64 s[30:31]
783;
784; GFX11-LABEL: v_fshr_i16:
785; GFX11:       ; %bb.0:
786; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
787; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
788; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
789; GFX11-NEXT:    v_xor_b32_e32 v3, -1, v2
790; GFX11-NEXT:    v_lshrrev_b16 v1, v2, v1
791; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
792; GFX11-NEXT:    v_lshlrev_b16 v0, v3, v0
793; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
794; GFX11-NEXT:    s_setpc_b64 s[30:31]
795  %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
796  ret i16 %ret
797}
798
; Funnel shift right (llvm.fshr.v2i16) on <2 x i16> operands with a variable
; per-element amount, all passed as arguments to a non-kernel function.
; What the autogenerated checks below pin down, per check prefix
;   SI                  two v_alignbit_b32 (each with its amount OR'ed with 16
;                       and one input shifted left by 16), followed by
;                       shift/and/or instructions on the two results
;   VI                  SDWA forms (WORD_1 selects) handle the upper halves,
;                       plain 16-bit shifts handle the lower halves
;   GFX9, GFX10, GFX11  packed v_pk_lshlrev_b16 / v_pk_lshrrev_b16 with the
;                       amount and its complement both masked by 0xf000f
;   R600                only CF_END and PAD are expected
; Do not edit the assertions by hand, regenerate them with
; utils/update_llc_test_checks.py instead.
799define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
800; SI-LABEL: v_fshr_v2i16:
801; SI:       ; %bb.0:
802; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
803; SI-NEXT:    v_or_b32_e32 v5, 16, v5
804; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
805; SI-NEXT:    v_alignbit_b32 v1, v1, v3, v5
806; SI-NEXT:    v_or_b32_e32 v3, 16, v4
807; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
808; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v3
809; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
810; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
811; SI-NEXT:    v_or_b32_e32 v0, v0, v1
812; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
813; SI-NEXT:    s_setpc_b64 s[30:31]
814;
815; VI-LABEL: v_fshr_v2i16:
816; VI:       ; %bb.0:
817; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
818; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
819; VI-NEXT:    v_mov_b32_e32 v5, 1
820; VI-NEXT:    v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
821; VI-NEXT:    v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
822; VI-NEXT:    v_xor_b32_e32 v3, -1, v3
823; VI-NEXT:    v_lshlrev_b16_e32 v3, v3, v5
824; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
825; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
826; VI-NEXT:    v_xor_b32_e32 v4, -1, v2
827; VI-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
828; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
829; VI-NEXT:    v_or_b32_e32 v0, v0, v1
830; VI-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
831; VI-NEXT:    s_setpc_b64 s[30:31]
832;
833; GFX9-LABEL: v_fshr_v2i16:
834; GFX9:       ; %bb.0:
835; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
836; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
837; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
838; GFX9-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
839; GFX9-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
840; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
841; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
842; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
843; GFX9-NEXT:    s_setpc_b64 s[30:31]
844;
845; R600-LABEL: v_fshr_v2i16:
846; R600:       ; %bb.0:
847; R600-NEXT:    CF_END
848; R600-NEXT:    PAD
849;
850; GFX10-LABEL: v_fshr_v2i16:
851; GFX10:       ; %bb.0:
852; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
853; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
854; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
855; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
856; GFX10-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
857; GFX10-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
858; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
859; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
860; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
861; GFX10-NEXT:    s_setpc_b64 s[30:31]
862;
863; GFX11-LABEL: v_fshr_v2i16:
864; GFX11:       ; %bb.0:
865; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
866; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
867; GFX11-NEXT:    v_xor_b32_e32 v3, -1, v2
868; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
869; GFX11-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
870; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
871; GFX11-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
872; GFX11-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
873; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
874; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
875; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
876; GFX11-NEXT:    s_setpc_b64 s[30:31]
877  %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
878  ret <2 x i16> %ret
879}
880
; Funnel shift right (llvm.fshr.v3i16) on <3 x i16> operands with a variable
; per-element amount, all passed as arguments to a non-kernel function.
; What the autogenerated checks below pin down, per check prefix
;   SI            three v_alignbit_b32 (each amount OR'ed with 16, one input
;                 shifted left by 16) plus shift/and/or/alignbit instructions
;                 on the results
;   VI            SDWA forms (WORD_1 selects) for the upper half of the first
;                 register, plain 16-bit shifts for the other two elements
;   GFX9          the same shift sequence as VI, but the final merge uses
;                 v_and_b32 with 0xffff and v_lshl_or_b32
;   GFX10, GFX11  upper halves are extracted with v_lshrrev_b32 by 16, every
;                 element uses plain 16-bit shifts, and v_lshl_or_b32 merges
;                 the first two results
;   R600          only CF_END and PAD are expected
; No v_pk_* packed shifts appear in any of these expectations.
; Do not edit the assertions by hand, regenerate them with
; utils/update_llc_test_checks.py instead.
881define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
882; SI-LABEL: v_fshr_v3i16:
883; SI:       ; %bb.0:
884; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
885; SI-NEXT:    v_or_b32_e32 v7, 16, v7
886; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
887; SI-NEXT:    v_alignbit_b32 v1, v1, v4, v7
888; SI-NEXT:    v_or_b32_e32 v4, 16, v6
889; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
890; SI-NEXT:    v_alignbit_b32 v0, v0, v3, v4
891; SI-NEXT:    v_or_b32_e32 v3, 16, v8
892; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
893; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
894; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
895; SI-NEXT:    v_alignbit_b32 v3, v2, v4, v3
896; SI-NEXT:    v_or_b32_e32 v0, v0, v1
897; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
898; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
899; SI-NEXT:    s_setpc_b64 s[30:31]
900;
901; VI-LABEL: v_fshr_v3i16:
902; VI:       ; %bb.0:
903; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
904; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
905; VI-NEXT:    v_mov_b32_e32 v8, 1
906; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
907; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
908; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
909; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
910; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
911; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
912; VI-NEXT:    v_xor_b32_e32 v7, -1, v5
913; VI-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
914; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
915; VI-NEXT:    v_or_b32_e32 v1, v1, v3
916; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
917; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
918; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
919; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
920; VI-NEXT:    v_or_b32_e32 v0, v0, v2
921; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
922; VI-NEXT:    s_setpc_b64 s[30:31]
923;
924; GFX9-LABEL: v_fshr_v3i16:
925; GFX9:       ; %bb.0:
926; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
927; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
928; GFX9-NEXT:    v_mov_b32_e32 v8, 1
929; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
930; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
931; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
932; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
933; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
934; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
935; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v5
936; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
937; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
938; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
939; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
940; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
941; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
942; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
943; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
944; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
945; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
946; GFX9-NEXT:    s_setpc_b64 s[30:31]
947;
948; R600-LABEL: v_fshr_v3i16:
949; R600:       ; %bb.0:
950; R600-NEXT:    CF_END
951; R600-NEXT:    PAD
952;
953; GFX10-LABEL: v_fshr_v3i16:
954; GFX10:       ; %bb.0:
955; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
956; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
957; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
958; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
959; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
960; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
961; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
962; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v6
963; GFX10-NEXT:    v_lshlrev_b16 v7, 1, v7
964; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
965; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
966; GFX10-NEXT:    v_lshrrev_b16 v4, v6, v9
967; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
968; GFX10-NEXT:    v_lshlrev_b16 v6, v10, v7
969; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
970; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
971; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v5
972; GFX10-NEXT:    v_or_b32_e32 v4, v6, v4
973; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
974; GFX10-NEXT:    v_lshlrev_b16 v1, v2, v1
975; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
976; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
977; GFX10-NEXT:    s_setpc_b64 s[30:31]
978;
979; GFX11-LABEL: v_fshr_v3i16:
980; GFX11:       ; %bb.0:
981; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
982; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
983; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
984; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
985; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
986; GFX11-NEXT:    v_xor_b32_e32 v8, -1, v4
987; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
988; GFX11-NEXT:    v_xor_b32_e32 v10, -1, v6
989; GFX11-NEXT:    v_lshlrev_b16 v7, 1, v7
990; GFX11-NEXT:    v_lshrrev_b16 v2, v4, v2
991; GFX11-NEXT:    v_lshlrev_b16 v0, v8, v0
992; GFX11-NEXT:    v_lshrrev_b16 v4, v6, v9
993; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
994; GFX11-NEXT:    v_lshlrev_b16 v6, v10, v7
995; GFX11-NEXT:    v_lshrrev_b16 v3, v5, v3
996; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
997; GFX11-NEXT:    v_xor_b32_e32 v2, -1, v5
998; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
999; GFX11-NEXT:    v_or_b32_e32 v4, v6, v4
1000; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1001; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1002; GFX11-NEXT:    v_lshlrev_b16 v1, v2, v1
1003; GFX11-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
1004; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1005; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
1006; GFX11-NEXT:    s_setpc_b64 s[30:31]
1007  %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
1008  ret <3 x i16> %ret
1009}
1010
; Funnel shift right (llvm.fshr.v4i16) on <4 x i16> operands with a variable
; per-element amount, all passed as arguments to a non-kernel function.
; What the autogenerated checks below pin down, per check prefix
;   SI            four v_alignbit_b32 (each amount OR'ed with 16, one input
;                 shifted left by 16) plus shift/and/or/alignbit instructions
;                 on the results
;   VI            SDWA forms (WORD_1 selects) for the upper halves, plain
;                 16-bit shifts for the lower halves, SDWA v_or_b32 to merge
;   GFX9          the same shift sequence as VI, but the final merge uses
;                 v_and_b32 with 0xffff and v_lshl_or_b32
;   GFX10, GFX11  upper halves are extracted with v_lshrrev_b32 by 16, every
;                 element uses plain 16-bit shifts, and v_lshl_or_b32 merges
;                 the results pairwise
;   R600          only CF_END and PAD are expected
; No v_pk_* packed shifts appear in any of these expectations.
; Do not edit the assertions by hand, regenerate them with
; utils/update_llc_test_checks.py instead.
1011define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
1012; SI-LABEL: v_fshr_v4i16:
1013; SI:       ; %bb.0:
1014; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1015; SI-NEXT:    v_or_b32_e32 v9, 16, v9
1016; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
1017; SI-NEXT:    v_alignbit_b32 v1, v1, v5, v9
1018; SI-NEXT:    v_or_b32_e32 v5, 16, v8
1019; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
1020; SI-NEXT:    v_alignbit_b32 v0, v0, v4, v5
1021; SI-NEXT:    v_or_b32_e32 v4, 16, v11
1022; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
1023; SI-NEXT:    v_alignbit_b32 v3, v3, v5, v4
1024; SI-NEXT:    v_or_b32_e32 v4, 16, v10
1025; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
1026; SI-NEXT:    v_alignbit_b32 v2, v2, v5, v4
1027; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1028; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1029; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1030; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1031; SI-NEXT:    v_or_b32_e32 v2, v2, v3
1032; SI-NEXT:    v_or_b32_e32 v0, v0, v1
1033; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
1034; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1035; SI-NEXT:    s_setpc_b64 s[30:31]
1036;
1037; VI-LABEL: v_fshr_v4i16:
1038; VI:       ; %bb.0:
1039; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1040; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
1041; VI-NEXT:    v_mov_b32_e32 v8, 1
1042; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1043; VI-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1044; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
1045; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
1046; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1047; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
1048; VI-NEXT:    v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1049; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1050; VI-NEXT:    v_xor_b32_e32 v7, -1, v7
1051; VI-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
1052; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
1053; VI-NEXT:    v_xor_b32_e32 v8, -1, v5
1054; VI-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
1055; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
1056; VI-NEXT:    v_or_b32_e32 v1, v1, v3
1057; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
1058; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
1059; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
1060; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
1061; VI-NEXT:    v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1062; VI-NEXT:    v_or_b32_e32 v0, v0, v2
1063; VI-NEXT:    v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1064; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1065; VI-NEXT:    s_setpc_b64 s[30:31]
1066;
1067; GFX9-LABEL: v_fshr_v4i16:
1068; GFX9:       ; %bb.0:
1069; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1070; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
1071; GFX9-NEXT:    v_mov_b32_e32 v8, 1
1072; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1073; GFX9-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1074; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
1075; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
1076; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
1077; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
1078; GFX9-NEXT:    v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1079; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1080; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v7
1081; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
1082; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
1083; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v5
1084; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
1085; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
1086; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
1087; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
1088; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
1089; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
1090; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
1091; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
1092; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
1093; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1094; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1095; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
1096; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
1097; GFX9-NEXT:    s_setpc_b64 s[30:31]
1098;
1099; R600-LABEL: v_fshr_v4i16:
1100; R600:       ; %bb.0:
1101; R600-NEXT:    CF_END
1102; R600-NEXT:    PAD
1103;
1104; GFX10-LABEL: v_fshr_v4i16:
1105; GFX10:       ; %bb.0:
1106; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1107; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1108; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
1109; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
1110; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
1111; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
1112; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
1113; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
1114; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v5
1115; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
1116; GFX10-NEXT:    v_xor_b32_e32 v12, -1, v4
1117; GFX10-NEXT:    v_lshrrev_b16 v6, v7, v6
1118; GFX10-NEXT:    v_lshlrev_b16 v8, 1, v8
1119; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v7
1120; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
1121; GFX10-NEXT:    v_lshlrev_b16 v10, 1, v10
1122; GFX10-NEXT:    v_xor_b32_e32 v14, -1, v9
1123; GFX10-NEXT:    v_lshlrev_b16 v1, v11, v1
1124; GFX10-NEXT:    v_lshlrev_b16 v0, v12, v0
1125; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
1126; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
1127; GFX10-NEXT:    v_lshlrev_b16 v4, v7, v8
1128; GFX10-NEXT:    v_lshrrev_b16 v5, v9, v13
1129; GFX10-NEXT:    v_lshlrev_b16 v7, v14, v10
1130; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
1131; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
1132; GFX10-NEXT:    v_or_b32_e32 v2, v4, v6
1133; GFX10-NEXT:    v_or_b32_e32 v3, v7, v5
1134; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1135; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1136; GFX10-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
1137; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
1138; GFX10-NEXT:    s_setpc_b64 s[30:31]
1139;
1140; GFX11-LABEL: v_fshr_v4i16:
1141; GFX11:       ; %bb.0:
1142; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1143; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1144; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
1145; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
1146; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
1147; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
1148; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
1149; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
1150; GFX11-NEXT:    v_xor_b32_e32 v11, -1, v5
1151; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
1152; GFX11-NEXT:    v_xor_b32_e32 v12, -1, v4
1153; GFX11-NEXT:    v_lshrrev_b16 v6, v7, v6
1154; GFX11-NEXT:    v_lshlrev_b16 v8, 1, v8
1155; GFX11-NEXT:    v_xor_b32_e32 v7, -1, v7
1156; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
1157; GFX11-NEXT:    v_lshlrev_b16 v10, 1, v10
1158; GFX11-NEXT:    v_xor_b32_e32 v14, -1, v9
1159; GFX11-NEXT:    v_lshlrev_b16 v1, v11, v1
1160; GFX11-NEXT:    v_lshlrev_b16 v0, v12, v0
1161; GFX11-NEXT:    v_lshrrev_b16 v2, v4, v2
1162; GFX11-NEXT:    v_lshrrev_b16 v3, v5, v3
1163; GFX11-NEXT:    v_lshlrev_b16 v4, v7, v8
1164; GFX11-NEXT:    v_lshrrev_b16 v5, v9, v13
1165; GFX11-NEXT:    v_lshlrev_b16 v7, v14, v10
1166; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
1167; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
1168; GFX11-NEXT:    v_or_b32_e32 v2, v4, v6
1169; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1170; GFX11-NEXT:    v_or_b32_e32 v3, v7, v5
1171; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1172; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
1173; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1174; GFX11-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
1175; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1176; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
1177; GFX11-NEXT:    s_setpc_b64 s[30:31]
1178  %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
1179  ret <4 x i16> %ret
1180}
1181
; Funnel shift right (llvm.fshr.i64) of two i64 values by a variable i64
; amount, all passed as arguments to a non-kernel function.
; The autogenerated checks below expect the same shape on every amdgcn target.
; One 64-bit input is shifted left by 1 and then by the bitwise NOT of the
; amount, the other is shifted right by the amount, and the two halves are
; combined with a pair of v_or_b32. Only the low register of the amount (v4)
; feeds the shifts. SI spells the shifts v_lshl_b64 / v_lshr_b64, the other
; prefixes use v_lshlrev_b64 / v_lshrrev_b64, and GFX10 / GFX11 put the NOT in
; a fresh register (v5). The R600 checks expect only CF_END and PAD.
; Do not edit the assertions by hand, regenerate them with
; utils/update_llc_test_checks.py instead.
1182define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
1183; SI-LABEL: v_fshr_i64:
1184; SI:       ; %bb.0:
1185; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1186; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
1187; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
1188; SI-NEXT:    v_not_b32_e32 v4, v4
1189; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
1190; SI-NEXT:    v_or_b32_e32 v1, v1, v3
1191; SI-NEXT:    v_or_b32_e32 v0, v0, v2
1192; SI-NEXT:    s_setpc_b64 s[30:31]
1193;
1194; VI-LABEL: v_fshr_i64:
1195; VI:       ; %bb.0:
1196; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1197; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1198; VI-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1199; VI-NEXT:    v_not_b32_e32 v4, v4
1200; VI-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
1201; VI-NEXT:    v_or_b32_e32 v1, v1, v3
1202; VI-NEXT:    v_or_b32_e32 v0, v0, v2
1203; VI-NEXT:    s_setpc_b64 s[30:31]
1204;
1205; GFX9-LABEL: v_fshr_i64:
1206; GFX9:       ; %bb.0:
1207; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1208; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1209; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1210; GFX9-NEXT:    v_not_b32_e32 v4, v4
1211; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
1212; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
1213; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
1214; GFX9-NEXT:    s_setpc_b64 s[30:31]
1215;
1216; R600-LABEL: v_fshr_i64:
1217; R600:       ; %bb.0:
1218; R600-NEXT:    CF_END
1219; R600-NEXT:    PAD
1220;
1221; GFX10-LABEL: v_fshr_i64:
1222; GFX10:       ; %bb.0:
1223; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1224; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1225; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1226; GFX10-NEXT:    v_not_b32_e32 v5, v4
1227; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1228; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
1229; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
1230; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
1231; GFX10-NEXT:    s_setpc_b64 s[30:31]
1232;
1233; GFX11-LABEL: v_fshr_i64:
1234; GFX11:       ; %bb.0:
1235; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1236; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1237; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1238; GFX11-NEXT:    v_not_b32_e32 v5, v4
1239; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1240; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1241; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
1242; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
1243; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1244; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
1245; GFX11-NEXT:    s_setpc_b64 s[30:31]
1246  %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
1247  ret i64 %ret
1248}
1249
; Funnel shift right (llvm.fshr.v2i64) on <2 x i64> operands with a variable
; per-element amount, all passed as arguments to a non-kernel function.
; The autogenerated checks below expect, for each of the two elements, the
; same expansion on every amdgcn target. One 64-bit input is shifted left by 1
; and then by the bitwise NOT of the amount, the other is shifted right by the
; amount, and the halves are combined with v_or_b32. The amounts are read from
; v8 and v10 only. SI spells the shifts v_lshl_b64 / v_lshr_b64, the other
; prefixes use v_lshlrev_b64 / v_lshrrev_b64. On SI, VI and GFX9 the second
; right shift writes v[5:6], while GFX10 and GFX11 keep it in v[6:7] and
; interleave the two elements. The R600 checks expect only CF_END and PAD.
; Do not edit the assertions by hand, regenerate them with
; utils/update_llc_test_checks.py instead.
1250define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
1251; SI-LABEL: v_fshr_v2i64:
1252; SI:       ; %bb.0:
1253; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1254; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
1255; SI-NEXT:    v_lshr_b64 v[4:5], v[4:5], v8
1256; SI-NEXT:    v_not_b32_e32 v8, v8
1257; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
1258; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
1259; SI-NEXT:    v_or_b32_e32 v1, v1, v5
1260; SI-NEXT:    v_lshr_b64 v[5:6], v[6:7], v10
1261; SI-NEXT:    v_not_b32_e32 v7, v10
1262; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
1263; SI-NEXT:    v_or_b32_e32 v0, v0, v4
1264; SI-NEXT:    v_or_b32_e32 v3, v3, v6
1265; SI-NEXT:    v_or_b32_e32 v2, v2, v5
1266; SI-NEXT:    s_setpc_b64 s[30:31]
1267;
1268; VI-LABEL: v_fshr_v2i64:
1269; VI:       ; %bb.0:
1270; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1271; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1272; VI-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1273; VI-NEXT:    v_not_b32_e32 v8, v8
1274; VI-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
1275; VI-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1276; VI-NEXT:    v_or_b32_e32 v1, v1, v5
1277; VI-NEXT:    v_lshrrev_b64 v[5:6], v10, v[6:7]
1278; VI-NEXT:    v_not_b32_e32 v7, v10
1279; VI-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
1280; VI-NEXT:    v_or_b32_e32 v0, v0, v4
1281; VI-NEXT:    v_or_b32_e32 v3, v3, v6
1282; VI-NEXT:    v_or_b32_e32 v2, v2, v5
1283; VI-NEXT:    s_setpc_b64 s[30:31]
1284;
1285; GFX9-LABEL: v_fshr_v2i64:
1286; GFX9:       ; %bb.0:
1287; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1288; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1289; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1290; GFX9-NEXT:    v_not_b32_e32 v8, v8
1291; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
1292; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1293; GFX9-NEXT:    v_or_b32_e32 v1, v1, v5
1294; GFX9-NEXT:    v_lshrrev_b64 v[5:6], v10, v[6:7]
1295; GFX9-NEXT:    v_not_b32_e32 v7, v10
1296; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
1297; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
1298; GFX9-NEXT:    v_or_b32_e32 v3, v3, v6
1299; GFX9-NEXT:    v_or_b32_e32 v2, v2, v5
1300; GFX9-NEXT:    s_setpc_b64 s[30:31]
1301;
1302; R600-LABEL: v_fshr_v2i64:
1303; R600:       ; %bb.0:
1304; R600-NEXT:    CF_END
1305; R600-NEXT:    PAD
1306;
1307; GFX10-LABEL: v_fshr_v2i64:
1308; GFX10:       ; %bb.0:
1309; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1310; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1311; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1312; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1313; GFX10-NEXT:    v_not_b32_e32 v9, v8
1314; GFX10-NEXT:    v_not_b32_e32 v11, v10
1315; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1316; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v10, v[6:7]
1317; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
1318; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
1319; GFX10-NEXT:    v_or_b32_e32 v0, v0, v4
1320; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
1321; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
1322; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
1323; GFX10-NEXT:    s_setpc_b64 s[30:31]
1324;
1325; GFX11-LABEL: v_fshr_v2i64:
1326; GFX11:       ; %bb.0:
1327; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1328; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1329; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1330; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1331; GFX11-NEXT:    v_not_b32_e32 v9, v8
1332; GFX11-NEXT:    v_not_b32_e32 v11, v10
1333; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1334; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v10, v[6:7]
1335; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1336; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
1337; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
1338; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1339; GFX11-NEXT:    v_or_b32_e32 v0, v0, v4
1340; GFX11-NEXT:    v_or_b32_e32 v1, v1, v5
1341; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
1342; GFX11-NEXT:    v_or_b32_e32 v2, v2, v6
1343; GFX11-NEXT:    v_or_b32_e32 v3, v3, v7
1344; GFX11-NEXT:    s_setpc_b64 s[30:31]
1345  %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
1346  ret <2 x i64> %ret
1347}
1348
; Funnel shift right (llvm.fshr.i24) on the non-power-of-two type i24, with a
; variable amount, all passed as arguments to a non-kernel function.
; The autogenerated checks below expect the same shape on every amdgcn target.
;   1. The amount is masked to 24 bits (0xffffff), multiplied-high by
;      0xaaaaaaab and shifted right by 4, which yields a quotient by 24.
;   2. That quotient is multiplied by 24 and subtracted from the amount,
;      i.e. a remainder-by-24 computation, and 8 is then added.
;   3. One input is shifted left by 8 and a single v_alignbit_b32 produces
;      the result.
; SI, VI and GFX9 load the constant into s4 first, GFX10 and GFX11 use it as
; an inline literal. SI and VI use the carry-out (vcc) add/sub forms, GFX9
; uses v_sub_u32 / v_add_u32 without vcc, GFX10 and GFX11 use the _nc_ forms.
; The R600 checks expect only CF_END and PAD.
; Do not edit the assertions by hand, regenerate them with
; utils/update_llc_test_checks.py instead.
1349define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
1350; SI-LABEL: v_fshr_i24:
1351; SI:       ; %bb.0:
1352; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1353; SI-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
1354; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1355; SI-NEXT:    v_mul_hi_u32 v3, v3, s4
1356; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1357; SI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1358; SI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1359; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
1360; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
1361; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1362; SI-NEXT:    s_setpc_b64 s[30:31]
1363;
1364; VI-LABEL: v_fshr_i24:
1365; VI:       ; %bb.0:
1366; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1367; VI-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
1368; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1369; VI-NEXT:    v_mul_hi_u32 v3, v3, s4
1370; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1371; VI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1372; VI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1373; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
1374; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
1375; VI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1376; VI-NEXT:    s_setpc_b64 s[30:31]
1377;
1378; GFX9-LABEL: v_fshr_i24:
1379; GFX9:       ; %bb.0:
1380; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1381; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
1382; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1383; GFX9-NEXT:    v_mul_hi_u32 v3, v3, s4
1384; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1385; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1386; GFX9-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1387; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
1388; GFX9-NEXT:    v_add_u32_e32 v2, 8, v2
1389; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1390; GFX9-NEXT:    s_setpc_b64 s[30:31]
1391;
1392; R600-LABEL: v_fshr_i24:
1393; R600:       ; %bb.0:
1394; R600-NEXT:    CF_END
1395; R600-NEXT:    PAD
1396;
1397; GFX10-LABEL: v_fshr_i24:
1398; GFX10:       ; %bb.0:
1399; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1400; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1401; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
1402; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1403; GFX10-NEXT:    v_mul_hi_u32 v3, 0xaaaaaaab, v3
1404; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1405; GFX10-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1406; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
1407; GFX10-NEXT:    v_add_nc_u32_e32 v2, 8, v2
1408; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1409; GFX10-NEXT:    s_setpc_b64 s[30:31]
1410;
1411; GFX11-LABEL: v_fshr_i24:
1412; GFX11:       ; %bb.0:
1413; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1414; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1415; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
1416; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1417; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1418; GFX11-NEXT:    v_mul_hi_u32 v3, 0xaaaaaaab, v3
1419; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1420; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1421; GFX11-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1422; GFX11-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
1423; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1424; GFX11-NEXT:    v_add_nc_u32_e32 v2, 8, v2
1425; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1426; GFX11-NEXT:    s_setpc_b64 s[30:31]
1427  %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
1428  ret i24 %ret
1429}
1430
1431define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
1432; SI-LABEL: v_fshr_v2i24:
1433; SI:       ; %bb.0:
1434; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1435; SI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
1436; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1437; SI-NEXT:    v_mul_hi_u32 v6, v6, s4
1438; SI-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
1439; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1440; SI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1441; SI-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
1442; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
1443; SI-NEXT:    v_mul_hi_u32 v6, v7, s4
1444; SI-NEXT:    v_add_i32_e32 v4, vcc, 8, v4
1445; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1446; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1447; SI-NEXT:    v_lshrrev_b32_e32 v3, 4, v6
1448; SI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1449; SI-NEXT:    v_sub_i32_e32 v3, vcc, v5, v3
1450; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v3
1451; SI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1452; SI-NEXT:    s_setpc_b64 s[30:31]
1453;
1454; VI-LABEL: v_fshr_v2i24:
1455; VI:       ; %bb.0:
1456; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1457; VI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
1458; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1459; VI-NEXT:    v_mul_hi_u32 v6, v6, s4
1460; VI-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
1461; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1462; VI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1463; VI-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
1464; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
1465; VI-NEXT:    v_mul_hi_u32 v6, v7, s4
1466; VI-NEXT:    v_add_u32_e32 v4, vcc, 8, v4
1467; VI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1468; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1469; VI-NEXT:    v_lshrrev_b32_e32 v3, 4, v6
1470; VI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1471; VI-NEXT:    v_sub_u32_e32 v3, vcc, v5, v3
1472; VI-NEXT:    v_add_u32_e32 v3, vcc, 8, v3
1473; VI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1474; VI-NEXT:    s_setpc_b64 s[30:31]
1475;
1476; GFX9-LABEL: v_fshr_v2i24:
1477; GFX9:       ; %bb.0:
1478; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1479; GFX9-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
1480; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1481; GFX9-NEXT:    v_mul_hi_u32 v6, v6, s4
1482; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
1483; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1484; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1485; GFX9-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
1486; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
1487; GFX9-NEXT:    v_mul_hi_u32 v6, v7, s4
1488; GFX9-NEXT:    v_add_u32_e32 v4, 8, v4
1489; GFX9-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1490; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1491; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 4, v6
1492; GFX9-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1493; GFX9-NEXT:    v_sub_u32_e32 v3, v5, v3
1494; GFX9-NEXT:    v_add_u32_e32 v3, 8, v3
1495; GFX9-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1496; GFX9-NEXT:    s_setpc_b64 s[30:31]
1497;
1498; R600-LABEL: v_fshr_v2i24:
1499; R600:       ; %bb.0:
1500; R600-NEXT:    CF_END
1501; R600-NEXT:    PAD
1502;
1503; GFX10-LABEL: v_fshr_v2i24:
1504; GFX10:       ; %bb.0:
1505; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1506; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1507; GFX10-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
1508; GFX10-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
1509; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1510; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
1511; GFX10-NEXT:    v_mul_hi_u32 v6, 0xaaaaaaab, v6
1512; GFX10-NEXT:    v_mul_hi_u32 v7, 0xaaaaaaab, v7
1513; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1514; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
1515; GFX10-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
1516; GFX10-NEXT:    v_mul_u32_u24_e32 v7, 24, v7
1517; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
1518; GFX10-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
1519; GFX10-NEXT:    v_add_nc_u32_e32 v4, 8, v4
1520; GFX10-NEXT:    v_add_nc_u32_e32 v5, 8, v5
1521; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1522; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
1523; GFX10-NEXT:    s_setpc_b64 s[30:31]
1524;
1525; GFX11-LABEL: v_fshr_v2i24:
1526; GFX11:       ; %bb.0:
1527; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1528; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1529; GFX11-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
1530; GFX11-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
1531; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1532; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
1533; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1534; GFX11-NEXT:    v_mul_hi_u32 v6, 0xaaaaaaab, v6
1535; GFX11-NEXT:    v_mul_hi_u32 v7, 0xaaaaaaab, v7
1536; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1537; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1538; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
1539; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1540; GFX11-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
1541; GFX11-NEXT:    v_mul_u32_u24_e32 v7, 24, v7
1542; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1543; GFX11-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
1544; GFX11-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
1545; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1546; GFX11-NEXT:    v_add_nc_u32_e32 v4, 8, v4
1547; GFX11-NEXT:    v_add_nc_u32_e32 v5, 8, v5
1548; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1549; GFX11-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1550; GFX11-NEXT:    v_alignbit_b32 v1, v1, v3, v5
1551; GFX11-NEXT:    s_setpc_b64 s[30:31]
1552  %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
1553  ret <2 x i24> %ret
1554}
1555