1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9
5; RUN: llc < %s -march=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s --check-prefix=R600
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
7
8declare i32 @llvm.fshr.i32(i32, i32, i32)
9declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
10declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
11declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
12declare i16 @llvm.fshr.i16(i16, i16, i16)
13declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
14declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
15declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
16declare i64 @llvm.fshr.i64(i64, i64, i64)
17declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
18declare i24 @llvm.fshr.i24(i24, i24, i24)
19declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
20
; Scalar fshr.i32 with a variable shift amount: per the checks below, GCN
; targets select a single v_alignbit_b32 and R600 a single BIT_ALIGN_INT.
; CHECK lines are autogenerated — regenerate with update_llc_test_checks.py,
; do not edit them by hand.
define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
; SI-LABEL: fshr_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dword s6, s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
;
; GFX10-LABEL: fshr_i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dword s6, s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-NEXT:    v_alignbit_b32 v0, s2, s3, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
  store i32 %0, i32 addrspace(1)* %in
  ret void
}
91
; Scalar fshr.i32 with a constant shift amount (7): the shift folds into the
; alignbit/BIT_ALIGN_INT immediate operand, so no extra v_mov of the shift
; value is needed. CHECK lines are autogenerated.
define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; SI-LABEL: fshr_i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 7
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 7
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshr_i32_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 7
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
  store i32 %0, i32 addrspace(1)* %in
  ret void
}
155
; Vector fshr.v2i32 with variable shift amounts: the checks show the vector op
; scalarized into one v_alignbit_b32 (BIT_ALIGN_INT on R600) per element.
; CHECK lines are autogenerated.
define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshr_v2i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xf
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v2, s8
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v2i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_alignbit_b32 v0, s4, v2, v0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v2i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v2i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     MOV * T0.W, KC0[4].X,
; R600-NEXT:     BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
; R600-NEXT:     MOV * T0.W, KC0[3].W,
; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshr_v2i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s3
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, v0
; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, v2
; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[8:9]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
  ret void
}
240
; Vector fshr.v2i32 with constant shifts <7, 9>: element 0 uses immediate 7
; and element 1 uses immediate 9 in the per-element alignbit/BIT_ALIGN_INT.
; CHECK lines are autogenerated.
define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshr_v2i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
; SI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v2i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v2i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    v_mov_b32_e32 v3, s6
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 7
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v2i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshr_v2i32_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, 9
; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, 7
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
  ret void
}
313
; Vector fshr.v4i32 with variable shift amounts: scalarized into four
; v_alignbit_b32 / BIT_ALIGN_INT operations. CHECK lines are autogenerated.
define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; SI-LABEL: fshr_v4i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x15
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_mov_b32_e32 v1, s15
; SI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_mov_b32_e32 v1, s14
; SI-NEXT:    v_alignbit_b32 v2, s6, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_mov_b32_e32 v1, s13
; SI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v4i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_mov_b32_e32 v1, s15
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s14
; VI-NEXT:    v_alignbit_b32 v2, s6, v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_mov_b32_e32 v1, s13
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v4i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s11
; GFX9-NEXT:    v_mov_b32_e32 v1, s15
; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s10
; GFX9-NEXT:    v_mov_b32_e32 v1, s14
; GFX9-NEXT:    v_alignbit_b32 v2, s6, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s9
; GFX9-NEXT:    v_mov_b32_e32 v1, s13
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s8
; GFX9-NEXT:    v_mov_b32_e32 v5, s12
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v5
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v4i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     MOV * T0.W, KC0[6].X,
; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
; R600-NEXT:     MOV * T1.W, KC0[5].W,
; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
; R600-NEXT:     MOV * T1.W, KC0[5].Z,
; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
; R600-NEXT:     MOV * T1.W, KC0[5].Y,
; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshr_v4i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v6, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s15
; GFX10-NEXT:    v_mov_b32_e32 v1, s14
; GFX10-NEXT:    v_mov_b32_e32 v4, s13
; GFX10-NEXT:    v_mov_b32_e32 v5, s12
; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, v0
; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, v1
; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, v4
; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, v5
; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[2:3]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
  ret void
}
424
; Vector fshr.v4i32 with constant shifts <1, 7, 9, 33>: shifts fold to
; immediates, and the out-of-range 33 is reduced modulo 32 to 1 in the
; element-3 alignbit. CHECK lines are autogenerated.
define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
; SI-LABEL: fshr_v4i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_mov_b32_e32 v1, s10
; SI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 7
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v4i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_mov_b32_e32 v1, s10
; VI-NEXT:    v_mov_b32_e32 v4, s9
; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
; VI-NEXT:    v_alignbit_b32 v1, s5, v4, 7
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v4i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s11
; GFX9-NEXT:    v_mov_b32_e32 v1, s10
; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v0, s9
; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 9
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 7
; GFX9-NEXT:    v_mov_b32_e32 v0, s8
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v4i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshr_v4i32_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, 1
; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, 9
; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, 7
; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, 1
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
  ret void
}
513
; Divergent (all-VGPR) fshr.i32 in a plain function: selects to a single
; v_alignbit_b32 on GCN targets. CHECK lines are autogenerated.
define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
; GFX89-LABEL: v_fshr_i32:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT:    v_alignbit_b32 v0, v0, v1, v2
; GFX89-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i32:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
  ret i32 %ret
}
535
; Divergent fshr.v2i32: scalarized into one v_alignbit_b32 per element.
; CHECK lines are autogenerated.
define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
; GFX89-LABEL: v_fshr_v2i32:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT:    v_alignbit_b32 v0, v0, v2, v4
; GFX89-NEXT:    v_alignbit_b32 v1, v1, v3, v5
; GFX89-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i32:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
  ret <2 x i32> %ret
}
559
; Divergent fshr.v3i32 (non-power-of-two element count): still scalarized
; into one v_alignbit_b32 per element. CHECK lines are autogenerated.
define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
; GFX89-LABEL: v_fshr_v3i32:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT:    v_alignbit_b32 v0, v0, v3, v6
; GFX89-NEXT:    v_alignbit_b32 v1, v1, v4, v7
; GFX89-NEXT:    v_alignbit_b32 v2, v2, v5, v8
; GFX89-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v3i32:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_v3i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v3, v6
; GFX10-NEXT:    v_alignbit_b32 v1, v1, v4, v7
; GFX10-NEXT:    v_alignbit_b32 v2, v2, v5, v8
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
  ret <3 x i32> %ret
}
585
; Divergent fshr.v4i32: scalarized into four v_alignbit_b32 operations.
; CHECK lines are autogenerated.
define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
; GFX89-LABEL: v_fshr_v4i32:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT:    v_alignbit_b32 v0, v0, v4, v8
; GFX89-NEXT:    v_alignbit_b32 v1, v1, v5, v9
; GFX89-NEXT:    v_alignbit_b32 v2, v2, v6, v10
; GFX89-NEXT:    v_alignbit_b32 v3, v3, v7, v11
; GFX89-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v4i32:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v4, v8
; GFX10-NEXT:    v_alignbit_b32 v1, v1, v5, v9
; GFX10-NEXT:    v_alignbit_b32 v2, v2, v6, v10
; GFX10-NEXT:    v_alignbit_b32 v3, v3, v7, v11
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
  ret <4 x i32> %ret
}
613
; Divergent fshr.i16: SI (no native 16-bit shifts) builds it from a 32-bit
; alignbit, while VI/GFX9/GFX10 expand to 16-bit shl/lshr/or with the shift
; amount masked via xor -1. CHECK lines are autogenerated.
define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
; SI-LABEL: v_fshr_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_or_b32_e32 v2, 16, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
; VI-NEXT:    v_xor_b32_e32 v3, -1, v2
; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i16:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
  ret i16 %ret
}
661
; Divergent fshr.v2i16: SI scalarizes through 32-bit alignbit and repacks;
; VI uses SDWA 16-bit ops; GFX9/GFX10 use packed v_pk_* 16-bit shifts.
; CHECK lines are autogenerated.
define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
; SI-LABEL: v_fshr_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_or_b32_e32 v5, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_alignbit_b32 v1, v1, v3, v5
; SI-NEXT:    v_or_b32_e32 v3, 16, v4
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v3
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; VI-NEXT:    v_mov_b32_e32 v5, 1
; VI-NEXT:    v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_xor_b32_e32 v3, -1, v3
; VI-NEXT:    v_lshlrev_b16_e32 v3, v3, v5
; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
; VI-NEXT:    v_xor_b32_e32 v4, -1, v2
; VI-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
; GFX9-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i16:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
; GFX10-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
  ret <2 x i16> %ret
}
728
729define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
730; SI-LABEL: v_fshr_v3i16:
731; SI:       ; %bb.0:
732; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
733; SI-NEXT:    v_or_b32_e32 v7, 16, v7
734; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
735; SI-NEXT:    v_alignbit_b32 v1, v1, v4, v7
736; SI-NEXT:    v_or_b32_e32 v4, 16, v6
737; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
738; SI-NEXT:    v_alignbit_b32 v0, v0, v3, v4
739; SI-NEXT:    v_or_b32_e32 v3, 16, v8
740; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
741; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
742; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
743; SI-NEXT:    v_alignbit_b32 v3, v2, v4, v3
744; SI-NEXT:    v_or_b32_e32 v0, v0, v1
745; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
746; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
747; SI-NEXT:    s_setpc_b64 s[30:31]
748;
749; VI-LABEL: v_fshr_v3i16:
750; VI:       ; %bb.0:
751; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
753; VI-NEXT:    v_mov_b32_e32 v8, 1
754; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
755; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
756; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
757; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
758; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
759; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
760; VI-NEXT:    v_xor_b32_e32 v7, -1, v5
761; VI-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
762; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
763; VI-NEXT:    v_or_b32_e32 v1, v1, v3
764; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
765; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
766; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
767; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
768; VI-NEXT:    v_or_b32_e32 v0, v0, v2
769; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
770; VI-NEXT:    s_setpc_b64 s[30:31]
771;
772; GFX9-LABEL: v_fshr_v3i16:
773; GFX9:       ; %bb.0:
774; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
775; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
776; GFX9-NEXT:    v_mov_b32_e32 v8, 1
777; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
778; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
779; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
780; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
781; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
782; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
783; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v5
784; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
785; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
786; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
787; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
788; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
789; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
790; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
791; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
792; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
793; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
794; GFX9-NEXT:    s_setpc_b64 s[30:31]
795;
796; R600-LABEL: v_fshr_v3i16:
797; R600:       ; %bb.0:
798; R600-NEXT:    CF_END
799; R600-NEXT:    PAD
800;
801; GFX10-LABEL: v_fshr_v3i16:
802; GFX10:       ; %bb.0:
803; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
804; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
805; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
806; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
807; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
808; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
809; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
810; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v6
811; GFX10-NEXT:    v_lshlrev_b16 v7, 1, v7
812; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
813; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
814; GFX10-NEXT:    v_lshrrev_b16 v4, v6, v9
815; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
816; GFX10-NEXT:    v_lshlrev_b16 v6, v10, v7
817; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
818; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
819; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v5
820; GFX10-NEXT:    v_or_b32_e32 v4, v6, v4
821; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
822; GFX10-NEXT:    v_lshlrev_b16 v1, v2, v1
823; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
824; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
825; GFX10-NEXT:    s_setpc_b64 s[30:31]
826  %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
827  ret <3 x i16> %ret
828}
829
; Funnel-shift-right of a <4 x i16> vector (llvm.fshr.v4i16) with all-variable
; operands. SI has no 16-bit shifts and lowers via v_alignbit_b32 on widened
; 32-bit lanes; VI/GFX9 use 16-bit SDWA shift ops; GFX10 uses packed 16-bit
; shifts. CHECK lines are autogenerated by update_llc_test_checks.py.
830define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
831; SI-LABEL: v_fshr_v4i16:
832; SI:       ; %bb.0:
833; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
834; SI-NEXT:    v_or_b32_e32 v9, 16, v9
835; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
836; SI-NEXT:    v_alignbit_b32 v1, v1, v5, v9
837; SI-NEXT:    v_or_b32_e32 v5, 16, v8
838; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
839; SI-NEXT:    v_alignbit_b32 v0, v0, v4, v5
840; SI-NEXT:    v_or_b32_e32 v4, 16, v11
841; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
842; SI-NEXT:    v_alignbit_b32 v3, v3, v5, v4
843; SI-NEXT:    v_or_b32_e32 v4, 16, v10
844; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
845; SI-NEXT:    v_alignbit_b32 v2, v2, v5, v4
846; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
847; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
848; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
849; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
850; SI-NEXT:    v_or_b32_e32 v2, v2, v3
851; SI-NEXT:    v_or_b32_e32 v0, v0, v1
852; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
853; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
854; SI-NEXT:    s_setpc_b64 s[30:31]
855;
856; VI-LABEL: v_fshr_v4i16:
857; VI:       ; %bb.0:
858; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
859; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
860; VI-NEXT:    v_mov_b32_e32 v8, 1
861; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
862; VI-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
863; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
864; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
865; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
866; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
867; VI-NEXT:    v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
868; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
869; VI-NEXT:    v_xor_b32_e32 v7, -1, v7
870; VI-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
871; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
872; VI-NEXT:    v_xor_b32_e32 v8, -1, v5
873; VI-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
874; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
875; VI-NEXT:    v_or_b32_e32 v1, v1, v3
876; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
877; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
878; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
879; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
880; VI-NEXT:    v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
881; VI-NEXT:    v_or_b32_e32 v0, v0, v2
882; VI-NEXT:    v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
883; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
884; VI-NEXT:    s_setpc_b64 s[30:31]
885;
886; GFX9-LABEL: v_fshr_v4i16:
887; GFX9:       ; %bb.0:
888; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
889; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
890; GFX9-NEXT:    v_mov_b32_e32 v8, 1
891; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
892; GFX9-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
893; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
894; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
895; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
896; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
897; GFX9-NEXT:    v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
898; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
899; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v7
900; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
901; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
902; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v5
903; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
904; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
905; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
906; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
907; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
908; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
909; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
910; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
911; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
912; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
913; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
914; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
915; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
916; GFX9-NEXT:    s_setpc_b64 s[30:31]
917;
918; R600-LABEL: v_fshr_v4i16:
919; R600:       ; %bb.0:
920; R600-NEXT:    CF_END
921; R600-NEXT:    PAD
922;
923; GFX10-LABEL: v_fshr_v4i16:
924; GFX10:       ; %bb.0:
925; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
926; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
927; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
928; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
929; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
930; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
931; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
932; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
933; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v5
934; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
935; GFX10-NEXT:    v_xor_b32_e32 v12, -1, v4
936; GFX10-NEXT:    v_lshrrev_b16 v6, v7, v6
937; GFX10-NEXT:    v_lshlrev_b16 v8, 1, v8
938; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v7
939; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
940; GFX10-NEXT:    v_lshlrev_b16 v10, 1, v10
941; GFX10-NEXT:    v_xor_b32_e32 v14, -1, v9
942; GFX10-NEXT:    v_lshlrev_b16 v1, v11, v1
943; GFX10-NEXT:    v_lshlrev_b16 v0, v12, v0
944; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
945; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
946; GFX10-NEXT:    v_lshlrev_b16 v4, v7, v8
947; GFX10-NEXT:    v_lshrrev_b16 v5, v9, v13
948; GFX10-NEXT:    v_lshlrev_b16 v7, v14, v10
949; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
950; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
951; GFX10-NEXT:    v_or_b32_e32 v2, v4, v6
952; GFX10-NEXT:    v_or_b32_e32 v3, v7, v5
953; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
954; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
955; GFX10-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
956; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
957; GFX10-NEXT:    s_setpc_b64 s[30:31]
958  %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
959  ret <4 x i16> %ret
960}
961
; Funnel-shift-right of a scalar i64 (llvm.fshr.i64) with a variable shift
; amount: lowered on all GCN targets to a 64-bit shl-by-1 / shr / not / shl /
; or sequence. CHECK lines are autogenerated by update_llc_test_checks.py.
962define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
963; SI-LABEL: v_fshr_i64:
964; SI:       ; %bb.0:
965; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
966; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
967; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
968; SI-NEXT:    v_not_b32_e32 v4, v4
969; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
970; SI-NEXT:    v_or_b32_e32 v1, v1, v3
971; SI-NEXT:    v_or_b32_e32 v0, v0, v2
972; SI-NEXT:    s_setpc_b64 s[30:31]
973;
974; VI-LABEL: v_fshr_i64:
975; VI:       ; %bb.0:
976; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
977; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
978; VI-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
979; VI-NEXT:    v_not_b32_e32 v4, v4
980; VI-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
981; VI-NEXT:    v_or_b32_e32 v1, v1, v3
982; VI-NEXT:    v_or_b32_e32 v0, v0, v2
983; VI-NEXT:    s_setpc_b64 s[30:31]
984;
985; GFX9-LABEL: v_fshr_i64:
986; GFX9:       ; %bb.0:
987; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
988; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
989; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
990; GFX9-NEXT:    v_not_b32_e32 v4, v4
991; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
992; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
993; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
994; GFX9-NEXT:    s_setpc_b64 s[30:31]
995;
996; R600-LABEL: v_fshr_i64:
997; R600:       ; %bb.0:
998; R600-NEXT:    CF_END
999; R600-NEXT:    PAD
1000;
1001; GFX10-LABEL: v_fshr_i64:
1002; GFX10:       ; %bb.0:
1003; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1004; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1005; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1006; GFX10-NEXT:    v_not_b32_e32 v5, v4
1007; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1008; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
1009; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
1010; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
1011; GFX10-NEXT:    s_setpc_b64 s[30:31]
1012  %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
1013  ret i64 %ret
1014}
1015
; Funnel-shift-right of a <2 x i64> vector (llvm.fshr.v2i64): the scalar i64
; expansion applied per element. CHECK lines are autogenerated by
; update_llc_test_checks.py.
1016define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
1017; SI-LABEL: v_fshr_v2i64:
1018; SI:       ; %bb.0:
1019; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1020; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
1021; SI-NEXT:    v_lshr_b64 v[4:5], v[4:5], v8
1022; SI-NEXT:    v_not_b32_e32 v8, v8
1023; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
1024; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
1025; SI-NEXT:    v_or_b32_e32 v1, v1, v5
1026; SI-NEXT:    v_lshr_b64 v[5:6], v[6:7], v10
1027; SI-NEXT:    v_not_b32_e32 v7, v10
1028; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
1029; SI-NEXT:    v_or_b32_e32 v0, v0, v4
1030; SI-NEXT:    v_or_b32_e32 v3, v3, v6
1031; SI-NEXT:    v_or_b32_e32 v2, v2, v5
1032; SI-NEXT:    s_setpc_b64 s[30:31]
1033;
1034; VI-LABEL: v_fshr_v2i64:
1035; VI:       ; %bb.0:
1036; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1037; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1038; VI-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1039; VI-NEXT:    v_not_b32_e32 v8, v8
1040; VI-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
1041; VI-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1042; VI-NEXT:    v_or_b32_e32 v1, v1, v5
1043; VI-NEXT:    v_lshrrev_b64 v[5:6], v10, v[6:7]
1044; VI-NEXT:    v_not_b32_e32 v7, v10
1045; VI-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
1046; VI-NEXT:    v_or_b32_e32 v0, v0, v4
1047; VI-NEXT:    v_or_b32_e32 v3, v3, v6
1048; VI-NEXT:    v_or_b32_e32 v2, v2, v5
1049; VI-NEXT:    s_setpc_b64 s[30:31]
1050;
1051; GFX9-LABEL: v_fshr_v2i64:
1052; GFX9:       ; %bb.0:
1053; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1054; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1055; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1056; GFX9-NEXT:    v_not_b32_e32 v8, v8
1057; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
1058; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1059; GFX9-NEXT:    v_or_b32_e32 v1, v1, v5
1060; GFX9-NEXT:    v_lshrrev_b64 v[5:6], v10, v[6:7]
1061; GFX9-NEXT:    v_not_b32_e32 v7, v10
1062; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
1063; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
1064; GFX9-NEXT:    v_or_b32_e32 v3, v3, v6
1065; GFX9-NEXT:    v_or_b32_e32 v2, v2, v5
1066; GFX9-NEXT:    s_setpc_b64 s[30:31]
1067;
1068; R600-LABEL: v_fshr_v2i64:
1069; R600:       ; %bb.0:
1070; R600-NEXT:    CF_END
1071; R600-NEXT:    PAD
1072;
1073; GFX10-LABEL: v_fshr_v2i64:
1074; GFX10:       ; %bb.0:
1075; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1076; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1077; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1078; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1079; GFX10-NEXT:    v_not_b32_e32 v9, v8
1080; GFX10-NEXT:    v_not_b32_e32 v11, v10
1081; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1082; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v10, v[6:7]
1083; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
1084; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
1085; GFX10-NEXT:    v_or_b32_e32 v0, v0, v4
1086; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
1087; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
1088; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
1089; GFX10-NEXT:    s_setpc_b64 s[30:31]
1090  %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
1091  ret <2 x i64> %ret
1092}
1093
; Funnel-shift-right of a non-power-of-two i24 (llvm.fshr.i24): the variable
; shift amount is reduced modulo 24 via a magic-number multiply (urem-by-24
; expansion with 0xaaaaaaab), then the rotate is done with v_alignbit_b32 on
; the value packed into the top 24 bits. CHECK lines are autogenerated by
; update_llc_test_checks.py.
1094define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
1095; SI-LABEL: v_fshr_i24:
1096; SI:       ; %bb.0:
1097; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1098; SI-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
1099; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1100; SI-NEXT:    v_mul_hi_u32 v3, v3, s4
1101; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1102; SI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1103; SI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1104; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
1105; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
1106; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1107; SI-NEXT:    s_setpc_b64 s[30:31]
1108;
1109; VI-LABEL: v_fshr_i24:
1110; VI:       ; %bb.0:
1111; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1112; VI-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
1113; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1114; VI-NEXT:    v_mul_hi_u32 v3, v3, s4
1115; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1116; VI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1117; VI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1118; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
1119; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
1120; VI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1121; VI-NEXT:    s_setpc_b64 s[30:31]
1122;
1123; GFX9-LABEL: v_fshr_i24:
1124; GFX9:       ; %bb.0:
1125; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1126; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
1127; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1128; GFX9-NEXT:    v_mul_hi_u32 v3, v3, s4
1129; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1130; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1131; GFX9-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1132; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
1133; GFX9-NEXT:    v_add_u32_e32 v2, 8, v2
1134; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1135; GFX9-NEXT:    s_setpc_b64 s[30:31]
1136;
1137; R600-LABEL: v_fshr_i24:
1138; R600:       ; %bb.0:
1139; R600-NEXT:    CF_END
1140; R600-NEXT:    PAD
1141;
1142; GFX10-LABEL: v_fshr_i24:
1143; GFX10:       ; %bb.0:
1144; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1145; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1146; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
1147; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1148; GFX10-NEXT:    v_mul_hi_u32 v3, 0xaaaaaaab, v3
1149; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1150; GFX10-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1151; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
1152; GFX10-NEXT:    v_add_nc_u32_e32 v2, 8, v2
1153; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1154; GFX10-NEXT:    s_setpc_b64 s[30:31]
1155  %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
1156  ret i24 %ret
1157}
1158
; Funnel-shift-right of a <2 x i24> vector (llvm.fshr.v2i24): the scalar i24
; urem-by-24 magic-multiply + v_alignbit_b32 expansion applied per element.
; CHECK lines are autogenerated by update_llc_test_checks.py.
1159define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
1160; SI-LABEL: v_fshr_v2i24:
1161; SI:       ; %bb.0:
1162; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1163; SI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
1164; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1165; SI-NEXT:    v_mul_hi_u32 v6, v6, s4
1166; SI-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
1167; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1168; SI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1169; SI-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
1170; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
1171; SI-NEXT:    v_mul_hi_u32 v6, v7, s4
1172; SI-NEXT:    v_add_i32_e32 v4, vcc, 8, v4
1173; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1174; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1175; SI-NEXT:    v_lshrrev_b32_e32 v3, 4, v6
1176; SI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1177; SI-NEXT:    v_sub_i32_e32 v3, vcc, v5, v3
1178; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v3
1179; SI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1180; SI-NEXT:    s_setpc_b64 s[30:31]
1181;
1182; VI-LABEL: v_fshr_v2i24:
1183; VI:       ; %bb.0:
1184; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1185; VI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
1186; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1187; VI-NEXT:    v_mul_hi_u32 v6, v6, s4
1188; VI-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
1189; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1190; VI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1191; VI-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
1192; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
1193; VI-NEXT:    v_mul_hi_u32 v6, v7, s4
1194; VI-NEXT:    v_add_u32_e32 v4, vcc, 8, v4
1195; VI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1196; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1197; VI-NEXT:    v_lshrrev_b32_e32 v3, 4, v6
1198; VI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1199; VI-NEXT:    v_sub_u32_e32 v3, vcc, v5, v3
1200; VI-NEXT:    v_add_u32_e32 v3, vcc, 8, v3
1201; VI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1202; VI-NEXT:    s_setpc_b64 s[30:31]
1203;
1204; GFX9-LABEL: v_fshr_v2i24:
1205; GFX9:       ; %bb.0:
1206; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1207; GFX9-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
1208; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1209; GFX9-NEXT:    v_mul_hi_u32 v6, v6, s4
1210; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
1211; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1212; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1213; GFX9-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
1214; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
1215; GFX9-NEXT:    v_mul_hi_u32 v6, v7, s4
1216; GFX9-NEXT:    v_add_u32_e32 v4, 8, v4
1217; GFX9-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1218; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1219; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 4, v6
1220; GFX9-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
1221; GFX9-NEXT:    v_sub_u32_e32 v3, v5, v3
1222; GFX9-NEXT:    v_add_u32_e32 v3, 8, v3
1223; GFX9-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1224; GFX9-NEXT:    s_setpc_b64 s[30:31]
1225;
1226; R600-LABEL: v_fshr_v2i24:
1227; R600:       ; %bb.0:
1228; R600-NEXT:    CF_END
1229; R600-NEXT:    PAD
1230;
1231; GFX10-LABEL: v_fshr_v2i24:
1232; GFX10:       ; %bb.0:
1233; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1234; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1235; GFX10-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
1236; GFX10-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
1237; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1238; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
1239; GFX10-NEXT:    v_mul_hi_u32 v6, 0xaaaaaaab, v6
1240; GFX10-NEXT:    v_mul_hi_u32 v7, 0xaaaaaaab, v7
1241; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1242; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
1243; GFX10-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
1244; GFX10-NEXT:    v_mul_u32_u24_e32 v7, 24, v7
1245; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
1246; GFX10-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
1247; GFX10-NEXT:    v_add_nc_u32_e32 v4, 8, v4
1248; GFX10-NEXT:    v_add_nc_u32_e32 v5, 8, v5
1249; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1250; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
1251; GFX10-NEXT:    s_setpc_b64 s[30:31]
1252  %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
1253  ret <2 x i24> %ret
1254}
1255