1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9
5; RUN: llc < %s -march=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s --check-prefix=R600
6; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10
7
; Declarations of the llvm.fshr (funnel shift right) intrinsic overloads:
; 32-bit, 16-bit, 64-bit and 24-bit element types, in scalar and vector forms.
8declare i32 @llvm.fshr.i32(i32, i32, i32)
9declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
10declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
11declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
12declare i16 @llvm.fshr.i16(i16, i16, i16)
13declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
14declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
15declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
16declare i64 @llvm.fshr.i64(i64, i64, i64)
17declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
18declare i24 @llvm.fshr.i24(i24, i24, i24)
19declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
20
; Kernel test: scalar i32 funnel shift right where all three operands
; (%x, %y and the shift amount %z) are kernel arguments; the result is stored
; through %in. The amdgcn runs are expected to select one v_alignbit_b32 and
; the r600 run one BIT_ALIGN_INT.
21define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
22; SI-LABEL: fshr_i32:
23; SI:       ; %bb.0: ; %entry
24; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
25; SI-NEXT:    s_load_dword s6, s[0:1], 0xd
26; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
27; SI-NEXT:    s_mov_b32 s3, 0xf000
28; SI-NEXT:    s_mov_b32 s2, -1
29; SI-NEXT:    s_waitcnt lgkmcnt(0)
30; SI-NEXT:    v_mov_b32_e32 v0, s5
31; SI-NEXT:    v_mov_b32_e32 v1, s6
32; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v1
33; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
34; SI-NEXT:    s_endpgm
35;
36; VI-LABEL: fshr_i32:
37; VI:       ; %bb.0: ; %entry
38; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
39; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
40; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
41; VI-NEXT:    s_waitcnt lgkmcnt(0)
42; VI-NEXT:    v_mov_b32_e32 v0, s3
43; VI-NEXT:    v_mov_b32_e32 v1, s4
44; VI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
45; VI-NEXT:    v_mov_b32_e32 v0, s0
46; VI-NEXT:    v_mov_b32_e32 v1, s1
47; VI-NEXT:    flat_store_dword v[0:1], v2
48; VI-NEXT:    s_endpgm
49;
50; GFX9-LABEL: fshr_i32:
51; GFX9:       ; %bb.0: ; %entry
52; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
53; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x34
54; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
55; GFX9-NEXT:    v_mov_b32_e32 v0, 0
56; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
57; GFX9-NEXT:    v_mov_b32_e32 v1, s3
58; GFX9-NEXT:    v_mov_b32_e32 v2, s6
59; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, v2
60; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
61; GFX9-NEXT:    s_endpgm
62;
63; R600-LABEL: fshr_i32:
64; R600:       ; %bb.0: ; %entry
65; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
66; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
67; R600-NEXT:    CF_END
68; R600-NEXT:    PAD
69; R600-NEXT:    ALU clause starting at 4:
70; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
71; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
72; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
73;
74; GFX10-LABEL: fshr_i32:
75; GFX10:       ; %bb.0: ; %entry
76; GFX10-NEXT:    s_clause 0x2
77; GFX10-NEXT:    s_load_dword s6, s[0:1], 0x34
78; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
79; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
80; GFX10-NEXT:    v_mov_b32_e32 v1, 0
81; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
82; GFX10-NEXT:    v_mov_b32_e32 v0, s6
83; GFX10-NEXT:    v_alignbit_b32 v0, s2, s3, v0
84; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
85; GFX10-NEXT:    s_endpgm
86entry:
87  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
88  store i32 %0, i32 addrspace(1)* %in
89  ret void
90}
91
; Kernel test: scalar i32 funnel shift right by the constant amount 7.
; The checks expect the constant to appear directly as the last operand of
; v_alignbit_b32 on the amdgcn runs, and as a literal operand of
; BIT_ALIGN_INT on the r600 run.
92define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
93; SI-LABEL: fshr_i32_imm:
94; SI:       ; %bb.0: ; %entry
95; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
96; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
97; SI-NEXT:    s_mov_b32 s3, 0xf000
98; SI-NEXT:    s_mov_b32 s2, -1
99; SI-NEXT:    s_waitcnt lgkmcnt(0)
100; SI-NEXT:    v_mov_b32_e32 v0, s5
101; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 7
102; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
103; SI-NEXT:    s_endpgm
104;
105; VI-LABEL: fshr_i32_imm:
106; VI:       ; %bb.0: ; %entry
107; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
108; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
109; VI-NEXT:    s_waitcnt lgkmcnt(0)
110; VI-NEXT:    v_mov_b32_e32 v0, s3
111; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 7
112; VI-NEXT:    v_mov_b32_e32 v0, s0
113; VI-NEXT:    v_mov_b32_e32 v1, s1
114; VI-NEXT:    flat_store_dword v[0:1], v2
115; VI-NEXT:    s_endpgm
116;
117; GFX9-LABEL: fshr_i32_imm:
118; GFX9:       ; %bb.0: ; %entry
119; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
120; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
121; GFX9-NEXT:    v_mov_b32_e32 v0, 0
122; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX9-NEXT:    v_mov_b32_e32 v1, s3
124; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 7
125; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
126; GFX9-NEXT:    s_endpgm
127;
128; R600-LABEL: fshr_i32_imm:
129; R600:       ; %bb.0: ; %entry
130; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
131; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
132; R600-NEXT:    CF_END
133; R600-NEXT:    PAD
134; R600-NEXT:    ALU clause starting at 4:
135; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
136; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
137; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
138; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
139;
140; GFX10-LABEL: fshr_i32_imm:
141; GFX10:       ; %bb.0: ; %entry
142; GFX10-NEXT:    s_clause 0x1
143; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
144; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
145; GFX10-NEXT:    v_mov_b32_e32 v0, 0
146; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 7
148; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
149; GFX10-NEXT:    s_endpgm
150entry:
151  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
152  store i32 %0, i32 addrspace(1)* %in
153  ret void
154}
155
; Kernel test: <2 x i32> funnel shift right with variable per-element shift
; amounts taken from the kernel argument %z. The checks expect one
; v_alignbit_b32 (amdgcn) or BIT_ALIGN_INT (r600) per vector element,
; followed by a single two-dword store.
156define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
157; SI-LABEL: fshr_v2i32:
158; SI:       ; %bb.0: ; %entry
159; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
160; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xf
161; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
162; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
163; SI-NEXT:    s_mov_b32 s7, 0xf000
164; SI-NEXT:    s_waitcnt lgkmcnt(0)
165; SI-NEXT:    v_mov_b32_e32 v0, s3
166; SI-NEXT:    v_mov_b32_e32 v1, s9
167; SI-NEXT:    v_mov_b32_e32 v2, s8
168; SI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
169; SI-NEXT:    v_mov_b32_e32 v0, s2
170; SI-NEXT:    s_mov_b32 s6, -1
171; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v2
172; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
173; SI-NEXT:    s_endpgm
174;
175; VI-LABEL: fshr_v2i32:
176; VI:       ; %bb.0: ; %entry
177; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
178; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x3c
179; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
180; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
181; VI-NEXT:    s_waitcnt lgkmcnt(0)
182; VI-NEXT:    v_mov_b32_e32 v0, s3
183; VI-NEXT:    v_mov_b32_e32 v1, s5
184; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
185; VI-NEXT:    v_mov_b32_e32 v0, s2
186; VI-NEXT:    v_mov_b32_e32 v2, s4
187; VI-NEXT:    v_alignbit_b32 v0, s6, v0, v2
188; VI-NEXT:    v_mov_b32_e32 v3, s1
189; VI-NEXT:    v_mov_b32_e32 v2, s0
190; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
191; VI-NEXT:    s_endpgm
192;
193; GFX9-LABEL: fshr_v2i32:
194; GFX9:       ; %bb.0: ; %entry
195; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
196; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x3c
197; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
198; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
199; GFX9-NEXT:    v_mov_b32_e32 v2, 0
200; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX9-NEXT:    v_mov_b32_e32 v0, s3
202; GFX9-NEXT:    v_mov_b32_e32 v1, s5
203; GFX9-NEXT:    v_alignbit_b32 v1, s7, v0, v1
204; GFX9-NEXT:    v_mov_b32_e32 v0, s2
205; GFX9-NEXT:    v_mov_b32_e32 v3, s4
206; GFX9-NEXT:    v_alignbit_b32 v0, s6, v0, v3
207; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
208; GFX9-NEXT:    s_endpgm
209;
210; R600-LABEL: fshr_v2i32:
211; R600:       ; %bb.0: ; %entry
212; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
213; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
214; R600-NEXT:    CF_END
215; R600-NEXT:    PAD
216; R600-NEXT:    ALU clause starting at 4:
217; R600-NEXT:     MOV * T0.W, KC0[4].X,
218; R600-NEXT:     BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
219; R600-NEXT:     MOV * T0.W, KC0[3].W,
220; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
221; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
222; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
223;
224; GFX10-LABEL: fshr_v2i32:
225; GFX10:       ; %bb.0: ; %entry
226; GFX10-NEXT:    s_clause 0x3
227; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
228; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
229; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
230; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
231; GFX10-NEXT:    v_mov_b32_e32 v3, 0
232; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
233; GFX10-NEXT:    v_mov_b32_e32 v0, s3
234; GFX10-NEXT:    v_mov_b32_e32 v2, s2
235; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, v0
236; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, v2
237; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[8:9]
238; GFX10-NEXT:    s_endpgm
239entry:
240  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
241  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
242  ret void
243}
244
; Kernel test: <2 x i32> funnel shift right by the constant vector <7, 9>.
; The checks expect the element-0 result to use the immediate 7 and the
; element-1 result to use the immediate 9, one alignbit per element.
245define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
246; SI-LABEL: fshr_v2i32_imm:
247; SI:       ; %bb.0: ; %entry
248; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
249; SI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xb
250; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
251; SI-NEXT:    s_mov_b32 s3, 0xf000
252; SI-NEXT:    s_mov_b32 s2, -1
253; SI-NEXT:    s_waitcnt lgkmcnt(0)
254; SI-NEXT:    v_mov_b32_e32 v0, s5
255; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 9
256; SI-NEXT:    v_mov_b32_e32 v0, s4
257; SI-NEXT:    v_alignbit_b32 v0, s6, v0, 7
258; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
259; SI-NEXT:    s_endpgm
260;
261; VI-LABEL: fshr_v2i32_imm:
262; VI:       ; %bb.0: ; %entry
263; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
264; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
265; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
266; VI-NEXT:    s_waitcnt lgkmcnt(0)
267; VI-NEXT:    v_mov_b32_e32 v0, s3
268; VI-NEXT:    v_mov_b32_e32 v2, s2
269; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
270; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
271; VI-NEXT:    v_mov_b32_e32 v3, s1
272; VI-NEXT:    v_mov_b32_e32 v2, s0
273; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
274; VI-NEXT:    s_endpgm
275;
276; GFX9-LABEL: fshr_v2i32_imm:
277; GFX9:       ; %bb.0: ; %entry
278; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
279; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
280; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
281; GFX9-NEXT:    v_mov_b32_e32 v2, 0
282; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
283; GFX9-NEXT:    v_mov_b32_e32 v0, s3
284; GFX9-NEXT:    v_mov_b32_e32 v3, s2
285; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
286; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 7
287; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
288; GFX9-NEXT:    s_endpgm
289;
290; R600-LABEL: fshr_v2i32_imm:
291; R600:       ; %bb.0: ; %entry
292; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
293; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
294; R600-NEXT:    CF_END
295; R600-NEXT:    PAD
296; R600-NEXT:    ALU clause starting at 4:
297; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
298; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
299; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
300; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
301; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
302; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
303;
304; GFX10-LABEL: fshr_v2i32_imm:
305; GFX10:       ; %bb.0: ; %entry
306; GFX10-NEXT:    s_clause 0x2
307; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
308; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
309; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
310; GFX10-NEXT:    v_mov_b32_e32 v2, 0
311; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
312; GFX10-NEXT:    v_alignbit_b32 v1, s3, s5, 9
313; GFX10-NEXT:    v_alignbit_b32 v0, s2, s4, 7
314; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
315; GFX10-NEXT:    s_endpgm
316entry:
317  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
318  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
319  ret void
320}
321
; Kernel test: <4 x i32> funnel shift right with variable per-element shift
; amounts taken from the kernel argument %z. The checks expect four
; v_alignbit_b32 (amdgcn) or BIT_ALIGN_INT (r600) instructions, one per
; element, followed by a single four-dword store.
322define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
323; SI-LABEL: fshr_v4i32:
324; SI:       ; %bb.0: ; %entry
325; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
326; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x15
327; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
328; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
329; SI-NEXT:    s_mov_b32 s15, 0xf000
330; SI-NEXT:    s_waitcnt lgkmcnt(0)
331; SI-NEXT:    v_mov_b32_e32 v0, s7
332; SI-NEXT:    v_mov_b32_e32 v1, s11
333; SI-NEXT:    v_mov_b32_e32 v4, s8
334; SI-NEXT:    v_alignbit_b32 v3, s3, v0, v1
335; SI-NEXT:    v_mov_b32_e32 v0, s6
336; SI-NEXT:    v_mov_b32_e32 v1, s10
337; SI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
338; SI-NEXT:    v_mov_b32_e32 v0, s5
339; SI-NEXT:    v_mov_b32_e32 v1, s9
340; SI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
341; SI-NEXT:    v_mov_b32_e32 v0, s4
342; SI-NEXT:    s_mov_b32 s14, -1
343; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v4
344; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
345; SI-NEXT:    s_endpgm
346;
347; VI-LABEL: fshr_v4i32:
348; VI:       ; %bb.0: ; %entry
349; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
350; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x54
351; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
352; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
353; VI-NEXT:    s_waitcnt lgkmcnt(0)
354; VI-NEXT:    v_mov_b32_e32 v0, s7
355; VI-NEXT:    v_mov_b32_e32 v1, s11
356; VI-NEXT:    v_alignbit_b32 v3, s15, v0, v1
357; VI-NEXT:    v_mov_b32_e32 v0, s6
358; VI-NEXT:    v_mov_b32_e32 v1, s10
359; VI-NEXT:    v_alignbit_b32 v2, s14, v0, v1
360; VI-NEXT:    v_mov_b32_e32 v0, s5
361; VI-NEXT:    v_mov_b32_e32 v1, s9
362; VI-NEXT:    v_alignbit_b32 v1, s13, v0, v1
363; VI-NEXT:    v_mov_b32_e32 v0, s4
364; VI-NEXT:    v_mov_b32_e32 v4, s8
365; VI-NEXT:    v_alignbit_b32 v0, s12, v0, v4
366; VI-NEXT:    v_mov_b32_e32 v5, s1
367; VI-NEXT:    v_mov_b32_e32 v4, s0
368; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
369; VI-NEXT:    s_endpgm
370;
371; GFX9-LABEL: fshr_v4i32:
372; GFX9:       ; %bb.0: ; %entry
373; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
374; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x54
375; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
376; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
377; GFX9-NEXT:    v_mov_b32_e32 v4, 0
378; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
379; GFX9-NEXT:    v_mov_b32_e32 v0, s7
380; GFX9-NEXT:    v_mov_b32_e32 v1, s11
381; GFX9-NEXT:    v_alignbit_b32 v3, s15, v0, v1
382; GFX9-NEXT:    v_mov_b32_e32 v0, s6
383; GFX9-NEXT:    v_mov_b32_e32 v1, s10
384; GFX9-NEXT:    v_alignbit_b32 v2, s14, v0, v1
385; GFX9-NEXT:    v_mov_b32_e32 v0, s5
386; GFX9-NEXT:    v_mov_b32_e32 v1, s9
387; GFX9-NEXT:    v_alignbit_b32 v1, s13, v0, v1
388; GFX9-NEXT:    v_mov_b32_e32 v0, s4
389; GFX9-NEXT:    v_mov_b32_e32 v5, s8
390; GFX9-NEXT:    v_alignbit_b32 v0, s12, v0, v5
391; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
392; GFX9-NEXT:    s_endpgm
393;
394; R600-LABEL: fshr_v4i32:
395; R600:       ; %bb.0: ; %entry
396; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
397; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
398; R600-NEXT:    CF_END
399; R600-NEXT:    PAD
400; R600-NEXT:    ALU clause starting at 4:
401; R600-NEXT:     MOV * T0.W, KC0[6].X,
402; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
403; R600-NEXT:     MOV * T1.W, KC0[5].W,
404; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
405; R600-NEXT:     MOV * T1.W, KC0[5].Z,
406; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
407; R600-NEXT:     MOV * T1.W, KC0[5].Y,
408; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
409; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
410; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
411;
412; GFX10-LABEL: fshr_v4i32:
413; GFX10:       ; %bb.0: ; %entry
414; GFX10-NEXT:    s_clause 0x3
415; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x54
416; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
417; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
418; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
419; GFX10-NEXT:    v_mov_b32_e32 v6, 0
420; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
421; GFX10-NEXT:    v_mov_b32_e32 v0, s7
422; GFX10-NEXT:    v_mov_b32_e32 v1, s6
423; GFX10-NEXT:    v_mov_b32_e32 v4, s5
424; GFX10-NEXT:    v_mov_b32_e32 v5, s4
425; GFX10-NEXT:    v_alignbit_b32 v3, s15, s11, v0
426; GFX10-NEXT:    v_alignbit_b32 v2, s14, s10, v1
427; GFX10-NEXT:    v_alignbit_b32 v1, s13, s9, v4
428; GFX10-NEXT:    v_alignbit_b32 v0, s12, s8, v5
429; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[2:3]
430; GFX10-NEXT:    s_endpgm
431entry:
432  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
433  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
434  ret void
435}
436
; Kernel test: <4 x i32> funnel shift right by the constant vector
; <1, 7, 9, 33>. The last element's amount (33) is larger than the 32-bit
; element width; the checks expect it to appear as 1 in the generated code
; (the LLVM language reference defines the fshr amount modulo the element
; bit width), so elements 0 and 3 both use the immediate 1.
437define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
438; SI-LABEL: fshr_v4i32_imm:
439; SI:       ; %bb.0: ; %entry
440; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
441; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
442; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
443; SI-NEXT:    s_mov_b32 s3, 0xf000
444; SI-NEXT:    s_mov_b32 s2, -1
445; SI-NEXT:    s_waitcnt lgkmcnt(0)
446; SI-NEXT:    v_mov_b32_e32 v0, s7
447; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
448; SI-NEXT:    v_mov_b32_e32 v0, s6
449; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 9
450; SI-NEXT:    v_mov_b32_e32 v0, s5
451; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
452; SI-NEXT:    v_mov_b32_e32 v0, s4
453; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
454; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
455; SI-NEXT:    s_endpgm
456;
457; VI-LABEL: fshr_v4i32_imm:
458; VI:       ; %bb.0: ; %entry
459; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
460; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
461; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
462; VI-NEXT:    s_waitcnt lgkmcnt(0)
463; VI-NEXT:    v_mov_b32_e32 v0, s7
464; VI-NEXT:    v_mov_b32_e32 v1, s6
465; VI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
466; VI-NEXT:    v_mov_b32_e32 v0, s5
467; VI-NEXT:    v_alignbit_b32 v2, s10, v1, 9
468; VI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
469; VI-NEXT:    v_mov_b32_e32 v0, s4
470; VI-NEXT:    v_mov_b32_e32 v5, s1
471; VI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
472; VI-NEXT:    v_mov_b32_e32 v4, s0
473; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
474; VI-NEXT:    s_endpgm
475;
476; GFX9-LABEL: fshr_v4i32_imm:
477; GFX9:       ; %bb.0: ; %entry
478; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
479; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
480; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
481; GFX9-NEXT:    v_mov_b32_e32 v4, 0
482; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
483; GFX9-NEXT:    v_mov_b32_e32 v0, s7
484; GFX9-NEXT:    v_mov_b32_e32 v1, s6
485; GFX9-NEXT:    v_alignbit_b32 v3, s11, v0, 1
486; GFX9-NEXT:    v_mov_b32_e32 v0, s5
487; GFX9-NEXT:    v_alignbit_b32 v2, s10, v1, 9
488; GFX9-NEXT:    v_alignbit_b32 v1, s9, v0, 7
489; GFX9-NEXT:    v_mov_b32_e32 v0, s4
490; GFX9-NEXT:    v_alignbit_b32 v0, s8, v0, 1
491; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
492; GFX9-NEXT:    s_endpgm
493;
494; R600-LABEL: fshr_v4i32_imm:
495; R600:       ; %bb.0: ; %entry
496; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
497; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
498; R600-NEXT:    CF_END
499; R600-NEXT:    PAD
500; R600-NEXT:    ALU clause starting at 4:
501; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
502; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
503; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
504; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
505; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
506; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
507; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
508; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
509;
510; GFX10-LABEL: fshr_v4i32_imm:
511; GFX10:       ; %bb.0: ; %entry
512; GFX10-NEXT:    s_clause 0x2
513; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
514; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
515; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
516; GFX10-NEXT:    v_mov_b32_e32 v4, 0
517; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
518; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, 1
519; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, 9
520; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, 7
521; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, 1
522; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
523; GFX10-NEXT:    s_endpgm
524entry:
525  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
526  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
527  ret void
528}
529
; Non-kernel function test: scalar i32 funnel shift right whose three operands
; are the function arguments and whose result is returned. The tahiti, tonga
; and gfx900 runs share one set of checks (a single v_alignbit_b32 on v0..v2);
; gfx1010 additionally expects s_waitcnt_vscnt. The r600 checks contain only
; CF_END and PAD.
530define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
531; GFX89-LABEL: v_fshr_i32:
532; GFX89:       ; %bb.0:
533; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534; GFX89-NEXT:    v_alignbit_b32 v0, v0, v1, v2
535; GFX89-NEXT:    s_setpc_b64 s[30:31]
536;
537; R600-LABEL: v_fshr_i32:
538; R600:       ; %bb.0:
539; R600-NEXT:    CF_END
540; R600-NEXT:    PAD
541;
542; GFX10-LABEL: v_fshr_i32:
543; GFX10:       ; %bb.0:
544; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
545; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
546; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
547; GFX10-NEXT:    s_setpc_b64 s[30:31]
548  %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
549  ret i32 %ret
550}
551
; Non-kernel function test: <2 x i32> funnel shift right on function
; arguments. The amdgcn checks expect two v_alignbit_b32 instructions, one per
; element, operating in place on the argument registers. The r600 checks
; contain only CF_END and PAD.
552define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
553; GFX89-LABEL: v_fshr_v2i32:
554; GFX89:       ; %bb.0:
555; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
556; GFX89-NEXT:    v_alignbit_b32 v0, v0, v2, v4
557; GFX89-NEXT:    v_alignbit_b32 v1, v1, v3, v5
558; GFX89-NEXT:    s_setpc_b64 s[30:31]
559;
560; R600-LABEL: v_fshr_v2i32:
561; R600:       ; %bb.0:
562; R600-NEXT:    CF_END
563; R600-NEXT:    PAD
564;
565; GFX10-LABEL: v_fshr_v2i32:
566; GFX10:       ; %bb.0:
567; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
568; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
569; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
570; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
571; GFX10-NEXT:    s_setpc_b64 s[30:31]
572  %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
573  ret <2 x i32> %ret
574}
575
; Non-kernel function test: <3 x i32> funnel shift right on function
; arguments. The amdgcn checks expect exactly three v_alignbit_b32
; instructions, one per element. The r600 checks contain only CF_END and PAD.
576define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
577; GFX89-LABEL: v_fshr_v3i32:
578; GFX89:       ; %bb.0:
579; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
580; GFX89-NEXT:    v_alignbit_b32 v0, v0, v3, v6
581; GFX89-NEXT:    v_alignbit_b32 v1, v1, v4, v7
582; GFX89-NEXT:    v_alignbit_b32 v2, v2, v5, v8
583; GFX89-NEXT:    s_setpc_b64 s[30:31]
584;
585; R600-LABEL: v_fshr_v3i32:
586; R600:       ; %bb.0:
587; R600-NEXT:    CF_END
588; R600-NEXT:    PAD
589;
590; GFX10-LABEL: v_fshr_v3i32:
591; GFX10:       ; %bb.0:
592; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
593; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
594; GFX10-NEXT:    v_alignbit_b32 v0, v0, v3, v6
595; GFX10-NEXT:    v_alignbit_b32 v1, v1, v4, v7
596; GFX10-NEXT:    v_alignbit_b32 v2, v2, v5, v8
597; GFX10-NEXT:    s_setpc_b64 s[30:31]
598  %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
599  ret <3 x i32> %ret
600}
601
; Non-kernel function test: <4 x i32> funnel shift right on function
; arguments. The amdgcn checks expect four v_alignbit_b32 instructions, one
; per element. The r600 checks contain only CF_END and PAD.
602define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
603; GFX89-LABEL: v_fshr_v4i32:
604; GFX89:       ; %bb.0:
605; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
606; GFX89-NEXT:    v_alignbit_b32 v0, v0, v4, v8
607; GFX89-NEXT:    v_alignbit_b32 v1, v1, v5, v9
608; GFX89-NEXT:    v_alignbit_b32 v2, v2, v6, v10
609; GFX89-NEXT:    v_alignbit_b32 v3, v3, v7, v11
610; GFX89-NEXT:    s_setpc_b64 s[30:31]
611;
612; R600-LABEL: v_fshr_v4i32:
613; R600:       ; %bb.0:
614; R600-NEXT:    CF_END
615; R600-NEXT:    PAD
616;
617; GFX10-LABEL: v_fshr_v4i32:
618; GFX10:       ; %bb.0:
619; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
621; GFX10-NEXT:    v_alignbit_b32 v0, v0, v4, v8
622; GFX10-NEXT:    v_alignbit_b32 v1, v1, v5, v9
623; GFX10-NEXT:    v_alignbit_b32 v2, v2, v6, v10
624; GFX10-NEXT:    v_alignbit_b32 v3, v3, v7, v11
625; GFX10-NEXT:    s_setpc_b64 s[30:31]
626  %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
627  ret <4 x i32> %ret
628}
629
; Non-kernel function test: scalar i16 funnel shift right. Expected lowering
; differs per target:
;  - tahiti: the shift amount is OR'ed with 16, %src1 is shifted left by 16,
;    and a 32-bit v_alignbit_b32 produces the result.
;  - tonga, gfx900, gfx1010: 16-bit shifts are used instead; %src0 is shifted
;    left by 1 and then by the inverted amount, %src1 is shifted right by the
;    amount, and the two halves are OR'ed together.
; The r600 checks contain only CF_END and PAD.
630define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
631; SI-LABEL: v_fshr_i16:
632; SI:       ; %bb.0:
633; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
634; SI-NEXT:    v_or_b32_e32 v2, 16, v2
635; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
636; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
637; SI-NEXT:    s_setpc_b64 s[30:31]
638;
639; VI-LABEL: v_fshr_i16:
640; VI:       ; %bb.0:
641; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
642; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
643; VI-NEXT:    v_xor_b32_e32 v3, -1, v2
644; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
645; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
646; VI-NEXT:    v_or_b32_e32 v0, v0, v1
647; VI-NEXT:    s_setpc_b64 s[30:31]
648;
649; GFX9-LABEL: v_fshr_i16:
650; GFX9:       ; %bb.0:
651; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
653; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
654; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
655; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
656; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
657; GFX9-NEXT:    s_setpc_b64 s[30:31]
658;
659; R600-LABEL: v_fshr_i16:
660; R600:       ; %bb.0:
661; R600-NEXT:    CF_END
662; R600-NEXT:    PAD
663;
664; GFX10-LABEL: v_fshr_i16:
665; GFX10:       ; %bb.0:
666; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
667; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
668; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
669; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
670; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
671; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
672; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
673; GFX10-NEXT:    s_setpc_b64 s[30:31]
674  %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
675  ret i16 %ret
676}
677
; Non-kernel function test: <2 x i16> funnel shift right. Expected lowering
; differs per target:
;  - tahiti: each element is computed with the OR-16 / shift-left-16 /
;    v_alignbit_b32 sequence, then the two results are combined with
;    shift, mask and OR instructions.
;  - tonga: 16-bit shifts, using SDWA forms to read the high halves and to
;    write the high half of the result.
;  - gfx900, gfx1010: packed v_pk_lshlrev_b16 / v_pk_lshrrev_b16, with both
;    the amount and its inverse masked by 0xf000f first.
; The r600 checks contain only CF_END and PAD.
678define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
679; SI-LABEL: v_fshr_v2i16:
680; SI:       ; %bb.0:
681; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
682; SI-NEXT:    v_or_b32_e32 v5, 16, v5
683; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
684; SI-NEXT:    v_alignbit_b32 v1, v1, v3, v5
685; SI-NEXT:    v_or_b32_e32 v3, 16, v4
686; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
687; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v3
688; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
689; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
690; SI-NEXT:    v_or_b32_e32 v0, v0, v1
691; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
692; SI-NEXT:    s_setpc_b64 s[30:31]
693;
694; VI-LABEL: v_fshr_v2i16:
695; VI:       ; %bb.0:
696; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
698; VI-NEXT:    v_mov_b32_e32 v5, 1
699; VI-NEXT:    v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
700; VI-NEXT:    v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
701; VI-NEXT:    v_xor_b32_e32 v3, -1, v3
702; VI-NEXT:    v_lshlrev_b16_e32 v3, v3, v5
703; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
704; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
705; VI-NEXT:    v_xor_b32_e32 v4, -1, v2
706; VI-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
707; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
708; VI-NEXT:    v_or_b32_e32 v0, v0, v1
709; VI-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
710; VI-NEXT:    s_setpc_b64 s[30:31]
711;
712; GFX9-LABEL: v_fshr_v2i16:
713; GFX9:       ; %bb.0:
714; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
715; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
716; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
717; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
718; GFX9-NEXT:    v_and_b32_e32 v3, s4, v3
719; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
720; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
721; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
722; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
723; GFX9-NEXT:    s_setpc_b64 s[30:31]
724;
725; R600-LABEL: v_fshr_v2i16:
726; R600:       ; %bb.0:
727; R600-NEXT:    CF_END
728; R600-NEXT:    PAD
729;
730; GFX10-LABEL: v_fshr_v2i16:
731; GFX10:       ; %bb.0:
732; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
733; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
734; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
735; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
736; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
737; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
738; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
739; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
740; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
741; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
742; GFX10-NEXT:    s_setpc_b64 s[30:31]
743  %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
744  ret <2 x i16> %ret
745}
746
; Funnel-shift-right test for <3 x i16>: a non-kernel function that passes its
; three vector arguments straight to llvm.fshr.v3i16 and returns the result.
; The per-target check lines below are autogenerated (see the note on the first
; line of this file); regenerate them with the update script instead of editing
; them by hand.
747define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
748; SI-LABEL: v_fshr_v3i16:
749; SI:       ; %bb.0:
750; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
751; SI-NEXT:    v_or_b32_e32 v7, 16, v7
752; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
753; SI-NEXT:    v_alignbit_b32 v1, v1, v4, v7
754; SI-NEXT:    v_or_b32_e32 v4, 16, v6
755; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
756; SI-NEXT:    v_alignbit_b32 v0, v0, v3, v4
757; SI-NEXT:    s_mov_b32 s4, 0xffff
758; SI-NEXT:    v_or_b32_e32 v3, 16, v8
759; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
760; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
761; SI-NEXT:    v_and_b32_e32 v0, s4, v0
762; SI-NEXT:    v_alignbit_b32 v3, v2, v4, v3
763; SI-NEXT:    v_or_b32_e32 v0, v0, v1
764; SI-NEXT:    v_and_b32_e32 v2, s4, v3
765; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
766; SI-NEXT:    s_setpc_b64 s[30:31]
767;
768; VI-LABEL: v_fshr_v3i16:
769; VI:       ; %bb.0:
770; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
771; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
772; VI-NEXT:    v_mov_b32_e32 v8, 1
773; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
774; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
775; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
776; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
777; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
778; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
779; VI-NEXT:    v_xor_b32_e32 v7, -1, v5
780; VI-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
781; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
782; VI-NEXT:    v_or_b32_e32 v1, v1, v3
783; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
784; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
785; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
786; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
787; VI-NEXT:    v_or_b32_e32 v0, v0, v2
788; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
789; VI-NEXT:    s_setpc_b64 s[30:31]
790;
791; GFX9-LABEL: v_fshr_v3i16:
792; GFX9:       ; %bb.0:
793; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
794; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
795; GFX9-NEXT:    v_mov_b32_e32 v8, 1
796; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
797; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
798; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
799; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
800; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
801; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
802; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v5
803; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
804; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
805; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
806; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
807; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
808; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
809; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
810; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
811; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
812; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
813; GFX9-NEXT:    s_setpc_b64 s[30:31]
814;
815; R600-LABEL: v_fshr_v3i16:
816; R600:       ; %bb.0:
817; R600-NEXT:    CF_END
818; R600-NEXT:    PAD
819;
820; GFX10-LABEL: v_fshr_v3i16:
821; GFX10:       ; %bb.0:
822; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
823; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
824; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
825; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
826; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
827; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
828; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
829; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v6
830; GFX10-NEXT:    v_lshlrev_b16 v7, 1, v7
831; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
832; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
833; GFX10-NEXT:    v_lshrrev_b16 v4, v6, v9
834; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
835; GFX10-NEXT:    v_lshlrev_b16 v6, v10, v7
836; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
837; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
838; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v5
839; GFX10-NEXT:    v_or_b32_e32 v4, v6, v4
840; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
841; GFX10-NEXT:    v_lshlrev_b16 v1, v2, v1
842; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
843; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
844; GFX10-NEXT:    s_setpc_b64 s[30:31]
845  %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
846  ret <3 x i16> %ret
847}
848
; Funnel-shift-right test for <4 x i16>: a non-kernel function that passes its
; three vector arguments straight to llvm.fshr.v4i16 and returns the result.
; The per-target check lines below are autogenerated (see the note on the first
; line of this file); regenerate them with the update script instead of editing
; them by hand.
849define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
850; SI-LABEL: v_fshr_v4i16:
851; SI:       ; %bb.0:
852; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
853; SI-NEXT:    v_or_b32_e32 v9, 16, v9
854; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
855; SI-NEXT:    v_alignbit_b32 v1, v1, v5, v9
856; SI-NEXT:    v_or_b32_e32 v5, 16, v8
857; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
858; SI-NEXT:    v_alignbit_b32 v0, v0, v4, v5
859; SI-NEXT:    v_or_b32_e32 v4, 16, v11
860; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
861; SI-NEXT:    v_alignbit_b32 v3, v3, v5, v4
862; SI-NEXT:    v_or_b32_e32 v4, 16, v10
863; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
864; SI-NEXT:    s_mov_b32 s4, 0xffff
865; SI-NEXT:    v_alignbit_b32 v2, v2, v5, v4
866; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
867; SI-NEXT:    v_and_b32_e32 v2, s4, v2
868; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
869; SI-NEXT:    v_and_b32_e32 v0, s4, v0
870; SI-NEXT:    v_or_b32_e32 v2, v2, v3
871; SI-NEXT:    v_or_b32_e32 v0, v0, v1
872; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
873; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
874; SI-NEXT:    s_setpc_b64 s[30:31]
875;
876; VI-LABEL: v_fshr_v4i16:
877; VI:       ; %bb.0:
878; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
879; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
880; VI-NEXT:    v_mov_b32_e32 v8, 1
881; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
882; VI-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
883; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
884; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
885; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
886; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
887; VI-NEXT:    v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
888; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
889; VI-NEXT:    v_xor_b32_e32 v7, -1, v7
890; VI-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
891; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
892; VI-NEXT:    v_xor_b32_e32 v8, -1, v5
893; VI-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
894; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
895; VI-NEXT:    v_or_b32_e32 v1, v1, v3
896; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
897; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
898; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
899; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
900; VI-NEXT:    v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
901; VI-NEXT:    v_or_b32_e32 v0, v0, v2
902; VI-NEXT:    v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
903; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
904; VI-NEXT:    s_setpc_b64 s[30:31]
905;
906; GFX9-LABEL: v_fshr_v4i16:
907; GFX9:       ; %bb.0:
908; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
909; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
910; GFX9-NEXT:    v_mov_b32_e32 v8, 1
911; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
912; GFX9-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
913; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
914; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
915; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
916; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
917; GFX9-NEXT:    v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
918; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
919; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v7
920; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
921; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
922; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v5
923; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
924; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
925; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
926; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
927; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
928; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
929; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
930; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
931; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
932; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
933; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
934; GFX9-NEXT:    v_and_b32_e32 v1, v2, v1
935; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
936; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
937; GFX9-NEXT:    s_setpc_b64 s[30:31]
938;
939; R600-LABEL: v_fshr_v4i16:
940; R600:       ; %bb.0:
941; R600-NEXT:    CF_END
942; R600-NEXT:    PAD
943;
944; GFX10-LABEL: v_fshr_v4i16:
945; GFX10:       ; %bb.0:
946; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
947; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
948; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
949; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
950; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
951; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
952; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
953; GFX10-NEXT:    v_lshlrev_b16 v6, 1, v6
954; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v7
955; GFX10-NEXT:    v_lshrrev_b16 v7, v7, v8
956; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
957; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
958; GFX10-NEXT:    v_xor_b32_e32 v12, -1, v5
959; GFX10-NEXT:    v_lshlrev_b16 v6, v9, v6
960; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v4
961; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
962; GFX10-NEXT:    v_lshlrev_b16 v8, 1, v8
963; GFX10-NEXT:    v_xor_b32_e32 v13, -1, v11
964; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
965; GFX10-NEXT:    v_lshlrev_b16 v0, v9, v0
966; GFX10-NEXT:    v_lshlrev_b16 v1, v12, v1
967; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
968; GFX10-NEXT:    v_lshrrev_b16 v4, v11, v10
969; GFX10-NEXT:    v_lshlrev_b16 v5, v13, v8
970; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
971; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
972; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
973; GFX10-NEXT:    v_or_b32_e32 v3, v6, v7
974; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
975; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
976; GFX10-NEXT:    v_and_b32_e32 v1, v2, v1
977; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
978; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
979; GFX10-NEXT:    s_setpc_b64 s[30:31]
980  %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
981  ret <4 x i16> %ret
982}
983
; Funnel-shift-right test for scalar i64: a non-kernel function that passes its
; three arguments straight to llvm.fshr.i64 and returns the result.
; The per-target check lines below are autogenerated (see the note on the first
; line of this file); regenerate them with the update script instead of editing
; them by hand.
984define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
985; SI-LABEL: v_fshr_i64:
986; SI:       ; %bb.0:
987; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
988; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
989; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
990; SI-NEXT:    v_not_b32_e32 v4, v4
991; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
992; SI-NEXT:    v_or_b32_e32 v1, v1, v3
993; SI-NEXT:    v_or_b32_e32 v0, v0, v2
994; SI-NEXT:    s_setpc_b64 s[30:31]
995;
996; VI-LABEL: v_fshr_i64:
997; VI:       ; %bb.0:
998; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
999; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1000; VI-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1001; VI-NEXT:    v_not_b32_e32 v4, v4
1002; VI-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
1003; VI-NEXT:    v_or_b32_e32 v1, v1, v3
1004; VI-NEXT:    v_or_b32_e32 v0, v0, v2
1005; VI-NEXT:    s_setpc_b64 s[30:31]
1006;
1007; GFX9-LABEL: v_fshr_i64:
1008; GFX9:       ; %bb.0:
1009; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1010; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1011; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1012; GFX9-NEXT:    v_not_b32_e32 v4, v4
1013; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
1014; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
1015; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
1016; GFX9-NEXT:    s_setpc_b64 s[30:31]
1017;
1018; R600-LABEL: v_fshr_i64:
1019; R600:       ; %bb.0:
1020; R600-NEXT:    CF_END
1021; R600-NEXT:    PAD
1022;
1023; GFX10-LABEL: v_fshr_i64:
1024; GFX10:       ; %bb.0:
1025; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1026; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1027; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1028; GFX10-NEXT:    v_not_b32_e32 v5, v4
1029; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1030; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
1031; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
1032; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
1033; GFX10-NEXT:    s_setpc_b64 s[30:31]
1034  %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
1035  ret i64 %ret
1036}
1037
; Funnel-shift-right test for <2 x i64>: a non-kernel function that passes its
; three vector arguments straight to llvm.fshr.v2i64 and returns the result.
; The per-target check lines below are autogenerated (see the note on the first
; line of this file); regenerate them with the update script instead of editing
; them by hand.
1038define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
1039; SI-LABEL: v_fshr_v2i64:
1040; SI:       ; %bb.0:
1041; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1042; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
1043; SI-NEXT:    v_lshr_b64 v[4:5], v[4:5], v8
1044; SI-NEXT:    v_not_b32_e32 v8, v8
1045; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
1046; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
1047; SI-NEXT:    v_or_b32_e32 v1, v1, v5
1048; SI-NEXT:    v_lshr_b64 v[5:6], v[6:7], v10
1049; SI-NEXT:    v_not_b32_e32 v7, v10
1050; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
1051; SI-NEXT:    v_or_b32_e32 v0, v0, v4
1052; SI-NEXT:    v_or_b32_e32 v3, v3, v6
1053; SI-NEXT:    v_or_b32_e32 v2, v2, v5
1054; SI-NEXT:    s_setpc_b64 s[30:31]
1055;
1056; VI-LABEL: v_fshr_v2i64:
1057; VI:       ; %bb.0:
1058; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1059; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1060; VI-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1061; VI-NEXT:    v_not_b32_e32 v8, v8
1062; VI-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
1063; VI-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1064; VI-NEXT:    v_or_b32_e32 v1, v1, v5
1065; VI-NEXT:    v_lshrrev_b64 v[5:6], v10, v[6:7]
1066; VI-NEXT:    v_not_b32_e32 v7, v10
1067; VI-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
1068; VI-NEXT:    v_or_b32_e32 v0, v0, v4
1069; VI-NEXT:    v_or_b32_e32 v3, v3, v6
1070; VI-NEXT:    v_or_b32_e32 v2, v2, v5
1071; VI-NEXT:    s_setpc_b64 s[30:31]
1072;
1073; GFX9-LABEL: v_fshr_v2i64:
1074; GFX9:       ; %bb.0:
1075; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1076; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1077; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1078; GFX9-NEXT:    v_not_b32_e32 v8, v8
1079; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
1080; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1081; GFX9-NEXT:    v_or_b32_e32 v1, v1, v5
1082; GFX9-NEXT:    v_lshrrev_b64 v[5:6], v10, v[6:7]
1083; GFX9-NEXT:    v_not_b32_e32 v7, v10
1084; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
1085; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
1086; GFX9-NEXT:    v_or_b32_e32 v3, v3, v6
1087; GFX9-NEXT:    v_or_b32_e32 v2, v2, v5
1088; GFX9-NEXT:    s_setpc_b64 s[30:31]
1089;
1090; R600-LABEL: v_fshr_v2i64:
1091; R600:       ; %bb.0:
1092; R600-NEXT:    CF_END
1093; R600-NEXT:    PAD
1094;
1095; GFX10-LABEL: v_fshr_v2i64:
1096; GFX10:       ; %bb.0:
1097; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1098; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1099; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1100; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1101; GFX10-NEXT:    v_not_b32_e32 v9, v8
1102; GFX10-NEXT:    v_not_b32_e32 v11, v10
1103; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1104; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v10, v[6:7]
1105; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
1106; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
1107; GFX10-NEXT:    v_or_b32_e32 v0, v0, v4
1108; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
1109; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
1110; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
1111; GFX10-NEXT:    s_setpc_b64 s[30:31]
1112  %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
1113  ret <2 x i64> %ret
1114}
1115
; Funnel-shift-right test for the non-power-of-two type i24: a non-kernel
; function that passes its three arguments straight to llvm.fshr.i24 and
; returns the result.
; NOTE(review): the expected amdgcn code multiplies the shift amount by
; 0xaaaaaaab (high half), shifts right by 4, multiplies by 24 and subtracts,
; which looks like a reduction of the amount modulo 24 before v_alignbit_b32;
; confirm against the backend lowering.
; The per-target check lines below are autogenerated (see the note on the first
; line of this file); regenerate them with the update script instead of editing
; them by hand.
1116define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
1117; SI-LABEL: v_fshr_i24:
1118; SI:       ; %bb.0:
1119; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1120; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1121; SI-NEXT:    v_mul_hi_u32 v3, v2, s4
1122; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1123; SI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1124; SI-NEXT:    v_mul_lo_u32 v3, v3, 24
1125; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
1126; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
1127; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1128; SI-NEXT:    s_setpc_b64 s[30:31]
1129;
1130; VI-LABEL: v_fshr_i24:
1131; VI:       ; %bb.0:
1132; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1133; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1134; VI-NEXT:    v_mul_hi_u32 v3, v2, s4
1135; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1136; VI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1137; VI-NEXT:    v_mul_lo_u32 v3, v3, 24
1138; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
1139; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
1140; VI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1141; VI-NEXT:    s_setpc_b64 s[30:31]
1142;
1143; GFX9-LABEL: v_fshr_i24:
1144; GFX9:       ; %bb.0:
1145; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1146; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1147; GFX9-NEXT:    v_mul_hi_u32 v3, v2, s4
1148; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1149; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1150; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
1151; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
1152; GFX9-NEXT:    v_add_u32_e32 v2, 8, v2
1153; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1154; GFX9-NEXT:    s_setpc_b64 s[30:31]
1155;
1156; R600-LABEL: v_fshr_i24:
1157; R600:       ; %bb.0:
1158; R600-NEXT:    CF_END
1159; R600-NEXT:    PAD
1160;
1161; GFX10-LABEL: v_fshr_i24:
1162; GFX10:       ; %bb.0:
1163; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1164; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1165; GFX10-NEXT:    v_mul_hi_u32 v3, 0xaaaaaaab, v2
1166; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1167; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1168; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 24
1169; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
1170; GFX10-NEXT:    v_add_nc_u32_e32 v2, 8, v2
1171; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1172; GFX10-NEXT:    s_setpc_b64 s[30:31]
1173  %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
1174  ret i24 %ret
1175}
1176
; Funnel-shift-right test for <2 x i24>: a non-kernel function that passes its
; three vector arguments straight to llvm.fshr.v2i24 and returns the result.
; NOTE(review): as in the scalar i24 case, the expected amdgcn code applies the
; 0xaaaaaaab multiply-high / shift-by-4 / multiply-by-24 / subtract sequence to
; each lane's shift amount, which looks like a modulo-24 reduction; confirm
; against the backend lowering.
; The per-target check lines below are autogenerated (see the note on the first
; line of this file); regenerate them with the update script instead of editing
; them by hand.
1177define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
1178; SI-LABEL: v_fshr_v2i24:
1179; SI:       ; %bb.0:
1180; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1181; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1182; SI-NEXT:    v_mul_hi_u32 v6, v4, s4
1183; SI-NEXT:    v_mul_hi_u32 v7, v5, s4
1184; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1185; SI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1186; SI-NEXT:    v_mul_lo_u32 v6, v6, 24
1187; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
1188; SI-NEXT:    v_lshrrev_b32_e32 v6, 4, v7
1189; SI-NEXT:    v_mul_lo_u32 v6, v6, 24
1190; SI-NEXT:    v_add_i32_e32 v4, vcc, 8, v4
1191; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1192; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1193; SI-NEXT:    v_sub_i32_e32 v3, vcc, v5, v6
1194; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v3
1195; SI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1196; SI-NEXT:    s_setpc_b64 s[30:31]
1197;
1198; VI-LABEL: v_fshr_v2i24:
1199; VI:       ; %bb.0:
1200; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1201; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1202; VI-NEXT:    v_mul_hi_u32 v6, v4, s4
1203; VI-NEXT:    v_mul_hi_u32 v7, v5, s4
1204; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1205; VI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1206; VI-NEXT:    v_mul_lo_u32 v6, v6, 24
1207; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
1208; VI-NEXT:    v_lshrrev_b32_e32 v6, 4, v7
1209; VI-NEXT:    v_mul_lo_u32 v6, v6, 24
1210; VI-NEXT:    v_add_u32_e32 v4, vcc, 8, v4
1211; VI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1212; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1213; VI-NEXT:    v_sub_u32_e32 v3, vcc, v5, v6
1214; VI-NEXT:    v_add_u32_e32 v3, vcc, 8, v3
1215; VI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1216; VI-NEXT:    s_setpc_b64 s[30:31]
1217;
1218; GFX9-LABEL: v_fshr_v2i24:
1219; GFX9:       ; %bb.0:
1220; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1221; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1222; GFX9-NEXT:    v_mul_hi_u32 v6, v4, s4
1223; GFX9-NEXT:    v_mul_hi_u32 v7, v5, s4
1224; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1225; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1226; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
1227; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
1228; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v7
1229; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
1230; GFX9-NEXT:    v_add_u32_e32 v4, 8, v4
1231; GFX9-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1232; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1233; GFX9-NEXT:    v_sub_u32_e32 v3, v5, v6
1234; GFX9-NEXT:    v_add_u32_e32 v3, 8, v3
1235; GFX9-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1236; GFX9-NEXT:    s_setpc_b64 s[30:31]
1237;
1238; R600-LABEL: v_fshr_v2i24:
1239; R600:       ; %bb.0:
1240; R600-NEXT:    CF_END
1241; R600-NEXT:    PAD
1242;
1243; GFX10-LABEL: v_fshr_v2i24:
1244; GFX10:       ; %bb.0:
1245; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1246; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1247; GFX10-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1248; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1249; GFX10-NEXT:    v_mul_hi_u32 v6, v4, s4
1250; GFX10-NEXT:    v_mul_hi_u32 v7, v5, s4
1251; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
1252; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1253; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
1254; GFX10-NEXT:    v_mul_lo_u32 v6, v6, 24
1255; GFX10-NEXT:    v_mul_lo_u32 v7, v7, 24
1256; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
1257; GFX10-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
1258; GFX10-NEXT:    v_add_nc_u32_e32 v4, 8, v4
1259; GFX10-NEXT:    v_add_nc_u32_e32 v5, 8, v5
1260; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1261; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
1262; GFX10-NEXT:    s_setpc_b64 s[30:31]
1263  %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
1264  ret <2 x i24> %ret
1265}
1266