1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,VI
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,GFX9
5; RUN: llc < %s -march=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,R600
6
; Funnel-shift-right intrinsics exercised by the kernels in this file: the
; scalar i32 form plus the <2 x i32> and <4 x i32> vector forms.
7declare i32 @llvm.fshr.i32(i32, i32, i32) nounwind readnone
8declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
9declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
10
; Scalar case with a variable shift amount: calls @llvm.fshr.i32 on the three
; i32 kernel arguments and stores the result through %in. The checks below
; expect a single v_alignbit_b32 in the amdgcn runs and a single BIT_ALIGN_INT
; in the r600 run, with %z supplied directly as the shift operand.
11define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
12; SI-LABEL: fshr_i32:
13; SI:       ; %bb.0: ; %entry
14; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
15; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
16; SI-NEXT:    s_mov_b32 s7, 0xf000
17; SI-NEXT:    s_mov_b32 s6, -1
18; SI-NEXT:    s_waitcnt lgkmcnt(0)
19; SI-NEXT:    v_mov_b32_e32 v0, s1
20; SI-NEXT:    v_mov_b32_e32 v1, s2
21; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v1
22; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
23; SI-NEXT:    s_endpgm
24;
25; VI-LABEL: fshr_i32:
26; VI:       ; %bb.0: ; %entry
27; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
28; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
29; VI-NEXT:    s_waitcnt lgkmcnt(0)
30; VI-NEXT:    v_mov_b32_e32 v0, s1
31; VI-NEXT:    v_mov_b32_e32 v1, s2
32; VI-NEXT:    v_alignbit_b32 v2, s0, v0, v1
33; VI-NEXT:    v_mov_b32_e32 v0, s4
34; VI-NEXT:    v_mov_b32_e32 v1, s5
35; VI-NEXT:    flat_store_dword v[0:1], v2
36; VI-NEXT:    s_endpgm
37;
38; GFX9-LABEL: fshr_i32:
39; GFX9:       ; %bb.0: ; %entry
40; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
41; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
42; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
43; GFX9-NEXT:    v_mov_b32_e32 v0, s1
44; GFX9-NEXT:    v_mov_b32_e32 v1, s2
45; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, v1
46; GFX9-NEXT:    v_mov_b32_e32 v0, s4
47; GFX9-NEXT:    v_mov_b32_e32 v1, s5
48; GFX9-NEXT:    global_store_dword v[0:1], v2, off
49; GFX9-NEXT:    s_endpgm
50;
51; R600-LABEL: fshr_i32:
52; R600:       ; %bb.0: ; %entry
53; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
54; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
55; R600-NEXT:    CF_END
56; R600-NEXT:    PAD
57; R600-NEXT:    ALU clause starting at 4:
58; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
59; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
60; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
61entry:
62  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
63  store i32 %0, i32 addrspace(1)* %in
64  ret void
65}
66
; Scalar case with a constant shift amount of 7. The checks below expect the
; constant to appear as the third operand of v_alignbit_b32 in the amdgcn
; runs, and as a literal operand of BIT_ALIGN_INT in the r600 run.
67define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
68; SI-LABEL: fshr_i32_imm:
69; SI:       ; %bb.0: ; %entry
70; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
71; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
72; SI-NEXT:    s_mov_b32 s7, 0xf000
73; SI-NEXT:    s_mov_b32 s6, -1
74; SI-NEXT:    s_waitcnt lgkmcnt(0)
75; SI-NEXT:    v_mov_b32_e32 v0, s1
76; SI-NEXT:    v_alignbit_b32 v0, s0, v0, 7
77; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
78; SI-NEXT:    s_endpgm
79;
80; VI-LABEL: fshr_i32_imm:
81; VI:       ; %bb.0: ; %entry
82; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
83; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
84; VI-NEXT:    s_waitcnt lgkmcnt(0)
85; VI-NEXT:    v_mov_b32_e32 v0, s1
86; VI-NEXT:    v_alignbit_b32 v2, s0, v0, 7
87; VI-NEXT:    v_mov_b32_e32 v0, s2
88; VI-NEXT:    v_mov_b32_e32 v1, s3
89; VI-NEXT:    flat_store_dword v[0:1], v2
90; VI-NEXT:    s_endpgm
91;
92; GFX9-LABEL: fshr_i32_imm:
93; GFX9:       ; %bb.0: ; %entry
94; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
95; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
96; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX9-NEXT:    v_mov_b32_e32 v0, s1
98; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, 7
99; GFX9-NEXT:    v_mov_b32_e32 v0, s2
100; GFX9-NEXT:    v_mov_b32_e32 v1, s3
101; GFX9-NEXT:    global_store_dword v[0:1], v2, off
102; GFX9-NEXT:    s_endpgm
103;
104; R600-LABEL: fshr_i32_imm:
105; R600:       ; %bb.0: ; %entry
106; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
107; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
108; R600-NEXT:    CF_END
109; R600-NEXT:    PAD
110; R600-NEXT:    ALU clause starting at 4:
111; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
112; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
113; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
114; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
115entry:
116  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
117  store i32 %0, i32 addrspace(1)* %in
118  ret void
119}
120
; Two-lane vector case with variable shift amounts. For each lane the checks
; below expect the shift amount to be masked with 31, fed to
; v_alignbit_b32 / BIT_ALIGN_INT, and then a compare-with-zero plus select
; that yields the matching %y lane when the masked amount is 0.
121define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
122; SI-LABEL: fshr_v2i32:
123; SI:       ; %bb.0: ; %entry
124; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
125; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
126; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
127; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
128; SI-NEXT:    s_mov_b32 s7, 0xf000
129; SI-NEXT:    s_mov_b32 s6, -1
130; SI-NEXT:    s_waitcnt lgkmcnt(0)
131; SI-NEXT:    v_mov_b32_e32 v0, s9
132; SI-NEXT:    s_and_b32 s1, s1, 31
133; SI-NEXT:    v_mov_b32_e32 v1, s1
134; SI-NEXT:    s_and_b32 s0, s0, 31
135; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
136; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
137; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
138; SI-NEXT:    v_mov_b32_e32 v0, s8
139; SI-NEXT:    v_mov_b32_e32 v2, s0
140; SI-NEXT:    v_alignbit_b32 v2, s2, v0, v2
141; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
142; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
143; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
144; SI-NEXT:    s_endpgm
145;
146; VI-LABEL: fshr_v2i32:
147; VI:       ; %bb.0: ; %entry
148; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
149; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
150; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
151; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
152; VI-NEXT:    s_waitcnt lgkmcnt(0)
153; VI-NEXT:    v_mov_b32_e32 v0, s7
154; VI-NEXT:    s_and_b32 s1, s1, 31
155; VI-NEXT:    v_mov_b32_e32 v1, s1
156; VI-NEXT:    s_and_b32 s0, s0, 31
157; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
158; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
159; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
160; VI-NEXT:    v_mov_b32_e32 v0, s6
161; VI-NEXT:    v_mov_b32_e32 v2, s0
162; VI-NEXT:    v_alignbit_b32 v2, s4, v0, v2
163; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
164; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
165; VI-NEXT:    v_mov_b32_e32 v2, s2
166; VI-NEXT:    v_mov_b32_e32 v3, s3
167; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
168; VI-NEXT:    s_endpgm
169;
170; GFX9-LABEL: fshr_v2i32:
171; GFX9:       ; %bb.0: ; %entry
172; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
173; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
174; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
175; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
176; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
177; GFX9-NEXT:    v_mov_b32_e32 v0, s7
178; GFX9-NEXT:    s_and_b32 s1, s1, 31
179; GFX9-NEXT:    v_mov_b32_e32 v1, s1
180; GFX9-NEXT:    s_and_b32 s0, s0, 31
181; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
182; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
183; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
184; GFX9-NEXT:    v_mov_b32_e32 v0, s6
185; GFX9-NEXT:    v_mov_b32_e32 v2, s0
186; GFX9-NEXT:    v_alignbit_b32 v2, s4, v0, v2
187; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
188; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
189; GFX9-NEXT:    v_mov_b32_e32 v2, s2
190; GFX9-NEXT:    v_mov_b32_e32 v3, s3
191; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
192; GFX9-NEXT:    s_endpgm
193;
194; R600-LABEL: fshr_v2i32:
195; R600:       ; %bb.0: ; %entry
196; R600-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
197; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
198; R600-NEXT:    CF_END
199; R600-NEXT:    PAD
200; R600-NEXT:    ALU clause starting at 4:
201; R600-NEXT:     AND_INT * T0.W, KC0[4].X, literal.x,
202; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
203; R600-NEXT:     BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PV.W,
204; R600-NEXT:     SETE_INT * T0.W, PV.W, 0.0,
205; R600-NEXT:     CNDE_INT T0.Y, PS, PV.W, KC0[3].Z,
206; R600-NEXT:     AND_INT * T0.W, KC0[3].W, literal.x,
207; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
208; R600-NEXT:     BIT_ALIGN_INT T1.W, KC0[2].W, KC0[3].Y, PV.W,
209; R600-NEXT:     SETE_INT * T0.W, PV.W, 0.0,
210; R600-NEXT:     CNDE_INT T0.X, PS, PV.W, KC0[3].Y,
211; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
212; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
213entry:
214  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
215  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
216  ret void
217}
218
; Two-lane vector case with constant shift amounts <7, 9>. The checks below
; expect one v_alignbit_b32 / BIT_ALIGN_INT per lane with the constant as an
; immediate operand, and none of the mask/compare/select instructions that
; the variable-amount vector test expects.
219define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
220; SI-LABEL: fshr_v2i32_imm:
221; SI:       ; %bb.0: ; %entry
222; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
223; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
224; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
225; SI-NEXT:    s_mov_b32 s7, 0xf000
226; SI-NEXT:    s_mov_b32 s6, -1
227; SI-NEXT:    s_waitcnt lgkmcnt(0)
228; SI-NEXT:    v_mov_b32_e32 v0, s1
229; SI-NEXT:    v_alignbit_b32 v1, s3, v0, 9
230; SI-NEXT:    v_mov_b32_e32 v0, s0
231; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 7
232; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
233; SI-NEXT:    s_endpgm
234;
235; VI-LABEL: fshr_v2i32_imm:
236; VI:       ; %bb.0: ; %entry
237; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
238; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
239; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
240; VI-NEXT:    s_waitcnt lgkmcnt(0)
241; VI-NEXT:    v_mov_b32_e32 v0, s1
242; VI-NEXT:    v_mov_b32_e32 v2, s0
243; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
244; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
245; VI-NEXT:    v_mov_b32_e32 v2, s2
246; VI-NEXT:    v_mov_b32_e32 v3, s3
247; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
248; VI-NEXT:    s_endpgm
249;
250; GFX9-LABEL: fshr_v2i32_imm:
251; GFX9:       ; %bb.0: ; %entry
252; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
253; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
254; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
255; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX9-NEXT:    v_mov_b32_e32 v0, s1
257; GFX9-NEXT:    v_mov_b32_e32 v2, s0
258; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
259; GFX9-NEXT:    v_alignbit_b32 v0, s4, v2, 7
260; GFX9-NEXT:    v_mov_b32_e32 v2, s2
261; GFX9-NEXT:    v_mov_b32_e32 v3, s3
262; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
263; GFX9-NEXT:    s_endpgm
264;
265; R600-LABEL: fshr_v2i32_imm:
266; R600:       ; %bb.0: ; %entry
267; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
268; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
269; R600-NEXT:    CF_END
270; R600-NEXT:    PAD
271; R600-NEXT:    ALU clause starting at 4:
272; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
273; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
274; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
275; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
276; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
277; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
278entry:
279  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
280  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
281  ret void
282}
283
; Four-lane vector case with variable shift amounts. The checks below expect
; the same per-lane sequence as the two-lane variable test, repeated four
; times: mask the amount with 31, v_alignbit_b32 / BIT_ALIGN_INT, then a
; compare-with-zero plus select that yields the %y lane for a zero amount.
284define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
285; SI-LABEL: fshr_v4i32:
286; SI:       ; %bb.0: ; %entry
287; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
288; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
289; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x11
290; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x15
291; SI-NEXT:    s_mov_b32 s7, 0xf000
292; SI-NEXT:    s_mov_b32 s6, -1
293; SI-NEXT:    s_waitcnt lgkmcnt(0)
294; SI-NEXT:    v_mov_b32_e32 v0, s15
295; SI-NEXT:    s_and_b32 s3, s3, 31
296; SI-NEXT:    v_mov_b32_e32 v1, s3
297; SI-NEXT:    v_alignbit_b32 v1, s11, v0, v1
298; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
299; SI-NEXT:    s_and_b32 s2, s2, 31
300; SI-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
301; SI-NEXT:    v_mov_b32_e32 v0, s14
302; SI-NEXT:    v_mov_b32_e32 v1, s2
303; SI-NEXT:    v_alignbit_b32 v1, s10, v0, v1
304; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
305; SI-NEXT:    s_and_b32 s1, s1, 31
306; SI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
307; SI-NEXT:    v_mov_b32_e32 v0, s13
308; SI-NEXT:    v_mov_b32_e32 v1, s1
309; SI-NEXT:    s_and_b32 s0, s0, 31
310; SI-NEXT:    v_alignbit_b32 v1, s9, v0, v1
311; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
312; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
313; SI-NEXT:    v_mov_b32_e32 v0, s12
314; SI-NEXT:    v_mov_b32_e32 v4, s0
315; SI-NEXT:    v_alignbit_b32 v4, s8, v0, v4
316; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
317; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
318; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
319; SI-NEXT:    s_endpgm
320;
321; VI-LABEL: fshr_v4i32:
322; VI:       ; %bb.0: ; %entry
323; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
324; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
325; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
326; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
327; VI-NEXT:    s_waitcnt lgkmcnt(0)
328; VI-NEXT:    v_mov_b32_e32 v0, s11
329; VI-NEXT:    s_and_b32 s3, s3, 31
330; VI-NEXT:    v_mov_b32_e32 v1, s3
331; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
332; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
333; VI-NEXT:    s_and_b32 s2, s2, 31
334; VI-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
335; VI-NEXT:    v_mov_b32_e32 v0, s10
336; VI-NEXT:    v_mov_b32_e32 v1, s2
337; VI-NEXT:    v_alignbit_b32 v1, s6, v0, v1
338; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
339; VI-NEXT:    s_and_b32 s1, s1, 31
340; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
341; VI-NEXT:    v_mov_b32_e32 v0, s9
342; VI-NEXT:    v_mov_b32_e32 v1, s1
343; VI-NEXT:    s_and_b32 s0, s0, 31
344; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
345; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
346; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
347; VI-NEXT:    v_mov_b32_e32 v0, s8
348; VI-NEXT:    v_mov_b32_e32 v4, s0
349; VI-NEXT:    v_alignbit_b32 v4, s4, v0, v4
350; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
351; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
352; VI-NEXT:    v_mov_b32_e32 v4, s12
353; VI-NEXT:    v_mov_b32_e32 v5, s13
354; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
355; VI-NEXT:    s_endpgm
356;
357; GFX9-LABEL: fshr_v4i32:
358; GFX9:       ; %bb.0: ; %entry
359; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
360; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
361; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
362; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
363; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
364; GFX9-NEXT:    v_mov_b32_e32 v0, s11
365; GFX9-NEXT:    s_and_b32 s3, s3, 31
366; GFX9-NEXT:    v_mov_b32_e32 v1, s3
367; GFX9-NEXT:    v_alignbit_b32 v1, s7, v0, v1
368; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
369; GFX9-NEXT:    s_and_b32 s2, s2, 31
370; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
371; GFX9-NEXT:    v_mov_b32_e32 v0, s10
372; GFX9-NEXT:    v_mov_b32_e32 v1, s2
373; GFX9-NEXT:    v_alignbit_b32 v1, s6, v0, v1
374; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
375; GFX9-NEXT:    s_and_b32 s1, s1, 31
376; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
377; GFX9-NEXT:    v_mov_b32_e32 v0, s9
378; GFX9-NEXT:    v_mov_b32_e32 v1, s1
379; GFX9-NEXT:    s_and_b32 s0, s0, 31
380; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
381; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
382; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
383; GFX9-NEXT:    v_mov_b32_e32 v0, s8
384; GFX9-NEXT:    v_mov_b32_e32 v4, s0
385; GFX9-NEXT:    v_alignbit_b32 v4, s4, v0, v4
386; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
387; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
388; GFX9-NEXT:    v_mov_b32_e32 v4, s12
389; GFX9-NEXT:    v_mov_b32_e32 v5, s13
390; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
391; GFX9-NEXT:    s_endpgm
392;
393; R600-LABEL: fshr_v4i32:
394; R600:       ; %bb.0: ; %entry
395; R600-NEXT:    ALU 20, @4, KC0[CB0:0-32], KC1[]
396; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
397; R600-NEXT:    CF_END
398; R600-NEXT:    PAD
399; R600-NEXT:    ALU clause starting at 4:
400; R600-NEXT:     AND_INT T0.W, KC0[5].Z, literal.x,
401; R600-NEXT:     AND_INT * T1.W, KC0[6].X, literal.x,
402; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
403; R600-NEXT:     SETE_INT T0.Z, PS, 0.0,
404; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[4].X, KC0[5].X, PS,
405; R600-NEXT:     AND_INT * T2.W, KC0[5].W, literal.x,
406; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
407; R600-NEXT:     SETE_INT T1.Z, PV.W, 0.0,
408; R600-NEXT:     BIT_ALIGN_INT * T2.W, KC0[3].W, KC0[4].W, PV.W,
409; R600-NEXT:     CNDE_INT * T1.W, T0.Z, T1.W, KC0[5].X,
410; R600-NEXT:     CNDE_INT T1.Z, T1.Z, T2.W, KC0[4].W,
411; R600-NEXT:     BIT_ALIGN_INT T2.W, KC0[3].Z, KC0[4].Z, T0.W,
412; R600-NEXT:     SETE_INT * T0.W, T0.W, 0.0,
413; R600-NEXT:     CNDE_INT T1.Y, PS, PV.W, KC0[4].Z,
414; R600-NEXT:     AND_INT * T0.W, KC0[5].Y, literal.x,
415; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
416; R600-NEXT:     BIT_ALIGN_INT T2.W, KC0[3].Y, KC0[4].Y, PV.W,
417; R600-NEXT:     SETE_INT * T0.W, PV.W, 0.0,
418; R600-NEXT:     CNDE_INT T1.X, PS, PV.W, KC0[4].Y,
419; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
420; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
421entry:
422  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
423  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
424  ret void
425}
426
; Four-lane vector case with constant shift amounts <1, 7, 9, 33>. The checks
; below expect one v_alignbit_b32 / BIT_ALIGN_INT per lane with an immediate
; shift operand and no mask/compare/select instructions.
427define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
428; SI-LABEL: fshr_v4i32_imm:
429; SI:       ; %bb.0: ; %entry
430; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
431; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
432; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
433; SI-NEXT:    s_mov_b32 s7, 0xf000
434; SI-NEXT:    s_mov_b32 s6, -1
435; SI-NEXT:    s_waitcnt lgkmcnt(0)
436; SI-NEXT:    v_mov_b32_e32 v0, s3
437; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
438; SI-NEXT:    v_mov_b32_e32 v0, s2
439; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 9
440; SI-NEXT:    v_mov_b32_e32 v0, s1
441; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
442; SI-NEXT:    v_mov_b32_e32 v0, s0
443; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
444; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
445; SI-NEXT:    s_endpgm
446;
447; VI-LABEL: fshr_v4i32_imm:
448; VI:       ; %bb.0: ; %entry
449; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
450; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
451; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
452; VI-NEXT:    s_waitcnt lgkmcnt(0)
453; VI-NEXT:    v_mov_b32_e32 v4, s8
454; VI-NEXT:    v_mov_b32_e32 v5, s9
455; VI-NEXT:    v_mov_b32_e32 v0, s3
456; VI-NEXT:    v_mov_b32_e32 v1, s2
457; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
458; VI-NEXT:    v_mov_b32_e32 v0, s1
459; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
460; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 7
461; VI-NEXT:    v_mov_b32_e32 v0, s0
462; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
463; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
464; VI-NEXT:    s_endpgm
465;
466; GFX9-LABEL: fshr_v4i32_imm:
467; GFX9:       ; %bb.0: ; %entry
468; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
469; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
470; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
471; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
472; GFX9-NEXT:    v_mov_b32_e32 v4, s8
473; GFX9-NEXT:    v_mov_b32_e32 v5, s9
474; GFX9-NEXT:    v_mov_b32_e32 v0, s3
475; GFX9-NEXT:    v_mov_b32_e32 v1, s2
476; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 1
477; GFX9-NEXT:    v_mov_b32_e32 v0, s1
478; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 9
479; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 7
480; GFX9-NEXT:    v_mov_b32_e32 v0, s0
481; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
482; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
483; GFX9-NEXT:    s_endpgm
484;
485; R600-LABEL: fshr_v4i32_imm:
486; R600:       ; %bb.0: ; %entry
487; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
488; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
489; R600-NEXT:    CF_END
490; R600-NEXT:    PAD
491; R600-NEXT:    ALU clause starting at 4:
492; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
493; R600-NEXT:    BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
494; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
495; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
496; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
497; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
498; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
499; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
500entry:
; The last lane's amount, 33, is larger than the 32-bit element width; the
; checks above expect that lane (v3 / T0.W) to use a shift of 1 (33 mod 32).
501  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
502  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
503  ret void
504}
505