1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s --check-prefix=VI
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
5; RUN: llc < %s -march=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s --check-prefix=R600
6; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
7
; Funnel-shift-left intrinsic declarations for the three widths exercised by
; the kernels in this file: scalar i32, <2 x i32> and <4 x i32>.
8declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
9declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
10declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
11
; Scalar i32 funnel shift left with a variable shift amount: the kernel passes
; %x, %y and %z directly to @llvm.fshl.i32 and stores the result through %in.
; On every target the expected code below is a pair of alignbit / BIT_ALIGN_INT
; operations: the first by the constant 1, the second by the bitwise complement
; of a loaded kernel argument (presumably %z -- confirm against the kernarg
; layout), with the high operand pre-shifted right by 1.
12define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
13; SI-LABEL: fshl_i32:
14; SI:       ; %bb.0: ; %entry
15; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
16; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
17; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
18; SI-NEXT:    s_mov_b32 s7, 0xf000
19; SI-NEXT:    s_mov_b32 s6, -1
20; SI-NEXT:    s_waitcnt lgkmcnt(0)
21; SI-NEXT:    v_mov_b32_e32 v0, s3
22; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
23; SI-NEXT:    s_not_b32 s0, s0
24; SI-NEXT:    s_lshr_b32 s1, s2, 1
25; SI-NEXT:    v_mov_b32_e32 v1, s0
26; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v1
27; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
28; SI-NEXT:    s_endpgm
29;
30; VI-LABEL: fshl_i32:
31; VI:       ; %bb.0: ; %entry
32; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
33; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
34; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
35; VI-NEXT:    s_waitcnt lgkmcnt(0)
36; VI-NEXT:    v_mov_b32_e32 v0, s3
37; VI-NEXT:    s_not_b32 s4, s4
38; VI-NEXT:    s_lshr_b32 s3, s2, 1
39; VI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
40; VI-NEXT:    v_mov_b32_e32 v1, s4
41; VI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
42; VI-NEXT:    v_mov_b32_e32 v0, s0
43; VI-NEXT:    v_mov_b32_e32 v1, s1
44; VI-NEXT:    flat_store_dword v[0:1], v2
45; VI-NEXT:    s_endpgm
46;
47; GFX9-LABEL: fshl_i32:
48; GFX9:       ; %bb.0: ; %entry
49; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
50; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
51; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x34
52; GFX9-NEXT:    v_mov_b32_e32 v0, 0
53; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
54; GFX9-NEXT:    v_mov_b32_e32 v1, s3
55; GFX9-NEXT:    s_lshr_b32 s0, s2, 1
56; GFX9-NEXT:    s_not_b32 s1, s6
57; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 1
58; GFX9-NEXT:    v_mov_b32_e32 v2, s1
59; GFX9-NEXT:    v_alignbit_b32 v1, s0, v1, v2
60; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
61; GFX9-NEXT:    s_endpgm
62;
63; R600-LABEL: fshl_i32:
64; R600:       ; %bb.0: ; %entry
65; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
66; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
67; R600-NEXT:    CF_END
68; R600-NEXT:    PAD
69; R600-NEXT:    ALU clause starting at 4:
70; R600-NEXT:     LSHR T0.Z, KC0[2].Z, 1,
71; R600-NEXT:     BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, 1,
72; R600-NEXT:     NOT_INT * T1.W, KC0[3].X,
73; R600-NEXT:     BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS,
74; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
75; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
76;
77; GFX10-LABEL: fshl_i32:
78; GFX10:       ; %bb.0: ; %entry
79; GFX10-NEXT:    s_clause 0x2
80; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
81; GFX10-NEXT:    s_load_dword s6, s[0:1], 0x34
82; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
83; GFX10-NEXT:    v_mov_b32_e32 v1, 0
84; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
85; GFX10-NEXT:    v_alignbit_b32 v0, s2, s3, 1
86; GFX10-NEXT:    s_lshr_b32 s0, s2, 1
87; GFX10-NEXT:    s_not_b32 s1, s6
88; GFX10-NEXT:    v_alignbit_b32 v0, s0, v0, s1
89; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
90; GFX10-NEXT:    s_endpgm
91entry:
92  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
93  store i32 %0, i32 addrspace(1)* %in
94  ret void
95}
96
; Scalar i32 funnel shift left by the constant 7; the result is stored
; through %in.
; With a constant amount, every target's expected code is a single
; alignbit / BIT_ALIGN_INT whose immediate is 25, i.e. 32 - 7.
97define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
98; SI-LABEL: fshl_i32_imm:
99; SI:       ; %bb.0: ; %entry
100; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
101; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
102; SI-NEXT:    s_mov_b32 s3, 0xf000
103; SI-NEXT:    s_mov_b32 s2, -1
104; SI-NEXT:    s_waitcnt lgkmcnt(0)
105; SI-NEXT:    v_mov_b32_e32 v0, s5
106; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 25
107; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
108; SI-NEXT:    s_endpgm
109;
110; VI-LABEL: fshl_i32_imm:
111; VI:       ; %bb.0: ; %entry
112; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
113; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
114; VI-NEXT:    s_waitcnt lgkmcnt(0)
115; VI-NEXT:    v_mov_b32_e32 v0, s3
116; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 25
117; VI-NEXT:    v_mov_b32_e32 v0, s0
118; VI-NEXT:    v_mov_b32_e32 v1, s1
119; VI-NEXT:    flat_store_dword v[0:1], v2
120; VI-NEXT:    s_endpgm
121;
122; GFX9-LABEL: fshl_i32_imm:
123; GFX9:       ; %bb.0: ; %entry
124; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
125; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
126; GFX9-NEXT:    v_mov_b32_e32 v0, 0
127; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX9-NEXT:    v_mov_b32_e32 v1, s3
129; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 25
130; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
131; GFX9-NEXT:    s_endpgm
132;
133; R600-LABEL: fshl_i32_imm:
134; R600:       ; %bb.0: ; %entry
135; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
136; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
137; R600-NEXT:    CF_END
138; R600-NEXT:    PAD
139; R600-NEXT:    ALU clause starting at 4:
140; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
141; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
142; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
143; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
144;
145; GFX10-LABEL: fshl_i32_imm:
146; GFX10:       ; %bb.0: ; %entry
147; GFX10-NEXT:    s_clause 0x1
148; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
149; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
150; GFX10-NEXT:    v_mov_b32_e32 v0, 0
151; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
152; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 25
153; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
154; GFX10-NEXT:    s_endpgm
155entry:
156  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
157  store i32 %0, i32 addrspace(1)* %in
158  ret void
159}
160
; <2 x i32> funnel shift left with a variable per-lane shift amount; the vector
; result is stored through %in.
; The expected code repeats the scalar variable-amount sequence once per lane
; (alignbit / BIT_ALIGN_INT by 1, complement of the amount, second alignbit)
; and finishes with a single two-dword store.
161define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
162; SI-LABEL: fshl_v2i32:
163; SI:       ; %bb.0: ; %entry
164; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
165; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
166; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
167; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
168; SI-NEXT:    s_mov_b32 s7, 0xf000
169; SI-NEXT:    s_waitcnt lgkmcnt(0)
170; SI-NEXT:    v_mov_b32_e32 v0, s3
171; SI-NEXT:    s_mov_b32 s6, -1
172; SI-NEXT:    v_alignbit_b32 v0, s9, v0, 1
173; SI-NEXT:    s_not_b32 s1, s1
174; SI-NEXT:    s_lshr_b32 s3, s9, 1
175; SI-NEXT:    v_mov_b32_e32 v1, s1
176; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
177; SI-NEXT:    v_mov_b32_e32 v0, s2
178; SI-NEXT:    s_not_b32 s0, s0
179; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
180; SI-NEXT:    s_lshr_b32 s1, s8, 1
181; SI-NEXT:    v_mov_b32_e32 v2, s0
182; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v2
183; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
184; SI-NEXT:    s_endpgm
185;
186; VI-LABEL: fshl_v2i32:
187; VI:       ; %bb.0: ; %entry
188; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
189; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
190; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x3c
191; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
192; VI-NEXT:    s_waitcnt lgkmcnt(0)
193; VI-NEXT:    v_mov_b32_e32 v0, s3
194; VI-NEXT:    s_lshr_b32 s3, s5, 1
195; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
196; VI-NEXT:    s_not_b32 s5, s7
197; VI-NEXT:    v_mov_b32_e32 v1, s5
198; VI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
199; VI-NEXT:    v_mov_b32_e32 v0, s2
200; VI-NEXT:    s_not_b32 s3, s6
201; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
202; VI-NEXT:    s_lshr_b32 s2, s4, 1
203; VI-NEXT:    v_mov_b32_e32 v2, s3
204; VI-NEXT:    v_alignbit_b32 v0, s2, v0, v2
205; VI-NEXT:    v_mov_b32_e32 v3, s1
206; VI-NEXT:    v_mov_b32_e32 v2, s0
207; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
208; VI-NEXT:    s_endpgm
209;
210; GFX9-LABEL: fshl_v2i32:
211; GFX9:       ; %bb.0: ; %entry
212; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
213; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
214; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
215; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x3c
216; GFX9-NEXT:    v_mov_b32_e32 v2, 0
217; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
218; GFX9-NEXT:    v_mov_b32_e32 v0, s3
219; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
220; GFX9-NEXT:    s_lshr_b32 s0, s5, 1
221; GFX9-NEXT:    s_not_b32 s1, s9
222; GFX9-NEXT:    v_mov_b32_e32 v1, s1
223; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
224; GFX9-NEXT:    v_mov_b32_e32 v0, s2
225; GFX9-NEXT:    s_not_b32 s1, s8
226; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
227; GFX9-NEXT:    s_lshr_b32 s0, s4, 1
228; GFX9-NEXT:    v_mov_b32_e32 v3, s1
229; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v3
230; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
231; GFX9-NEXT:    s_endpgm
232;
233; R600-LABEL: fshl_v2i32:
234; R600:       ; %bb.0: ; %entry
235; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
236; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
237; R600-NEXT:    CF_END
238; R600-NEXT:    PAD
239; R600-NEXT:    ALU clause starting at 4:
240; R600-NEXT:     LSHR T0.Z, KC0[3].X, 1,
241; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[3].X, KC0[3].Z, 1,
242; R600-NEXT:     NOT_INT * T1.W, KC0[4].X,
243; R600-NEXT:     BIT_ALIGN_INT T0.Y, T0.Z, T0.W, PV.W,
244; R600-NEXT:     LSHR T0.Z, KC0[2].W, 1,
245; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[2].W, KC0[3].Y, 1,
246; R600-NEXT:     NOT_INT * T1.W, KC0[3].W,
247; R600-NEXT:     BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W,
248; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
249; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
250;
251; GFX10-LABEL: fshl_v2i32:
252; GFX10:       ; %bb.0: ; %entry
253; GFX10-NEXT:    s_clause 0x3
254; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
255; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
256; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x3c
257; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
258; GFX10-NEXT:    v_mov_b32_e32 v2, 0
259; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
260; GFX10-NEXT:    s_lshr_b32 s0, s3, 1
261; GFX10-NEXT:    v_alignbit_b32 v0, s3, s5, 1
262; GFX10-NEXT:    v_alignbit_b32 v3, s2, s4, 1
263; GFX10-NEXT:    s_not_b32 s1, s7
264; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
265; GFX10-NEXT:    s_not_b32 s3, s6
266; GFX10-NEXT:    v_alignbit_b32 v1, s0, v0, s1
267; GFX10-NEXT:    v_alignbit_b32 v0, s2, v3, s3
268; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
269; GFX10-NEXT:    s_endpgm
270entry:
271  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
272  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
273  ret void
274}
275
; <2 x i32> funnel shift left by the constant vector <7, 9>; the result is
; stored through %in.
; Each lane is expected to become a single alignbit / BIT_ALIGN_INT with an
; immediate of 32 minus the lane's amount: 25 for lane 0 and 23 for lane 1.
276define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
277; SI-LABEL: fshl_v2i32_imm:
278; SI:       ; %bb.0: ; %entry
279; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
280; SI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xb
281; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
282; SI-NEXT:    s_mov_b32 s3, 0xf000
283; SI-NEXT:    s_mov_b32 s2, -1
284; SI-NEXT:    s_waitcnt lgkmcnt(0)
285; SI-NEXT:    v_mov_b32_e32 v0, s5
286; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 23
287; SI-NEXT:    v_mov_b32_e32 v0, s4
288; SI-NEXT:    v_alignbit_b32 v0, s6, v0, 25
289; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
290; SI-NEXT:    s_endpgm
291;
292; VI-LABEL: fshl_v2i32_imm:
293; VI:       ; %bb.0: ; %entry
294; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
295; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
296; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
297; VI-NEXT:    s_waitcnt lgkmcnt(0)
298; VI-NEXT:    v_mov_b32_e32 v0, s3
299; VI-NEXT:    v_mov_b32_e32 v2, s2
300; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 23
301; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 25
302; VI-NEXT:    v_mov_b32_e32 v3, s1
303; VI-NEXT:    v_mov_b32_e32 v2, s0
304; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
305; VI-NEXT:    s_endpgm
306;
307; GFX9-LABEL: fshl_v2i32_imm:
308; GFX9:       ; %bb.0: ; %entry
309; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
310; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
311; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
312; GFX9-NEXT:    v_mov_b32_e32 v2, 0
313; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX9-NEXT:    v_mov_b32_e32 v0, s3
315; GFX9-NEXT:    v_mov_b32_e32 v3, s2
316; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 23
317; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 25
318; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
319; GFX9-NEXT:    s_endpgm
320;
321; R600-LABEL: fshl_v2i32_imm:
322; R600:       ; %bb.0: ; %entry
323; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
324; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
325; R600-NEXT:    CF_END
326; R600-NEXT:    PAD
327; R600-NEXT:    ALU clause starting at 4:
328; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
329; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
330; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
331; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
332; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
333; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
334;
335; GFX10-LABEL: fshl_v2i32_imm:
336; GFX10:       ; %bb.0: ; %entry
337; GFX10-NEXT:    s_clause 0x2
338; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
339; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
340; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
341; GFX10-NEXT:    v_mov_b32_e32 v2, 0
342; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
343; GFX10-NEXT:    v_alignbit_b32 v1, s3, s5, 23
344; GFX10-NEXT:    v_alignbit_b32 v0, s2, s4, 25
345; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
346; GFX10-NEXT:    s_endpgm
347entry:
348  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
349  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
350  ret void
351}
352
; <4 x i32> funnel shift left with a variable per-lane shift amount; the vector
; result is stored through %in.
; The expected code applies the same two-step alignbit / BIT_ALIGN_INT sequence
; as the scalar variable-amount kernel to each of the four lanes and finishes
; with a single four-dword store.
353define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
354; SI-LABEL: fshl_v4i32:
355; SI:       ; %bb.0: ; %entry
356; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
357; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
358; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
359; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x15
360; SI-NEXT:    s_mov_b32 s11, 0xf000
361; SI-NEXT:    s_waitcnt lgkmcnt(0)
362; SI-NEXT:    v_mov_b32_e32 v0, s7
363; SI-NEXT:    s_mov_b32 s10, -1
364; SI-NEXT:    v_alignbit_b32 v0, s15, v0, 1
365; SI-NEXT:    s_not_b32 s3, s3
366; SI-NEXT:    s_lshr_b32 s7, s15, 1
367; SI-NEXT:    v_mov_b32_e32 v1, s3
368; SI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
369; SI-NEXT:    v_mov_b32_e32 v0, s6
370; SI-NEXT:    s_not_b32 s2, s2
371; SI-NEXT:    v_alignbit_b32 v0, s14, v0, 1
372; SI-NEXT:    s_lshr_b32 s3, s14, 1
373; SI-NEXT:    v_mov_b32_e32 v1, s2
374; SI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
375; SI-NEXT:    v_mov_b32_e32 v0, s5
376; SI-NEXT:    s_not_b32 s1, s1
377; SI-NEXT:    v_alignbit_b32 v0, s13, v0, 1
378; SI-NEXT:    s_lshr_b32 s2, s13, 1
379; SI-NEXT:    v_mov_b32_e32 v1, s1
380; SI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
381; SI-NEXT:    v_mov_b32_e32 v0, s4
382; SI-NEXT:    s_not_b32 s0, s0
383; SI-NEXT:    v_alignbit_b32 v0, s12, v0, 1
384; SI-NEXT:    s_lshr_b32 s1, s12, 1
385; SI-NEXT:    v_mov_b32_e32 v4, s0
386; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v4
387; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
388; SI-NEXT:    s_endpgm
389;
390; VI-LABEL: fshl_v4i32:
391; VI:       ; %bb.0: ; %entry
392; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
393; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
394; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
395; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
396; VI-NEXT:    s_waitcnt lgkmcnt(0)
397; VI-NEXT:    v_mov_b32_e32 v0, s7
398; VI-NEXT:    s_lshr_b32 s2, s11, 1
399; VI-NEXT:    s_not_b32 s3, s15
400; VI-NEXT:    v_alignbit_b32 v0, s11, v0, 1
401; VI-NEXT:    v_mov_b32_e32 v1, s3
402; VI-NEXT:    v_alignbit_b32 v3, s2, v0, v1
403; VI-NEXT:    v_mov_b32_e32 v0, s6
404; VI-NEXT:    s_not_b32 s3, s14
405; VI-NEXT:    v_alignbit_b32 v0, s10, v0, 1
406; VI-NEXT:    s_lshr_b32 s2, s10, 1
407; VI-NEXT:    v_mov_b32_e32 v1, s3
408; VI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
409; VI-NEXT:    v_mov_b32_e32 v0, s5
410; VI-NEXT:    s_not_b32 s3, s13
411; VI-NEXT:    v_alignbit_b32 v0, s9, v0, 1
412; VI-NEXT:    s_lshr_b32 s2, s9, 1
413; VI-NEXT:    v_mov_b32_e32 v1, s3
414; VI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
415; VI-NEXT:    v_mov_b32_e32 v0, s4
416; VI-NEXT:    s_not_b32 s3, s12
417; VI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
418; VI-NEXT:    s_lshr_b32 s2, s8, 1
419; VI-NEXT:    v_mov_b32_e32 v4, s3
420; VI-NEXT:    v_alignbit_b32 v0, s2, v0, v4
421; VI-NEXT:    v_mov_b32_e32 v5, s1
422; VI-NEXT:    v_mov_b32_e32 v4, s0
423; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
424; VI-NEXT:    s_endpgm
425;
426; GFX9-LABEL: fshl_v4i32:
427; GFX9:       ; %bb.0: ; %entry
428; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
429; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
430; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
431; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
432; GFX9-NEXT:    v_mov_b32_e32 v4, 0
433; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
434; GFX9-NEXT:    v_mov_b32_e32 v0, s7
435; GFX9-NEXT:    v_alignbit_b32 v0, s11, v0, 1
436; GFX9-NEXT:    s_lshr_b32 s0, s11, 1
437; GFX9-NEXT:    s_not_b32 s1, s15
438; GFX9-NEXT:    v_mov_b32_e32 v1, s1
439; GFX9-NEXT:    v_alignbit_b32 v3, s0, v0, v1
440; GFX9-NEXT:    v_mov_b32_e32 v0, s6
441; GFX9-NEXT:    s_not_b32 s1, s14
442; GFX9-NEXT:    v_alignbit_b32 v0, s10, v0, 1
443; GFX9-NEXT:    s_lshr_b32 s0, s10, 1
444; GFX9-NEXT:    v_mov_b32_e32 v1, s1
445; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, v1
446; GFX9-NEXT:    v_mov_b32_e32 v0, s5
447; GFX9-NEXT:    s_not_b32 s1, s13
448; GFX9-NEXT:    v_alignbit_b32 v0, s9, v0, 1
449; GFX9-NEXT:    s_lshr_b32 s0, s9, 1
450; GFX9-NEXT:    v_mov_b32_e32 v1, s1
451; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
452; GFX9-NEXT:    v_mov_b32_e32 v0, s4
453; GFX9-NEXT:    s_not_b32 s1, s12
454; GFX9-NEXT:    v_alignbit_b32 v0, s8, v0, 1
455; GFX9-NEXT:    s_lshr_b32 s0, s8, 1
456; GFX9-NEXT:    v_mov_b32_e32 v5, s1
457; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v5
458; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
459; GFX9-NEXT:    s_endpgm
460;
461; R600-LABEL: fshl_v4i32:
462; R600:       ; %bb.0: ; %entry
463; R600-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
464; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
465; R600-NEXT:    CF_END
466; R600-NEXT:    PAD
467; R600-NEXT:    ALU clause starting at 4:
468; R600-NEXT:     LSHR T0.Z, KC0[4].X, 1,
469; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
470; R600-NEXT:     NOT_INT * T1.W, KC0[6].X,
471; R600-NEXT:     LSHR T0.Y, KC0[3].W, 1,
472; R600-NEXT:     BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, 1,
473; R600-NEXT:     BIT_ALIGN_INT * T0.W, T0.Z, T0.W, PV.W,
474; R600-NEXT:     NOT_INT * T1.W, KC0[5].W,
475; R600-NEXT:     LSHR T1.Y, KC0[3].Z, 1,
476; R600-NEXT:     BIT_ALIGN_INT T0.Z, T0.Y, T1.Z, PV.W,
477; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[3].Z, KC0[4].Z, 1,
478; R600-NEXT:     NOT_INT * T2.W, KC0[5].Z,
479; R600-NEXT:     BIT_ALIGN_INT T0.Y, T1.Y, T1.W, PV.W,
480; R600-NEXT:     LSHR T1.Z, KC0[3].Y, 1,
481; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[3].Y, KC0[4].Y, 1,
482; R600-NEXT:     NOT_INT * T2.W, KC0[5].Y,
483; R600-NEXT:     BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W,
484; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
485; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
486;
487; GFX10-LABEL: fshl_v4i32:
488; GFX10:       ; %bb.0: ; %entry
489; GFX10-NEXT:    s_clause 0x3
490; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
491; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
492; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
493; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
494; GFX10-NEXT:    v_mov_b32_e32 v4, 0
495; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
496; GFX10-NEXT:    s_lshr_b32 s0, s7, 1
497; GFX10-NEXT:    v_alignbit_b32 v0, s7, s11, 1
498; GFX10-NEXT:    v_alignbit_b32 v1, s6, s10, 1
499; GFX10-NEXT:    v_alignbit_b32 v5, s5, s9, 1
500; GFX10-NEXT:    v_alignbit_b32 v6, s4, s8, 1
501; GFX10-NEXT:    s_not_b32 s1, s15
502; GFX10-NEXT:    s_lshr_b32 s6, s6, 1
503; GFX10-NEXT:    s_not_b32 s7, s14
504; GFX10-NEXT:    s_lshr_b32 s5, s5, 1
505; GFX10-NEXT:    s_not_b32 s9, s13
506; GFX10-NEXT:    s_lshr_b32 s4, s4, 1
507; GFX10-NEXT:    s_not_b32 s8, s12
508; GFX10-NEXT:    v_alignbit_b32 v3, s0, v0, s1
509; GFX10-NEXT:    v_alignbit_b32 v2, s6, v1, s7
510; GFX10-NEXT:    v_alignbit_b32 v1, s5, v5, s9
511; GFX10-NEXT:    v_alignbit_b32 v0, s4, v6, s8
512; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
513; GFX10-NEXT:    s_endpgm
514entry:
515  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
516  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
517  ret void
518}
519
; <4 x i32> funnel shift left by the constant vector <1, 7, 9, 33>; the result
; is stored through %in.
; Each lane is expected to become a single alignbit / BIT_ALIGN_INT with
; immediates 31, 25, 23 and 31 for lanes 0..3. The out-of-range amount 33 in
; lane 3 yields the same immediate as the amount 1 in lane 0, which is
; consistent with the shift amount being reduced modulo 32.
520define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
521; SI-LABEL: fshl_v4i32_imm:
522; SI:       ; %bb.0: ; %entry
523; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
524; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
525; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
526; SI-NEXT:    s_mov_b32 s3, 0xf000
527; SI-NEXT:    s_mov_b32 s2, -1
528; SI-NEXT:    s_waitcnt lgkmcnt(0)
529; SI-NEXT:    v_mov_b32_e32 v0, s7
530; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 31
531; SI-NEXT:    v_mov_b32_e32 v0, s6
532; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 23
533; SI-NEXT:    v_mov_b32_e32 v0, s5
534; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 25
535; SI-NEXT:    v_mov_b32_e32 v0, s4
536; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 31
537; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
538; SI-NEXT:    s_endpgm
539;
540; VI-LABEL: fshl_v4i32_imm:
541; VI:       ; %bb.0: ; %entry
542; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
543; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
544; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
545; VI-NEXT:    s_waitcnt lgkmcnt(0)
546; VI-NEXT:    v_mov_b32_e32 v0, s7
547; VI-NEXT:    v_mov_b32_e32 v1, s6
548; VI-NEXT:    v_alignbit_b32 v3, s11, v0, 31
549; VI-NEXT:    v_mov_b32_e32 v0, s5
550; VI-NEXT:    v_alignbit_b32 v2, s10, v1, 23
551; VI-NEXT:    v_alignbit_b32 v1, s9, v0, 25
552; VI-NEXT:    v_mov_b32_e32 v0, s4
553; VI-NEXT:    v_mov_b32_e32 v5, s1
554; VI-NEXT:    v_alignbit_b32 v0, s8, v0, 31
555; VI-NEXT:    v_mov_b32_e32 v4, s0
556; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
557; VI-NEXT:    s_endpgm
558;
559; GFX9-LABEL: fshl_v4i32_imm:
560; GFX9:       ; %bb.0: ; %entry
561; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
562; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
563; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
564; GFX9-NEXT:    v_mov_b32_e32 v4, 0
565; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
566; GFX9-NEXT:    v_mov_b32_e32 v0, s7
567; GFX9-NEXT:    v_mov_b32_e32 v1, s6
568; GFX9-NEXT:    v_alignbit_b32 v3, s11, v0, 31
569; GFX9-NEXT:    v_mov_b32_e32 v0, s5
570; GFX9-NEXT:    v_alignbit_b32 v2, s10, v1, 23
571; GFX9-NEXT:    v_alignbit_b32 v1, s9, v0, 25
572; GFX9-NEXT:    v_mov_b32_e32 v0, s4
573; GFX9-NEXT:    v_alignbit_b32 v0, s8, v0, 31
574; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
575; GFX9-NEXT:    s_endpgm
576;
577; R600-LABEL: fshl_v4i32_imm:
578; R600:       ; %bb.0: ; %entry
579; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
580; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
581; R600-NEXT:    CF_END
582; R600-NEXT:    PAD
583; R600-NEXT:    ALU clause starting at 4:
584; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x,
585; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
586; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
587; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
588; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
589; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
590; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x,
591; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
592; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
593; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
594;
595; GFX10-LABEL: fshl_v4i32_imm:
596; GFX10:       ; %bb.0: ; %entry
597; GFX10-NEXT:    s_clause 0x2
598; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
599; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
600; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
601; GFX10-NEXT:    v_mov_b32_e32 v4, 0
602; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
603; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, 31
604; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, 23
605; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, 25
606; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, 31
607; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
608; GFX10-NEXT:    s_endpgm
609entry:
610  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
611  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
612  ret void
613}
614