1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s --check-prefix=VI
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
5; RUN: llc < %s -march=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s --check-prefix=R600
6; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
7
8declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
9declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
10declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
11
; Scalar funnel-shift-left with a variable shift amount: all three operands of
; @llvm.fshl.i32 (%x, %y, %z) are kernel arguments and the result is stored
; through %in.
; The SI, VI, GFX9 and GFX10 expectations below contain two v_alignbit_b32
; instructions (the first with a constant shift of 1) plus an s_lshr_b32 by 1
; and an s_not_b32 of the shift amount; the R600 expectations contain the
; matching LSHR / BIT_ALIGN_INT / NOT_INT sequence.
; The expectation lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script instead of editing them by hand.
12define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
13; SI-LABEL: fshl_i32:
14; SI:       ; %bb.0: ; %entry
15; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
16; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
17; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
18; SI-NEXT:    s_mov_b32 s7, 0xf000
19; SI-NEXT:    s_mov_b32 s6, -1
20; SI-NEXT:    s_waitcnt lgkmcnt(0)
21; SI-NEXT:    v_mov_b32_e32 v0, s3
22; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
23; SI-NEXT:    s_not_b32 s0, s0
24; SI-NEXT:    s_lshr_b32 s1, s2, 1
25; SI-NEXT:    v_mov_b32_e32 v1, s0
26; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v1
27; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
28; SI-NEXT:    s_endpgm
29;
30; VI-LABEL: fshl_i32:
31; VI:       ; %bb.0: ; %entry
32; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
33; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
34; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
35; VI-NEXT:    s_waitcnt lgkmcnt(0)
36; VI-NEXT:    v_mov_b32_e32 v0, s3
37; VI-NEXT:    s_not_b32 s4, s4
38; VI-NEXT:    s_lshr_b32 s3, s2, 1
39; VI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
40; VI-NEXT:    v_mov_b32_e32 v1, s4
41; VI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
42; VI-NEXT:    v_mov_b32_e32 v0, s0
43; VI-NEXT:    v_mov_b32_e32 v1, s1
44; VI-NEXT:    flat_store_dword v[0:1], v2
45; VI-NEXT:    s_endpgm
46;
47; GFX9-LABEL: fshl_i32:
48; GFX9:       ; %bb.0: ; %entry
49; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
50; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
51; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x34
52; GFX9-NEXT:    v_mov_b32_e32 v0, 0
53; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
54; GFX9-NEXT:    v_mov_b32_e32 v1, s3
55; GFX9-NEXT:    s_lshr_b32 s0, s2, 1
56; GFX9-NEXT:    s_not_b32 s1, s6
57; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 1
58; GFX9-NEXT:    v_mov_b32_e32 v2, s1
59; GFX9-NEXT:    v_alignbit_b32 v1, s0, v1, v2
60; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
61; GFX9-NEXT:    s_endpgm
62;
63; R600-LABEL: fshl_i32:
64; R600:       ; %bb.0: ; %entry
65; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
66; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
67; R600-NEXT:    CF_END
68; R600-NEXT:    PAD
69; R600-NEXT:    ALU clause starting at 4:
70; R600-NEXT:     LSHR T0.Z, KC0[2].Z, 1,
71; R600-NEXT:     BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, 1,
72; R600-NEXT:     NOT_INT * T1.W, KC0[3].X,
73; R600-NEXT:     BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS,
74; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
75; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
76;
77; GFX10-LABEL: fshl_i32:
78; GFX10:       ; %bb.0: ; %entry
79; GFX10-NEXT:    s_clause 0x2
80; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
81; GFX10-NEXT:    s_load_dword s6, s[0:1], 0x34
82; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
83; GFX10-NEXT:    v_mov_b32_e32 v1, 0
84; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
85; GFX10-NEXT:    v_alignbit_b32 v0, s2, s3, 1
86; GFX10-NEXT:    s_lshr_b32 s0, s2, 1
87; GFX10-NEXT:    s_not_b32 s1, s6
88; GFX10-NEXT:    v_alignbit_b32 v0, s0, v0, s1
89; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
90; GFX10-NEXT:    s_endpgm
; The block label below is echoed as "%entry" in every "%bb.0" expectation
; line above, so renaming it would require regenerating the expectations.
91entry:
92  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
93  store i32 %0, i32 addrspace(1)* %in
94  ret void
95}
96
; Scalar funnel-shift-left with a constant shift amount of 7: %x and %y are
; kernel arguments and the result is stored through %in.
; Every target's expectations contain a single align-bit instruction
; (v_alignbit_b32 or BIT_ALIGN_INT) whose shift operand is the constant 25,
; i.e. 32 - 7, with none of the lshr / not instructions seen in the
; variable-amount test.
; The expectation lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script instead of editing them by hand.
97define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
98; SI-LABEL: fshl_i32_imm:
99; SI:       ; %bb.0: ; %entry
100; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
101; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
102; SI-NEXT:    s_mov_b32 s3, 0xf000
103; SI-NEXT:    s_mov_b32 s2, -1
104; SI-NEXT:    s_waitcnt lgkmcnt(0)
105; SI-NEXT:    v_mov_b32_e32 v0, s5
106; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 25
107; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
108; SI-NEXT:    s_endpgm
109;
110; VI-LABEL: fshl_i32_imm:
111; VI:       ; %bb.0: ; %entry
112; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
113; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
114; VI-NEXT:    s_waitcnt lgkmcnt(0)
115; VI-NEXT:    v_mov_b32_e32 v0, s3
116; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 25
117; VI-NEXT:    v_mov_b32_e32 v0, s0
118; VI-NEXT:    v_mov_b32_e32 v1, s1
119; VI-NEXT:    flat_store_dword v[0:1], v2
120; VI-NEXT:    s_endpgm
121;
122; GFX9-LABEL: fshl_i32_imm:
123; GFX9:       ; %bb.0: ; %entry
124; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
125; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
126; GFX9-NEXT:    v_mov_b32_e32 v0, 0
127; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX9-NEXT:    v_mov_b32_e32 v1, s3
129; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 25
130; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
131; GFX9-NEXT:    s_endpgm
132;
133; R600-LABEL: fshl_i32_imm:
134; R600:       ; %bb.0: ; %entry
135; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
136; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
137; R600-NEXT:    CF_END
138; R600-NEXT:    PAD
139; R600-NEXT:    ALU clause starting at 4:
140; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
141; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
142; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
143; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
144;
145; GFX10-LABEL: fshl_i32_imm:
146; GFX10:       ; %bb.0: ; %entry
147; GFX10-NEXT:    s_clause 0x1
148; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
149; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
150; GFX10-NEXT:    v_mov_b32_e32 v0, 0
151; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
152; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 25
153; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
154; GFX10-NEXT:    s_endpgm
; The block label below is echoed as "%entry" in every "%bb.0" expectation
; line above, so renaming it would require regenerating the expectations.
155entry:
156  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
157  store i32 %0, i32 addrspace(1)* %in
158  ret void
159}
160
; Two-element vector funnel-shift-left with a variable per-lane shift amount:
; %x, %y and %z are <2 x i32> kernel arguments passed to @llvm.fshl.v2i32 and
; the result vector is stored through %in.
; The expectations below repeat the scalar pattern once per lane: for each
; element there is an align-bit by 1, a logical shift right by 1, a bitwise
; NOT of that lane's shift amount, and a second align-bit using the inverted
; amount. The two lanes are written out with a single two-dword store.
; The expectation lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script instead of editing them by hand.
161define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
162; SI-LABEL: fshl_v2i32:
163; SI:       ; %bb.0: ; %entry
164; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
165; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
166; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
167; SI-NEXT:    s_mov_b32 s11, 0xf000
168; SI-NEXT:    s_mov_b32 s10, -1
169; SI-NEXT:    s_waitcnt lgkmcnt(0)
170; SI-NEXT:    v_mov_b32_e32 v0, s7
171; SI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
172; SI-NEXT:    s_not_b32 s1, s1
173; SI-NEXT:    s_lshr_b32 s2, s5, 1
174; SI-NEXT:    v_mov_b32_e32 v1, s1
175; SI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
176; SI-NEXT:    v_mov_b32_e32 v0, s6
177; SI-NEXT:    s_not_b32 s0, s0
178; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
179; SI-NEXT:    s_lshr_b32 s1, s4, 1
180; SI-NEXT:    v_mov_b32_e32 v2, s0
181; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v2
182; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
183; SI-NEXT:    s_endpgm
184;
185; VI-LABEL: fshl_v2i32:
186; VI:       ; %bb.0: ; %entry
187; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
188; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
189; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
190; VI-NEXT:    s_waitcnt lgkmcnt(0)
191; VI-NEXT:    v_mov_b32_e32 v0, s7
192; VI-NEXT:    s_not_b32 s3, s3
193; VI-NEXT:    s_lshr_b32 s7, s5, 1
194; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
195; VI-NEXT:    v_mov_b32_e32 v1, s3
196; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
197; VI-NEXT:    v_mov_b32_e32 v0, s6
198; VI-NEXT:    s_not_b32 s2, s2
199; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
200; VI-NEXT:    s_lshr_b32 s3, s4, 1
201; VI-NEXT:    v_mov_b32_e32 v2, s2
202; VI-NEXT:    v_alignbit_b32 v0, s3, v0, v2
203; VI-NEXT:    v_mov_b32_e32 v3, s1
204; VI-NEXT:    v_mov_b32_e32 v2, s0
205; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
206; VI-NEXT:    s_endpgm
207;
208; GFX9-LABEL: fshl_v2i32:
209; GFX9:       ; %bb.0: ; %entry
210; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
211; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
212; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x3c
213; GFX9-NEXT:    v_mov_b32_e32 v2, 0
214; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
215; GFX9-NEXT:    v_mov_b32_e32 v0, s7
216; GFX9-NEXT:    s_lshr_b32 s0, s5, 1
217; GFX9-NEXT:    s_not_b32 s1, s9
218; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
219; GFX9-NEXT:    v_mov_b32_e32 v1, s1
220; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
221; GFX9-NEXT:    v_mov_b32_e32 v0, s6
222; GFX9-NEXT:    s_not_b32 s1, s8
223; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
224; GFX9-NEXT:    s_lshr_b32 s0, s4, 1
225; GFX9-NEXT:    v_mov_b32_e32 v3, s1
226; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v3
227; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
228; GFX9-NEXT:    s_endpgm
229;
230; R600-LABEL: fshl_v2i32:
231; R600:       ; %bb.0: ; %entry
232; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
233; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
234; R600-NEXT:    CF_END
235; R600-NEXT:    PAD
236; R600-NEXT:    ALU clause starting at 4:
237; R600-NEXT:     LSHR T0.Z, KC0[3].X, 1,
238; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[3].X, KC0[3].Z, 1,
239; R600-NEXT:     NOT_INT * T1.W, KC0[4].X,
240; R600-NEXT:     BIT_ALIGN_INT T0.Y, T0.Z, T0.W, PV.W,
241; R600-NEXT:     LSHR T0.Z, KC0[2].W, 1,
242; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[2].W, KC0[3].Y, 1,
243; R600-NEXT:     NOT_INT * T1.W, KC0[3].W,
244; R600-NEXT:     BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W,
245; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
246; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
247;
248; GFX10-LABEL: fshl_v2i32:
249; GFX10:       ; %bb.0: ; %entry
250; GFX10-NEXT:    s_clause 0x2
251; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
252; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
253; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
254; GFX10-NEXT:    v_mov_b32_e32 v2, 0
255; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX10-NEXT:    v_alignbit_b32 v0, s5, s7, 1
257; GFX10-NEXT:    v_alignbit_b32 v3, s4, s6, 1
258; GFX10-NEXT:    s_lshr_b32 s0, s5, 1
259; GFX10-NEXT:    s_not_b32 s1, s3
260; GFX10-NEXT:    s_lshr_b32 s3, s4, 1
261; GFX10-NEXT:    s_not_b32 s2, s2
262; GFX10-NEXT:    v_alignbit_b32 v1, s0, v0, s1
263; GFX10-NEXT:    v_alignbit_b32 v0, s3, v3, s2
264; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
265; GFX10-NEXT:    s_endpgm
; The block label below is echoed as "%entry" in every "%bb.0" expectation
; line above, so renaming it would require regenerating the expectations.
266entry:
267  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
268  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
269  ret void
270}
271
; Two-element vector funnel-shift-left with constant per-lane shift amounts
; <7, 9>: %x and %y are <2 x i32> kernel arguments and the result vector is
; stored through %in.
; Every target's expectations contain one align-bit instruction per lane with
; a constant shift operand: 25 (32 - 7) for the lane written to v0 / T0.X and
; 23 (32 - 9) for the lane written to v1 / T0.Y.
; The expectation lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script instead of editing them by hand.
272define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
273; SI-LABEL: fshl_v2i32_imm:
274; SI:       ; %bb.0: ; %entry
275; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
276; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
277; SI-NEXT:    s_mov_b32 s3, 0xf000
278; SI-NEXT:    s_mov_b32 s2, -1
279; SI-NEXT:    s_waitcnt lgkmcnt(0)
280; SI-NEXT:    v_mov_b32_e32 v0, s7
281; SI-NEXT:    v_mov_b32_e32 v2, s6
282; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 23
283; SI-NEXT:    v_alignbit_b32 v0, s4, v2, 25
284; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
285; SI-NEXT:    s_endpgm
286;
287; VI-LABEL: fshl_v2i32_imm:
288; VI:       ; %bb.0: ; %entry
289; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
290; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
291; VI-NEXT:    s_waitcnt lgkmcnt(0)
292; VI-NEXT:    v_mov_b32_e32 v0, s7
293; VI-NEXT:    v_mov_b32_e32 v2, s6
294; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 23
295; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 25
296; VI-NEXT:    v_mov_b32_e32 v3, s1
297; VI-NEXT:    v_mov_b32_e32 v2, s0
298; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
299; VI-NEXT:    s_endpgm
300;
301; GFX9-LABEL: fshl_v2i32_imm:
302; GFX9:       ; %bb.0: ; %entry
303; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
304; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
305; GFX9-NEXT:    v_mov_b32_e32 v2, 0
306; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
307; GFX9-NEXT:    v_mov_b32_e32 v0, s7
308; GFX9-NEXT:    v_mov_b32_e32 v3, s6
309; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 23
310; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 25
311; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
312; GFX9-NEXT:    s_endpgm
313;
314; R600-LABEL: fshl_v2i32_imm:
315; R600:       ; %bb.0: ; %entry
316; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
317; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
318; R600-NEXT:    CF_END
319; R600-NEXT:    PAD
320; R600-NEXT:    ALU clause starting at 4:
321; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
322; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
323; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
324; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
325; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
326; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
327;
328; GFX10-LABEL: fshl_v2i32_imm:
329; GFX10:       ; %bb.0: ; %entry
330; GFX10-NEXT:    s_clause 0x1
331; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
332; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
333; GFX10-NEXT:    v_mov_b32_e32 v2, 0
334; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
335; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, 23
336; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, 25
337; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
338; GFX10-NEXT:    s_endpgm
; The block label below is echoed as "%entry" in every "%bb.0" expectation
; line above, so renaming it would require regenerating the expectations.
339entry:
340  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
341  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
342  ret void
343}
344
; Four-element vector funnel-shift-left with a variable per-lane shift amount:
; %x, %y and %z are <4 x i32> kernel arguments passed to @llvm.fshl.v4i32 and
; the result vector is stored through %in.
; The expectations below repeat the scalar pattern four times, once per lane:
; an align-bit by 1, a logical shift right by 1, a bitwise NOT of that lane's
; shift amount, and a second align-bit using the inverted amount. The four
; lanes are written out with a single four-dword store.
; The expectation lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script instead of editing them by hand.
345define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
346; SI-LABEL: fshl_v4i32:
347; SI:       ; %bb.0: ; %entry
348; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
349; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x15
350; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
351; SI-NEXT:    s_mov_b32 s3, 0xf000
352; SI-NEXT:    s_mov_b32 s2, -1
353; SI-NEXT:    s_waitcnt lgkmcnt(0)
354; SI-NEXT:    v_mov_b32_e32 v0, s11
355; SI-NEXT:    s_not_b32 s11, s15
356; SI-NEXT:    v_alignbit_b32 v0, s7, v0, 1
357; SI-NEXT:    s_lshr_b32 s7, s7, 1
358; SI-NEXT:    v_mov_b32_e32 v1, s11
359; SI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
360; SI-NEXT:    v_mov_b32_e32 v0, s10
361; SI-NEXT:    s_not_b32 s7, s14
362; SI-NEXT:    v_alignbit_b32 v0, s6, v0, 1
363; SI-NEXT:    s_lshr_b32 s6, s6, 1
364; SI-NEXT:    v_mov_b32_e32 v1, s7
365; SI-NEXT:    v_alignbit_b32 v2, s6, v0, v1
366; SI-NEXT:    v_mov_b32_e32 v0, s9
367; SI-NEXT:    s_not_b32 s6, s13
368; SI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
369; SI-NEXT:    s_lshr_b32 s5, s5, 1
370; SI-NEXT:    v_mov_b32_e32 v1, s6
371; SI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
372; SI-NEXT:    v_mov_b32_e32 v0, s8
373; SI-NEXT:    s_not_b32 s5, s12
374; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
375; SI-NEXT:    s_lshr_b32 s4, s4, 1
376; SI-NEXT:    v_mov_b32_e32 v4, s5
377; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
378; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
379; SI-NEXT:    s_endpgm
380;
381; VI-LABEL: fshl_v4i32:
382; VI:       ; %bb.0: ; %entry
383; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
384; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
385; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
386; VI-NEXT:    s_waitcnt lgkmcnt(0)
387; VI-NEXT:    v_mov_b32_e32 v0, s11
388; VI-NEXT:    s_not_b32 s3, s15
389; VI-NEXT:    s_lshr_b32 s2, s7, 1
390; VI-NEXT:    v_alignbit_b32 v0, s7, v0, 1
391; VI-NEXT:    v_mov_b32_e32 v1, s3
392; VI-NEXT:    v_alignbit_b32 v3, s2, v0, v1
393; VI-NEXT:    v_mov_b32_e32 v0, s10
394; VI-NEXT:    s_not_b32 s3, s14
395; VI-NEXT:    v_alignbit_b32 v0, s6, v0, 1
396; VI-NEXT:    s_lshr_b32 s2, s6, 1
397; VI-NEXT:    v_mov_b32_e32 v1, s3
398; VI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
399; VI-NEXT:    v_mov_b32_e32 v0, s9
400; VI-NEXT:    s_not_b32 s3, s13
401; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
402; VI-NEXT:    s_lshr_b32 s2, s5, 1
403; VI-NEXT:    v_mov_b32_e32 v1, s3
404; VI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
405; VI-NEXT:    v_mov_b32_e32 v0, s8
406; VI-NEXT:    s_not_b32 s3, s12
407; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
408; VI-NEXT:    s_lshr_b32 s2, s4, 1
409; VI-NEXT:    v_mov_b32_e32 v4, s3
410; VI-NEXT:    v_alignbit_b32 v0, s2, v0, v4
411; VI-NEXT:    v_mov_b32_e32 v5, s1
412; VI-NEXT:    v_mov_b32_e32 v4, s0
413; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
414; VI-NEXT:    s_endpgm
415;
416; GFX9-LABEL: fshl_v4i32:
417; GFX9:       ; %bb.0: ; %entry
418; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
419; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
420; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
421; GFX9-NEXT:    v_mov_b32_e32 v4, 0
422; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
423; GFX9-NEXT:    s_not_b32 s1, s15
424; GFX9-NEXT:    v_mov_b32_e32 v0, s11
425; GFX9-NEXT:    s_lshr_b32 s0, s7, 1
426; GFX9-NEXT:    v_alignbit_b32 v0, s7, v0, 1
427; GFX9-NEXT:    v_mov_b32_e32 v1, s1
428; GFX9-NEXT:    v_alignbit_b32 v3, s0, v0, v1
429; GFX9-NEXT:    v_mov_b32_e32 v0, s10
430; GFX9-NEXT:    s_not_b32 s1, s14
431; GFX9-NEXT:    v_alignbit_b32 v0, s6, v0, 1
432; GFX9-NEXT:    s_lshr_b32 s0, s6, 1
433; GFX9-NEXT:    v_mov_b32_e32 v1, s1
434; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, v1
435; GFX9-NEXT:    v_mov_b32_e32 v0, s9
436; GFX9-NEXT:    s_not_b32 s1, s13
437; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
438; GFX9-NEXT:    s_lshr_b32 s0, s5, 1
439; GFX9-NEXT:    v_mov_b32_e32 v1, s1
440; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
441; GFX9-NEXT:    v_mov_b32_e32 v0, s8
442; GFX9-NEXT:    s_not_b32 s1, s12
443; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
444; GFX9-NEXT:    s_lshr_b32 s0, s4, 1
445; GFX9-NEXT:    v_mov_b32_e32 v5, s1
446; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v5
447; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
448; GFX9-NEXT:    s_endpgm
449;
450; R600-LABEL: fshl_v4i32:
451; R600:       ; %bb.0: ; %entry
452; R600-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
453; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
454; R600-NEXT:    CF_END
455; R600-NEXT:    PAD
456; R600-NEXT:    ALU clause starting at 4:
457; R600-NEXT:     LSHR T0.Z, KC0[4].X, 1,
458; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
459; R600-NEXT:     NOT_INT * T1.W, KC0[6].X,
460; R600-NEXT:     LSHR T0.Y, KC0[3].W, 1,
461; R600-NEXT:     BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, 1,
462; R600-NEXT:     BIT_ALIGN_INT * T0.W, T0.Z, T0.W, PV.W,
463; R600-NEXT:     NOT_INT * T1.W, KC0[5].W,
464; R600-NEXT:     LSHR T1.Y, KC0[3].Z, 1,
465; R600-NEXT:     BIT_ALIGN_INT T0.Z, T0.Y, T1.Z, PV.W,
466; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[3].Z, KC0[4].Z, 1,
467; R600-NEXT:     NOT_INT * T2.W, KC0[5].Z,
468; R600-NEXT:     BIT_ALIGN_INT T0.Y, T1.Y, T1.W, PV.W,
469; R600-NEXT:     LSHR T1.Z, KC0[3].Y, 1,
470; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[3].Y, KC0[4].Y, 1,
471; R600-NEXT:     NOT_INT * T2.W, KC0[5].Y,
472; R600-NEXT:     BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W,
473; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
474; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
475;
476; GFX10-LABEL: fshl_v4i32:
477; GFX10:       ; %bb.0: ; %entry
478; GFX10-NEXT:    s_clause 0x1
479; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
480; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
481; GFX10-NEXT:    v_mov_b32_e32 v4, 0
482; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
483; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
484; GFX10-NEXT:    v_alignbit_b32 v0, s7, s11, 1
485; GFX10-NEXT:    v_alignbit_b32 v1, s6, s10, 1
486; GFX10-NEXT:    v_alignbit_b32 v5, s5, s9, 1
487; GFX10-NEXT:    v_alignbit_b32 v6, s4, s8, 1
488; GFX10-NEXT:    s_lshr_b32 s2, s7, 1
489; GFX10-NEXT:    s_not_b32 s3, s15
490; GFX10-NEXT:    s_lshr_b32 s6, s6, 1
491; GFX10-NEXT:    s_not_b32 s7, s14
492; GFX10-NEXT:    s_lshr_b32 s5, s5, 1
493; GFX10-NEXT:    s_not_b32 s9, s13
494; GFX10-NEXT:    s_lshr_b32 s4, s4, 1
495; GFX10-NEXT:    s_not_b32 s8, s12
496; GFX10-NEXT:    v_alignbit_b32 v3, s2, v0, s3
497; GFX10-NEXT:    v_alignbit_b32 v2, s6, v1, s7
498; GFX10-NEXT:    v_alignbit_b32 v1, s5, v5, s9
499; GFX10-NEXT:    v_alignbit_b32 v0, s4, v6, s8
500; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
501; GFX10-NEXT:    s_endpgm
; The block label below is echoed as "%entry" in every "%bb.0" expectation
; line above, so renaming it would require regenerating the expectations.
502entry:
503  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
504  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
505  ret void
506}
507
; Four-element vector funnel-shift-left with constant per-lane shift amounts
; <1, 7, 9, 33>: %x and %y are <4 x i32> kernel arguments and the result
; vector is stored through %in.
; Every target's expectations contain one align-bit instruction per lane with
; a constant shift operand: 31, 25, 23 and 31 for the lanes written to
; v0..v3 / T0.X..T0.W respectively. The last amount, 33, is larger than the
; 32-bit element width and yields the same constant (31) as the amount 1 in
; the first lane.
; The expectation lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script instead of editing them by hand.
508define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
509; SI-LABEL: fshl_v4i32_imm:
510; SI:       ; %bb.0: ; %entry
511; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
512; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
513; SI-NEXT:    s_mov_b32 s3, 0xf000
514; SI-NEXT:    s_mov_b32 s2, -1
515; SI-NEXT:    s_waitcnt lgkmcnt(0)
516; SI-NEXT:    v_mov_b32_e32 v0, s11
517; SI-NEXT:    v_mov_b32_e32 v1, s10
518; SI-NEXT:    v_alignbit_b32 v3, s7, v0, 31
519; SI-NEXT:    v_mov_b32_e32 v0, s9
520; SI-NEXT:    v_alignbit_b32 v2, s6, v1, 23
521; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 25
522; SI-NEXT:    v_mov_b32_e32 v0, s8
523; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 31
524; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
525; SI-NEXT:    s_endpgm
526;
527; VI-LABEL: fshl_v4i32_imm:
528; VI:       ; %bb.0: ; %entry
529; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
530; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
531; VI-NEXT:    s_waitcnt lgkmcnt(0)
532; VI-NEXT:    v_mov_b32_e32 v0, s11
533; VI-NEXT:    v_mov_b32_e32 v1, s10
534; VI-NEXT:    v_mov_b32_e32 v4, s9
535; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 31
536; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 23
537; VI-NEXT:    v_alignbit_b32 v1, s5, v4, 25
538; VI-NEXT:    v_mov_b32_e32 v0, s8
539; VI-NEXT:    v_mov_b32_e32 v5, s1
540; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 31
541; VI-NEXT:    v_mov_b32_e32 v4, s0
542; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
543; VI-NEXT:    s_endpgm
544;
545; GFX9-LABEL: fshl_v4i32_imm:
546; GFX9:       ; %bb.0: ; %entry
547; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
548; GFX9-NEXT:    v_mov_b32_e32 v4, 0
549; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
550; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
551; GFX9-NEXT:    v_mov_b32_e32 v0, s11
552; GFX9-NEXT:    v_mov_b32_e32 v1, s10
553; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 31
554; GFX9-NEXT:    v_mov_b32_e32 v0, s9
555; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 23
556; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 25
557; GFX9-NEXT:    v_mov_b32_e32 v0, s8
558; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 31
559; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
560; GFX9-NEXT:    s_endpgm
561;
562; R600-LABEL: fshl_v4i32_imm:
563; R600:       ; %bb.0: ; %entry
564; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
565; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
566; R600-NEXT:    CF_END
567; R600-NEXT:    PAD
568; R600-NEXT:    ALU clause starting at 4:
569; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x,
570; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
571; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
572; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
573; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
574; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
575; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x,
576; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
577; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
578; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
579;
580; GFX10-LABEL: fshl_v4i32_imm:
581; GFX10:       ; %bb.0: ; %entry
582; GFX10-NEXT:    s_clause 0x1
583; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
584; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
585; GFX10-NEXT:    v_mov_b32_e32 v4, 0
586; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
587; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, 31
588; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, 23
589; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, 25
590; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, 31
591; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
592; GFX10-NEXT:    s_endpgm
; The block label below is echoed as "%entry" in every "%bb.0" expectation
; line above, so renaming it would require regenerating the expectations.
593entry:
594  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
595  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
596  ret void
597}
598