1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=SI
3; RUN: llc < %s -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=VI
4; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefixes=EG
5
6declare i32 @llvm.amdgcn.workitem.id.x() #0
7
8declare i32 @llvm.amdgcn.workgroup.id.x() #0
9
; Element-wise left shift of a <2 x i32> vector.
; The shifted value is loaded from %in and the shift amounts from the
; <2 x i32> element that immediately follows it (%in + 1); the result is
; stored to %out. The assertions below expect one 32-bit shift per lane under
; each of the three check prefixes (SI, VI, EG).
10define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
11; SI-LABEL: shl_v2i32:
12; SI:       ; %bb.0:
13; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
14; SI-NEXT:    s_mov_b32 s7, 0xf000
15; SI-NEXT:    s_mov_b32 s6, -1
16; SI-NEXT:    s_mov_b32 s10, s6
17; SI-NEXT:    s_mov_b32 s11, s7
18; SI-NEXT:    s_waitcnt lgkmcnt(0)
19; SI-NEXT:    s_mov_b32 s8, s2
20; SI-NEXT:    s_mov_b32 s9, s3
21; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
22; SI-NEXT:    s_mov_b32 s4, s0
23; SI-NEXT:    s_mov_b32 s5, s1
24; SI-NEXT:    s_waitcnt vmcnt(0)
25; SI-NEXT:    v_lshl_b32_e32 v1, v1, v3
26; SI-NEXT:    v_lshl_b32_e32 v0, v0, v2
27; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
28; SI-NEXT:    s_endpgm
29;
30; VI-LABEL: shl_v2i32:
31; VI:       ; %bb.0:
32; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
33; VI-NEXT:    s_waitcnt lgkmcnt(0)
34; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
35; VI-NEXT:    s_mov_b32 s3, 0xf000
36; VI-NEXT:    s_mov_b32 s2, -1
37; VI-NEXT:    s_waitcnt lgkmcnt(0)
38; VI-NEXT:    s_lshl_b32 s5, s5, s7
39; VI-NEXT:    s_lshl_b32 s4, s4, s6
40; VI-NEXT:    v_mov_b32_e32 v0, s4
41; VI-NEXT:    v_mov_b32_e32 v1, s5
42; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
43; VI-NEXT:    s_endpgm
44;
45; EG-LABEL: shl_v2i32:
46; EG:       ; %bb.0:
47; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
48; EG-NEXT:    TEX 0 @6
49; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
50; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
51; EG-NEXT:    CF_END
52; EG-NEXT:    PAD
53; EG-NEXT:    Fetch clause starting at 6:
54; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
55; EG-NEXT:    ALU clause starting at 8:
56; EG-NEXT:     MOV * T0.X, KC0[2].Z,
57; EG-NEXT:    ALU clause starting at 9:
58; EG-NEXT:     LSHL * T0.Y, T0.Y, T0.W,
59; EG-NEXT:     LSHL T0.X, T0.X, T0.Z,
60; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
61; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr addresses the vector directly after the one at %in.
62  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
63  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
64  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
65  %result = shl <2 x i32> %a, %b
66  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
67  ret void
68}
69
; Element-wise left shift of a <4 x i32> vector.
; The shifted value is loaded from %in and the shift amounts from the
; <4 x i32> element that immediately follows it (%in + 1); the result is
; stored to %out. The assertions below expect four 32-bit shifts, one per
; lane, under each of the three check prefixes (SI, VI, EG).
70define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
71; SI-LABEL: shl_v4i32:
72; SI:       ; %bb.0:
73; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
74; SI-NEXT:    s_mov_b32 s7, 0xf000
75; SI-NEXT:    s_mov_b32 s6, -1
76; SI-NEXT:    s_mov_b32 s10, s6
77; SI-NEXT:    s_mov_b32 s11, s7
78; SI-NEXT:    s_waitcnt lgkmcnt(0)
79; SI-NEXT:    s_mov_b32 s8, s2
80; SI-NEXT:    s_mov_b32 s9, s3
81; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
82; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
83; SI-NEXT:    s_mov_b32 s4, s0
84; SI-NEXT:    s_mov_b32 s5, s1
85; SI-NEXT:    s_waitcnt vmcnt(0)
86; SI-NEXT:    v_lshl_b32_e32 v3, v3, v7
87; SI-NEXT:    v_lshl_b32_e32 v2, v2, v6
88; SI-NEXT:    v_lshl_b32_e32 v1, v1, v5
89; SI-NEXT:    v_lshl_b32_e32 v0, v0, v4
90; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
91; SI-NEXT:    s_endpgm
92;
93; VI-LABEL: shl_v4i32:
94; VI:       ; %bb.0:
95; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
96; VI-NEXT:    s_waitcnt lgkmcnt(0)
97; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
98; VI-NEXT:    s_mov_b32 s11, 0xf000
99; VI-NEXT:    s_mov_b32 s10, -1
100; VI-NEXT:    s_waitcnt lgkmcnt(0)
101; VI-NEXT:    s_lshl_b32 s3, s3, s7
102; VI-NEXT:    s_lshl_b32 s2, s2, s6
103; VI-NEXT:    s_lshl_b32 s1, s1, s5
104; VI-NEXT:    s_lshl_b32 s0, s0, s4
105; VI-NEXT:    v_mov_b32_e32 v0, s0
106; VI-NEXT:    v_mov_b32_e32 v1, s1
107; VI-NEXT:    v_mov_b32_e32 v2, s2
108; VI-NEXT:    v_mov_b32_e32 v3, s3
109; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
110; VI-NEXT:    s_endpgm
111;
112; EG-LABEL: shl_v4i32:
113; EG:       ; %bb.0:
114; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
115; EG-NEXT:    TEX 1 @6
116; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
117; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
118; EG-NEXT:    CF_END
119; EG-NEXT:    PAD
120; EG-NEXT:    Fetch clause starting at 6:
121; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
122; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
123; EG-NEXT:    ALU clause starting at 10:
124; EG-NEXT:     MOV * T0.X, KC0[2].Z,
125; EG-NEXT:    ALU clause starting at 11:
126; EG-NEXT:     LSHL * T0.W, T0.W, T1.W,
127; EG-NEXT:     LSHL * T0.Z, T0.Z, T1.Z,
128; EG-NEXT:     LSHL * T0.Y, T0.Y, T1.Y,
129; EG-NEXT:     LSHL T0.X, T0.X, T1.X,
130; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
131; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr addresses the vector directly after the one at %in.
132  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
133  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
134  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
135  %result = shl <4 x i32> %a, %b
136  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
137  ret void
138}
139
; Scalar i16 left shift with both operands loaded from memory.
; The shifted value is loaded from %in and the shift amount from the i16
; element that immediately follows it (%in + 1); the result is stored to %out.
; The GCN assertions below expect a single 32-bit shift instruction followed
; by a 16-bit store (buffer_store_short).
140define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
141; SI-LABEL: shl_i16:
142; SI:       ; %bb.0:
143; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
144; SI-NEXT:    s_mov_b32 s7, 0xf000
145; SI-NEXT:    s_mov_b32 s6, -1
146; SI-NEXT:    s_mov_b32 s10, s6
147; SI-NEXT:    s_mov_b32 s11, s7
148; SI-NEXT:    s_waitcnt lgkmcnt(0)
149; SI-NEXT:    s_mov_b32 s8, s2
150; SI-NEXT:    s_mov_b32 s9, s3
151; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
152; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
153; SI-NEXT:    s_mov_b32 s4, s0
154; SI-NEXT:    s_mov_b32 s5, s1
155; SI-NEXT:    s_waitcnt vmcnt(0)
156; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
157; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
158; SI-NEXT:    s_endpgm
159;
160; VI-LABEL: shl_i16:
161; VI:       ; %bb.0:
162; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
163; VI-NEXT:    s_mov_b32 s7, 0xf000
164; VI-NEXT:    s_mov_b32 s6, -1
165; VI-NEXT:    s_mov_b32 s10, s6
166; VI-NEXT:    s_mov_b32 s11, s7
167; VI-NEXT:    s_waitcnt lgkmcnt(0)
168; VI-NEXT:    s_mov_b32 s8, s2
169; VI-NEXT:    s_mov_b32 s9, s3
170; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
171; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
172; VI-NEXT:    s_mov_b32 s4, s0
173; VI-NEXT:    s_mov_b32 s5, s1
174; VI-NEXT:    s_waitcnt vmcnt(0)
175; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
176; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
177; VI-NEXT:    s_endpgm
178;
179; EG-LABEL: shl_i16:
180; EG:       ; %bb.0:
181; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
182; EG-NEXT:    TEX 1 @6
183; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
184; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
185; EG-NEXT:    CF_END
186; EG-NEXT:    PAD
187; EG-NEXT:    Fetch clause starting at 6:
188; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
189; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
190; EG-NEXT:    ALU clause starting at 10:
191; EG-NEXT:     MOV * T0.X, KC0[2].Z,
192; EG-NEXT:    ALU clause starting at 11:
193; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
194; EG-NEXT:     LSHL * T1.W, T0.X, T1.X,
195; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
196; EG-NEXT:     AND_INT T1.W, PS, literal.x,
197; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
198; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
199; EG-NEXT:     LSHL T0.X, PV.W, PS,
200; EG-NEXT:     LSHL * T0.W, literal.x, PS,
201; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
202; EG-NEXT:     MOV T0.Y, 0.0,
203; EG-NEXT:     MOV * T0.Z, 0.0,
204; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
205; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr addresses the i16 directly after the one at %in.
206  %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
207  %a = load i16, i16 addrspace(1)* %in
208  %b = load i16, i16 addrspace(1)* %b_ptr
209  %result = shl i16 %a, %b
210  store i16 %result, i16 addrspace(1)* %out
211  ret void
212}
213
; Scalar i16 left shift where the value comes from memory and the shift
; amount is a kernel argument.
; The shifted value is loaded from %in, the shift amount is the i16 argument
; %b, and the result is stored to %out. The GCN assertions below expect the
; argument to be loaded into an SGPR (s12) and used directly as the shift
; amount operand.
214define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
215; SI-LABEL: shl_i16_v_s:
216; SI:       ; %bb.0:
217; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
218; SI-NEXT:    s_load_dword s12, s[0:1], 0xd
219; SI-NEXT:    s_mov_b32 s3, 0xf000
220; SI-NEXT:    s_mov_b32 s2, -1
221; SI-NEXT:    s_mov_b32 s10, s2
222; SI-NEXT:    s_waitcnt lgkmcnt(0)
223; SI-NEXT:    s_mov_b32 s8, s6
224; SI-NEXT:    s_mov_b32 s9, s7
225; SI-NEXT:    s_mov_b32 s11, s3
226; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
227; SI-NEXT:    s_mov_b32 s0, s4
228; SI-NEXT:    s_mov_b32 s1, s5
229; SI-NEXT:    s_waitcnt vmcnt(0)
230; SI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
231; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
232; SI-NEXT:    s_endpgm
233;
234; VI-LABEL: shl_i16_v_s:
235; VI:       ; %bb.0:
236; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
237; VI-NEXT:    s_load_dword s12, s[0:1], 0x34
238; VI-NEXT:    s_mov_b32 s3, 0xf000
239; VI-NEXT:    s_mov_b32 s2, -1
240; VI-NEXT:    s_mov_b32 s10, s2
241; VI-NEXT:    s_waitcnt lgkmcnt(0)
242; VI-NEXT:    s_mov_b32 s8, s6
243; VI-NEXT:    s_mov_b32 s9, s7
244; VI-NEXT:    s_mov_b32 s11, s3
245; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
246; VI-NEXT:    s_mov_b32 s0, s4
247; VI-NEXT:    s_mov_b32 s1, s5
248; VI-NEXT:    s_waitcnt vmcnt(0)
249; VI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
250; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
251; VI-NEXT:    s_endpgm
252;
253; EG-LABEL: shl_i16_v_s:
254; EG:       ; %bb.0:
255; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
256; EG-NEXT:    TEX 1 @6
257; EG-NEXT:    ALU 12, @12, KC0[CB0:0-32], KC1[]
258; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
259; EG-NEXT:    CF_END
260; EG-NEXT:    PAD
261; EG-NEXT:    Fetch clause starting at 6:
262; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
263; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 44, #3
264; EG-NEXT:    ALU clause starting at 10:
265; EG-NEXT:     MOV T0.X, 0.0,
266; EG-NEXT:     MOV * T1.X, KC0[2].Z,
267; EG-NEXT:    ALU clause starting at 12:
268; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
269; EG-NEXT:     LSHL * T1.W, T1.X, T0.X,
270; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
271; EG-NEXT:     AND_INT T1.W, PS, literal.x,
272; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
273; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
274; EG-NEXT:     LSHL T0.X, PV.W, PS,
275; EG-NEXT:     LSHL * T0.W, literal.x, PS,
276; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
277; EG-NEXT:     MOV T0.Y, 0.0,
278; EG-NEXT:     MOV * T0.Z, 0.0,
279; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
280; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
281  %a = load i16, i16 addrspace(1)* %in
282  %result = shl i16 %a, %b
283  store i16 %result, i16 addrspace(1)* %out
284  ret void
285}
286
; Scalar i16 left shift where the shift amount is computed from a kernel
; argument.
; The shifted value is loaded from %in and the shift amount is the i16
; argument %b plus 3; the result is stored to %out. The GCN assertions below
; expect the addition to be done as a scalar add (s_add_i32) on the loaded
; argument before it is used as the shift amount.
287define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
288; SI-LABEL: shl_i16_v_compute_s:
289; SI:       ; %bb.0:
290; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
291; SI-NEXT:    s_load_dword s12, s[0:1], 0xd
292; SI-NEXT:    s_mov_b32 s3, 0xf000
293; SI-NEXT:    s_mov_b32 s2, -1
294; SI-NEXT:    s_mov_b32 s10, s2
295; SI-NEXT:    s_waitcnt lgkmcnt(0)
296; SI-NEXT:    s_mov_b32 s8, s6
297; SI-NEXT:    s_mov_b32 s9, s7
298; SI-NEXT:    s_mov_b32 s11, s3
299; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
300; SI-NEXT:    s_add_i32 s12, s12, 3
301; SI-NEXT:    s_mov_b32 s0, s4
302; SI-NEXT:    s_mov_b32 s1, s5
303; SI-NEXT:    s_waitcnt vmcnt(0)
304; SI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
305; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
306; SI-NEXT:    s_endpgm
307;
308; VI-LABEL: shl_i16_v_compute_s:
309; VI:       ; %bb.0:
310; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
311; VI-NEXT:    s_load_dword s12, s[0:1], 0x34
312; VI-NEXT:    s_mov_b32 s3, 0xf000
313; VI-NEXT:    s_mov_b32 s2, -1
314; VI-NEXT:    s_mov_b32 s10, s2
315; VI-NEXT:    s_waitcnt lgkmcnt(0)
316; VI-NEXT:    s_mov_b32 s8, s6
317; VI-NEXT:    s_mov_b32 s9, s7
318; VI-NEXT:    s_mov_b32 s11, s3
319; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
320; VI-NEXT:    s_add_i32 s12, s12, 3
321; VI-NEXT:    s_mov_b32 s0, s4
322; VI-NEXT:    s_mov_b32 s1, s5
323; VI-NEXT:    s_waitcnt vmcnt(0)
324; VI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
325; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
326; VI-NEXT:    s_endpgm
327;
328; EG-LABEL: shl_i16_v_compute_s:
329; EG:       ; %bb.0:
330; EG-NEXT:    ALU 0, @12, KC0[], KC1[]
331; EG-NEXT:    TEX 0 @8
332; EG-NEXT:    ALU 0, @13, KC0[CB0:0-32], KC1[]
333; EG-NEXT:    TEX 0 @10
334; EG-NEXT:    ALU 15, @14, KC0[CB0:0-32], KC1[]
335; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
336; EG-NEXT:    CF_END
337; EG-NEXT:    PAD
338; EG-NEXT:    Fetch clause starting at 8:
339; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 44, #3
340; EG-NEXT:    Fetch clause starting at 10:
341; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
342; EG-NEXT:    ALU clause starting at 12:
343; EG-NEXT:     MOV * T0.X, 0.0,
344; EG-NEXT:    ALU clause starting at 13:
345; EG-NEXT:     MOV * T1.X, KC0[2].Z,
346; EG-NEXT:    ALU clause starting at 14:
347; EG-NEXT:     ADD_INT * T0.W, T0.X, literal.x,
348; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
349; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
350; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
351; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
352; EG-NEXT:     LSHL * T0.W, T1.X, PV.W,
353; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
354; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
355; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
356; EG-NEXT:     LSHL T0.X, PV.W, PS,
357; EG-NEXT:     LSHL * T0.W, literal.x, PS,
358; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
359; EG-NEXT:     MOV T0.Y, 0.0,
360; EG-NEXT:     MOV * T0.Z, 0.0,
361; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
362; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
363  %a = load i16, i16 addrspace(1)* %in
  ; The shift amount is derived from the argument rather than used as-is.
364  %b.add = add i16 %b, 3
365  %result = shl i16 %a, %b.add
366  store i16 %result, i16 addrspace(1)* %out
367  ret void
368}
369
; Scalar i16 left shift where the shift amount is loaded per work-item and
; then adjusted.
; The shifted value is a volatile load from %in. The shift amount is a
; volatile load from the i16 slot one past %in indexed by the work-item id,
; plus 3. The result is stored to %out. The tonga assertions below expect
; 16-bit add and shift instructions (v_add_u16 / v_lshlrev_b16); the verde
; assertions expect 32-bit ones.
370define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
371; SI-LABEL: shl_i16_computed_amount:
372; SI:       ; %bb.0:
373; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
374; SI-NEXT:    s_mov_b32 s7, 0xf000
375; SI-NEXT:    s_mov_b32 s6, -1
376; SI-NEXT:    s_mov_b32 s10, s6
377; SI-NEXT:    s_mov_b32 s11, s7
378; SI-NEXT:    s_waitcnt lgkmcnt(0)
379; SI-NEXT:    s_mov_b32 s8, s2
380; SI-NEXT:    s_mov_b32 s9, s3
381; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
382; SI-NEXT:    v_mov_b32_e32 v1, 0
383; SI-NEXT:    s_mov_b32 s14, 0
384; SI-NEXT:    s_mov_b32 s15, s7
385; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
386; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
387; SI-NEXT:    s_waitcnt vmcnt(0)
388; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 glc
389; SI-NEXT:    s_waitcnt vmcnt(0)
390; SI-NEXT:    s_mov_b32 s4, s0
391; SI-NEXT:    s_mov_b32 s5, s1
392; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
393; SI-NEXT:    v_lshlrev_b32_e32 v0, v0, v2
394; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
395; SI-NEXT:    s_endpgm
396;
397; VI-LABEL: shl_i16_computed_amount:
398; VI:       ; %bb.0:
399; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
400; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
401; VI-NEXT:    s_mov_b32 s7, 0xf000
402; VI-NEXT:    s_mov_b32 s6, -1
403; VI-NEXT:    s_mov_b32 s10, s6
404; VI-NEXT:    s_waitcnt lgkmcnt(0)
405; VI-NEXT:    v_mov_b32_e32 v1, s3
406; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
407; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
408; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
409; VI-NEXT:    s_mov_b32 s8, s2
410; VI-NEXT:    s_mov_b32 s9, s3
411; VI-NEXT:    s_mov_b32 s11, s7
412; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
413; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
414; VI-NEXT:    s_waitcnt vmcnt(0)
415; VI-NEXT:    flat_load_ushort v0, v[0:1] glc
416; VI-NEXT:    s_waitcnt vmcnt(0)
417; VI-NEXT:    s_mov_b32 s4, s0
418; VI-NEXT:    s_mov_b32 s5, s1
419; VI-NEXT:    v_add_u16_e32 v0, 3, v0
420; VI-NEXT:    v_lshlrev_b16_e32 v0, v0, v2
421; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
422; VI-NEXT:    s_endpgm
423;
424; EG-LABEL: shl_i16_computed_amount:
425; EG:       ; %bb.0:
426; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
427; EG-NEXT:    TEX 0 @8
428; EG-NEXT:    ALU 1, @13, KC0[CB0:0-32], KC1[]
429; EG-NEXT:    TEX 0 @10
430; EG-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
431; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
432; EG-NEXT:    CF_END
433; EG-NEXT:    PAD
434; EG-NEXT:    Fetch clause starting at 8:
435; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
436; EG-NEXT:    Fetch clause starting at 10:
437; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 2, #1
438; EG-NEXT:    ALU clause starting at 12:
439; EG-NEXT:     MOV * T1.X, KC0[2].Z,
440; EG-NEXT:    ALU clause starting at 13:
441; EG-NEXT:     LSHL * T0.W, T0.X, 1,
442; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
443; EG-NEXT:    ALU clause starting at 15:
444; EG-NEXT:     ADD_INT * T0.W, T0.X, literal.x,
445; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
446; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
447; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
448; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
449; EG-NEXT:     LSHL * T0.W, T1.X, PV.W,
450; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
451; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
452; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
453; EG-NEXT:     LSHL T0.X, PV.W, PS,
454; EG-NEXT:     LSHL * T0.W, literal.x, PS,
455; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
456; EG-NEXT:     MOV T0.Y, 0.0,
457; EG-NEXT:     MOV * T0.Z, 0.0,
458; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
459; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
460  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
461  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid
  ; NOTE(review): %gep.out is computed but never used; the store below writes
  ; to %out. Likewise %a is loaded from %in rather than %gep. This may be
  ; intentional (uniform value, per-work-item amount) - confirm before
  ; changing, since the assertions above are generated from this exact IR.
462  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
463  %b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i16 1
464  %a = load volatile i16, i16 addrspace(1)* %in
465  %b = load volatile i16, i16 addrspace(1)* %b_ptr
466  %b.add = add i16 %b, 3
467  %result = shl i16 %a, %b.add
468  store i16 %result, i16 addrspace(1)* %out
469  ret void
470}
471
; Scalar i16 left shift of a kernel argument by a constant.
; The zero-extended i16 argument %a is shifted left by 12 and the result is
; stored to %out; no value is loaded from global memory. The GCN assertions
; below expect a single scalar shift (s_lshl_b32 by 12) followed by a 16-bit
; store.
472define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
473; SI-LABEL: shl_i16_i_s:
474; SI:       ; %bb.0:
475; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
476; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
477; SI-NEXT:    s_mov_b32 s3, 0xf000
478; SI-NEXT:    s_mov_b32 s2, -1
479; SI-NEXT:    s_waitcnt lgkmcnt(0)
480; SI-NEXT:    s_lshl_b32 s4, s4, 12
481; SI-NEXT:    v_mov_b32_e32 v0, s4
482; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
483; SI-NEXT:    s_endpgm
484;
485; VI-LABEL: shl_i16_i_s:
486; VI:       ; %bb.0:
487; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
488; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
489; VI-NEXT:    s_mov_b32 s3, 0xf000
490; VI-NEXT:    s_mov_b32 s2, -1
491; VI-NEXT:    s_waitcnt lgkmcnt(0)
492; VI-NEXT:    s_lshl_b32 s4, s4, 12
493; VI-NEXT:    v_mov_b32_e32 v0, s4
494; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
495; VI-NEXT:    s_endpgm
496;
497; EG-LABEL: shl_i16_i_s:
498; EG:       ; %bb.0:
499; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
500; EG-NEXT:    TEX 0 @6
501; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
502; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
503; EG-NEXT:    CF_END
504; EG-NEXT:    PAD
505; EG-NEXT:    Fetch clause starting at 6:
506; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
507; EG-NEXT:    ALU clause starting at 8:
508; EG-NEXT:     MOV * T0.X, 0.0,
509; EG-NEXT:    ALU clause starting at 9:
510; EG-NEXT:     BFE_INT T0.W, T0.X, 0.0, literal.x,
511; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
512; EG-NEXT:    16(2.242078e-44), 3(4.203895e-45)
513; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
514; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
515; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
516; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
517; EG-NEXT:    61440(8.609578e-41), 3(4.203895e-45)
518; EG-NEXT:     LSHL T0.X, PV.W, PS,
519; EG-NEXT:     LSHL * T0.W, literal.x, PS,
520; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
521; EG-NEXT:     MOV T0.Y, 0.0,
522; EG-NEXT:     MOV * T0.Z, 0.0,
523; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
524; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
525  %result = shl i16 %a, 12
526  store i16 %result, i16 addrspace(1)* %out
527  ret void
528}
529
; Element-wise left shift of a <2 x i16> vector.
; The shifted value is loaded from %in. The shift amounts are loaded from the
; <2 x i16> slot one past %in indexed by the work-item id. The result is
; stored to %out. The verde assertions below expect the two halves to be
; shifted with 32-bit instructions and repacked with and/shift/or; the tonga
; assertions expect 16-bit shifts (v_lshlrev_b16, one in SDWA form).
530define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
531; SI-LABEL: shl_v2i16:
532; SI:       ; %bb.0:
533; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
534; SI-NEXT:    s_mov_b32 s7, 0xf000
535; SI-NEXT:    s_mov_b32 s6, -1
536; SI-NEXT:    s_mov_b32 s10, s6
537; SI-NEXT:    s_mov_b32 s11, s7
538; SI-NEXT:    s_waitcnt lgkmcnt(0)
539; SI-NEXT:    s_mov_b32 s8, s2
540; SI-NEXT:    s_mov_b32 s9, s3
541; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
542; SI-NEXT:    v_mov_b32_e32 v1, 0
543; SI-NEXT:    s_mov_b32 s14, 0
544; SI-NEXT:    s_mov_b32 s15, s7
545; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
546; SI-NEXT:    buffer_load_dword v2, off, s[8:11], 0
547; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
548; SI-NEXT:    s_mov_b32 s4, s0
549; SI-NEXT:    s_mov_b32 s5, s1
550; SI-NEXT:    s_waitcnt vmcnt(1)
551; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
552; SI-NEXT:    s_waitcnt vmcnt(0)
553; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
554; SI-NEXT:    v_lshlrev_b32_e32 v0, v0, v2
555; SI-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
556; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
557; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
558; SI-NEXT:    v_or_b32_e32 v0, v0, v1
559; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
560; SI-NEXT:    s_endpgm
561;
562; VI-LABEL: shl_v2i16:
563; VI:       ; %bb.0:
564; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
565; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
566; VI-NEXT:    s_waitcnt lgkmcnt(0)
567; VI-NEXT:    v_mov_b32_e32 v1, s3
568; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
569; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
570; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
571; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
572; VI-NEXT:    flat_load_dword v0, v[0:1]
573; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
574; VI-NEXT:    s_mov_b32 s3, 0xf000
575; VI-NEXT:    s_mov_b32 s2, -1
576; VI-NEXT:    s_waitcnt lgkmcnt(0)
577; VI-NEXT:    s_lshr_b32 s5, s4, 16
578; VI-NEXT:    v_mov_b32_e32 v1, s5
579; VI-NEXT:    s_waitcnt vmcnt(0)
580; VI-NEXT:    v_lshlrev_b16_e64 v2, v0, s4
581; VI-NEXT:    v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
582; VI-NEXT:    v_or_b32_e32 v0, v2, v0
583; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
584; VI-NEXT:    s_endpgm
585;
586; EG-LABEL: shl_v2i16:
587; EG:       ; %bb.0:
588; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
589; EG-NEXT:    TEX 0 @8
590; EG-NEXT:    ALU 0, @15, KC0[CB0:0-32], KC1[]
591; EG-NEXT:    TEX 0 @10
592; EG-NEXT:    ALU 12, @16, KC0[CB0:0-32], KC1[]
593; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
594; EG-NEXT:    CF_END
595; EG-NEXT:    PAD
596; EG-NEXT:    Fetch clause starting at 8:
597; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
598; EG-NEXT:    Fetch clause starting at 10:
599; EG-NEXT:     VTX_READ_32 T7.X, T7.X, 0, #1
600; EG-NEXT:    ALU clause starting at 12:
601; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
602; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
603; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
604; EG-NEXT:    ALU clause starting at 15:
605; EG-NEXT:     MOV * T7.X, KC0[2].Z,
606; EG-NEXT:    ALU clause starting at 16:
607; EG-NEXT:     AND_INT T0.Y, T0.X, literal.x,
608; EG-NEXT:     AND_INT T0.Z, T7.X, literal.x, BS:VEC_120/SCL_212
609; EG-NEXT:     LSHR T0.W, T0.X, literal.y,
610; EG-NEXT:     LSHR * T1.W, T7.X, literal.y,
611; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
612; EG-NEXT:     LSHL T0.W, PS, PV.W,
613; EG-NEXT:     LSHL * T1.W, PV.Z, PV.Y,
614; EG-NEXT:     AND_INT T1.W, PS, literal.x,
615; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
616; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
617; EG-NEXT:     OR_INT T0.X, PV.W, PS,
618; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
619; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
620  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
621  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
  ; NOTE(review): %gep.out is computed but never used; the store below writes
  ; to %out, and %a is loaded from %in rather than %gep. Confirm whether this
  ; is intentional before changing - the assertions above are generated from
  ; this exact IR.
622  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
623  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
624  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
625  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
626  %result = shl <2 x i16> %a, %b
627  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
628  ret void
629}
630
; Element-wise left shift of a <4 x i16> vector, fully indexed by work-item.
; The shifted value is loaded from %in indexed by the work-item id, the shift
; amounts from the <4 x i16> slot immediately after it, and the result is
; stored to %out indexed by the same work-item id. The GCN assertions below
; expect both operands to be fetched with a single 128-bit load.
631define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
632; SI-LABEL: shl_v4i16:
633; SI:       ; %bb.0:
634; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
635; SI-NEXT:    s_mov_b32 s7, 0xf000
636; SI-NEXT:    s_mov_b32 s6, 0
637; SI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
638; SI-NEXT:    v_mov_b32_e32 v5, 0
639; SI-NEXT:    s_waitcnt lgkmcnt(0)
640; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
641; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
642; SI-NEXT:    s_mov_b32 s4, 0xffff
643; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
644; SI-NEXT:    s_waitcnt vmcnt(0)
645; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
646; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
647; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
648; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
649; SI-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
650; SI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
651; SI-NEXT:    v_lshlrev_b32_e32 v2, v9, v7
652; SI-NEXT:    v_lshlrev_b32_e32 v3, v8, v6
653; SI-NEXT:    v_and_b32_e32 v1, s4, v1
654; SI-NEXT:    v_and_b32_e32 v0, s4, v0
655; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
656; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
657; SI-NEXT:    v_or_b32_e32 v1, v1, v2
658; SI-NEXT:    v_or_b32_e32 v0, v0, v3
659; SI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
660; SI-NEXT:    s_endpgm
661;
662; VI-LABEL: shl_v4i16:
663; VI:       ; %bb.0:
664; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
665; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
666; VI-NEXT:    s_waitcnt lgkmcnt(0)
667; VI-NEXT:    v_mov_b32_e32 v1, s3
668; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
669; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
670; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
671; VI-NEXT:    v_mov_b32_e32 v5, s1
672; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
673; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
674; VI-NEXT:    s_waitcnt vmcnt(0)
675; VI-NEXT:    v_lshlrev_b16_e32 v6, v3, v1
676; VI-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
677; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v0
678; VI-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
679; VI-NEXT:    v_or_b32_e32 v1, v6, v1
680; VI-NEXT:    v_or_b32_e32 v0, v3, v0
681; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
682; VI-NEXT:    s_endpgm
683;
684; EG-LABEL: shl_v4i16:
685; EG:       ; %bb.0:
686; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
687; EG-NEXT:    TEX 0 @6
688; EG-NEXT:    ALU 53, @11, KC0[CB0:0-32], KC1[]
689; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1
690; EG-NEXT:    CF_END
691; EG-NEXT:    PAD
692; EG-NEXT:    Fetch clause starting at 6:
693; EG-NEXT:     VTX_READ_128 T10.XYZW, T0.X, 0, #1
694; EG-NEXT:    ALU clause starting at 8:
695; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
696; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
697; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
698; EG-NEXT:    ALU clause starting at 11:
699; EG-NEXT:     MOV T4.X, T10.X,
700; EG-NEXT:     MOV * T5.X, T10.Y,
701; EG-NEXT:     MOV T0.X, PV.X,
702; EG-NEXT:     MOV T0.Y, PS,
703; EG-NEXT:     MOV * T2.X, T10.Z,
704; EG-NEXT:     MOV T3.X, T10.W,
705; EG-NEXT:     MOV * T0.Z, T6.X,
706; EG-NEXT:     MOV * T1.Y, T2.X,
707; EG-NEXT:     AND_INT T1.W, PV.Y, literal.x,
708; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
709; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
710; EG-NEXT:     LSHL * T1.W, PS, PV.W,
711; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
712; EG-NEXT:     AND_INT * T2.W, T0.Z, literal.y,
713; EG-NEXT:    65535(9.183409e-41), -65536(nan)
714; EG-NEXT:     OR_INT * T1.W, PS, PV.W,
715; EG-NEXT:     MOV * T0.Z, T3.X,
716; EG-NEXT:     MOV * T6.X, T1.W,
717; EG-NEXT:     MOV T1.Z, PV.X,
718; EG-NEXT:     LSHR T1.W, T1.Y, literal.x,
719; EG-NEXT:     LSHR * T2.W, T0.X, literal.x,
720; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
721; EG-NEXT:     LSHL T1.W, PS, PV.W,
722; EG-NEXT:     AND_INT * T2.W, PV.Z, literal.x,
723; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
724; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
725; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
726; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
727; EG-NEXT:     MOV T6.X, PV.W,
728; EG-NEXT:     MOV * T0.X, T7.X,
729; EG-NEXT:     AND_INT T1.W, T0.Z, literal.x,
730; EG-NEXT:     AND_INT * T2.W, T0.Y, literal.x,
731; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
732; EG-NEXT:     LSHL T1.W, PS, PV.W,
733; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
734; EG-NEXT:    -65536(nan), 0(0.000000e+00)
735; EG-NEXT:     AND_INT * T1.W, PV.W, literal.x,
736; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
737; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
738; EG-NEXT:     MOV * T7.X, PV.W,
739; EG-NEXT:     MOV T0.X, PV.X,
740; EG-NEXT:     LSHR T1.W, T0.Z, literal.x,
741; EG-NEXT:     LSHR * T2.W, T0.Y, literal.x,
742; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
743; EG-NEXT:     LSHL * T1.W, PS, PV.W,
744; EG-NEXT:     AND_INT T0.Z, T0.X, literal.x,
745; EG-NEXT:     LSHL T1.W, PV.W, literal.y,
746; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
747; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
748; EG-NEXT:     LSHR T0.X, PS, literal.x,
749; EG-NEXT:     OR_INT * T10.Y, PV.Z, PV.W,
750; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
751; EG-NEXT:     MOV T7.X, PV.Y,
752; EG-NEXT:     MOV * T10.X, T6.X,
753  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
754  %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
755  %gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid
  ; %b_ptr addresses the vector directly after this work-item's %gep slot.
756  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1
757  %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep
758  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
759  %result = shl <4 x i16> %a, %b
760  store <4 x i16> %result, <4 x i16> addrspace(1)* %gep.out
761  ret void
762}
763
; Scalar i64 left shift with both operands loaded from memory.
; The shifted value is loaded from %in and the shift amount from the i64
; element that immediately follows it (%in + 1); the result is stored to %out.
; The GCN assertions below expect a single 64-bit shift instruction
; (v_lshl_b64 / s_lshl_b64) whose shift-amount operand is one 32-bit register;
; the redwood assertions expect an expanded sequence using BIT_ALIGN_INT and
; CNDE_INT.
764define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
765; SI-LABEL: shl_i64:
766; SI:       ; %bb.0:
767; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
768; SI-NEXT:    s_mov_b32 s7, 0xf000
769; SI-NEXT:    s_mov_b32 s6, -1
770; SI-NEXT:    s_mov_b32 s10, s6
771; SI-NEXT:    s_mov_b32 s11, s7
772; SI-NEXT:    s_waitcnt lgkmcnt(0)
773; SI-NEXT:    s_mov_b32 s8, s2
774; SI-NEXT:    s_mov_b32 s9, s3
775; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
776; SI-NEXT:    s_mov_b32 s4, s0
777; SI-NEXT:    s_mov_b32 s5, s1
778; SI-NEXT:    s_waitcnt vmcnt(0)
779; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
780; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
781; SI-NEXT:    s_endpgm
782;
783; VI-LABEL: shl_i64:
784; VI:       ; %bb.0:
785; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
786; VI-NEXT:    s_waitcnt lgkmcnt(0)
787; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
788; VI-NEXT:    s_mov_b32 s3, 0xf000
789; VI-NEXT:    s_mov_b32 s2, -1
790; VI-NEXT:    s_waitcnt lgkmcnt(0)
791; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
792; VI-NEXT:    v_mov_b32_e32 v0, s4
793; VI-NEXT:    v_mov_b32_e32 v1, s5
794; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
795; VI-NEXT:    s_endpgm
796;
797; EG-LABEL: shl_i64:
798; EG:       ; %bb.0:
799; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
800; EG-NEXT:    TEX 0 @6
801; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
802; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
803; EG-NEXT:    CF_END
804; EG-NEXT:    PAD
805; EG-NEXT:    Fetch clause starting at 6:
806; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
807; EG-NEXT:    ALU clause starting at 8:
808; EG-NEXT:     MOV * T0.X, KC0[2].Z,
809; EG-NEXT:    ALU clause starting at 9:
810; EG-NEXT:     AND_INT T1.Y, T0.Z, literal.x,
811; EG-NEXT:     LSHR T1.Z, T0.Y, 1,
812; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
813; EG-NEXT:     NOT_INT * T1.W, T0.Z,
814; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
815; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.Z, PV.W, PS,
816; EG-NEXT:     LSHL T0.W, T0.X, PV.Y,
817; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
818; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
819; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
820; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
821; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
822; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr addresses the i64 directly after the one at %in.
823  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
824  %a = load i64, i64 addrspace(1)* %in
825  %b = load i64, i64 addrspace(1)* %b_ptr
826  %result = shl i64 %a, %b
827  store i64 %result, i64 addrspace(1)* %out
828  ret void
829}
830
; Element-wise <2 x i64> left shift: %a is the vector at %in and %b (the
; per-element shift amounts) is the next <2 x i64> after it. The result of
; "shl <2 x i64> %a, %b" is stored to %out.
; In the verde and tonga output below the vector shift appears as two
; separate 64-bit shifts, each taking only the low dword of its shift amount.
; The check lines in this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
831define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
832; SI-LABEL: shl_v2i64:
833; SI:       ; %bb.0:
834; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
835; SI-NEXT:    s_mov_b32 s7, 0xf000
836; SI-NEXT:    s_mov_b32 s6, -1
837; SI-NEXT:    s_mov_b32 s10, s6
838; SI-NEXT:    s_mov_b32 s11, s7
839; SI-NEXT:    s_waitcnt lgkmcnt(0)
840; SI-NEXT:    s_mov_b32 s8, s2
841; SI-NEXT:    s_mov_b32 s9, s3
842; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
843; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
844; SI-NEXT:    s_mov_b32 s4, s0
845; SI-NEXT:    s_mov_b32 s5, s1
846; SI-NEXT:    s_waitcnt vmcnt(0)
847; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
848; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
849; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
850; SI-NEXT:    s_endpgm
851;
852; VI-LABEL: shl_v2i64:
853; VI:       ; %bb.0:
854; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
855; VI-NEXT:    s_waitcnt lgkmcnt(0)
856; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
857; VI-NEXT:    s_mov_b32 s11, 0xf000
858; VI-NEXT:    s_mov_b32 s10, -1
859; VI-NEXT:    s_waitcnt lgkmcnt(0)
860; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
861; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s4
862; VI-NEXT:    v_mov_b32_e32 v0, s0
863; VI-NEXT:    v_mov_b32_e32 v1, s1
864; VI-NEXT:    v_mov_b32_e32 v2, s2
865; VI-NEXT:    v_mov_b32_e32 v3, s3
866; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
867; VI-NEXT:    s_endpgm
868;
869; EG-LABEL: shl_v2i64:
870; EG:       ; %bb.0:
871; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
872; EG-NEXT:    TEX 1 @6
873; EG-NEXT:    ALU 22, @11, KC0[CB0:0-32], KC1[]
874; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
875; EG-NEXT:    CF_END
876; EG-NEXT:    PAD
877; EG-NEXT:    Fetch clause starting at 6:
878; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
879; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
880; EG-NEXT:    ALU clause starting at 10:
881; EG-NEXT:     MOV * T0.X, KC0[2].Z,
882; EG-NEXT:    ALU clause starting at 11:
883; EG-NEXT:     AND_INT T1.Y, T1.Z, literal.x,
884; EG-NEXT:     LSHR T2.Z, T0.W, 1,
885; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1,
886; EG-NEXT:     NOT_INT * T1.W, T1.Z,
887; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
888; EG-NEXT:     BIT_ALIGN_INT T0.W, PV.Z, PV.W, PS,
889; EG-NEXT:     LSHL * T1.W, T0.Z, PV.Y,
890; EG-NEXT:     AND_INT T2.X, T1.Z, literal.x,
891; EG-NEXT:     AND_INT T1.Y, T1.X, literal.y,
892; EG-NEXT:     LSHR T0.Z, T0.Y, 1,
893; EG-NEXT:     BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1,
894; EG-NEXT:     NOT_INT * T3.W, T1.X,
895; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
896; EG-NEXT:     BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS,
897; EG-NEXT:     LSHL T0.Z, T0.X, PV.Y,
898; EG-NEXT:     AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212
899; EG-NEXT:     CNDE_INT * T3.W, PV.X, T0.W, T1.W,
900; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
901; EG-NEXT:     CNDE_INT T3.Y, PV.W, PV.Y, PV.Z,
902; EG-NEXT:     CNDE_INT * T3.Z, T2.X, T1.W, 0.0,
903; EG-NEXT:     CNDE_INT T3.X, T2.W, T0.Z, 0.0,
904; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
905; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
906  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
907  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
908  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
909  %result = shl <2 x i64> %a, %b
910  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
911  ret void
912}
913
; Element-wise <4 x i64> left shift: %a is the vector at %in and %b (the
; per-element shift amounts) is the next <4 x i64> after it. The result of
; "shl <4 x i64> %a, %b" is stored to %out.
; In the verde and tonga output below the vector shift appears as four
; separate 64-bit shifts and the result is written with two dwordx4 stores.
; The check lines in this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
914define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
915; SI-LABEL: shl_v4i64:
916; SI:       ; %bb.0:
917; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
918; SI-NEXT:    s_mov_b32 s3, 0xf000
919; SI-NEXT:    s_mov_b32 s2, -1
920; SI-NEXT:    s_mov_b32 s10, s2
921; SI-NEXT:    s_mov_b32 s11, s3
922; SI-NEXT:    s_waitcnt lgkmcnt(0)
923; SI-NEXT:    s_mov_b32 s8, s6
924; SI-NEXT:    s_mov_b32 s9, s7
925; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
926; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
927; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
928; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
929; SI-NEXT:    s_mov_b32 s0, s4
930; SI-NEXT:    s_mov_b32 s1, s5
931; SI-NEXT:    s_waitcnt vmcnt(2)
932; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
933; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
934; SI-NEXT:    s_waitcnt vmcnt(0)
935; SI-NEXT:    v_lshl_b64 v[9:10], v[9:10], v13
936; SI-NEXT:    v_lshl_b64 v[7:8], v[7:8], v11
937; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
938; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
939; SI-NEXT:    s_endpgm
940;
941; VI-LABEL: shl_v4i64:
942; VI:       ; %bb.0:
943; VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
944; VI-NEXT:    s_waitcnt lgkmcnt(0)
945; VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
946; VI-NEXT:    s_mov_b32 s19, 0xf000
947; VI-NEXT:    s_mov_b32 s18, -1
948; VI-NEXT:    s_waitcnt lgkmcnt(0)
949; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s14
950; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s12
951; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s10
952; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
953; VI-NEXT:    v_mov_b32_e32 v0, s4
954; VI-NEXT:    v_mov_b32_e32 v1, s5
955; VI-NEXT:    v_mov_b32_e32 v2, s6
956; VI-NEXT:    v_mov_b32_e32 v3, s7
957; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
958; VI-NEXT:    s_nop 0
959; VI-NEXT:    v_mov_b32_e32 v0, s0
960; VI-NEXT:    v_mov_b32_e32 v1, s1
961; VI-NEXT:    v_mov_b32_e32 v2, s2
962; VI-NEXT:    v_mov_b32_e32 v3, s3
963; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
964; VI-NEXT:    s_endpgm
965;
966; EG-LABEL: shl_v4i64:
967; EG:       ; %bb.0:
968; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
969; EG-NEXT:    TEX 3 @6
970; EG-NEXT:    ALU 47, @15, KC0[CB0:0-32], KC1[]
971; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
972; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
973; EG-NEXT:    CF_END
974; EG-NEXT:    Fetch clause starting at 6:
975; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
976; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 0, #1
977; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 32, #1
978; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
979; EG-NEXT:    ALU clause starting at 14:
980; EG-NEXT:     MOV * T0.X, KC0[2].Z,
981; EG-NEXT:    ALU clause starting at 15:
982; EG-NEXT:     AND_INT T4.Z, T1.Z, literal.x,
983; EG-NEXT:     LSHR T1.W, T0.W, 1,
984; EG-NEXT:     NOT_INT * T3.W, T1.Z,
985; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
986; EG-NEXT:     BIT_ALIGN_INT T4.X, T0.W, T0.Z, 1,
987; EG-NEXT:     AND_INT T1.Y, T3.Z, literal.x, BS:VEC_201
988; EG-NEXT:     LSHR T5.Z, T2.W, 1, BS:VEC_120/SCL_212
989; EG-NEXT:     BIT_ALIGN_INT T0.W, T2.W, T2.Z, 1, BS:VEC_102/SCL_221
990; EG-NEXT:     NOT_INT * T2.W, T3.Z,
991; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
992; EG-NEXT:     BIT_ALIGN_INT T3.Y, PV.Z, PV.W, PS,
993; EG-NEXT:     LSHL T2.Z, T2.Z, PV.Y,
994; EG-NEXT:     BIT_ALIGN_INT T0.W, T1.W, PV.X, T3.W,
995; EG-NEXT:     LSHL * T1.W, T0.Z, T4.Z,
996; EG-NEXT:     AND_INT T4.X, T1.Z, literal.x,
997; EG-NEXT:     AND_INT T1.Y, T1.X, literal.y,
998; EG-NEXT:     LSHR T0.Z, T0.Y, 1,
999; EG-NEXT:     BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1,
1000; EG-NEXT:     NOT_INT * T3.W, T1.X,
1001; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
1002; EG-NEXT:     AND_INT T5.X, T3.Z, literal.x,
1003; EG-NEXT:     BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS,
1004; EG-NEXT:     LSHL T0.Z, T0.X, PV.Y,
1005; EG-NEXT:     AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212
1006; EG-NEXT:     CNDE_INT * T4.W, PV.X, T0.W, T1.W,
1007; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1008; EG-NEXT:     AND_INT T0.X, T3.X, literal.x,
1009; EG-NEXT:     CNDE_INT T4.Y, PV.W, PV.Y, PV.Z,
1010; EG-NEXT:     LSHR T1.Z, T2.Y, 1,
1011; EG-NEXT:     BIT_ALIGN_INT T0.W, T2.Y, T2.X, 1,
1012; EG-NEXT:     NOT_INT * T3.W, T3.X,
1013; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1014; EG-NEXT:     BIT_ALIGN_INT T1.X, PV.Z, PV.W, PS,
1015; EG-NEXT:     LSHL T0.Y, T2.X, PV.X,
1016; EG-NEXT:     CNDE_INT T4.Z, T4.X, T1.W, 0.0, BS:VEC_120/SCL_212
1017; EG-NEXT:     AND_INT * T0.W, T3.X, literal.x, BS:VEC_201
1018; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1019; EG-NEXT:     CNDE_INT * T1.W, T5.X, T3.Y, T2.Z,
1020; EG-NEXT:     CNDE_INT T4.X, T2.W, T0.Z, 0.0,
1021; EG-NEXT:     CNDE_INT T1.Y, T0.W, T1.X, T0.Y, BS:VEC_120/SCL_212
1022; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
1023; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1024; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
1025; EG-NEXT:     CNDE_INT T1.Z, T5.X, T2.Z, 0.0,
1026; EG-NEXT:     CNDE_INT * T1.X, T0.W, T0.Y, 0.0,
1027; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1028; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
1029; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1030  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
1031  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
1032  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
1033  %result = shl <4 x i64> %a, %b
1034  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
1035  ret void
1036}
1037
1038; Make sure load width gets reduced to i32 load.
; The kernel argument %a is shifted left by the constant 32 and the i64
; result is stored to %out. In the expected output only a single dword of
; %a is loaded; it becomes the high half of the result and the low half is
; a constant zero.
; NOTE(review): the unnamed [8 x i32] argument is never used in the body;
; presumably it is padding that separates %a from %out in the kernel
; argument block -- confirm.
; The check lines in this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
1039define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
1040; SI-LABEL: s_shl_32_i64:
1041; SI:       ; %bb.0:
1042; SI-NEXT:    s_load_dword s4, s[0:1], 0x13
1043; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1044; SI-NEXT:    s_mov_b32 s3, 0xf000
1045; SI-NEXT:    s_mov_b32 s2, -1
1046; SI-NEXT:    v_mov_b32_e32 v0, 0
1047; SI-NEXT:    s_waitcnt lgkmcnt(0)
1048; SI-NEXT:    v_mov_b32_e32 v1, s4
1049; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1050; SI-NEXT:    s_endpgm
1051;
1052; VI-LABEL: s_shl_32_i64:
1053; VI:       ; %bb.0:
1054; VI-NEXT:    s_load_dword s4, s[0:1], 0x4c
1055; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1056; VI-NEXT:    s_mov_b32 s3, 0xf000
1057; VI-NEXT:    s_mov_b32 s2, -1
1058; VI-NEXT:    v_mov_b32_e32 v0, 0
1059; VI-NEXT:    s_waitcnt lgkmcnt(0)
1060; VI-NEXT:    v_mov_b32_e32 v1, s4
1061; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1062; VI-NEXT:    s_endpgm
1063;
1064; EG-LABEL: s_shl_32_i64:
1065; EG:       ; %bb.0:
1066; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
1067; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1068; EG-NEXT:    CF_END
1069; EG-NEXT:    PAD
1070; EG-NEXT:    ALU clause starting at 4:
1071; EG-NEXT:     MOV * T0.Y, KC0[4].W,
1072; EG-NEXT:     MOV T0.X, 0.0,
1073; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1074; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1075  %result = shl i64 %a, 32
1076  store i64 %result, i64 addrspace(1)* %out
1077  ret void
1078}
1079
; Loads the i64 at index %tid of %in, shifts it left by the constant 32 and
; stores the result at index %tid of %out. In the expected output only one
; dword of the source value is loaded; it becomes the high half of the
; result and the low half is a constant zero.
; NOTE(review): despite the "v_" prefix in the name, %tid comes from
; llvm.amdgcn.workgroup.id.x rather than llvm.amdgcn.workitem.id.x --
; confirm this is intended.
; The check lines in this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
1080define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
1081; SI-LABEL: v_shl_32_i64:
1082; SI:       ; %bb.0:
1083; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1084; SI-NEXT:    s_ashr_i32 s3, s2, 31
1085; SI-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
1086; SI-NEXT:    v_mov_b32_e32 v0, s0
1087; SI-NEXT:    s_mov_b32 s11, 0xf000
1088; SI-NEXT:    s_mov_b32 s10, 0
1089; SI-NEXT:    s_waitcnt lgkmcnt(0)
1090; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
1091; SI-NEXT:    v_mov_b32_e32 v1, s1
1092; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
1093; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
1094; SI-NEXT:    v_mov_b32_e32 v2, 0
1095; SI-NEXT:    s_waitcnt vmcnt(0)
1096; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
1097; SI-NEXT:    s_endpgm
1098;
1099; VI-LABEL: v_shl_32_i64:
1100; VI:       ; %bb.0:
1101; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1102; VI-NEXT:    s_ashr_i32 s3, s2, 31
1103; VI-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
1104; VI-NEXT:    v_mov_b32_e32 v0, 0
1105; VI-NEXT:    s_waitcnt lgkmcnt(0)
1106; VI-NEXT:    s_add_u32 s2, s6, s0
1107; VI-NEXT:    s_addc_u32 s3, s7, s1
1108; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1109; VI-NEXT:    s_add_u32 s0, s4, s0
1110; VI-NEXT:    s_addc_u32 s1, s5, s1
1111; VI-NEXT:    v_mov_b32_e32 v3, s1
1112; VI-NEXT:    v_mov_b32_e32 v2, s0
1113; VI-NEXT:    s_waitcnt lgkmcnt(0)
1114; VI-NEXT:    v_mov_b32_e32 v1, s2
1115; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1116; VI-NEXT:    s_endpgm
1117;
1118; EG-LABEL: v_shl_32_i64:
1119; EG:       ; %bb.0:
1120; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1121; EG-NEXT:    TEX 0 @6
1122; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
1123; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
1124; EG-NEXT:    CF_END
1125; EG-NEXT:    PAD
1126; EG-NEXT:    Fetch clause starting at 6:
1127; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1128; EG-NEXT:    ALU clause starting at 8:
1129; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1130; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1131; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1132; EG-NEXT:    ALU clause starting at 11:
1133; EG-NEXT:     MOV T1.X, 0.0,
1134; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
1135; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
1136; EG-NEXT:     MOV * T1.Y, T0.X,
1137; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1138  %tid = call i32 @llvm.amdgcn.workgroup.id.x() #0
1139  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
1140  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
1141  %a = load i64, i64 addrspace(1)* %gep.in
1142  %result = shl i64 %a, 32
1143  store i64 %result, i64 addrspace(1)* %gep.out
1144  ret void
1145}
1146
; Shifts the 64-bit constant 281474976710655 (0x0000FFFFFFFFFFFF) left by
; the kernel argument %a and stores the i64 result to %out. Here the
; constant is the value being shifted and the variable is the shift amount.
; In the verde and tonga output the constant is built from 0xffff (high
; half) and -1 (low half).
; The check lines in this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
1147define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
1148; SI-LABEL: s_shl_constant_i64:
1149; SI:       ; %bb.0:
1150; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1151; SI-NEXT:    s_mov_b32 s6, -1
1152; SI-NEXT:    s_mov_b32 s9, 0xffff
1153; SI-NEXT:    s_mov_b32 s8, s6
1154; SI-NEXT:    s_mov_b32 s7, 0xf000
1155; SI-NEXT:    s_waitcnt lgkmcnt(0)
1156; SI-NEXT:    s_mov_b32 s4, s0
1157; SI-NEXT:    s_mov_b32 s5, s1
1158; SI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
1159; SI-NEXT:    v_mov_b32_e32 v0, s0
1160; SI-NEXT:    v_mov_b32_e32 v1, s1
1161; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1162; SI-NEXT:    s_endpgm
1163;
1164; VI-LABEL: s_shl_constant_i64:
1165; VI:       ; %bb.0:
1166; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1167; VI-NEXT:    s_mov_b32 s6, -1
1168; VI-NEXT:    s_mov_b32 s9, 0xffff
1169; VI-NEXT:    s_mov_b32 s8, s6
1170; VI-NEXT:    s_mov_b32 s7, 0xf000
1171; VI-NEXT:    s_waitcnt lgkmcnt(0)
1172; VI-NEXT:    s_mov_b32 s4, s0
1173; VI-NEXT:    s_mov_b32 s5, s1
1174; VI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
1175; VI-NEXT:    v_mov_b32_e32 v0, s0
1176; VI-NEXT:    v_mov_b32_e32 v1, s1
1177; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1178; VI-NEXT:    s_endpgm
1179;
1180; EG-LABEL: s_shl_constant_i64:
1181; EG:       ; %bb.0:
1182; EG-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
1183; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1184; EG-NEXT:    CF_END
1185; EG-NEXT:    PAD
1186; EG-NEXT:    ALU clause starting at 4:
1187; EG-NEXT:     AND_INT T0.Z, KC0[2].W, literal.x,
1188; EG-NEXT:     MOV T0.W, literal.y,
1189; EG-NEXT:     NOT_INT * T1.W, KC0[2].W,
1190; EG-NEXT:    31(4.344025e-44), -1(nan)
1191; EG-NEXT:     BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
1192; EG-NEXT:     LSHL T0.W, literal.y, PV.Z,
1193; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.z,
1194; EG-NEXT:    32767(4.591635e-41), -1(nan)
1195; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1196; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1197; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1198; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1199; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1200  %shl = shl i64 281474976710655, %a
1201  store i64 %shl, i64 addrspace(1)* %out, align 8
1202  ret void
1203}
1204
; Shifts the 64-bit constant 1231231234567 (0x0000011EAB19B207) left by the
; i64 loaded from %aptr and stores the result to %out. The constant does not
; fit in 32 bits; in the verde and tonga output it is built from two
; separate 32-bit moves (0xab19b207 and 0x11e).
; The check lines in this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
1205define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1206; SI-LABEL: v_shl_constant_i64:
1207; SI:       ; %bb.0:
1208; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1209; SI-NEXT:    s_mov_b32 s7, 0xf000
1210; SI-NEXT:    s_mov_b32 s6, -1
1211; SI-NEXT:    s_mov_b32 s10, s6
1212; SI-NEXT:    s_mov_b32 s11, s7
1213; SI-NEXT:    s_waitcnt lgkmcnt(0)
1214; SI-NEXT:    s_mov_b32 s8, s2
1215; SI-NEXT:    s_mov_b32 s9, s3
1216; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1217; SI-NEXT:    s_mov_b32 s2, 0xab19b207
1218; SI-NEXT:    s_movk_i32 s3, 0x11e
1219; SI-NEXT:    s_mov_b32 s4, s0
1220; SI-NEXT:    s_mov_b32 s5, s1
1221; SI-NEXT:    s_waitcnt vmcnt(0)
1222; SI-NEXT:    v_lshl_b64 v[0:1], s[2:3], v0
1223; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1224; SI-NEXT:    s_endpgm
1225;
1226; VI-LABEL: v_shl_constant_i64:
1227; VI:       ; %bb.0:
1228; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1229; VI-NEXT:    s_mov_b32 s7, 0xf000
1230; VI-NEXT:    s_mov_b32 s6, -1
1231; VI-NEXT:    s_waitcnt lgkmcnt(0)
1232; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1233; VI-NEXT:    s_mov_b32 s4, s0
1234; VI-NEXT:    s_mov_b32 s5, s1
1235; VI-NEXT:    s_mov_b32 s0, 0xab19b207
1236; VI-NEXT:    s_movk_i32 s1, 0x11e
1237; VI-NEXT:    s_waitcnt lgkmcnt(0)
1238; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1239; VI-NEXT:    v_mov_b32_e32 v0, s0
1240; VI-NEXT:    v_mov_b32_e32 v1, s1
1241; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1242; VI-NEXT:    s_endpgm
1243;
1244; EG-LABEL: v_shl_constant_i64:
1245; EG:       ; %bb.0:
1246; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1247; EG-NEXT:    TEX 0 @6
1248; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1249; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1250; EG-NEXT:    CF_END
1251; EG-NEXT:    PAD
1252; EG-NEXT:    Fetch clause starting at 6:
1253; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1254; EG-NEXT:    ALU clause starting at 8:
1255; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1256; EG-NEXT:    ALU clause starting at 9:
1257; EG-NEXT:     NOT_INT T0.Z, T0.X,
1258; EG-NEXT:     MOV T0.W, literal.x,
1259; EG-NEXT:     AND_INT * T1.W, T0.X, literal.y,
1260; EG-NEXT:    1435293955(1.935796e+13), 31(4.344025e-44)
1261; EG-NEXT:     LSHL T1.Z, literal.x, PS,
1262; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.y, PV.W, PV.Z,
1263; EG-NEXT:     AND_INT * T1.W, T0.X, literal.z,
1264; EG-NEXT:    -1424379385(-5.460358e-13), 143(2.003857e-43)
1265; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1266; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1267; EG-NEXT:     CNDE_INT T0.X, T1.W, T1.Z, 0.0,
1268; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1269; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1270  %a = load i64, i64 addrspace(1)* %aptr, align 8
1271  %shl = shl i64 1231231234567, %a
1272  store i64 %shl, i64 addrspace(1)* %out, align 8
1273  ret void
1274}
1275
; Shifts the constant 1234567 (0x12D687), which fits in 32 bits, left by the
; i64 loaded from %aptr and stores the i64 result to %out. In the verde and
; tonga output the constant is materialized with a single s_mov_b64 of
; 0x12d687.
; The check lines in this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
1276define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1277; SI-LABEL: v_shl_i64_32_bit_constant:
1278; SI:       ; %bb.0:
1279; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1280; SI-NEXT:    s_mov_b32 s7, 0xf000
1281; SI-NEXT:    s_mov_b32 s6, -1
1282; SI-NEXT:    s_mov_b32 s10, s6
1283; SI-NEXT:    s_mov_b32 s11, s7
1284; SI-NEXT:    s_waitcnt lgkmcnt(0)
1285; SI-NEXT:    s_mov_b32 s8, s2
1286; SI-NEXT:    s_mov_b32 s9, s3
1287; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1288; SI-NEXT:    s_mov_b64 s[2:3], 0x12d687
1289; SI-NEXT:    s_mov_b32 s4, s0
1290; SI-NEXT:    s_mov_b32 s5, s1
1291; SI-NEXT:    s_waitcnt vmcnt(0)
1292; SI-NEXT:    v_lshl_b64 v[0:1], s[2:3], v0
1293; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1294; SI-NEXT:    s_endpgm
1295;
1296; VI-LABEL: v_shl_i64_32_bit_constant:
1297; VI:       ; %bb.0:
1298; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1299; VI-NEXT:    s_mov_b32 s7, 0xf000
1300; VI-NEXT:    s_mov_b32 s6, -1
1301; VI-NEXT:    s_waitcnt lgkmcnt(0)
1302; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1303; VI-NEXT:    s_mov_b32 s4, s0
1304; VI-NEXT:    s_mov_b32 s5, s1
1305; VI-NEXT:    s_mov_b64 s[0:1], 0x12d687
1306; VI-NEXT:    s_waitcnt lgkmcnt(0)
1307; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1308; VI-NEXT:    v_mov_b32_e32 v0, s0
1309; VI-NEXT:    v_mov_b32_e32 v1, s1
1310; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1311; VI-NEXT:    s_endpgm
1312;
1313; EG-LABEL: v_shl_i64_32_bit_constant:
1314; EG:       ; %bb.0:
1315; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1316; EG-NEXT:    TEX 0 @6
1317; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
1318; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1319; EG-NEXT:    CF_END
1320; EG-NEXT:    PAD
1321; EG-NEXT:    Fetch clause starting at 6:
1322; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1323; EG-NEXT:    ALU clause starting at 8:
1324; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1325; EG-NEXT:    ALU clause starting at 9:
1326; EG-NEXT:     AND_INT T0.W, T0.X, literal.x,
1327; EG-NEXT:     NOT_INT * T1.W, T0.X,
1328; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1329; EG-NEXT:     BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
1330; EG-NEXT:     LSHL T0.W, literal.y, PV.W,
1331; EG-NEXT:     AND_INT * T1.W, T0.X, literal.z,
1332; EG-NEXT:    617283(8.649977e-40), 1234567(1.729997e-39)
1333; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1334; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1335; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1336; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1337; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1338  %a = load i64, i64 addrspace(1)* %aptr, align 8
1339  %shl = shl i64 1234567, %a
1340  store i64 %shl, i64 addrspace(1)* %out, align 8
1341  ret void
1342}
1343
; Shifts the constant 64 left by the i64 loaded from %aptr and stores the
; i64 result to %out. In the verde and tonga output the constant 64 appears
; directly as an operand of the 64-bit shift instruction; no separate move
; is emitted for it.
; The check lines in this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
1344define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1345; SI-LABEL: v_shl_inline_imm_64_i64:
1346; SI:       ; %bb.0:
1347; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1348; SI-NEXT:    s_mov_b32 s7, 0xf000
1349; SI-NEXT:    s_mov_b32 s6, -1
1350; SI-NEXT:    s_mov_b32 s10, s6
1351; SI-NEXT:    s_mov_b32 s11, s7
1352; SI-NEXT:    s_waitcnt lgkmcnt(0)
1353; SI-NEXT:    s_mov_b32 s8, s2
1354; SI-NEXT:    s_mov_b32 s9, s3
1355; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1356; SI-NEXT:    s_mov_b32 s4, s0
1357; SI-NEXT:    s_mov_b32 s5, s1
1358; SI-NEXT:    s_waitcnt vmcnt(0)
1359; SI-NEXT:    v_lshl_b64 v[0:1], 64, v0
1360; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1361; SI-NEXT:    s_endpgm
1362;
1363; VI-LABEL: v_shl_inline_imm_64_i64:
1364; VI:       ; %bb.0:
1365; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1366; VI-NEXT:    s_waitcnt lgkmcnt(0)
1367; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
1368; VI-NEXT:    s_mov_b32 s3, 0xf000
1369; VI-NEXT:    s_mov_b32 s2, -1
1370; VI-NEXT:    s_waitcnt lgkmcnt(0)
1371; VI-NEXT:    s_lshl_b64 s[4:5], 64, s4
1372; VI-NEXT:    v_mov_b32_e32 v0, s4
1373; VI-NEXT:    v_mov_b32_e32 v1, s5
1374; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1375; VI-NEXT:    s_endpgm
1376;
1377; EG-LABEL: v_shl_inline_imm_64_i64:
1378; EG:       ; %bb.0:
1379; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1380; EG-NEXT:    TEX 0 @6
1381; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
1382; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1383; EG-NEXT:    CF_END
1384; EG-NEXT:    PAD
1385; EG-NEXT:    Fetch clause starting at 6:
1386; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1387; EG-NEXT:    ALU clause starting at 8:
1388; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1389; EG-NEXT:    ALU clause starting at 9:
1390; EG-NEXT:     AND_INT T0.W, T0.X, literal.x,
1391; EG-NEXT:     NOT_INT * T1.W, T0.X,
1392; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1393; EG-NEXT:     BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
1394; EG-NEXT:     LSHL T0.W, literal.y, PV.W,
1395; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
1396; EG-NEXT:    32(4.484155e-44), 64(8.968310e-44)
1397; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1398; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1399; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1400; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1401  %a = load i64, i64 addrspace(1)* %aptr, align 8
1402  %shl = shl i64 64, %a
1403  store i64 %shl, i64 addrspace(1)* %out, align 8
1404  ret void
1405}
1406
; Shifts the constant 64 left by the kernel argument %a and stores the i64
; result to %out. The %aptr argument is not referenced in the body. In the
; verde and tonga output the constant 64 appears directly as an operand of
; s_lshl_b64; no separate move is emitted for it.
; The check lines in this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
1407define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1408; SI-LABEL: s_shl_inline_imm_64_i64:
1409; SI:       ; %bb.0:
1410; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1411; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1412; SI-NEXT:    s_mov_b32 s3, 0xf000
1413; SI-NEXT:    s_mov_b32 s2, -1
1414; SI-NEXT:    s_waitcnt lgkmcnt(0)
1415; SI-NEXT:    s_lshl_b64 s[4:5], 64, s4
1416; SI-NEXT:    v_mov_b32_e32 v0, s4
1417; SI-NEXT:    v_mov_b32_e32 v1, s5
1418; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1419; SI-NEXT:    s_endpgm
1420;
1421; VI-LABEL: s_shl_inline_imm_64_i64:
1422; VI:       ; %bb.0:
1423; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1424; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1425; VI-NEXT:    s_mov_b32 s3, 0xf000
1426; VI-NEXT:    s_mov_b32 s2, -1
1427; VI-NEXT:    s_waitcnt lgkmcnt(0)
1428; VI-NEXT:    s_lshl_b64 s[4:5], 64, s4
1429; VI-NEXT:    v_mov_b32_e32 v0, s4
1430; VI-NEXT:    v_mov_b32_e32 v1, s5
1431; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1432; VI-NEXT:    s_endpgm
1433;
1434; EG-LABEL: s_shl_inline_imm_64_i64:
1435; EG:       ; %bb.0:
1436; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
1437; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1438; EG-NEXT:    CF_END
1439; EG-NEXT:    PAD
1440; EG-NEXT:    ALU clause starting at 4:
1441; EG-NEXT:     NOT_INT T0.W, KC0[2].W,
1442; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.x,
1443; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1444; EG-NEXT:     LSHL T0.Z, literal.x, PS,
1445; EG-NEXT:     BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
1446; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1447; EG-NEXT:    64(8.968310e-44), 32(4.484155e-44)
1448; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1449; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.Z, 0.0,
1450; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1451; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1452  %shl = shl i64 64, %a
1453  store i64 %shl, i64 addrspace(1)* %out, align 8
1454  ret void
1455}
1456
; Shifts the constant 1 left by the kernel argument %a and stores the i64
; result to %out. The %aptr argument is not referenced in the body. In the
; verde and tonga output the constant 1 appears directly as an operand of
; s_lshl_b64; no separate move is emitted for it.
; The check lines in this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
1457define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1458; SI-LABEL: s_shl_inline_imm_1_i64:
1459; SI:       ; %bb.0:
1460; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1461; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1462; SI-NEXT:    s_mov_b32 s3, 0xf000
1463; SI-NEXT:    s_mov_b32 s2, -1
1464; SI-NEXT:    s_waitcnt lgkmcnt(0)
1465; SI-NEXT:    s_lshl_b64 s[4:5], 1, s4
1466; SI-NEXT:    v_mov_b32_e32 v0, s4
1467; SI-NEXT:    v_mov_b32_e32 v1, s5
1468; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1469; SI-NEXT:    s_endpgm
1470;
1471; VI-LABEL: s_shl_inline_imm_1_i64:
1472; VI:       ; %bb.0:
1473; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1474; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1475; VI-NEXT:    s_mov_b32 s3, 0xf000
1476; VI-NEXT:    s_mov_b32 s2, -1
1477; VI-NEXT:    s_waitcnt lgkmcnt(0)
1478; VI-NEXT:    s_lshl_b64 s[4:5], 1, s4
1479; VI-NEXT:    v_mov_b32_e32 v0, s4
1480; VI-NEXT:    v_mov_b32_e32 v1, s5
1481; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1482; VI-NEXT:    s_endpgm
1483;
1484; EG-LABEL: s_shl_inline_imm_1_i64:
1485; EG:       ; %bb.0:
1486; EG-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
1487; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1488; EG-NEXT:    CF_END
1489; EG-NEXT:    PAD
1490; EG-NEXT:    ALU clause starting at 4:
1491; EG-NEXT:     AND_INT T0.W, KC0[2].W, literal.x,
1492; EG-NEXT:     LSHL * T1.W, KC0[2].W, literal.y,
1493; EG-NEXT:    31(4.344025e-44), 26(3.643376e-44)
1494; EG-NEXT:     ASHR T1.W, PS, literal.x,
1495; EG-NEXT:     LSHL * T0.W, 1, PV.W,
1496; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1497; EG-NEXT:     AND_INT T0.Y, PV.W, PS,
1498; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.x,
1499; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1500; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, 0.0,
1501; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1502; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1503  %shl = shl i64 1, %a
1504  store i64 %shl, i64 addrspace(1)* %out, align 8
1505  ret void
1506}
1507
; Shifts the constant 4607182418800017408 (0x3FF0000000000000) left by the
; kernel argument %a and stores the i64 result to %out. The %aptr argument
; is not referenced in the body. In the verde and tonga output this integer
; constant is printed as the operand "1.0" of s_lshl_b64; no separate move
; is emitted for it.
; The check lines in this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
1508define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1509; SI-LABEL: s_shl_inline_imm_1_0_i64:
1510; SI:       ; %bb.0:
1511; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1512; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1513; SI-NEXT:    s_mov_b32 s3, 0xf000
1514; SI-NEXT:    s_mov_b32 s2, -1
1515; SI-NEXT:    s_waitcnt lgkmcnt(0)
1516; SI-NEXT:    s_lshl_b64 s[4:5], 1.0, s4
1517; SI-NEXT:    v_mov_b32_e32 v0, s4
1518; SI-NEXT:    v_mov_b32_e32 v1, s5
1519; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1520; SI-NEXT:    s_endpgm
1521;
1522; VI-LABEL: s_shl_inline_imm_1_0_i64:
1523; VI:       ; %bb.0:
1524; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1525; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1526; VI-NEXT:    s_mov_b32 s3, 0xf000
1527; VI-NEXT:    s_mov_b32 s2, -1
1528; VI-NEXT:    s_waitcnt lgkmcnt(0)
1529; VI-NEXT:    s_lshl_b64 s[4:5], 1.0, s4
1530; VI-NEXT:    v_mov_b32_e32 v0, s4
1531; VI-NEXT:    v_mov_b32_e32 v1, s5
1532; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1533; VI-NEXT:    s_endpgm
1534;
1535; EG-LABEL: s_shl_inline_imm_1_0_i64:
1536; EG:       ; %bb.0:
1537; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1538; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1539; EG-NEXT:    CF_END
1540; EG-NEXT:    PAD
1541; EG-NEXT:    ALU clause starting at 4:
1542; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1543; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1544; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1545; EG-NEXT:    536346624(1.050321e-19), 32(4.484155e-44)
1546; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1547; EG-NEXT:     MOV T0.X, 0.0,
1548; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1549; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1550  %shl = shl i64 4607182418800017408, %a
1551  store i64 %shl, i64 addrspace(1)* %out, align 8
1552  ret void
1553}
1554
; Shift the i64 constant 13830554455654793216 (0xBFF0000000000000, the bit
; pattern of the double -1.0) left by the kernel argument %a and store the
; result to %out. The SI and VI checks expect the constant to be encoded as
; the inline operand -1.0 of s_lshl_b64.
1555define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1556; SI-LABEL: s_shl_inline_imm_neg_1_0_i64:
1557; SI:       ; %bb.0:
1558; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1559; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1560; SI-NEXT:    s_mov_b32 s3, 0xf000
1561; SI-NEXT:    s_mov_b32 s2, -1
1562; SI-NEXT:    s_waitcnt lgkmcnt(0)
1563; SI-NEXT:    s_lshl_b64 s[4:5], -1.0, s4
1564; SI-NEXT:    v_mov_b32_e32 v0, s4
1565; SI-NEXT:    v_mov_b32_e32 v1, s5
1566; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1567; SI-NEXT:    s_endpgm
1568;
1569; VI-LABEL: s_shl_inline_imm_neg_1_0_i64:
1570; VI:       ; %bb.0:
1571; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1572; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1573; VI-NEXT:    s_mov_b32 s3, 0xf000
1574; VI-NEXT:    s_mov_b32 s2, -1
1575; VI-NEXT:    s_waitcnt lgkmcnt(0)
1576; VI-NEXT:    s_lshl_b64 s[4:5], -1.0, s4
1577; VI-NEXT:    v_mov_b32_e32 v0, s4
1578; VI-NEXT:    v_mov_b32_e32 v1, s5
1579; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1580; VI-NEXT:    s_endpgm
1581;
1582; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
1583; EG:       ; %bb.0:
1584; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1585; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1586; EG-NEXT:    CF_END
1587; EG-NEXT:    PAD
1588; EG-NEXT:    ALU clause starting at 4:
1589; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1590; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1591; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1592; EG-NEXT:    1610088448(3.574057e+19), 32(4.484155e-44)
1593; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1594; EG-NEXT:     MOV T0.X, 0.0,
1595; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1596; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1597  %shl = shl i64 13830554455654793216, %a
1598  store i64 %shl, i64 addrspace(1)* %out, align 8
1599  ret void
1600}
1601
; Shift the i64 constant 4602678819172646912 (0x3FE0000000000000, the bit
; pattern of the double 0.5) left by the kernel argument %a and store the
; result to %out. The SI and VI checks expect the constant to be encoded as
; the inline operand 0.5 of s_lshl_b64.
1602define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1603; SI-LABEL: s_shl_inline_imm_0_5_i64:
1604; SI:       ; %bb.0:
1605; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1606; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1607; SI-NEXT:    s_mov_b32 s3, 0xf000
1608; SI-NEXT:    s_mov_b32 s2, -1
1609; SI-NEXT:    s_waitcnt lgkmcnt(0)
1610; SI-NEXT:    s_lshl_b64 s[4:5], 0.5, s4
1611; SI-NEXT:    v_mov_b32_e32 v0, s4
1612; SI-NEXT:    v_mov_b32_e32 v1, s5
1613; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1614; SI-NEXT:    s_endpgm
1615;
1616; VI-LABEL: s_shl_inline_imm_0_5_i64:
1617; VI:       ; %bb.0:
1618; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1619; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1620; VI-NEXT:    s_mov_b32 s3, 0xf000
1621; VI-NEXT:    s_mov_b32 s2, -1
1622; VI-NEXT:    s_waitcnt lgkmcnt(0)
1623; VI-NEXT:    s_lshl_b64 s[4:5], 0.5, s4
1624; VI-NEXT:    v_mov_b32_e32 v0, s4
1625; VI-NEXT:    v_mov_b32_e32 v1, s5
1626; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1627; VI-NEXT:    s_endpgm
1628;
1629; EG-LABEL: s_shl_inline_imm_0_5_i64:
1630; EG:       ; %bb.0:
1631; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1632; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1633; EG-NEXT:    CF_END
1634; EG-NEXT:    PAD
1635; EG-NEXT:    ALU clause starting at 4:
1636; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1637; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1638; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1639; EG-NEXT:    535822336(1.016440e-19), 32(4.484155e-44)
1640; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1641; EG-NEXT:     MOV T0.X, 0.0,
1642; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1643; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1644  %shl = shl i64 4602678819172646912, %a
1645  store i64 %shl, i64 addrspace(1)* %out, align 8
1646  ret void
1647}
1648
; Shift the i64 constant 13826050856027422720 (0xBFE0000000000000, the bit
; pattern of the double -0.5) left by the kernel argument %a and store the
; result to %out. The SI and VI checks expect the constant to be encoded as
; the inline operand -0.5 of s_lshl_b64.
1649define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1650; SI-LABEL: s_shl_inline_imm_neg_0_5_i64:
1651; SI:       ; %bb.0:
1652; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1653; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1654; SI-NEXT:    s_mov_b32 s3, 0xf000
1655; SI-NEXT:    s_mov_b32 s2, -1
1656; SI-NEXT:    s_waitcnt lgkmcnt(0)
1657; SI-NEXT:    s_lshl_b64 s[4:5], -0.5, s4
1658; SI-NEXT:    v_mov_b32_e32 v0, s4
1659; SI-NEXT:    v_mov_b32_e32 v1, s5
1660; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1661; SI-NEXT:    s_endpgm
1662;
1663; VI-LABEL: s_shl_inline_imm_neg_0_5_i64:
1664; VI:       ; %bb.0:
1665; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1666; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1667; VI-NEXT:    s_mov_b32 s3, 0xf000
1668; VI-NEXT:    s_mov_b32 s2, -1
1669; VI-NEXT:    s_waitcnt lgkmcnt(0)
1670; VI-NEXT:    s_lshl_b64 s[4:5], -0.5, s4
1671; VI-NEXT:    v_mov_b32_e32 v0, s4
1672; VI-NEXT:    v_mov_b32_e32 v1, s5
1673; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1674; VI-NEXT:    s_endpgm
1675;
1676; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
1677; EG:       ; %bb.0:
1678; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1679; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1680; EG-NEXT:    CF_END
1681; EG-NEXT:    PAD
1682; EG-NEXT:    ALU clause starting at 4:
1683; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1684; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1685; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1686; EG-NEXT:    1609564160(3.458765e+19), 32(4.484155e-44)
1687; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1688; EG-NEXT:     MOV T0.X, 0.0,
1689; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1690; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1691  %shl = shl i64 13826050856027422720, %a
1692  store i64 %shl, i64 addrspace(1)* %out, align 8
1693  ret void
1694}
1695
; Shift the i64 constant 4611686018427387904 (0x4000000000000000, the bit
; pattern of the double 2.0) left by the kernel argument %a and store the
; result to %out. The SI and VI checks expect the constant to be encoded as
; the inline operand 2.0 of s_lshl_b64.
1696define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1697; SI-LABEL: s_shl_inline_imm_2_0_i64:
1698; SI:       ; %bb.0:
1699; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1700; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1701; SI-NEXT:    s_mov_b32 s3, 0xf000
1702; SI-NEXT:    s_mov_b32 s2, -1
1703; SI-NEXT:    s_waitcnt lgkmcnt(0)
1704; SI-NEXT:    s_lshl_b64 s[4:5], 2.0, s4
1705; SI-NEXT:    v_mov_b32_e32 v0, s4
1706; SI-NEXT:    v_mov_b32_e32 v1, s5
1707; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1708; SI-NEXT:    s_endpgm
1709;
1710; VI-LABEL: s_shl_inline_imm_2_0_i64:
1711; VI:       ; %bb.0:
1712; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1713; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1714; VI-NEXT:    s_mov_b32 s3, 0xf000
1715; VI-NEXT:    s_mov_b32 s2, -1
1716; VI-NEXT:    s_waitcnt lgkmcnt(0)
1717; VI-NEXT:    s_lshl_b64 s[4:5], 2.0, s4
1718; VI-NEXT:    v_mov_b32_e32 v0, s4
1719; VI-NEXT:    v_mov_b32_e32 v1, s5
1720; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1721; VI-NEXT:    s_endpgm
1722;
1723; EG-LABEL: s_shl_inline_imm_2_0_i64:
1724; EG:       ; %bb.0:
1725; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1726; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1727; EG-NEXT:    CF_END
1728; EG-NEXT:    PAD
1729; EG-NEXT:    ALU clause starting at 4:
1730; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1731; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1732; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1733; EG-NEXT:    536870912(1.084202e-19), 32(4.484155e-44)
1734; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1735; EG-NEXT:     MOV T0.X, 0.0,
1736; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1737; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1738  %shl = shl i64 4611686018427387904, %a
1739  store i64 %shl, i64 addrspace(1)* %out, align 8
1740  ret void
1741}
1742
; Shift the i64 constant 13835058055282163712 (0xC000000000000000, the bit
; pattern of the double -2.0) left by the kernel argument %a and store the
; result to %out. The SI and VI checks expect the constant to be encoded as
; the inline operand -2.0 of s_lshl_b64.
1743define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1744; SI-LABEL: s_shl_inline_imm_neg_2_0_i64:
1745; SI:       ; %bb.0:
1746; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1747; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1748; SI-NEXT:    s_mov_b32 s3, 0xf000
1749; SI-NEXT:    s_mov_b32 s2, -1
1750; SI-NEXT:    s_waitcnt lgkmcnt(0)
1751; SI-NEXT:    s_lshl_b64 s[4:5], -2.0, s4
1752; SI-NEXT:    v_mov_b32_e32 v0, s4
1753; SI-NEXT:    v_mov_b32_e32 v1, s5
1754; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1755; SI-NEXT:    s_endpgm
1756;
1757; VI-LABEL: s_shl_inline_imm_neg_2_0_i64:
1758; VI:       ; %bb.0:
1759; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1760; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1761; VI-NEXT:    s_mov_b32 s3, 0xf000
1762; VI-NEXT:    s_mov_b32 s2, -1
1763; VI-NEXT:    s_waitcnt lgkmcnt(0)
1764; VI-NEXT:    s_lshl_b64 s[4:5], -2.0, s4
1765; VI-NEXT:    v_mov_b32_e32 v0, s4
1766; VI-NEXT:    v_mov_b32_e32 v1, s5
1767; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1768; VI-NEXT:    s_endpgm
1769;
1770; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
1771; EG:       ; %bb.0:
1772; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1773; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1774; EG-NEXT:    CF_END
1775; EG-NEXT:    PAD
1776; EG-NEXT:    ALU clause starting at 4:
1777; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1778; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1779; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1780; EG-NEXT:    1610612736(3.689349e+19), 32(4.484155e-44)
1781; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1782; EG-NEXT:     MOV T0.X, 0.0,
1783; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1784; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1785  %shl = shl i64 13835058055282163712, %a
1786  store i64 %shl, i64 addrspace(1)* %out, align 8
1787  ret void
1788}
1789
; Shift the i64 constant 4616189618054758400 (0x4010000000000000, the bit
; pattern of the double 4.0) left by the kernel argument %a and store the
; result to %out. The SI and VI checks expect the constant to be encoded as
; the inline operand 4.0 of s_lshl_b64.
1790define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1791; SI-LABEL: s_shl_inline_imm_4_0_i64:
1792; SI:       ; %bb.0:
1793; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1794; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1795; SI-NEXT:    s_mov_b32 s3, 0xf000
1796; SI-NEXT:    s_mov_b32 s2, -1
1797; SI-NEXT:    s_waitcnt lgkmcnt(0)
1798; SI-NEXT:    s_lshl_b64 s[4:5], 4.0, s4
1799; SI-NEXT:    v_mov_b32_e32 v0, s4
1800; SI-NEXT:    v_mov_b32_e32 v1, s5
1801; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1802; SI-NEXT:    s_endpgm
1803;
1804; VI-LABEL: s_shl_inline_imm_4_0_i64:
1805; VI:       ; %bb.0:
1806; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1807; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1808; VI-NEXT:    s_mov_b32 s3, 0xf000
1809; VI-NEXT:    s_mov_b32 s2, -1
1810; VI-NEXT:    s_waitcnt lgkmcnt(0)
1811; VI-NEXT:    s_lshl_b64 s[4:5], 4.0, s4
1812; VI-NEXT:    v_mov_b32_e32 v0, s4
1813; VI-NEXT:    v_mov_b32_e32 v1, s5
1814; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1815; VI-NEXT:    s_endpgm
1816;
1817; EG-LABEL: s_shl_inline_imm_4_0_i64:
1818; EG:       ; %bb.0:
1819; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1820; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1821; EG-NEXT:    CF_END
1822; EG-NEXT:    PAD
1823; EG-NEXT:    ALU clause starting at 4:
1824; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1825; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1826; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1827; EG-NEXT:    537395200(1.151965e-19), 32(4.484155e-44)
1828; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1829; EG-NEXT:     MOV T0.X, 0.0,
1830; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1831; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1832  %shl = shl i64 4616189618054758400, %a
1833  store i64 %shl, i64 addrspace(1)* %out, align 8
1834  ret void
1835}
1836
; Shift the i64 constant 13839561654909534208 (0xC010000000000000, the bit
; pattern of the double -4.0) left by the kernel argument %a and store the
; result to %out. The SI and VI checks expect the constant to be encoded as
; the inline operand -4.0 of s_lshl_b64.
1837define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1838; SI-LABEL: s_shl_inline_imm_neg_4_0_i64:
1839; SI:       ; %bb.0:
1840; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1841; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1842; SI-NEXT:    s_mov_b32 s3, 0xf000
1843; SI-NEXT:    s_mov_b32 s2, -1
1844; SI-NEXT:    s_waitcnt lgkmcnt(0)
1845; SI-NEXT:    s_lshl_b64 s[4:5], -4.0, s4
1846; SI-NEXT:    v_mov_b32_e32 v0, s4
1847; SI-NEXT:    v_mov_b32_e32 v1, s5
1848; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1849; SI-NEXT:    s_endpgm
1850;
1851; VI-LABEL: s_shl_inline_imm_neg_4_0_i64:
1852; VI:       ; %bb.0:
1853; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1854; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1855; VI-NEXT:    s_mov_b32 s3, 0xf000
1856; VI-NEXT:    s_mov_b32 s2, -1
1857; VI-NEXT:    s_waitcnt lgkmcnt(0)
1858; VI-NEXT:    s_lshl_b64 s[4:5], -4.0, s4
1859; VI-NEXT:    v_mov_b32_e32 v0, s4
1860; VI-NEXT:    v_mov_b32_e32 v1, s5
1861; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1862; VI-NEXT:    s_endpgm
1863;
1864; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
1865; EG:       ; %bb.0:
1866; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1867; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1868; EG-NEXT:    CF_END
1869; EG-NEXT:    PAD
1870; EG-NEXT:    ALU clause starting at 4:
1871; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1872; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1873; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1874; EG-NEXT:    1611137024(3.919933e+19), 32(4.484155e-44)
1875; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1876; EG-NEXT:     MOV T0.X, 0.0,
1877; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1878; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1879  %shl = shl i64 13839561654909534208, %a
1880  store i64 %shl, i64 addrspace(1)* %out, align 8
1881  ret void
1882}
1883
1884
1885; Test with the 64-bit integer bitpattern for a 32-bit float in the
1886; low 32-bits, which is not a valid 64-bit inline immediate.
; The shifted constant 1082130432 is 0x40800000, the f32 4.0 bit pattern held
; in the low 32 bits of the i64. The SI and VI checks expect it to be moved
; into s[0:1] as the literal 0x40800000 with s_mov_b64, and s_lshl_b64 to take
; that register pair as its source rather than an inline operand.
1887define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1888; SI-LABEL: s_shl_inline_imm_f32_4_0_i64:
1889; SI:       ; %bb.0:
1890; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1891; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
1892; SI-NEXT:    s_mov_b64 s[0:1], 0x40800000
1893; SI-NEXT:    s_mov_b32 s7, 0xf000
1894; SI-NEXT:    s_mov_b32 s6, -1
1895; SI-NEXT:    s_waitcnt lgkmcnt(0)
1896; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1897; SI-NEXT:    v_mov_b32_e32 v0, s0
1898; SI-NEXT:    v_mov_b32_e32 v1, s1
1899; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1900; SI-NEXT:    s_endpgm
1901;
1902; VI-LABEL: s_shl_inline_imm_f32_4_0_i64:
1903; VI:       ; %bb.0:
1904; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1905; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
1906; VI-NEXT:    s_mov_b64 s[0:1], 0x40800000
1907; VI-NEXT:    s_mov_b32 s7, 0xf000
1908; VI-NEXT:    s_mov_b32 s6, -1
1909; VI-NEXT:    s_waitcnt lgkmcnt(0)
1910; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1911; VI-NEXT:    v_mov_b32_e32 v0, s0
1912; VI-NEXT:    v_mov_b32_e32 v1, s1
1913; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1914; VI-NEXT:    s_endpgm
1915;
1916; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
1917; EG:       ; %bb.0:
1918; EG-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
1919; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1920; EG-NEXT:    CF_END
1921; EG-NEXT:    PAD
1922; EG-NEXT:    ALU clause starting at 4:
1923; EG-NEXT:     NOT_INT T0.W, KC0[2].W,
1924; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.x,
1925; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1926; EG-NEXT:     LSHL T0.Z, literal.x, PS,
1927; EG-NEXT:     BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
1928; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.z,
1929; EG-NEXT:    1082130432(4.000000e+00), 541065216(1.626303e-19)
1930; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1931; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1932; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.Z, 0.0,
1933; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1934; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1935  %shl = shl i64 1082130432, %a
1936  store i64 %shl, i64 addrspace(1)* %out, align 8
1937  ret void
1938}
1939
1940; FIXME: Copy of -1 register
; The shifted constant -1065353216 is the 32-bit pattern 0xC0800000 (f32 -4.0)
; sign-extended to i64, so its high 32 bits are all ones. The SI and VI checks
; expect the low half to be set with "s_mov_b32 s0, -4.0" and the high half to
; be copied from s6, which was loaded with -1 ("s_mov_b32 s1, s6"), before the
; register pair is shifted by s_lshl_b64.
1941define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1942; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1943; SI:       ; %bb.0:
1944; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1945; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
1946; SI-NEXT:    s_mov_b32 s6, -1
1947; SI-NEXT:    s_mov_b32 s0, -4.0
1948; SI-NEXT:    s_mov_b32 s1, s6
1949; SI-NEXT:    s_mov_b32 s7, 0xf000
1950; SI-NEXT:    s_waitcnt lgkmcnt(0)
1951; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1952; SI-NEXT:    v_mov_b32_e32 v0, s0
1953; SI-NEXT:    v_mov_b32_e32 v1, s1
1954; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1955; SI-NEXT:    s_endpgm
1956;
1957; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1958; VI:       ; %bb.0:
1959; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1960; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
1961; VI-NEXT:    s_mov_b32 s6, -1
1962; VI-NEXT:    s_mov_b32 s0, -4.0
1963; VI-NEXT:    s_mov_b32 s1, s6
1964; VI-NEXT:    s_mov_b32 s7, 0xf000
1965; VI-NEXT:    s_waitcnt lgkmcnt(0)
1966; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1967; VI-NEXT:    v_mov_b32_e32 v0, s0
1968; VI-NEXT:    v_mov_b32_e32 v1, s1
1969; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1970; VI-NEXT:    s_endpgm
1971;
1972; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1973; EG:       ; %bb.0:
1974; EG-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
1975; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1976; EG-NEXT:    CF_END
1977; EG-NEXT:    PAD
1978; EG-NEXT:    ALU clause starting at 4:
1979; EG-NEXT:     AND_INT T0.Z, KC0[2].W, literal.x,
1980; EG-NEXT:     MOV T0.W, literal.y,
1981; EG-NEXT:     NOT_INT * T1.W, KC0[2].W,
1982; EG-NEXT:    31(4.344025e-44), -532676608(-5.534023e+19)
1983; EG-NEXT:     BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
1984; EG-NEXT:     LSHL T0.W, literal.y, PV.Z,
1985; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.z,
1986; EG-NEXT:    2147483647(nan), -1065353216(-4.000000e+00)
1987; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1988; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1989; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1990; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1991; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1992  %shl = shl i64 -1065353216, %a
1993  store i64 %shl, i64 addrspace(1)* %out, align 8
1994  ret void
1995}
1996
; The shifted constant 4647714815446351872 is 0x4080000000000000: the f32 4.0
; bit pattern (0x40800000) in the high 32 bits with a zero low half. The SI and
; VI checks expect the two halves to be set separately ("s_mov_b32 s0, 0" and
; "s_mov_b32 s1, 4.0") and s_lshl_b64 to shift that register pair.
1997define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1998; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
1999; SI:       ; %bb.0:
2000; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2001; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
2002; SI-NEXT:    s_mov_b32 s0, 0
2003; SI-NEXT:    s_mov_b32 s1, 4.0
2004; SI-NEXT:    s_mov_b32 s7, 0xf000
2005; SI-NEXT:    s_mov_b32 s6, -1
2006; SI-NEXT:    s_waitcnt lgkmcnt(0)
2007; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
2008; SI-NEXT:    v_mov_b32_e32 v0, s0
2009; SI-NEXT:    v_mov_b32_e32 v1, s1
2010; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2011; SI-NEXT:    s_endpgm
2012;
2013; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
2014; VI:       ; %bb.0:
2015; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2016; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
2017; VI-NEXT:    s_mov_b32 s0, 0
2018; VI-NEXT:    s_mov_b32 s1, 4.0
2019; VI-NEXT:    s_mov_b32 s7, 0xf000
2020; VI-NEXT:    s_mov_b32 s6, -1
2021; VI-NEXT:    s_waitcnt lgkmcnt(0)
2022; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
2023; VI-NEXT:    v_mov_b32_e32 v0, s0
2024; VI-NEXT:    v_mov_b32_e32 v1, s1
2025; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2026; VI-NEXT:    s_endpgm
2027;
2028; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
2029; EG:       ; %bb.0:
2030; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
2031; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2032; EG-NEXT:    CF_END
2033; EG-NEXT:    PAD
2034; EG-NEXT:    ALU clause starting at 4:
2035; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
2036; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
2037; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
2038; EG-NEXT:    541065216(1.626303e-19), 32(4.484155e-44)
2039; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
2040; EG-NEXT:     MOV T0.X, 0.0,
2041; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2042; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2043  %shl = shl i64 4647714815446351872, %a
2044  store i64 %shl, i64 addrspace(1)* %out, align 8
2045  ret void
2046}
2047
; The shifted constant 13871086852301127680 is 0xC080000000000000: the f32 -4.0
; bit pattern (0xC0800000) in the high 32 bits with a zero low half. The SI and
; VI checks expect the two halves to be set separately ("s_mov_b32 s0, 0" and
; "s_mov_b32 s1, -4.0") and s_lshl_b64 to shift that register pair.
2048define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
2049; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
2050; SI:       ; %bb.0:
2051; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2052; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
2053; SI-NEXT:    s_mov_b32 s0, 0
2054; SI-NEXT:    s_mov_b32 s1, -4.0
2055; SI-NEXT:    s_mov_b32 s7, 0xf000
2056; SI-NEXT:    s_mov_b32 s6, -1
2057; SI-NEXT:    s_waitcnt lgkmcnt(0)
2058; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
2059; SI-NEXT:    v_mov_b32_e32 v0, s0
2060; SI-NEXT:    v_mov_b32_e32 v1, s1
2061; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2062; SI-NEXT:    s_endpgm
2063;
2064; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
2065; VI:       ; %bb.0:
2066; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2067; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
2068; VI-NEXT:    s_mov_b32 s0, 0
2069; VI-NEXT:    s_mov_b32 s1, -4.0
2070; VI-NEXT:    s_mov_b32 s7, 0xf000
2071; VI-NEXT:    s_mov_b32 s6, -1
2072; VI-NEXT:    s_waitcnt lgkmcnt(0)
2073; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
2074; VI-NEXT:    v_mov_b32_e32 v0, s0
2075; VI-NEXT:    v_mov_b32_e32 v1, s1
2076; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2077; VI-NEXT:    s_endpgm
2078;
2079; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
2080; EG:       ; %bb.0:
2081; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
2082; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2083; EG-NEXT:    CF_END
2084; EG-NEXT:    PAD
2085; EG-NEXT:    ALU clause starting at 4:
2086; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
2087; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
2088; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
2089; EG-NEXT:    1614807040(5.534023e+19), 32(4.484155e-44)
2090; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
2091; EG-NEXT:     MOV T0.X, 0.0,
2092; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2093; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2094  %shl = shl i64 13871086852301127680, %a
2095  store i64 %shl, i64 addrspace(1)* %out, align 8
2096  ret void
2097}
2098
; Multiply the kernel argument %p by 2 and volatile-store the product to an
; undef address. All three check prefixes expect the multiply to be emitted as
; a left shift by 1 (s_lshl_b32 on SI/VI, LSHL on EG).
2099define amdgpu_kernel void @test_mul2(i32 %p) {
2100; SI-LABEL: test_mul2:
2101; SI:       ; %bb.0:
2102; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
2103; SI-NEXT:    s_mov_b32 s3, 0xf000
2104; SI-NEXT:    s_mov_b32 s2, -1
2105; SI-NEXT:    s_waitcnt lgkmcnt(0)
2106; SI-NEXT:    s_lshl_b32 s0, s0, 1
2107; SI-NEXT:    v_mov_b32_e32 v0, s0
2108; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2109; SI-NEXT:    s_waitcnt vmcnt(0)
2110; SI-NEXT:    s_endpgm
2111;
2112; VI-LABEL: test_mul2:
2113; VI:       ; %bb.0:
2114; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
2115; VI-NEXT:    s_mov_b32 s3, 0xf000
2116; VI-NEXT:    s_mov_b32 s2, -1
2117; VI-NEXT:    s_waitcnt lgkmcnt(0)
2118; VI-NEXT:    s_lshl_b32 s0, s0, 1
2119; VI-NEXT:    v_mov_b32_e32 v0, s0
2120; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2121; VI-NEXT:    s_waitcnt vmcnt(0)
2122; VI-NEXT:    s_endpgm
2123;
2124; EG-LABEL: test_mul2:
2125; EG:       ; %bb.0:
2126; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
2127; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2128; EG-NEXT:    CF_END
2129; EG-NEXT:    PAD
2130; EG-NEXT:    ALU clause starting at 4:
2131; EG-NEXT:     MOV T0.X, literal.x,
2132; EG-NEXT:     LSHL * T1.X, KC0[2].Y, 1,
2133; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2134   %i = mul i32 %p, 2
2135   store volatile i32 %i, i32 addrspace(1)* undef
2136   ret void
2137}
2138
; Compute (%in | 1) << 2 and store it to %out. This is a plain (non-kernel)
; function. The checks for all three prefixes expect the shift by 2 to be
; applied to %in first, followed by an OR with the pre-shifted constant 4.
2139define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
2140; SI-LABEL: shl_or_k:
2141; SI:       ; %bb.0:
2142; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2143; SI-NEXT:    s_mov_b32 s6, 0
2144; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
2145; SI-NEXT:    s_mov_b32 s7, 0xf000
2146; SI-NEXT:    s_mov_b32 s4, s6
2147; SI-NEXT:    s_mov_b32 s5, s6
2148; SI-NEXT:    v_or_b32_e32 v2, 4, v2
2149; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
2150; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2151; SI-NEXT:    s_setpc_b64 s[30:31]
2152;
2153; VI-LABEL: shl_or_k:
2154; VI:       ; %bb.0:
2155; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2156; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
2157; VI-NEXT:    v_or_b32_e32 v2, 4, v2
2158; VI-NEXT:    flat_store_dword v[0:1], v2
2159; VI-NEXT:    s_waitcnt vmcnt(0)
2160; VI-NEXT:    s_setpc_b64 s[30:31]
2161;
2162; EG-LABEL: shl_or_k:
2163; EG:       ; %bb.0:
2164; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
2165; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2166; EG-NEXT:    CF_END
2167; EG-NEXT:    PAD
2168; EG-NEXT:    ALU clause starting at 4:
2169; EG-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
2170; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2171; EG-NEXT:     OR_INT T0.X, PV.W, literal.x,
2172; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
2173; EG-NEXT:    4(5.605194e-45), 2(2.802597e-45)
2174  %tmp0 = or i32 %in, 1
2175  %tmp2 = shl i32 %tmp0, 2
2176  store i32 %tmp2, i32 addrspace(1)* %out
2177  ret void
2178}
2179
; Compute %tmp0 = %in | 1 and %tmp2 = %tmp0 << 2, storing %tmp2 to %out0 and
; %tmp0 itself to %out1, so the OR result has two uses. The checks expect the
; OR with 1 to be emitted first and its result shifted by 2, with both values
; stored.
2180define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
2181; SI-LABEL: shl_or_k_two_uses:
2182; SI:       ; %bb.0:
2183; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2184; SI-NEXT:    s_mov_b32 s6, 0
2185; SI-NEXT:    v_or_b32_e32 v4, 1, v4
2186; SI-NEXT:    s_mov_b32 s7, 0xf000
2187; SI-NEXT:    s_mov_b32 s4, s6
2188; SI-NEXT:    s_mov_b32 s5, s6
2189; SI-NEXT:    v_lshlrev_b32_e32 v5, 2, v4
2190; SI-NEXT:    buffer_store_dword v5, v[0:1], s[4:7], 0 addr64
2191; SI-NEXT:    buffer_store_dword v4, v[2:3], s[4:7], 0 addr64
2192; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2193; SI-NEXT:    s_setpc_b64 s[30:31]
2194;
2195; VI-LABEL: shl_or_k_two_uses:
2196; VI:       ; %bb.0:
2197; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2198; VI-NEXT:    v_or_b32_e32 v4, 1, v4
2199; VI-NEXT:    v_lshlrev_b32_e32 v5, 2, v4
2200; VI-NEXT:    flat_store_dword v[0:1], v5
2201; VI-NEXT:    flat_store_dword v[2:3], v4
2202; VI-NEXT:    s_waitcnt vmcnt(0)
2203; VI-NEXT:    s_setpc_b64 s[30:31]
2204;
2205; EG-LABEL: shl_or_k_two_uses:
2206; EG:       ; %bb.0:
2207; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
2208; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
2209; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2210; EG-NEXT:    CF_END
2211; EG-NEXT:    ALU clause starting at 4:
2212; EG-NEXT:     LSHR T0.X, KC0[2].Z, literal.x,
2213; EG-NEXT:     OR_INT * T1.X, KC0[2].W, 1,
2214; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2215; EG-NEXT:     LSHL T2.X, PS, literal.x,
2216; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
2217; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2218  %tmp0 = or i32 %in, 1
2219  %tmp2 = shl i32 %tmp0, 2
2220  store i32 %tmp2, i32 addrspace(1)* %out0
2221  store i32 %tmp0, i32 addrspace(1)* %out1
2222  ret void
2223}
2224
2225attributes #0 = { nounwind readnone }
2226