1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=SI
3; RUN: llc < %s -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=VI
4; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefixes=EG
5
; Intrinsic declarations. In the tests visible here only workitem.id.x is
; called; its result is used as an index into the %in / %out pointers.
6declare i32 @llvm.amdgcn.workitem.id.x() #0
7
8declare i32 @llvm.amdgcn.workgroup.id.x() #0
9
; shl of <2 x i32>. The shifted value is loaded from %in and the per-lane
; shift amounts from the next <2 x i32> element (%in + 1). The assertions
; expect a single 128-bit load covering both operands and one 32-bit shift
; per lane for each checked target.
10define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
11; SI-LABEL: shl_v2i32:
12; SI:       ; %bb.0:
13; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
14; SI-NEXT:    s_mov_b32 s7, 0xf000
15; SI-NEXT:    s_mov_b32 s6, -1
16; SI-NEXT:    s_mov_b32 s10, s6
17; SI-NEXT:    s_mov_b32 s11, s7
18; SI-NEXT:    s_waitcnt lgkmcnt(0)
19; SI-NEXT:    s_mov_b32 s8, s2
20; SI-NEXT:    s_mov_b32 s9, s3
21; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
22; SI-NEXT:    s_mov_b32 s4, s0
23; SI-NEXT:    s_mov_b32 s5, s1
24; SI-NEXT:    s_waitcnt vmcnt(0)
25; SI-NEXT:    v_lshl_b32_e32 v1, v1, v3
26; SI-NEXT:    v_lshl_b32_e32 v0, v0, v2
27; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
28; SI-NEXT:    s_endpgm
29;
30; VI-LABEL: shl_v2i32:
31; VI:       ; %bb.0:
32; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
33; VI-NEXT:    s_waitcnt lgkmcnt(0)
34; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
35; VI-NEXT:    s_mov_b32 s3, 0xf000
36; VI-NEXT:    s_mov_b32 s2, -1
37; VI-NEXT:    s_waitcnt lgkmcnt(0)
38; VI-NEXT:    s_lshl_b32 s5, s5, s7
39; VI-NEXT:    s_lshl_b32 s4, s4, s6
40; VI-NEXT:    v_mov_b32_e32 v0, s4
41; VI-NEXT:    v_mov_b32_e32 v1, s5
42; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
43; VI-NEXT:    s_endpgm
44;
45; EG-LABEL: shl_v2i32:
46; EG:       ; %bb.0:
47; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
48; EG-NEXT:    TEX 0 @6
49; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
50; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
51; EG-NEXT:    CF_END
52; EG-NEXT:    PAD
53; EG-NEXT:    Fetch clause starting at 6:
54; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
55; EG-NEXT:    ALU clause starting at 8:
56; EG-NEXT:     MOV * T0.X, KC0[2].Z,
57; EG-NEXT:    ALU clause starting at 9:
58; EG-NEXT:     LSHL * T0.Y, T0.Y, T0.W,
59; EG-NEXT:     LSHL T0.X, T0.X, T0.Z,
60; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
61; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
62  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 ; shift amounts follow the value in memory
63  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
64  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
65  %result = shl <2 x i32> %a, %b
66  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
67  ret void
68}
69
; shl of <4 x i32>. The value is loaded from %in and the per-lane shift
; amounts from the next <4 x i32> element (%in + 1). The assertions expect
; four 32-bit shifts; the 32 bytes of operand data are fetched as two 128-bit
; loads in the verde and redwood runs and as one s_load_dwordx8 in the tonga
; run.
70define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
71; SI-LABEL: shl_v4i32:
72; SI:       ; %bb.0:
73; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
74; SI-NEXT:    s_mov_b32 s7, 0xf000
75; SI-NEXT:    s_mov_b32 s6, -1
76; SI-NEXT:    s_mov_b32 s10, s6
77; SI-NEXT:    s_mov_b32 s11, s7
78; SI-NEXT:    s_waitcnt lgkmcnt(0)
79; SI-NEXT:    s_mov_b32 s8, s2
80; SI-NEXT:    s_mov_b32 s9, s3
81; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
82; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
83; SI-NEXT:    s_mov_b32 s4, s0
84; SI-NEXT:    s_mov_b32 s5, s1
85; SI-NEXT:    s_waitcnt vmcnt(0)
86; SI-NEXT:    v_lshlrev_b32_e32 v3, v7, v3
87; SI-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
88; SI-NEXT:    v_lshlrev_b32_e32 v1, v5, v1
89; SI-NEXT:    v_lshl_b32_e32 v0, v0, v4
90; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
91; SI-NEXT:    s_endpgm
92;
93; VI-LABEL: shl_v4i32:
94; VI:       ; %bb.0:
95; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
96; VI-NEXT:    s_waitcnt lgkmcnt(0)
97; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
98; VI-NEXT:    s_mov_b32 s11, 0xf000
99; VI-NEXT:    s_mov_b32 s10, -1
100; VI-NEXT:    s_waitcnt lgkmcnt(0)
101; VI-NEXT:    s_lshl_b32 s3, s3, s7
102; VI-NEXT:    s_lshl_b32 s2, s2, s6
103; VI-NEXT:    s_lshl_b32 s1, s1, s5
104; VI-NEXT:    s_lshl_b32 s0, s0, s4
105; VI-NEXT:    v_mov_b32_e32 v0, s0
106; VI-NEXT:    v_mov_b32_e32 v1, s1
107; VI-NEXT:    v_mov_b32_e32 v2, s2
108; VI-NEXT:    v_mov_b32_e32 v3, s3
109; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
110; VI-NEXT:    s_endpgm
111;
112; EG-LABEL: shl_v4i32:
113; EG:       ; %bb.0:
114; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
115; EG-NEXT:    TEX 1 @6
116; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
117; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
118; EG-NEXT:    CF_END
119; EG-NEXT:    PAD
120; EG-NEXT:    Fetch clause starting at 6:
121; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
122; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
123; EG-NEXT:    ALU clause starting at 10:
124; EG-NEXT:     MOV * T0.X, KC0[2].Z,
125; EG-NEXT:    ALU clause starting at 11:
126; EG-NEXT:     LSHL * T0.W, T0.W, T1.W,
127; EG-NEXT:     LSHL * T0.Z, T0.Z, T1.Z,
128; EG-NEXT:     LSHL * T0.Y, T0.Y, T1.Y,
129; EG-NEXT:     LSHL T0.X, T0.X, T1.X,
130; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
131; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
132  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 ; shift amounts follow the value in memory
133  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
134  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
135  %result = shl <4 x i32> %a, %b
136  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
137  ret void
138}
139
; shl of a scalar i16 where both the value (%in) and the shift amount
; (%in + 1) are loaded from global memory. In the verde and tonga assertions
; the shift is done with a 32-bit v_lshlrev_b32 and the result is written
; with buffer_store_short; the redwood assertions write it with a
; MEM_RAT MSKOR store.
140define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
141; SI-LABEL: shl_i16:
142; SI:       ; %bb.0:
143; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
144; SI-NEXT:    s_mov_b32 s7, 0xf000
145; SI-NEXT:    s_mov_b32 s6, -1
146; SI-NEXT:    s_mov_b32 s10, s6
147; SI-NEXT:    s_mov_b32 s11, s7
148; SI-NEXT:    s_waitcnt lgkmcnt(0)
149; SI-NEXT:    s_mov_b32 s8, s2
150; SI-NEXT:    s_mov_b32 s9, s3
151; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
152; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
153; SI-NEXT:    s_mov_b32 s4, s0
154; SI-NEXT:    s_mov_b32 s5, s1
155; SI-NEXT:    s_waitcnt vmcnt(0)
156; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
157; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
158; SI-NEXT:    s_endpgm
159;
160; VI-LABEL: shl_i16:
161; VI:       ; %bb.0:
162; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
163; VI-NEXT:    s_mov_b32 s7, 0xf000
164; VI-NEXT:    s_mov_b32 s6, -1
165; VI-NEXT:    s_mov_b32 s10, s6
166; VI-NEXT:    s_mov_b32 s11, s7
167; VI-NEXT:    s_waitcnt lgkmcnt(0)
168; VI-NEXT:    s_mov_b32 s8, s2
169; VI-NEXT:    s_mov_b32 s9, s3
170; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
171; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
172; VI-NEXT:    s_mov_b32 s4, s0
173; VI-NEXT:    s_mov_b32 s5, s1
174; VI-NEXT:    s_waitcnt vmcnt(0)
175; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
176; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
177; VI-NEXT:    s_endpgm
178;
179; EG-LABEL: shl_i16:
180; EG:       ; %bb.0:
181; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
182; EG-NEXT:    TEX 1 @6
183; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
184; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
185; EG-NEXT:    CF_END
186; EG-NEXT:    PAD
187; EG-NEXT:    Fetch clause starting at 6:
188; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
189; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
190; EG-NEXT:    ALU clause starting at 10:
191; EG-NEXT:     MOV * T0.X, KC0[2].Z,
192; EG-NEXT:    ALU clause starting at 11:
193; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
194; EG-NEXT:     LSHL * T1.W, T0.X, T1.X,
195; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
196; EG-NEXT:     AND_INT T1.W, PS, literal.x,
197; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
198; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
199; EG-NEXT:     LSHL T0.X, PV.W, PS,
200; EG-NEXT:     LSHL * T0.W, literal.x, PS,
201; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
202; EG-NEXT:     MOV T0.Y, 0.0,
203; EG-NEXT:     MOV * T0.Z, 0.0,
204; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
205; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
206  %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 ; shift amount is the i16 right after the value
207  %a = load i16, i16 addrspace(1)* %in
208  %b = load i16, i16 addrspace(1)* %b_ptr
209  %result = shl i16 %a, %b
210  store i16 %result, i16 addrspace(1)* %out
211  ret void
212}
213
; shl of an i16 loaded from %in by a shift amount passed as the kernel
; argument %b. The verde and tonga assertions expect the amount to be loaded
; into an SGPR (s12) and used directly as the shift operand of
; v_lshlrev_b32.
214define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
215; SI-LABEL: shl_i16_v_s:
216; SI:       ; %bb.0:
217; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
218; SI-NEXT:    s_load_dword s12, s[0:1], 0xd
219; SI-NEXT:    s_mov_b32 s3, 0xf000
220; SI-NEXT:    s_mov_b32 s2, -1
221; SI-NEXT:    s_mov_b32 s10, s2
222; SI-NEXT:    s_waitcnt lgkmcnt(0)
223; SI-NEXT:    s_mov_b32 s8, s6
224; SI-NEXT:    s_mov_b32 s9, s7
225; SI-NEXT:    s_mov_b32 s11, s3
226; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
227; SI-NEXT:    s_mov_b32 s0, s4
228; SI-NEXT:    s_mov_b32 s1, s5
229; SI-NEXT:    s_waitcnt vmcnt(0)
230; SI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
231; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
232; SI-NEXT:    s_endpgm
233;
234; VI-LABEL: shl_i16_v_s:
235; VI:       ; %bb.0:
236; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
237; VI-NEXT:    s_load_dword s12, s[0:1], 0x34
238; VI-NEXT:    s_mov_b32 s3, 0xf000
239; VI-NEXT:    s_mov_b32 s2, -1
240; VI-NEXT:    s_mov_b32 s10, s2
241; VI-NEXT:    s_waitcnt lgkmcnt(0)
242; VI-NEXT:    s_mov_b32 s8, s6
243; VI-NEXT:    s_mov_b32 s9, s7
244; VI-NEXT:    s_mov_b32 s11, s3
245; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
246; VI-NEXT:    s_mov_b32 s0, s4
247; VI-NEXT:    s_mov_b32 s1, s5
248; VI-NEXT:    s_waitcnt vmcnt(0)
249; VI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
250; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
251; VI-NEXT:    s_endpgm
252;
253; EG-LABEL: shl_i16_v_s:
254; EG:       ; %bb.0:
255; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
256; EG-NEXT:    TEX 1 @6
257; EG-NEXT:    ALU 12, @12, KC0[CB0:0-32], KC1[]
258; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
259; EG-NEXT:    CF_END
260; EG-NEXT:    PAD
261; EG-NEXT:    Fetch clause starting at 6:
262; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
263; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 44, #3
264; EG-NEXT:    ALU clause starting at 10:
265; EG-NEXT:     MOV T0.X, 0.0,
266; EG-NEXT:     MOV * T1.X, KC0[2].Z,
267; EG-NEXT:    ALU clause starting at 12:
268; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
269; EG-NEXT:     LSHL * T1.W, T1.X, T0.X,
270; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
271; EG-NEXT:     AND_INT T1.W, PS, literal.x,
272; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
273; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
274; EG-NEXT:     LSHL T0.X, PV.W, PS,
275; EG-NEXT:     LSHL * T0.W, literal.x, PS,
276; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
277; EG-NEXT:     MOV T0.Y, 0.0,
278; EG-NEXT:     MOV * T0.Z, 0.0,
279; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
280; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
281  %a = load i16, i16 addrspace(1)* %in
282  %result = shl i16 %a, %b
283  store i16 %result, i16 addrspace(1)* %out
284  ret void
285}
286
; shl of an i16 loaded from %in by (%b + 3), where %b is a kernel argument.
; The verde and tonga assertions expect the add to be emitted as a scalar
; s_add_i32 on the argument register, whose result then feeds
; v_lshlrev_b32.
287define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
288; SI-LABEL: shl_i16_v_compute_s:
289; SI:       ; %bb.0:
290; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
291; SI-NEXT:    s_load_dword s12, s[0:1], 0xd
292; SI-NEXT:    s_mov_b32 s3, 0xf000
293; SI-NEXT:    s_mov_b32 s2, -1
294; SI-NEXT:    s_mov_b32 s10, s2
295; SI-NEXT:    s_waitcnt lgkmcnt(0)
296; SI-NEXT:    s_mov_b32 s8, s6
297; SI-NEXT:    s_mov_b32 s9, s7
298; SI-NEXT:    s_mov_b32 s11, s3
299; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
300; SI-NEXT:    s_add_i32 s12, s12, 3
301; SI-NEXT:    s_mov_b32 s0, s4
302; SI-NEXT:    s_mov_b32 s1, s5
303; SI-NEXT:    s_waitcnt vmcnt(0)
304; SI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
305; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
306; SI-NEXT:    s_endpgm
307;
308; VI-LABEL: shl_i16_v_compute_s:
309; VI:       ; %bb.0:
310; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
311; VI-NEXT:    s_load_dword s12, s[0:1], 0x34
312; VI-NEXT:    s_mov_b32 s3, 0xf000
313; VI-NEXT:    s_mov_b32 s2, -1
314; VI-NEXT:    s_mov_b32 s10, s2
315; VI-NEXT:    s_waitcnt lgkmcnt(0)
316; VI-NEXT:    s_mov_b32 s8, s6
317; VI-NEXT:    s_mov_b32 s9, s7
318; VI-NEXT:    s_mov_b32 s11, s3
319; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
320; VI-NEXT:    s_add_i32 s12, s12, 3
321; VI-NEXT:    s_mov_b32 s0, s4
322; VI-NEXT:    s_mov_b32 s1, s5
323; VI-NEXT:    s_waitcnt vmcnt(0)
324; VI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
325; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
326; VI-NEXT:    s_endpgm
327;
328; EG-LABEL: shl_i16_v_compute_s:
329; EG:       ; %bb.0:
330; EG-NEXT:    ALU 0, @12, KC0[], KC1[]
331; EG-NEXT:    TEX 0 @8
332; EG-NEXT:    ALU 0, @13, KC0[CB0:0-32], KC1[]
333; EG-NEXT:    TEX 0 @10
334; EG-NEXT:    ALU 15, @14, KC0[CB0:0-32], KC1[]
335; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
336; EG-NEXT:    CF_END
337; EG-NEXT:    PAD
338; EG-NEXT:    Fetch clause starting at 8:
339; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 44, #3
340; EG-NEXT:    Fetch clause starting at 10:
341; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
342; EG-NEXT:    ALU clause starting at 12:
343; EG-NEXT:     MOV * T0.X, 0.0,
344; EG-NEXT:    ALU clause starting at 13:
345; EG-NEXT:     MOV * T1.X, KC0[2].Z,
346; EG-NEXT:    ALU clause starting at 14:
347; EG-NEXT:     ADD_INT * T0.W, T0.X, literal.x,
348; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
349; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
350; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
351; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
352; EG-NEXT:     LSHL * T0.W, T1.X, PV.W,
353; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
354; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
355; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
356; EG-NEXT:     LSHL T0.X, PV.W, PS,
357; EG-NEXT:     LSHL * T0.W, literal.x, PS,
358; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
359; EG-NEXT:     MOV T0.Y, 0.0,
360; EG-NEXT:     MOV * T0.Z, 0.0,
361; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
362; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
363  %a = load i16, i16 addrspace(1)* %in
364  %b.add = add i16 %b, 3 ; shift amount is computed from the argument rather than used directly
365  %result = shl i16 %a, %b.add
366  store i16 %result, i16 addrspace(1)* %out
367  ret void
368}
369
; shl of an i16 by a shift amount that is loaded from memory and then
; incremented by 3. Both loads are volatile. The amount is read from element
; (workitem id + 1) of %in; the shifted value is read from %in itself.
; The tonga assertions expect 16-bit VALU ops (v_add_u16, v_lshlrev_b16),
; while the verde assertions use 32-bit ones (v_add_i32, v_lshlrev_b32).
; NOTE(review): %gep.out is computed but never used; the result is stored to
; %out. The IR is left untouched because the assertions in this file are
; autogenerated from it.
370define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
371; SI-LABEL: shl_i16_computed_amount:
372; SI:       ; %bb.0:
373; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
374; SI-NEXT:    s_mov_b32 s7, 0xf000
375; SI-NEXT:    s_mov_b32 s6, -1
376; SI-NEXT:    s_mov_b32 s10, s6
377; SI-NEXT:    s_mov_b32 s11, s7
378; SI-NEXT:    s_waitcnt lgkmcnt(0)
379; SI-NEXT:    s_mov_b32 s8, s2
380; SI-NEXT:    s_mov_b32 s9, s3
381; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
382; SI-NEXT:    v_mov_b32_e32 v1, 0
383; SI-NEXT:    s_mov_b32 s14, 0
384; SI-NEXT:    s_mov_b32 s15, s7
385; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
386; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
387; SI-NEXT:    s_waitcnt vmcnt(0)
388; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 glc
389; SI-NEXT:    s_waitcnt vmcnt(0)
390; SI-NEXT:    s_mov_b32 s4, s0
391; SI-NEXT:    s_mov_b32 s5, s1
392; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
393; SI-NEXT:    v_lshlrev_b32_e32 v0, v0, v2
394; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
395; SI-NEXT:    s_endpgm
396;
397; VI-LABEL: shl_i16_computed_amount:
398; VI:       ; %bb.0:
399; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
400; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
401; VI-NEXT:    s_mov_b32 s7, 0xf000
402; VI-NEXT:    s_mov_b32 s6, -1
403; VI-NEXT:    s_mov_b32 s10, s6
404; VI-NEXT:    s_waitcnt lgkmcnt(0)
405; VI-NEXT:    v_mov_b32_e32 v1, s3
406; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
407; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
408; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
409; VI-NEXT:    s_mov_b32 s8, s2
410; VI-NEXT:    s_mov_b32 s9, s3
411; VI-NEXT:    s_mov_b32 s11, s7
412; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
413; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
414; VI-NEXT:    s_waitcnt vmcnt(0)
415; VI-NEXT:    flat_load_ushort v0, v[0:1] glc
416; VI-NEXT:    s_waitcnt vmcnt(0)
417; VI-NEXT:    s_mov_b32 s4, s0
418; VI-NEXT:    s_mov_b32 s5, s1
419; VI-NEXT:    v_add_u16_e32 v0, 3, v0
420; VI-NEXT:    v_lshlrev_b16_e32 v0, v0, v2
421; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
422; VI-NEXT:    s_endpgm
423;
424; EG-LABEL: shl_i16_computed_amount:
425; EG:       ; %bb.0:
426; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
427; EG-NEXT:    TEX 0 @8
428; EG-NEXT:    ALU 1, @13, KC0[CB0:0-32], KC1[]
429; EG-NEXT:    TEX 0 @10
430; EG-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
431; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
432; EG-NEXT:    CF_END
433; EG-NEXT:    PAD
434; EG-NEXT:    Fetch clause starting at 8:
435; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
436; EG-NEXT:    Fetch clause starting at 10:
437; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 2, #1
438; EG-NEXT:    ALU clause starting at 12:
439; EG-NEXT:     MOV * T1.X, KC0[2].Z,
440; EG-NEXT:    ALU clause starting at 13:
441; EG-NEXT:     LSHL * T0.W, T0.X, 1,
442; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
443; EG-NEXT:    ALU clause starting at 15:
444; EG-NEXT:     ADD_INT * T0.W, T0.X, literal.x,
445; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
446; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
447; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
448; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
449; EG-NEXT:     LSHL * T0.W, T1.X, PV.W,
450; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
451; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
452; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
453; EG-NEXT:     LSHL T0.X, PV.W, PS,
454; EG-NEXT:     LSHL * T0.W, literal.x, PS,
455; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
456; EG-NEXT:     MOV T0.Y, 0.0,
457; EG-NEXT:     MOV * T0.Z, 0.0,
458; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
459; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
460  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
461  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid
462  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid ; unused in this function
463  %b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i16 1
464  %a = load volatile i16, i16 addrspace(1)* %in ; value is read from %in, not from %gep
465  %b = load volatile i16, i16 addrspace(1)* %b_ptr
466  %b.add = add i16 %b, 3
467  %result = shl i16 %a, %b.add
468  store i16 %result, i16 addrspace(1)* %out
469  ret void
470}
471
; shl of a zeroext i16 kernel argument by the constant 12. No global load is
; involved. The verde and tonga assertions expect a single scalar
; s_lshl_b32 by 12 followed by a 16-bit store.
472define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
473; SI-LABEL: shl_i16_i_s:
474; SI:       ; %bb.0:
475; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
476; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
477; SI-NEXT:    s_mov_b32 s3, 0xf000
478; SI-NEXT:    s_mov_b32 s2, -1
479; SI-NEXT:    s_waitcnt lgkmcnt(0)
480; SI-NEXT:    s_lshl_b32 s4, s4, 12
481; SI-NEXT:    v_mov_b32_e32 v0, s4
482; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
483; SI-NEXT:    s_endpgm
484;
485; VI-LABEL: shl_i16_i_s:
486; VI:       ; %bb.0:
487; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
488; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
489; VI-NEXT:    s_mov_b32 s3, 0xf000
490; VI-NEXT:    s_mov_b32 s2, -1
491; VI-NEXT:    s_waitcnt lgkmcnt(0)
492; VI-NEXT:    s_lshl_b32 s4, s4, 12
493; VI-NEXT:    v_mov_b32_e32 v0, s4
494; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
495; VI-NEXT:    s_endpgm
496;
497; EG-LABEL: shl_i16_i_s:
498; EG:       ; %bb.0:
499; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
500; EG-NEXT:    TEX 0 @6
501; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
502; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
503; EG-NEXT:    CF_END
504; EG-NEXT:    PAD
505; EG-NEXT:    Fetch clause starting at 6:
506; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
507; EG-NEXT:    ALU clause starting at 8:
508; EG-NEXT:     MOV * T0.X, 0.0,
509; EG-NEXT:    ALU clause starting at 9:
510; EG-NEXT:     BFE_INT T0.W, T0.X, 0.0, literal.x,
511; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
512; EG-NEXT:    16(2.242078e-44), 3(4.203895e-45)
513; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
514; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
515; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
516; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
517; EG-NEXT:    61440(8.609578e-41), 3(4.203895e-45)
518; EG-NEXT:     LSHL T0.X, PV.W, PS,
519; EG-NEXT:     LSHL * T0.W, literal.x, PS,
520; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
521; EG-NEXT:     MOV T0.Y, 0.0,
522; EG-NEXT:     MOV * T0.Z, 0.0,
523; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
524; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
525  %result = shl i16 %a, 12
526  store i16 %result, i16 addrspace(1)* %out
527  ret void
528}
529
; shl of <2 x i16>. The value is loaded from %in and the shift amounts from
; element (workitem id + 1) of %in. The verde assertions split each dword
; into 16-bit halves, shift them with 32-bit ops and repack the result with
; and / shift-by-16 / or. The tonga assertions use v_lshlrev_b16, with the
; SDWA form handling the high halves.
; NOTE(review): %gep.out is computed but never used; the result is stored to
; %out. The IR is left untouched because the assertions in this file are
; autogenerated from it.
530define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
531; SI-LABEL: shl_v2i16:
532; SI:       ; %bb.0:
533; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
534; SI-NEXT:    s_mov_b32 s7, 0xf000
535; SI-NEXT:    s_mov_b32 s6, -1
536; SI-NEXT:    s_mov_b32 s10, s6
537; SI-NEXT:    s_mov_b32 s11, s7
538; SI-NEXT:    s_waitcnt lgkmcnt(0)
539; SI-NEXT:    s_mov_b32 s8, s2
540; SI-NEXT:    s_mov_b32 s9, s3
541; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
542; SI-NEXT:    v_mov_b32_e32 v1, 0
543; SI-NEXT:    s_mov_b32 s14, 0
544; SI-NEXT:    s_mov_b32 s15, s7
545; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
546; SI-NEXT:    buffer_load_dword v2, off, s[8:11], 0
547; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
548; SI-NEXT:    s_mov_b32 s4, s0
549; SI-NEXT:    s_mov_b32 s5, s1
550; SI-NEXT:    s_waitcnt vmcnt(1)
551; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
552; SI-NEXT:    s_waitcnt vmcnt(0)
553; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
554; SI-NEXT:    v_lshlrev_b32_e32 v0, v0, v2
555; SI-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
556; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
557; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
558; SI-NEXT:    v_or_b32_e32 v0, v0, v1
559; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
560; SI-NEXT:    s_endpgm
561;
562; VI-LABEL: shl_v2i16:
563; VI:       ; %bb.0:
564; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
565; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
566; VI-NEXT:    s_waitcnt lgkmcnt(0)
567; VI-NEXT:    v_mov_b32_e32 v1, s3
568; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
569; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
570; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
571; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
572; VI-NEXT:    flat_load_dword v0, v[0:1]
573; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
574; VI-NEXT:    s_mov_b32 s3, 0xf000
575; VI-NEXT:    s_mov_b32 s2, -1
576; VI-NEXT:    s_waitcnt lgkmcnt(0)
577; VI-NEXT:    s_lshr_b32 s5, s4, 16
578; VI-NEXT:    v_mov_b32_e32 v1, s5
579; VI-NEXT:    s_waitcnt vmcnt(0)
580; VI-NEXT:    v_lshlrev_b16_e64 v2, v0, s4
581; VI-NEXT:    v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
582; VI-NEXT:    v_or_b32_e32 v0, v2, v0
583; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
584; VI-NEXT:    s_endpgm
585;
586; EG-LABEL: shl_v2i16:
587; EG:       ; %bb.0:
588; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
589; EG-NEXT:    TEX 0 @8
590; EG-NEXT:    ALU 0, @15, KC0[CB0:0-32], KC1[]
591; EG-NEXT:    TEX 0 @10
592; EG-NEXT:    ALU 11, @16, KC0[CB0:0-32], KC1[]
593; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
594; EG-NEXT:    CF_END
595; EG-NEXT:    PAD
596; EG-NEXT:    Fetch clause starting at 8:
597; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
598; EG-NEXT:    Fetch clause starting at 10:
599; EG-NEXT:     VTX_READ_32 T7.X, T7.X, 0, #1
600; EG-NEXT:    ALU clause starting at 12:
601; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
602; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
603; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
604; EG-NEXT:    ALU clause starting at 15:
605; EG-NEXT:     MOV * T7.X, KC0[2].Z,
606; EG-NEXT:    ALU clause starting at 16:
607; EG-NEXT:     AND_INT T0.Z, T0.X, literal.x,
608; EG-NEXT:     LSHR T0.W, T0.X, literal.y,
609; EG-NEXT:     LSHR * T1.W, T7.X, literal.y,
610; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
611; EG-NEXT:     LSHL T0.W, PS, PV.W,
612; EG-NEXT:     LSHL * T1.W, T7.X, PV.Z,
613; EG-NEXT:     AND_INT T1.W, PS, literal.x,
614; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
615; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
616; EG-NEXT:     OR_INT T0.X, PV.W, PS,
617; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
618; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
619  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
620  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
621  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid ; unused in this function
622  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
623  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in ; value is read from %in, not from %gep
624  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
625  %result = shl <2 x i16> %a, %b
626  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
627  ret void
628}
629
; shl of <4 x i16>. Value and shift amounts are adjacent <4 x i16> elements
; of %in indexed by the workitem id (%gep and %gep + 1), and the result is
; stored to the same index of %out. All three sets of assertions expect one
; 128-bit load covering both operands. The verde assertions shift 16-bit
; halves with 32-bit ops and repack them; the tonga assertions use
; v_lshlrev_b16 in plain and SDWA forms.
630define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
631; SI-LABEL: shl_v4i16:
632; SI:       ; %bb.0:
633; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
634; SI-NEXT:    s_mov_b32 s7, 0xf000
635; SI-NEXT:    s_mov_b32 s6, 0
636; SI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
637; SI-NEXT:    v_mov_b32_e32 v5, 0
638; SI-NEXT:    s_waitcnt lgkmcnt(0)
639; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
640; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
641; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
642; SI-NEXT:    s_waitcnt vmcnt(0)
643; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
644; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
645; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
646; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
647; SI-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
648; SI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
649; SI-NEXT:    v_lshlrev_b32_e32 v2, v9, v7
650; SI-NEXT:    v_lshlrev_b32_e32 v3, v8, v6
651; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
652; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
653; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
654; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
655; SI-NEXT:    v_or_b32_e32 v1, v1, v2
656; SI-NEXT:    v_or_b32_e32 v0, v0, v3
657; SI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
658; SI-NEXT:    s_endpgm
659;
660; VI-LABEL: shl_v4i16:
661; VI:       ; %bb.0:
662; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
663; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
664; VI-NEXT:    s_waitcnt lgkmcnt(0)
665; VI-NEXT:    v_mov_b32_e32 v1, s3
666; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
667; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
668; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
669; VI-NEXT:    v_mov_b32_e32 v5, s1
670; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
671; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
672; VI-NEXT:    s_waitcnt vmcnt(0)
673; VI-NEXT:    v_lshlrev_b16_e32 v6, v3, v1
674; VI-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
675; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v0
676; VI-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
677; VI-NEXT:    v_or_b32_e32 v1, v6, v1
678; VI-NEXT:    v_or_b32_e32 v0, v3, v0
679; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
680; VI-NEXT:    s_endpgm
681;
682; EG-LABEL: shl_v4i16:
683; EG:       ; %bb.0:
684; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
685; EG-NEXT:    TEX 0 @6
686; EG-NEXT:    ALU 51, @11, KC0[CB0:0-32], KC1[]
687; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1
688; EG-NEXT:    CF_END
689; EG-NEXT:    PAD
690; EG-NEXT:    Fetch clause starting at 6:
691; EG-NEXT:     VTX_READ_128 T10.XYZW, T0.X, 0, #1
692; EG-NEXT:    ALU clause starting at 8:
693; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
694; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
695; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
696; EG-NEXT:    ALU clause starting at 11:
697; EG-NEXT:     MOV T4.X, T10.X,
698; EG-NEXT:     MOV * T5.X, T10.Y,
699; EG-NEXT:     MOV T0.X, PV.X,
700; EG-NEXT:     MOV T0.Y, PS,
701; EG-NEXT:     MOV * T2.X, T10.Z,
702; EG-NEXT:     MOV T3.X, T10.W,
703; EG-NEXT:     MOV * T0.Z, T6.X,
704; EG-NEXT:     MOV * T1.Y, T2.X,
705; EG-NEXT:     AND_INT * T1.W, PV.Y, literal.x,
706; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
707; EG-NEXT:     LSHL * T1.W, T0.X, PV.W,
708; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
709; EG-NEXT:     AND_INT * T2.W, T0.Z, literal.y,
710; EG-NEXT:    65535(9.183409e-41), -65536(nan)
711; EG-NEXT:     OR_INT * T1.W, PS, PV.W,
712; EG-NEXT:     MOV * T0.Z, T3.X,
713; EG-NEXT:     MOV * T6.X, T1.W,
714; EG-NEXT:     MOV T1.Z, PV.X,
715; EG-NEXT:     LSHR T1.W, T1.Y, literal.x,
716; EG-NEXT:     LSHR * T2.W, T0.X, literal.x,
717; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
718; EG-NEXT:     LSHL T1.W, PS, PV.W,
719; EG-NEXT:     AND_INT * T2.W, PV.Z, literal.x,
720; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
721; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
722; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
723; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
724; EG-NEXT:     MOV T6.X, PV.W,
725; EG-NEXT:     MOV * T0.X, T7.X,
726; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
727; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
728; EG-NEXT:     LSHL T1.W, T0.Y, PV.W,
729; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
730; EG-NEXT:    -65536(nan), 0(0.000000e+00)
731; EG-NEXT:     AND_INT * T1.W, PV.W, literal.x,
732; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
733; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
734; EG-NEXT:     MOV * T7.X, PV.W,
735; EG-NEXT:     MOV T0.X, PV.X,
736; EG-NEXT:     LSHR T1.W, T0.Z, literal.x,
737; EG-NEXT:     LSHR * T2.W, T0.Y, literal.x,
738; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
739; EG-NEXT:     LSHL * T1.W, PS, PV.W,
740; EG-NEXT:     AND_INT T0.Z, T0.X, literal.x,
741; EG-NEXT:     LSHL T1.W, PV.W, literal.y,
742; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
743; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
744; EG-NEXT:     LSHR T0.X, PS, literal.x,
745; EG-NEXT:     OR_INT * T10.Y, PV.Z, PV.W,
746; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
747; EG-NEXT:     MOV T7.X, PV.Y,
748; EG-NEXT:     MOV * T10.X, T6.X,
749  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
750  %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
751  %gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid
752  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1 ; shift amounts follow the value in memory
753  %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep
754  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
755  %result = shl <4 x i16> %a, %b
756  store <4 x i16> %result, <4 x i16> addrspace(1)* %gep.out
757  ret void
758}
759
; shl of a scalar i64. The value is loaded from %in and the shift amount from
; the following i64 (%in + 1). The verde assertions expect v_lshl_b64 and the
; tonga assertions s_lshl_b64, each taking only the first dword of the loaded
; amount (v2 / s6) as the shift operand. The redwood assertions expand the
; shift into 32-bit operations whose results are selected with CNDE_INT.
760define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
761; SI-LABEL: shl_i64:
762; SI:       ; %bb.0:
763; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
764; SI-NEXT:    s_mov_b32 s7, 0xf000
765; SI-NEXT:    s_mov_b32 s6, -1
766; SI-NEXT:    s_mov_b32 s10, s6
767; SI-NEXT:    s_mov_b32 s11, s7
768; SI-NEXT:    s_waitcnt lgkmcnt(0)
769; SI-NEXT:    s_mov_b32 s8, s2
770; SI-NEXT:    s_mov_b32 s9, s3
771; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
772; SI-NEXT:    s_mov_b32 s4, s0
773; SI-NEXT:    s_mov_b32 s5, s1
774; SI-NEXT:    s_waitcnt vmcnt(0)
775; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
776; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
777; SI-NEXT:    s_endpgm
778;
779; VI-LABEL: shl_i64:
780; VI:       ; %bb.0:
781; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
782; VI-NEXT:    s_waitcnt lgkmcnt(0)
783; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
784; VI-NEXT:    s_mov_b32 s3, 0xf000
785; VI-NEXT:    s_mov_b32 s2, -1
786; VI-NEXT:    s_waitcnt lgkmcnt(0)
787; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
788; VI-NEXT:    v_mov_b32_e32 v0, s4
789; VI-NEXT:    v_mov_b32_e32 v1, s5
790; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
791; VI-NEXT:    s_endpgm
792;
793; EG-LABEL: shl_i64:
794; EG:       ; %bb.0:
795; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
796; EG-NEXT:    TEX 0 @6
797; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
798; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
799; EG-NEXT:    CF_END
800; EG-NEXT:    PAD
801; EG-NEXT:    Fetch clause starting at 6:
802; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
803; EG-NEXT:    ALU clause starting at 8:
804; EG-NEXT:     MOV * T0.X, KC0[2].Z,
805; EG-NEXT:    ALU clause starting at 9:
806; EG-NEXT:     AND_INT T1.Y, T0.Z, literal.x,
807; EG-NEXT:     LSHR T1.Z, T0.Y, 1,
808; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
809; EG-NEXT:     NOT_INT * T1.W, T0.Z,
810; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
811; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.Z, PV.W, PS,
812; EG-NEXT:     LSHL T0.W, T0.X, PV.Y,
813; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
814; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
815; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
816; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
817; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
818; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
819  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 ; shift amount is the i64 right after the value
820  %a = load i64, i64 addrspace(1)* %in
821  %b = load i64, i64 addrspace(1)* %b_ptr
822  %result = shl i64 %a, %b
823  store i64 %result, i64 addrspace(1)* %out
824  ret void
825}
826
; <2 x i64> shift-left with per-element variable amounts: %a is the vector at
; %in, %b the vector at index 1 after it (byte offset 16 in the checks).
; The verde (SI prefix) checks expect two v_lshl_b64 and the tonga (VI prefix)
; checks two s_lshl_b64, one per element, each using the low dword of the
; matching amount. The redwood (EG prefix) checks expect the 32-bit
; LSHL / BIT_ALIGN_INT / CNDE_INT expansion repeated for both elements.
827define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
828; SI-LABEL: shl_v2i64:
829; SI:       ; %bb.0:
830; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
831; SI-NEXT:    s_mov_b32 s7, 0xf000
832; SI-NEXT:    s_mov_b32 s6, -1
833; SI-NEXT:    s_mov_b32 s10, s6
834; SI-NEXT:    s_mov_b32 s11, s7
835; SI-NEXT:    s_waitcnt lgkmcnt(0)
836; SI-NEXT:    s_mov_b32 s8, s2
837; SI-NEXT:    s_mov_b32 s9, s3
838; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
839; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
840; SI-NEXT:    s_mov_b32 s4, s0
841; SI-NEXT:    s_mov_b32 s5, s1
842; SI-NEXT:    s_waitcnt vmcnt(0)
843; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
844; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
845; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
846; SI-NEXT:    s_endpgm
847;
848; VI-LABEL: shl_v2i64:
849; VI:       ; %bb.0:
850; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
851; VI-NEXT:    s_waitcnt lgkmcnt(0)
852; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
853; VI-NEXT:    s_mov_b32 s11, 0xf000
854; VI-NEXT:    s_mov_b32 s10, -1
855; VI-NEXT:    s_waitcnt lgkmcnt(0)
856; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
857; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s4
858; VI-NEXT:    v_mov_b32_e32 v0, s0
859; VI-NEXT:    v_mov_b32_e32 v1, s1
860; VI-NEXT:    v_mov_b32_e32 v2, s2
861; VI-NEXT:    v_mov_b32_e32 v3, s3
862; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
863; VI-NEXT:    s_endpgm
864;
865; EG-LABEL: shl_v2i64:
866; EG:       ; %bb.0:
867; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
868; EG-NEXT:    TEX 1 @6
869; EG-NEXT:    ALU 22, @11, KC0[CB0:0-32], KC1[]
870; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
871; EG-NEXT:    CF_END
872; EG-NEXT:    PAD
873; EG-NEXT:    Fetch clause starting at 6:
874; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
875; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
876; EG-NEXT:    ALU clause starting at 10:
877; EG-NEXT:     MOV * T0.X, KC0[2].Z,
878; EG-NEXT:    ALU clause starting at 11:
879; EG-NEXT:     AND_INT T1.Y, T1.Z, literal.x,
880; EG-NEXT:     LSHR T2.Z, T0.W, 1,
881; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1,
882; EG-NEXT:     NOT_INT * T1.W, T1.Z,
883; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
884; EG-NEXT:     BIT_ALIGN_INT T0.W, PV.Z, PV.W, PS,
885; EG-NEXT:     LSHL * T1.W, T0.Z, PV.Y,
886; EG-NEXT:     AND_INT T2.X, T1.Z, literal.x,
887; EG-NEXT:     AND_INT T1.Y, T1.X, literal.y,
888; EG-NEXT:     LSHR T0.Z, T0.Y, 1,
889; EG-NEXT:     BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1,
890; EG-NEXT:     NOT_INT * T3.W, T1.X,
891; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
892; EG-NEXT:     BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS,
893; EG-NEXT:     LSHL T0.Z, T0.X, PV.Y,
894; EG-NEXT:     AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212
895; EG-NEXT:     CNDE_INT * T3.W, PV.X, T0.W, T1.W,
896; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
897; EG-NEXT:     CNDE_INT T3.Y, PV.W, PV.Y, PV.Z,
898; EG-NEXT:     CNDE_INT * T3.Z, T2.X, T1.W, 0.0,
899; EG-NEXT:     CNDE_INT T3.X, T2.W, T0.Z, 0.0,
900; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
901; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
902  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
903  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
904  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
905  %result = shl <2 x i64> %a, %b
906  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
907  ret void
908}
909
; <4 x i64> shift-left with per-element variable amounts: %a is the vector at
; %in, %b the vector at index 1 after it (byte offsets 32 and 48 in the
; checks). The result is written as two 16-byte stores (offsets 0 and 16).
; The verde (SI prefix) checks expect four v_lshl_b64 and the tonga
; (VI prefix) checks four s_lshl_b64 fed by a single s_load_dwordx16.
; The redwood (EG prefix) checks expect the 32-bit LSHL / BIT_ALIGN_INT /
; CNDE_INT expansion for each of the four elements.
; In the SI checks the second and third loads overlap on v7 (v[4:7] and
; v[7:10]); only v4 and v6 of the former are read by the shifts.
910define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
911; SI-LABEL: shl_v4i64:
912; SI:       ; %bb.0:
913; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
914; SI-NEXT:    s_mov_b32 s3, 0xf000
915; SI-NEXT:    s_mov_b32 s2, -1
916; SI-NEXT:    s_mov_b32 s10, s2
917; SI-NEXT:    s_mov_b32 s11, s3
918; SI-NEXT:    s_waitcnt lgkmcnt(0)
919; SI-NEXT:    s_mov_b32 s8, s6
920; SI-NEXT:    s_mov_b32 s9, s7
921; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
922; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
923; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
924; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
925; SI-NEXT:    s_mov_b32 s0, s4
926; SI-NEXT:    s_mov_b32 s1, s5
927; SI-NEXT:    s_waitcnt vmcnt(2)
928; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
929; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
930; SI-NEXT:    s_waitcnt vmcnt(0)
931; SI-NEXT:    v_lshl_b64 v[9:10], v[9:10], v13
932; SI-NEXT:    v_lshl_b64 v[7:8], v[7:8], v11
933; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
934; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
935; SI-NEXT:    s_endpgm
936;
937; VI-LABEL: shl_v4i64:
938; VI:       ; %bb.0:
939; VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
940; VI-NEXT:    s_waitcnt lgkmcnt(0)
941; VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
942; VI-NEXT:    s_mov_b32 s19, 0xf000
943; VI-NEXT:    s_mov_b32 s18, -1
944; VI-NEXT:    s_waitcnt lgkmcnt(0)
945; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s14
946; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s12
947; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s10
948; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
949; VI-NEXT:    v_mov_b32_e32 v0, s4
950; VI-NEXT:    v_mov_b32_e32 v1, s5
951; VI-NEXT:    v_mov_b32_e32 v2, s6
952; VI-NEXT:    v_mov_b32_e32 v3, s7
953; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
954; VI-NEXT:    s_nop 0
955; VI-NEXT:    v_mov_b32_e32 v0, s0
956; VI-NEXT:    v_mov_b32_e32 v1, s1
957; VI-NEXT:    v_mov_b32_e32 v2, s2
958; VI-NEXT:    v_mov_b32_e32 v3, s3
959; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
960; VI-NEXT:    s_endpgm
961;
962; EG-LABEL: shl_v4i64:
963; EG:       ; %bb.0:
964; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
965; EG-NEXT:    TEX 3 @6
966; EG-NEXT:    ALU 47, @15, KC0[CB0:0-32], KC1[]
967; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
968; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
969; EG-NEXT:    CF_END
970; EG-NEXT:    Fetch clause starting at 6:
971; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
972; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 0, #1
973; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 32, #1
974; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
975; EG-NEXT:    ALU clause starting at 14:
976; EG-NEXT:     MOV * T0.X, KC0[2].Z,
977; EG-NEXT:    ALU clause starting at 15:
978; EG-NEXT:     AND_INT T4.Z, T1.Z, literal.x,
979; EG-NEXT:     LSHR T1.W, T0.W, 1,
980; EG-NEXT:     NOT_INT * T3.W, T1.Z,
981; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
982; EG-NEXT:     BIT_ALIGN_INT T4.X, T0.W, T0.Z, 1,
983; EG-NEXT:     AND_INT T1.Y, T3.Z, literal.x, BS:VEC_201
984; EG-NEXT:     LSHR T5.Z, T2.W, 1, BS:VEC_120/SCL_212
985; EG-NEXT:     BIT_ALIGN_INT T0.W, T2.W, T2.Z, 1, BS:VEC_102/SCL_221
986; EG-NEXT:     NOT_INT * T2.W, T3.Z,
987; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
988; EG-NEXT:     BIT_ALIGN_INT T3.Y, PV.Z, PV.W, PS,
989; EG-NEXT:     LSHL T2.Z, T2.Z, PV.Y,
990; EG-NEXT:     BIT_ALIGN_INT T0.W, T1.W, PV.X, T3.W,
991; EG-NEXT:     LSHL * T1.W, T0.Z, T4.Z,
992; EG-NEXT:     AND_INT T4.X, T1.Z, literal.x,
993; EG-NEXT:     AND_INT T1.Y, T1.X, literal.y,
994; EG-NEXT:     LSHR T0.Z, T0.Y, 1,
995; EG-NEXT:     BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1,
996; EG-NEXT:     NOT_INT * T3.W, T1.X,
997; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
998; EG-NEXT:     AND_INT T5.X, T3.Z, literal.x,
999; EG-NEXT:     BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS,
1000; EG-NEXT:     LSHL T0.Z, T0.X, PV.Y,
1001; EG-NEXT:     AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212
1002; EG-NEXT:     CNDE_INT * T4.W, PV.X, T0.W, T1.W,
1003; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1004; EG-NEXT:     AND_INT T0.X, T3.X, literal.x,
1005; EG-NEXT:     CNDE_INT T4.Y, PV.W, PV.Y, PV.Z,
1006; EG-NEXT:     LSHR T1.Z, T2.Y, 1,
1007; EG-NEXT:     BIT_ALIGN_INT T0.W, T2.Y, T2.X, 1,
1008; EG-NEXT:     NOT_INT * T3.W, T3.X,
1009; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1010; EG-NEXT:     BIT_ALIGN_INT T1.X, PV.Z, PV.W, PS,
1011; EG-NEXT:     LSHL T0.Y, T2.X, PV.X,
1012; EG-NEXT:     CNDE_INT T4.Z, T4.X, T1.W, 0.0, BS:VEC_120/SCL_212
1013; EG-NEXT:     AND_INT * T0.W, T3.X, literal.x, BS:VEC_201
1014; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1015; EG-NEXT:     CNDE_INT * T1.W, T5.X, T3.Y, T2.Z,
1016; EG-NEXT:     CNDE_INT T4.X, T2.W, T0.Z, 0.0,
1017; EG-NEXT:     CNDE_INT T1.Y, T0.W, T1.X, T0.Y, BS:VEC_120/SCL_212
1018; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
1019; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1020; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
1021; EG-NEXT:     CNDE_INT T1.Z, T5.X, T2.Z, 0.0,
1022; EG-NEXT:     CNDE_INT * T1.X, T0.W, T0.Y, 0.0,
1023; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1024; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
1025; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1026  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
1027  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
1028  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
1029  %result = shl <4 x i64> %a, %b
1030  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
1031  ret void
1032}
1033
1034; Make sure load width gets reduced to i32 load.
; %a is an i64 kernel argument (placed after an [8 x i32] padding argument)
; shifted left by the constant 32. In all three sets of checks no shift
; instruction is expected: only one dword of %a is read, it becomes the high
; half of the stored value, and the low half is a materialized zero.
1035define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
1036; SI-LABEL: s_shl_32_i64:
1037; SI:       ; %bb.0:
1038; SI-NEXT:    s_load_dword s4, s[0:1], 0x13
1039; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1040; SI-NEXT:    s_mov_b32 s3, 0xf000
1041; SI-NEXT:    s_mov_b32 s2, -1
1042; SI-NEXT:    v_mov_b32_e32 v0, 0
1043; SI-NEXT:    s_waitcnt lgkmcnt(0)
1044; SI-NEXT:    v_mov_b32_e32 v1, s4
1045; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1046; SI-NEXT:    s_endpgm
1047;
1048; VI-LABEL: s_shl_32_i64:
1049; VI:       ; %bb.0:
1050; VI-NEXT:    s_load_dword s4, s[0:1], 0x4c
1051; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1052; VI-NEXT:    s_mov_b32 s3, 0xf000
1053; VI-NEXT:    s_mov_b32 s2, -1
1054; VI-NEXT:    v_mov_b32_e32 v0, 0
1055; VI-NEXT:    s_waitcnt lgkmcnt(0)
1056; VI-NEXT:    v_mov_b32_e32 v1, s4
1057; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1058; VI-NEXT:    s_endpgm
1059;
1060; EG-LABEL: s_shl_32_i64:
1061; EG:       ; %bb.0:
1062; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
1063; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1064; EG-NEXT:    CF_END
1065; EG-NEXT:    PAD
1066; EG-NEXT:    ALU clause starting at 4:
1067; EG-NEXT:     MOV * T0.Y, KC0[4].W,
1068; EG-NEXT:     MOV T0.X, 0.0,
1069; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1070; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1071  %result = shl i64 %a, 32
1072  store i64 %result, i64 addrspace(1)* %out
1073  ret void
1074}
1075
; Memory-operand variant of the shift-by-32 case: the i64 is loaded from
; %in[%tid] and the result stored to %out[%tid], where %tid comes from
; @llvm.amdgcn.workgroup.id.x (not the workitem id).
; All three sets of checks expect no shift instruction and only a 32-bit load
; of the source element; the loaded dword becomes the high half of the stored
; value and the low half is zero.
1076define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
1077; SI-LABEL: v_shl_32_i64:
1078; SI:       ; %bb.0:
1079; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1080; SI-NEXT:    s_ashr_i32 s3, s2, 31
1081; SI-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
1082; SI-NEXT:    v_mov_b32_e32 v0, s0
1083; SI-NEXT:    s_mov_b32 s11, 0xf000
1084; SI-NEXT:    s_mov_b32 s10, 0
1085; SI-NEXT:    s_waitcnt lgkmcnt(0)
1086; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
1087; SI-NEXT:    v_mov_b32_e32 v1, s1
1088; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
1089; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
1090; SI-NEXT:    v_mov_b32_e32 v2, 0
1091; SI-NEXT:    s_waitcnt vmcnt(0)
1092; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
1093; SI-NEXT:    s_endpgm
1094;
1095; VI-LABEL: v_shl_32_i64:
1096; VI:       ; %bb.0:
1097; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1098; VI-NEXT:    s_ashr_i32 s3, s2, 31
1099; VI-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
1100; VI-NEXT:    v_mov_b32_e32 v0, 0
1101; VI-NEXT:    s_waitcnt lgkmcnt(0)
1102; VI-NEXT:    s_add_u32 s2, s6, s0
1103; VI-NEXT:    s_addc_u32 s3, s7, s1
1104; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1105; VI-NEXT:    s_add_u32 s0, s4, s0
1106; VI-NEXT:    s_addc_u32 s1, s5, s1
1107; VI-NEXT:    v_mov_b32_e32 v3, s1
1108; VI-NEXT:    v_mov_b32_e32 v2, s0
1109; VI-NEXT:    s_waitcnt lgkmcnt(0)
1110; VI-NEXT:    v_mov_b32_e32 v1, s2
1111; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1112; VI-NEXT:    s_endpgm
1113;
1114; EG-LABEL: v_shl_32_i64:
1115; EG:       ; %bb.0:
1116; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1117; EG-NEXT:    TEX 0 @6
1118; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
1119; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
1120; EG-NEXT:    CF_END
1121; EG-NEXT:    PAD
1122; EG-NEXT:    Fetch clause starting at 6:
1123; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1124; EG-NEXT:    ALU clause starting at 8:
1125; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1126; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1127; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1128; EG-NEXT:    ALU clause starting at 11:
1129; EG-NEXT:     MOV T1.X, 0.0,
1130; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
1131; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
1132; EG-NEXT:     MOV * T1.Y, T0.X,
1133; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1134  %tid = call i32 @llvm.amdgcn.workgroup.id.x() #0
1135  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
1136  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
1137  %a = load i64, i64 addrspace(1)* %gep.in
1138  %result = shl i64 %a, 32
1139  store i64 %result, i64 addrspace(1)* %gep.out
1140  ret void
1141}
1142
; Shifts the 64-bit constant 281474976710655 (0xFFFFFFFFFFFF, i.e. 2^48 - 1)
; left by the kernel argument %a. The verde (SI prefix) and tonga (VI prefix)
; checks expect the constant to be built in an SGPR pair (low dword -1, high
; dword 0xffff) and fed to s_lshl_b64. The redwood (EG prefix) checks expect
; the 32-bit LSHL / BIT_ALIGN_INT / CNDE_INT expansion with the constant
; halves supplied as literals (the BIT_ALIGN_INT high operand is pre-shifted
; right by one, 0xffff >> 1 = 32767).
1143define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
1144; SI-LABEL: s_shl_constant_i64:
1145; SI:       ; %bb.0:
1146; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1147; SI-NEXT:    s_mov_b32 s6, -1
1148; SI-NEXT:    s_mov_b32 s9, 0xffff
1149; SI-NEXT:    s_mov_b32 s8, s6
1150; SI-NEXT:    s_mov_b32 s7, 0xf000
1151; SI-NEXT:    s_waitcnt lgkmcnt(0)
1152; SI-NEXT:    s_mov_b32 s4, s0
1153; SI-NEXT:    s_mov_b32 s5, s1
1154; SI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
1155; SI-NEXT:    v_mov_b32_e32 v0, s0
1156; SI-NEXT:    v_mov_b32_e32 v1, s1
1157; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1158; SI-NEXT:    s_endpgm
1159;
1160; VI-LABEL: s_shl_constant_i64:
1161; VI:       ; %bb.0:
1162; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1163; VI-NEXT:    s_mov_b32 s6, -1
1164; VI-NEXT:    s_mov_b32 s9, 0xffff
1165; VI-NEXT:    s_mov_b32 s8, s6
1166; VI-NEXT:    s_mov_b32 s7, 0xf000
1167; VI-NEXT:    s_waitcnt lgkmcnt(0)
1168; VI-NEXT:    s_mov_b32 s4, s0
1169; VI-NEXT:    s_mov_b32 s5, s1
1170; VI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
1171; VI-NEXT:    v_mov_b32_e32 v0, s0
1172; VI-NEXT:    v_mov_b32_e32 v1, s1
1173; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1174; VI-NEXT:    s_endpgm
1175;
1176; EG-LABEL: s_shl_constant_i64:
1177; EG:       ; %bb.0:
1178; EG-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
1179; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1180; EG-NEXT:    CF_END
1181; EG-NEXT:    PAD
1182; EG-NEXT:    ALU clause starting at 4:
1183; EG-NEXT:     AND_INT T0.Z, KC0[2].W, literal.x,
1184; EG-NEXT:     MOV T0.W, literal.y,
1185; EG-NEXT:     NOT_INT * T1.W, KC0[2].W,
1186; EG-NEXT:    31(4.344025e-44), -1(nan)
1187; EG-NEXT:     BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
1188; EG-NEXT:     LSHL T0.W, literal.y, PV.Z,
1189; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.z,
1190; EG-NEXT:    32767(4.591635e-41), -1(nan)
1191; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1192; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1193; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1194; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1195; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1196  %shl = shl i64 281474976710655, %a
1197  store i64 %shl, i64 addrspace(1)* %out, align 8
1198  ret void
1199}
1200
; Shifts the 64-bit constant 1231231234567 (0x11E_AB19B207) left by an i64
; amount loaded from %aptr. The checks expect only one dword of the amount to
; be loaded. The constant needs two moves (s_mov_b32 0xab19b207 and
; s_movk_i32 0x11e); the verde (SI prefix) checks then expect v_lshl_b64 with
; the SGPR pair as source, and the tonga (VI prefix) checks s_lshl_b64.
; The redwood (EG prefix) checks expect the 32-bit expansion with literal
; operands derived from the constant.
1201define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1202; SI-LABEL: v_shl_constant_i64:
1203; SI:       ; %bb.0:
1204; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1205; SI-NEXT:    s_mov_b32 s7, 0xf000
1206; SI-NEXT:    s_mov_b32 s6, -1
1207; SI-NEXT:    s_mov_b32 s10, s6
1208; SI-NEXT:    s_mov_b32 s11, s7
1209; SI-NEXT:    s_waitcnt lgkmcnt(0)
1210; SI-NEXT:    s_mov_b32 s8, s2
1211; SI-NEXT:    s_mov_b32 s9, s3
1212; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1213; SI-NEXT:    s_mov_b32 s2, 0xab19b207
1214; SI-NEXT:    s_movk_i32 s3, 0x11e
1215; SI-NEXT:    s_mov_b32 s4, s0
1216; SI-NEXT:    s_mov_b32 s5, s1
1217; SI-NEXT:    s_waitcnt vmcnt(0)
1218; SI-NEXT:    v_lshl_b64 v[0:1], s[2:3], v0
1219; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1220; SI-NEXT:    s_endpgm
1221;
1222; VI-LABEL: v_shl_constant_i64:
1223; VI:       ; %bb.0:
1224; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1225; VI-NEXT:    s_mov_b32 s7, 0xf000
1226; VI-NEXT:    s_mov_b32 s6, -1
1227; VI-NEXT:    s_waitcnt lgkmcnt(0)
1228; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1229; VI-NEXT:    s_mov_b32 s4, s0
1230; VI-NEXT:    s_mov_b32 s5, s1
1231; VI-NEXT:    s_mov_b32 s0, 0xab19b207
1232; VI-NEXT:    s_movk_i32 s1, 0x11e
1233; VI-NEXT:    s_waitcnt lgkmcnt(0)
1234; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1235; VI-NEXT:    v_mov_b32_e32 v0, s0
1236; VI-NEXT:    v_mov_b32_e32 v1, s1
1237; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1238; VI-NEXT:    s_endpgm
1239;
1240; EG-LABEL: v_shl_constant_i64:
1241; EG:       ; %bb.0:
1242; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1243; EG-NEXT:    TEX 0 @6
1244; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1245; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1246; EG-NEXT:    CF_END
1247; EG-NEXT:    PAD
1248; EG-NEXT:    Fetch clause starting at 6:
1249; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1250; EG-NEXT:    ALU clause starting at 8:
1251; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1252; EG-NEXT:    ALU clause starting at 9:
1253; EG-NEXT:     NOT_INT T0.Z, T0.X,
1254; EG-NEXT:     MOV T0.W, literal.x,
1255; EG-NEXT:     AND_INT * T1.W, T0.X, literal.y,
1256; EG-NEXT:    1435293955(1.935796e+13), 31(4.344025e-44)
1257; EG-NEXT:     LSHL T1.Z, literal.x, PS,
1258; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.y, PV.W, PV.Z,
1259; EG-NEXT:     AND_INT * T1.W, T0.X, literal.z,
1260; EG-NEXT:    -1424379385(-5.460358e-13), 143(2.003857e-43)
1261; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1262; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1263; EG-NEXT:     CNDE_INT T0.X, T1.W, T1.Z, 0.0,
1264; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1265; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1266  %a = load i64, i64 addrspace(1)* %aptr, align 8
1267  %shl = shl i64 1231231234567, %a
1268  store i64 %shl, i64 addrspace(1)* %out, align 8
1269  ret void
1270}
1271
; Shifts the i64 constant 1234567 (0x12d687, which fits in 32 bits) left by an
; i64 amount loaded from %aptr. The verde (SI prefix) and tonga (VI prefix)
; checks expect the constant to be materialized with a single s_mov_b64
; carrying a 32-bit literal, then v_lshl_b64 / s_lshl_b64 respectively.
; The redwood (EG prefix) checks expect the 32-bit expansion with the
; BIT_ALIGN_INT high input being 0.0 and literals 1234567 and 1234567 >> 1.
1272define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1273; SI-LABEL: v_shl_i64_32_bit_constant:
1274; SI:       ; %bb.0:
1275; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1276; SI-NEXT:    s_mov_b32 s7, 0xf000
1277; SI-NEXT:    s_mov_b32 s6, -1
1278; SI-NEXT:    s_mov_b32 s10, s6
1279; SI-NEXT:    s_mov_b32 s11, s7
1280; SI-NEXT:    s_waitcnt lgkmcnt(0)
1281; SI-NEXT:    s_mov_b32 s8, s2
1282; SI-NEXT:    s_mov_b32 s9, s3
1283; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1284; SI-NEXT:    s_mov_b64 s[2:3], 0x12d687
1285; SI-NEXT:    s_mov_b32 s4, s0
1286; SI-NEXT:    s_mov_b32 s5, s1
1287; SI-NEXT:    s_waitcnt vmcnt(0)
1288; SI-NEXT:    v_lshl_b64 v[0:1], s[2:3], v0
1289; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1290; SI-NEXT:    s_endpgm
1291;
1292; VI-LABEL: v_shl_i64_32_bit_constant:
1293; VI:       ; %bb.0:
1294; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1295; VI-NEXT:    s_mov_b32 s7, 0xf000
1296; VI-NEXT:    s_mov_b32 s6, -1
1297; VI-NEXT:    s_waitcnt lgkmcnt(0)
1298; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1299; VI-NEXT:    s_mov_b32 s4, s0
1300; VI-NEXT:    s_mov_b32 s5, s1
1301; VI-NEXT:    s_mov_b64 s[0:1], 0x12d687
1302; VI-NEXT:    s_waitcnt lgkmcnt(0)
1303; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1304; VI-NEXT:    v_mov_b32_e32 v0, s0
1305; VI-NEXT:    v_mov_b32_e32 v1, s1
1306; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1307; VI-NEXT:    s_endpgm
1308;
1309; EG-LABEL: v_shl_i64_32_bit_constant:
1310; EG:       ; %bb.0:
1311; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1312; EG-NEXT:    TEX 0 @6
1313; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
1314; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1315; EG-NEXT:    CF_END
1316; EG-NEXT:    PAD
1317; EG-NEXT:    Fetch clause starting at 6:
1318; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1319; EG-NEXT:    ALU clause starting at 8:
1320; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1321; EG-NEXT:    ALU clause starting at 9:
1322; EG-NEXT:     AND_INT T0.W, T0.X, literal.x,
1323; EG-NEXT:     NOT_INT * T1.W, T0.X,
1324; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1325; EG-NEXT:     BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
1326; EG-NEXT:     LSHL T0.W, literal.y, PV.W,
1327; EG-NEXT:     AND_INT * T1.W, T0.X, literal.z,
1328; EG-NEXT:    617283(8.649977e-40), 1234567(1.729997e-39)
1329; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1330; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1331; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1332; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1333; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1334  %a = load i64, i64 addrspace(1)* %aptr, align 8
1335  %shl = shl i64 1234567, %a
1336  store i64 %shl, i64 addrspace(1)* %out, align 8
1337  ret void
1338}
1339
; Shifts the i64 constant 64 left by an i64 amount loaded from %aptr.
; The verde (SI prefix) and tonga (VI prefix) checks expect 64 to appear
; directly as the source operand of v_lshl_b64 / s_lshl_b64, with no separate
; move to materialize it. The redwood (EG prefix) checks expect the 32-bit
; expansion using literals 64 and 32.
1340define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1341; SI-LABEL: v_shl_inline_imm_64_i64:
1342; SI:       ; %bb.0:
1343; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1344; SI-NEXT:    s_mov_b32 s7, 0xf000
1345; SI-NEXT:    s_mov_b32 s6, -1
1346; SI-NEXT:    s_mov_b32 s10, s6
1347; SI-NEXT:    s_mov_b32 s11, s7
1348; SI-NEXT:    s_waitcnt lgkmcnt(0)
1349; SI-NEXT:    s_mov_b32 s8, s2
1350; SI-NEXT:    s_mov_b32 s9, s3
1351; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1352; SI-NEXT:    s_mov_b32 s4, s0
1353; SI-NEXT:    s_mov_b32 s5, s1
1354; SI-NEXT:    s_waitcnt vmcnt(0)
1355; SI-NEXT:    v_lshl_b64 v[0:1], 64, v0
1356; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1357; SI-NEXT:    s_endpgm
1358;
1359; VI-LABEL: v_shl_inline_imm_64_i64:
1360; VI:       ; %bb.0:
1361; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1362; VI-NEXT:    s_waitcnt lgkmcnt(0)
1363; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
1364; VI-NEXT:    s_mov_b32 s3, 0xf000
1365; VI-NEXT:    s_mov_b32 s2, -1
1366; VI-NEXT:    s_waitcnt lgkmcnt(0)
1367; VI-NEXT:    s_lshl_b64 s[4:5], 64, s4
1368; VI-NEXT:    v_mov_b32_e32 v0, s4
1369; VI-NEXT:    v_mov_b32_e32 v1, s5
1370; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1371; VI-NEXT:    s_endpgm
1372;
1373; EG-LABEL: v_shl_inline_imm_64_i64:
1374; EG:       ; %bb.0:
1375; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1376; EG-NEXT:    TEX 0 @6
1377; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
1378; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1379; EG-NEXT:    CF_END
1380; EG-NEXT:    PAD
1381; EG-NEXT:    Fetch clause starting at 6:
1382; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1383; EG-NEXT:    ALU clause starting at 8:
1384; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1385; EG-NEXT:    ALU clause starting at 9:
1386; EG-NEXT:     AND_INT T0.W, T0.X, literal.x,
1387; EG-NEXT:     NOT_INT * T1.W, T0.X,
1388; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1389; EG-NEXT:     BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
1390; EG-NEXT:     LSHL T0.W, literal.y, PV.W,
1391; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
1392; EG-NEXT:    32(4.484155e-44), 64(8.968310e-44)
1393; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1394; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1395; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1396; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1397  %a = load i64, i64 addrspace(1)* %aptr, align 8
1398  %shl = shl i64 64, %a
1399  store i64 %shl, i64 addrspace(1)* %out, align 8
1400  ret void
1401}
1402
; Shifts the i64 constant 64 left by the kernel argument %a (%aptr is declared
; but not referenced in the body). The verde (SI prefix) and tonga (VI prefix)
; checks expect a single-dword load of %a and s_lshl_b64 with 64 directly as
; its source operand. The redwood (EG prefix) checks expect the 32-bit
; expansion using literals 64 and 32.
1403define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1404; SI-LABEL: s_shl_inline_imm_64_i64:
1405; SI:       ; %bb.0:
1406; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1407; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1408; SI-NEXT:    s_mov_b32 s3, 0xf000
1409; SI-NEXT:    s_mov_b32 s2, -1
1410; SI-NEXT:    s_waitcnt lgkmcnt(0)
1411; SI-NEXT:    s_lshl_b64 s[4:5], 64, s4
1412; SI-NEXT:    v_mov_b32_e32 v0, s4
1413; SI-NEXT:    v_mov_b32_e32 v1, s5
1414; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1415; SI-NEXT:    s_endpgm
1416;
1417; VI-LABEL: s_shl_inline_imm_64_i64:
1418; VI:       ; %bb.0:
1419; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1420; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1421; VI-NEXT:    s_mov_b32 s3, 0xf000
1422; VI-NEXT:    s_mov_b32 s2, -1
1423; VI-NEXT:    s_waitcnt lgkmcnt(0)
1424; VI-NEXT:    s_lshl_b64 s[4:5], 64, s4
1425; VI-NEXT:    v_mov_b32_e32 v0, s4
1426; VI-NEXT:    v_mov_b32_e32 v1, s5
1427; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1428; VI-NEXT:    s_endpgm
1429;
1430; EG-LABEL: s_shl_inline_imm_64_i64:
1431; EG:       ; %bb.0:
1432; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
1433; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1434; EG-NEXT:    CF_END
1435; EG-NEXT:    PAD
1436; EG-NEXT:    ALU clause starting at 4:
1437; EG-NEXT:     NOT_INT T0.W, KC0[2].W,
1438; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.x,
1439; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1440; EG-NEXT:     LSHL T0.Z, literal.x, PS,
1441; EG-NEXT:     BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
1442; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1443; EG-NEXT:    64(8.968310e-44), 32(4.484155e-44)
1444; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1445; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.Z, 0.0,
1446; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1447; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1448  %shl = shl i64 64, %a
1449  store i64 %shl, i64 addrspace(1)* %out, align 8
1450  ret void
1451}
1452
; Shifts the i64 constant 1 left by the kernel argument %a (%aptr is declared
; but not referenced in the body). The verde (SI prefix) and tonga (VI prefix)
; checks expect s_lshl_b64 with 1 directly as its source operand.
; The redwood (EG prefix) checks expect a sequence specific to this constant:
; one 32-bit LSHL of 1, with the high half obtained by masking that result
; (LSHL by 26 + ASHR by 31 builds the mask from bit 5 of the amount) and the
; low half selected by CNDE_INT on (amount & 32).
1453define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1454; SI-LABEL: s_shl_inline_imm_1_i64:
1455; SI:       ; %bb.0:
1456; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1457; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1458; SI-NEXT:    s_mov_b32 s3, 0xf000
1459; SI-NEXT:    s_mov_b32 s2, -1
1460; SI-NEXT:    s_waitcnt lgkmcnt(0)
1461; SI-NEXT:    s_lshl_b64 s[4:5], 1, s4
1462; SI-NEXT:    v_mov_b32_e32 v0, s4
1463; SI-NEXT:    v_mov_b32_e32 v1, s5
1464; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1465; SI-NEXT:    s_endpgm
1466;
1467; VI-LABEL: s_shl_inline_imm_1_i64:
1468; VI:       ; %bb.0:
1469; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1470; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1471; VI-NEXT:    s_mov_b32 s3, 0xf000
1472; VI-NEXT:    s_mov_b32 s2, -1
1473; VI-NEXT:    s_waitcnt lgkmcnt(0)
1474; VI-NEXT:    s_lshl_b64 s[4:5], 1, s4
1475; VI-NEXT:    v_mov_b32_e32 v0, s4
1476; VI-NEXT:    v_mov_b32_e32 v1, s5
1477; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1478; VI-NEXT:    s_endpgm
1479;
1480; EG-LABEL: s_shl_inline_imm_1_i64:
1481; EG:       ; %bb.0:
1482; EG-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
1483; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1484; EG-NEXT:    CF_END
1485; EG-NEXT:    PAD
1486; EG-NEXT:    ALU clause starting at 4:
1487; EG-NEXT:     AND_INT T0.W, KC0[2].W, literal.x,
1488; EG-NEXT:     LSHL * T1.W, KC0[2].W, literal.y,
1489; EG-NEXT:    31(4.344025e-44), 26(3.643376e-44)
1490; EG-NEXT:     ASHR T1.W, PS, literal.x,
1491; EG-NEXT:     LSHL * T0.W, 1, PV.W,
1492; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1493; EG-NEXT:     AND_INT T0.Y, PV.W, PS,
1494; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.x,
1495; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1496; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, 0.0,
1497; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1498; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1499  %shl = shl i64 1, %a
1500  store i64 %shl, i64 addrspace(1)* %out, align 8
1501  ret void
1502}
1503
; Shifts the i64 constant 4607182418800017408 (0x3FF0000000000000, the bit
; pattern of the double 1.0) left by the kernel argument %a (%aptr is declared
; but not referenced in the body). The verde (SI prefix) and tonga (VI prefix)
; checks expect the constant to be printed as the operand 1.0 of s_lshl_b64.
; The low dword of the constant is zero, so the redwood (EG prefix) checks
; expect a zero low result (MOV 0.0) and only a BIT_ALIGN_INT + CNDE_INT for
; the high half (literal 536346624 = 0x3FF00000 >> 1).
1504define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1505; SI-LABEL: s_shl_inline_imm_1_0_i64:
1506; SI:       ; %bb.0:
1507; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1508; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1509; SI-NEXT:    s_mov_b32 s3, 0xf000
1510; SI-NEXT:    s_mov_b32 s2, -1
1511; SI-NEXT:    s_waitcnt lgkmcnt(0)
1512; SI-NEXT:    s_lshl_b64 s[4:5], 1.0, s4
1513; SI-NEXT:    v_mov_b32_e32 v0, s4
1514; SI-NEXT:    v_mov_b32_e32 v1, s5
1515; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1516; SI-NEXT:    s_endpgm
1517;
1518; VI-LABEL: s_shl_inline_imm_1_0_i64:
1519; VI:       ; %bb.0:
1520; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1521; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1522; VI-NEXT:    s_mov_b32 s3, 0xf000
1523; VI-NEXT:    s_mov_b32 s2, -1
1524; VI-NEXT:    s_waitcnt lgkmcnt(0)
1525; VI-NEXT:    s_lshl_b64 s[4:5], 1.0, s4
1526; VI-NEXT:    v_mov_b32_e32 v0, s4
1527; VI-NEXT:    v_mov_b32_e32 v1, s5
1528; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1529; VI-NEXT:    s_endpgm
1530;
1531; EG-LABEL: s_shl_inline_imm_1_0_i64:
1532; EG:       ; %bb.0:
1533; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1534; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1535; EG-NEXT:    CF_END
1536; EG-NEXT:    PAD
1537; EG-NEXT:    ALU clause starting at 4:
1538; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1539; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1540; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1541; EG-NEXT:    536346624(1.050321e-19), 32(4.484155e-44)
1542; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1543; EG-NEXT:     MOV T0.X, 0.0,
1544; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1545; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1546  %shl = shl i64 4607182418800017408, %a
1547  store i64 %shl, i64 addrspace(1)* %out, align 8
1548  ret void
1549}
1550
; Shifts the i64 constant 13830554455654793216 (0xBFF0000000000000, the bit
; pattern of the double -1.0) left by the kernel argument %a (%aptr is
; declared but not referenced in the body). The verde (SI prefix) and tonga
; (VI prefix) checks expect the constant to be printed as the operand -1.0 of
; s_lshl_b64. The low dword of the constant is zero, so the redwood
; (EG prefix) checks expect a zero low result (MOV 0.0) and only a
; BIT_ALIGN_INT + CNDE_INT for the high half
; (literal 1610088448 = 0xBFF00000 >> 1).
1551define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1552; SI-LABEL: s_shl_inline_imm_neg_1_0_i64:
1553; SI:       ; %bb.0:
1554; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1555; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1556; SI-NEXT:    s_mov_b32 s3, 0xf000
1557; SI-NEXT:    s_mov_b32 s2, -1
1558; SI-NEXT:    s_waitcnt lgkmcnt(0)
1559; SI-NEXT:    s_lshl_b64 s[4:5], -1.0, s4
1560; SI-NEXT:    v_mov_b32_e32 v0, s4
1561; SI-NEXT:    v_mov_b32_e32 v1, s5
1562; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1563; SI-NEXT:    s_endpgm
1564;
1565; VI-LABEL: s_shl_inline_imm_neg_1_0_i64:
1566; VI:       ; %bb.0:
1567; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1568; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1569; VI-NEXT:    s_mov_b32 s3, 0xf000
1570; VI-NEXT:    s_mov_b32 s2, -1
1571; VI-NEXT:    s_waitcnt lgkmcnt(0)
1572; VI-NEXT:    s_lshl_b64 s[4:5], -1.0, s4
1573; VI-NEXT:    v_mov_b32_e32 v0, s4
1574; VI-NEXT:    v_mov_b32_e32 v1, s5
1575; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1576; VI-NEXT:    s_endpgm
1577;
1578; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
1579; EG:       ; %bb.0:
1580; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1581; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1582; EG-NEXT:    CF_END
1583; EG-NEXT:    PAD
1584; EG-NEXT:    ALU clause starting at 4:
1585; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1586; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1587; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1588; EG-NEXT:    1610088448(3.574057e+19), 32(4.484155e-44)
1589; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1590; EG-NEXT:     MOV T0.X, 0.0,
1591; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1592; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1593  %shl = shl i64 13830554455654793216, %a
1594  store i64 %shl, i64 addrspace(1)* %out, align 8
1595  ret void
1596}
1597
; Scalar 64-bit shl of the constant 0x3FE0000000000000 (the bit pattern of
; double 0.5) by a kernel argument. On SI and VI the constant is expected to
; fold into s_lshl_b64 as the inline immediate 0.5. %aptr is never used in the
; body; it only occupies the kernarg slot ahead of %a.
define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_0_5_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b64 s[4:5], 0.5, s4
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_inline_imm_0_5_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b64 s[4:5], 0.5, s4
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_shl_inline_imm_0_5_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT:    535822336(1.016440e-19), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT:     MOV T0.X, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4602678819172646912, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}
1644
; Scalar 64-bit shl of the constant 0xBFE0000000000000 (the bit pattern of
; double -0.5) by a kernel argument. On SI and VI the constant is expected to
; fold into s_lshl_b64 as the inline immediate -0.5. %aptr is never used in
; the body; it only occupies the kernarg slot ahead of %a.
define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_neg_0_5_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b64 s[4:5], -0.5, s4
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_inline_imm_neg_0_5_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b64 s[4:5], -0.5, s4
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT:    1609564160(3.458765e+19), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT:     MOV T0.X, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13826050856027422720, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}
1691
; Scalar 64-bit shl of the constant 0x4000000000000000 (the bit pattern of
; double 2.0) by a kernel argument. On SI and VI the constant is expected to
; fold into s_lshl_b64 as the inline immediate 2.0. %aptr is never used in the
; body; it only occupies the kernarg slot ahead of %a.
define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_2_0_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b64 s[4:5], 2.0, s4
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_inline_imm_2_0_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b64 s[4:5], 2.0, s4
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_shl_inline_imm_2_0_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT:    536870912(1.084202e-19), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT:     MOV T0.X, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4611686018427387904, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}
1738
; Scalar 64-bit shl of the constant 0xC000000000000000 (the bit pattern of
; double -2.0) by a kernel argument. On SI and VI the constant is expected to
; fold into s_lshl_b64 as the inline immediate -2.0. %aptr is never used in
; the body; it only occupies the kernarg slot ahead of %a.
define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_neg_2_0_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b64 s[4:5], -2.0, s4
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_inline_imm_neg_2_0_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b64 s[4:5], -2.0, s4
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT:    1610612736(3.689349e+19), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT:     MOV T0.X, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13835058055282163712, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}
1785
; Scalar 64-bit shl of the constant 0x4010000000000000 (the bit pattern of
; double 4.0) by a kernel argument. On SI and VI the constant is expected to
; fold into s_lshl_b64 as the inline immediate 4.0. %aptr is never used in the
; body; it only occupies the kernarg slot ahead of %a.
define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_4_0_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b64 s[4:5], 4.0, s4
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_inline_imm_4_0_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b64 s[4:5], 4.0, s4
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_shl_inline_imm_4_0_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT:    537395200(1.151965e-19), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT:     MOV T0.X, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4616189618054758400, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}
1832
; Scalar 64-bit shl of the constant 0xC010000000000000 (the bit pattern of
; double -4.0) by a kernel argument. On SI and VI the constant is expected to
; fold into s_lshl_b64 as the inline immediate -4.0. %aptr is never used in
; the body; it only occupies the kernarg slot ahead of %a.
define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_neg_4_0_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b64 s[4:5], -4.0, s4
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_inline_imm_neg_4_0_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b64 s[4:5], -4.0, s4
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT:    1611137024(3.919933e+19), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT:     MOV T0.X, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13839561654909534208, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}
1879
1880
; Test with the 64-bit integer bit pattern for a 32-bit float in the
; low 32 bits, which is not a valid 64-bit inline immediate.
; The constant is 1082130432 == 0x40800000 (float 4.0). On SI and VI it is
; expected to be materialized with an s_mov_b64 of the literal rather than
; folded into s_lshl_b64. %aptr is never used in the body; it only occupies
; the kernarg slot ahead of %a.
define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_f32_4_0_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
; SI-NEXT:    s_mov_b64 s[0:1], 0x40800000
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_inline_imm_f32_4_0_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
; VI-NEXT:    s_mov_b64 s[0:1], 0x40800000
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     NOT_INT T0.W, KC0[2].W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     LSHL T0.Z, literal.x, PS,
; EG-NEXT:     BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.z,
; EG-NEXT:    1082130432(4.000000e+00), 541065216(1.626303e-19)
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 1082130432, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}
1935
; FIXME: Copy of -1 register
; The constant is -1065353216, i.e. 0xC0800000 (float -4.0) sign-extended to
; 64 bits, so the high dword is all ones. On SI and VI the low half is expected
; as "s_mov_b32 s0, -4.0", while the high half is produced by copying the -1
; already held in s6 ("s_mov_b32 s1, s6"); that copy is presumably what the
; FIXME above refers to. %aptr is never used in the body; it only occupies the
; kernarg slot ahead of %a.
define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s0, -4.0
; SI-NEXT:    s_mov_b32 s1, s6
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s0, -4.0
; VI-NEXT:    s_mov_b32 s1, s6
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     AND_INT T0.Z, KC0[2].W, literal.x,
; EG-NEXT:     MOV T0.W, literal.y,
; EG-NEXT:     NOT_INT * T1.W, KC0[2].W,
; EG-NEXT:    31(4.344025e-44), -532676608(-5.534023e+19)
; EG-NEXT:     BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
; EG-NEXT:     LSHL T0.W, literal.y, PV.Z,
; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.z,
; EG-NEXT:    2147483647(nan), -1065353216(-4.000000e+00)
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 -1065353216, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}
1992
; Scalar 64-bit shl of the constant 0x4080000000000000: the bit pattern of
; float 4.0 in the high 32 bits with a zero low half. On SI and VI the value is
; expected to be built from two 32-bit moves (0 and the inline immediate 4.0)
; rather than folded into s_lshl_b64. %aptr is never used in the body; it only
; occupies the kernarg slot ahead of %a.
define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_mov_b32 s1, 4.0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s0, 0
; VI-NEXT:    s_mov_b32 s1, 4.0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT:    541065216(1.626303e-19), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT:     MOV T0.X, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4647714815446351872, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}
2043
; Scalar 64-bit shl of the constant 0xC080000000000000: the bit pattern of
; float -4.0 in the high 32 bits with a zero low half. On SI and VI the value
; is expected to be built from two 32-bit moves (0 and the inline immediate
; -4.0) rather than folded into s_lshl_b64. %aptr is never used in the body;
; it only occupies the kernarg slot ahead of %a.
define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_mov_b32 s1, -4.0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s0, 0
; VI-NEXT:    s_mov_b32 s1, -4.0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT:    1614807040(5.534023e+19), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT:     MOV T0.X, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13871086852301127680, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}
2094
; A 32-bit multiply by 2 is expected to be selected as a left shift by 1
; (s_lshl_b32 on SI and VI, LSHL on EG). The product is stored with a volatile
; store to an undef pointer so that it has a use.
define amdgpu_kernel void @test_mul2(i32 %p) {
; SI-LABEL: test_mul2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s0, s0, 1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_mul2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; EG-LABEL: test_mul2:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     MOV T0.X, literal.x,
; EG-NEXT:     LSHL * T1.X, KC0[2].Y, 1,
; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
  %i = mul i32 %p, 2
  store volatile i32 %i, i32 addrspace(1)* undef
  ret void
}
2134
; (%in | 1) << 2 where the or has a single use. The checks expect the shift to
; be applied first and the constant to be pre-shifted, i.e. (%in << 2) | 4.
; This is a non-kernel function, so SI and VI return with s_setpc_b64.
define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
; SI-LABEL: shl_or_k:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    v_or_b32_e32 v2, 4, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: shl_or_k:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
; VI-NEXT:    v_or_b32_e32 v2, 4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; EG-LABEL: shl_or_k:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     OR_INT T0.X, PV.W, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    4(5.605194e-45), 2(2.802597e-45)
  %tmp0 = or i32 %in, 1
  %tmp2 = shl i32 %tmp0, 2
  store i32 %tmp2, i32 addrspace(1)* %out
  ret void
}
2175
; Same (%in | 1) << 2 pattern as shl_or_k, but the or result is also stored to
; %out1. With the second use, the checks expect the original order to be kept:
; or with 1 first, then shift left by 2.
define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
; SI-LABEL: shl_or_k_two_uses:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_or_b32_e32 v4, 1, v4
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    v_lshlrev_b32_e32 v5, 2, v4
; SI-NEXT:    buffer_store_dword v5, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    buffer_store_dword v4, v[2:3], s[4:7], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: shl_or_k_two_uses:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_or_b32_e32 v4, 1, v4
; VI-NEXT:    v_lshlrev_b32_e32 v5, 2, v4
; VI-NEXT:    flat_store_dword v[0:1], v5
; VI-NEXT:    flat_store_dword v[2:3], v4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; EG-LABEL: shl_or_k_two_uses:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHR T0.X, KC0[2].Z, literal.x,
; EG-NEXT:     OR_INT * T1.X, KC0[2].W, 1,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     LSHL T2.X, PS, literal.x,
; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tmp0 = or i32 %in, 1
  %tmp2 = shl i32 %tmp0, 2
  store i32 %tmp2, i32 addrspace(1)* %out0
  store i32 %tmp0, i32 addrspace(1)* %out1
  ret void
}
2220
2221attributes #0 = { nounwind readnone }
2222