1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=SI
3; RUN: llc < %s -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=VI
4; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefixes=EG
5
; AMDGPU id intrinsics. workitem.id.x is called by the per-workitem tests in
; this file (shl_i16_computed_amount, shl_v2i16, shl_v4i16) to build a
; lane-dependent address.
; NOTE(review): no call to workgroup.id.x appears in the part of the file
; reviewed here - confirm it is used further down before removing it.
6declare i32 @llvm.amdgcn.workitem.id.x() #0
7
8declare i32 @llvm.amdgcn.workgroup.id.x() #0
9
; shl_v2i32 - <2 x i32> shift-left with both operands loaded from global memory.
; %a is read from %in, the per-lane shift amounts %b from the next <2 x i32>
; element after %in, and the result is stored to %out.
; Expected lowering, per the autogenerated checks below:
;   - verde (SI prefix): one v_lshl_b32 per lane on a single dwordx4 buffer load.
;   - tonga (VI prefix): scalar loads followed by one s_lshl_b32 per lane.
;   - redwood (EG prefix): two VTX_READ_64 fetches and one LSHL per lane.
; The check lines are generated by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
10define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
11; SI-LABEL: shl_v2i32:
12; SI:       ; %bb.0:
13; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
14; SI-NEXT:    s_mov_b32 s7, 0xf000
15; SI-NEXT:    s_mov_b32 s6, -1
16; SI-NEXT:    s_mov_b32 s10, s6
17; SI-NEXT:    s_mov_b32 s11, s7
18; SI-NEXT:    s_waitcnt lgkmcnt(0)
19; SI-NEXT:    s_mov_b32 s8, s2
20; SI-NEXT:    s_mov_b32 s9, s3
21; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
22; SI-NEXT:    s_mov_b32 s4, s0
23; SI-NEXT:    s_mov_b32 s5, s1
24; SI-NEXT:    s_waitcnt vmcnt(0)
25; SI-NEXT:    v_lshl_b32_e32 v1, v1, v3
26; SI-NEXT:    v_lshl_b32_e32 v0, v0, v2
27; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
28; SI-NEXT:    s_endpgm
29;
30; VI-LABEL: shl_v2i32:
31; VI:       ; %bb.0:
32; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
33; VI-NEXT:    s_waitcnt lgkmcnt(0)
34; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
35; VI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x8
36; VI-NEXT:    s_mov_b32 s3, 0xf000
37; VI-NEXT:    s_mov_b32 s2, -1
38; VI-NEXT:    s_waitcnt lgkmcnt(0)
39; VI-NEXT:    s_lshl_b32 s5, s5, s7
40; VI-NEXT:    s_lshl_b32 s4, s4, s6
41; VI-NEXT:    v_mov_b32_e32 v0, s4
42; VI-NEXT:    v_mov_b32_e32 v1, s5
43; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
44; VI-NEXT:    s_endpgm
45;
46; EG-LABEL: shl_v2i32:
47; EG:       ; %bb.0:
48; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
49; EG-NEXT:    TEX 1 @6
50; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
51; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
52; EG-NEXT:    CF_END
53; EG-NEXT:    PAD
54; EG-NEXT:    Fetch clause starting at 6:
55; EG-NEXT:     VTX_READ_64 T1.XY, T0.X, 8, #1
56; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
57; EG-NEXT:    ALU clause starting at 10:
58; EG-NEXT:     MOV * T0.X, KC0[2].Z,
59; EG-NEXT:    ALU clause starting at 11:
60; EG-NEXT:     LSHL * T0.Y, T0.Y, T1.Y,
61; EG-NEXT:     LSHL T0.X, T0.X, T1.X,
62; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
63; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr is the second <2 x i32> element of %in; it holds the shift amounts.
64  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
65  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
66  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
67  %result = shl <2 x i32> %a, %b
68  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
69  ret void
70}
71
; shl_v4i32 - <4 x i32> shift-left with both operands loaded from global memory.
; %a is read from %in, the per-lane shift amounts %b from the next <4 x i32>
; element after %in (byte offset 16 in the checks), and the result is stored
; to %out.
; Expected lowering, per the autogenerated checks below:
;   - verde (SI prefix): two dwordx4 buffer loads and four v_lshl_b32.
;   - tonga (VI prefix): two scalar dwordx4 loads and four s_lshl_b32.
;   - redwood (EG prefix): two VTX_READ_128 fetches and four LSHL.
72define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
73; SI-LABEL: shl_v4i32:
74; SI:       ; %bb.0:
75; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
76; SI-NEXT:    s_mov_b32 s7, 0xf000
77; SI-NEXT:    s_mov_b32 s6, -1
78; SI-NEXT:    s_mov_b32 s10, s6
79; SI-NEXT:    s_mov_b32 s11, s7
80; SI-NEXT:    s_waitcnt lgkmcnt(0)
81; SI-NEXT:    s_mov_b32 s8, s2
82; SI-NEXT:    s_mov_b32 s9, s3
83; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
84; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
85; SI-NEXT:    s_mov_b32 s4, s0
86; SI-NEXT:    s_mov_b32 s5, s1
87; SI-NEXT:    s_waitcnt vmcnt(0)
88; SI-NEXT:    v_lshl_b32_e32 v3, v3, v7
89; SI-NEXT:    v_lshl_b32_e32 v2, v2, v6
90; SI-NEXT:    v_lshl_b32_e32 v1, v1, v5
91; SI-NEXT:    v_lshl_b32_e32 v0, v0, v4
92; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
93; SI-NEXT:    s_endpgm
94;
95; VI-LABEL: shl_v4i32:
96; VI:       ; %bb.0:
97; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
98; VI-NEXT:    s_waitcnt lgkmcnt(0)
99; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
100; VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x10
101; VI-NEXT:    s_mov_b32 s3, 0xf000
102; VI-NEXT:    s_mov_b32 s2, -1
103; VI-NEXT:    s_waitcnt lgkmcnt(0)
104; VI-NEXT:    s_lshl_b32 s7, s7, s11
105; VI-NEXT:    s_lshl_b32 s6, s6, s10
106; VI-NEXT:    s_lshl_b32 s5, s5, s9
107; VI-NEXT:    s_lshl_b32 s4, s4, s8
108; VI-NEXT:    v_mov_b32_e32 v0, s4
109; VI-NEXT:    v_mov_b32_e32 v1, s5
110; VI-NEXT:    v_mov_b32_e32 v2, s6
111; VI-NEXT:    v_mov_b32_e32 v3, s7
112; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
113; VI-NEXT:    s_endpgm
114;
115; EG-LABEL: shl_v4i32:
116; EG:       ; %bb.0:
117; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
118; EG-NEXT:    TEX 1 @6
119; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
120; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
121; EG-NEXT:    CF_END
122; EG-NEXT:    PAD
123; EG-NEXT:    Fetch clause starting at 6:
124; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
125; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
126; EG-NEXT:    ALU clause starting at 10:
127; EG-NEXT:     MOV * T0.X, KC0[2].Z,
128; EG-NEXT:    ALU clause starting at 11:
129; EG-NEXT:     LSHL * T0.W, T0.W, T1.W,
130; EG-NEXT:     LSHL * T0.Z, T0.Z, T1.Z,
131; EG-NEXT:     LSHL * T0.Y, T0.Y, T1.Y,
132; EG-NEXT:     LSHL T0.X, T0.X, T1.X,
133; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
134; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr is the second <4 x i32> element of %in; it holds the shift amounts.
135  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
136  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
137  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
138  %result = shl <4 x i32> %a, %b
139  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
140  ret void
141}
142
; shl_i16 - scalar i16 shift-left with both operands loaded from global memory.
; %a is read from %in and the shift amount %b from the next i16 after %in
; (byte offset 2 in the checks); the i16 result is stored to %out.
; Expected lowering, per the autogenerated checks below:
;   - verde (SI prefix) and tonga (VI prefix) emit the same sequence: two
;     buffer_load_ushort, a 32-bit v_lshlrev_b32, then buffer_store_short.
;   - redwood (EG prefix): two VTX_READ_16 fetches, LSHL, then a masked
;     MEM_RAT MSKOR store built with a 65535 mask.
143define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
144; SI-LABEL: shl_i16:
145; SI:       ; %bb.0:
146; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
147; SI-NEXT:    s_mov_b32 s7, 0xf000
148; SI-NEXT:    s_mov_b32 s6, -1
149; SI-NEXT:    s_mov_b32 s10, s6
150; SI-NEXT:    s_mov_b32 s11, s7
151; SI-NEXT:    s_waitcnt lgkmcnt(0)
152; SI-NEXT:    s_mov_b32 s8, s2
153; SI-NEXT:    s_mov_b32 s9, s3
154; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
155; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
156; SI-NEXT:    s_mov_b32 s4, s0
157; SI-NEXT:    s_mov_b32 s5, s1
158; SI-NEXT:    s_waitcnt vmcnt(0)
159; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
160; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
161; SI-NEXT:    s_endpgm
162;
163; VI-LABEL: shl_i16:
164; VI:       ; %bb.0:
165; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
166; VI-NEXT:    s_mov_b32 s7, 0xf000
167; VI-NEXT:    s_mov_b32 s6, -1
168; VI-NEXT:    s_mov_b32 s10, s6
169; VI-NEXT:    s_mov_b32 s11, s7
170; VI-NEXT:    s_waitcnt lgkmcnt(0)
171; VI-NEXT:    s_mov_b32 s8, s2
172; VI-NEXT:    s_mov_b32 s9, s3
173; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
174; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
175; VI-NEXT:    s_mov_b32 s4, s0
176; VI-NEXT:    s_mov_b32 s5, s1
177; VI-NEXT:    s_waitcnt vmcnt(0)
178; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
179; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
180; VI-NEXT:    s_endpgm
181;
182; EG-LABEL: shl_i16:
183; EG:       ; %bb.0:
184; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
185; EG-NEXT:    TEX 1 @6
186; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
187; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
188; EG-NEXT:    CF_END
189; EG-NEXT:    PAD
190; EG-NEXT:    Fetch clause starting at 6:
191; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
192; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
193; EG-NEXT:    ALU clause starting at 10:
194; EG-NEXT:     MOV * T0.X, KC0[2].Z,
195; EG-NEXT:    ALU clause starting at 11:
196; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
197; EG-NEXT:     LSHL * T1.W, T0.X, T1.X,
198; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
199; EG-NEXT:     AND_INT T1.W, PS, literal.x,
200; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
201; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
202; EG-NEXT:     LSHL T0.X, PV.W, PS,
203; EG-NEXT:     LSHL * T0.W, literal.x, PS,
204; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
205; EG-NEXT:     MOV T0.Y, 0.0,
206; EG-NEXT:     MOV * T0.Z, 0.0,
207; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
208; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr is the i16 immediately after %in; it holds the shift amount.
209  %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
210  %a = load i16, i16 addrspace(1)* %in
211  %b = load i16, i16 addrspace(1)* %b_ptr
212  %result = shl i16 %a, %b
213  store i16 %result, i16 addrspace(1)* %out
214  ret void
215}
216
; shl_i16_v_s - i16 shift-left where the value %a is loaded from %in and the
; shift amount %b is a kernel argument. The result is stored to %out.
; Expected lowering, per the autogenerated checks below:
;   - verde (SI prefix) and tonga (VI prefix): %b arrives through s_load_dword
;     into s12 and is used directly as the SGPR shift operand of
;     v_lshlrev_b32; only the kernarg offsets differ (0x9/0xd vs 0x24/0x34).
;   - redwood (EG prefix): two VTX_READ_16 fetches, LSHL, then a masked
;     MEM_RAT MSKOR store.
217define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
218; SI-LABEL: shl_i16_v_s:
219; SI:       ; %bb.0:
220; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
221; SI-NEXT:    s_load_dword s12, s[0:1], 0xd
222; SI-NEXT:    s_mov_b32 s3, 0xf000
223; SI-NEXT:    s_mov_b32 s2, -1
224; SI-NEXT:    s_mov_b32 s10, s2
225; SI-NEXT:    s_waitcnt lgkmcnt(0)
226; SI-NEXT:    s_mov_b32 s8, s6
227; SI-NEXT:    s_mov_b32 s9, s7
228; SI-NEXT:    s_mov_b32 s11, s3
229; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
230; SI-NEXT:    s_mov_b32 s0, s4
231; SI-NEXT:    s_mov_b32 s1, s5
232; SI-NEXT:    s_waitcnt vmcnt(0)
233; SI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
234; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
235; SI-NEXT:    s_endpgm
236;
237; VI-LABEL: shl_i16_v_s:
238; VI:       ; %bb.0:
239; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
240; VI-NEXT:    s_load_dword s12, s[0:1], 0x34
241; VI-NEXT:    s_mov_b32 s3, 0xf000
242; VI-NEXT:    s_mov_b32 s2, -1
243; VI-NEXT:    s_mov_b32 s10, s2
244; VI-NEXT:    s_waitcnt lgkmcnt(0)
245; VI-NEXT:    s_mov_b32 s8, s6
246; VI-NEXT:    s_mov_b32 s9, s7
247; VI-NEXT:    s_mov_b32 s11, s3
248; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
249; VI-NEXT:    s_mov_b32 s0, s4
250; VI-NEXT:    s_mov_b32 s1, s5
251; VI-NEXT:    s_waitcnt vmcnt(0)
252; VI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
253; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
254; VI-NEXT:    s_endpgm
255;
256; EG-LABEL: shl_i16_v_s:
257; EG:       ; %bb.0:
258; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
259; EG-NEXT:    TEX 1 @6
260; EG-NEXT:    ALU 12, @12, KC0[CB0:0-32], KC1[]
261; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
262; EG-NEXT:    CF_END
263; EG-NEXT:    PAD
264; EG-NEXT:    Fetch clause starting at 6:
265; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
266; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 44, #3
267; EG-NEXT:    ALU clause starting at 10:
268; EG-NEXT:     MOV T0.X, 0.0,
269; EG-NEXT:     MOV * T1.X, KC0[2].Z,
270; EG-NEXT:    ALU clause starting at 12:
271; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
272; EG-NEXT:     LSHL * T1.W, T1.X, T0.X,
273; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
274; EG-NEXT:     AND_INT T1.W, PS, literal.x,
275; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
276; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
277; EG-NEXT:     LSHL T0.X, PV.W, PS,
278; EG-NEXT:     LSHL * T0.W, literal.x, PS,
279; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
280; EG-NEXT:     MOV T0.Y, 0.0,
281; EG-NEXT:     MOV * T0.Z, 0.0,
282; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
283; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Only %a comes from memory; %b is the third kernel argument.
284  %a = load i16, i16 addrspace(1)* %in
285  %result = shl i16 %a, %b
286  store i16 %result, i16 addrspace(1)* %out
287  ret void
288}
289
; shl_i16_v_compute_s - like shl_i16_v_s, but the shift amount is computed:
; the kernel argument %b plus the constant 3. %a is loaded from %in and the
; result is stored to %out.
; Expected lowering, per the autogenerated checks below:
;   - verde (SI prefix) and tonga (VI prefix): the add is done with
;     s_add_i32 s12, s12, 3 and the sum feeds v_lshlrev_b32 as an SGPR
;     operand.
;   - redwood (EG prefix): ADD_INT by literal 3, an AND_INT with 65535 on the
;     sum, LSHL, then a masked MEM_RAT MSKOR store.
290define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
291; SI-LABEL: shl_i16_v_compute_s:
292; SI:       ; %bb.0:
293; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
294; SI-NEXT:    s_load_dword s12, s[0:1], 0xd
295; SI-NEXT:    s_mov_b32 s3, 0xf000
296; SI-NEXT:    s_mov_b32 s2, -1
297; SI-NEXT:    s_mov_b32 s10, s2
298; SI-NEXT:    s_waitcnt lgkmcnt(0)
299; SI-NEXT:    s_mov_b32 s8, s6
300; SI-NEXT:    s_mov_b32 s9, s7
301; SI-NEXT:    s_mov_b32 s11, s3
302; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
303; SI-NEXT:    s_add_i32 s12, s12, 3
304; SI-NEXT:    s_mov_b32 s0, s4
305; SI-NEXT:    s_mov_b32 s1, s5
306; SI-NEXT:    s_waitcnt vmcnt(0)
307; SI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
308; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
309; SI-NEXT:    s_endpgm
310;
311; VI-LABEL: shl_i16_v_compute_s:
312; VI:       ; %bb.0:
313; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
314; VI-NEXT:    s_load_dword s12, s[0:1], 0x34
315; VI-NEXT:    s_mov_b32 s3, 0xf000
316; VI-NEXT:    s_mov_b32 s2, -1
317; VI-NEXT:    s_mov_b32 s10, s2
318; VI-NEXT:    s_waitcnt lgkmcnt(0)
319; VI-NEXT:    s_mov_b32 s8, s6
320; VI-NEXT:    s_mov_b32 s9, s7
321; VI-NEXT:    s_mov_b32 s11, s3
322; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
323; VI-NEXT:    s_add_i32 s12, s12, 3
324; VI-NEXT:    s_mov_b32 s0, s4
325; VI-NEXT:    s_mov_b32 s1, s5
326; VI-NEXT:    s_waitcnt vmcnt(0)
327; VI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
328; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
329; VI-NEXT:    s_endpgm
330;
331; EG-LABEL: shl_i16_v_compute_s:
332; EG:       ; %bb.0:
333; EG-NEXT:    ALU 0, @12, KC0[], KC1[]
334; EG-NEXT:    TEX 0 @8
335; EG-NEXT:    ALU 0, @13, KC0[CB0:0-32], KC1[]
336; EG-NEXT:    TEX 0 @10
337; EG-NEXT:    ALU 15, @14, KC0[CB0:0-32], KC1[]
338; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
339; EG-NEXT:    CF_END
340; EG-NEXT:    PAD
341; EG-NEXT:    Fetch clause starting at 8:
342; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 44, #3
343; EG-NEXT:    Fetch clause starting at 10:
344; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
345; EG-NEXT:    ALU clause starting at 12:
346; EG-NEXT:     MOV * T0.X, 0.0,
347; EG-NEXT:    ALU clause starting at 13:
348; EG-NEXT:     MOV * T1.X, KC0[2].Z,
349; EG-NEXT:    ALU clause starting at 14:
350; EG-NEXT:     ADD_INT * T0.W, T0.X, literal.x,
351; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
352; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
353; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
354; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
355; EG-NEXT:     LSHL * T0.W, T1.X, PV.W,
356; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
357; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
358; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
359; EG-NEXT:     LSHL T0.X, PV.W, PS,
360; EG-NEXT:     LSHL * T0.W, literal.x, PS,
361; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
362; EG-NEXT:     MOV T0.Y, 0.0,
363; EG-NEXT:     MOV * T0.Z, 0.0,
364; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
365; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
366  %a = load i16, i16 addrspace(1)* %in
  ; The shift amount is derived from the kernel argument, not loaded.
367  %b.add = add i16 %b, 3
368  %result = shl i16 %a, %b.add
369  store i16 %result, i16 addrspace(1)* %out
370  ret void
371}
372
; shl_i16_computed_amount - i16 shift-left where the shift amount is loaded
; from a per-workitem address and then incremented by 3.
; %a is a volatile load from %in itself; %b is a volatile load from the i16
; one past %in[workitem.id.x]. The result is stored to %out.
; Expected lowering, per the autogenerated checks below:
;   - verde (SI prefix): addr64 buffer load for %b, then 32-bit v_add_i32 and
;     v_lshlrev_b32.
;   - tonga (VI prefix): flat load for %b, then 16-bit v_add_u16 and
;     v_lshlrev_b16.
;   - redwood (EG prefix): ADD_INT by literal 3, LSHL, masked MEM_RAT MSKOR
;     store.
; Both loads carry glc on SI/VI and each is followed by its own s_waitcnt,
; matching the volatile qualifiers in the IR.
; NOTE(review): %gep.out is computed but never used, %a is read from %in
; rather than %gep, and the store targets %out. Confirm whether that is
; intentional; changing it requires regenerating every check line below.
373define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
374; SI-LABEL: shl_i16_computed_amount:
375; SI:       ; %bb.0:
376; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
377; SI-NEXT:    s_mov_b32 s7, 0xf000
378; SI-NEXT:    s_mov_b32 s6, -1
379; SI-NEXT:    s_mov_b32 s10, s6
380; SI-NEXT:    s_mov_b32 s11, s7
381; SI-NEXT:    s_waitcnt lgkmcnt(0)
382; SI-NEXT:    s_mov_b32 s8, s2
383; SI-NEXT:    s_mov_b32 s9, s3
384; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
385; SI-NEXT:    v_mov_b32_e32 v1, 0
386; SI-NEXT:    s_mov_b32 s14, 0
387; SI-NEXT:    s_mov_b32 s15, s7
388; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
389; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
390; SI-NEXT:    s_waitcnt vmcnt(0)
391; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 glc
392; SI-NEXT:    s_waitcnt vmcnt(0)
393; SI-NEXT:    s_mov_b32 s4, s0
394; SI-NEXT:    s_mov_b32 s5, s1
395; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
396; SI-NEXT:    v_lshlrev_b32_e32 v0, v0, v2
397; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
398; SI-NEXT:    s_endpgm
399;
400; VI-LABEL: shl_i16_computed_amount:
401; VI:       ; %bb.0:
402; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
403; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
404; VI-NEXT:    s_mov_b32 s7, 0xf000
405; VI-NEXT:    s_mov_b32 s6, -1
406; VI-NEXT:    s_mov_b32 s10, s6
407; VI-NEXT:    s_waitcnt lgkmcnt(0)
408; VI-NEXT:    v_mov_b32_e32 v1, s3
409; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
410; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
411; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
412; VI-NEXT:    s_mov_b32 s8, s2
413; VI-NEXT:    s_mov_b32 s9, s3
414; VI-NEXT:    s_mov_b32 s11, s7
415; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
416; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
417; VI-NEXT:    s_waitcnt vmcnt(0)
418; VI-NEXT:    flat_load_ushort v0, v[0:1] glc
419; VI-NEXT:    s_waitcnt vmcnt(0)
420; VI-NEXT:    s_mov_b32 s4, s0
421; VI-NEXT:    s_mov_b32 s5, s1
422; VI-NEXT:    v_add_u16_e32 v0, 3, v0
423; VI-NEXT:    v_lshlrev_b16_e32 v0, v0, v2
424; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
425; VI-NEXT:    s_endpgm
426;
427; EG-LABEL: shl_i16_computed_amount:
428; EG:       ; %bb.0:
429; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
430; EG-NEXT:    TEX 0 @8
431; EG-NEXT:    ALU 1, @13, KC0[CB0:0-32], KC1[]
432; EG-NEXT:    TEX 0 @10
433; EG-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
434; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
435; EG-NEXT:    CF_END
436; EG-NEXT:    PAD
437; EG-NEXT:    Fetch clause starting at 8:
438; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
439; EG-NEXT:    Fetch clause starting at 10:
440; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 2, #1
441; EG-NEXT:    ALU clause starting at 12:
442; EG-NEXT:     MOV * T1.X, KC0[2].Z,
443; EG-NEXT:    ALU clause starting at 13:
444; EG-NEXT:     LSHL * T0.W, T0.X, 1,
445; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
446; EG-NEXT:    ALU clause starting at 15:
447; EG-NEXT:     ADD_INT * T0.W, T0.X, literal.x,
448; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
449; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
450; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
451; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
452; EG-NEXT:     LSHL * T0.W, T1.X, PV.W,
453; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
454; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
455; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
456; EG-NEXT:     LSHL T0.X, PV.W, PS,
457; EG-NEXT:     LSHL * T0.W, literal.x, PS,
458; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
459; EG-NEXT:     MOV T0.Y, 0.0,
460; EG-NEXT:     MOV * T0.Z, 0.0,
461; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
462; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
463  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
464  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid
  ; %gep.out has no users in this function.
465  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
466  %b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i16 1
  ; %a uses the base pointer %in; only %b uses the per-workitem address.
467  %a = load volatile i16, i16 addrspace(1)* %in
468  %b = load volatile i16, i16 addrspace(1)* %b_ptr
469  %b.add = add i16 %b, 3
470  %result = shl i16 %a, %b.add
471  store i16 %result, i16 addrspace(1)* %out
472  ret void
473}
474
; shl_i16_i_s - i16 shift-left of a zeroext kernel argument %a by the
; constant 12; the result is stored to %out. No global load is involved.
; Expected lowering, per the autogenerated checks below:
;   - verde (SI prefix) and tonga (VI prefix): a single s_lshl_b32 by 12,
;     a copy to a VGPR, then buffer_store_short.
;   - redwood (EG prefix): LSHL by literal 12, an AND_INT with 61440
;     (0xF000), then a masked MEM_RAT MSKOR store.
475define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
476; SI-LABEL: shl_i16_i_s:
477; SI:       ; %bb.0:
478; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
479; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
480; SI-NEXT:    s_mov_b32 s3, 0xf000
481; SI-NEXT:    s_mov_b32 s2, -1
482; SI-NEXT:    s_waitcnt lgkmcnt(0)
483; SI-NEXT:    s_lshl_b32 s4, s4, 12
484; SI-NEXT:    v_mov_b32_e32 v0, s4
485; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
486; SI-NEXT:    s_endpgm
487;
488; VI-LABEL: shl_i16_i_s:
489; VI:       ; %bb.0:
490; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
491; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
492; VI-NEXT:    s_mov_b32 s3, 0xf000
493; VI-NEXT:    s_mov_b32 s2, -1
494; VI-NEXT:    s_waitcnt lgkmcnt(0)
495; VI-NEXT:    s_lshl_b32 s4, s4, 12
496; VI-NEXT:    v_mov_b32_e32 v0, s4
497; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
498; VI-NEXT:    s_endpgm
499;
500; EG-LABEL: shl_i16_i_s:
501; EG:       ; %bb.0:
502; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
503; EG-NEXT:    TEX 0 @6
504; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
505; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
506; EG-NEXT:    CF_END
507; EG-NEXT:    PAD
508; EG-NEXT:    Fetch clause starting at 6:
509; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
510; EG-NEXT:    ALU clause starting at 8:
511; EG-NEXT:     MOV * T0.X, 0.0,
512; EG-NEXT:    ALU clause starting at 9:
513; EG-NEXT:     BFE_INT T0.W, T0.X, 0.0, literal.x,
514; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
515; EG-NEXT:    16(2.242078e-44), 3(4.203895e-45)
516; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
517; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
518; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
519; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
520; EG-NEXT:    61440(8.609578e-41), 3(4.203895e-45)
521; EG-NEXT:     LSHL T0.X, PV.W, PS,
522; EG-NEXT:     LSHL * T0.W, literal.x, PS,
523; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
524; EG-NEXT:     MOV T0.Y, 0.0,
525; EG-NEXT:     MOV * T0.Z, 0.0,
526; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
527; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Constant shift amount: only the low 4 bits of %a survive in the i16 result.
528  %result = shl i16 %a, 12
529  store i16 %result, i16 addrspace(1)* %out
530  ret void
531}
532
; shl_v2i16 - <2 x i16> shift-left. %a is loaded from the base pointer %in;
; the shift amounts %b are loaded from the <2 x i16> element one past
; %in[workitem.id.x]. The result is stored to %out.
; Expected lowering, per the autogenerated checks below:
;   - verde (SI prefix): both dwords are split with v_lshrrev_b32 by 16, each
;     half is shifted with a 32-bit v_lshlrev_b32, and the halves are repacked
;     with v_and_b32 0xffff / v_lshlrev_b32 16 / v_or_b32.
;   - tonga (VI prefix): %a comes from a scalar s_load_dword and %b from a
;     flat load; the two halves use v_lshlrev_b16 (e64 and sdwa forms) and
;     are merged with v_or_b32.
;   - redwood (EG prefix): halves are separated with AND_INT 65535 / LSHR 16,
;     shifted with LSHL and recombined with OR_INT.
; NOTE(review): %gep.out is computed but never used, %a is read from %in
; rather than %gep, and the store targets %out. Confirm whether that is
; intentional; changing it requires regenerating every check line below.
533define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
534; SI-LABEL: shl_v2i16:
535; SI:       ; %bb.0:
536; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
537; SI-NEXT:    s_mov_b32 s7, 0xf000
538; SI-NEXT:    s_mov_b32 s6, -1
539; SI-NEXT:    s_mov_b32 s10, s6
540; SI-NEXT:    s_mov_b32 s11, s7
541; SI-NEXT:    s_waitcnt lgkmcnt(0)
542; SI-NEXT:    s_mov_b32 s8, s2
543; SI-NEXT:    s_mov_b32 s9, s3
544; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
545; SI-NEXT:    v_mov_b32_e32 v1, 0
546; SI-NEXT:    s_mov_b32 s14, 0
547; SI-NEXT:    s_mov_b32 s15, s7
548; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
549; SI-NEXT:    buffer_load_dword v2, off, s[8:11], 0
550; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
551; SI-NEXT:    s_mov_b32 s4, s0
552; SI-NEXT:    s_mov_b32 s5, s1
553; SI-NEXT:    s_waitcnt vmcnt(1)
554; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
555; SI-NEXT:    s_waitcnt vmcnt(0)
556; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
557; SI-NEXT:    v_lshlrev_b32_e32 v0, v0, v2
558; SI-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
559; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
560; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
561; SI-NEXT:    v_or_b32_e32 v0, v0, v1
562; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
563; SI-NEXT:    s_endpgm
564;
565; VI-LABEL: shl_v2i16:
566; VI:       ; %bb.0:
567; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
568; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
569; VI-NEXT:    s_waitcnt lgkmcnt(0)
570; VI-NEXT:    v_mov_b32_e32 v1, s3
571; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
572; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
573; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
574; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
575; VI-NEXT:    flat_load_dword v0, v[0:1]
576; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
577; VI-NEXT:    s_mov_b32 s3, 0xf000
578; VI-NEXT:    s_mov_b32 s2, -1
579; VI-NEXT:    s_waitcnt lgkmcnt(0)
580; VI-NEXT:    s_lshr_b32 s5, s4, 16
581; VI-NEXT:    v_mov_b32_e32 v1, s5
582; VI-NEXT:    s_waitcnt vmcnt(0)
583; VI-NEXT:    v_lshlrev_b16_e64 v2, v0, s4
584; VI-NEXT:    v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
585; VI-NEXT:    v_or_b32_e32 v0, v2, v0
586; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
587; VI-NEXT:    s_endpgm
588;
589; EG-LABEL: shl_v2i16:
590; EG:       ; %bb.0:
591; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
592; EG-NEXT:    TEX 0 @8
593; EG-NEXT:    ALU 0, @15, KC0[CB0:0-32], KC1[]
594; EG-NEXT:    TEX 0 @10
595; EG-NEXT:    ALU 12, @16, KC0[CB0:0-32], KC1[]
596; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
597; EG-NEXT:    CF_END
598; EG-NEXT:    PAD
599; EG-NEXT:    Fetch clause starting at 8:
600; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
601; EG-NEXT:    Fetch clause starting at 10:
602; EG-NEXT:     VTX_READ_32 T7.X, T7.X, 0, #1
603; EG-NEXT:    ALU clause starting at 12:
604; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
605; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
606; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
607; EG-NEXT:    ALU clause starting at 15:
608; EG-NEXT:     MOV * T7.X, KC0[2].Z,
609; EG-NEXT:    ALU clause starting at 16:
610; EG-NEXT:     AND_INT T0.Y, T0.X, literal.x,
611; EG-NEXT:     AND_INT T0.Z, T7.X, literal.x, BS:VEC_120/SCL_212
612; EG-NEXT:     LSHR T0.W, T0.X, literal.y,
613; EG-NEXT:     LSHR * T1.W, T7.X, literal.y,
614; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
615; EG-NEXT:     LSHL T0.W, PS, PV.W,
616; EG-NEXT:     LSHL * T1.W, PV.Z, PV.Y,
617; EG-NEXT:     AND_INT T1.W, PS, literal.x,
618; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
619; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
620; EG-NEXT:     OR_INT T0.X, PV.W, PS,
621; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
622; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
623  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
624  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
  ; %gep.out has no users in this function.
625  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
626  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
  ; %a uses the base pointer %in; only %b uses the per-workitem address.
627  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
628  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
629  %result = shl <2 x i16> %a, %b
630  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
631  ret void
632}
633
; shl_v4i16 - <4 x i16> shift-left with fully per-workitem addressing.
; %a is loaded from %in[workitem.id.x], the shift amounts %b from the next
; <4 x i16> element after it, and the result is stored to
; %out[workitem.id.x].
; Expected lowering, per the autogenerated checks below:
;   - verde (SI prefix): addr64 dwordx2 loads/stores; every 16-bit lane is
;     extracted with v_lshrrev_b32 by 16, shifted with a 32-bit
;     v_lshlrev_b32 and repacked with v_and_b32 / v_lshlrev_b32 16 / v_or_b32.
;   - tonga (VI prefix): flat dwordx2 loads/stores; lanes use v_lshlrev_b16
;     in e32 and sdwa (WORD_1) forms and are merged with v_or_b32.
;   - redwood (EG prefix): a long ALU clause of AND_INT / LSHR / LSHL /
;     OR_INT operations with a cacheless STORE_RAW of T10.XY.
634define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
635; SI-LABEL: shl_v4i16:
636; SI:       ; %bb.0:
637; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
638; SI-NEXT:    s_mov_b32 s7, 0xf000
639; SI-NEXT:    s_mov_b32 s6, 0
640; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
641; SI-NEXT:    v_mov_b32_e32 v1, 0
642; SI-NEXT:    s_waitcnt lgkmcnt(0)
643; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
644; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
645; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
646; SI-NEXT:    s_mov_b32 s4, 0xffff
647; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
648; SI-NEXT:    s_waitcnt vmcnt(1)
649; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
650; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
651; SI-NEXT:    s_waitcnt vmcnt(0)
652; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
653; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
654; SI-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
655; SI-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
656; SI-NEXT:    v_lshlrev_b32_e32 v4, v9, v7
657; SI-NEXT:    v_lshlrev_b32_e32 v5, v8, v6
658; SI-NEXT:    v_and_b32_e32 v3, s4, v3
659; SI-NEXT:    v_and_b32_e32 v2, s4, v2
660; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
661; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
662; SI-NEXT:    v_or_b32_e32 v3, v3, v4
663; SI-NEXT:    v_or_b32_e32 v2, v2, v5
664; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
665; SI-NEXT:    s_endpgm
666;
667; VI-LABEL: shl_v4i16:
668; VI:       ; %bb.0:
669; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
670; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
671; VI-NEXT:    s_waitcnt lgkmcnt(0)
672; VI-NEXT:    v_mov_b32_e32 v1, s3
673; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
674; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
675; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
676; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
677; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
678; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
679; VI-NEXT:    v_mov_b32_e32 v5, s1
680; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
681; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
682; VI-NEXT:    s_waitcnt vmcnt(0)
683; VI-NEXT:    v_lshlrev_b16_e32 v6, v3, v1
684; VI-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
685; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v0
686; VI-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
687; VI-NEXT:    v_or_b32_e32 v1, v6, v1
688; VI-NEXT:    v_or_b32_e32 v0, v3, v0
689; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
690; VI-NEXT:    s_endpgm
691;
692; EG-LABEL: shl_v4i16:
693; EG:       ; %bb.0:
694; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
695; EG-NEXT:    TEX 0 @8
696; EG-NEXT:    ALU 3, @15, KC0[], KC1[]
697; EG-NEXT:    TEX 0 @10
698; EG-NEXT:    ALU 49, @19, KC0[CB0:0-32], KC1[]
699; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1
700; EG-NEXT:    CF_END
701; EG-NEXT:    PAD
702; EG-NEXT:    Fetch clause starting at 8:
703; EG-NEXT:     VTX_READ_64 T10.XY, T0.X, 0, #1
704; EG-NEXT:    Fetch clause starting at 10:
705; EG-NEXT:     VTX_READ_64 T10.XY, T0.X, 8, #1
706; EG-NEXT:    ALU clause starting at 12:
707; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
708; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
709; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
710; EG-NEXT:    ALU clause starting at 15:
711; EG-NEXT:     MOV T4.X, T10.X,
712; EG-NEXT:     MOV * T5.X, T10.Y,
713; EG-NEXT:     MOV T0.Y, PV.X,
714; EG-NEXT:     MOV * T0.Z, PS,
715; EG-NEXT:    ALU clause starting at 19:
716; EG-NEXT:     MOV T2.X, T10.X,
717; EG-NEXT:     MOV * T3.X, T10.Y,
718; EG-NEXT:     MOV T0.X, T6.X,
719; EG-NEXT:     MOV * T1.Y, PV.X,
720; EG-NEXT:     AND_INT T1.W, PV.Y, literal.x,
721; EG-NEXT:     AND_INT * T2.W, T0.Y, literal.x,
722; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
723; EG-NEXT:     LSHL * T1.W, PS, PV.W,
724; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
725; EG-NEXT:     AND_INT * T2.W, T0.X, literal.y,
726; EG-NEXT:    65535(9.183409e-41), -65536(nan)
727; EG-NEXT:     OR_INT * T1.W, PS, PV.W,
728; EG-NEXT:     MOV T0.X, T3.X,
729; EG-NEXT:     MOV * T6.X, PV.W,
730; EG-NEXT:     MOV T1.Z, PS,
731; EG-NEXT:     LSHR T1.W, T1.Y, literal.x,
732; EG-NEXT:     LSHR * T2.W, T0.Y, literal.x,
733; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
734; EG-NEXT:     LSHL T1.W, PS, PV.W,
735; EG-NEXT:     AND_INT * T2.W, PV.Z, literal.x,
736; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
737; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
738; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
739; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
740; EG-NEXT:     MOV T6.X, PV.W,
741; EG-NEXT:     MOV T0.Y, T7.X,
742; EG-NEXT:     AND_INT T1.W, T0.X, literal.x, BS:VEC_120/SCL_212
743; EG-NEXT:     AND_INT * T2.W, T0.Z, literal.x,
744; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
745; EG-NEXT:     LSHL T1.W, PS, PV.W,
746; EG-NEXT:     AND_INT * T2.W, PV.Y, literal.x,
747; EG-NEXT:    -65536(nan), 0(0.000000e+00)
748; EG-NEXT:     AND_INT * T1.W, PV.W, literal.x,
749; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
750; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
751; EG-NEXT:     MOV * T7.X, PV.W,
752; EG-NEXT:     MOV T0.Y, PV.X,
753; EG-NEXT:     LSHR T1.W, T0.X, literal.x,
754; EG-NEXT:     LSHR * T2.W, T0.Z, literal.x,
755; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
756; EG-NEXT:     LSHL * T1.W, PS, PV.W,
757; EG-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
758; EG-NEXT:     LSHL T1.W, PV.W, literal.y,
759; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
760; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
761; EG-NEXT:     LSHR T0.X, PS, literal.x,
762; EG-NEXT:     OR_INT * T10.Y, PV.Z, PV.W,
763; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
764; EG-NEXT:     MOV T7.X, PV.Y,
765; EG-NEXT:     MOV * T10.X, T6.X,
766  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  ; Input and output are both indexed by the workitem id.
767  %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
768  %gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid
769  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1
770  %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep
771  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
772  %result = shl <4 x i16> %a, %b
773  store <4 x i16> %result, <4 x i16> addrspace(1)* %gep.out
774  ret void
775}
776
; shl_i64: variable 64-bit left shift. The value is loaded from %in, the shift
; amount from the i64 that follows it (%in + 1), and the result is stored to
; %out. The verde checks (that run passes -amdgpu-scalarize-global-loads=false)
; expect one buffer load feeding v_lshl_b64; the tonga checks expect a scalar
; load feeding s_lshl_b64. The redwood checks expect an expansion into 32-bit
; LSHL / BIT_ALIGN_INT with CNDE_INT selects keyed on (amount & 32).
777define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
778; SI-LABEL: shl_i64:
779; SI:       ; %bb.0:
780; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
781; SI-NEXT:    s_mov_b32 s7, 0xf000
782; SI-NEXT:    s_mov_b32 s6, -1
783; SI-NEXT:    s_mov_b32 s10, s6
784; SI-NEXT:    s_mov_b32 s11, s7
785; SI-NEXT:    s_waitcnt lgkmcnt(0)
786; SI-NEXT:    s_mov_b32 s8, s2
787; SI-NEXT:    s_mov_b32 s9, s3
788; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
789; SI-NEXT:    s_mov_b32 s4, s0
790; SI-NEXT:    s_mov_b32 s5, s1
791; SI-NEXT:    s_waitcnt vmcnt(0)
792; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
793; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
794; SI-NEXT:    s_endpgm
795;
796; VI-LABEL: shl_i64:
797; VI:       ; %bb.0:
798; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
799; VI-NEXT:    s_waitcnt lgkmcnt(0)
800; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
801; VI-NEXT:    s_mov_b32 s3, 0xf000
802; VI-NEXT:    s_mov_b32 s2, -1
803; VI-NEXT:    s_waitcnt lgkmcnt(0)
804; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
805; VI-NEXT:    v_mov_b32_e32 v0, s4
806; VI-NEXT:    v_mov_b32_e32 v1, s5
807; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
808; VI-NEXT:    s_endpgm
809;
810; EG-LABEL: shl_i64:
811; EG:       ; %bb.0:
812; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
813; EG-NEXT:    TEX 0 @6
814; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
815; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
816; EG-NEXT:    CF_END
817; EG-NEXT:    PAD
818; EG-NEXT:    Fetch clause starting at 6:
819; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
820; EG-NEXT:    ALU clause starting at 8:
821; EG-NEXT:     MOV * T0.X, KC0[2].Z,
822; EG-NEXT:    ALU clause starting at 9:
823; EG-NEXT:     AND_INT T1.Y, T0.Z, literal.x,
824; EG-NEXT:     LSHR T1.Z, T0.Y, 1,
825; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
826; EG-NEXT:     NOT_INT * T1.W, T0.Z,
827; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
828; EG-NEXT:     BIT_ALIGN_INT T1.Z, PV.Z, PV.W, PS,
829; EG-NEXT:     LSHL T0.W, T0.X, PV.Y,
830; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
831; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
832; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
833; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
834; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
835; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
836  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
837  %a = load i64, i64 addrspace(1)* %in
838  %b = load i64, i64 addrspace(1)* %b_ptr
839  %result = shl i64 %a, %b
840  store i64 %result, i64 addrspace(1)* %out
841  ret void
842}
843
; shl_v2i64: element-wise variable left shift of <2 x i64>. The value vector is
; loaded from %in, the shift-amount vector from the next <2 x i64> (%in + 1),
; and the result is stored to %out. The GCN checks expect one 64-bit shift per
; element (v_lshl_b64 on verde, s_lshl_b64 on tonga), each taking only the low
; dword of the corresponding shift amount as its count operand. The redwood
; checks expect the per-element 32-bit LSHL / BIT_ALIGN_INT / CNDE_INT
; expansion.
844define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
845; SI-LABEL: shl_v2i64:
846; SI:       ; %bb.0:
847; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
848; SI-NEXT:    s_mov_b32 s7, 0xf000
849; SI-NEXT:    s_mov_b32 s6, -1
850; SI-NEXT:    s_mov_b32 s10, s6
851; SI-NEXT:    s_mov_b32 s11, s7
852; SI-NEXT:    s_waitcnt lgkmcnt(0)
853; SI-NEXT:    s_mov_b32 s8, s2
854; SI-NEXT:    s_mov_b32 s9, s3
855; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
856; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
857; SI-NEXT:    s_mov_b32 s4, s0
858; SI-NEXT:    s_mov_b32 s5, s1
859; SI-NEXT:    s_waitcnt vmcnt(0)
860; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
861; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
862; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
863; SI-NEXT:    s_endpgm
864;
865; VI-LABEL: shl_v2i64:
866; VI:       ; %bb.0:
867; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
868; VI-NEXT:    s_waitcnt lgkmcnt(0)
869; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
870; VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x10
871; VI-NEXT:    s_mov_b32 s3, 0xf000
872; VI-NEXT:    s_mov_b32 s2, -1
873; VI-NEXT:    s_waitcnt lgkmcnt(0)
874; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s10
875; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
876; VI-NEXT:    v_mov_b32_e32 v0, s4
877; VI-NEXT:    v_mov_b32_e32 v1, s5
878; VI-NEXT:    v_mov_b32_e32 v2, s6
879; VI-NEXT:    v_mov_b32_e32 v3, s7
880; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
881; VI-NEXT:    s_endpgm
882;
883; EG-LABEL: shl_v2i64:
884; EG:       ; %bb.0:
885; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
886; EG-NEXT:    TEX 1 @6
887; EG-NEXT:    ALU 22, @11, KC0[CB0:0-32], KC1[]
888; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
889; EG-NEXT:    CF_END
890; EG-NEXT:    PAD
891; EG-NEXT:    Fetch clause starting at 6:
892; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
893; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
894; EG-NEXT:    ALU clause starting at 10:
895; EG-NEXT:     MOV * T0.X, KC0[2].Z,
896; EG-NEXT:    ALU clause starting at 11:
897; EG-NEXT:     AND_INT T1.Y, T1.Z, literal.x,
898; EG-NEXT:     LSHR T2.Z, T0.W, 1,
899; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1,
900; EG-NEXT:     NOT_INT * T1.W, T1.Z,
901; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
902; EG-NEXT:     BIT_ALIGN_INT T0.W, PV.Z, PV.W, PS,
903; EG-NEXT:     LSHL * T1.W, T0.Z, PV.Y,
904; EG-NEXT:     AND_INT T2.X, T1.Z, literal.x,
905; EG-NEXT:     AND_INT T1.Y, T1.X, literal.y,
906; EG-NEXT:     LSHR T0.Z, T0.Y, 1,
907; EG-NEXT:     BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1,
908; EG-NEXT:     NOT_INT * T3.W, T1.X,
909; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
910; EG-NEXT:     BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS,
911; EG-NEXT:     LSHL T0.Z, T0.X, PV.Y,
912; EG-NEXT:     AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212
913; EG-NEXT:     CNDE_INT * T3.W, PV.X, T0.W, T1.W,
914; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
915; EG-NEXT:     CNDE_INT T3.Y, PV.W, PV.Y, PV.Z,
916; EG-NEXT:     CNDE_INT * T3.Z, T2.X, T1.W, 0.0,
917; EG-NEXT:     CNDE_INT T3.X, T2.W, T0.Z, 0.0,
918; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
919; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
920  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
921  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
922  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
923  %result = shl <2 x i64> %a, %b
924  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
925  ret void
926}
927
; shl_v4i64: element-wise variable left shift of <4 x i64>. The value vector is
; loaded from %in, the shift-amount vector from the next <4 x i64> (%in + 1),
; and the 32-byte result is stored to %out as two 16-byte stores (offsets 0 and
; 16). The GCN checks expect four 64-bit shifts (v_lshl_b64 on verde,
; s_lshl_b64 on tonga), each using only the low dword of its shift amount.
; In the verde checks the load destinations v[4:7] and v[7:10] overlap in v7;
; of v[4:7] only v4 and v6 are read by the shifts that follow.
; The redwood checks expect the per-element 32-bit LSHL / BIT_ALIGN_INT /
; CNDE_INT expansion.
928define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
929; SI-LABEL: shl_v4i64:
930; SI:       ; %bb.0:
931; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
932; SI-NEXT:    s_mov_b32 s3, 0xf000
933; SI-NEXT:    s_mov_b32 s2, -1
934; SI-NEXT:    s_mov_b32 s10, s2
935; SI-NEXT:    s_mov_b32 s11, s3
936; SI-NEXT:    s_waitcnt lgkmcnt(0)
937; SI-NEXT:    s_mov_b32 s8, s6
938; SI-NEXT:    s_mov_b32 s9, s7
939; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
940; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
941; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
942; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
943; SI-NEXT:    s_mov_b32 s0, s4
944; SI-NEXT:    s_mov_b32 s1, s5
945; SI-NEXT:    s_waitcnt vmcnt(2)
946; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
947; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
948; SI-NEXT:    s_waitcnt vmcnt(0)
949; SI-NEXT:    v_lshl_b64 v[9:10], v[9:10], v13
950; SI-NEXT:    v_lshl_b64 v[7:8], v[7:8], v11
951; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
952; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
953; SI-NEXT:    s_endpgm
954;
955; VI-LABEL: shl_v4i64:
956; VI:       ; %bb.0:
957; VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
958; VI-NEXT:    s_waitcnt lgkmcnt(0)
959; VI-NEXT:    s_load_dwordx8 s[0:7], s[18:19], 0x0
960; VI-NEXT:    s_load_dwordx8 s[8:15], s[18:19], 0x20
961; VI-NEXT:    s_mov_b32 s19, 0xf000
962; VI-NEXT:    s_mov_b32 s18, -1
963; VI-NEXT:    s_waitcnt lgkmcnt(0)
964; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s14
965; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s12
966; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s10
967; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
968; VI-NEXT:    v_mov_b32_e32 v0, s4
969; VI-NEXT:    v_mov_b32_e32 v1, s5
970; VI-NEXT:    v_mov_b32_e32 v2, s6
971; VI-NEXT:    v_mov_b32_e32 v3, s7
972; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
973; VI-NEXT:    s_nop 0
974; VI-NEXT:    v_mov_b32_e32 v0, s0
975; VI-NEXT:    v_mov_b32_e32 v1, s1
976; VI-NEXT:    v_mov_b32_e32 v2, s2
977; VI-NEXT:    v_mov_b32_e32 v3, s3
978; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
979; VI-NEXT:    s_endpgm
980;
981; EG-LABEL: shl_v4i64:
982; EG:       ; %bb.0:
983; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
984; EG-NEXT:    TEX 3 @6
985; EG-NEXT:    ALU 47, @15, KC0[CB0:0-32], KC1[]
986; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
987; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
988; EG-NEXT:    CF_END
989; EG-NEXT:    Fetch clause starting at 6:
990; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
991; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 0, #1
992; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 32, #1
993; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
994; EG-NEXT:    ALU clause starting at 14:
995; EG-NEXT:     MOV * T0.X, KC0[2].Z,
996; EG-NEXT:    ALU clause starting at 15:
997; EG-NEXT:     AND_INT T4.Z, T1.Z, literal.x,
998; EG-NEXT:     LSHR T1.W, T0.W, 1,
999; EG-NEXT:     NOT_INT * T3.W, T1.Z,
1000; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1001; EG-NEXT:     BIT_ALIGN_INT T4.X, T0.W, T0.Z, 1,
1002; EG-NEXT:     AND_INT T1.Y, T3.Z, literal.x, BS:VEC_201
1003; EG-NEXT:     LSHR T5.Z, T2.W, 1, BS:VEC_120/SCL_212
1004; EG-NEXT:     BIT_ALIGN_INT T0.W, T2.W, T2.Z, 1, BS:VEC_102/SCL_221
1005; EG-NEXT:     NOT_INT * T2.W, T3.Z,
1006; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1007; EG-NEXT:     BIT_ALIGN_INT T3.Y, PV.Z, PV.W, PS,
1008; EG-NEXT:     LSHL T2.Z, T2.Z, PV.Y,
1009; EG-NEXT:     BIT_ALIGN_INT T0.W, T1.W, PV.X, T3.W,
1010; EG-NEXT:     LSHL * T1.W, T0.Z, T4.Z,
1011; EG-NEXT:     AND_INT T4.X, T1.Z, literal.x,
1012; EG-NEXT:     AND_INT T1.Y, T1.X, literal.y,
1013; EG-NEXT:     LSHR T0.Z, T0.Y, 1,
1014; EG-NEXT:     BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1,
1015; EG-NEXT:     NOT_INT * T3.W, T1.X,
1016; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
1017; EG-NEXT:     AND_INT T5.X, T3.Z, literal.x,
1018; EG-NEXT:     BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS,
1019; EG-NEXT:     LSHL T0.Z, T0.X, PV.Y,
1020; EG-NEXT:     AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212
1021; EG-NEXT:     CNDE_INT * T4.W, PV.X, T0.W, T1.W,
1022; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1023; EG-NEXT:     AND_INT T0.X, T3.X, literal.x,
1024; EG-NEXT:     CNDE_INT T4.Y, PV.W, PV.Y, PV.Z,
1025; EG-NEXT:     LSHR T1.Z, T2.Y, 1,
1026; EG-NEXT:     BIT_ALIGN_INT T0.W, T2.Y, T2.X, 1,
1027; EG-NEXT:     NOT_INT * T3.W, T3.X,
1028; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1029; EG-NEXT:     BIT_ALIGN_INT T1.X, PV.Z, PV.W, PS,
1030; EG-NEXT:     LSHL T0.Y, T2.X, PV.X,
1031; EG-NEXT:     CNDE_INT T4.Z, T4.X, T1.W, 0.0, BS:VEC_120/SCL_212
1032; EG-NEXT:     AND_INT * T0.W, T3.X, literal.x, BS:VEC_201
1033; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1034; EG-NEXT:     CNDE_INT * T1.W, T5.X, T3.Y, T2.Z,
1035; EG-NEXT:     CNDE_INT T4.X, T2.W, T0.Z, 0.0,
1036; EG-NEXT:     CNDE_INT T1.Y, T0.W, T1.X, T0.Y, BS:VEC_120/SCL_212
1037; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
1038; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1039; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
1040; EG-NEXT:     CNDE_INT T1.Z, T5.X, T2.Z, 0.0,
1041; EG-NEXT:     CNDE_INT * T1.X, T0.W, T0.Y, 0.0,
1042; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1043; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
1044; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1045  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
1046  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
1047  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
1048  %result = shl <4 x i64> %a, %b
1049  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
1050  ret void
1051}
1052
1053; Make sure load width gets reduced to i32 load.
; s_shl_32_i64: %a is an i64 kernel argument (placed after an [8 x i32] padding
; argument) shifted left by the constant 32. The result's low dword is then
; zero and its high dword is the low dword of %a, so every target's checks
; expect a single dword read of the argument (s_load_dword on GCN, one KC0
; read on redwood), a zero for the low half, and no shift instruction.
1054define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
1055; SI-LABEL: s_shl_32_i64:
1056; SI:       ; %bb.0:
1057; SI-NEXT:    s_load_dword s4, s[0:1], 0x13
1058; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1059; SI-NEXT:    s_mov_b32 s3, 0xf000
1060; SI-NEXT:    s_mov_b32 s2, -1
1061; SI-NEXT:    v_mov_b32_e32 v0, 0
1062; SI-NEXT:    s_waitcnt lgkmcnt(0)
1063; SI-NEXT:    v_mov_b32_e32 v1, s4
1064; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1065; SI-NEXT:    s_endpgm
1066;
1067; VI-LABEL: s_shl_32_i64:
1068; VI:       ; %bb.0:
1069; VI-NEXT:    s_load_dword s4, s[0:1], 0x4c
1070; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1071; VI-NEXT:    s_mov_b32 s3, 0xf000
1072; VI-NEXT:    s_mov_b32 s2, -1
1073; VI-NEXT:    v_mov_b32_e32 v0, 0
1074; VI-NEXT:    s_waitcnt lgkmcnt(0)
1075; VI-NEXT:    v_mov_b32_e32 v1, s4
1076; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1077; VI-NEXT:    s_endpgm
1078;
1079; EG-LABEL: s_shl_32_i64:
1080; EG:       ; %bb.0:
1081; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
1082; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1083; EG-NEXT:    CF_END
1084; EG-NEXT:    PAD
1085; EG-NEXT:    ALU clause starting at 4:
1086; EG-NEXT:     MOV * T0.Y, KC0[4].W,
1087; EG-NEXT:     MOV T0.X, 0.0,
1088; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1089; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1090  %result = shl i64 %a, 32
1091  store i64 %result, i64 addrspace(1)* %out
1092  ret void
1093}
1094
; v_shl_32_i64: loads an i64 from %in[%tid], shifts it left by the constant 32
; and stores it to %out[%tid]. As only the low dword of the loaded value can
; reach the result, the checks expect a single dword load (buffer_load_dword,
; s_load_dword, VTX_READ_32), a zero low half, and no shift instruction.
; NOTE(review): despite the v_ prefix, %tid comes from
; llvm.amdgcn.workgroup.id.x rather than workitem.id.x, and the GCN checks
; compute the address with scalar instructions - confirm this is intentional.
1095define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
1096; SI-LABEL: v_shl_32_i64:
1097; SI:       ; %bb.0:
1098; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1099; SI-NEXT:    s_ashr_i32 s3, s2, 31
1100; SI-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
1101; SI-NEXT:    v_mov_b32_e32 v0, s0
1102; SI-NEXT:    s_mov_b32 s11, 0xf000
1103; SI-NEXT:    s_mov_b32 s10, 0
1104; SI-NEXT:    s_waitcnt lgkmcnt(0)
1105; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
1106; SI-NEXT:    v_mov_b32_e32 v1, s1
1107; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
1108; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
1109; SI-NEXT:    v_mov_b32_e32 v2, 0
1110; SI-NEXT:    s_waitcnt vmcnt(0)
1111; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
1112; SI-NEXT:    s_endpgm
1113;
1114; VI-LABEL: v_shl_32_i64:
1115; VI:       ; %bb.0:
1116; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1117; VI-NEXT:    s_ashr_i32 s3, s2, 31
1118; VI-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
1119; VI-NEXT:    v_mov_b32_e32 v0, 0
1120; VI-NEXT:    s_waitcnt lgkmcnt(0)
1121; VI-NEXT:    s_add_u32 s2, s6, s0
1122; VI-NEXT:    s_addc_u32 s3, s7, s1
1123; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1124; VI-NEXT:    s_add_u32 s0, s4, s0
1125; VI-NEXT:    s_addc_u32 s1, s5, s1
1126; VI-NEXT:    v_mov_b32_e32 v3, s1
1127; VI-NEXT:    v_mov_b32_e32 v2, s0
1128; VI-NEXT:    s_waitcnt lgkmcnt(0)
1129; VI-NEXT:    v_mov_b32_e32 v1, s2
1130; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1131; VI-NEXT:    s_endpgm
1132;
1133; EG-LABEL: v_shl_32_i64:
1134; EG:       ; %bb.0:
1135; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1136; EG-NEXT:    TEX 0 @6
1137; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
1138; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
1139; EG-NEXT:    CF_END
1140; EG-NEXT:    PAD
1141; EG-NEXT:    Fetch clause starting at 6:
1142; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1143; EG-NEXT:    ALU clause starting at 8:
1144; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1145; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1146; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1147; EG-NEXT:    ALU clause starting at 11:
1148; EG-NEXT:     MOV T1.X, 0.0,
1149; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
1150; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
1151; EG-NEXT:     MOV * T1.Y, T0.X,
1152; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1153  %tid = call i32 @llvm.amdgcn.workgroup.id.x() #0
1154  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
1155  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
1156  %a = load i64, i64 addrspace(1)* %gep.in
1157  %result = shl i64 %a, 32
1158  store i64 %result, i64 addrspace(1)* %gep.out
1159  ret void
1160}
1161
; s_shl_constant_i64: shifts the constant 281474976710655 (0xffffffffffff) left
; by the i64 kernel argument %a and stores the result to %out. The GCN checks
; expect the constant to be built in an SGPR pair (high dword 0xffff, low dword
; copied from a register holding -1) and shifted with s_lshl_b64 using the low
; dword of %a. The redwood checks expect the 32-bit expansion operating on
; literals.
1162define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
1163; SI-LABEL: s_shl_constant_i64:
1164; SI:       ; %bb.0:
1165; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1166; SI-NEXT:    s_mov_b32 s6, -1
1167; SI-NEXT:    s_mov_b32 s9, 0xffff
1168; SI-NEXT:    s_mov_b32 s8, s6
1169; SI-NEXT:    s_mov_b32 s7, 0xf000
1170; SI-NEXT:    s_waitcnt lgkmcnt(0)
1171; SI-NEXT:    s_mov_b32 s4, s0
1172; SI-NEXT:    s_mov_b32 s5, s1
1173; SI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
1174; SI-NEXT:    v_mov_b32_e32 v0, s0
1175; SI-NEXT:    v_mov_b32_e32 v1, s1
1176; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1177; SI-NEXT:    s_endpgm
1178;
1179; VI-LABEL: s_shl_constant_i64:
1180; VI:       ; %bb.0:
1181; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1182; VI-NEXT:    s_mov_b32 s6, -1
1183; VI-NEXT:    s_mov_b32 s9, 0xffff
1184; VI-NEXT:    s_mov_b32 s8, s6
1185; VI-NEXT:    s_mov_b32 s7, 0xf000
1186; VI-NEXT:    s_waitcnt lgkmcnt(0)
1187; VI-NEXT:    s_mov_b32 s4, s0
1188; VI-NEXT:    s_mov_b32 s5, s1
1189; VI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
1190; VI-NEXT:    v_mov_b32_e32 v0, s0
1191; VI-NEXT:    v_mov_b32_e32 v1, s1
1192; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1193; VI-NEXT:    s_endpgm
1194;
1195; EG-LABEL: s_shl_constant_i64:
1196; EG:       ; %bb.0:
1197; EG-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
1198; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1199; EG-NEXT:    CF_END
1200; EG-NEXT:    PAD
1201; EG-NEXT:    ALU clause starting at 4:
1202; EG-NEXT:     AND_INT T0.Z, KC0[2].W, literal.x,
1203; EG-NEXT:     MOV T0.W, literal.y,
1204; EG-NEXT:     NOT_INT * T1.W, KC0[2].W,
1205; EG-NEXT:    31(4.344025e-44), -1(nan)
1206; EG-NEXT:     BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
1207; EG-NEXT:     LSHL T0.W, literal.y, PV.Z,
1208; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.z,
1209; EG-NEXT:    32767(4.591635e-41), -1(nan)
1210; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1211; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1212; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1213; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1214; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1215  %shl = shl i64 281474976710655, %a
1216  store i64 %shl, i64 addrspace(1)* %out, align 8
1217  ret void
1218}
1219
; v_shl_constant_i64: shifts the constant 1231231234567 (0x11e_ab19b207) left
; by an i64 amount loaded from %aptr and stores the result to %out. The GCN
; checks expect the constant to be materialized in an SGPR pair (low dword
; 0xab19b207, high dword 0x11e) and only the low dword of the amount to be
; loaded; verde then shifts with v_lshl_b64, tonga with s_lshl_b64. The redwood
; checks expect the 32-bit expansion operating on literals.
1220define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1221; SI-LABEL: v_shl_constant_i64:
1222; SI:       ; %bb.0:
1223; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1224; SI-NEXT:    s_mov_b32 s7, 0xf000
1225; SI-NEXT:    s_mov_b32 s6, -1
1226; SI-NEXT:    s_mov_b32 s10, s6
1227; SI-NEXT:    s_mov_b32 s11, s7
1228; SI-NEXT:    s_waitcnt lgkmcnt(0)
1229; SI-NEXT:    s_mov_b32 s8, s2
1230; SI-NEXT:    s_mov_b32 s9, s3
1231; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1232; SI-NEXT:    s_mov_b32 s2, 0xab19b207
1233; SI-NEXT:    s_movk_i32 s3, 0x11e
1234; SI-NEXT:    s_mov_b32 s4, s0
1235; SI-NEXT:    s_mov_b32 s5, s1
1236; SI-NEXT:    s_waitcnt vmcnt(0)
1237; SI-NEXT:    v_lshl_b64 v[0:1], s[2:3], v0
1238; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1239; SI-NEXT:    s_endpgm
1240;
1241; VI-LABEL: v_shl_constant_i64:
1242; VI:       ; %bb.0:
1243; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1244; VI-NEXT:    s_mov_b32 s7, 0xf000
1245; VI-NEXT:    s_mov_b32 s6, -1
1246; VI-NEXT:    s_waitcnt lgkmcnt(0)
1247; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1248; VI-NEXT:    s_mov_b32 s4, s0
1249; VI-NEXT:    s_mov_b32 s5, s1
1250; VI-NEXT:    s_mov_b32 s0, 0xab19b207
1251; VI-NEXT:    s_movk_i32 s1, 0x11e
1252; VI-NEXT:    s_waitcnt lgkmcnt(0)
1253; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1254; VI-NEXT:    v_mov_b32_e32 v0, s0
1255; VI-NEXT:    v_mov_b32_e32 v1, s1
1256; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1257; VI-NEXT:    s_endpgm
1258;
1259; EG-LABEL: v_shl_constant_i64:
1260; EG:       ; %bb.0:
1261; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1262; EG-NEXT:    TEX 0 @6
1263; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1264; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1265; EG-NEXT:    CF_END
1266; EG-NEXT:    PAD
1267; EG-NEXT:    Fetch clause starting at 6:
1268; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1269; EG-NEXT:    ALU clause starting at 8:
1270; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1271; EG-NEXT:    ALU clause starting at 9:
1272; EG-NEXT:     NOT_INT T0.Z, T0.X,
1273; EG-NEXT:     MOV T0.W, literal.x,
1274; EG-NEXT:     AND_INT * T1.W, T0.X, literal.y,
1275; EG-NEXT:    1435293955(1.935796e+13), 31(4.344025e-44)
1276; EG-NEXT:     LSHL T1.Z, literal.x, PS,
1277; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.y, PV.W, PV.Z,
1278; EG-NEXT:     AND_INT * T1.W, T0.X, literal.z,
1279; EG-NEXT:    -1424379385(-5.460358e-13), 143(2.003857e-43)
1280; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1281; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1282; EG-NEXT:     CNDE_INT T0.X, T1.W, T1.Z, 0.0,
1283; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1284; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1285  %a = load i64, i64 addrspace(1)* %aptr, align 8
1286  %shl = shl i64 1231231234567, %a
1287  store i64 %shl, i64 addrspace(1)* %out, align 8
1288  ret void
1289}
1290
; v_shl_i64_32_bit_constant: shifts the constant 1234567 (0x12d687, which fits
; in 32 bits) left by an i64 amount loaded from %aptr and stores the result to
; %out. The GCN checks expect the constant to be materialized with a single
; s_mov_b64 carrying the literal 0x12d687, and only the low dword of the
; amount to be loaded; verde then shifts with v_lshl_b64, tonga with
; s_lshl_b64. The redwood checks expect the 32-bit expansion with a 0.0 high
; half.
1291define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1292; SI-LABEL: v_shl_i64_32_bit_constant:
1293; SI:       ; %bb.0:
1294; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1295; SI-NEXT:    s_mov_b32 s7, 0xf000
1296; SI-NEXT:    s_mov_b32 s6, -1
1297; SI-NEXT:    s_mov_b32 s10, s6
1298; SI-NEXT:    s_mov_b32 s11, s7
1299; SI-NEXT:    s_waitcnt lgkmcnt(0)
1300; SI-NEXT:    s_mov_b32 s8, s2
1301; SI-NEXT:    s_mov_b32 s9, s3
1302; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1303; SI-NEXT:    s_mov_b64 s[2:3], 0x12d687
1304; SI-NEXT:    s_mov_b32 s4, s0
1305; SI-NEXT:    s_mov_b32 s5, s1
1306; SI-NEXT:    s_waitcnt vmcnt(0)
1307; SI-NEXT:    v_lshl_b64 v[0:1], s[2:3], v0
1308; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1309; SI-NEXT:    s_endpgm
1310;
1311; VI-LABEL: v_shl_i64_32_bit_constant:
1312; VI:       ; %bb.0:
1313; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1314; VI-NEXT:    s_mov_b32 s7, 0xf000
1315; VI-NEXT:    s_mov_b32 s6, -1
1316; VI-NEXT:    s_waitcnt lgkmcnt(0)
1317; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1318; VI-NEXT:    s_mov_b32 s4, s0
1319; VI-NEXT:    s_mov_b32 s5, s1
1320; VI-NEXT:    s_mov_b64 s[0:1], 0x12d687
1321; VI-NEXT:    s_waitcnt lgkmcnt(0)
1322; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1323; VI-NEXT:    v_mov_b32_e32 v0, s0
1324; VI-NEXT:    v_mov_b32_e32 v1, s1
1325; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1326; VI-NEXT:    s_endpgm
1327;
1328; EG-LABEL: v_shl_i64_32_bit_constant:
1329; EG:       ; %bb.0:
1330; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1331; EG-NEXT:    TEX 0 @6
1332; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
1333; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1334; EG-NEXT:    CF_END
1335; EG-NEXT:    PAD
1336; EG-NEXT:    Fetch clause starting at 6:
1337; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1338; EG-NEXT:    ALU clause starting at 8:
1339; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1340; EG-NEXT:    ALU clause starting at 9:
1341; EG-NEXT:     AND_INT T0.W, T0.X, literal.x,
1342; EG-NEXT:     NOT_INT * T1.W, T0.X,
1343; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1344; EG-NEXT:     BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
1345; EG-NEXT:     LSHL T0.W, literal.y, PV.W,
1346; EG-NEXT:     AND_INT * T1.W, T0.X, literal.z,
1347; EG-NEXT:    617283(8.649977e-40), 1234567(1.729997e-39)
1348; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1349; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1350; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1351; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1352; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1353  %a = load i64, i64 addrspace(1)* %aptr, align 8
1354  %shl = shl i64 1234567, %a
1355  store i64 %shl, i64 addrspace(1)* %out, align 8
1356  ret void
1357}
1358
; v_shl_inline_imm_64_i64: shifts the constant 64 left by an i64 amount loaded
; from %aptr and stores the result to %out. The GCN checks expect 64 to appear
; directly as the source operand of the shift instruction (v_lshl_b64 on
; verde, s_lshl_b64 on tonga) with no separate move to materialize it, and
; only the low dword of the amount to be loaded. The redwood checks expect the
; 32-bit expansion operating on literals.
1359define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1360; SI-LABEL: v_shl_inline_imm_64_i64:
1361; SI:       ; %bb.0:
1362; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1363; SI-NEXT:    s_mov_b32 s7, 0xf000
1364; SI-NEXT:    s_mov_b32 s6, -1
1365; SI-NEXT:    s_mov_b32 s10, s6
1366; SI-NEXT:    s_mov_b32 s11, s7
1367; SI-NEXT:    s_waitcnt lgkmcnt(0)
1368; SI-NEXT:    s_mov_b32 s8, s2
1369; SI-NEXT:    s_mov_b32 s9, s3
1370; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1371; SI-NEXT:    s_mov_b32 s4, s0
1372; SI-NEXT:    s_mov_b32 s5, s1
1373; SI-NEXT:    s_waitcnt vmcnt(0)
1374; SI-NEXT:    v_lshl_b64 v[0:1], 64, v0
1375; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1376; SI-NEXT:    s_endpgm
1377;
1378; VI-LABEL: v_shl_inline_imm_64_i64:
1379; VI:       ; %bb.0:
1380; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1381; VI-NEXT:    s_waitcnt lgkmcnt(0)
1382; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
1383; VI-NEXT:    s_mov_b32 s3, 0xf000
1384; VI-NEXT:    s_mov_b32 s2, -1
1385; VI-NEXT:    s_waitcnt lgkmcnt(0)
1386; VI-NEXT:    s_lshl_b64 s[4:5], 64, s4
1387; VI-NEXT:    v_mov_b32_e32 v0, s4
1388; VI-NEXT:    v_mov_b32_e32 v1, s5
1389; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1390; VI-NEXT:    s_endpgm
1391;
1392; EG-LABEL: v_shl_inline_imm_64_i64:
1393; EG:       ; %bb.0:
1394; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1395; EG-NEXT:    TEX 0 @6
1396; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
1397; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1398; EG-NEXT:    CF_END
1399; EG-NEXT:    PAD
1400; EG-NEXT:    Fetch clause starting at 6:
1401; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1402; EG-NEXT:    ALU clause starting at 8:
1403; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1404; EG-NEXT:    ALU clause starting at 9:
1405; EG-NEXT:     AND_INT T0.W, T0.X, literal.x,
1406; EG-NEXT:     NOT_INT * T1.W, T0.X,
1407; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1408; EG-NEXT:     BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
1409; EG-NEXT:     LSHL T0.W, literal.y, PV.W,
1410; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
1411; EG-NEXT:    32(4.484155e-44), 64(8.968310e-44)
1412; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1413; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1414; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1415; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1416  %a = load i64, i64 addrspace(1)* %aptr, align 8
1417  %shl = shl i64 64, %a
1418  store i64 %shl, i64 addrspace(1)* %out, align 8
1419  ret void
1420}
1421
; s_shl_inline_imm_64_i64: shifts the constant 64 left by the i64 kernel
; argument %a and stores the result to %out. %aptr is not referenced by the
; body; it only occupies a kernel-argument slot ahead of %a. Both GCN checks
; expect a single dword load of %a and s_lshl_b64 with 64 as a direct source
; operand. The redwood checks expect the 32-bit expansion operating on
; literals.
1422define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1423; SI-LABEL: s_shl_inline_imm_64_i64:
1424; SI:       ; %bb.0:
1425; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1426; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1427; SI-NEXT:    s_mov_b32 s3, 0xf000
1428; SI-NEXT:    s_mov_b32 s2, -1
1429; SI-NEXT:    s_waitcnt lgkmcnt(0)
1430; SI-NEXT:    s_lshl_b64 s[4:5], 64, s4
1431; SI-NEXT:    v_mov_b32_e32 v0, s4
1432; SI-NEXT:    v_mov_b32_e32 v1, s5
1433; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1434; SI-NEXT:    s_endpgm
1435;
1436; VI-LABEL: s_shl_inline_imm_64_i64:
1437; VI:       ; %bb.0:
1438; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1439; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1440; VI-NEXT:    s_mov_b32 s3, 0xf000
1441; VI-NEXT:    s_mov_b32 s2, -1
1442; VI-NEXT:    s_waitcnt lgkmcnt(0)
1443; VI-NEXT:    s_lshl_b64 s[4:5], 64, s4
1444; VI-NEXT:    v_mov_b32_e32 v0, s4
1445; VI-NEXT:    v_mov_b32_e32 v1, s5
1446; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1447; VI-NEXT:    s_endpgm
1448;
1449; EG-LABEL: s_shl_inline_imm_64_i64:
1450; EG:       ; %bb.0:
1451; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
1452; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1453; EG-NEXT:    CF_END
1454; EG-NEXT:    PAD
1455; EG-NEXT:    ALU clause starting at 4:
1456; EG-NEXT:     NOT_INT T0.W, KC0[2].W,
1457; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.x,
1458; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1459; EG-NEXT:     LSHL T0.Z, literal.x, PS,
1460; EG-NEXT:     BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
1461; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1462; EG-NEXT:    64(8.968310e-44), 32(4.484155e-44)
1463; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1464; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.Z, 0.0,
1465; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1466; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1467  %shl = shl i64 64, %a
1468  store i64 %shl, i64 addrspace(1)* %out, align 8
1469  ret void
1470}
1471
; s_shl_inline_imm_1_i64: shifts the constant 1 left by the i64 kernel argument
; %a and stores the result to %out. %aptr is not referenced by the body; it
; only occupies a kernel-argument slot ahead of %a. Both GCN checks expect a
; single dword load of %a and s_lshl_b64 with 1 as a direct source operand.
; The redwood checks expect a 32-bit sequence built from LSHL, ASHR, AND_INT
; and CNDE_INT rather than the BIT_ALIGN_INT form.
1472define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1473; SI-LABEL: s_shl_inline_imm_1_i64:
1474; SI:       ; %bb.0:
1475; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1476; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1477; SI-NEXT:    s_mov_b32 s3, 0xf000
1478; SI-NEXT:    s_mov_b32 s2, -1
1479; SI-NEXT:    s_waitcnt lgkmcnt(0)
1480; SI-NEXT:    s_lshl_b64 s[4:5], 1, s4
1481; SI-NEXT:    v_mov_b32_e32 v0, s4
1482; SI-NEXT:    v_mov_b32_e32 v1, s5
1483; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1484; SI-NEXT:    s_endpgm
1485;
1486; VI-LABEL: s_shl_inline_imm_1_i64:
1487; VI:       ; %bb.0:
1488; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1489; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1490; VI-NEXT:    s_mov_b32 s3, 0xf000
1491; VI-NEXT:    s_mov_b32 s2, -1
1492; VI-NEXT:    s_waitcnt lgkmcnt(0)
1493; VI-NEXT:    s_lshl_b64 s[4:5], 1, s4
1494; VI-NEXT:    v_mov_b32_e32 v0, s4
1495; VI-NEXT:    v_mov_b32_e32 v1, s5
1496; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1497; VI-NEXT:    s_endpgm
1498;
1499; EG-LABEL: s_shl_inline_imm_1_i64:
1500; EG:       ; %bb.0:
1501; EG-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
1502; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1503; EG-NEXT:    CF_END
1504; EG-NEXT:    PAD
1505; EG-NEXT:    ALU clause starting at 4:
1506; EG-NEXT:     AND_INT T0.W, KC0[2].W, literal.x,
1507; EG-NEXT:     LSHL * T1.W, KC0[2].W, literal.y,
1508; EG-NEXT:    31(4.344025e-44), 26(3.643376e-44)
1509; EG-NEXT:     ASHR T1.W, PS, literal.x,
1510; EG-NEXT:     LSHL * T0.W, 1, PV.W,
1511; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1512; EG-NEXT:     AND_INT T0.Y, PV.W, PS,
1513; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.x,
1514; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1515; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, 0.0,
1516; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1517; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1518  %shl = shl i64 1, %a
1519  store i64 %shl, i64 addrspace(1)* %out, align 8
1520  ret void
1521}
1522
; s_shl_inline_imm_1_0_i64: shifts the constant 4607182418800017408
; (0x3ff0000000000000) left by the i64 kernel argument %a and stores the
; result to %out. %aptr is not referenced by the body; it only occupies a
; kernel-argument slot ahead of %a. Both GCN checks expect s_lshl_b64 with the
; constant printed as the operand 1.0 (0x3ff0000000000000 is the bit pattern
; of the double 1.0) and no separate move to materialize it. In the redwood
; checks the low result dword is a plain 0.0 move and only the high dword is
; computed.
1523define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1524; SI-LABEL: s_shl_inline_imm_1_0_i64:
1525; SI:       ; %bb.0:
1526; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1527; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1528; SI-NEXT:    s_mov_b32 s3, 0xf000
1529; SI-NEXT:    s_mov_b32 s2, -1
1530; SI-NEXT:    s_waitcnt lgkmcnt(0)
1531; SI-NEXT:    s_lshl_b64 s[4:5], 1.0, s4
1532; SI-NEXT:    v_mov_b32_e32 v0, s4
1533; SI-NEXT:    v_mov_b32_e32 v1, s5
1534; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1535; SI-NEXT:    s_endpgm
1536;
1537; VI-LABEL: s_shl_inline_imm_1_0_i64:
1538; VI:       ; %bb.0:
1539; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1540; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1541; VI-NEXT:    s_mov_b32 s3, 0xf000
1542; VI-NEXT:    s_mov_b32 s2, -1
1543; VI-NEXT:    s_waitcnt lgkmcnt(0)
1544; VI-NEXT:    s_lshl_b64 s[4:5], 1.0, s4
1545; VI-NEXT:    v_mov_b32_e32 v0, s4
1546; VI-NEXT:    v_mov_b32_e32 v1, s5
1547; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1548; VI-NEXT:    s_endpgm
1549;
1550; EG-LABEL: s_shl_inline_imm_1_0_i64:
1551; EG:       ; %bb.0:
1552; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1553; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1554; EG-NEXT:    CF_END
1555; EG-NEXT:    PAD
1556; EG-NEXT:    ALU clause starting at 4:
1557; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1558; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1559; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1560; EG-NEXT:    536346624(1.050321e-19), 32(4.484155e-44)
1561; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1562; EG-NEXT:     MOV T0.X, 0.0,
1563; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1564; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1565  %shl = shl i64 4607182418800017408, %a
1566  store i64 %shl, i64 addrspace(1)* %out, align 8
1567  ret void
1568}
1569
; Shifts the i64 constant 13830554455654793216 (0xBFF0000000000000, the bit
; pattern of double -1.0) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to be folded into
; s_lshl_b64 as the inline operand -1.0. %aptr is not referenced by the body.
1570define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1571; SI-LABEL: s_shl_inline_imm_neg_1_0_i64:
1572; SI:       ; %bb.0:
1573; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1574; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1575; SI-NEXT:    s_mov_b32 s3, 0xf000
1576; SI-NEXT:    s_mov_b32 s2, -1
1577; SI-NEXT:    s_waitcnt lgkmcnt(0)
1578; SI-NEXT:    s_lshl_b64 s[4:5], -1.0, s4
1579; SI-NEXT:    v_mov_b32_e32 v0, s4
1580; SI-NEXT:    v_mov_b32_e32 v1, s5
1581; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1582; SI-NEXT:    s_endpgm
1583;
1584; VI-LABEL: s_shl_inline_imm_neg_1_0_i64:
1585; VI:       ; %bb.0:
1586; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1587; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1588; VI-NEXT:    s_mov_b32 s3, 0xf000
1589; VI-NEXT:    s_mov_b32 s2, -1
1590; VI-NEXT:    s_waitcnt lgkmcnt(0)
1591; VI-NEXT:    s_lshl_b64 s[4:5], -1.0, s4
1592; VI-NEXT:    v_mov_b32_e32 v0, s4
1593; VI-NEXT:    v_mov_b32_e32 v1, s5
1594; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1595; VI-NEXT:    s_endpgm
1596;
1597; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
1598; EG:       ; %bb.0:
1599; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1600; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1601; EG-NEXT:    CF_END
1602; EG-NEXT:    PAD
1603; EG-NEXT:    ALU clause starting at 4:
1604; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1605; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1606; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1607; EG-NEXT:    1610088448(3.574057e+19), 32(4.484155e-44)
1608; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1609; EG-NEXT:     MOV T0.X, 0.0,
1610; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1611; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The constant is the value being shifted; %a is the shift amount.
1612  %shl = shl i64 13830554455654793216, %a
1613  store i64 %shl, i64 addrspace(1)* %out, align 8
1614  ret void
1615}
1616
; Shifts the i64 constant 4602678819172646912 (0x3FE0000000000000, the bit
; pattern of double 0.5) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to be folded into
; s_lshl_b64 as the inline operand 0.5. %aptr is not referenced by the body.
1617define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1618; SI-LABEL: s_shl_inline_imm_0_5_i64:
1619; SI:       ; %bb.0:
1620; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1621; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1622; SI-NEXT:    s_mov_b32 s3, 0xf000
1623; SI-NEXT:    s_mov_b32 s2, -1
1624; SI-NEXT:    s_waitcnt lgkmcnt(0)
1625; SI-NEXT:    s_lshl_b64 s[4:5], 0.5, s4
1626; SI-NEXT:    v_mov_b32_e32 v0, s4
1627; SI-NEXT:    v_mov_b32_e32 v1, s5
1628; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1629; SI-NEXT:    s_endpgm
1630;
1631; VI-LABEL: s_shl_inline_imm_0_5_i64:
1632; VI:       ; %bb.0:
1633; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1634; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1635; VI-NEXT:    s_mov_b32 s3, 0xf000
1636; VI-NEXT:    s_mov_b32 s2, -1
1637; VI-NEXT:    s_waitcnt lgkmcnt(0)
1638; VI-NEXT:    s_lshl_b64 s[4:5], 0.5, s4
1639; VI-NEXT:    v_mov_b32_e32 v0, s4
1640; VI-NEXT:    v_mov_b32_e32 v1, s5
1641; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1642; VI-NEXT:    s_endpgm
1643;
1644; EG-LABEL: s_shl_inline_imm_0_5_i64:
1645; EG:       ; %bb.0:
1646; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1647; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1648; EG-NEXT:    CF_END
1649; EG-NEXT:    PAD
1650; EG-NEXT:    ALU clause starting at 4:
1651; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1652; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1653; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1654; EG-NEXT:    535822336(1.016440e-19), 32(4.484155e-44)
1655; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1656; EG-NEXT:     MOV T0.X, 0.0,
1657; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1658; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The constant is the value being shifted; %a is the shift amount.
1659  %shl = shl i64 4602678819172646912, %a
1660  store i64 %shl, i64 addrspace(1)* %out, align 8
1661  ret void
1662}
1663
; Shifts the i64 constant 13826050856027422720 (0xBFE0000000000000, the bit
; pattern of double -0.5) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to be folded into
; s_lshl_b64 as the inline operand -0.5. %aptr is not referenced by the body.
1664define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1665; SI-LABEL: s_shl_inline_imm_neg_0_5_i64:
1666; SI:       ; %bb.0:
1667; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1668; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1669; SI-NEXT:    s_mov_b32 s3, 0xf000
1670; SI-NEXT:    s_mov_b32 s2, -1
1671; SI-NEXT:    s_waitcnt lgkmcnt(0)
1672; SI-NEXT:    s_lshl_b64 s[4:5], -0.5, s4
1673; SI-NEXT:    v_mov_b32_e32 v0, s4
1674; SI-NEXT:    v_mov_b32_e32 v1, s5
1675; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1676; SI-NEXT:    s_endpgm
1677;
1678; VI-LABEL: s_shl_inline_imm_neg_0_5_i64:
1679; VI:       ; %bb.0:
1680; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1681; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1682; VI-NEXT:    s_mov_b32 s3, 0xf000
1683; VI-NEXT:    s_mov_b32 s2, -1
1684; VI-NEXT:    s_waitcnt lgkmcnt(0)
1685; VI-NEXT:    s_lshl_b64 s[4:5], -0.5, s4
1686; VI-NEXT:    v_mov_b32_e32 v0, s4
1687; VI-NEXT:    v_mov_b32_e32 v1, s5
1688; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1689; VI-NEXT:    s_endpgm
1690;
1691; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
1692; EG:       ; %bb.0:
1693; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1694; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1695; EG-NEXT:    CF_END
1696; EG-NEXT:    PAD
1697; EG-NEXT:    ALU clause starting at 4:
1698; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1699; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1700; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1701; EG-NEXT:    1609564160(3.458765e+19), 32(4.484155e-44)
1702; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1703; EG-NEXT:     MOV T0.X, 0.0,
1704; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1705; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The constant is the value being shifted; %a is the shift amount.
1706  %shl = shl i64 13826050856027422720, %a
1707  store i64 %shl, i64 addrspace(1)* %out, align 8
1708  ret void
1709}
1710
; Shifts the i64 constant 4611686018427387904 (0x4000000000000000, the bit
; pattern of double 2.0) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to be folded into
; s_lshl_b64 as the inline operand 2.0. %aptr is not referenced by the body.
1711define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1712; SI-LABEL: s_shl_inline_imm_2_0_i64:
1713; SI:       ; %bb.0:
1714; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1715; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1716; SI-NEXT:    s_mov_b32 s3, 0xf000
1717; SI-NEXT:    s_mov_b32 s2, -1
1718; SI-NEXT:    s_waitcnt lgkmcnt(0)
1719; SI-NEXT:    s_lshl_b64 s[4:5], 2.0, s4
1720; SI-NEXT:    v_mov_b32_e32 v0, s4
1721; SI-NEXT:    v_mov_b32_e32 v1, s5
1722; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1723; SI-NEXT:    s_endpgm
1724;
1725; VI-LABEL: s_shl_inline_imm_2_0_i64:
1726; VI:       ; %bb.0:
1727; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1728; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1729; VI-NEXT:    s_mov_b32 s3, 0xf000
1730; VI-NEXT:    s_mov_b32 s2, -1
1731; VI-NEXT:    s_waitcnt lgkmcnt(0)
1732; VI-NEXT:    s_lshl_b64 s[4:5], 2.0, s4
1733; VI-NEXT:    v_mov_b32_e32 v0, s4
1734; VI-NEXT:    v_mov_b32_e32 v1, s5
1735; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1736; VI-NEXT:    s_endpgm
1737;
1738; EG-LABEL: s_shl_inline_imm_2_0_i64:
1739; EG:       ; %bb.0:
1740; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1741; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1742; EG-NEXT:    CF_END
1743; EG-NEXT:    PAD
1744; EG-NEXT:    ALU clause starting at 4:
1745; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1746; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1747; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1748; EG-NEXT:    536870912(1.084202e-19), 32(4.484155e-44)
1749; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1750; EG-NEXT:     MOV T0.X, 0.0,
1751; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1752; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The constant is the value being shifted; %a is the shift amount.
1753  %shl = shl i64 4611686018427387904, %a
1754  store i64 %shl, i64 addrspace(1)* %out, align 8
1755  ret void
1756}
1757
; Shifts the i64 constant 13835058055282163712 (0xC000000000000000, the bit
; pattern of double -2.0) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to be folded into
; s_lshl_b64 as the inline operand -2.0. %aptr is not referenced by the body.
1758define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1759; SI-LABEL: s_shl_inline_imm_neg_2_0_i64:
1760; SI:       ; %bb.0:
1761; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1762; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1763; SI-NEXT:    s_mov_b32 s3, 0xf000
1764; SI-NEXT:    s_mov_b32 s2, -1
1765; SI-NEXT:    s_waitcnt lgkmcnt(0)
1766; SI-NEXT:    s_lshl_b64 s[4:5], -2.0, s4
1767; SI-NEXT:    v_mov_b32_e32 v0, s4
1768; SI-NEXT:    v_mov_b32_e32 v1, s5
1769; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1770; SI-NEXT:    s_endpgm
1771;
1772; VI-LABEL: s_shl_inline_imm_neg_2_0_i64:
1773; VI:       ; %bb.0:
1774; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1775; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1776; VI-NEXT:    s_mov_b32 s3, 0xf000
1777; VI-NEXT:    s_mov_b32 s2, -1
1778; VI-NEXT:    s_waitcnt lgkmcnt(0)
1779; VI-NEXT:    s_lshl_b64 s[4:5], -2.0, s4
1780; VI-NEXT:    v_mov_b32_e32 v0, s4
1781; VI-NEXT:    v_mov_b32_e32 v1, s5
1782; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1783; VI-NEXT:    s_endpgm
1784;
1785; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
1786; EG:       ; %bb.0:
1787; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1788; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1789; EG-NEXT:    CF_END
1790; EG-NEXT:    PAD
1791; EG-NEXT:    ALU clause starting at 4:
1792; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1793; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1794; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1795; EG-NEXT:    1610612736(3.689349e+19), 32(4.484155e-44)
1796; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1797; EG-NEXT:     MOV T0.X, 0.0,
1798; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1799; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The constant is the value being shifted; %a is the shift amount.
1800  %shl = shl i64 13835058055282163712, %a
1801  store i64 %shl, i64 addrspace(1)* %out, align 8
1802  ret void
1803}
1804
; Shifts the i64 constant 4616189618054758400 (0x4010000000000000, the bit
; pattern of double 4.0) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to be folded into
; s_lshl_b64 as the inline operand 4.0. %aptr is not referenced by the body.
1805define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1806; SI-LABEL: s_shl_inline_imm_4_0_i64:
1807; SI:       ; %bb.0:
1808; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1809; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1810; SI-NEXT:    s_mov_b32 s3, 0xf000
1811; SI-NEXT:    s_mov_b32 s2, -1
1812; SI-NEXT:    s_waitcnt lgkmcnt(0)
1813; SI-NEXT:    s_lshl_b64 s[4:5], 4.0, s4
1814; SI-NEXT:    v_mov_b32_e32 v0, s4
1815; SI-NEXT:    v_mov_b32_e32 v1, s5
1816; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1817; SI-NEXT:    s_endpgm
1818;
1819; VI-LABEL: s_shl_inline_imm_4_0_i64:
1820; VI:       ; %bb.0:
1821; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1822; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1823; VI-NEXT:    s_mov_b32 s3, 0xf000
1824; VI-NEXT:    s_mov_b32 s2, -1
1825; VI-NEXT:    s_waitcnt lgkmcnt(0)
1826; VI-NEXT:    s_lshl_b64 s[4:5], 4.0, s4
1827; VI-NEXT:    v_mov_b32_e32 v0, s4
1828; VI-NEXT:    v_mov_b32_e32 v1, s5
1829; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1830; VI-NEXT:    s_endpgm
1831;
1832; EG-LABEL: s_shl_inline_imm_4_0_i64:
1833; EG:       ; %bb.0:
1834; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1835; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1836; EG-NEXT:    CF_END
1837; EG-NEXT:    PAD
1838; EG-NEXT:    ALU clause starting at 4:
1839; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1840; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1841; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1842; EG-NEXT:    537395200(1.151965e-19), 32(4.484155e-44)
1843; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1844; EG-NEXT:     MOV T0.X, 0.0,
1845; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1846; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The constant is the value being shifted; %a is the shift amount.
1847  %shl = shl i64 4616189618054758400, %a
1848  store i64 %shl, i64 addrspace(1)* %out, align 8
1849  ret void
1850}
1851
; Shifts the i64 constant 13839561654909534208 (0xC010000000000000, the bit
; pattern of double -4.0) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to be folded into
; s_lshl_b64 as the inline operand -4.0. %aptr is not referenced by the body.
1852define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1853; SI-LABEL: s_shl_inline_imm_neg_4_0_i64:
1854; SI:       ; %bb.0:
1855; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1856; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1857; SI-NEXT:    s_mov_b32 s3, 0xf000
1858; SI-NEXT:    s_mov_b32 s2, -1
1859; SI-NEXT:    s_waitcnt lgkmcnt(0)
1860; SI-NEXT:    s_lshl_b64 s[4:5], -4.0, s4
1861; SI-NEXT:    v_mov_b32_e32 v0, s4
1862; SI-NEXT:    v_mov_b32_e32 v1, s5
1863; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1864; SI-NEXT:    s_endpgm
1865;
1866; VI-LABEL: s_shl_inline_imm_neg_4_0_i64:
1867; VI:       ; %bb.0:
1868; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1869; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1870; VI-NEXT:    s_mov_b32 s3, 0xf000
1871; VI-NEXT:    s_mov_b32 s2, -1
1872; VI-NEXT:    s_waitcnt lgkmcnt(0)
1873; VI-NEXT:    s_lshl_b64 s[4:5], -4.0, s4
1874; VI-NEXT:    v_mov_b32_e32 v0, s4
1875; VI-NEXT:    v_mov_b32_e32 v1, s5
1876; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1877; VI-NEXT:    s_endpgm
1878;
1879; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
1880; EG:       ; %bb.0:
1881; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1882; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1883; EG-NEXT:    CF_END
1884; EG-NEXT:    PAD
1885; EG-NEXT:    ALU clause starting at 4:
1886; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1887; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1888; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1889; EG-NEXT:    1611137024(3.919933e+19), 32(4.484155e-44)
1890; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1891; EG-NEXT:     MOV T0.X, 0.0,
1892; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1893; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The constant is the value being shifted; %a is the shift amount.
1894  %shl = shl i64 13839561654909534208, %a
1895  store i64 %shl, i64 addrspace(1)* %out, align 8
1896  ret void
1897}
1898
1899
1900; Test with the 64-bit integer bitpattern for a 32-bit float in the
1901; low 32-bits, which is not a valid 64-bit inline immediate.
; Shifts the i64 constant 1082130432 (0x40800000: the f32 4.0 bit pattern in
; the low dword, zero in the high dword) left by the kernel argument %a and
; stores the result to %out. The SI and VI checks expect the constant to be
; placed in an SGPR pair with s_mov_b64 and the shift to take that register
; pair, not an inline floating-point operand. %aptr is not referenced.
1902define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1903; SI-LABEL: s_shl_inline_imm_f32_4_0_i64:
1904; SI:       ; %bb.0:
1905; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1906; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
1907; SI-NEXT:    s_mov_b64 s[0:1], 0x40800000
1908; SI-NEXT:    s_mov_b32 s7, 0xf000
1909; SI-NEXT:    s_mov_b32 s6, -1
1910; SI-NEXT:    s_waitcnt lgkmcnt(0)
1911; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1912; SI-NEXT:    v_mov_b32_e32 v0, s0
1913; SI-NEXT:    v_mov_b32_e32 v1, s1
1914; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1915; SI-NEXT:    s_endpgm
1916;
1917; VI-LABEL: s_shl_inline_imm_f32_4_0_i64:
1918; VI:       ; %bb.0:
1919; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1920; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
1921; VI-NEXT:    s_mov_b64 s[0:1], 0x40800000
1922; VI-NEXT:    s_mov_b32 s7, 0xf000
1923; VI-NEXT:    s_mov_b32 s6, -1
1924; VI-NEXT:    s_waitcnt lgkmcnt(0)
1925; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1926; VI-NEXT:    v_mov_b32_e32 v0, s0
1927; VI-NEXT:    v_mov_b32_e32 v1, s1
1928; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1929; VI-NEXT:    s_endpgm
1930;
1931; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
1932; EG:       ; %bb.0:
1933; EG-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
1934; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1935; EG-NEXT:    CF_END
1936; EG-NEXT:    PAD
1937; EG-NEXT:    ALU clause starting at 4:
1938; EG-NEXT:     NOT_INT T0.W, KC0[2].W,
1939; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.x,
1940; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1941; EG-NEXT:     LSHL T0.Z, literal.x, PS,
1942; EG-NEXT:     BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
1943; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.z,
1944; EG-NEXT:    1082130432(4.000000e+00), 541065216(1.626303e-19)
1945; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1946; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1947; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.Z, 0.0,
1948; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1949; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The constant is the value being shifted; %a is the shift amount.
1950  %shl = shl i64 1082130432, %a
1951  store i64 %shl, i64 addrspace(1)* %out, align 8
1952  ret void
1953}
1954
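1955; FIXME: Copy of -1 register (the SI and VI checks below show the high half of
; the constant being set with "s_mov_b32 s1, s6", where s6 already holds -1).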
; Shifts the i64 constant -1065353216 left by the kernel argument %a and
; stores the result to %out. The SI and VI checks expect the low dword to be
; materialized as the inline value -4.0 (s0), the high dword to be copied from
; s6, which was set to -1, and the shift to take the s[0:1] register pair.
; %aptr is not referenced by the body.
1956define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1957; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1958; SI:       ; %bb.0:
1959; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1960; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
1961; SI-NEXT:    s_mov_b32 s6, -1
1962; SI-NEXT:    s_mov_b32 s0, -4.0
1963; SI-NEXT:    s_mov_b32 s1, s6
1964; SI-NEXT:    s_mov_b32 s7, 0xf000
1965; SI-NEXT:    s_waitcnt lgkmcnt(0)
1966; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1967; SI-NEXT:    v_mov_b32_e32 v0, s0
1968; SI-NEXT:    v_mov_b32_e32 v1, s1
1969; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1970; SI-NEXT:    s_endpgm
1971;
1972; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1973; VI:       ; %bb.0:
1974; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1975; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
1976; VI-NEXT:    s_mov_b32 s6, -1
1977; VI-NEXT:    s_mov_b32 s0, -4.0
1978; VI-NEXT:    s_mov_b32 s1, s6
1979; VI-NEXT:    s_mov_b32 s7, 0xf000
1980; VI-NEXT:    s_waitcnt lgkmcnt(0)
1981; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1982; VI-NEXT:    v_mov_b32_e32 v0, s0
1983; VI-NEXT:    v_mov_b32_e32 v1, s1
1984; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1985; VI-NEXT:    s_endpgm
1986;
1987; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1988; EG:       ; %bb.0:
1989; EG-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
1990; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1991; EG-NEXT:    CF_END
1992; EG-NEXT:    PAD
1993; EG-NEXT:    ALU clause starting at 4:
1994; EG-NEXT:     AND_INT T0.Z, KC0[2].W, literal.x,
1995; EG-NEXT:     MOV T0.W, literal.y,
1996; EG-NEXT:     NOT_INT * T1.W, KC0[2].W,
1997; EG-NEXT:    31(4.344025e-44), -532676608(-5.534023e+19)
1998; EG-NEXT:     BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
1999; EG-NEXT:     LSHL T0.W, literal.y, PV.Z,
2000; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.z,
2001; EG-NEXT:    2147483647(nan), -1065353216(-4.000000e+00)
2002; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
2003; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
2004; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
2005; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2006; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The negative constant is the value being shifted; %a is the shift amount.
2007  %shl = shl i64 -1065353216, %a
2008  store i64 %shl, i64 addrspace(1)* %out, align 8
2009  ret void
2010}
2011
; Shifts the i64 constant 4647714815446351872 (0x4080000000000000: the f32 4.0
; bit pattern in the high dword, zero in the low dword) left by the kernel
; argument %a and stores the result to %out. The SI and VI checks expect the
; two halves to be set with separate s_mov_b32 instructions (0 and 4.0) and
; the shift to take the s[0:1] register pair. %aptr is not referenced.
2012define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
2013; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
2014; SI:       ; %bb.0:
2015; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2016; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
2017; SI-NEXT:    s_mov_b32 s0, 0
2018; SI-NEXT:    s_mov_b32 s1, 4.0
2019; SI-NEXT:    s_mov_b32 s7, 0xf000
2020; SI-NEXT:    s_mov_b32 s6, -1
2021; SI-NEXT:    s_waitcnt lgkmcnt(0)
2022; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
2023; SI-NEXT:    v_mov_b32_e32 v0, s0
2024; SI-NEXT:    v_mov_b32_e32 v1, s1
2025; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2026; SI-NEXT:    s_endpgm
2027;
2028; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
2029; VI:       ; %bb.0:
2030; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2031; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
2032; VI-NEXT:    s_mov_b32 s0, 0
2033; VI-NEXT:    s_mov_b32 s1, 4.0
2034; VI-NEXT:    s_mov_b32 s7, 0xf000
2035; VI-NEXT:    s_mov_b32 s6, -1
2036; VI-NEXT:    s_waitcnt lgkmcnt(0)
2037; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
2038; VI-NEXT:    v_mov_b32_e32 v0, s0
2039; VI-NEXT:    v_mov_b32_e32 v1, s1
2040; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2041; VI-NEXT:    s_endpgm
2042;
2043; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
2044; EG:       ; %bb.0:
2045; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
2046; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2047; EG-NEXT:    CF_END
2048; EG-NEXT:    PAD
2049; EG-NEXT:    ALU clause starting at 4:
2050; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
2051; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
2052; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
2053; EG-NEXT:    541065216(1.626303e-19), 32(4.484155e-44)
2054; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
2055; EG-NEXT:     MOV T0.X, 0.0,
2056; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2057; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The constant is the value being shifted; %a is the shift amount.
2058  %shl = shl i64 4647714815446351872, %a
2059  store i64 %shl, i64 addrspace(1)* %out, align 8
2060  ret void
2061}
2062
; Shifts the i64 constant 13871086852301127680 (0xC080000000000000: the f32
; -4.0 bit pattern in the high dword, zero in the low dword) left by the
; kernel argument %a and stores the result to %out. The SI and VI checks
; expect the two halves to be set with separate s_mov_b32 instructions (0 and
; -4.0) and the shift to take the s[0:1] register pair. %aptr is not
; referenced by the body.
2063define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
2064; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
2065; SI:       ; %bb.0:
2066; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2067; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
2068; SI-NEXT:    s_mov_b32 s0, 0
2069; SI-NEXT:    s_mov_b32 s1, -4.0
2070; SI-NEXT:    s_mov_b32 s7, 0xf000
2071; SI-NEXT:    s_mov_b32 s6, -1
2072; SI-NEXT:    s_waitcnt lgkmcnt(0)
2073; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
2074; SI-NEXT:    v_mov_b32_e32 v0, s0
2075; SI-NEXT:    v_mov_b32_e32 v1, s1
2076; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2077; SI-NEXT:    s_endpgm
2078;
2079; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
2080; VI:       ; %bb.0:
2081; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2082; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
2083; VI-NEXT:    s_mov_b32 s0, 0
2084; VI-NEXT:    s_mov_b32 s1, -4.0
2085; VI-NEXT:    s_mov_b32 s7, 0xf000
2086; VI-NEXT:    s_mov_b32 s6, -1
2087; VI-NEXT:    s_waitcnt lgkmcnt(0)
2088; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
2089; VI-NEXT:    v_mov_b32_e32 v0, s0
2090; VI-NEXT:    v_mov_b32_e32 v1, s1
2091; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2092; VI-NEXT:    s_endpgm
2093;
2094; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
2095; EG:       ; %bb.0:
2096; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
2097; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2098; EG-NEXT:    CF_END
2099; EG-NEXT:    PAD
2100; EG-NEXT:    ALU clause starting at 4:
2101; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
2102; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
2103; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
2104; EG-NEXT:    1614807040(5.534023e+19), 32(4.484155e-44)
2105; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
2106; EG-NEXT:     MOV T0.X, 0.0,
2107; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2108; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The constant is the value being shifted; %a is the shift amount.
2109  %shl = shl i64 13871086852301127680, %a
2110  store i64 %shl, i64 addrspace(1)* %out, align 8
2111  ret void
2112}
2113
; Multiplies the i32 kernel argument %p by 2 and stores the product with a
; volatile store. All three check blocks expect the multiply to be emitted as
; a left shift by 1 (s_lshl_b32 on SI and VI, LSHL on EG).
2114define amdgpu_kernel void @test_mul2(i32 %p) {
2115; SI-LABEL: test_mul2:
2116; SI:       ; %bb.0:
2117; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
2118; SI-NEXT:    s_mov_b32 s3, 0xf000
2119; SI-NEXT:    s_mov_b32 s2, -1
2120; SI-NEXT:    s_waitcnt lgkmcnt(0)
2121; SI-NEXT:    s_lshl_b32 s0, s0, 1
2122; SI-NEXT:    v_mov_b32_e32 v0, s0
2123; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2124; SI-NEXT:    s_waitcnt vmcnt(0)
2125; SI-NEXT:    s_endpgm
2126;
2127; VI-LABEL: test_mul2:
2128; VI:       ; %bb.0:
2129; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
2130; VI-NEXT:    s_mov_b32 s3, 0xf000
2131; VI-NEXT:    s_mov_b32 s2, -1
2132; VI-NEXT:    s_waitcnt lgkmcnt(0)
2133; VI-NEXT:    s_lshl_b32 s0, s0, 1
2134; VI-NEXT:    v_mov_b32_e32 v0, s0
2135; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2136; VI-NEXT:    s_waitcnt vmcnt(0)
2137; VI-NEXT:    s_endpgm
2138;
2139; EG-LABEL: test_mul2:
2140; EG:       ; %bb.0:
2141; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
2142; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2143; EG-NEXT:    CF_END
2144; EG-NEXT:    PAD
2145; EG-NEXT:    ALU clause starting at 4:
2146; EG-NEXT:     MOV T0.X, literal.x,
2147; EG-NEXT:     LSHL * T1.X, KC0[2].Y, 1,
2148; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; The destination pointer is undef; the store is volatile.
2149   %i = mul i32 %p, 2
2150   store volatile i32 %i, i32 addrspace(1)* undef
2151   ret void
2152}
2153
; Computes (%in | 1) << 2 and stores it to %out. The or has a single use here,
; and all three check blocks expect the shift to be performed first, followed
; by an or with 4 (the constant 1 shifted left by 2).
2154define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
2155; SI-LABEL: shl_or_k:
2156; SI:       ; %bb.0:
2157; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2158; SI-NEXT:    s_mov_b32 s6, 0
2159; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
2160; SI-NEXT:    s_mov_b32 s7, 0xf000
2161; SI-NEXT:    s_mov_b32 s4, s6
2162; SI-NEXT:    s_mov_b32 s5, s6
2163; SI-NEXT:    v_or_b32_e32 v2, 4, v2
2164; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
2165; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2166; SI-NEXT:    s_setpc_b64 s[30:31]
2167;
2168; VI-LABEL: shl_or_k:
2169; VI:       ; %bb.0:
2170; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2171; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
2172; VI-NEXT:    v_or_b32_e32 v2, 4, v2
2173; VI-NEXT:    flat_store_dword v[0:1], v2
2174; VI-NEXT:    s_waitcnt vmcnt(0)
2175; VI-NEXT:    s_setpc_b64 s[30:31]
2176;
2177; EG-LABEL: shl_or_k:
2178; EG:       ; %bb.0:
2179; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
2180; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2181; EG-NEXT:    CF_END
2182; EG-NEXT:    PAD
2183; EG-NEXT:    ALU clause starting at 4:
2184; EG-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
2185; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2186; EG-NEXT:     OR_INT T0.X, PV.W, literal.x,
2187; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
2188; EG-NEXT:    4(5.605194e-45), 2(2.802597e-45)
; %tmp0 is used only by the shift below.
2189  %tmp0 = or i32 %in, 1
2190  %tmp2 = shl i32 %tmp0, 2
2191  store i32 %tmp2, i32 addrspace(1)* %out
2192  ret void
2193}
2194
; Computes %tmp0 = %in | 1 and %tmp2 = %tmp0 << 2, storing %tmp2 to %out0 and
; %tmp0 to %out1. Because the or result is itself stored, it has two uses; the
; check blocks expect the or with 1 to be emitted first and the shift by 2 to
; be applied to its result, with both values stored.
2195define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
2196; SI-LABEL: shl_or_k_two_uses:
2197; SI:       ; %bb.0:
2198; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2199; SI-NEXT:    s_mov_b32 s6, 0
2200; SI-NEXT:    v_or_b32_e32 v4, 1, v4
2201; SI-NEXT:    s_mov_b32 s7, 0xf000
2202; SI-NEXT:    s_mov_b32 s4, s6
2203; SI-NEXT:    s_mov_b32 s5, s6
2204; SI-NEXT:    v_lshlrev_b32_e32 v5, 2, v4
2205; SI-NEXT:    buffer_store_dword v5, v[0:1], s[4:7], 0 addr64
2206; SI-NEXT:    buffer_store_dword v4, v[2:3], s[4:7], 0 addr64
2207; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2208; SI-NEXT:    s_setpc_b64 s[30:31]
2209;
2210; VI-LABEL: shl_or_k_two_uses:
2211; VI:       ; %bb.0:
2212; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2213; VI-NEXT:    v_or_b32_e32 v4, 1, v4
2214; VI-NEXT:    v_lshlrev_b32_e32 v5, 2, v4
2215; VI-NEXT:    flat_store_dword v[0:1], v5
2216; VI-NEXT:    flat_store_dword v[2:3], v4
2217; VI-NEXT:    s_waitcnt vmcnt(0)
2218; VI-NEXT:    s_setpc_b64 s[30:31]
2219;
2220; EG-LABEL: shl_or_k_two_uses:
2221; EG:       ; %bb.0:
2222; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
2223; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
2224; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2225; EG-NEXT:    CF_END
2226; EG-NEXT:    ALU clause starting at 4:
2227; EG-NEXT:     LSHR T0.X, KC0[2].Z, literal.x,
2228; EG-NEXT:     OR_INT * T1.X, KC0[2].W, 1,
2229; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2230; EG-NEXT:     LSHL T2.X, PS, literal.x,
2231; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
2232; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; %tmp0 feeds both the shift and the second store.
2233  %tmp0 = or i32 %in, 1
2234  %tmp2 = shl i32 %tmp0, 2
2235  store i32 %tmp2, i32 addrspace(1)* %out0
2236  store i32 %tmp0, i32 addrspace(1)* %out1
2237  ret void
2238}
2239
; Attribute group #0, referenced by the llvm.amdgcn.workitem.id.x and
; llvm.amdgcn.workgroup.id.x declarations at the top of the file.
2240attributes #0 = { nounwind readnone }
2241