; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s

; Intrinsic the kernels in this file call to obtain the work-item id in X.
declare i32 @llvm.amdgcn.workitem.id.x() #0

; 256-element i32 array in address space 3. The write_ds_* kernels index into
; it, and the expected output reaches it through DS store instructions.
@lds.obj = addrspace(3) global [256 x i32] undef, align 4

; Stores 123 to @lds.obj[(0 - workitem.id.x) + 3].
; Expected on all four targets: the shifted id is subtracted from 0 and the
; constant 3-element (12-byte) displacement is carried in the DS store's
; offset field.
; NOTE(review): attribute group #0 is declared `nounwind readnone` at the end
; of the file although this kernel stores to memory - confirm it is intended.
define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
; CI-LABEL: write_ds_sub0_offset0_global:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v1 offset:12
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write_ds_sub0_offset0_global:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:12
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: write_ds_sub0_offset0_global:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:12
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: write_ds_sub0_offset0_global:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:12
; GFX11-NEXT:    s_endpgm
entry:
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 0, %x.i
  %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
  store i32 123, i32 addrspace(3)* %arrayidx
  ret void
}

; Same LDS address computation as @write_ds_sub0_offset0_global, followed by a
; call to llvm.amdgcn.div.fmas.f32 (condition operand `i1 false`) whose result
; is stored volatile to a null addrspace(1) pointer.
; Expected: the DS store still uses offset 12, and vcc is set to 0 ahead of
; v_div_fmas_f32.
; NOTE(review): attribute group #0 is declared `nounwind readnone` at the end
; of the file although this kernel stores to memory - confirm it is intended.
define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 {
; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    s_mov_b64 vcc, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s0
; CI-NEXT:    s_mov_b32 s0, 0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s1, s0
; CI-NEXT:    ds_write_b32 v0, v2 offset:12
; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_mov_b64 vcc, 0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_write_b32 v3, v4 offset:12
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:12
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT:    global_store_dword v[0:1], v4, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    ds_store_b32 v2, v3 offset:12
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 0, %x.i
  %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
  store i32 123, i32 addrspace(3)* %arrayidx
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, float addrspace(1)* null
  ret void
}

; Stores 123 to @lds.obj[(-1 - workitem.id.x) + 16383], then calls
; llvm.amdgcn.div.fmas.f32 (condition operand `i1 false`) and stores the
; result volatile to a null addrspace(1) pointer.
; Expected: the index is formed with v_not followed by a shift-left by 2, and
; the 16383-element displacement appears as offset 65532 on the DS store.
; NOTE(review): attribute group #0 is declared `nounwind readnone` at the end
; of the file although this kernel stores to memory - confirm it is intended.
define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #0 {
; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
; CI-NEXT:    s_mov_b64 vcc, 0
; CI-NEXT:    v_not_b32_e32 v0, v0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT:    s_mov_b32 s0, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s1, s0
; CI-NEXT:    ds_write_b32 v0, v2 offset:65532
; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_mov_b64 vcc, 0
; GFX9-NEXT:    v_not_b32_e32 v0, v0
; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_write_b32 v3, v4 offset:65532
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT:    v_not_b32_e32 v0, v0
; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:65532
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT:    global_store_dword v[0:1], v4, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    v_not_b32_e32 v0, v0
; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v2, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    ds_store_b32 v2, v3 offset:65532
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 -1, %x.i
  %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 16383
  store i32 123, i32 addrspace(3)* %arrayidx
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, float addrspace(1)* null
  ret void
}

; Byte store of 13 to the address (workitem.id.x << 4) + 65535, built with
; zext + inttoptr.
; Expected: only the shift is computed in a VGPR and 65535 is placed in the
; DS store's offset field.
define amdgpu_kernel void @add_x_shl_max_offset() #1 {
; CI-LABEL: add_x_shl_max_offset:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1 offset:65535
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_max_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:65535
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_max_offset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_max_offset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 4, v0
; GFX11-NEXT:    ds_store_b8 v0, v1 offset:65535
; GFX11-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %shl = shl i32 %x.i, 4
  %add = add i32 %shl, 65535
  %z = zext i32 %add to i64
  %ptr = inttoptr i64 %z to i8 addrspace(3)*
  store i8 13, i8 addrspace(3)* %ptr, align 1
  ret void
}

; This kernel could use the DS offset-folding transform, but the sub became an
; xor, so no offset is folded.

; Byte store of 13 to the address (workitem.id.x * -4) + 65535, built with
; zext + inttoptr.
; Expected: the address is formed as xor(x << 2, 0xffff) and the DS store
; carries no offset.
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    ds_store_b8 v0, v1
; GFX11-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %.neg = mul i32 %x.i, -4
  %add = add i32 %.neg, 65535
  %z = zext i32 %add to i64
  %ptr = inttoptr i64 %z to i8 addrspace(3)*
  store i8 13, i8 addrspace(3)* %ptr, align 1
  ret void
}

; This kernel could use the DS offset-folding transform, but the sub became an
; xor, so no offset is folded.

; Byte store of 13 to the address 65535 + ((0 - workitem.id.x) << 2), with the
; constant written as the first add operand.
; Expected: the address is formed as xor(x << 2, 0xffff) and the DS store
; carries no offset.
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    ds_store_b8 v0, v1
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 65535, %shl
  %ptr = inttoptr i32 %add to i8 addrspace(3)*
  store i8 13, i8 addrspace(3)* %ptr
  ret void
}

; Byte store of 13 to the address 65536 + ((0 - workitem.id.x) << 2).
; Expected: the constant stays in the subtract (0x10000 - (x << 2)) and the
; DS store carries no offset.
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x10000, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x10000, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x10000, v0
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x10000, v0
; GFX11-NEXT:    ds_store_b8 v0, v1
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 65536, %shl
  %ptr = inttoptr i32 %add to i8 addrspace(3)*
  store i8 13, i8 addrspace(3)* %ptr
  ret void
}

; Two volatile i32 stores of 13 whose addresses share (0 - workitem.id.x) << 2
; and differ only in the added constant (123 vs 456).
; Expected: one shared 0 - (x << 2) base register, with 123 and 456 as the DS
; store offsets.
define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_multi_use:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v1 offset:123
; CI-NEXT:    ds_write_b32 v0, v1 offset:456
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:456
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:456
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:123
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:456
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add0 = add i32 123, %shl
  %add1 = add i32 456, %shl
  %ptr0 = inttoptr i32 %add0 to i32 addrspace(3)*
  store volatile i32 13, i32 addrspace(3)* %ptr0
  %ptr1 = inttoptr i32 %add1 to i32 addrspace(3)*
  store volatile i32 13, i32 addrspace(3)* %ptr1
  ret void
}

; Two volatile i32 stores of 13 through the same pointer,
; 123 + ((0 - workitem.id.x) << 2).
; Expected: a single 0 - (x << 2) base register and offset 123 on both DS
; stores.
define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v1 offset:123
; CI-NEXT:    ds_write_b32 v0, v1 offset:123
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:123
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:123
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 123, %shl
  %ptr = inttoptr i32 %add to i32 addrspace(3)*
  store volatile i32 13, i32 addrspace(3)* %ptr
  store volatile i32 13, i32 addrspace(3)* %ptr
  ret void
}

; i64 store of 123 (align 4) to the address 1019 + ((0 - workitem.id.x) << 2).
; Expected: the bonaire, gfx900 and gfx1100 runs compute 0x3fb - (x << 2) and
; use one two-dword DS store with offset1:1; the gfx1010 run instead computes
; 0 - (x << 2) and issues two single-dword DS stores at offsets 1023 and 1019.
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fb, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:1023
; GFX10-NEXT:    ds_write_b32 v0, v2 offset:1019
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x3fb, v0
; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1019, %shl
  %ptr = inttoptr i32 %add to i64 addrspace(3)*
  store i64 123, i64 addrspace(3)* %ptr, align 4
  ret void
}

; i64 store of 123 (align 4) to the address 1019 + ((0 - workitem.id.x) << 2),
; followed by a call to llvm.amdgcn.div.fmas.f32 (condition operand
; `i1 false`) whose result is stored volatile to a null addrspace(1) pointer.
; Expected: the bonaire, gfx900 and gfx1100 runs compute 0x3fb - (x << 2) and
; use one two-dword DS store with offset1:1; the gfx1010 run computes
; 0 - (x << 2) and issues two single-dword DS stores at offsets 1023 and 1019.
; In every run vcc is set to 0 ahead of v_div_fmas_f32.
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT:    s_mov_b64 vcc, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s0
; CI-NEXT:    s_mov_b32 s0, 0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    v_mov_b32_e32 v3, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s1, s0
; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_mov_b64 vcc, 0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v3, 0x3fb, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v5, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_write2_b32 v3, v4, v5 offset1:1
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:1023
; GFX10-NEXT:    ds_write_b32 v2, v4 offset:1019
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
; GFX10-NEXT:    global_store_dword v[0:1], v5, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
; GFX11-NEXT:    v_mov_b32_e32 v4, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x3fb, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    ds_store_2addr_b32 v2, v3, v4 offset1:1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1019, %shl
  %ptr = inttoptr i32 %add to i64 addrspace(3)*
  store i64 123, i64 addrspace(3)* %ptr, align 4
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, float addrspace(1)* null
  ret void
}

; i64 store of 123 (align 4) to the address 1020 + ((0 - workitem.id.x) << 2).
; Expected: the bonaire, gfx900 and gfx1100 runs compute 0x3fc - (x << 2) and
; use one two-dword DS store with offset1:1; the gfx1010 run computes
; 0 - (x << 2), adds 0x200, and uses one two-dword DS store with
; offset0:127 offset1:128.
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fc, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fc, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT:    ds_write2_b32 v0, v2, v1 offset0:127 offset1:128
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x3fc, v0
; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1020, %shl
  %ptr = inttoptr i32 %add to i64 addrspace(3)*
  store i64 123, i64 addrspace(3)* %ptr, align 4
  ret void
}

; Intrinsic called by the *_clamp_bit kernels; every call in this file passes
; `i1 false` as the last operand.
declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)

; Attribute groups referenced as #N by the definitions, declarations and call
; sites in this file.
; NOTE(review): #0 includes readnone but is attached to kernels that store to
; memory (e.g. @write_ds_sub0_offset0_global) - confirm this is intended.
; NOTE(review): #2 is not referenced anywhere in the visible part of the file.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind convergent }