; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
5
; Workitem-id intrinsic; every kernel visible in this file derives its
; addrspace(3) store address from the value it returns.
declare i32 @llvm.amdgcn.workitem.id.x() #0

; 256 x i32 object in addrspace(3), indexed by the write_ds_* kernels.
@lds.obj = addrspace(3) global [256 x i32] undef, align 4
9
; Stores 123 to element (3 - workitem.id.x) of @lds.obj.
;
; The index is a negated workitem id followed by a constant step of three i32
; elements (12 bytes). The assertions expect that constant to end up in the
; ds_write_b32 offset field (offset:12), leaving only the shift-by-2 and the
; subtract-from-zero in the address register. The bonaire output also writes
; -1 to m0 before the DS instruction.
;
; NOTE(review): attribute group #0 is `nounwind readnone` although this kernel
; stores to memory. It is left unchanged because the assertions were generated
; from this exact input - confirm that the attribute is intentional.
define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
; CI-LABEL: write_ds_sub0_offset0_global:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v1 offset:12
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write_ds_sub0_offset0_global:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:12
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: write_ds_sub0_offset0_global:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:12
; GFX10-NEXT:    s_endpgm
entry:
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  ; Element index is the negated workitem id.
  %sub1 = sub i32 0, %x.i
  %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
  ; Constant step of 3 elements; shows up as offset:12 in the assertions.
  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
  store i32 123, i32 addrspace(3)* %arrayidx
  ret void
}
43
; Same addrspace(3) store as @write_ds_sub0_offset0_global (123 written to
; element 3 - workitem.id.x of @lds.obj, expected as ds_write_b32 offset:12),
; followed by a div.fmas on %dummy.val whose result is stored volatile to
; global address null.
;
; The assertions show vcc being set to 0 ahead of v_div_fmas_f32 on all three
; targets; on bonaire the address subtract (v_sub_i32_e32 ..., vcc, ...) is
; emitted before that vcc write.
; NOTE(review): presumably the div.fmas is here to keep a vcc consumer live
; next to the address subtract - confirm against the change that added it.
;
; NOTE(review): attribute group #0 is `nounwind readnone` although this kernel
; stores to memory. It is left unchanged because the assertions were generated
; from this exact input - confirm that the attribute is intentional.
define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 {
; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    s_mov_b64 vcc, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s0
; CI-NEXT:    s_mov_b32 s0, 0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s1, s0
; CI-NEXT:    ds_write_b32 v0, v2 offset:12
; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_mov_b64 vcc, 0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_write_b32 v3, v4 offset:12
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:12
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT:    global_store_dword v[0:1], v4, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
entry:
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  ; Element index is the negated workitem id.
  %sub1 = sub i32 0, %x.i
  %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
  ; Constant step of 3 elements; shows up as offset:12 in the assertions.
  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
  store i32 123, i32 addrspace(3)* %arrayidx
  ; div.fmas with a constant-false i1 operand; the volatile store keeps its result live.
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, float addrspace(1)* null
  ret void
}
107
; Stores 123 to element (16383 + (-1 - workitem.id.x)) of @lds.obj, then runs
; a div.fmas on %dummy.val and stores the result volatile to global address
; null.
;
; The assertions expect `-1 - x` as a single v_not_b32, a shift-by-2, and the
; constant 16383-element step in the ds_write_b32 offset field
; (offset:65532 = 16383 * 4). vcc is set to 0 ahead of v_div_fmas_f32 on all
; three targets.
;
; NOTE(review): attribute group #0 is `nounwind readnone` although this kernel
; stores to memory. It is left unchanged because the assertions were generated
; from this exact input - confirm that the attribute is intentional.
define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #0 {
; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
; CI-NEXT:    s_mov_b64 vcc, 0
; CI-NEXT:    v_not_b32_e32 v0, v0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT:    s_mov_b32 s0, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s1, s0
; CI-NEXT:    ds_write_b32 v0, v2 offset:65532
; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_mov_b64 vcc, 0
; GFX9-NEXT:    v_not_b32_e32 v0, v0
; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_write_b32 v3, v4 offset:65532
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT:    v_not_b32_e32 v0, v0
; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:65532
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT:    global_store_dword v[0:1], v4, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  ; -1 - x, which the assertions expect as a bitwise NOT.
  %sub1 = sub i32 -1, %x.i
  %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
  ; Constant step of 16383 elements; shows up as offset:65532 in the assertions.
  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 16383
  store i32 123, i32 addrspace(3)* %arrayidx
  ; div.fmas with a constant-false i1 operand; the volatile store keeps its result live.
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, float addrspace(1)* null
  ret void
}
170
; Stores the byte 13 to addrspace(3) address (workitem.id.x << 4) + 65535.
;
; The address is computed in i32, zero-extended to i64 and then converted with
; inttoptr. The assertions expect the shift alone in the address register and
; the constant 65535 in the ds_write_b8 offset field on all three targets.
define amdgpu_kernel void @add_x_shl_max_offset() #1 {
; CI-LABEL: add_x_shl_max_offset:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1 offset:65535
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_max_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:65535
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_max_offset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %shl = shl i32 %x.i, 4
  ; Constant part of the address; expected as offset:65535.
  %add = add i32 %shl, 65535
  ; The pointer is formed through a 64-bit integer rather than directly from i32.
  %z = zext i32 %add to i64
  %ptr = inttoptr i64 %z to i8 addrspace(3)*
  store i8 13, i8 addrspace(3)* %ptr, align 1
  ret void
}
201
; Stores the byte 13 to addrspace(3) address (workitem.id.x * -4) + 65535.
;
; Here the negated, scaled id is written as a single multiply by -4. The
; assertions expect a shift-by-2 plus a subtract-from-zero for the variable
; part and the constant 65535 in the ds_write_b8 offset field.
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1 offset:65535
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:65535
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; Negation and scaling by 4 folded into one multiply.
  %.neg = mul i32 %x.i, -4
  ; Constant part of the address; expected as offset:65535.
  %add = add i32 %.neg, 65535
  %z = zext i32 %add to i64
  %ptr = inttoptr i64 %z to i8 addrspace(3)*
  store i8 13, i8 addrspace(3)* %ptr, align 1
  ret void
}
235
; Stores the byte 13 to addrspace(3) address 65535 + ((0 - workitem.id.x) << 2).
;
; The input spells the address as an explicit negate, then a shift, then an
; add with the constant as the left-hand operand. The expected output matches
; @add_x_shl_neg_to_sub_max_offset_alt: shift-by-2, subtract-from-zero, and
; ds_write_b8 with offset:65535.
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1 offset:65535
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:65535
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  ; Constant is the first operand of the add.
  %add = add i32 65535, %shl
  %ptr = inttoptr i32 %add to i8 addrspace(3)*
  store i8 13, i8 addrspace(3)* %ptr
  ret void
}
269
; Stores the byte 13 to addrspace(3) address 65536 + ((0 - workitem.id.x) << 2).
;
; The constant is one more than the 65535 used by the *_max_offset kernels.
; The assertions expect it as the literal 0x10000 in the subtract, with a
; ds_write_b8 that carries no offset field on any of the three targets.
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x10000, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x10000, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x10000, v0
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  ; 65536 == 0x10000; expected in the subtract rather than as a DS offset.
  %add = add i32 65536, %shl
  %ptr = inttoptr i32 %add to i8 addrspace(3)*
  store i8 13, i8 addrspace(3)* %ptr
  ret void
}
303
; Performs two volatile i32 stores of 13 to addrspace(3) addresses that share
; the variable part (0 - workitem.id.x) << 2 and differ only in the added
; constant (123 and 456).
;
; The assertions expect the shared part to be computed once (a shift and a
; subtract-from-zero) and each constant to appear as the offset field of its
; own ds_write_b32.
define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_multi_use:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v1 offset:123
; CI-NEXT:    ds_write_b32 v0, v1 offset:456
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:456
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:456
; GFX10-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  ; %shl feeds both adds below.
  %shl = shl i32 %neg, 2
  %add0 = add i32 123, %shl
  %add1 = add i32 456, %shl
  %ptr0 = inttoptr i32 %add0 to i32 addrspace(3)*
  store volatile i32 13, i32 addrspace(3)* %ptr0
  %ptr1 = inttoptr i32 %add1 to i32 addrspace(3)*
  store volatile i32 13, i32 addrspace(3)* %ptr1
  ret void
}
343
; Performs two volatile i32 stores of 13 through the same addrspace(3)
; pointer, 123 + ((0 - workitem.id.x) << 2).
;
; The assertions expect one shift and one subtract-from-zero, followed by two
; ds_write_b32 instructions that both carry offset:123.
define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v1 offset:123
; CI-NEXT:    ds_write_b32 v0, v1 offset:123
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX10-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 123, %shl
  %ptr = inttoptr i32 %add to i32 addrspace(3)*
  ; Both stores are volatile, so both are expected in the output.
  store volatile i32 13, i32 addrspace(3)* %ptr
  store volatile i32 13, i32 addrspace(3)* %ptr
  ret void
}
381
; Stores the i64 value 123 with 4-byte alignment to addrspace(3) address
; 1019 + ((0 - workitem.id.x) << 2).
;
; Expected output differs by target:
;  - bonaire and gfx900: the constant stays in the subtract (0x3fb == 1019)
;    and the store is one ds_write2_b32 with offset1:1.
;  - gfx1010: the subtract is from zero and the store becomes two
;    ds_write_b32 instructions, the zero high dword at offset:1023 and the
;    0x7b low dword at offset:1019.
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fb, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:1023
; GFX10-NEXT:    ds_write_b32 v0, v2 offset:1019
; GFX10-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  ; 1019 == 0x3fb.
  %add = add i32 1019, %shl
  %ptr = inttoptr i32 %add to i64 addrspace(3)*
  ; 8-byte store with only 4-byte alignment.
  store i64 123, i64 addrspace(3)* %ptr, align 4
  ret void
}
419
; Same 4-byte-aligned i64 store of 123 to addrspace(3) address
; 1019 + ((0 - workitem.id.x) << 2) as
; @add_x_shl_neg_to_sub_misaligned_i64_max_offset, followed by a div.fmas on
; %dummy.val whose result is stored volatile to global address null.
;
; The expected DS instructions match the kernel without div.fmas
; (ds_write2_b32 offset1:1 with 0x3fb in the subtract on bonaire and gfx900;
; two ds_write_b32 at offsets 1023 and 1019 on gfx1010). In addition, vcc is
; set to 0 ahead of v_div_fmas_f32 on all three targets.
; NOTE(review): presumably the div.fmas is here to keep a vcc consumer live
; next to the address subtract - confirm against the change that added it.
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT:    s_mov_b64 vcc, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s0
; CI-NEXT:    s_mov_b32 s0, 0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    v_mov_b32_e32 v3, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s1, s0
; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_mov_b64 vcc, 0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v3, 0x3fb, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v5, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_write2_b32 v3, v4, v5 offset1:1
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:1023
; GFX10-NEXT:    ds_write_b32 v2, v4 offset:1019
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
; GFX10-NEXT:    global_store_dword v[0:1], v5, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  ; 1019 == 0x3fb.
  %add = add i32 1019, %shl
  %ptr = inttoptr i32 %add to i64 addrspace(3)*
  ; 8-byte store with only 4-byte alignment.
  store i64 123, i64 addrspace(3)* %ptr, align 4
  ; div.fmas with a constant-false i1 operand; the volatile store keeps its result live.
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, float addrspace(1)* null
  ret void
}
487
; Stores the i64 value 123 with 4-byte alignment to addrspace(3) address
; 1020 + ((0 - workitem.id.x) << 2), one byte past the 1019 used by
; @add_x_shl_neg_to_sub_misaligned_i64_max_offset.
;
; Expected output differs by target:
;  - bonaire and gfx900: the constant stays in the subtract (0x3fc == 1020)
;    and the store is one ds_write2_b32 with offset1:1.
;  - gfx1010: subtract from zero, then add 0x200, then one ds_write2_b32 with
;    offset0:127 offset1:128 (0x200 + 127 * 4 == 1020).
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fc, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fc, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT:    ds_write2_b32 v0, v2, v1 offset0:127 offset1:128
; GFX10-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  ; 1020 == 0x3fc.
  %add = add i32 1020, %shl
  %ptr = inttoptr i32 %add to i64 addrspace(3)*
  ; 8-byte store with only 4-byte alignment.
  store i64 123, i64 addrspace(3)* %ptr, align 4
  ret void
}
525
; Intrinsic called by the *_clamp_bit kernels: three float operands plus an i1.
declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)

; Attribute groups referenced by the definitions and call sites in this file.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
; NOTE(review): #2 has no user in the visible part of this file.
attributes #2 = { nounwind convergent }
531