; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -passes=amdgpu-promote-kernel-arguments,infer-address-spaces | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=GCN %s

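; Three-level pointer chain rooted at a global pointer-to-pointer argument:
; both loaded pointers should be promoted to the global address space and
; both loads annotated with !amdgpu.noclobber.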
; GCN-LABEL: ptr_nest_3:
; GCN-COUNT-2: global_load_dwordx2
; GCN:         global_store_dword
define amdgpu_kernel void @ptr_nest_3(float** addrspace(1)* nocapture readonly %Arg) {
; CHECK-LABEL: @ptr_nest_3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]]
; CHECK-NEXT:    [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %p1 = getelementptr inbounds float**, float** addrspace(1)* %Arg, i32 %i
  %p2 = load float**, float** addrspace(1)* %p1, align 8
  %p3 = load float*, float** %p2, align 8
  store float 0.000000e+00, float* %p3, align 4
  ret void
}

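; The flat pointer argument itself is promotable; the bitcast of the
; promoted pointer should be rewritten in the global address space.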
; GCN-LABEL: ptr_bitcast:
; GCN: global_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @ptr_bitcast(float** nocapture readonly %Arg) {
; CHECK-LABEL: @ptr_bitcast(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I]]
; CHECK-NEXT:    [[P1_CAST:%.*]] = bitcast float* addrspace(1)* [[P1]] to i32* addrspace(1)*
; CHECK-NEXT:    [[P2:%.*]] = load i32*, i32* addrspace(1)* [[P1_CAST]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast i32* [[P2]] to i32 addrspace(1)*
; CHECK-NEXT:    store i32 0, i32 addrspace(1)* [[P2_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
  %p1.cast = bitcast float** %p1 to i32**
  %p2 = load i32*, i32** %p1.cast, align 8
  store i32 0, i32* %p2, align 4
  ret void
}

%struct.S = type { float* }

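; A pointer loaded from a struct field of a global argument should be
; promoted just like one loaded from a plain pointer argument.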
; GCN-LABEL: ptr_in_struct:
; GCN: s_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @ptr_in_struct(%struct.S addrspace(1)* nocapture readonly %Arg) {
; CHECK-LABEL: @ptr_in_struct(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], [[STRUCT_S]] addrspace(1)* [[ARG:%.*]], i64 0, i32 0
; CHECK-NEXT:    [[P1:%.*]] = load float*, float* addrspace(1)* [[P]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P1_GLOBAL:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)*
; CHECK-NEXT:    [[ID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float addrspace(1)* [[P1_GLOBAL]], i32 [[ID]]
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[ARRAYIDX]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p = getelementptr inbounds %struct.S, %struct.S addrspace(1)* %Arg, i64 0, i32 0
  %p1 = load float*, float* addrspace(1)* %p, align 8
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %arrayidx = getelementptr inbounds float, float* %p1, i32 %id
  store float 0.000000e+00, float* %arrayidx, align 4
  ret void
}

@LDS = internal unnamed_addr addrspace(3) global [4 x float] undef, align 16

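; Two flat pointer-to-pointer arguments: both should be cast to global and
; the pointers loaded through them promoted as well, while the LDS
; accesses stay in the local address space.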
; GCN-LABEL: flat_ptr_arg:
; GCN-COUNT-2: global_load_dwordx2
; GCN:         global_load_dwordx4
; GCN:         global_store_dword
define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg, float** nocapture noalias %Out, i32 %X) {
; CHECK-LABEL: @flat_ptr_arg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[OUT_GLOBAL:%.*]] = addrspacecast float** [[OUT:%.*]] to float* addrspace(1)*
; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
; CHECK-NEXT:    [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]]
; CHECK-NEXT:    store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1
; CHECK-NEXT:    [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[X]], 1
; CHECK-NEXT:    [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]]
; CHECK-NEXT:    store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4
; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2
; CHECK-NEXT:    [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[X]], 2
; CHECK-NEXT:    [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]]
; CHECK-NEXT:    store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4
; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3
; CHECK-NEXT:    [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4
; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[X]], 3
; CHECK-NEXT:    [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]]
; CHECK-NEXT:    store float [[I5]], float addrspace(3)* [[ARRAYIDX512_3]], align 4
; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
; CHECK-NEXT:    [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[OUT_GLOBAL]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[I7:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX11]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I7_GLOBAL:%.*]] = addrspacecast float* [[I7]] to float addrspace(1)*
; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I7_GLOBAL]], i64 [[IDXPROM8]]
; CHECK-NEXT:    store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idxprom = zext i32 %i to i64
  %arrayidx10 = getelementptr inbounds float*, float** %Arg, i64 %idxprom
  %i1 = load float*, float** %arrayidx10, align 8
  %i2 = load float, float* %i1, align 4
  %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
  store float %i2, float addrspace(3)* %arrayidx512, align 4
  %arrayidx3.1 = getelementptr inbounds float, float* %i1, i64 1
  %i3 = load float, float* %arrayidx3.1, align 4
  %add.1 = add nsw i32 %X, 1
  %arrayidx512.1 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.1
  store float %i3, float addrspace(3)* %arrayidx512.1, align 4
  %arrayidx3.2 = getelementptr inbounds float, float* %i1, i64 2
  %i4 = load float, float* %arrayidx3.2, align 4
  %add.2 = add nsw i32 %X, 2
  %arrayidx512.2 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.2
  store float %i4, float addrspace(3)* %arrayidx512.2, align 4
  %arrayidx3.3 = getelementptr inbounds float, float* %i1, i64 3
  %i5 = load float, float* %arrayidx3.3, align 4
  %add.3 = add nsw i32 %X, 3
  %arrayidx512.3 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.3
  store float %i5, float addrspace(3)* %arrayidx512.3, align 4
  %sub = add nsw i32 %X, -1
  %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
  %i6 = load float, float addrspace(3)* %arrayidx711, align 4
  %arrayidx11 = getelementptr inbounds float*, float** %Out, i64 %idxprom
  %i7 = load float*, float** %arrayidx11, align 8
  %idxprom8 = sext i32 %X to i64
  %arrayidx9 = getelementptr inbounds float, float* %i7, i64 %idxprom8
  store float %i6, float* %arrayidx9, align 4
  ret void
}

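; A pointer loaded from a global pointer argument is promoted; the first
; load of the pointer and the first load through it are expected to be
; marked !amdgpu.noclobber.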
; GCN-LABEL: global_ptr_arg:
; GCN: global_load_dwordx2
; GCN: global_load_dwordx4
; GCN: global_store_dword
define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
; CHECK-LABEL: @global_ptr_arg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
; CHECK-NEXT:    [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]]
; CHECK-NEXT:    store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1
; CHECK-NEXT:    [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[X]], 1
; CHECK-NEXT:    [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]]
; CHECK-NEXT:    store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4
; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2
; CHECK-NEXT:    [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[X]], 2
; CHECK-NEXT:    [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]]
; CHECK-NEXT:    store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4
; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3
; CHECK-NEXT:    [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4
; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[X]], 3
; CHECK-NEXT:    [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]]
; CHECK-NEXT:    store float [[I5]], float addrspace(3)* [[ARRAYIDX512_3]], align 4
; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
; CHECK-NEXT:    [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]]
; CHECK-NEXT:    store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idxprom = zext i32 %i to i64
  %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom
  %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8
  %i2 = load float, float* %i1, align 4
  %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
  store float %i2, float addrspace(3)* %arrayidx512, align 4
  %arrayidx3.1 = getelementptr inbounds float, float* %i1, i64 1
  %i3 = load float, float* %arrayidx3.1, align 4
  %add.1 = add nsw i32 %X, 1
  %arrayidx512.1 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.1
  store float %i3, float addrspace(3)* %arrayidx512.1, align 4
  %arrayidx3.2 = getelementptr inbounds float, float* %i1, i64 2
  %i4 = load float, float* %arrayidx3.2, align 4
  %add.2 = add nsw i32 %X, 2
  %arrayidx512.2 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.2
  store float %i4, float addrspace(3)* %arrayidx512.2, align 4
  %arrayidx3.3 = getelementptr inbounds float, float* %i1, i64 3
  %i5 = load float, float* %arrayidx3.3, align 4
  %add.3 = add nsw i32 %X, 3
  %arrayidx512.3 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.3
  store float %i5, float addrspace(3)* %arrayidx512.3, align 4
  %sub = add nsw i32 %X, -1
  %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
  %i6 = load float, float addrspace(3)* %arrayidx711, align 4
  %idxprom8 = sext i32 %X to i64
  %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8
  store float %i6, float* %arrayidx9, align 4
  ret void
}

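; The store through %arrayidx11 may clobber the pointer slot before it is
; loaded, so %i1 must not be promoted and its uses keep flat accesses.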
; GCN-LABEL: global_ptr_arg_clobbered:
; GCN: global_store_dwordx2
; GCN: global_load_dwordx2
; GCN: flat_load_dword
; GCN: flat_store_dword
define amdgpu_kernel void @global_ptr_arg_clobbered(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
; CHECK-LABEL: @global_ptr_arg_clobbered(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]]
; CHECK-NEXT:    store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4
; CHECK-NEXT:    [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8
; CHECK-NEXT:    [[I2:%.*]] = load float, float* [[I1]], align 4
; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]]
; CHECK-NEXT:    store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
; CHECK-NEXT:    [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[I1]], i64 [[IDXPROM8]]
; CHECK-NEXT:    store float [[I6]], float* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idxprom = zext i32 %i to i64
  %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom
  %arrayidx11 = getelementptr inbounds float*, float* addrspace(1)* %arrayidx10, i32 %X
  store float* null, float* addrspace(1)* %arrayidx11, align 4
  %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8
  %i2 = load float, float* %i1, align 4
  %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
  store float %i2, float addrspace(3)* %arrayidx512, align 4
  %sub = add nsw i32 %X, -1
  %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
  %i6 = load float, float addrspace(3)* %arrayidx711, align 4
  %idxprom8 = sext i32 %X to i64
  %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8
  store float %i6, float* %arrayidx9, align 4
  ret void
}

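; Here the clobbering store happens only after %i1 is loaded, so the load
; can still be marked !amdgpu.noclobber and the loaded pointer promoted.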
; GCN-LABEL: global_ptr_arg_clobbered_after_load:
; GCN: global_load_dwordx2
; GCN: global_store_dwordx2
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @global_ptr_arg_clobbered_after_load(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
; CHECK-LABEL: @global_ptr_arg_clobbered_after_load(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]]
; CHECK-NEXT:    store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4
; CHECK-NEXT:    [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4
; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]]
; CHECK-NEXT:    store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
; CHECK-NEXT:    [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]]
; CHECK-NEXT:    store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idxprom = zext i32 %i to i64
  %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom
  %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8
  %arrayidx11 = getelementptr inbounds float*, float* addrspace(1)* %arrayidx10, i32 %X
  store float* null, float* addrspace(1)* %arrayidx11, align 4
  %i2 = load float, float* %i1, align 4
  %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
  store float %i2, float addrspace(3)* %arrayidx512, align 4
  %sub = add nsw i32 %X, -1
  %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
  %i6 = load float, float addrspace(3)* %arrayidx711, align 4
  %idxprom8 = sext i32 %X to i64
  %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8
  store float %i6, float* %arrayidx9, align 4
  ret void
}

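; llvm.amdgcn.s.barrier does not write memory, so it is not expected to be
; treated as a clobber; the chain is promoted as in ptr_nest_3.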
; GCN-LABEL: ptr_nest_3_barrier:
; GCN-COUNT-2: global_load_dwordx2
; GCN:         global_store_dword
define amdgpu_kernel void @ptr_nest_3_barrier(float** addrspace(1)* nocapture readonly %Arg) {
; CHECK-LABEL: @ptr_nest_3_barrier(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]]
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %p1 = getelementptr inbounds float**, float** addrspace(1)* %Arg, i32 %i
  tail call void @llvm.amdgcn.s.barrier()
  %p2 = load float**, float** addrspace(1)* %p1, align 8
  %p3 = load float*, float** %p2, align 8
  store float 0.000000e+00, float* %p3, align 4
  ret void
}

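; Two-level chain rooted at a flat argument indexed by a scalar argument;
; both the argument and the loaded pointer should end up in addrspace(1).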
; GCN-LABEL: flat_ptr_nest_2:
; GCN: s_lshl_b64
; GCN: s_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @flat_ptr_nest_2(float** nocapture readonly %Arg, i32 %i) {
; CHECK-LABEL: @flat_ptr_nest_2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]]
; CHECK-NEXT:    [[P2:%.*]] = load float*, float* addrspace(1)* [[P1]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
  %p2 = load float*, float** %p1, align 8
  store float 0.000000e+00, float* %p2, align 4
  ret void
}

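; Pointers already in the constant address space need no promotion; only
; the finally loaded flat pointer should be cast to global.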
; GCN-LABEL: const_ptr_nest_3:
; GCN: s_lshl_b64
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @const_ptr_nest_3(float* addrspace(4)* addrspace(4)* nocapture readonly %Arg, i32 %i) {
; CHECK-LABEL: @const_ptr_nest_3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[ARG:%.*]], i32 [[I:%.*]]
; CHECK-NEXT:    [[P2:%.*]] = load float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[P1]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(4)* [[P2]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[TMP0]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* %Arg, i32 %i
  %p2 = load float* addrspace(4)*, float* addrspace(4)* addrspace(4)* %p1, align 8
  %p3 = load float*, float* addrspace(4)* %p2, align 8
  store float 0.000000e+00, float* %p3, align 4
  ret void
}

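; Addrspacecasts from constant to flat inside the chain should be folded
; away so the loads stay in the constant address space.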
; GCN-LABEL: cast_from_const_const_ptr_nest_3:
; GCN: s_lshl_b64
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @cast_from_const_const_ptr_nest_3(float* addrspace(4)* addrspace(4)* nocapture readonly %Arg, i32 %i) {
; CHECK-LABEL: @cast_from_const_const_ptr_nest_3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[ARG:%.*]], i32 [[I:%.*]]
; CHECK-NEXT:    [[P2:%.*]] = load float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[P1]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(4)* [[P2]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* %Arg, i32 %i
  %a1 = addrspacecast float* addrspace(4)* addrspace(4)* %p1 to float* addrspace(4)**
  %p2 = load float* addrspace(4)*, float* addrspace(4)** %a1, align 8
  %a2 = addrspacecast float* addrspace(4)* %p2 to float**
  %p3 = load float*, float** %a2, align 8
  store float 0.000000e+00, float* %p3, align 4
  ret void
}

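; A volatile load must not be marked !amdgpu.noclobber and is performed
; through a flat pointer again, though the loaded pointer itself can
; still be promoted.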
; GCN-LABEL: flat_ptr_volatile_load:
; GCN: s_lshl_b64
; GCN: flat_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @flat_ptr_volatile_load(float** nocapture readonly %Arg, i32 %i) {
; CHECK-LABEL: @flat_ptr_volatile_load(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]]
; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast float* addrspace(1)* [[P1]] to float**
; CHECK-NEXT:    [[P2:%.*]] = load volatile float*, float** [[TMP0]], align 8
; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
  %p2 = load volatile float*, float** %p1, align 8
  store float 0.000000e+00, float* %p2, align 4
  ret void
}

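; An atomic load can use the promoted global pointer but must not be
; marked !amdgpu.noclobber.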
; GCN-LABEL: flat_ptr_atomic_load:
; GCN: s_lshl_b64
; GCN: global_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @flat_ptr_atomic_load(float** nocapture readonly %Arg, i32 %i) {
; CHECK-LABEL: @flat_ptr_atomic_load(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]]
; CHECK-NEXT:    [[P2:%.*]] = load atomic float*, float* addrspace(1)* [[P1]] monotonic, align 8
; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
  %p2 = load atomic float*, float** %p1 monotonic, align 8
  store float 0.000000e+00, float* %p2, align 4
  ret void
}

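; An addrspacecast that also changes the pointee type should be rewritten
; as a bitcast in the promoted address space, keeping the chain promotable.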
; GCN-LABEL: cast_changing_pointee_type:
; GCN: s_lshl_b64
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @cast_changing_pointee_type(float* addrspace(1)* addrspace(1)* nocapture readonly %Arg, i32 %i) {
; CHECK-LABEL: @cast_changing_pointee_type(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float* addrspace(1)*, float* addrspace(1)* addrspace(1)* [[ARG:%.*]], i32 [[I:%.*]]
; CHECK-NEXT:    [[A1:%.*]] = bitcast float* addrspace(1)* addrspace(1)* [[P1]] to i32* addrspace(1)* addrspace(1)*
; CHECK-NEXT:    [[P2:%.*]] = load i32* addrspace(1)*, i32* addrspace(1)* addrspace(1)* [[A1]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[A2:%.*]] = bitcast i32* addrspace(1)* [[P2]] to float* addrspace(1)*
; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(1)* [[A2]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = getelementptr inbounds float* addrspace(1)*, float* addrspace(1)* addrspace(1)* %Arg, i32 %i
  %a1 = addrspacecast float* addrspace(1)* addrspace(1)* %p1 to i32* addrspace(1)**
  %p2 = load i32* addrspace(1)*, i32* addrspace(1)** %a1, align 8
  %a2 = addrspacecast i32* addrspace(1)* %p2 to float**
  %p3 = load float*, float** %a2, align 8
  store float 0.000000e+00, float* %p3, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.s.barrier()