; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -passes=amdgpu-promote-kernel-arguments,infer-address-spaces | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=GCN %s

; Three levels of pointer indirection rooted at a global (addrspace(1))
; pointer-to-pointer kernel argument: both dependent pointer loads should be
; promoted to the global address space (via addrspacecast of the loaded flat
; pointer) and tagged !amdgpu.noclobber.
; GCN-LABEL: ptr_nest_3:
; GCN-COUNT-2: global_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @ptr_nest_3(float** addrspace(1)* nocapture readonly %Arg) {
; CHECK-LABEL: @ptr_nest_3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]]
; CHECK-NEXT:    [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %p1 = getelementptr inbounds float**, float** addrspace(1)* %Arg, i32 %i
  %p2 = load float**, float** addrspace(1)* %p1, align 8
  %p3 = load float*, float** %p2, align 8
  store float 0.000000e+00, float* %p3, align 4
  ret void
}

; A flat pointer kernel argument is itself promoted (addrspacecast to
; addrspace(1)), and the promotion looks through a bitcast that changes the
; pointee type (float** -> i32**) before the dependent load.
; GCN-LABEL: ptr_bitcast:
; GCN: global_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @ptr_bitcast(float** nocapture readonly %Arg) {
; CHECK-LABEL: @ptr_bitcast(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I]]
; CHECK-NEXT:    [[P1_CAST:%.*]] = bitcast float* addrspace(1)* [[P1]] to i32* addrspace(1)*
; CHECK-NEXT:    [[P2:%.*]] = load i32*, i32* addrspace(1)* [[P1_CAST]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast i32* [[P2]] to i32 addrspace(1)*
; CHECK-NEXT:    store i32 0, i32 addrspace(1)* [[P2_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
  %p1.cast = bitcast float** %p1 to i32**
  %p2 = load i32*, i32** %p1.cast, align 8
  store i32 0, i32* %p2, align 4
  ret void
}

%struct.S = type { float* }

; A flat pointer loaded from a struct field of a global-pointer argument is
; promoted to addrspace(1) and the load is tagged !amdgpu.noclobber.
; GCN-LABEL: ptr_in_struct:
; GCN: s_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @ptr_in_struct(%struct.S addrspace(1)* nocapture readonly %Arg) {
; CHECK-LABEL: @ptr_in_struct(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], [[STRUCT_S]] addrspace(1)* [[ARG:%.*]], i64 0, i32 0
; CHECK-NEXT:    [[P1:%.*]] = load float*, float* addrspace(1)* [[P]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P1_GLOBAL:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)*
; CHECK-NEXT:    [[ID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float addrspace(1)* [[P1_GLOBAL]], i32 [[ID]]
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[ARRAYIDX]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p = getelementptr inbounds %struct.S, %struct.S addrspace(1)* %Arg, i64 0, i32 0
  %p1 = load float*, float* addrspace(1)* %p, align 8
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %arrayidx = getelementptr inbounds float, float* %p1, i32 %id
  store float 0.000000e+00, float* %arrayidx, align 4
  ret void
}

@LDS = internal unnamed_addr addrspace(3) global [4 x float] undef, align 16

; Two flat pointer arguments (noalias) are both promoted to addrspace(1); the
; pointers loaded through them are promoted as well, while the LDS
; (addrspace(3)) accesses are left untouched.
; GCN-LABEL: flat_ptr_arg:
; GCN-COUNT-2: global_load_dwordx2
; GCN: global_load_dwordx4
; GCN: global_store_dword
define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg, float** nocapture noalias %Out, i32 %X) {
; CHECK-LABEL: @flat_ptr_arg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[OUT_GLOBAL:%.*]] = addrspacecast float** [[OUT:%.*]] to float* addrspace(1)*
; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
; CHECK-NEXT:    [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]]
; CHECK-NEXT:    store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1
; CHECK-NEXT:    [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[X]], 1
; CHECK-NEXT:    [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]]
; CHECK-NEXT:    store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4
; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2
; CHECK-NEXT:    [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[X]], 2
; CHECK-NEXT:    [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]]
; CHECK-NEXT:    store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4
; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3
; CHECK-NEXT:    [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4
; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[X]], 3
; CHECK-NEXT:    [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]]
; CHECK-NEXT:    store float [[I5]], float addrspace(3)* [[ARRAYIDX512_3]], align 4
; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
; CHECK-NEXT:    [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[OUT_GLOBAL]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[I7:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX11]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I7_GLOBAL:%.*]] = addrspacecast float* [[I7]] to float addrspace(1)*
; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I7_GLOBAL]], i64 [[IDXPROM8]]
; CHECK-NEXT:    store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idxprom = zext i32 %i to i64
  %arrayidx10 = getelementptr inbounds float*, float** %Arg, i64 %idxprom
  %i1 = load float*, float** %arrayidx10, align 8
  %i2 = load float, float* %i1, align 4
  %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
  store float %i2, float addrspace(3)* %arrayidx512, align 4
  %arrayidx3.1 = getelementptr inbounds float, float* %i1, i64 1
  %i3 = load float, float* %arrayidx3.1, align 4
  %add.1 = add nsw i32 %X, 1
  %arrayidx512.1 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.1
  store float %i3, float addrspace(3)* %arrayidx512.1, align 4
  %arrayidx3.2 = getelementptr inbounds float, float* %i1, i64 2
  %i4 = load float, float* %arrayidx3.2, align 4
  %add.2 = add nsw i32 %X, 2
  %arrayidx512.2 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.2
  store float %i4, float addrspace(3)* %arrayidx512.2, align 4
  %arrayidx3.3 = getelementptr inbounds float, float* %i1, i64 3
  %i5 = load float, float* %arrayidx3.3, align 4
  %add.3 = add nsw i32 %X, 3
  %arrayidx512.3 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.3
  store float %i5, float addrspace(3)* %arrayidx512.3, align 4
  %sub = add nsw i32 %X, -1
  %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
  %i6 = load float, float addrspace(3)* %arrayidx711, align 4
  %arrayidx11 = getelementptr inbounds float*, float** %Out, i64 %idxprom
  %i7 = load float*, float** %arrayidx11, align 8
  %idxprom8 = sext i32 %X to i64
  %arrayidx9 = getelementptr inbounds float, float* %i7, i64 %idxprom8
  store float %i6, float* %arrayidx9, align 4
  ret void
}

; Same shape as flat_ptr_arg but the kernel argument is already a global
; pointer; only the loaded flat pointer needs promotion.
; GCN-LABEL: global_ptr_arg:
; GCN: global_load_dwordx2
; GCN: global_load_dwordx4
; GCN: global_store_dword
define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
; CHECK-LABEL: @global_ptr_arg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
; CHECK-NEXT:    [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]]
; CHECK-NEXT:    store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1
; CHECK-NEXT:    [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[X]], 1
; CHECK-NEXT:    [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]]
; CHECK-NEXT:    store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4
; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2
; CHECK-NEXT:    [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[X]], 2
; CHECK-NEXT:    [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]]
; CHECK-NEXT:    store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4
; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3
; CHECK-NEXT:    [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4
; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[X]], 3
; CHECK-NEXT:    [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]]
; CHECK-NEXT:    store float [[I5]], float addrspace(3)* [[ARRAYIDX512_3]], align 4
; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
; CHECK-NEXT:    [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]]
; CHECK-NEXT:    store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idxprom = zext i32 %i to i64
  %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom
  %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8
  %i2 = load float, float* %i1, align 4
  %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
  store float %i2, float addrspace(3)* %arrayidx512, align 4
  %arrayidx3.1 = getelementptr inbounds float, float* %i1, i64 1
  %i3 = load float, float* %arrayidx3.1, align 4
  %add.1 = add nsw i32 %X, 1
  %arrayidx512.1 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.1
  store float %i3, float addrspace(3)* %arrayidx512.1, align 4
  %arrayidx3.2 = getelementptr inbounds float, float* %i1, i64 2
  %i4 = load float, float* %arrayidx3.2, align 4
  %add.2 = add nsw i32 %X, 2
  %arrayidx512.2 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.2
  store float %i4, float addrspace(3)* %arrayidx512.2, align 4
  %arrayidx3.3 = getelementptr inbounds float, float* %i1, i64 3
  %i5 = load float, float* %arrayidx3.3, align 4
  %add.3 = add nsw i32 %X, 3
  %arrayidx512.3 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.3
  store float %i5, float addrspace(3)* %arrayidx512.3, align 4
  %sub = add nsw i32 %X, -1
  %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
  %i6 = load float, float addrspace(3)* %arrayidx711, align 4
  %idxprom8 = sext i32 %X to i64
  %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8
  store float %i6, float* %arrayidx9, align 4
  ret void
}

; Negative test: a store into the pointer array happens BEFORE the pointer is
; loaded, so the load may be clobbered — it keeps its flat uses and gets no
; !amdgpu.noclobber annotation.
; GCN-LABEL: global_ptr_arg_clobbered:
; GCN: global_store_dwordx2
; GCN: global_load_dwordx2
; GCN: flat_load_dword
; GCN: flat_store_dword
define amdgpu_kernel void @global_ptr_arg_clobbered(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
; CHECK-LABEL: @global_ptr_arg_clobbered(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]]
; CHECK-NEXT:    store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4
; CHECK-NEXT:    [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8
; CHECK-NEXT:    [[I2:%.*]] = load float, float* [[I1]], align 4
; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]]
; CHECK-NEXT:    store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
; CHECK-NEXT:    [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[I1]], i64 [[IDXPROM8]]
; CHECK-NEXT:    store float [[I6]], float* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idxprom = zext i32 %i to i64
  %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom
  %arrayidx11 = getelementptr inbounds float*, float* addrspace(1)* %arrayidx10, i32 %X
  store float* null, float* addrspace(1)* %arrayidx11, align 4
  %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8
  %i2 = load float, float* %i1, align 4
  %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
  store float %i2, float addrspace(3)* %arrayidx512, align 4
  %sub = add nsw i32 %X, -1
  %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
  %i6 = load float, float addrspace(3)* %arrayidx711, align 4
  %idxprom8 = sext i32 %X to i64
  %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8
  store float %i6, float* %arrayidx9, align 4
  ret void
}

; The clobbering store happens only AFTER the pointer load, so the load is
; still !amdgpu.noclobber and the loaded pointer is still promoted.
; GCN-LABEL: global_ptr_arg_clobbered_after_load:
; GCN: global_load_dwordx2
; GCN: global_store_dwordx2
; GCN: global_load_dword
; GCN: global_store_dword
define amdgpu_kernel void @global_ptr_arg_clobbered_after_load(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
; CHECK-LABEL: @global_ptr_arg_clobbered_after_load(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]]
; CHECK-NEXT:    store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4
; CHECK-NEXT:    [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4
; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]]
; CHECK-NEXT:    store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
; CHECK-NEXT:    [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]]
; CHECK-NEXT:    store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idxprom = zext i32 %i to i64
  %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom
  %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8
  %arrayidx11 = getelementptr inbounds float*, float* addrspace(1)* %arrayidx10, i32 %X
  store float* null, float* addrspace(1)* %arrayidx11, align 4
  %i2 = load float, float* %i1, align 4
  %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
  store float %i2, float addrspace(3)* %arrayidx512, align 4
  %sub = add nsw i32 %X, -1
  %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
  %i6 = load float, float addrspace(3)* %arrayidx711, align 4
  %idxprom8 = sext i32 %X to i64
  %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8
  store float %i6, float* %arrayidx9, align 4
  ret void
}

; Same as ptr_nest_3 but with an s_barrier before the first load; the barrier
; does not prevent promotion or the !amdgpu.noclobber annotation.
; GCN-LABEL: ptr_nest_3_barrier:
; GCN-COUNT-2: global_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @ptr_nest_3_barrier(float** addrspace(1)* nocapture readonly %Arg) {
; CHECK-LABEL: @ptr_nest_3_barrier(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]]
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT:    [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %p1 = getelementptr inbounds float**, float** addrspace(1)* %Arg, i32 %i
  tail call void @llvm.amdgcn.s.barrier()
  %p2 = load float**, float** addrspace(1)* %p1, align 8
  %p3 = load float*, float** %p2, align 8
  store float 0.000000e+00, float* %p3, align 4
  ret void
}

; Two-level indirection through a flat pointer argument: the argument itself
; is promoted via addrspacecast and the loaded pointer follows.
; GCN-LABEL: flat_ptr_nest_2:
; GCN: s_lshl_b64
; GCN: s_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @flat_ptr_nest_2(float** nocapture readonly %Arg, i32 %i) {
; CHECK-LABEL: @flat_ptr_nest_2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]]
; CHECK-NEXT:    [[P2:%.*]] = load float*, float* addrspace(1)* [[P1]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
  %p2 = load float*, float** %p1, align 8
  store float 0.000000e+00, float* %p2, align 4
  ret void
}

; Constant (addrspace(4)) pointer chain: the addrspace(4) loads are kept as-is
; but still marked !amdgpu.noclobber, and only the final flat pointee is cast
; to the global address space.
; GCN-LABEL: const_ptr_nest_3:
; GCN: s_lshl_b64
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @const_ptr_nest_3(float* addrspace(4)* addrspace(4)* nocapture readonly %Arg, i32 %i) {
; CHECK-LABEL: @const_ptr_nest_3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[ARG:%.*]], i32 [[I:%.*]]
; CHECK-NEXT:    [[P2:%.*]] = load float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[P1]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(4)* [[P2]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[TMP0]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* %Arg, i32 %i
  %p2 = load float* addrspace(4)*, float* addrspace(4)* addrspace(4)* %p1, align 8
  %p3 = load float*, float* addrspace(4)* %p2, align 8
  store float 0.000000e+00, float* %p3, align 4
  ret void
}

; Like const_ptr_nest_3 but the input IR casts each constant pointer to flat
; before loading; the casts are eliminated and the loads happen directly from
; addrspace(4).
; GCN-LABEL: cast_from_const_const_ptr_nest_3:
; GCN: s_lshl_b64
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @cast_from_const_const_ptr_nest_3(float* addrspace(4)* addrspace(4)* nocapture readonly %Arg, i32 %i) {
; CHECK-LABEL: @cast_from_const_const_ptr_nest_3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[ARG:%.*]], i32 [[I:%.*]]
; CHECK-NEXT:    [[P2:%.*]] = load float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[P1]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(4)* [[P2]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* %Arg, i32 %i
  %a1 = addrspacecast float* addrspace(4)* addrspace(4)* %p1 to float* addrspace(4)**
  %p2 = load float* addrspace(4)*, float* addrspace(4)** %a1, align 8
  %a2 = addrspacecast float* addrspace(4)* %p2 to float**
  %p3 = load float*, float** %a2, align 8
  store float 0.000000e+00, float* %p3, align 4
  ret void
}

; Negative test for the load itself: a volatile load is not promoted — the
; address is cast back to a flat pointer so the volatile flat load is kept.
; GCN-LABEL: flat_ptr_volatile_load:
; GCN: s_lshl_b64
; GCN: flat_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @flat_ptr_volatile_load(float** nocapture readonly %Arg, i32 %i) {
; CHECK-LABEL: @flat_ptr_volatile_load(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]]
; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast float* addrspace(1)* [[P1]] to float**
; CHECK-NEXT:    [[P2:%.*]] = load volatile float*, float** [[TMP0]], align 8
; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
  %p2 = load volatile float*, float** %p1, align 8
  store float 0.000000e+00, float* %p2, align 4
  ret void
}

; An atomic (monotonic) load, unlike a volatile one, is still promoted to a
; global-address-space load, though without !amdgpu.noclobber.
; GCN-LABEL: flat_ptr_atomic_load:
; GCN: s_lshl_b64
; GCN: global_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @flat_ptr_atomic_load(float** nocapture readonly %Arg, i32 %i) {
; CHECK-LABEL: @flat_ptr_atomic_load(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]]
; CHECK-NEXT:    [[P2:%.*]] = load atomic float*, float* addrspace(1)* [[P1]] monotonic, align 8
; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
  %p2 = load atomic float*, float** %p1 monotonic, align 8
  store float 0.000000e+00, float* %p2, align 4
  ret void
}

; An addrspacecast that also changes the pointee type (float* -> i32*) is
; handled by splitting out a bitcast in the promoted, global-address-space
; pointer chain.
; GCN-LABEL: cast_changing_pointee_type:
; GCN: s_lshl_b64
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: global_store_dword
define amdgpu_kernel void @cast_changing_pointee_type(float* addrspace(1)* addrspace(1)* nocapture readonly %Arg, i32 %i) {
; CHECK-LABEL: @cast_changing_pointee_type(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float* addrspace(1)*, float* addrspace(1)* addrspace(1)* [[ARG:%.*]], i32 [[I:%.*]]
; CHECK-NEXT:    [[A1:%.*]] = bitcast float* addrspace(1)* addrspace(1)* [[P1]] to i32* addrspace(1)* addrspace(1)*
; CHECK-NEXT:    [[P2:%.*]] = load i32* addrspace(1)*, i32* addrspace(1)* addrspace(1)* [[A1]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[A2:%.*]] = bitcast i32* addrspace(1)* [[P2]] to float* addrspace(1)*
; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(1)* [[A2]], align 8, !amdgpu.noclobber !0
; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %p1 = getelementptr inbounds float* addrspace(1)*, float* addrspace(1)* addrspace(1)* %Arg, i32 %i
  %a1 = addrspacecast float* addrspace(1)* addrspace(1)* %p1 to i32* addrspace(1)**
  %p2 = load i32* addrspace(1)*, i32* addrspace(1)** %a1, align 8
  %a2 = addrspacecast i32* addrspace(1)* %p2 to float**
  %p3 = load float*, float** %a2, align 8
  store float 0.000000e+00, float* %p3, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.s.barrier()