; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-lower-kernel-attributes -instcombine %s | FileCheck -enable-var-scope %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope %s

target datalayout = "n32"

; Check that the amdgpu-lower-kernel-attributes + instcombine pipeline folds
; loads of the workgroup and grid sizes from the dispatch packet into constants
; when they are known from !reqd_work_group_size and uniform-work-group-size.
; Dispatch packet byte offsets used below: workgroup_size_x/y/z at 4/6/8,
; grid_size_x/y/z at 12/16/20.

; The metadata has only two operands, so it cannot be used and the load is not
; folded.
; CHECK-LABEL: @invalid_reqd_work_group_size(
; CHECK: load i16,
define amdgpu_kernel void @invalid_reqd_work_group_size(i16 addrspace(1)* %out) #0 !reqd_work_group_size !1 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  store i16 %group.size.x, i16 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @volatile_load_group_size_x(
; CHECK: load volatile i16,
define amdgpu_kernel void @volatile_load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load volatile i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  store i16 %group.size.x, i16 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @load_group_size_x(
; CHECK-NEXT: store i16 8,
define amdgpu_kernel void @load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  store i16 %group.size.x, i16 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @load_group_size_y(
; CHECK-NEXT: store i16 16,
define amdgpu_kernel void @load_group_size_y(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
  %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
  %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
  store i16 %group.size.y, i16 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @load_group_size_z(
; CHECK-NEXT: store i16 2,
define amdgpu_kernel void @load_group_size_z(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
  %gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
  %group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
  store i16 %group.size.z, i16 addrspace(1)* %out
  ret void
}

; Metadata uses i64 instead of i32
; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i64(
; CHECK-NEXT: store i16 8,
define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i64(i16 addrspace(1)* %out) #0 !reqd_work_group_size !2 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  store i16 %group.size.x, i16 addrspace(1)* %out
  ret void
}

; Metadata uses i16 instead of i32
; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i16(
; CHECK-NEXT: store i16 8,
define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i16(i16 addrspace(1)* %out) #0 !reqd_work_group_size !3 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  store i16 %group.size.x, i16 addrspace(1)* %out
  ret void
}

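; The use_local_size_* and local_size_* tests below implement the
; get_local_size() pattern:
;   min(grid.size - group.id * group.size, group.size)
; With a known reqd_work_group_size and "uniform-work-group-size"="true" the
; whole computation folds to the constant group size; the *wrong* variants
; check that mismatched patterns are left alone apart from folding the group
; size load itself.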
; CHECK-LABEL: @use_local_size_x_8_16_2(
; CHECK-NEXT: store i64 8,
define amdgpu_kernel void @use_local_size_x_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %cmp = icmp ult i32 %sub, %group.size.x.zext
  %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
  %zext = zext i32 %select to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @use_local_size_y_8_16_2(
; CHECK-NEXT: store i64 16,
define amdgpu_kernel void @use_local_size_y_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
  %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
  %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
  %gep.grid.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
  %gep.grid.size.y.bc = bitcast i8 addrspace(4)* %gep.grid.size.y to i32 addrspace(4)*
  %grid.size.y = load i32, i32 addrspace(4)* %gep.grid.size.y.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
  %group.size.y.zext = zext i16 %group.size.y to i32
  %group.id_x_group.size.y = mul i32 %group.id, %group.size.y.zext
  %sub = sub i32 %grid.size.y, %group.id_x_group.size.y
  %cmp = icmp ult i32 %sub, %group.size.y.zext
  %select = select i1 %cmp, i32 %sub, i32 %group.size.y.zext
  %zext = zext i32 %select to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @use_local_size_z_8_16_2(
; CHECK-NEXT: store i64 2,
define amdgpu_kernel void @use_local_size_z_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
  %gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
  %group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
  %gep.grid.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 20
  %gep.grid.size.z.bc = bitcast i8 addrspace(4)* %gep.grid.size.z to i32 addrspace(4)*
  %grid.size.z = load i32, i32 addrspace(4)* %gep.grid.size.z.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.z()
  %group.size.z.zext = zext i16 %group.size.z to i32
  %group.id_x_group.size.z = mul i32 %group.id, %group.size.z.zext
  %sub = sub i32 %grid.size.z, %group.id_x_group.size.z
  %cmp = icmp ult i32 %sub, %group.size.z.zext
  %select = select i1 %cmp, i32 %sub, i32 %group.size.z.zext
  %zext = zext i32 %select to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; Simplification on select is invalid, but we can still eliminate the
; load of the group size.

; CHECK-LABEL: @local_size_x_8_16_2_wrong_group_id(
; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %cmp = icmp ult i32 %sub, %group.size.x.zext
  %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
  %zext = zext i32 %select to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

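; The grid size is loaded from offset 16 (grid_size_y) instead of 12, so the
; local size computation is not folded; only the group size load is.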
; CHECK-LABEL: @local_size_x_8_16_2_wrong_grid_size(
; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
define amdgpu_kernel void @local_size_x_8_16_2_wrong_grid_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %cmp = icmp ult i32 %sub, %group.size.x.zext
  %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
  %zext = zext i32 %select to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @local_size_x_8_16_2_wrong_cmp_type(
; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
; CHECK: %sub = add i32 %group.id_x_group.size.x.neg, %grid.size.x
; CHECK: %cmp = icmp slt i32 %sub, 8
; CHECK: %select = select i1 %cmp, i32 %sub, i32 8
define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %cmp = icmp slt i32 %sub, %group.size.x.zext
  %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
  %zext = zext i32 %select to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @local_size_x_8_16_2_wrong_select(
; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
; CHECK: %sub = add i32 %group.id_x_group.size.x.neg, %grid.size.x
; CHECK: %1 = icmp ugt i32 %sub, 8
; CHECK: %select = select i1 %1, i32 %sub, i32 8
; CHECK: %zext = zext i32 %select to i64
define amdgpu_kernel void @local_size_x_8_16_2_wrong_select(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %cmp = icmp ult i32 %sub, %group.size.x.zext
  %select = select i1 %cmp, i32 %group.size.x.zext, i32 %sub
  %zext = zext i32 %select to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

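; The grid size is loaded as an i16 instead of an i32, so only the group size
; load is folded.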
; CHECK-LABEL: @use_local_size_x_8_16_2_wrong_grid_load_size(
; CHECK: %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
; CHECK: %grid.size.x.zext = zext i16 %grid.size.x to i32
; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
; CHECK: %sub = add i32 %group.id_x_group.size.x.neg, %grid.size.x.zext
define amdgpu_kernel void @use_local_size_x_8_16_2_wrong_grid_load_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i16 addrspace(4)*
  %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
  %grid.size.x.zext = zext i16 %grid.size.x to i32
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
  %cmp = icmp ult i32 %sub, %group.size.x.zext
  %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
  %zext = zext i32 %select to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @func_group_size_x(
; CHECK-NEXT: ret i32 8
define i32 @func_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %zext = zext i16 %group.size.x to i32
  ret i32 %zext
}

; CHECK-LABEL: @__ockl_get_local_size_reqd_size(
; CHECK: %group.size = phi i32 [ 2, %bb17 ], [ 16, %bb9 ], [ 8, %bb1 ], [ 1, %bb ]
define i64 @__ockl_get_local_size_reqd_size(i32 %arg) #1 !reqd_work_group_size !0 {
bb:
  %tmp = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
  switch i32 %arg, label %bb25 [
    i32 0, label %bb1
    i32 1, label %bb9
    i32 2, label %bb17
  ]

bb1:                                              ; preds = %bb
  %tmp2 = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %tmp3 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 12
  %tmp4 = bitcast i8 addrspace(4)* %tmp3 to i32 addrspace(4)*
  %tmp5 = load i32, i32 addrspace(4)* %tmp4, align 4
  %tmp6 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 4
  %tmp7 = bitcast i8 addrspace(4)* %tmp6 to i16 addrspace(4)*
  %tmp8 = load i16, i16 addrspace(4)* %tmp7, align 4
  br label %bb25

bb9:                                              ; preds = %bb
  %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.y()
  %tmp11 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 16
  %tmp12 = bitcast i8 addrspace(4)* %tmp11 to i32 addrspace(4)*
  %tmp13 = load i32, i32 addrspace(4)* %tmp12, align 8
  %tmp14 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 6
  %tmp15 = bitcast i8 addrspace(4)* %tmp14 to i16 addrspace(4)*
  %tmp16 = load i16, i16 addrspace(4)* %tmp15, align 2
  br label %bb25

bb17:                                             ; preds = %bb
  %tmp18 = tail call i32 @llvm.amdgcn.workgroup.id.z()
  %tmp19 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 20
  %tmp20 = bitcast i8 addrspace(4)* %tmp19 to i32 addrspace(4)*
  %tmp21 = load i32, i32 addrspace(4)* %tmp20, align 4
  %tmp22 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 8
  %tmp23 = bitcast i8 addrspace(4)* %tmp22 to i16 addrspace(4)*
  %tmp24 = load i16, i16 addrspace(4)* %tmp23, align 8
  br label %bb25

bb25:                                             ; preds = %bb17, %bb9, %bb1, %bb
  %tmp26 = phi i32 [ %tmp21, %bb17 ], [ %tmp13, %bb9 ], [ %tmp5, %bb1 ], [ 0, %bb ]
  %group.size = phi i16 [ %tmp24, %bb17 ], [ %tmp16, %bb9 ], [ %tmp8, %bb1 ], [ 1, %bb ]
  %tmp28 = phi i32 [ %tmp18, %bb17 ], [ %tmp10, %bb9 ], [ %tmp2, %bb1 ], [ 0, %bb ]
  %tmp29 = zext i16 %group.size to i32
  %tmp30 = mul i32 %tmp28, %tmp29
  %tmp31 = sub i32 %tmp26, %tmp30
  %tmp32 = icmp ult i32 %tmp31, %tmp29
  %tmp33 = select i1 %tmp32, i32 %tmp31, i32 %tmp29
  %tmp34 = zext i32 %tmp33 to i64
  ret i64 %tmp34
}

; CHECK-LABEL: @all_local_size(
; CHECK-NEXT: store volatile i64 8, i64 addrspace(1)* %out, align 4
; CHECK-NEXT: store volatile i64 16, i64 addrspace(1)* %out, align 4
; CHECK-NEXT: store volatile i64 2, i64 addrspace(1)* %out, align 4
define amdgpu_kernel void @all_local_size(i64 addrspace(1)* nocapture readnone %out) #0 !reqd_work_group_size !0 {
  %tmp.i = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
  %tmp2.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
  %tmp3.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 12
  %tmp4.i = bitcast i8 addrspace(4)* %tmp3.i to i32 addrspace(4)*
  %tmp5.i = load i32, i32 addrspace(4)* %tmp4.i, align 4
  %tmp6.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 4
  %tmp7.i = bitcast i8 addrspace(4)* %tmp6.i to i16 addrspace(4)*
  %tmp8.i = load i16, i16 addrspace(4)* %tmp7.i, align 4
  %tmp29.i = zext i16 %tmp8.i to i32
  %tmp30.i = mul i32 %tmp2.i, %tmp29.i
  %tmp31.i = sub i32 %tmp5.i, %tmp30.i
  %tmp32.i = icmp ult i32 %tmp31.i, %tmp29.i
  %tmp33.i = select i1 %tmp32.i, i32 %tmp31.i, i32 %tmp29.i
  %tmp34.i = zext i32 %tmp33.i to i64
  %tmp10.i = tail call i32 @llvm.amdgcn.workgroup.id.y() #0
  %tmp11.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 16
  %tmp12.i = bitcast i8 addrspace(4)* %tmp11.i to i32 addrspace(4)*
  %tmp13.i = load i32, i32 addrspace(4)* %tmp12.i, align 8
  %tmp14.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 6
  %tmp15.i = bitcast i8 addrspace(4)* %tmp14.i to i16 addrspace(4)*
  %tmp16.i = load i16, i16 addrspace(4)* %tmp15.i, align 2
  %tmp29.i9 = zext i16 %tmp16.i to i32
  %tmp30.i10 = mul i32 %tmp10.i, %tmp29.i9
  %tmp31.i11 = sub i32 %tmp13.i, %tmp30.i10
  %tmp32.i12 = icmp ult i32 %tmp31.i11, %tmp29.i9
  %tmp33.i13 = select i1 %tmp32.i12, i32 %tmp31.i11, i32 %tmp29.i9
  %tmp34.i14 = zext i32 %tmp33.i13 to i64
  %tmp18.i = tail call i32 @llvm.amdgcn.workgroup.id.z() #0
  %tmp19.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 20
  %tmp20.i = bitcast i8 addrspace(4)* %tmp19.i to i32 addrspace(4)*
  %tmp21.i = load i32, i32 addrspace(4)* %tmp20.i, align 4
  %tmp22.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 8
  %tmp23.i = bitcast i8 addrspace(4)* %tmp22.i to i16 addrspace(4)*
  %tmp24.i = load i16, i16 addrspace(4)* %tmp23.i, align 8
  %tmp29.i2 = zext i16 %tmp24.i to i32
  %tmp30.i3 = mul i32 %tmp18.i, %tmp29.i2
  %tmp31.i4 = sub i32 %tmp21.i, %tmp30.i3
  %tmp32.i5 = icmp ult i32 %tmp31.i4, %tmp29.i2
  %tmp33.i6 = select i1 %tmp32.i5, i32 %tmp31.i4, i32 %tmp29.i2
  %tmp34.i7 = zext i32 %tmp33.i6 to i64
  store volatile i64 %tmp34.i, i64 addrspace(1)* %out, align 4
  store volatile i64 %tmp34.i14, i64 addrspace(1)* %out, align 4
  store volatile i64 %tmp34.i7, i64 addrspace(1)* %out, align 4
  ret void
}

; TODO: Should be able to handle this, but not much reason to.
; CHECK-LABEL: @partial_load_group_size_x(
; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 4
; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
define amdgpu_kernel void @partial_load_group_size_x(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
  store i8 %group.size.x.lo, i8 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @partial_load_group_size_x_explicit_callsite_align(
; CHECK-NEXT: %dispatch.ptr = tail call align 2 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 2
; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
define amdgpu_kernel void @partial_load_group_size_x_explicit_callsite_align(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call align 2 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
  store i8 %group.size.x.lo, i8 addrspace(1)* %out
  ret void
}

; TODO: Should be able to handle this
; CHECK-LABEL: @load_group_size_xy_i32(
; CHECK: %group.size.xy = load i32,
; CHECK: store i32 %group.size.xy
define amdgpu_kernel void @load_group_size_xy_i32(i32 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i32 addrspace(4)*
  %group.size.xy = load i32, i32 addrspace(4)* %gep.group.size.x.bc, align 4
  store i32 %group.size.xy, i32 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @load_group_size_x_y_multiple_dispatch_ptr(
; CHECK-NEXT: store volatile i16 8, i16 addrspace(1)* %out, align 2
; CHECK-NEXT: store volatile i16 16, i16 addrspace(1)* %out, align 2
define amdgpu_kernel void @load_group_size_x_y_multiple_dispatch_ptr(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr0 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr0, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  store volatile i16 %group.size.x, i16 addrspace(1)* %out

  %dispatch.ptr1 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr1, i64 6
  %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
  %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
  store volatile i16 %group.size.y, i16 addrspace(1)* %out

  ret void
}

; CHECK-LABEL: @use_local_size_x_uniform_work_group_size(
; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
; CHECK-NEXT: %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
; CHECK-NEXT: %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
; CHECK-NEXT: %zext = zext i16 %group.size.x to i64
; CHECK-NEXT: store i64 %zext, i64 addrspace(1)* %out, align 4
define amdgpu_kernel void @use_local_size_x_uniform_work_group_size(i64 addrspace(1)* %out) #2 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %cmp = icmp ult i32 %sub, %group.size.x.zext
  %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
  %zext = zext i32 %select to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @use_local_size_x_uniform_work_group_size_false(
; CHECK: icmp ult
; CHECK: select
define amdgpu_kernel void @use_local_size_x_uniform_work_group_size_false(i64 addrspace(1)* %out) #3 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %cmp = icmp ult i32 %sub, %group.size.x.zext
  %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
  %zext = zext i32 %select to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @no_use_dispatch_ptr(
; CHECK-NEXT: ret void
define amdgpu_kernel void @no_use_dispatch_ptr() {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  ret void
}

declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workgroup.id.z() #1

attributes #0 = { nounwind "uniform-work-group-size"="true" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind "uniform-work-group-size"="true" }
attributes #3 = { nounwind "uniform-work-group-size"="false" }

!0 = !{i32 8, i32 16, i32 2}
!1 = !{i32 8, i32 16}
!2 = !{i64 8, i64 16, i64 2}
!3 = !{i16 8, i16 16, i16 2}