; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-lower-kernel-attributes -instcombine %s | FileCheck -enable-var-scope %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope %s

target datalayout = "n32"

; CHECK-LABEL: @invalid_reqd_work_group_size(
; CHECK: load i16,
define amdgpu_kernel void @invalid_reqd_work_group_size(i16 addrspace(1)* %out) #0 !reqd_work_group_size !1 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  store i16 %group.size.x, i16 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @volatile_load_group_size_x(
; CHECK: load volatile i16,
define amdgpu_kernel void @volatile_load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load volatile i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  store i16 %group.size.x, i16 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @load_group_size_x(
; CHECK-NEXT: store i16 8,
define amdgpu_kernel void @load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  store i16 %group.size.x, i16 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @load_group_size_y(
; CHECK-NEXT: store i16 16,
define amdgpu_kernel void @load_group_size_y(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
  %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
  %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
  store i16 %group.size.y, i16 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @load_group_size_z(
; CHECK-NEXT: store i16 2,
define amdgpu_kernel void @load_group_size_z(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
  %gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
  %group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
  store i16 %group.size.z, i16 addrspace(1)* %out
  ret void
}

; Metadata uses i64 instead of i32
; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i64(
; CHECK-NEXT: store i16 8,
define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i64(i16 addrspace(1)* %out) #0 !reqd_work_group_size !2 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  store i16 %group.size.x, i16 addrspace(1)* %out
  ret void
}

; Metadata uses i16 instead of i32
; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i16(
; CHECK-NEXT: store i16 8,
define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i16(i16 addrspace(1)* %out) #0 !reqd_work_group_size !3 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  store i16 %group.size.x, i16 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @use_local_size_x_8_16_2(
; CHECK-NEXT: store i64 8,
define amdgpu_kernel void @use_local_size_x_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
  %zext = zext i32 %umin to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @use_local_size_y_8_16_2(
; CHECK-NEXT: store i64 16,
define amdgpu_kernel void @use_local_size_y_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
  %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
  %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
  %gep.grid.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
  %gep.grid.size.y.bc = bitcast i8 addrspace(4)* %gep.grid.size.y to i32 addrspace(4)*
  %grid.size.y = load i32, i32 addrspace(4)* %gep.grid.size.y.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
  %group.size.y.zext = zext i16 %group.size.y to i32
  %group.id_x_group.size.y = mul i32 %group.id, %group.size.y.zext
  %sub = sub i32 %grid.size.y, %group.id_x_group.size.y
  %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.y.zext)
  %zext = zext i32 %umin to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @use_local_size_z_8_16_2(
; CHECK-NEXT: store i64 2,
define amdgpu_kernel void @use_local_size_z_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
  %gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
  %group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
  %gep.grid.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 20
  %gep.grid.size.z.bc = bitcast i8 addrspace(4)* %gep.grid.size.z to i32 addrspace(4)*
  %grid.size.z = load i32, i32 addrspace(4)* %gep.grid.size.z.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.z()
  %group.size.z.zext = zext i16 %group.size.z to i32
  %group.id_x_group.size.z = mul i32 %group.id, %group.size.z.zext
  %sub = sub i32 %grid.size.z, %group.id_x_group.size.z
  %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.z.zext)
  %zext = zext i32 %umin to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; Simplification on select is invalid, but we can still eliminate the
; load of the group size.

; CHECK-LABEL: @local_size_x_8_16_2_wrong_group_id(
; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
  %zext = zext i32 %umin to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @local_size_x_8_16_2_wrong_grid_size(
; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
define amdgpu_kernel void @local_size_x_8_16_2_wrong_grid_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
  %zext = zext i32 %umin to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @local_size_x_8_16_2_wrong_cmp_type(
; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
; CHECK: %sub = add i32 %group.id_x_group.size.x.neg, %grid.size.x
; CHECK: %smin = call i32 @llvm.smin.i32(i32 %sub, i32 8)
define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %smin = call i32 @llvm.smin.i32(i32 %sub, i32 %group.size.x.zext)
  %zext = zext i32 %smin to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @local_size_x_8_16_2_wrong_select(
; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
; CHECK: %sub = add i32 %group.id_x_group.size.x.neg, %grid.size.x
; CHECK: %umax = call i32 @llvm.umax.i32(i32 %sub, i32 8)
; CHECK: %zext = zext i32 %umax to i64
define amdgpu_kernel void @local_size_x_8_16_2_wrong_select(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %umax = call i32 @llvm.umax.i32(i32 %sub, i32 %group.size.x.zext)
  %zext = zext i32 %umax to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @use_local_size_x_8_16_2_wrong_grid_load_size(
; CHECK: %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
; CHECK: %grid.size.x.zext = zext i16 %grid.size.x to i32
; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
; CHECK: %sub = add i32 %group.id_x_group.size.x.neg, %grid.size.x.zext
define amdgpu_kernel void @use_local_size_x_8_16_2_wrong_grid_load_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i16 addrspace(4)*
  %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
  %grid.size.x.zext = zext i16 %grid.size.x to i32
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
  %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
  %zext = zext i32 %umin to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @func_group_size_x(
; CHECK-NEXT: ret i32 8
define i32 @func_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %zext = zext i16 %group.size.x to i32
  ret i32 %zext
}

; CHECK-LABEL: @__ockl_get_local_size_reqd_size(
; CHECK: %group.size = phi i32 [ 2, %bb17 ], [ 16, %bb9 ], [ 8, %bb1 ], [ 1, %bb ]
define i64 @__ockl_get_local_size_reqd_size(i32 %arg) #1 !reqd_work_group_size !0 {
bb:
  %tmp = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
  switch i32 %arg, label %bb25 [
    i32 0, label %bb1
    i32 1, label %bb9
    i32 2, label %bb17
  ]

bb1:                                              ; preds = %bb
  %tmp2 = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %tmp3 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 12
  %tmp4 = bitcast i8 addrspace(4)* %tmp3 to i32 addrspace(4)*
  %tmp5 = load i32, i32 addrspace(4)* %tmp4, align 4
  %tmp6 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 4
  %tmp7 = bitcast i8 addrspace(4)* %tmp6 to i16 addrspace(4)*
  %tmp8 = load i16, i16 addrspace(4)* %tmp7, align 4
  br label %bb25

bb9:                                              ; preds = %bb
  %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.y()
  %tmp11 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 16
  %tmp12 = bitcast i8 addrspace(4)* %tmp11 to i32 addrspace(4)*
  %tmp13 = load i32, i32 addrspace(4)* %tmp12, align 8
  %tmp14 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 6
  %tmp15 = bitcast i8 addrspace(4)* %tmp14 to i16 addrspace(4)*
  %tmp16 = load i16, i16 addrspace(4)* %tmp15, align 2
  br label %bb25

bb17:                                             ; preds = %bb
  %tmp18 = tail call i32 @llvm.amdgcn.workgroup.id.z()
  %tmp19 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 20
  %tmp20 = bitcast i8 addrspace(4)* %tmp19 to i32 addrspace(4)*
  %tmp21 = load i32, i32 addrspace(4)* %tmp20, align 4
  %tmp22 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 8
  %tmp23 = bitcast i8 addrspace(4)* %tmp22 to i16 addrspace(4)*
  %tmp24 = load i16, i16 addrspace(4)* %tmp23, align 8
  br label %bb25

bb25:                                             ; preds = %bb17, %bb9, %bb1, %bb
  %tmp26 = phi i32 [ %tmp21, %bb17 ], [ %tmp13, %bb9 ], [ %tmp5, %bb1 ], [ 0, %bb ]
  %group.size = phi i16 [ %tmp24, %bb17 ], [ %tmp16, %bb9 ], [ %tmp8, %bb1 ], [ 1, %bb ]
  %tmp28 = phi i32 [ %tmp18, %bb17 ], [ %tmp10, %bb9 ], [ %tmp2, %bb1 ], [ 0, %bb ]
  %tmp29 = zext i16 %group.size to i32
  %tmp30 = mul i32 %tmp28, %tmp29
  %tmp31 = sub i32 %tmp26, %tmp30
  %umin = call i32 @llvm.umin.i32(i32 %tmp31, i32 %tmp29)
  %tmp34 = zext i32 %umin to i64
  ret i64 %tmp34
}

; CHECK-LABEL: @all_local_size(
; CHECK-NEXT: store volatile i64 8, i64 addrspace(1)* %out, align 4
; CHECK-NEXT: store volatile i64 16, i64 addrspace(1)* %out, align 4
; CHECK-NEXT: store volatile i64 2, i64 addrspace(1)* %out, align 4
define amdgpu_kernel void @all_local_size(i64 addrspace(1)* nocapture readnone %out) #0 !reqd_work_group_size !0 {
  %tmp.i = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
  %tmp2.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
  %tmp3.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 12
  %tmp4.i = bitcast i8 addrspace(4)* %tmp3.i to i32 addrspace(4)*
  %tmp5.i = load i32, i32 addrspace(4)* %tmp4.i, align 4
  %tmp6.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 4
  %tmp7.i = bitcast i8 addrspace(4)* %tmp6.i to i16 addrspace(4)*
  %tmp8.i = load i16, i16 addrspace(4)* %tmp7.i, align 4
  %tmp29.i = zext i16 %tmp8.i to i32
  %tmp30.i = mul i32 %tmp2.i, %tmp29.i
  %tmp31.i = sub i32 %tmp5.i, %tmp30.i
  %umin0 = call i32 @llvm.umin.i32(i32 %tmp31.i, i32 %tmp29.i)
  %tmp34.i = zext i32 %umin0 to i64
  %tmp10.i = tail call i32 @llvm.amdgcn.workgroup.id.y() #0
  %tmp11.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 16
  %tmp12.i = bitcast i8 addrspace(4)* %tmp11.i to i32 addrspace(4)*
  %tmp13.i = load i32, i32 addrspace(4)* %tmp12.i, align 8
  %tmp14.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 6
  %tmp15.i = bitcast i8 addrspace(4)* %tmp14.i to i16 addrspace(4)*
  %tmp16.i = load i16, i16 addrspace(4)* %tmp15.i, align 2
  %tmp29.i9 = zext i16 %tmp16.i to i32
  %tmp30.i10 = mul i32 %tmp10.i, %tmp29.i9
  %tmp31.i11 = sub i32 %tmp13.i, %tmp30.i10
  %umin1 = call i32 @llvm.umin.i32(i32 %tmp31.i11, i32 %tmp29.i9)
  %tmp34.i14 = zext i32 %umin1 to i64
  %tmp18.i = tail call i32 @llvm.amdgcn.workgroup.id.z() #0
  %tmp19.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 20
  %tmp20.i = bitcast i8 addrspace(4)* %tmp19.i to i32 addrspace(4)*
  %tmp21.i = load i32, i32 addrspace(4)* %tmp20.i, align 4
  %tmp22.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 8
  %tmp23.i = bitcast i8 addrspace(4)* %tmp22.i to i16 addrspace(4)*
  %tmp24.i = load i16, i16 addrspace(4)* %tmp23.i, align 8
  %tmp29.i2 = zext i16 %tmp24.i to i32
  %tmp30.i3 = mul i32 %tmp18.i, %tmp29.i2
  %tmp31.i4 = sub i32 %tmp21.i, %tmp30.i3
  %umin2 = call i32 @llvm.umin.i32(i32 %tmp31.i4, i32 %tmp29.i2)
  %tmp34.i7 = zext i32 %umin2 to i64
  store volatile i64 %tmp34.i, i64 addrspace(1)* %out, align 4
  store volatile i64 %tmp34.i14, i64 addrspace(1)* %out, align 4
  store volatile i64 %tmp34.i7, i64 addrspace(1)* %out, align 4
  ret void
}

; TODO: Should be able to handle this, but not much reason to.
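; The group size x component is a 16-bit field at byte offset 4 from the
; dispatch ptr, so the i8 load below only covers its low byte and is not
; folded to a constant.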
; CHECK-LABEL: @partial_load_group_size_x(
; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 4
; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
define amdgpu_kernel void @partial_load_group_size_x(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
  store i8 %group.size.x.lo, i8 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @partial_load_group_size_x_explicit_callsite_align(
; CHECK-NEXT: %dispatch.ptr = tail call align 2 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 2
; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
define amdgpu_kernel void @partial_load_group_size_x_explicit_callsite_align(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call align 2 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
  store i8 %group.size.x.lo, i8 addrspace(1)* %out
  ret void
}

; TODO: Should be able to handle this
; CHECK-LABEL: @load_group_size_xy_i32(
; CHECK: %group.size.xy = load i32,
; CHECK: store i32 %group.size.xy
define amdgpu_kernel void @load_group_size_xy_i32(i32 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i32 addrspace(4)*
  %group.size.xy = load i32, i32 addrspace(4)* %gep.group.size.x.bc, align 4
  store i32 %group.size.xy, i32 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @load_group_size_x_y_multiple_dispatch_ptr(
; CHECK-NEXT: store volatile i16 8, i16 addrspace(1)* %out, align 2
; CHECK-NEXT: store volatile i16 16, i16 addrspace(1)* %out, align 2
define amdgpu_kernel void @load_group_size_x_y_multiple_dispatch_ptr(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  %dispatch.ptr0 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr0, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  store volatile i16 %group.size.x, i16 addrspace(1)* %out

  %dispatch.ptr1 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr1, i64 6
  %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
  %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
  store volatile i16 %group.size.y, i16 addrspace(1)* %out

  ret void
}

; CHECK-LABEL: @use_local_size_x_uniform_work_group_size(
; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
; CHECK-NEXT: %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
; CHECK-NEXT: %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
; CHECK-NEXT: %zext = zext i16 %group.size.x to i64
; CHECK-NEXT: store i64 %zext, i64 addrspace(1)* %out, align 4
define amdgpu_kernel void @use_local_size_x_uniform_work_group_size(i64 addrspace(1)* %out) #2 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
  %zext = zext i32 %umin to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @use_local_size_x_uniform_work_group_size_false(
; CHECK: call i32 @llvm.umin
define amdgpu_kernel void @use_local_size_x_uniform_work_group_size_false(i64 addrspace(1)* %out) #3 {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %group.size.x.zext = zext i16 %group.size.x to i32
  %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
  %zext = zext i32 %umin to i64
  store i64 %zext, i64 addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @no_use_dispatch_ptr(
; CHECK-NEXT: ret void
define amdgpu_kernel void @no_use_dispatch_ptr() {
  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  ret void
}

declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workgroup.id.z() #1
declare i32 @llvm.umin.i32(i32, i32) #1
declare i32 @llvm.smin.i32(i32, i32) #1
declare i32 @llvm.umax.i32(i32, i32) #1

attributes #0 = { nounwind "uniform-work-group-size"="true" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind "uniform-work-group-size"="true" }
attributes #3 = { nounwind "uniform-work-group-size"="false" }

!0 = !{i32 8, i32 16, i32 2}
!1 = !{i32 8, i32 16}
!2 = !{i64 8, i64 16, i64 2}
!3 = !{i16 8, i16 16, i16 2}