; RUN: llc -march=r600 -mcpu=redwood -disable-promote-alloca-to-vector < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600-VECT -check-prefix=FUNC
; RUN: opt -S -mtriple=r600-unknown-unknown -mcpu=redwood -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=OPT %s
target datalayout = "A5"

declare i32 @llvm.r600.read.tidig.x() nounwind readnone

; FUNC-LABEL: {{^}}mova_same_clause:

; R600: LDS_WRITE
; R600: LDS_WRITE
; R600: LDS_READ
; R600: LDS_READ

; OPT: call i32 @llvm.r600.read.local.size.y(), !range !0
; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0
; OPT: call i32 @llvm.r600.read.tidig.x(), !range !1
; OPT: call i32 @llvm.r600.read.tidig.y(), !range !1
; OPT: call i32 @llvm.r600.read.tidig.z(), !range !1

define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
  %3 = load i32, i32 addrspace(5)* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; This test checks that the stack offset is calculated correctly for structs.
; All register loads/stores should be optimized away, so there shouldn't be
; any MOVA instructions.
;
; XXX: This generated code has unnecessary MOVs; we should be able to
; optimize this.
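; (Only the absence of MOVA_INT is actually verified below; the leftover MOVs
; are a code-quality issue rather than a correctness problem.)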

; FUNC-LABEL: {{^}}multiple_structs:
; R600-NOT: MOVA_INT
%struct.point = type { i32, i32 }

define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
entry:
  %a = alloca %struct.point, addrspace(5)
  %b = alloca %struct.point, addrspace(5)
  %a.x.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0
  %a.y.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 1
  %b.x.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0
  %b.y.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %a.x.ptr
  store i32 1, i32 addrspace(5)* %a.y.ptr
  store i32 2, i32 addrspace(5)* %b.x.ptr
  store i32 3, i32 addrspace(5)* %b.y.ptr
  %a.indirect.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0
  %b.indirect.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0
  %a.indirect = load i32, i32 addrspace(5)* %a.indirect.ptr
  %b.indirect = load i32, i32 addrspace(5)* %b.indirect.ptr
  %0 = add i32 %a.indirect, %b.indirect
  store i32 %0, i32 addrspace(1)* %out
  ret void
}

; Test direct access of a private array inside a loop. The private array
; loads and stores should be lowered to copies, so there shouldn't be any
; MOVA instructions.

; FUNC-LABEL: {{^}}direct_loop:
; R600-NOT: MOVA_INT

define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
entry:
  %prv_array_const = alloca [2 x i32], addrspace(5)
  %prv_array = alloca [2 x i32], addrspace(5)
  %a = load i32, i32 addrspace(1)* %in
  %b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %b = load i32, i32 addrspace(1)* %b_src_ptr
  %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0
  store i32 %a, i32 addrspace(5)* %a_dst_ptr
  %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 1
  store i32 %b, i32 addrspace(5)* %b_dst_ptr
  br label %for.body

for.body:
  %inc = phi i32 [0, %entry], [%count, %for.body]
  %x_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0
  %x = load i32, i32 addrspace(5)* %x_ptr
  %y_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0
  %y = load i32, i32 addrspace(5)* %y_ptr
  %xy = add i32 %x, %y
  store i32 %xy, i32 addrspace(5)* %y_ptr
  %count = add i32 %inc, 1
  %done = icmp eq i32 %count, 4095
  br i1 %done, label %for.end, label %for.body

for.end:
  %value_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0
  %value = load i32, i32 addrspace(5)* %value_ptr
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; Dynamically index a private array of i16.

; FUNC-LABEL: {{^}}short_array:

; R600-VECT: MOVA_INT
define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
  %0 = alloca [2 x i16], addrspace(5)
  %1 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 0
  %2 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 1
  store i16 0, i16 addrspace(5)* %1
  store i16 1, i16 addrspace(5)* %2
  %3 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 %index
  %4 = load i16, i16 addrspace(5)* %3
  %5 = sext i16 %4 to i32
  store i32 %5, i32 addrspace(1)* %out
  ret void
}

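; Same as short_array, but with i8 elements.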
; FUNC-LABEL: {{^}}char_array:

; R600-VECT: MOVA_INT
define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
  %0 = alloca [2 x i8], addrspace(5)
  %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 0
  %2 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 1
  store i8 0, i8 addrspace(5)* %1
  store i8 1, i8 addrspace(5)* %2
  %3 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 %index
  %4 = load i8, i8 addrspace(5)* %3
  %5 = sext i8 %4 to i32
  store i32 %5, i32 addrspace(1)* %out
  ret void
}

; Make sure we don't overwrite work-item information with private memory.

; FUNC-LABEL: {{^}}work_item_info:
; R600-NOT: MOV T0.X
; Additional check in case the move ends up in the last slot.
; R600-NOT: MOV * T0.X
define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
  %0 = alloca [2 x i32], addrspace(5)
  %1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 0
  %2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %1
  store i32 1, i32 addrspace(5)* %2
  %3 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 %in
  %4 = load i32, i32 addrspace(5)* %3
  %5 = call i32 @llvm.r600.read.tidig.x()
  %6 = add i32 %4, %5
  store i32 %6, i32 addrspace(1)* %out
  ret void
}

; Test that two stack objects are not stored in the same register.
; The second stack object should be in T3.X.
; FUNC-LABEL: {{^}}no_overlap:
define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
  %0 = alloca [3 x i8], align 1, addrspace(5)
  %1 = alloca [2 x i8], align 1, addrspace(5)
  %2 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 0
  %3 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 1
  %4 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 2
  %5 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 0
  %6 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 1
  store i8 0, i8 addrspace(5)* %2
  store i8 1, i8 addrspace(5)* %3
  store i8 2, i8 addrspace(5)* %4
  store i8 1, i8 addrspace(5)* %5
  store i8 0, i8 addrspace(5)* %6
  %7 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 %in
  %8 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 %in
  %9 = load i8, i8 addrspace(5)* %7
  %10 = load i8, i8 addrspace(5)* %8
  %11 = add i8 %9, %10
  %12 = sext i8 %11 to i32
  store i32 %12, i32 addrspace(1)* %out
  ret void
}

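; The nested-array and struct-array kernels below carry no FileCheck lines;
; they are compile-only coverage for dynamic indexing into private aggregates.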
define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x [2 x i8]], addrspace(5)
  %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
  %gep1 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
  store i8 0, i8 addrspace(5)* %gep0
  store i8 1, i8 addrspace(5)* %gep1
  %gep2 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
  %load = load i8, i8 addrspace(5)* %gep2
  %sext = sext i8 %load to i32
  store i32 %sext, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x [2 x i32]], addrspace(5)
  %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
  %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
  %load = load i32, i32 addrspace(5)* %gep2
  store i32 %load, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x [2 x i64]], addrspace(5)
  %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
  %gep1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
  store i64 0, i64 addrspace(5)* %gep0
  store i64 1, i64 addrspace(5)* %gep1
  %gep2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
  %load = load i64, i64 addrspace(5)* %gep2
  store i64 %load, i64 addrspace(1)* %out
  ret void
}

%struct.pair32 = type { i32, i32 }

define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5)
  %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0, i32 1
  %gep1 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index, i32 0
  %load = load i32, i32 addrspace(5)* %gep2
  store i32 %load, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x %struct.pair32], addrspace(5)
  %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 0, i32 1
  %gep1 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 1, i32 0
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 %index, i32 0
  %load = load i32, i32 addrspace(5)* %gep2
  store i32 %load, i32 addrspace(1)* %out
  ret void
}

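; Select at run time between two elements of the same private alloca and load
; through the chosen private pointer.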
define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
  %tmp = alloca [2 x i32], addrspace(5)
  %tmp1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %tmp2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %tmp1
  store i32 1, i32 addrspace(5)* %tmp2
  %cmp = icmp eq i32 %in, 0
  %sel = select i1 %cmp, i32 addrspace(5)* %tmp1, i32 addrspace(5)* %tmp2
  %load = load i32, i32 addrspace(5)* %sel
  store i32 %load, i32 addrspace(1)* %out
  ret void
}

; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it finds one,
; it should stop trying to promote the alloca.

; FUNC-LABEL: {{^}}ptrtoint:
; SI-NOT: ds_write
; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %alloca = alloca [16 x i32], addrspace(5)
  %tmp0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
  store i32 5, i32 addrspace(5)* %tmp0
  %tmp1 = ptrtoint [16 x i32] addrspace(5)* %alloca to i32
  %tmp2 = add i32 %tmp1, 5
  %tmp3 = inttoptr i32 %tmp2 to i32 addrspace(5)*
  %tmp4 = getelementptr inbounds i32, i32 addrspace(5)* %tmp3, i32 %b
  %tmp5 = load i32, i32 addrspace(5)* %tmp4
  store i32 %tmp5, i32 addrspace(1)* %out
  ret void
}

; OPT: !0 = !{i32 0, i32 257}
; OPT: !1 = !{i32 0, i32 256}

attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }
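
; The two !range bounds checked above follow from the
; "amdgpu-flat-work-group-size"="1,256" attribute: the local-size intrinsics
; may return values up to 256 (hence the exclusive upper bound of 257), while
; the work-item id intrinsics stay strictly below 256.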