; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s

; Checks the MemoryBound and WaveLimiterHint values that llc prints for four
; kernels with different global-memory access patterns. Every expectation
; below must carry the check prefix named on the run line above; a line
; without it is silently treated as an ordinary comment by FileCheck.

; Two <4 x i32> loads from %arg, each stored straight to %arg1, at indices
; derived from the workitem id (id and id + 1). There is no arithmetic on the
; loaded data. Expected: memory bound, wave limiter hint set.
; GCN-LABEL: {{^}}test_membound:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_membound(<4 x i32> addrspace(1)* nocapture readonly %arg, <4 x i32> addrspace(1)* nocapture %arg1) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = zext i32 %tmp to i64
  ; First vector copy: %arg[id] -> %arg1[id].
  %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp2
  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 16
  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp2
  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp5, align 16
  ; Second vector copy: %arg[id + 1] -> %arg1[id + 1].
  %tmp6 = add nuw nsw i64 %tmp2, 1
  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp6
  %tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
  %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
  store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
  ret void
}

; Three i32 loads from %arg at constant element offsets 4096, 8192 and 12288.
; Each loaded value is squared and stored to element offsets 1, 2 and 3.
; Expected: not memory bound, wave limiter hint set.
; GCN-LABEL: {{^}}test_large_stride:
; GCN: MemoryBound: 0
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_large_stride(i32 addrspace(1)* nocapture %arg) {
bb:
  ; %arg[1] = %arg[4096] squared.
  %tmp = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4096
  %tmp1 = load i32, i32 addrspace(1)* %tmp, align 4
  %mul1 = mul i32 %tmp1, %tmp1
  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
  store i32 %mul1, i32 addrspace(1)* %tmp2, align 4
  ; %arg[2] = %arg[8192] squared.
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8192
  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
  %mul4 = mul i32 %tmp4, %tmp4
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
  store i32 %mul4, i32 addrspace(1)* %tmp5, align 4
  ; %arg[3] = %arg[12288] squared.
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 12288
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %mul7 = mul i32 %tmp7, %tmp7
  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
  store i32 %mul7, i32 addrspace(1)* %tmp8, align 4
  ret void
}

; One <4 x i32> load from %arg supplies four indices. Each index is
; sign-extended and used to address a further i32 load from %arg, so the
; address of those loads depends on previously loaded data. The results are
; stored to %arg[0..3].
; Expected: not memory bound, wave limiter hint set.
; GCN-LABEL: {{^}}test_indirect:
; GCN: MemoryBound: 0
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_indirect(i32 addrspace(1)* nocapture %arg) {
bb:
  ; Destination pointers %arg[1], %arg[2], %arg[3].
  %tmp = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
  %tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
  ; Load the four indices as a single vector.
  %tmp3 = bitcast i32 addrspace(1)* %arg to <4 x i32> addrspace(1)*
  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
  ; %arg[0] = %arg[index 0].
  %tmp5 = extractelement <4 x i32> %tmp4, i32 0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
  %tmp8 = load i32, i32 addrspace(1)* %tmp7, align 4
  store i32 %tmp8, i32 addrspace(1)* %arg, align 4
  ; %arg[1] = %arg[index 1].
  %tmp9 = extractelement <4 x i32> %tmp4, i32 1
  %tmp10 = sext i32 %tmp9 to i64
  %tmp11 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp10
  %tmp12 = load i32, i32 addrspace(1)* %tmp11, align 4
  store i32 %tmp12, i32 addrspace(1)* %tmp, align 4
  ; %arg[2] = %arg[index 2].
  %tmp13 = extractelement <4 x i32> %tmp4, i32 2
  %tmp14 = sext i32 %tmp13 to i64
  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp14
  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
  store i32 %tmp16, i32 addrspace(1)* %tmp1, align 4
  ; %arg[3] = %arg[index 3].
  %tmp17 = extractelement <4 x i32> %tmp4, i32 3
  %tmp18 = sext i32 %tmp17 to i64
  %tmp19 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp18
  %tmp20 = load i32, i32 addrspace(1)* %tmp19, align 4
  store i32 %tmp20, i32 addrspace(1)* %tmp2, align 4
  ret void
}

; A 1024-iteration loop whose store address is derived from %phi. %phi is
; seeded with a value loaded from %arg and updated on each iteration, so the
; dependence of the address on loaded data passes through a phi node.
; Expected: not memory bound, wave limiter hint clear.
; NOTE(review): these two expectations previously lacked the check prefix and
; were therefore never verified; the values are kept as originally written.
; Confirm them against actual llc output.
; GCN-LABEL: {{^}}test_indirect_through_phi:
; GCN: MemoryBound: 0
; GCN: WaveLimiterHint : 0
define amdgpu_kernel void @test_indirect_through_phi(float addrspace(1)* %arg) {
bb:
  %load = load float, float addrspace(1)* %arg, align 8
  %load.f = bitcast float %load to i32
  %n = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  ; %phi carries the loaded-value-derived index; %ind is the trip counter.
  %phi = phi i32 [ %load.f, %bb ], [ %and2, %bb1 ]
  %ind = phi i32 [ 0, %bb ], [ %inc2, %bb1 ]
  %and1 = and i32 %phi, %n
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %and1
  store float %load, float addrspace(1)* %gep, align 4
  %inc1 = add nsw i32 %phi, 1310720
  %and2 = and i32 %inc1, %n
  %inc2 = add nuw nsw i32 %ind, 1
  %cmp = icmp eq i32 %inc2, 1024
  br i1 %cmp, label %bb2, label %bb1

bb2:                                              ; preds = %bb1
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()