; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -O3 -S -inline-threshold=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INL1 %s
; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -O3 -S < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INLDEF %s
; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -passes='default<O3>' -S -inline-threshold=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INL1 %s
; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -passes='default<O3>' -S < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INLDEF %s

define coldcc float @foo(float %x, float %y) {
entry:
  %cmp = fcmp ogt float %x, 0.000000e+00
  %div = fdiv float %y, %x
  %mul = fmul float %x, %y
  %cond = select i1 %cmp, float %div, float %mul
  ret float %cond
}

define coldcc void @foo_private_ptr(float addrspace(5)* nocapture %p) {
entry:
  %tmp1 = load float, float addrspace(5)* %p, align 4
  %cmp = fcmp ogt float %tmp1, 1.000000e+00
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %div = fdiv float 1.000000e+00, %tmp1
  store float %div, float addrspace(5)* %p, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ret void
}

define coldcc void @foo_private_ptr2(float addrspace(5)* nocapture %p1, float addrspace(5)* nocapture %p2) {
entry:
  %tmp1 = load float, float addrspace(5)* %p1, align 4
  %cmp = fcmp ogt float %tmp1, 1.000000e+00
  br i1 %cmp, label %if.then, label %if.end

if.then:
  %div = fdiv float 2.000000e+00, %tmp1
  store float %div, float addrspace(5)* %p2, align 4
  br label %if.end

if.end:
  ret void
}

define float @sin_wrapper(float %x) {
bb:
  %call = tail call float @_Z3sinf(float %x)
  ret float %call
}

define void @foo_noinline(float addrspace(5)* nocapture %p) #0 {
entry:
  %tmp1 = load float, float addrspace(5)* %p, align 4
  %mul = fmul float %tmp1, 2.000000e+00
  store float %mul, float addrspace(5)* %p, align 4
  ret void
}

; GCN: define amdgpu_kernel void @test_inliner(
; GCN-INL1: %c1 = tail call coldcc float @foo(
; GCN-INLDEF: %cmp.i = fcmp ogt float %tmp2, 0.000000e+00
; GCN: %div.i{{[0-9]*}} = fdiv float 1.000000e+00, %c
; GCN: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i
; GCN: call void @foo_noinline(
; GCN: tail call float @_Z3sinf(
define amdgpu_kernel void @test_inliner(float addrspace(1)* nocapture %a, i32 %n) {
entry:
  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid
  %tmp2 = load float, float addrspace(1)* %arrayidx, align 4
  %add = add i32 %tid, 1
  %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %a, i32 %add
  %tmp5 = load float, float addrspace(1)* %arrayidx2, align 4
  %c1 = tail call coldcc float @foo(float %tmp2, float %tmp5)
  %or = or i32 %tid, %n
  %arrayidx5 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %or
  store float %c1, float addrspace(5)* %arrayidx5, align 4
  %arrayidx7 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %or
  call coldcc void @foo_private_ptr(float addrspace(5)* %arrayidx7)
  %arrayidx8 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 1
  %arrayidx9 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 2
  call coldcc void @foo_private_ptr2(float addrspace(5)* %arrayidx8, float addrspace(5)* %arrayidx9)
  call void @foo_noinline(float addrspace(5)* %arrayidx7)
  %and = and i32 %tid, %n
  %arrayidx11 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %and
  %tmp12 = load float, float addrspace(5)* %arrayidx11, align 4
  %c2 = call float @sin_wrapper(float %tmp12)
  store float %c2, float addrspace(5)* %arrayidx7, align 4
  %xor = xor i32 %tid, %n
  %arrayidx16 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %xor
  %tmp16 = load float, float addrspace(5)* %arrayidx16, align 4
  store float %tmp16, float addrspace(1)* %arrayidx, align 4
  ret void
}

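; Same pattern with two private arrays, both small enough to stay under the
; inliner's private-array cutoff (contrast with the _cutoff variant below):
; the call to @foo_private_ptr2 is expected to be inlined under either
; threshold, hence the shared GCN prefix.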
; GCN: define amdgpu_kernel void @test_inliner_multi_pvt_ptr(
; GCN: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i
define amdgpu_kernel void @test_inliner_multi_pvt_ptr(float addrspace(1)* nocapture %a, i32 %n, float %v) {
entry:
  %pvt_arr1 = alloca [32 x float], align 4, addrspace(5)
  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid
  %or = or i32 %tid, %n
  %arrayidx4 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %or
  %arrayidx5 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr2, i32 0, i32 %or
  store float %v, float addrspace(5)* %arrayidx4, align 4
  store float %v, float addrspace(5)* %arrayidx5, align 4
  %arrayidx8 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 1
  %arrayidx9 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr2, i32 0, i32 2
  call coldcc void @foo_private_ptr2(float addrspace(5)* %arrayidx8, float addrspace(5)* %arrayidx9)
  %xor = xor i32 %tid, %n
  %arrayidx15 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %xor
  %arrayidx16 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr2, i32 0, i32 %xor
  %tmp15 = load float, float addrspace(5)* %arrayidx15, align 4
  %tmp16 = load float, float addrspace(5)* %arrayidx16, align 4
  %tmp17 = fadd float %tmp15, %tmp16
  store float %tmp17, float addrspace(1)* %arrayidx, align 4
  ret void
}

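; Bumping the second array to [33 x float] pushes the private-array size past
; the inliner's cutoff: at -inline-threshold=1 the call to @foo_private_ptr2
; is expected to remain, while the default threshold still inlines it.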
; GCN: define amdgpu_kernel void @test_inliner_multi_pvt_ptr_cutoff(
; GCN-INL1: call coldcc void @foo_private_ptr2
; GCN-INLDEF: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i
define amdgpu_kernel void @test_inliner_multi_pvt_ptr_cutoff(float addrspace(1)* nocapture %a, i32 %n, float %v) {
entry:
  %pvt_arr1 = alloca [32 x float], align 4, addrspace(5)
  %pvt_arr2 = alloca [33 x float], align 4, addrspace(5)
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid
  %or = or i32 %tid, %n
  %arrayidx4 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %or
  %arrayidx5 = getelementptr inbounds [33 x float], [33 x float] addrspace(5)* %pvt_arr2, i32 0, i32 %or
  store float %v, float addrspace(5)* %arrayidx4, align 4
  store float %v, float addrspace(5)* %arrayidx5, align 4
  %arrayidx8 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 1
  %arrayidx9 = getelementptr inbounds [33 x float], [33 x float] addrspace(5)* %pvt_arr2, i32 0, i32 2
  call coldcc void @foo_private_ptr2(float addrspace(5)* %arrayidx8, float addrspace(5)* %arrayidx9)
  %xor = xor i32 %tid, %n
  %arrayidx15 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %xor
  %arrayidx16 = getelementptr inbounds [33 x float], [33 x float] addrspace(5)* %pvt_arr2, i32 0, i32 %xor
  %tmp15 = load float, float addrspace(5)* %arrayidx15, align 4
  %tmp16 = load float, float addrspace(5)* %arrayidx16, align 4
  %tmp17 = fadd float %tmp15, %tmp16
  store float %tmp17, float addrspace(1)* %arrayidx, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @_Z3sinf(float) #1

attributes #0 = { noinline }
attributes #1 = { nounwind readnone }