; RUN: opt -S -mtriple=amdgcn-- -mcpu=bonaire -loop-reduce < %s | FileCheck -check-prefix=OPT %s

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; Make sure the pointer / address space of AtomicRMW is considered

; OPT-LABEL: @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(

; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i32, i32 addrspace(3)* %arg1, i32 16383
; OPT: br label %.lr.ph
; OPT: .lr.ph:
; OPT: %lsr.iv3 = phi i32 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
; OPT: %tmp4 = atomicrmw add i32 addrspace(3)* %lsr.iv3, i32 undef seq_cst, align 4
; OPT: %tmp7 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 undef seq_cst, align 4
; OPT: %0 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 %tmp8 seq_cst, align 4
; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv3, i32 1
; OPT: br i1 %exitcond
define amdgpu_kernel void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
  %tmp4 = atomicrmw add i32 addrspace(3)* %tmp3, i32 undef seq_cst
  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
  %tmp7 = atomicrmw add i32 addrspace(3)* %tmp6, i32 undef seq_cst
  %tmp8 = add nsw i32 %tmp7, %tmp4
  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

; OPT-LABEL: @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(

; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i32, i32 addrspace(3)* %arg1, i32 16383
; OPT: br label %.lr.ph
; OPT: .lr.ph:
; OPT: %lsr.iv3 = phi i32 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
; OPT: %tmp4 = cmpxchg i32 addrspace(3)* %lsr.iv3, i32 undef, i32 undef seq_cst monotonic, align 4
; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv3, i32 1
define amdgpu_kernel void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
  %tmp4 = cmpxchg i32 addrspace(3)* %tmp3, i32 undef, i32 undef seq_cst monotonic
  %tmp4.0 = extractvalue { i32, i1 } %tmp4, 0
  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
  %tmp7 = cmpxchg i32 addrspace(3)* %tmp6, i32 undef, i32 undef seq_cst monotonic
  %tmp7.0 = extractvalue { i32, i1 } %tmp7, 0
  %tmp8 = add nsw i32 %tmp7.0, %tmp4.0
  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

; OPT-LABEL: @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(

; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i32, i32 addrspace(3)* %arg1, i32 16383
; OPT: br label %.lr.ph
; OPT: .lr.ph:
; OPT: %lsr.iv3 = phi i32 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %lsr.iv3, i32 undef, i32 0, i32 0, i1 false)
; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef, i32 0, i32 0, i1 false)
; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv3, i32 1
define amdgpu_kernel void @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
  %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef, i32 0, i32 0, i1 false)
  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
  %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef, i32 0, i32 0, i1 false)
  %tmp8 = add nsw i32 %tmp7, %tmp4
  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

; OPT-LABEL: @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(
; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i32, i32 addrspace(3)* %arg1, i32 16383
; OPT: br label %.lr.ph
; OPT: .lr.ph:
; OPT: %lsr.iv3 = phi i32 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %lsr.iv3, i32 undef, i32 0, i32 0, i1 false)
; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef, i32 0, i32 0, i1 false)
; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv3, i32 1
define amdgpu_kernel void @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
  %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef, i32 0, i32 0, i1 false)
  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
  %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef, i32 0, i32 0, i1 false)
  %tmp8 = add nsw i32 %tmp7, %tmp4
  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #1
declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind argmemonly }