; RUN: opt -S -mtriple=amdgcn-- -mcpu=bonaire -loop-reduce < %s | FileCheck -check-prefix=OPT %s

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; Make sure the pointer / address space of atomic instructions (atomicrmw,
; cmpxchg, and the amdgcn atomic intrinsics) is considered when
; LoopStrengthReduce forms addressing modes.
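; On GCN, DS instructions encode an unsigned 16-bit byte offset, so an index
; of 16383 i32 elements (16383 * 4 = 65532 bytes) is the largest uniform
; offset that can still be folded into the addressing mode, hence the
; "max_offset" in the test names.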

; OPT-LABEL: @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(

; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i32, i32 addrspace(3)* %arg1, i32 16383
; OPT: br label %.lr.ph
; OPT: .lr.ph:
; OPT: %lsr.iv3 = phi i32 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
; OPT: %tmp4 = atomicrmw add i32 addrspace(3)* %lsr.iv3, i32 undef seq_cst, align 4
; OPT: %tmp7 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 undef seq_cst, align 4
; OPT: %0 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 %tmp8 seq_cst, align 4
; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv3, i32 1
; OPT: br i1 %exitcond
define amdgpu_kernel void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
  %tmp4 = atomicrmw add i32 addrspace(3)* %tmp3, i32 undef seq_cst
  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
  %tmp7 = atomicrmw add i32 addrspace(3)* %tmp6, i32 undef seq_cst
  %tmp8 = add nsw i32 %tmp7, %tmp4
  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

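; Same pattern as the atomicrmw test above, but using cmpxchg: the address
; space of the cmpxchg pointer operand must also be considered by LSR.
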
; OPT-LABEL: @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(

; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i32, i32 addrspace(3)* %arg1, i32 16383
; OPT: br label %.lr.ph
; OPT: .lr.ph:
; OPT: %lsr.iv3 = phi i32 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
; OPT: %tmp4 = cmpxchg i32 addrspace(3)* %lsr.iv3, i32 undef, i32 undef seq_cst monotonic, align 4
; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv3, i32 1
define amdgpu_kernel void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
  %tmp4 = cmpxchg i32 addrspace(3)* %tmp3, i32 undef, i32 undef seq_cst monotonic
  %tmp4.0 = extractvalue { i32, i1 } %tmp4, 0
  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
  %tmp7 = cmpxchg i32 addrspace(3)* %tmp6, i32 undef, i32 undef seq_cst monotonic
  %tmp7.0 = extractvalue { i32, i1 } %tmp7, 0
  %tmp8 = add nsw i32 %tmp7.0, %tmp4.0
  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

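; Same pattern again, using the target's atomic increment intrinsic
; (llvm.amdgcn.atomic.inc): the intrinsic's local-address-space pointer
; argument must be handled like the atomicrmw pointer operand.
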
; OPT-LABEL: @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(

; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i32, i32 addrspace(3)* %arg1, i32 16383
; OPT: br label %.lr.ph
; OPT: .lr.ph:
; OPT: %lsr.iv3 = phi i32 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %lsr.iv3, i32 undef, i32 0, i32 0, i1 false)
; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef, i32 0, i32 0, i1 false)
; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv3, i32 1
define amdgpu_kernel void @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
  %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef, i32 0, i32 0, i1 false)
  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
  %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef, i32 0, i32 0, i1 false)
  %tmp8 = add nsw i32 %tmp7, %tmp4
  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

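; Same pattern, using the atomic decrement intrinsic (llvm.amdgcn.atomic.dec).
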
; OPT-LABEL: @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(
; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i32, i32 addrspace(3)* %arg1, i32 16383
; OPT: br label %.lr.ph
; OPT: .lr.ph:
; OPT: %lsr.iv3 = phi i32 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %lsr.iv3, i32 undef, i32 0, i32 0, i1 false)
; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef, i32 0, i32 0, i1 false)
; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv3, i32 1
define amdgpu_kernel void @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
  %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef, i32 0, i32 0, i1 false)
  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
  %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef, i32 0, i32 0, i1 false)
  %tmp8 = add nsw i32 %tmp7, %tmp4
  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

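; Declarations for the amdgcn atomic intrinsics used in the tests above.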
declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #1
declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind argmemonly }