1 // Test target codegen - host bc file has to be created first.
2 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
3 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
4 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
5 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
6 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
7 // expected-no-diagnostics
8 #ifndef HEADER
9 #define HEADER
10 
11 // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
12 // CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = weak addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
13 
14 // Check that the execution mode of all 3 target regions is set to SPMD mode (exec_mode constant 0).
15 // CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
16 // CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
17 // CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
18 
template<typename tx>  // Instantiated only with tx = char (see bar() below).
tx ftemplate(int n) {  // Codegen-only test body: values are never executed, only compiled for NVPTX.
  int a;     // reduced with '|'  in the third target region
  short b;   // reduced with max  in the third target region
  tx c;      // reduced with '^'  in the second target region
  float d;   // reduced with '*'  in the second target region
  double e;  // reduced with '+'  in the first target region

  #pragma omp target parallel reduction(+: e)  // NOTE: this directive's line number (27) is encoded in the checked kernel name (...l27); do not insert lines above it.
  {
    e += 5;
  }

  #pragma omp target parallel reduction(^: c) reduction(*: d)  // line 32 -> ...l32 in the CHECK patterns
  {
    c ^= 2;
    d *= 33;
  }

  #pragma omp target parallel reduction(|: a) reduction(max: b)  // line 38 -> ...l38 in the CHECK patterns
  {
    a |= 1;
    b = 99 > b ? 99 : b;  // open-coded max(99, b), matching the checked select/phi IR
  }

  return a+b+c+d+e;  // keeps all reduction variables live so none are optimized away
}
46 
47 int bar(int n){
48   int a = 0;
49 
50   a += ftemplate<char>(n);
51 
52   return a;
53 }
54 
55 // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
56 //
57 // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
58 // CHECK: br label {{%?}}[[EXECUTE:.+]]
59 //
60 // CHECK: [[EXECUTE]]
61 // CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
62 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
63 //
64 //
65 // define internal void [[PFN]](
66 // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
67 // CHECK: [[EV:%.+]] = load double, double* [[E]], align
68 // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
69 // CHECK: store double [[ADD]], double* [[E]], align
70 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
71 // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
72 // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
73 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
74 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
75 // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
76 // CHECK: br i1 [[CMP]], label
77 
78 // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
79 // CHECK: [[EV:%.+]] = load double, double* [[E]], align
80 // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
81 // CHECK: store double [[ADD]], double* [[E_IN]], align
82 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
83 // CHECK: br label
84 //
85 // CHECK: ret
86 
87 //
88 // Reduction function
89 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
90 // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
91 // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
92 // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
93 //
94 // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
95 // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
96 // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
97 //
98 // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
99 // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
100 // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
101 // CHECK: store double [[RES]], double* [[VAR_LHS]],
102 // CHECK: ret void
103 
104 //
105 // Shuffle and reduce function
106 // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
107 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
108 // CHECK: [[REMOTE_ELT:%.+]] = alloca double
109 //
110 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
111 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
112 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
113 //
114 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
115 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
116 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
117 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
118 //
119 // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
120 // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
121 // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
122 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
123 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
124 // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
125 //
126 // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
127 // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
128 // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
129 //
130 // Condition to reduce
131 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
132 //
133 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
134 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
135 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
136 //
137 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
138 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
139 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
140 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
141 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
142 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
143 //
144 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
145 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
146 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
147 //
148 // CHECK: [[DO_REDUCE]]
149 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
150 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
151 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
152 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
153 //
154 // CHECK: [[REDUCE_ELSE]]
155 // CHECK: br label {{%?}}[[REDUCE_CONT]]
156 //
157 // CHECK: [[REDUCE_CONT]]
158 // Now check if we should just copy over the remote reduction list
159 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
160 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
161 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
162 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
163 //
164 // CHECK: [[DO_COPY]]
165 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
166 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
167 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
168 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
169 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
170 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
171 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
172 // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
173 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
174 //
175 // CHECK: [[COPY_ELSE]]
176 // CHECK: br label {{%?}}[[COPY_CONT]]
177 //
178 // CHECK: [[COPY_CONT]]
179 // CHECK: void
180 
181 //
182 // Inter warp copy function
183 // CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
184 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
185 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
186 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
187 // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
188 // CHECK: br label
189 // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
190 // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
191 // CHECK: br i1 [[DONE_COPY]], label
192 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
193 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
194 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
195 //
196 // [[DO_COPY]]
197 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
198 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
199 // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
200 // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
201 //
202 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
203 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
204 // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
205 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
206 //
207 // CHECK: [[COPY_ELSE]]
208 // CHECK: br label {{%?}}[[COPY_CONT]]
209 //
210 // Barrier after copy to shared memory storage medium.
211 // CHECK: [[COPY_CONT]]
212 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
213 // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
214 //
215 // Read into warp 0.
216 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
217 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
218 //
219 // CHECK: [[DO_READ]]
220 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
221 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
222 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
223 // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
224 // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
225 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
226 // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
227 // CHECK: br label {{%?}}[[READ_CONT:.+]]
228 //
229 // CHECK: [[READ_ELSE]]
230 // CHECK: br label {{%?}}[[READ_CONT]]
231 //
232 // CHECK: [[READ_CONT]]
233 // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
234 // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
235 // CHECK: br label
236 // CHECK: ret
237 
238 // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
239 //
240 // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
241 // CHECK: br label {{%?}}[[EXECUTE:.+]]
242 //
243 // CHECK: [[EXECUTE]]
244 // CHECK: {{call|invoke}} void [[PFN1:@.+]](i32*
245 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
246 //
247 //
248 // define internal void [[PFN1]](
249 // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
250 // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
251 // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
252 // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
253 // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
254 // CHECK: store i8 [[TRUNC]], i8* [[C]], align
255 // CHECK: [[DV:%.+]] = load float, float* [[D]], align
256 // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
257 // CHECK: store float [[MUL]], float* [[D]], align
258 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
259 // CHECK: store i8* [[C]], i8** [[PTR1]], align
260 // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
261 // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
262 // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
263 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
264 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
265 // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
266 // CHECK: br i1 [[CMP]], label
267 // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
268 // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
269 // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
270 // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
271 // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
272 // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
273 // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
274 // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
275 // CHECK: [[DV:%.+]] = load float, float* [[D]], align
276 // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
277 // CHECK: store float [[MUL]], float* [[D_IN]], align
278 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
279 // CHECK: br label
280 //
281 // CHECK: ret
282 
283 //
284 // Reduction function
285 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
286 // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
287 // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
288 //
289 // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
290 // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
291 //
292 // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
293 // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
294 // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
295 //
296 // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
297 // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
298 // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
299 //
300 // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
301 // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
302 // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
303 // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
304 // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
305 // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
306 // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
307 //
308 // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
309 // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
310 // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
311 // CHECK: store float [[RES]], float* [[VAR2_LHS]],
312 // CHECK: ret void
313 
314 //
315 // Shuffle and reduce function
316 // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
317 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
318 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
319 // CHECK: [[REMOTE_ELT2:%.+]] = alloca float
320 //
321 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
322 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
323 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
324 //
325 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
326 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
327 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
328 // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
329 //
330 // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
331 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
332 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
333 // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
334 // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
335 //
336 // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
337 // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
338 //
339 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
340 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
341 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
342 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
343 //
344 // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
345 // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
346 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
347 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
348 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
349 // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
350 //
351 // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
352 // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
353 // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
354 //
355 // Condition to reduce
356 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
357 //
358 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
359 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
360 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
361 //
362 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
363 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
364 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
365 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
366 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
367 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
368 //
369 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
370 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
371 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
372 //
373 // CHECK: [[DO_REDUCE]]
374 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
375 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
376 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
377 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
378 //
379 // CHECK: [[REDUCE_ELSE]]
380 // CHECK: br label {{%?}}[[REDUCE_CONT]]
381 //
382 // CHECK: [[REDUCE_CONT]]
383 // Now check if we should just copy over the remote reduction list
384 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
385 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
386 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
387 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
388 //
389 // CHECK: [[DO_COPY]]
390 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
391 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
392 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
393 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
394 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
395 // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
396 //
397 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
398 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
399 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
400 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
401 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
402 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
403 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
404 // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
405 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
406 //
407 // CHECK: [[COPY_ELSE]]
408 // CHECK: br label {{%?}}[[COPY_CONT]]
409 //
410 // CHECK: [[COPY_CONT]]
411 // CHECK: void
412 
413 //
414 // Inter warp copy function
415 // CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
416 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
417 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
418 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
419 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
420 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
421 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
422 //
423 // [[DO_COPY]]
424 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
425 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
426 //
427 // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
428 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
429 // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
430 // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
431 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
432 //
433 // CHECK: [[COPY_ELSE]]
434 // CHECK: br label {{%?}}[[COPY_CONT]]
435 //
436 // Barrier after copy to shared memory storage medium.
437 // CHECK: [[COPY_CONT]]
438 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
439 // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
440 //
441 // Read into warp 0.
442 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
443 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
444 //
445 // CHECK: [[DO_READ]]
446 // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
447 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
448 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
449 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
450 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
451 // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
452 // CHECK: br label {{%?}}[[READ_CONT:.+]]
453 //
454 // CHECK: [[READ_ELSE]]
455 // CHECK: br label {{%?}}[[READ_CONT]]
456 //
457 // CHECK: [[READ_CONT]]
458 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
459 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
460 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
461 //
462 // [[DO_COPY]]
463 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
464 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
465 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
466 //
467 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
468 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
469 // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
470 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
471 //
472 // CHECK: [[COPY_ELSE]]
473 // CHECK: br label {{%?}}[[COPY_CONT]]
474 //
475 // Barrier after copy to shared memory storage medium.
476 // CHECK: [[COPY_CONT]]
477 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
478 // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
479 //
480 // Read into warp 0.
481 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
482 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
483 //
484 // CHECK: [[DO_READ]]
485 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
486 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
487 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
488 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
489 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
490 // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
491 // CHECK: br label {{%?}}[[READ_CONT:.+]]
492 //
493 // CHECK: [[READ_ELSE]]
494 // CHECK: br label {{%?}}[[READ_CONT]]
495 //
496 // CHECK: [[READ_CONT]]
497 // CHECK: ret
498 
499 // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
500 //
501 // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
502 // CHECK: br label {{%?}}[[EXECUTE:.+]]
503 //
504 // CHECK: [[EXECUTE]]
505 // CHECK: {{call|invoke}} void [[PFN2:@.+]](i32*
506 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
507 //
508 //
509 // define internal void [[PFN2]](
510 // CHECK: store i32 0, i32* [[A:%.+]], align
511 // CHECK: store i16 -32768, i16* [[B:%.+]], align
512 // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
513 // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
514 // CHECK: store i32 [[OR]], i32* [[A]], align
515 // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
516 // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
517 // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
518 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
519 //
520 // CHECK: [[DO_MAX]]
521 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
522 //
523 // CHECK: [[MAX_ELSE]]
524 // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
525 // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
526 // CHECK: br label {{%?}}[[MAX_CONT]]
527 //
528 // CHECK: [[MAX_CONT]]
529 // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
530 // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
531 // CHECK: store i16 [[TRUNC]], i16* [[B]], align
532 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
533 // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
534 // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
535 // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
536 // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
537 // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
538 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
539 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
540 // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
541 // CHECK: br i1 [[CMP]], label
542 
543 // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
544 // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
545 // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
546 // CHECK: store i32 [[OR]], i32* [[A_IN]], align
547 // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
548 // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
549 // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
550 // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
551 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
552 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
553 //
554 // CHECK: [[DO_MAX]]
555 // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
556 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
557 //
558 // CHECK: [[MAX_ELSE]]
559 // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
560 // CHECK: br label {{%?}}[[MAX_CONT]]
561 //
562 // CHECK: [[MAX_CONT]]
563 // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
564 // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
565 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
566 // CHECK: br label
567 //
568 // CHECK: ret
569 
570 //
571 // Reduction function
572 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
573 // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
574 // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
575 // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
576 //
577 // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
578 // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
579 // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
580 //
581 // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
582 // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
583 // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
584 //
585 // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
586 // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
587 // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
588 //
589 // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
590 // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
591 // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
592 // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
593 //
594 // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
595 // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
596 // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
597 // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
598 //
599 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
600 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
601 //
602 // CHECK: [[DO_MAX]]
603 // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
604 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
605 //
606 // CHECK: [[MAX_ELSE]]
607 // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
608 // CHECK: br label {{%?}}[[MAX_CONT]]
609 //
610 // CHECK: [[MAX_CONT]]
611 // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
612 // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
613 // CHECK: ret void
614 
615 //
616 // Shuffle and reduce function
617 // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
618 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
619 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
620 // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
621 //
622 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
623 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
624 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
625 //
626 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
627 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
628 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
629 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
630 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
631 //
632 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
633 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
634 // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
635 //
636 // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
637 // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
638 // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
639 //
640 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
641 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
642 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
643 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
644 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
645 //
646 // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
647 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
648 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
649 // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
650 // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
651 //
652 // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
653 // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
654 // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
655 //
656 // Condition to reduce
657 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
658 //
659 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
660 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
661 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
662 //
663 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
664 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
665 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
666 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
667 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
668 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
669 //
670 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
671 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
672 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
673 //
674 // CHECK: [[DO_REDUCE]]
675 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
676 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
677 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
678 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
679 //
680 // CHECK: [[REDUCE_ELSE]]
681 // CHECK: br label {{%?}}[[REDUCE_CONT]]
682 //
683 // CHECK: [[REDUCE_CONT]]
684 // Now check if we should just copy over the remote reduction list
685 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
686 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
687 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
688 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
689 //
690 // CHECK: [[DO_COPY]]
691 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
692 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
693 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
694 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
695 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
696 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
697 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
698 // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
699 //
700 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
701 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
702 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
703 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
704 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
705 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
706 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
707 // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
708 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
709 //
710 // CHECK: [[COPY_ELSE]]
711 // CHECK: br label {{%?}}[[COPY_CONT]]
712 //
713 // CHECK: [[COPY_CONT]]
// CHECK: ret void
715 
716 //
717 // Inter warp copy function
718 // CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
719 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
720 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
721 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
722 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
723 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
724 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
725 //
// CHECK: [[DO_COPY]]
727 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
728 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
729 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
730 //
731 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
732 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
733 // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
734 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
735 //
736 // CHECK: [[COPY_ELSE]]
737 // CHECK: br label {{%?}}[[COPY_CONT]]
738 //
739 // Barrier after copy to shared memory storage medium.
740 // CHECK: [[COPY_CONT]]
741 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
742 // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
743 //
744 // Read into warp 0.
745 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
746 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
747 //
748 // CHECK: [[DO_READ]]
749 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
750 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
751 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
752 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
753 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
754 // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
755 // CHECK: br label {{%?}}[[READ_CONT:.+]]
756 //
757 // CHECK: [[READ_ELSE]]
758 // CHECK: br label {{%?}}[[READ_CONT]]
759 //
760 // CHECK: [[READ_CONT]]
761 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
762 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
763 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
764 //
// CHECK: [[DO_COPY]]
766 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
767 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
768 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
769 //
770 // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
771 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
772 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
773 // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
774 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
775 //
776 // CHECK: [[COPY_ELSE]]
777 // CHECK: br label {{%?}}[[COPY_CONT]]
778 //
779 // Barrier after copy to shared memory storage medium.
780 // CHECK: [[COPY_CONT]]
781 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
782 // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
783 //
784 // Read into warp 0.
785 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
786 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
787 //
788 // CHECK: [[DO_READ]]
789 // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
790 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
791 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
792 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
793 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
794 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
795 // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
796 // CHECK: br label {{%?}}[[READ_CONT:.+]]
797 //
798 // CHECK: [[READ_ELSE]]
799 // CHECK: br label {{%?}}[[READ_CONT]]
800 //
801 // CHECK: [[READ_CONT]]
802 // CHECK: ret
803 
804 #endif
805