1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
2 // Test target codegen - host bc file has to be created first.
3 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
4 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
5 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
6 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK2
7 // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK3
8 // expected-no-diagnostics
9 
10 #ifndef HEADER
11 #define HEADER
12 
13 __thread int id;
14 
15 int baz(int f, double &a);
16 
17 template <typename tx, typename ty>
18 struct TT {
19   tx X;
20   ty Y;
21   tx &operator[](int i) { return X; }
22 };
23 
24 void targetBar(int *Ptr1, int *Ptr2) {
25 #pragma omp target map(Ptr1[:0], Ptr2)
26 #pragma omp parallel num_threads(2)
27   *Ptr1 = *Ptr2;
28 }
29 
30 int foo(int n) {
31   int a = 0;
32   short aa = 0;
33   float b[10];
34   float bn[n];
35   double c[5][10];
36   double cn[5][n];
37   TT<long long, char> d;
38 
39 #pragma omp target
40   {
41   }
42 
43 #pragma omp target if (0)
44   {
45   }
46 
47 #pragma omp target if (1)
48   {
49     aa += 1;
50     aa += 2;
51   }
52 
53 #pragma omp target if (n > 20)
54   {
55     a += 1;
56     b[2] += 1.0;
57     bn[3] += 1.0;
58     c[1][2] += 1.0;
59     cn[1][3] += 1.0;
60     d.X += 1;
61     d.Y += 1;
62     d[0] += 1;
63   }
64 
65   return a;
66 }
67 
68 template <typename tx>
69 tx ftemplate(int n) {
70   tx a = 0;
71   short aa = 0;
72   tx b[10];
73 
74 #pragma omp target if (n > 40)
75   {
76     a += 1;
77     aa += 1;
78     b[2] += 1;
79   }
80 
81   return a;
82 }
83 
84 static int fstatic(int n) {
85   int a = 0;
86   short aa = 0;
87   char aaa = 0;
88   int b[10];
89 
90 #pragma omp target if (n > 50)
91   {
92     a += 1;
93     aa += 1;
94     aaa += 1;
95     b[2] += 1;
96   }
97 
98   return a;
99 }
100 
101 struct S1 {
102   double a;
103 
104   int r1(int n) {
105     int b = n + 1;
106     short int c[2][n];
107 
108 #pragma omp target if (n > 60)
109     {
110       this->a = (double)b + 1.5;
111       c[1][1] = ++a;
112       baz(a, a);
113     }
114 
115     return c[1][1] + (int)b;
116   }
117 };
118 
119 int bar(int n) {
120   int a = 0;
121 
122   a += foo(n);
123 
124   S1 S;
125   a += S.r1(n);
126 
127   a += fstatic(n);
128 
129   a += ftemplate<int>(n);
130 
131   return a;
132 }
133 
134 int baz(int f, double &a) {
135 #pragma omp parallel
136   f = 2 + a;
137   return f;
138 }
139 
140 extern void assert(int) throw() __attribute__((__noreturn__));
141 void unreachable_call() {
142 #pragma omp target
143     assert(0);
144 }
145 
146 #endif
147 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9targetBarPiS__l25
148 // CHECK1-SAME: (i32* [[PTR1:%.*]], i32** nonnull align 8 dereferenceable(8) [[PTR2:%.*]]) #[[ATTR0:[0-9]+]] {
149 // CHECK1-NEXT:  entry:
150 // CHECK1-NEXT:    [[PTR1_ADDR:%.*]] = alloca i32*, align 8
151 // CHECK1-NEXT:    [[PTR2_ADDR:%.*]] = alloca i32**, align 8
152 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
153 // CHECK1-NEXT:    store i32* [[PTR1]], i32** [[PTR1_ADDR]], align 8
154 // CHECK1-NEXT:    store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 8
155 // CHECK1-NEXT:    [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 8
156 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
157 // CHECK1-NEXT:    call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
158 // CHECK1-NEXT:    br label [[DOTEXECUTE:%.*]]
159 // CHECK1:       .execute:
160 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
161 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
162 // CHECK1-NEXT:    [[TMP3:%.*]] = bitcast i32** [[PTR1_ADDR]] to i8*
163 // CHECK1-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 8
164 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
165 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i32** [[TMP0]] to i8*
166 // CHECK1-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 8
167 // CHECK1-NEXT:    [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
168 // CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP6]], i64 2)
169 // CHECK1-NEXT:    br label [[DOTOMP_DEINIT:%.*]]
170 // CHECK1:       .omp.deinit:
171 // CHECK1-NEXT:    call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
172 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
173 // CHECK1:       .exit:
174 // CHECK1-NEXT:    ret void
175 //
176 //
177 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
178 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32** nonnull align 8 dereferenceable(8) [[PTR1:%.*]], i32** nonnull align 8 dereferenceable(8) [[PTR2:%.*]]) #[[ATTR0]] {
179 // CHECK1-NEXT:  entry:
180 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
181 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
182 // CHECK1-NEXT:    [[PTR1_ADDR:%.*]] = alloca i32**, align 8
183 // CHECK1-NEXT:    [[PTR2_ADDR:%.*]] = alloca i32**, align 8
184 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
185 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
186 // CHECK1-NEXT:    store i32** [[PTR1]], i32*** [[PTR1_ADDR]], align 8
187 // CHECK1-NEXT:    store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 8
188 // CHECK1-NEXT:    [[TMP0:%.*]] = load i32**, i32*** [[PTR1_ADDR]], align 8
189 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 8
190 // CHECK1-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 8
191 // CHECK1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
192 // CHECK1-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[TMP0]], align 8
193 // CHECK1-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
194 // CHECK1-NEXT:    ret void
195 //
196 //
197 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker
198 // CHECK1-SAME: () #[[ATTR3:[0-9]+]] {
199 // CHECK1-NEXT:  entry:
200 // CHECK1-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
201 // CHECK1-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
202 // CHECK1-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
203 // CHECK1-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
204 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
205 // CHECK1:       .await.work:
206 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
207 // CHECK1-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
208 // CHECK1-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
209 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
210 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
211 // CHECK1-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
212 // CHECK1-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
213 // CHECK1:       .select.workers:
214 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
215 // CHECK1-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
216 // CHECK1-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
217 // CHECK1:       .execute.parallel:
218 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
219 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
220 // CHECK1-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
221 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
222 // CHECK1:       .terminate.parallel:
223 // CHECK1-NEXT:    call void @__kmpc_kernel_end_parallel()
224 // CHECK1-NEXT:    br label [[DOTBARRIER_PARALLEL]]
225 // CHECK1:       .barrier.parallel:
226 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
227 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK]]
228 // CHECK1:       .exit:
229 // CHECK1-NEXT:    ret void
230 //
231 //
232 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39
233 // CHECK1-SAME: () #[[ATTR0]] {
234 // CHECK1-NEXT:  entry:
235 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
236 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
237 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
238 // CHECK1-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
239 // CHECK1-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
240 // CHECK1-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
241 // CHECK1:       .worker:
242 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker() #[[ATTR2:[0-9]+]]
243 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
244 // CHECK1:       .mastercheck:
245 // CHECK1-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
246 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
247 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
248 // CHECK1-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
249 // CHECK1-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
250 // CHECK1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
251 // CHECK1-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
252 // CHECK1-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
253 // CHECK1-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
254 // CHECK1:       .master:
255 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
256 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
257 // CHECK1-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
258 // CHECK1-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
259 // CHECK1-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
260 // CHECK1:       .termination.notifier:
261 // CHECK1-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
262 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
263 // CHECK1-NEXT:    br label [[DOTEXIT]]
264 // CHECK1:       .exit:
265 // CHECK1-NEXT:    ret void
266 //
267 //
268 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker
269 // CHECK1-SAME: () #[[ATTR3]] {
270 // CHECK1-NEXT:  entry:
271 // CHECK1-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
272 // CHECK1-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
273 // CHECK1-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
274 // CHECK1-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
275 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
276 // CHECK1:       .await.work:
277 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
278 // CHECK1-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
279 // CHECK1-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
280 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
281 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
282 // CHECK1-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
283 // CHECK1-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
284 // CHECK1:       .select.workers:
285 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
286 // CHECK1-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
287 // CHECK1-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
288 // CHECK1:       .execute.parallel:
289 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
290 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
291 // CHECK1-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
292 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
293 // CHECK1:       .terminate.parallel:
294 // CHECK1-NEXT:    call void @__kmpc_kernel_end_parallel()
295 // CHECK1-NEXT:    br label [[DOTBARRIER_PARALLEL]]
296 // CHECK1:       .barrier.parallel:
297 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
298 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK]]
299 // CHECK1:       .exit:
300 // CHECK1-NEXT:    ret void
301 //
302 //
303 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47
304 // CHECK1-SAME: (i64 [[AA:%.*]]) #[[ATTR0]] {
305 // CHECK1-NEXT:  entry:
306 // CHECK1-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
307 // CHECK1-NEXT:    store i64 [[AA]], i64* [[AA_ADDR]], align 8
308 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
309 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
310 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
311 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
312 // CHECK1-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
313 // CHECK1-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
314 // CHECK1-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
315 // CHECK1:       .worker:
316 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker() #[[ATTR2]]
317 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
318 // CHECK1:       .mastercheck:
319 // CHECK1-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
320 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
321 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
322 // CHECK1-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
323 // CHECK1-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
324 // CHECK1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
325 // CHECK1-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
326 // CHECK1-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
327 // CHECK1-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
328 // CHECK1:       .master:
329 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
330 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
331 // CHECK1-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
332 // CHECK1-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
333 // CHECK1-NEXT:    [[TMP5:%.*]] = load i16, i16* [[CONV]], align 8
334 // CHECK1-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP5]] to i32
335 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV7]], 1
336 // CHECK1-NEXT:    [[CONV8:%.*]] = trunc i32 [[ADD]] to i16
337 // CHECK1-NEXT:    store i16 [[CONV8]], i16* [[CONV]], align 8
338 // CHECK1-NEXT:    [[TMP6:%.*]] = load i16, i16* [[CONV]], align 8
339 // CHECK1-NEXT:    [[CONV9:%.*]] = sext i16 [[TMP6]] to i32
340 // CHECK1-NEXT:    [[ADD10:%.*]] = add nsw i32 [[CONV9]], 2
341 // CHECK1-NEXT:    [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
342 // CHECK1-NEXT:    store i16 [[CONV11]], i16* [[CONV]], align 8
343 // CHECK1-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
344 // CHECK1:       .termination.notifier:
345 // CHECK1-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
346 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
347 // CHECK1-NEXT:    br label [[DOTEXIT]]
348 // CHECK1:       .exit:
349 // CHECK1-NEXT:    ret void
350 //
351 //
352 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker
353 // CHECK1-SAME: () #[[ATTR3]] {
354 // CHECK1-NEXT:  entry:
355 // CHECK1-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
356 // CHECK1-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
357 // CHECK1-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
358 // CHECK1-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
359 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
360 // CHECK1:       .await.work:
361 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
362 // CHECK1-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
363 // CHECK1-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
364 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
365 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
366 // CHECK1-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
367 // CHECK1-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
368 // CHECK1:       .select.workers:
369 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
370 // CHECK1-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
371 // CHECK1-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
372 // CHECK1:       .execute.parallel:
373 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
374 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
375 // CHECK1-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
376 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
377 // CHECK1:       .terminate.parallel:
378 // CHECK1-NEXT:    call void @__kmpc_kernel_end_parallel()
379 // CHECK1-NEXT:    br label [[DOTBARRIER_PARALLEL]]
380 // CHECK1:       .barrier.parallel:
381 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
382 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK]]
383 // CHECK1:       .exit:
384 // CHECK1-NEXT:    ret void
385 //
386 //
387 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53
388 // CHECK1-SAME: (i64 [[A:%.*]], [10 x float]* nonnull align 4 dereferenceable(40) [[B:%.*]], i64 [[VLA:%.*]], float* nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* nonnull align 8 dereferenceable(400) [[C:%.*]], i64 [[VLA1:%.*]], i64 [[VLA3:%.*]], double* nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] {
389 // CHECK1-NEXT:  entry:
390 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
391 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca [10 x float]*, align 8
392 // CHECK1-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
393 // CHECK1-NEXT:    [[BN_ADDR:%.*]] = alloca float*, align 8
394 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 8
395 // CHECK1-NEXT:    [[VLA_ADDR2:%.*]] = alloca i64, align 8
396 // CHECK1-NEXT:    [[VLA_ADDR4:%.*]] = alloca i64, align 8
397 // CHECK1-NEXT:    [[CN_ADDR:%.*]] = alloca double*, align 8
398 // CHECK1-NEXT:    [[D_ADDR:%.*]] = alloca %struct.TT*, align 8
399 // CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
400 // CHECK1-NEXT:    store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 8
401 // CHECK1-NEXT:    store i64 [[VLA]], i64* [[VLA_ADDR]], align 8
402 // CHECK1-NEXT:    store float* [[BN]], float** [[BN_ADDR]], align 8
403 // CHECK1-NEXT:    store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 8
404 // CHECK1-NEXT:    store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8
405 // CHECK1-NEXT:    store i64 [[VLA3]], i64* [[VLA_ADDR4]], align 8
406 // CHECK1-NEXT:    store double* [[CN]], double** [[CN_ADDR]], align 8
407 // CHECK1-NEXT:    store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 8
408 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
409 // CHECK1-NEXT:    [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 8
410 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8
411 // CHECK1-NEXT:    [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 8
412 // CHECK1-NEXT:    [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 8
413 // CHECK1-NEXT:    [[TMP4:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8
414 // CHECK1-NEXT:    [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8
415 // CHECK1-NEXT:    [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8
416 // CHECK1-NEXT:    [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8
417 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
418 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
419 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
420 // CHECK1-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
421 // CHECK1-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
422 // CHECK1-NEXT:    br i1 [[TMP8]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
423 // CHECK1:       .worker:
424 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker() #[[ATTR2]]
425 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
426 // CHECK1:       .mastercheck:
427 // CHECK1-NEXT:    [[NVPTX_TID5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
428 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
429 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
430 // CHECK1-NEXT:    [[TMP9:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE7]], 1
431 // CHECK1-NEXT:    [[TMP10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], 1
432 // CHECK1-NEXT:    [[TMP11:%.*]] = xor i32 [[TMP9]], -1
433 // CHECK1-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP10]], [[TMP11]]
434 // CHECK1-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[NVPTX_TID5]], [[MASTER_TID]]
435 // CHECK1-NEXT:    br i1 [[TMP12]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
436 // CHECK1:       .master:
437 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS8:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
438 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
439 // CHECK1-NEXT:    [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]]
440 // CHECK1-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1)
441 // CHECK1-NEXT:    [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
442 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
443 // CHECK1-NEXT:    store i32 [[ADD]], i32* [[CONV]], align 8
444 // CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[TMP0]], i64 0, i64 2
445 // CHECK1-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
446 // CHECK1-NEXT:    [[CONV11:%.*]] = fpext float [[TMP14]] to double
447 // CHECK1-NEXT:    [[ADD12:%.*]] = fadd double [[CONV11]], 1.000000e+00
448 // CHECK1-NEXT:    [[CONV13:%.*]] = fptrunc double [[ADD12]] to float
449 // CHECK1-NEXT:    store float [[CONV13]], float* [[ARRAYIDX]], align 4
450 // CHECK1-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 3
451 // CHECK1-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX14]], align 4
452 // CHECK1-NEXT:    [[CONV15:%.*]] = fpext float [[TMP15]] to double
453 // CHECK1-NEXT:    [[ADD16:%.*]] = fadd double [[CONV15]], 1.000000e+00
454 // CHECK1-NEXT:    [[CONV17:%.*]] = fptrunc double [[ADD16]] to float
455 // CHECK1-NEXT:    store float [[CONV17]], float* [[ARRAYIDX14]], align 4
456 // CHECK1-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[TMP3]], i64 0, i64 1
457 // CHECK1-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX18]], i64 0, i64 2
458 // CHECK1-NEXT:    [[TMP16:%.*]] = load double, double* [[ARRAYIDX19]], align 8
459 // CHECK1-NEXT:    [[ADD20:%.*]] = fadd double [[TMP16]], 1.000000e+00
460 // CHECK1-NEXT:    store double [[ADD20]], double* [[ARRAYIDX19]], align 8
461 // CHECK1-NEXT:    [[TMP17:%.*]] = mul nsw i64 1, [[TMP5]]
462 // CHECK1-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds double, double* [[TMP6]], i64 [[TMP17]]
463 // CHECK1-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX21]], i64 3
464 // CHECK1-NEXT:    [[TMP18:%.*]] = load double, double* [[ARRAYIDX22]], align 8
465 // CHECK1-NEXT:    [[ADD23:%.*]] = fadd double [[TMP18]], 1.000000e+00
466 // CHECK1-NEXT:    store double [[ADD23]], double* [[ARRAYIDX22]], align 8
467 // CHECK1-NEXT:    [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[TMP7]], i32 0, i32 0
468 // CHECK1-NEXT:    [[TMP19:%.*]] = load i64, i64* [[X]], align 8
469 // CHECK1-NEXT:    [[ADD24:%.*]] = add nsw i64 [[TMP19]], 1
470 // CHECK1-NEXT:    store i64 [[ADD24]], i64* [[X]], align 8
471 // CHECK1-NEXT:    [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[TMP7]], i32 0, i32 1
472 // CHECK1-NEXT:    [[TMP20:%.*]] = load i8, i8* [[Y]], align 8
473 // CHECK1-NEXT:    [[CONV25:%.*]] = sext i8 [[TMP20]] to i32
474 // CHECK1-NEXT:    [[ADD26:%.*]] = add nsw i32 [[CONV25]], 1
475 // CHECK1-NEXT:    [[CONV27:%.*]] = trunc i32 [[ADD26]] to i8
476 // CHECK1-NEXT:    store i8 [[CONV27]], i8* [[Y]], align 8
477 // CHECK1-NEXT:    [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR7:[0-9]+]]
478 // CHECK1-NEXT:    [[TMP21:%.*]] = load i64, i64* [[CALL]], align 8
479 // CHECK1-NEXT:    [[ADD28:%.*]] = add nsw i64 [[TMP21]], 1
480 // CHECK1-NEXT:    store i64 [[ADD28]], i64* [[CALL]], align 8
481 // CHECK1-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
482 // CHECK1:       .termination.notifier:
483 // CHECK1-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
484 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
485 // CHECK1-NEXT:    br label [[DOTEXIT]]
486 // CHECK1:       .exit:
487 // CHECK1-NEXT:    ret void
488 //
489 //
490 // CHECK1-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi
491 // CHECK1-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR5:[0-9]+]] comdat align 2 {
492 // CHECK1-NEXT:  entry:
493 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 8
494 // CHECK1-NEXT:    [[I_ADDR:%.*]] = alloca i32, align 4
495 // CHECK1-NEXT:    store %struct.TT* [[THIS]], %struct.TT** [[THIS_ADDR]], align 8
496 // CHECK1-NEXT:    store i32 [[I]], i32* [[I_ADDR]], align 4
497 // CHECK1-NEXT:    [[THIS1:%.*]] = load %struct.TT*, %struct.TT** [[THIS_ADDR]], align 8
498 // CHECK1-NEXT:    [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[THIS1]], i32 0, i32 0
499 // CHECK1-NEXT:    ret i64* [[X]]
500 //
501 //
502 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker
503 // CHECK1-SAME: () #[[ATTR3]] {
504 // CHECK1-NEXT:  entry:
505 // CHECK1-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
506 // CHECK1-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
507 // CHECK1-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
508 // CHECK1-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
509 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
510 // CHECK1:       .await.work:
511 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
512 // CHECK1-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
513 // CHECK1-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
514 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
515 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
516 // CHECK1-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
517 // CHECK1-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
518 // CHECK1:       .select.workers:
519 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
520 // CHECK1-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
521 // CHECK1-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
522 // CHECK1:       .execute.parallel:
523 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
524 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
525 // CHECK1-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
526 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
527 // CHECK1:       .terminate.parallel:
528 // CHECK1-NEXT:    call void @__kmpc_kernel_end_parallel()
529 // CHECK1-NEXT:    br label [[DOTBARRIER_PARALLEL]]
530 // CHECK1:       .barrier.parallel:
531 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
532 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK]]
533 // CHECK1:       .exit:
534 // CHECK1-NEXT:    ret void
535 //
536 //
537 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90
538 // CHECK1-SAME: (i64 [[A:%.*]], i64 [[AA:%.*]], i64 [[AAA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
539 // CHECK1-NEXT:  entry:
540 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
541 // CHECK1-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
542 // CHECK1-NEXT:    [[AAA_ADDR:%.*]] = alloca i64, align 8
543 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
544 // CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
545 // CHECK1-NEXT:    store i64 [[AA]], i64* [[AA_ADDR]], align 8
546 // CHECK1-NEXT:    store i64 [[AAA]], i64* [[AAA_ADDR]], align 8
547 // CHECK1-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
548 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
549 // CHECK1-NEXT:    [[CONV1:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
550 // CHECK1-NEXT:    [[CONV2:%.*]] = bitcast i64* [[AAA_ADDR]] to i8*
551 // CHECK1-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
552 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
553 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
554 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
555 // CHECK1-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
556 // CHECK1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
557 // CHECK1-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
558 // CHECK1:       .worker:
559 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker() #[[ATTR2]]
560 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
561 // CHECK1:       .mastercheck:
562 // CHECK1-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
563 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
564 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
565 // CHECK1-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
566 // CHECK1-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
567 // CHECK1-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
568 // CHECK1-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
569 // CHECK1-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
570 // CHECK1-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
571 // CHECK1:       .master:
572 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
573 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
574 // CHECK1-NEXT:    [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
575 // CHECK1-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
576 // CHECK1-NEXT:    [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8
577 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
578 // CHECK1-NEXT:    store i32 [[ADD]], i32* [[CONV]], align 8
579 // CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[CONV1]], align 8
580 // CHECK1-NEXT:    [[CONV9:%.*]] = sext i16 [[TMP7]] to i32
581 // CHECK1-NEXT:    [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1
582 // CHECK1-NEXT:    [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
583 // CHECK1-NEXT:    store i16 [[CONV11]], i16* [[CONV1]], align 8
584 // CHECK1-NEXT:    [[TMP8:%.*]] = load i8, i8* [[CONV2]], align 8
585 // CHECK1-NEXT:    [[CONV12:%.*]] = sext i8 [[TMP8]] to i32
586 // CHECK1-NEXT:    [[ADD13:%.*]] = add nsw i32 [[CONV12]], 1
587 // CHECK1-NEXT:    [[CONV14:%.*]] = trunc i32 [[ADD13]] to i8
588 // CHECK1-NEXT:    store i8 [[CONV14]], i8* [[CONV2]], align 8
589 // CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2
590 // CHECK1-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
591 // CHECK1-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP9]], 1
592 // CHECK1-NEXT:    store i32 [[ADD15]], i32* [[ARRAYIDX]], align 4
593 // CHECK1-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
594 // CHECK1:       .termination.notifier:
595 // CHECK1-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
596 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
597 // CHECK1-NEXT:    br label [[DOTEXIT]]
598 // CHECK1:       .exit:
599 // CHECK1-NEXT:    ret void
600 //
601 //
602 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker
603 // CHECK1-SAME: () #[[ATTR3]] {
604 // CHECK1-NEXT:  entry:
605 // CHECK1-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
606 // CHECK1-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
607 // CHECK1-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
608 // CHECK1-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
609 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
610 // CHECK1:       .await.work:
611 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
612 // CHECK1-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
613 // CHECK1-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
614 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
615 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
616 // CHECK1-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
617 // CHECK1-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
618 // CHECK1:       .select.workers:
619 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
620 // CHECK1-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
621 // CHECK1-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
622 // CHECK1:       .execute.parallel:
623 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
624 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
625 // CHECK1-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
626 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
627 // CHECK1:       .terminate.parallel:
628 // CHECK1-NEXT:    call void @__kmpc_kernel_end_parallel()
629 // CHECK1-NEXT:    br label [[DOTBARRIER_PARALLEL]]
630 // CHECK1:       .barrier.parallel:
631 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
632 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK]]
633 // CHECK1:       .exit:
634 // CHECK1-NEXT:    ret void
635 //
636 //
637 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108
638 // CHECK1-SAME: (%struct.S1* [[THIS:%.*]], i64 [[B:%.*]], i64 [[VLA:%.*]], i64 [[VLA1:%.*]], i16* nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] {
639 // CHECK1-NEXT:  entry:
640 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8
641 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
642 // CHECK1-NEXT:    [[VLA_ADDR:%.*]] = alloca i64, align 8
643 // CHECK1-NEXT:    [[VLA_ADDR2:%.*]] = alloca i64, align 8
644 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca i16*, align 8
645 // CHECK1-NEXT:    store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8
646 // CHECK1-NEXT:    store i64 [[B]], i64* [[B_ADDR]], align 8
647 // CHECK1-NEXT:    store i64 [[VLA]], i64* [[VLA_ADDR]], align 8
648 // CHECK1-NEXT:    store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8
649 // CHECK1-NEXT:    store i16* [[C]], i16** [[C_ADDR]], align 8
650 // CHECK1-NEXT:    [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8
651 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[B_ADDR]] to i32*
652 // CHECK1-NEXT:    [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8
653 // CHECK1-NEXT:    [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8
654 // CHECK1-NEXT:    [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8
655 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
656 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
657 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
658 // CHECK1-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
659 // CHECK1-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
660 // CHECK1-NEXT:    br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
661 // CHECK1:       .worker:
662 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker() #[[ATTR2]]
663 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
664 // CHECK1:       .mastercheck:
665 // CHECK1-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
666 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
667 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
668 // CHECK1-NEXT:    [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
669 // CHECK1-NEXT:    [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
670 // CHECK1-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], -1
671 // CHECK1-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP6]], [[TMP7]]
672 // CHECK1-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
673 // CHECK1-NEXT:    br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
674 // CHECK1:       .master:
675 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
676 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
677 // CHECK1-NEXT:    [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
678 // CHECK1-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
679 // CHECK1-NEXT:    [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8
680 // CHECK1-NEXT:    [[CONV9:%.*]] = sitofp i32 [[TMP9]] to double
681 // CHECK1-NEXT:    [[ADD:%.*]] = fadd double [[CONV9]], 1.500000e+00
682 // CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0
683 // CHECK1-NEXT:    store double [[ADD]], double* [[A]], align 8
684 // CHECK1-NEXT:    [[A10:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
685 // CHECK1-NEXT:    [[TMP10:%.*]] = load double, double* [[A10]], align 8
686 // CHECK1-NEXT:    [[INC:%.*]] = fadd double [[TMP10]], 1.000000e+00
687 // CHECK1-NEXT:    store double [[INC]], double* [[A10]], align 8
688 // CHECK1-NEXT:    [[CONV11:%.*]] = fptosi double [[INC]] to i16
689 // CHECK1-NEXT:    [[TMP11:%.*]] = mul nsw i64 1, [[TMP2]]
690 // CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i64 [[TMP11]]
691 // CHECK1-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1
692 // CHECK1-NEXT:    store i16 [[CONV11]], i16* [[ARRAYIDX12]], align 2
693 // CHECK1-NEXT:    [[A13:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
694 // CHECK1-NEXT:    [[TMP12:%.*]] = load double, double* [[A13]], align 8
695 // CHECK1-NEXT:    [[CONV14:%.*]] = fptosi double [[TMP12]] to i32
696 // CHECK1-NEXT:    [[A15:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
697 // CHECK1-NEXT:    [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV14]], double* nonnull align 8 dereferenceable(8) [[A15]]) #[[ATTR7]]
698 // CHECK1-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
699 // CHECK1:       .termination.notifier:
700 // CHECK1-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
701 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
702 // CHECK1-NEXT:    br label [[DOTEXIT]]
703 // CHECK1:       .exit:
704 // CHECK1-NEXT:    ret void
705 //
706 //
707 // CHECK1-LABEL: define {{[^@]+}}@_Z3baziRd
708 // CHECK1-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
709 // CHECK1-NEXT:  entry:
710 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca double*, align 8
711 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
712 // CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
713 // CHECK1-NEXT:    [[F:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
714 // CHECK1-NEXT:    [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
715 // CHECK1-NEXT:    store i32 [[F1]], i32* [[F_ON_STACK]], align 4
716 // CHECK1-NEXT:    store double* [[A]], double** [[A_ADDR]], align 8
717 // CHECK1-NEXT:    [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 8
718 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
719 // CHECK1-NEXT:    [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8*
720 // CHECK1-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 8
721 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
722 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8*
723 // CHECK1-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 8
724 // CHECK1-NEXT:    [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
725 // CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i64 2)
726 // CHECK1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4
727 // CHECK1-NEXT:    call void @__kmpc_free_shared(i8* [[F]])
728 // CHECK1-NEXT:    ret i32 [[TMP7]]
729 //
730 //
731 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker
732 // CHECK1-SAME: () #[[ATTR3]] {
733 // CHECK1-NEXT:  entry:
734 // CHECK1-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
735 // CHECK1-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
736 // CHECK1-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
737 // CHECK1-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
738 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
739 // CHECK1:       .await.work:
740 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
741 // CHECK1-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
742 // CHECK1-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
743 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
744 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
745 // CHECK1-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
746 // CHECK1-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
747 // CHECK1:       .select.workers:
748 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
749 // CHECK1-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
750 // CHECK1-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
751 // CHECK1:       .execute.parallel:
752 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
753 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
754 // CHECK1-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
755 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
756 // CHECK1:       .terminate.parallel:
757 // CHECK1-NEXT:    call void @__kmpc_kernel_end_parallel()
758 // CHECK1-NEXT:    br label [[DOTBARRIER_PARALLEL]]
759 // CHECK1:       .barrier.parallel:
760 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
761 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK]]
762 // CHECK1:       .exit:
763 // CHECK1-NEXT:    ret void
764 //
765 //
766 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142
767 // CHECK1-SAME: () #[[ATTR0]] {
768 // CHECK1-NEXT:  entry:
769 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
770 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
771 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
772 // CHECK1-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
773 // CHECK1-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
774 // CHECK1-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
775 // CHECK1:       .worker:
776 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker() #[[ATTR2]]
777 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
778 // CHECK1:       .mastercheck:
779 // CHECK1-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
780 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
781 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
782 // CHECK1-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
783 // CHECK1-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
784 // CHECK1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
785 // CHECK1-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
786 // CHECK1-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
787 // CHECK1-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
788 // CHECK1:       .master:
789 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
790 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
791 // CHECK1-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
792 // CHECK1-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
793 // CHECK1-NEXT:    call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]]
794 // CHECK1-NEXT:    unreachable
795 // CHECK1:       5:
796 // CHECK1-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
797 // CHECK1:       .termination.notifier:
798 // CHECK1-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
799 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
800 // CHECK1-NEXT:    br label [[DOTEXIT]]
801 // CHECK1:       .exit:
802 // CHECK1-NEXT:    ret void
803 //
804 //
805 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker
806 // CHECK1-SAME: () #[[ATTR3]] {
807 // CHECK1-NEXT:  entry:
808 // CHECK1-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
809 // CHECK1-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
810 // CHECK1-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
811 // CHECK1-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
812 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
813 // CHECK1:       .await.work:
814 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
815 // CHECK1-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
816 // CHECK1-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
817 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
818 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
819 // CHECK1-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
820 // CHECK1-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
821 // CHECK1:       .select.workers:
822 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
823 // CHECK1-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
824 // CHECK1-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
825 // CHECK1:       .execute.parallel:
826 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
827 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
828 // CHECK1-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
829 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
830 // CHECK1:       .terminate.parallel:
831 // CHECK1-NEXT:    call void @__kmpc_kernel_end_parallel()
832 // CHECK1-NEXT:    br label [[DOTBARRIER_PARALLEL]]
833 // CHECK1:       .barrier.parallel:
834 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
835 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK]]
836 // CHECK1:       .exit:
837 // CHECK1-NEXT:    ret void
838 //
839 //
840 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
841 // CHECK1-SAME: (i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
842 // CHECK1-NEXT:  entry:
843 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
844 // CHECK1-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
845 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
846 // CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
847 // CHECK1-NEXT:    store i64 [[AA]], i64* [[AA_ADDR]], align 8
848 // CHECK1-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
849 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
850 // CHECK1-NEXT:    [[CONV1:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
851 // CHECK1-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
852 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
853 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
854 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
855 // CHECK1-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
856 // CHECK1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
857 // CHECK1-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
858 // CHECK1:       .worker:
859 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker() #[[ATTR2]]
860 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
861 // CHECK1:       .mastercheck:
862 // CHECK1-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
863 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
864 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
865 // CHECK1-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
866 // CHECK1-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
867 // CHECK1-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
868 // CHECK1-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
869 // CHECK1-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
870 // CHECK1-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
871 // CHECK1:       .master:
872 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
873 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
874 // CHECK1-NEXT:    [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
875 // CHECK1-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
876 // CHECK1-NEXT:    [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8
877 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
878 // CHECK1-NEXT:    store i32 [[ADD]], i32* [[CONV]], align 8
879 // CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[CONV1]], align 8
880 // CHECK1-NEXT:    [[CONV8:%.*]] = sext i16 [[TMP7]] to i32
881 // CHECK1-NEXT:    [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
882 // CHECK1-NEXT:    [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
883 // CHECK1-NEXT:    store i16 [[CONV10]], i16* [[CONV1]], align 8
884 // CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2
885 // CHECK1-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
886 // CHECK1-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP8]], 1
887 // CHECK1-NEXT:    store i32 [[ADD11]], i32* [[ARRAYIDX]], align 4
888 // CHECK1-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
889 // CHECK1:       .termination.notifier:
890 // CHECK1-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
891 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
892 // CHECK1-NEXT:    br label [[DOTEXIT]]
893 // CHECK1:       .exit:
894 // CHECK1-NEXT:    ret void
895 //
896 //
897 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
898 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[F:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR0]] {
899 // CHECK1-NEXT:  entry:
900 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
901 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
902 // CHECK1-NEXT:    [[F_ADDR:%.*]] = alloca i32*, align 8
903 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca double*, align 8
904 // CHECK1-NEXT:    [[TMP:%.*]] = alloca double*, align 8
905 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
906 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
907 // CHECK1-NEXT:    store i32* [[F]], i32** [[F_ADDR]], align 8
908 // CHECK1-NEXT:    store double* [[A]], double** [[A_ADDR]], align 8
909 // CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[F_ADDR]], align 8
910 // CHECK1-NEXT:    [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 8
911 // CHECK1-NEXT:    store double* [[TMP1]], double** [[TMP]], align 8
912 // CHECK1-NEXT:    [[TMP2:%.*]] = load double*, double** [[TMP]], align 8
913 // CHECK1-NEXT:    [[TMP3:%.*]] = load double, double* [[TMP2]], align 8
914 // CHECK1-NEXT:    [[ADD:%.*]] = fadd double 2.000000e+00, [[TMP3]]
915 // CHECK1-NEXT:    [[CONV:%.*]] = fptosi double [[ADD]] to i32
916 // CHECK1-NEXT:    store i32 [[CONV]], i32* [[TMP0]], align 4
917 // CHECK1-NEXT:    ret void
918 //
919 //
920 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
921 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3]] {
922 // CHECK1-NEXT:  entry:
923 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
924 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
925 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
926 // CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
927 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
928 // CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
929 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
930 // CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
931 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
932 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
933 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
934 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
935 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1
936 // CHECK1-NEXT:    [[TMP7:%.*]] = bitcast i8** [[TMP6]] to double**
937 // CHECK1-NEXT:    [[TMP8:%.*]] = load double*, double** [[TMP7]], align 8
938 // CHECK1-NEXT:    call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], double* [[TMP8]]) #[[ATTR2]]
939 // CHECK1-NEXT:    ret void
940 //
941 //
942 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9targetBarPiS__l25
943 // CHECK2-SAME: (i32* [[PTR1:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR2:%.*]]) #[[ATTR0:[0-9]+]] {
944 // CHECK2-NEXT:  entry:
945 // CHECK2-NEXT:    [[PTR1_ADDR:%.*]] = alloca i32*, align 4
946 // CHECK2-NEXT:    [[PTR2_ADDR:%.*]] = alloca i32**, align 4
947 // CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
948 // CHECK2-NEXT:    store i32* [[PTR1]], i32** [[PTR1_ADDR]], align 4
949 // CHECK2-NEXT:    store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4
950 // CHECK2-NEXT:    [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4
951 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
952 // CHECK2-NEXT:    call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
953 // CHECK2-NEXT:    br label [[DOTEXECUTE:%.*]]
954 // CHECK2:       .execute:
955 // CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
956 // CHECK2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
957 // CHECK2-NEXT:    [[TMP3:%.*]] = bitcast i32** [[PTR1_ADDR]] to i8*
958 // CHECK2-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 4
959 // CHECK2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
960 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i32** [[TMP0]] to i8*
961 // CHECK2-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
962 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
963 // CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP6]], i32 2)
964 // CHECK2-NEXT:    br label [[DOTOMP_DEINIT:%.*]]
965 // CHECK2:       .omp.deinit:
966 // CHECK2-NEXT:    call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
967 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
968 // CHECK2:       .exit:
969 // CHECK2-NEXT:    ret void
970 //
971 //
972 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
973 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR1:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR2:%.*]]) #[[ATTR0]] {
974 // CHECK2-NEXT:  entry:
975 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
976 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
977 // CHECK2-NEXT:    [[PTR1_ADDR:%.*]] = alloca i32**, align 4
978 // CHECK2-NEXT:    [[PTR2_ADDR:%.*]] = alloca i32**, align 4
979 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
980 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
981 // CHECK2-NEXT:    store i32** [[PTR1]], i32*** [[PTR1_ADDR]], align 4
982 // CHECK2-NEXT:    store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4
983 // CHECK2-NEXT:    [[TMP0:%.*]] = load i32**, i32*** [[PTR1_ADDR]], align 4
984 // CHECK2-NEXT:    [[TMP1:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4
985 // CHECK2-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 4
986 // CHECK2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
987 // CHECK2-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[TMP0]], align 4
988 // CHECK2-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
989 // CHECK2-NEXT:    ret void
990 //
991 //
992 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker
993 // CHECK2-SAME: () #[[ATTR3:[0-9]+]] {
994 // CHECK2-NEXT:  entry:
995 // CHECK2-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
996 // CHECK2-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
997 // CHECK2-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
998 // CHECK2-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
999 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1000 // CHECK2:       .await.work:
1001 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1002 // CHECK2-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1003 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1004 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1005 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1006 // CHECK2-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1007 // CHECK2-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1008 // CHECK2:       .select.workers:
1009 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1010 // CHECK2-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1011 // CHECK2-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1012 // CHECK2:       .execute.parallel:
1013 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
1014 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1015 // CHECK2-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1016 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1017 // CHECK2:       .terminate.parallel:
1018 // CHECK2-NEXT:    call void @__kmpc_kernel_end_parallel()
1019 // CHECK2-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1020 // CHECK2:       .barrier.parallel:
1021 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1022 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK]]
1023 // CHECK2:       .exit:
1024 // CHECK2-NEXT:    ret void
1025 //
1026 //
1027 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39
1028 // CHECK2-SAME: () #[[ATTR0]] {
1029 // CHECK2-NEXT:  entry:
1030 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1031 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1032 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1033 // CHECK2-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1034 // CHECK2-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1035 // CHECK2-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1036 // CHECK2:       .worker:
1037 // CHECK2-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker() #[[ATTR2:[0-9]+]]
1038 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
1039 // CHECK2:       .mastercheck:
1040 // CHECK2-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1041 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1042 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1043 // CHECK2-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1044 // CHECK2-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1045 // CHECK2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1046 // CHECK2-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1047 // CHECK2-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1048 // CHECK2-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1049 // CHECK2:       .master:
1050 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1051 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1052 // CHECK2-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1053 // CHECK2-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1054 // CHECK2-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1055 // CHECK2:       .termination.notifier:
1056 // CHECK2-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1057 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1058 // CHECK2-NEXT:    br label [[DOTEXIT]]
1059 // CHECK2:       .exit:
1060 // CHECK2-NEXT:    ret void
1061 //
1062 //
1063 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker
1064 // CHECK2-SAME: () #[[ATTR3]] {
1065 // CHECK2-NEXT:  entry:
1066 // CHECK2-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1067 // CHECK2-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1068 // CHECK2-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1069 // CHECK2-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1070 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1071 // CHECK2:       .await.work:
1072 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1073 // CHECK2-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1074 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1075 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1076 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1077 // CHECK2-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1078 // CHECK2-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1079 // CHECK2:       .select.workers:
1080 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1081 // CHECK2-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1082 // CHECK2-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1083 // CHECK2:       .execute.parallel:
1084 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
1085 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1086 // CHECK2-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1087 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1088 // CHECK2:       .terminate.parallel:
1089 // CHECK2-NEXT:    call void @__kmpc_kernel_end_parallel()
1090 // CHECK2-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1091 // CHECK2:       .barrier.parallel:
1092 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1093 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK]]
1094 // CHECK2:       .exit:
1095 // CHECK2-NEXT:    ret void
1096 //
1097 //
1098 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47
1099 // CHECK2-SAME: (i32 [[AA:%.*]]) #[[ATTR0]] {
1100 // CHECK2-NEXT:  entry:
1101 // CHECK2-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
1102 // CHECK2-NEXT:    store i32 [[AA]], i32* [[AA_ADDR]], align 4
1103 // CHECK2-NEXT:    [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
1104 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1105 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1106 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1107 // CHECK2-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1108 // CHECK2-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1109 // CHECK2-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1110 // CHECK2:       .worker:
1111 // CHECK2-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker() #[[ATTR2]]
1112 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
1113 // CHECK2:       .mastercheck:
1114 // CHECK2-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1115 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1116 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1117 // CHECK2-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1118 // CHECK2-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1119 // CHECK2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1120 // CHECK2-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1121 // CHECK2-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1122 // CHECK2-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1123 // CHECK2:       .master:
1124 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1125 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1126 // CHECK2-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1127 // CHECK2-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1128 // CHECK2-NEXT:    [[TMP5:%.*]] = load i16, i16* [[CONV]], align 4
1129 // CHECK2-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP5]] to i32
1130 // CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV7]], 1
1131 // CHECK2-NEXT:    [[CONV8:%.*]] = trunc i32 [[ADD]] to i16
1132 // CHECK2-NEXT:    store i16 [[CONV8]], i16* [[CONV]], align 4
1133 // CHECK2-NEXT:    [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4
1134 // CHECK2-NEXT:    [[CONV9:%.*]] = sext i16 [[TMP6]] to i32
1135 // CHECK2-NEXT:    [[ADD10:%.*]] = add nsw i32 [[CONV9]], 2
1136 // CHECK2-NEXT:    [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
1137 // CHECK2-NEXT:    store i16 [[CONV11]], i16* [[CONV]], align 4
1138 // CHECK2-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1139 // CHECK2:       .termination.notifier:
1140 // CHECK2-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1141 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1142 // CHECK2-NEXT:    br label [[DOTEXIT]]
1143 // CHECK2:       .exit:
1144 // CHECK2-NEXT:    ret void
1145 //
1146 //
1147 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker
1148 // CHECK2-SAME: () #[[ATTR3]] {
1149 // CHECK2-NEXT:  entry:
1150 // CHECK2-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1151 // CHECK2-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1152 // CHECK2-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1153 // CHECK2-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1154 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1155 // CHECK2:       .await.work:
1156 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1157 // CHECK2-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1158 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1159 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1160 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1161 // CHECK2-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1162 // CHECK2-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1163 // CHECK2:       .select.workers:
1164 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1165 // CHECK2-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1166 // CHECK2-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1167 // CHECK2:       .execute.parallel:
1168 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
1169 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1170 // CHECK2-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1171 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1172 // CHECK2:       .terminate.parallel:
1173 // CHECK2-NEXT:    call void @__kmpc_kernel_end_parallel()
1174 // CHECK2-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1175 // CHECK2:       .barrier.parallel:
1176 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1177 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK]]
1178 // CHECK2:       .exit:
1179 // CHECK2-NEXT:    ret void
1180 //
1181 //
1182 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53
1183 // CHECK2-SAME: (i32 [[A:%.*]], [10 x float]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[VLA:%.*]], float* nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* nonnull align 8 dereferenceable(400) [[C:%.*]], i32 [[VLA1:%.*]], i32 [[VLA3:%.*]], double* nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] {
1184 // CHECK2-NEXT:  entry:
1185 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1186 // CHECK2-NEXT:    [[B_ADDR:%.*]] = alloca [10 x float]*, align 4
1187 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i32, align 4
1188 // CHECK2-NEXT:    [[BN_ADDR:%.*]] = alloca float*, align 4
1189 // CHECK2-NEXT:    [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4
1190 // CHECK2-NEXT:    [[VLA_ADDR2:%.*]] = alloca i32, align 4
1191 // CHECK2-NEXT:    [[VLA_ADDR4:%.*]] = alloca i32, align 4
1192 // CHECK2-NEXT:    [[CN_ADDR:%.*]] = alloca double*, align 4
1193 // CHECK2-NEXT:    [[D_ADDR:%.*]] = alloca %struct.TT*, align 4
1194 // CHECK2-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1195 // CHECK2-NEXT:    store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4
1196 // CHECK2-NEXT:    store i32 [[VLA]], i32* [[VLA_ADDR]], align 4
1197 // CHECK2-NEXT:    store float* [[BN]], float** [[BN_ADDR]], align 4
1198 // CHECK2-NEXT:    store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4
1199 // CHECK2-NEXT:    store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4
1200 // CHECK2-NEXT:    store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4
1201 // CHECK2-NEXT:    store double* [[CN]], double** [[CN_ADDR]], align 4
1202 // CHECK2-NEXT:    store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4
1203 // CHECK2-NEXT:    [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4
1204 // CHECK2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4
1205 // CHECK2-NEXT:    [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4
1206 // CHECK2-NEXT:    [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4
1207 // CHECK2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4
1208 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4
1209 // CHECK2-NEXT:    [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4
1210 // CHECK2-NEXT:    [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4
1211 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1212 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1213 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1214 // CHECK2-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1215 // CHECK2-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1216 // CHECK2-NEXT:    br i1 [[TMP8]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1217 // CHECK2:       .worker:
1218 // CHECK2-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker() #[[ATTR2]]
1219 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
1220 // CHECK2:       .mastercheck:
1221 // CHECK2-NEXT:    [[NVPTX_TID5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1222 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1223 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1224 // CHECK2-NEXT:    [[TMP9:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE7]], 1
1225 // CHECK2-NEXT:    [[TMP10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], 1
1226 // CHECK2-NEXT:    [[TMP11:%.*]] = xor i32 [[TMP9]], -1
1227 // CHECK2-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP10]], [[TMP11]]
1228 // CHECK2-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[NVPTX_TID5]], [[MASTER_TID]]
1229 // CHECK2-NEXT:    br i1 [[TMP12]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1230 // CHECK2:       .master:
1231 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS8:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1232 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1233 // CHECK2-NEXT:    [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]]
1234 // CHECK2-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1)
1235 // CHECK2-NEXT:    [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4
1236 // CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
1237 // CHECK2-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
1238 // CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[TMP0]], i32 0, i32 2
1239 // CHECK2-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
1240 // CHECK2-NEXT:    [[CONV:%.*]] = fpext float [[TMP14]] to double
1241 // CHECK2-NEXT:    [[ADD11:%.*]] = fadd double [[CONV]], 1.000000e+00
1242 // CHECK2-NEXT:    [[CONV12:%.*]] = fptrunc double [[ADD11]] to float
1243 // CHECK2-NEXT:    store float [[CONV12]], float* [[ARRAYIDX]], align 4
1244 // CHECK2-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 3
1245 // CHECK2-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX13]], align 4
1246 // CHECK2-NEXT:    [[CONV14:%.*]] = fpext float [[TMP15]] to double
1247 // CHECK2-NEXT:    [[ADD15:%.*]] = fadd double [[CONV14]], 1.000000e+00
1248 // CHECK2-NEXT:    [[CONV16:%.*]] = fptrunc double [[ADD15]] to float
1249 // CHECK2-NEXT:    store float [[CONV16]], float* [[ARRAYIDX13]], align 4
1250 // CHECK2-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[TMP3]], i32 0, i32 1
1251 // CHECK2-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX17]], i32 0, i32 2
1252 // CHECK2-NEXT:    [[TMP16:%.*]] = load double, double* [[ARRAYIDX18]], align 8
1253 // CHECK2-NEXT:    [[ADD19:%.*]] = fadd double [[TMP16]], 1.000000e+00
1254 // CHECK2-NEXT:    store double [[ADD19]], double* [[ARRAYIDX18]], align 8
1255 // CHECK2-NEXT:    [[TMP17:%.*]] = mul nsw i32 1, [[TMP5]]
1256 // CHECK2-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 [[TMP17]]
1257 // CHECK2-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX20]], i32 3
1258 // CHECK2-NEXT:    [[TMP18:%.*]] = load double, double* [[ARRAYIDX21]], align 8
1259 // CHECK2-NEXT:    [[ADD22:%.*]] = fadd double [[TMP18]], 1.000000e+00
1260 // CHECK2-NEXT:    store double [[ADD22]], double* [[ARRAYIDX21]], align 8
1261 // CHECK2-NEXT:    [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[TMP7]], i32 0, i32 0
1262 // CHECK2-NEXT:    [[TMP19:%.*]] = load i64, i64* [[X]], align 8
1263 // CHECK2-NEXT:    [[ADD23:%.*]] = add nsw i64 [[TMP19]], 1
1264 // CHECK2-NEXT:    store i64 [[ADD23]], i64* [[X]], align 8
1265 // CHECK2-NEXT:    [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[TMP7]], i32 0, i32 1
1266 // CHECK2-NEXT:    [[TMP20:%.*]] = load i8, i8* [[Y]], align 8
1267 // CHECK2-NEXT:    [[CONV24:%.*]] = sext i8 [[TMP20]] to i32
1268 // CHECK2-NEXT:    [[ADD25:%.*]] = add nsw i32 [[CONV24]], 1
1269 // CHECK2-NEXT:    [[CONV26:%.*]] = trunc i32 [[ADD25]] to i8
1270 // CHECK2-NEXT:    store i8 [[CONV26]], i8* [[Y]], align 8
1271 // CHECK2-NEXT:    [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR7:[0-9]+]]
1272 // CHECK2-NEXT:    [[TMP21:%.*]] = load i64, i64* [[CALL]], align 8
1273 // CHECK2-NEXT:    [[ADD27:%.*]] = add nsw i64 [[TMP21]], 1
1274 // CHECK2-NEXT:    store i64 [[ADD27]], i64* [[CALL]], align 8
1275 // CHECK2-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1276 // CHECK2:       .termination.notifier:
1277 // CHECK2-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1278 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1279 // CHECK2-NEXT:    br label [[DOTEXIT]]
1280 // CHECK2:       .exit:
1281 // CHECK2-NEXT:    ret void
1282 //
1283 //
1284 // CHECK2-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi
1285 // CHECK2-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR5:[0-9]+]] comdat align 2 {
1286 // CHECK2-NEXT:  entry:
1287 // CHECK2-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 4
1288 // CHECK2-NEXT:    [[I_ADDR:%.*]] = alloca i32, align 4
1289 // CHECK2-NEXT:    store %struct.TT* [[THIS]], %struct.TT** [[THIS_ADDR]], align 4
1290 // CHECK2-NEXT:    store i32 [[I]], i32* [[I_ADDR]], align 4
1291 // CHECK2-NEXT:    [[THIS1:%.*]] = load %struct.TT*, %struct.TT** [[THIS_ADDR]], align 4
1292 // CHECK2-NEXT:    [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[THIS1]], i32 0, i32 0
1293 // CHECK2-NEXT:    ret i64* [[X]]
1294 //
1295 //
1296 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker
1297 // CHECK2-SAME: () #[[ATTR3]] {
1298 // CHECK2-NEXT:  entry:
1299 // CHECK2-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1300 // CHECK2-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1301 // CHECK2-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1302 // CHECK2-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1303 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1304 // CHECK2:       .await.work:
1305 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1306 // CHECK2-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1307 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1308 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1309 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1310 // CHECK2-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1311 // CHECK2-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1312 // CHECK2:       .select.workers:
1313 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1314 // CHECK2-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1315 // CHECK2-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1316 // CHECK2:       .execute.parallel:
1317 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
1318 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1319 // CHECK2-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1320 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1321 // CHECK2:       .terminate.parallel:
1322 // CHECK2-NEXT:    call void @__kmpc_kernel_end_parallel()
1323 // CHECK2-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1324 // CHECK2:       .barrier.parallel:
1325 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1326 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK]]
1327 // CHECK2:       .exit:
1328 // CHECK2-NEXT:    ret void
1329 //
1330 //
1331 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90
1332 // CHECK2-SAME: (i32 [[A:%.*]], i32 [[AA:%.*]], i32 [[AAA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
1333 // CHECK2-NEXT:  entry:
1334 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1335 // CHECK2-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
1336 // CHECK2-NEXT:    [[AAA_ADDR:%.*]] = alloca i32, align 4
1337 // CHECK2-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
1338 // CHECK2-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1339 // CHECK2-NEXT:    store i32 [[AA]], i32* [[AA_ADDR]], align 4
1340 // CHECK2-NEXT:    store i32 [[AAA]], i32* [[AAA_ADDR]], align 4
1341 // CHECK2-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
1342 // CHECK2-NEXT:    [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
1343 // CHECK2-NEXT:    [[CONV1:%.*]] = bitcast i32* [[AAA_ADDR]] to i8*
1344 // CHECK2-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
1345 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1346 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1347 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1348 // CHECK2-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1349 // CHECK2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1350 // CHECK2-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1351 // CHECK2:       .worker:
1352 // CHECK2-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker() #[[ATTR2]]
1353 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
1354 // CHECK2:       .mastercheck:
1355 // CHECK2-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1356 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1357 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1358 // CHECK2-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
1359 // CHECK2-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
1360 // CHECK2-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
1361 // CHECK2-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
1362 // CHECK2-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
1363 // CHECK2-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1364 // CHECK2:       .master:
1365 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1366 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1367 // CHECK2-NEXT:    [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
1368 // CHECK2-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
1369 // CHECK2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
1370 // CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
1371 // CHECK2-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
1372 // CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4
1373 // CHECK2-NEXT:    [[CONV8:%.*]] = sext i16 [[TMP7]] to i32
1374 // CHECK2-NEXT:    [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
1375 // CHECK2-NEXT:    [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
1376 // CHECK2-NEXT:    store i16 [[CONV10]], i16* [[CONV]], align 4
1377 // CHECK2-NEXT:    [[TMP8:%.*]] = load i8, i8* [[CONV1]], align 4
1378 // CHECK2-NEXT:    [[CONV11:%.*]] = sext i8 [[TMP8]] to i32
1379 // CHECK2-NEXT:    [[ADD12:%.*]] = add nsw i32 [[CONV11]], 1
1380 // CHECK2-NEXT:    [[CONV13:%.*]] = trunc i32 [[ADD12]] to i8
1381 // CHECK2-NEXT:    store i8 [[CONV13]], i8* [[CONV1]], align 4
1382 // CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
1383 // CHECK2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
1384 // CHECK2-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP9]], 1
1385 // CHECK2-NEXT:    store i32 [[ADD14]], i32* [[ARRAYIDX]], align 4
1386 // CHECK2-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1387 // CHECK2:       .termination.notifier:
1388 // CHECK2-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1389 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1390 // CHECK2-NEXT:    br label [[DOTEXIT]]
1391 // CHECK2:       .exit:
1392 // CHECK2-NEXT:    ret void
1393 //
1394 //
1395 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker
1396 // CHECK2-SAME: () #[[ATTR3]] {
1397 // CHECK2-NEXT:  entry:
1398 // CHECK2-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1399 // CHECK2-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1400 // CHECK2-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1401 // CHECK2-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1402 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1403 // CHECK2:       .await.work:
1404 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1405 // CHECK2-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1406 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1407 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1408 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1409 // CHECK2-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1410 // CHECK2-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1411 // CHECK2:       .select.workers:
1412 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1413 // CHECK2-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1414 // CHECK2-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1415 // CHECK2:       .execute.parallel:
1416 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
1417 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1418 // CHECK2-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1419 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1420 // CHECK2:       .terminate.parallel:
1421 // CHECK2-NEXT:    call void @__kmpc_kernel_end_parallel()
1422 // CHECK2-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1423 // CHECK2:       .barrier.parallel:
1424 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1425 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK]]
1426 // CHECK2:       .exit:
1427 // CHECK2-NEXT:    ret void
1428 //
1429 //
1430 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108
1431 // CHECK2-SAME: (%struct.S1* [[THIS:%.*]], i32 [[B:%.*]], i32 [[VLA:%.*]], i32 [[VLA1:%.*]], i16* nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] {
1432 // CHECK2-NEXT:  entry:
1433 // CHECK2-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4
1434 // CHECK2-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
1435 // CHECK2-NEXT:    [[VLA_ADDR:%.*]] = alloca i32, align 4
1436 // CHECK2-NEXT:    [[VLA_ADDR2:%.*]] = alloca i32, align 4
1437 // CHECK2-NEXT:    [[C_ADDR:%.*]] = alloca i16*, align 4
1438 // CHECK2-NEXT:    store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4
1439 // CHECK2-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
1440 // CHECK2-NEXT:    store i32 [[VLA]], i32* [[VLA_ADDR]], align 4
1441 // CHECK2-NEXT:    store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4
1442 // CHECK2-NEXT:    store i16* [[C]], i16** [[C_ADDR]], align 4
1443 // CHECK2-NEXT:    [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4
1444 // CHECK2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4
1445 // CHECK2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4
1446 // CHECK2-NEXT:    [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4
1447 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1448 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1449 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1450 // CHECK2-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1451 // CHECK2-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1452 // CHECK2-NEXT:    br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1453 // CHECK2:       .worker:
1454 // CHECK2-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker() #[[ATTR2]]
1455 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
1456 // CHECK2:       .mastercheck:
1457 // CHECK2-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1458 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1459 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1460 // CHECK2-NEXT:    [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
1461 // CHECK2-NEXT:    [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
1462 // CHECK2-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], -1
1463 // CHECK2-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP6]], [[TMP7]]
1464 // CHECK2-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
1465 // CHECK2-NEXT:    br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1466 // CHECK2:       .master:
1467 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1468 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1469 // CHECK2-NEXT:    [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
1470 // CHECK2-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
1471 // CHECK2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[B_ADDR]], align 4
1472 // CHECK2-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP9]] to double
1473 // CHECK2-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
1474 // CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0
1475 // CHECK2-NEXT:    store double [[ADD]], double* [[A]], align 8
1476 // CHECK2-NEXT:    [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
1477 // CHECK2-NEXT:    [[TMP10:%.*]] = load double, double* [[A9]], align 8
1478 // CHECK2-NEXT:    [[INC:%.*]] = fadd double [[TMP10]], 1.000000e+00
1479 // CHECK2-NEXT:    store double [[INC]], double* [[A9]], align 8
1480 // CHECK2-NEXT:    [[CONV10:%.*]] = fptosi double [[INC]] to i16
1481 // CHECK2-NEXT:    [[TMP11:%.*]] = mul nsw i32 1, [[TMP2]]
1482 // CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[TMP11]]
1483 // CHECK2-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1
1484 // CHECK2-NEXT:    store i16 [[CONV10]], i16* [[ARRAYIDX11]], align 2
1485 // CHECK2-NEXT:    [[A12:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
1486 // CHECK2-NEXT:    [[TMP12:%.*]] = load double, double* [[A12]], align 8
1487 // CHECK2-NEXT:    [[CONV13:%.*]] = fptosi double [[TMP12]] to i32
1488 // CHECK2-NEXT:    [[A14:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
1489 // CHECK2-NEXT:    [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV13]], double* nonnull align 8 dereferenceable(8) [[A14]]) #[[ATTR7]]
1490 // CHECK2-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1491 // CHECK2:       .termination.notifier:
1492 // CHECK2-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1493 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1494 // CHECK2-NEXT:    br label [[DOTEXIT]]
1495 // CHECK2:       .exit:
1496 // CHECK2-NEXT:    ret void
1497 //
1498 //
1499 // CHECK2-LABEL: define {{[^@]+}}@_Z3baziRd
1500 // CHECK2-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
1501 // CHECK2-NEXT:  entry:
1502 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca double*, align 4
1503 // CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
1504 // CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
1505 // CHECK2-NEXT:    [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
1506 // CHECK2-NEXT:    [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
1507 // CHECK2-NEXT:    store i32 [[F1]], i32* [[F_ON_STACK]], align 4
1508 // CHECK2-NEXT:    store double* [[A]], double** [[A_ADDR]], align 4
1509 // CHECK2-NEXT:    [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4
1510 // CHECK2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
1511 // CHECK2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8*
1512 // CHECK2-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 4
1513 // CHECK2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
1514 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8*
1515 // CHECK2-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
1516 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1517 // CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i32 2)
1518 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4
1519 // CHECK2-NEXT:    call void @__kmpc_free_shared(i8* [[F]])
1520 // CHECK2-NEXT:    ret i32 [[TMP7]]
1521 //
1522 //
1523 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker
1524 // CHECK2-SAME: () #[[ATTR3]] {
1525 // CHECK2-NEXT:  entry:
1526 // CHECK2-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1527 // CHECK2-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1528 // CHECK2-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1529 // CHECK2-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1530 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1531 // CHECK2:       .await.work:
1532 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1533 // CHECK2-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1534 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1535 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1536 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1537 // CHECK2-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1538 // CHECK2-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1539 // CHECK2:       .select.workers:
1540 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1541 // CHECK2-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1542 // CHECK2-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1543 // CHECK2:       .execute.parallel:
1544 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
1545 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1546 // CHECK2-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1547 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1548 // CHECK2:       .terminate.parallel:
1549 // CHECK2-NEXT:    call void @__kmpc_kernel_end_parallel()
1550 // CHECK2-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1551 // CHECK2:       .barrier.parallel:
1552 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1553 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK]]
1554 // CHECK2:       .exit:
1555 // CHECK2-NEXT:    ret void
1556 //
1557 //
1558 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142
1559 // CHECK2-SAME: () #[[ATTR0]] {
1560 // CHECK2-NEXT:  entry:
1561 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1562 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1563 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1564 // CHECK2-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1565 // CHECK2-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1566 // CHECK2-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1567 // CHECK2:       .worker:
1568 // CHECK2-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker() #[[ATTR2]]
1569 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
1570 // CHECK2:       .mastercheck:
1571 // CHECK2-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1572 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1573 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1574 // CHECK2-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1575 // CHECK2-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1576 // CHECK2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1577 // CHECK2-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1578 // CHECK2-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1579 // CHECK2-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1580 // CHECK2:       .master:
1581 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1582 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1583 // CHECK2-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1584 // CHECK2-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1585 // CHECK2-NEXT:    call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]]
1586 // CHECK2-NEXT:    unreachable
1587 // CHECK2:       5:
1588 // CHECK2-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1589 // CHECK2:       .termination.notifier:
1590 // CHECK2-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1591 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1592 // CHECK2-NEXT:    br label [[DOTEXIT]]
1593 // CHECK2:       .exit:
1594 // CHECK2-NEXT:    ret void
1595 //
1596 //
1597 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker
1598 // CHECK2-SAME: () #[[ATTR3]] {
1599 // CHECK2-NEXT:  entry:
1600 // CHECK2-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1601 // CHECK2-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1602 // CHECK2-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1603 // CHECK2-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1604 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1605 // CHECK2:       .await.work:
1606 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1607 // CHECK2-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1608 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1609 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1610 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1611 // CHECK2-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1612 // CHECK2-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1613 // CHECK2:       .select.workers:
1614 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1615 // CHECK2-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1616 // CHECK2-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1617 // CHECK2:       .execute.parallel:
1618 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
1619 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1620 // CHECK2-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1621 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1622 // CHECK2:       .terminate.parallel:
1623 // CHECK2-NEXT:    call void @__kmpc_kernel_end_parallel()
1624 // CHECK2-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1625 // CHECK2:       .barrier.parallel:
1626 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1627 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK]]
1628 // CHECK2:       .exit:
1629 // CHECK2-NEXT:    ret void
1630 //
1631 //
1632 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
1633 // CHECK2-SAME: (i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
1634 // CHECK2-NEXT:  entry:
1635 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1636 // CHECK2-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
1637 // CHECK2-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
1638 // CHECK2-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1639 // CHECK2-NEXT:    store i32 [[AA]], i32* [[AA_ADDR]], align 4
1640 // CHECK2-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
1641 // CHECK2-NEXT:    [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
1642 // CHECK2-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
1643 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1644 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1645 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1646 // CHECK2-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1647 // CHECK2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1648 // CHECK2-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1649 // CHECK2:       .worker:
1650 // CHECK2-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker() #[[ATTR2]]
1651 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
1652 // CHECK2:       .mastercheck:
1653 // CHECK2-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1654 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1655 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1656 // CHECK2-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1657 // CHECK2-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1658 // CHECK2-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
1659 // CHECK2-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
1660 // CHECK2-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1661 // CHECK2-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1662 // CHECK2:       .master:
1663 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1664 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1665 // CHECK2-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1666 // CHECK2-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1667 // CHECK2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
1668 // CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
1669 // CHECK2-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
1670 // CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4
1671 // CHECK2-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP7]] to i32
1672 // CHECK2-NEXT:    [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
1673 // CHECK2-NEXT:    [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
1674 // CHECK2-NEXT:    store i16 [[CONV9]], i16* [[CONV]], align 4
1675 // CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
1676 // CHECK2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
1677 // CHECK2-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP8]], 1
1678 // CHECK2-NEXT:    store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4
1679 // CHECK2-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1680 // CHECK2:       .termination.notifier:
1681 // CHECK2-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1682 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1683 // CHECK2-NEXT:    br label [[DOTEXIT]]
1684 // CHECK2:       .exit:
1685 // CHECK2-NEXT:    ret void
1686 //
1687 //
1688 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
1689 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[F:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR0]] {
1690 // CHECK2-NEXT:  entry:
1691 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1692 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1693 // CHECK2-NEXT:    [[F_ADDR:%.*]] = alloca i32*, align 4
1694 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca double*, align 4
1695 // CHECK2-NEXT:    [[TMP:%.*]] = alloca double*, align 4
1696 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1697 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1698 // CHECK2-NEXT:    store i32* [[F]], i32** [[F_ADDR]], align 4
1699 // CHECK2-NEXT:    store double* [[A]], double** [[A_ADDR]], align 4
1700 // CHECK2-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[F_ADDR]], align 4
1701 // CHECK2-NEXT:    [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4
1702 // CHECK2-NEXT:    store double* [[TMP1]], double** [[TMP]], align 4
1703 // CHECK2-NEXT:    [[TMP2:%.*]] = load double*, double** [[TMP]], align 4
1704 // CHECK2-NEXT:    [[TMP3:%.*]] = load double, double* [[TMP2]], align 8
1705 // CHECK2-NEXT:    [[ADD:%.*]] = fadd double 2.000000e+00, [[TMP3]]
1706 // CHECK2-NEXT:    [[CONV:%.*]] = fptosi double [[ADD]] to i32
1707 // CHECK2-NEXT:    store i32 [[CONV]], i32* [[TMP0]], align 4
1708 // CHECK2-NEXT:    ret void
1709 //
1710 //
1711 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
1712 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3]] {
1713 // CHECK2-NEXT:  entry:
1714 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
1715 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1716 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1717 // CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
1718 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1719 // CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
1720 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1721 // CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1722 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
1723 // CHECK2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
1724 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
1725 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
1726 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 1
1727 // CHECK2-NEXT:    [[TMP7:%.*]] = bitcast i8** [[TMP6]] to double**
1728 // CHECK2-NEXT:    [[TMP8:%.*]] = load double*, double** [[TMP7]], align 4
1729 // CHECK2-NEXT:    call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], double* [[TMP8]]) #[[ATTR2]]
1730 // CHECK2-NEXT:    ret void
1731 //
1732 //
1733 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9targetBarPiS__l25
1734 // CHECK3-SAME: (i32* [[PTR1:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR2:%.*]]) #[[ATTR0:[0-9]+]] {
1735 // CHECK3-NEXT:  entry:
1736 // CHECK3-NEXT:    [[PTR1_ADDR:%.*]] = alloca i32*, align 4
1737 // CHECK3-NEXT:    [[PTR2_ADDR:%.*]] = alloca i32**, align 4
1738 // CHECK3-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
1739 // CHECK3-NEXT:    store i32* [[PTR1]], i32** [[PTR1_ADDR]], align 4
1740 // CHECK3-NEXT:    store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4
1741 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4
1742 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1743 // CHECK3-NEXT:    call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
1744 // CHECK3-NEXT:    br label [[DOTEXECUTE:%.*]]
1745 // CHECK3:       .execute:
1746 // CHECK3-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
1747 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
1748 // CHECK3-NEXT:    [[TMP3:%.*]] = bitcast i32** [[PTR1_ADDR]] to i8*
1749 // CHECK3-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 4
1750 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
1751 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i32** [[TMP0]] to i8*
1752 // CHECK3-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
1753 // CHECK3-NEXT:    [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1754 // CHECK3-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP6]], i32 2)
1755 // CHECK3-NEXT:    br label [[DOTOMP_DEINIT:%.*]]
1756 // CHECK3:       .omp.deinit:
1757 // CHECK3-NEXT:    call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
1758 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
1759 // CHECK3:       .exit:
1760 // CHECK3-NEXT:    ret void
1761 //
1762 //
1763 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
1764 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR1:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR2:%.*]]) #[[ATTR0]] {
1765 // CHECK3-NEXT:  entry:
1766 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1767 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1768 // CHECK3-NEXT:    [[PTR1_ADDR:%.*]] = alloca i32**, align 4
1769 // CHECK3-NEXT:    [[PTR2_ADDR:%.*]] = alloca i32**, align 4
1770 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1771 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1772 // CHECK3-NEXT:    store i32** [[PTR1]], i32*** [[PTR1_ADDR]], align 4
1773 // CHECK3-NEXT:    store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4
1774 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32**, i32*** [[PTR1_ADDR]], align 4
1775 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4
1776 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 4
1777 // CHECK3-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
1778 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[TMP0]], align 4
1779 // CHECK3-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
1780 // CHECK3-NEXT:    ret void
1781 //
1782 //
1783 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker
1784 // CHECK3-SAME: () #[[ATTR3:[0-9]+]] {
1785 // CHECK3-NEXT:  entry:
1786 // CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1787 // CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1788 // CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1789 // CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1790 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1791 // CHECK3:       .await.work:
1792 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1793 // CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1794 // CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1795 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1796 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1797 // CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1798 // CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1799 // CHECK3:       .select.workers:
1800 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1801 // CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1802 // CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1803 // CHECK3:       .execute.parallel:
1804 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
1805 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1806 // CHECK3-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1807 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1808 // CHECK3:       .terminate.parallel:
1809 // CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
1810 // CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1811 // CHECK3:       .barrier.parallel:
1812 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1813 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
1814 // CHECK3:       .exit:
1815 // CHECK3-NEXT:    ret void
1816 //
1817 //
1818 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39
1819 // CHECK3-SAME: () #[[ATTR0]] {
1820 // CHECK3-NEXT:  entry:
1821 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1822 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1823 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1824 // CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1825 // CHECK3-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1826 // CHECK3-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1827 // CHECK3:       .worker:
1828 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker() #[[ATTR2:[0-9]+]]
1829 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
1830 // CHECK3:       .mastercheck:
1831 // CHECK3-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1832 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1833 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1834 // CHECK3-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1835 // CHECK3-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1836 // CHECK3-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1837 // CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1838 // CHECK3-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1839 // CHECK3-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1840 // CHECK3:       .master:
1841 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1842 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1843 // CHECK3-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1844 // CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1845 // CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1846 // CHECK3:       .termination.notifier:
1847 // CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1848 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1849 // CHECK3-NEXT:    br label [[DOTEXIT]]
1850 // CHECK3:       .exit:
1851 // CHECK3-NEXT:    ret void
1852 //
1853 //
1854 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker
1855 // CHECK3-SAME: () #[[ATTR3]] {
1856 // CHECK3-NEXT:  entry:
1857 // CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1858 // CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1859 // CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1860 // CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1861 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1862 // CHECK3:       .await.work:
1863 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1864 // CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1865 // CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1866 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1867 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1868 // CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1869 // CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1870 // CHECK3:       .select.workers:
1871 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1872 // CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1873 // CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1874 // CHECK3:       .execute.parallel:
1875 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
1876 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1877 // CHECK3-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1878 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1879 // CHECK3:       .terminate.parallel:
1880 // CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
1881 // CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1882 // CHECK3:       .barrier.parallel:
1883 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1884 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
1885 // CHECK3:       .exit:
1886 // CHECK3-NEXT:    ret void
1887 //
1888 //
1889 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47
1890 // CHECK3-SAME: (i32 [[AA:%.*]]) #[[ATTR0]] {
1891 // CHECK3-NEXT:  entry:
1892 // CHECK3-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
1893 // CHECK3-NEXT:    store i32 [[AA]], i32* [[AA_ADDR]], align 4
1894 // CHECK3-NEXT:    [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
1895 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1896 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1897 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1898 // CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1899 // CHECK3-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1900 // CHECK3-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1901 // CHECK3:       .worker:
1902 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker() #[[ATTR2]]
1903 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
1904 // CHECK3:       .mastercheck:
1905 // CHECK3-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1906 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1907 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1908 // CHECK3-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1909 // CHECK3-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1910 // CHECK3-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1911 // CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1912 // CHECK3-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1913 // CHECK3-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1914 // CHECK3:       .master:
1915 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1916 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1917 // CHECK3-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1918 // CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1919 // CHECK3-NEXT:    [[TMP5:%.*]] = load i16, i16* [[CONV]], align 4
1920 // CHECK3-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP5]] to i32
1921 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV7]], 1
1922 // CHECK3-NEXT:    [[CONV8:%.*]] = trunc i32 [[ADD]] to i16
1923 // CHECK3-NEXT:    store i16 [[CONV8]], i16* [[CONV]], align 4
1924 // CHECK3-NEXT:    [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4
1925 // CHECK3-NEXT:    [[CONV9:%.*]] = sext i16 [[TMP6]] to i32
1926 // CHECK3-NEXT:    [[ADD10:%.*]] = add nsw i32 [[CONV9]], 2
1927 // CHECK3-NEXT:    [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
1928 // CHECK3-NEXT:    store i16 [[CONV11]], i16* [[CONV]], align 4
1929 // CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1930 // CHECK3:       .termination.notifier:
1931 // CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1932 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1933 // CHECK3-NEXT:    br label [[DOTEXIT]]
1934 // CHECK3:       .exit:
1935 // CHECK3-NEXT:    ret void
1936 //
1937 //
1938 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker
1939 // CHECK3-SAME: () #[[ATTR3]] {
1940 // CHECK3-NEXT:  entry:
1941 // CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1942 // CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1943 // CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1944 // CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1945 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1946 // CHECK3:       .await.work:
1947 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1948 // CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1949 // CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1950 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1951 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1952 // CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1953 // CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1954 // CHECK3:       .select.workers:
1955 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1956 // CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1957 // CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1958 // CHECK3:       .execute.parallel:
1959 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
1960 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1961 // CHECK3-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1962 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1963 // CHECK3:       .terminate.parallel:
1964 // CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
1965 // CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1966 // CHECK3:       .barrier.parallel:
1967 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1968 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
1969 // CHECK3:       .exit:
1970 // CHECK3-NEXT:    ret void
1971 //
1972 //
1973 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53
1974 // CHECK3-SAME: (i32 [[A:%.*]], [10 x float]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[VLA:%.*]], float* nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* nonnull align 8 dereferenceable(400) [[C:%.*]], i32 [[VLA1:%.*]], i32 [[VLA3:%.*]], double* nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] {
1975 // CHECK3-NEXT:  entry:
1976 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1977 // CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca [10 x float]*, align 4
1978 // CHECK3-NEXT:    [[VLA_ADDR:%.*]] = alloca i32, align 4
1979 // CHECK3-NEXT:    [[BN_ADDR:%.*]] = alloca float*, align 4
1980 // CHECK3-NEXT:    [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4
1981 // CHECK3-NEXT:    [[VLA_ADDR2:%.*]] = alloca i32, align 4
1982 // CHECK3-NEXT:    [[VLA_ADDR4:%.*]] = alloca i32, align 4
1983 // CHECK3-NEXT:    [[CN_ADDR:%.*]] = alloca double*, align 4
1984 // CHECK3-NEXT:    [[D_ADDR:%.*]] = alloca %struct.TT*, align 4
1985 // CHECK3-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1986 // CHECK3-NEXT:    store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4
1987 // CHECK3-NEXT:    store i32 [[VLA]], i32* [[VLA_ADDR]], align 4
1988 // CHECK3-NEXT:    store float* [[BN]], float** [[BN_ADDR]], align 4
1989 // CHECK3-NEXT:    store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4
1990 // CHECK3-NEXT:    store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4
1991 // CHECK3-NEXT:    store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4
1992 // CHECK3-NEXT:    store double* [[CN]], double** [[CN_ADDR]], align 4
1993 // CHECK3-NEXT:    store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4
1994 // CHECK3-NEXT:    [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4
1995 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4
1996 // CHECK3-NEXT:    [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4
1997 // CHECK3-NEXT:    [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4
1998 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4
1999 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4
2000 // CHECK3-NEXT:    [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4
2001 // CHECK3-NEXT:    [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4
2002 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2003 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2004 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2005 // CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
2006 // CHECK3-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
2007 // CHECK3-NEXT:    br i1 [[TMP8]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
2008 // CHECK3:       .worker:
2009 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker() #[[ATTR2]]
2010 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
2011 // CHECK3:       .mastercheck:
2012 // CHECK3-NEXT:    [[NVPTX_TID5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2013 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2014 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2015 // CHECK3-NEXT:    [[TMP9:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE7]], 1
2016 // CHECK3-NEXT:    [[TMP10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], 1
2017 // CHECK3-NEXT:    [[TMP11:%.*]] = xor i32 [[TMP9]], -1
2018 // CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP10]], [[TMP11]]
2019 // CHECK3-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[NVPTX_TID5]], [[MASTER_TID]]
2020 // CHECK3-NEXT:    br i1 [[TMP12]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
2021 // CHECK3:       .master:
2022 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS8:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2023 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2024 // CHECK3-NEXT:    [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]]
2025 // CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1)
2026 // CHECK3-NEXT:    [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4
2027 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
2028 // CHECK3-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
2029 // CHECK3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[TMP0]], i32 0, i32 2
2030 // CHECK3-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
2031 // CHECK3-NEXT:    [[CONV:%.*]] = fpext float [[TMP14]] to double
2032 // CHECK3-NEXT:    [[ADD11:%.*]] = fadd double [[CONV]], 1.000000e+00
2033 // CHECK3-NEXT:    [[CONV12:%.*]] = fptrunc double [[ADD11]] to float
2034 // CHECK3-NEXT:    store float [[CONV12]], float* [[ARRAYIDX]], align 4
2035 // CHECK3-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 3
2036 // CHECK3-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX13]], align 4
2037 // CHECK3-NEXT:    [[CONV14:%.*]] = fpext float [[TMP15]] to double
2038 // CHECK3-NEXT:    [[ADD15:%.*]] = fadd double [[CONV14]], 1.000000e+00
2039 // CHECK3-NEXT:    [[CONV16:%.*]] = fptrunc double [[ADD15]] to float
2040 // CHECK3-NEXT:    store float [[CONV16]], float* [[ARRAYIDX13]], align 4
2041 // CHECK3-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[TMP3]], i32 0, i32 1
2042 // CHECK3-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX17]], i32 0, i32 2
2043 // CHECK3-NEXT:    [[TMP16:%.*]] = load double, double* [[ARRAYIDX18]], align 8
2044 // CHECK3-NEXT:    [[ADD19:%.*]] = fadd double [[TMP16]], 1.000000e+00
2045 // CHECK3-NEXT:    store double [[ADD19]], double* [[ARRAYIDX18]], align 8
2046 // CHECK3-NEXT:    [[TMP17:%.*]] = mul nsw i32 1, [[TMP5]]
2047 // CHECK3-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 [[TMP17]]
2048 // CHECK3-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX20]], i32 3
2049 // CHECK3-NEXT:    [[TMP18:%.*]] = load double, double* [[ARRAYIDX21]], align 8
2050 // CHECK3-NEXT:    [[ADD22:%.*]] = fadd double [[TMP18]], 1.000000e+00
2051 // CHECK3-NEXT:    store double [[ADD22]], double* [[ARRAYIDX21]], align 8
2052 // CHECK3-NEXT:    [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[TMP7]], i32 0, i32 0
2053 // CHECK3-NEXT:    [[TMP19:%.*]] = load i64, i64* [[X]], align 8
2054 // CHECK3-NEXT:    [[ADD23:%.*]] = add nsw i64 [[TMP19]], 1
2055 // CHECK3-NEXT:    store i64 [[ADD23]], i64* [[X]], align 8
2056 // CHECK3-NEXT:    [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[TMP7]], i32 0, i32 1
2057 // CHECK3-NEXT:    [[TMP20:%.*]] = load i8, i8* [[Y]], align 8
2058 // CHECK3-NEXT:    [[CONV24:%.*]] = sext i8 [[TMP20]] to i32
2059 // CHECK3-NEXT:    [[ADD25:%.*]] = add nsw i32 [[CONV24]], 1
2060 // CHECK3-NEXT:    [[CONV26:%.*]] = trunc i32 [[ADD25]] to i8
2061 // CHECK3-NEXT:    store i8 [[CONV26]], i8* [[Y]], align 8
2062 // CHECK3-NEXT:    [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR7:[0-9]+]]
2063 // CHECK3-NEXT:    [[TMP21:%.*]] = load i64, i64* [[CALL]], align 8
2064 // CHECK3-NEXT:    [[ADD27:%.*]] = add nsw i64 [[TMP21]], 1
2065 // CHECK3-NEXT:    store i64 [[ADD27]], i64* [[CALL]], align 8
2066 // CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
2067 // CHECK3:       .termination.notifier:
2068 // CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
2069 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2070 // CHECK3-NEXT:    br label [[DOTEXIT]]
2071 // CHECK3:       .exit:
2072 // CHECK3-NEXT:    ret void
2073 //
2074 //
2075 // CHECK3-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi
2076 // CHECK3-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR5:[0-9]+]] comdat align 2 {
2077 // CHECK3-NEXT:  entry:
2078 // CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 4
2079 // CHECK3-NEXT:    [[I_ADDR:%.*]] = alloca i32, align 4
2080 // CHECK3-NEXT:    store %struct.TT* [[THIS]], %struct.TT** [[THIS_ADDR]], align 4
2081 // CHECK3-NEXT:    store i32 [[I]], i32* [[I_ADDR]], align 4
2082 // CHECK3-NEXT:    [[THIS1:%.*]] = load %struct.TT*, %struct.TT** [[THIS_ADDR]], align 4
2083 // CHECK3-NEXT:    [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[THIS1]], i32 0, i32 0
2084 // CHECK3-NEXT:    ret i64* [[X]]
2085 //
2086 //
2087 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker
2088 // CHECK3-SAME: () #[[ATTR3]] {
2089 // CHECK3-NEXT:  entry:
2090 // CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
2091 // CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
2092 // CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
2093 // CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
2094 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
2095 // CHECK3:       .await.work:
2096 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2097 // CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
2098 // CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
2099 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
2100 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2101 // CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
2102 // CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
2103 // CHECK3:       .select.workers:
2104 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
2105 // CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
2106 // CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
2107 // CHECK3:       .execute.parallel:
2108 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
2109 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
2110 // CHECK3-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
2111 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
2112 // CHECK3:       .terminate.parallel:
2113 // CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
2114 // CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
2115 // CHECK3:       .barrier.parallel:
2116 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2117 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
2118 // CHECK3:       .exit:
2119 // CHECK3-NEXT:    ret void
2120 //
2121 //
2122 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90
2123 // CHECK3-SAME: (i32 [[A:%.*]], i32 [[AA:%.*]], i32 [[AAA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
2124 // CHECK3-NEXT:  entry:
2125 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
2126 // CHECK3-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
2127 // CHECK3-NEXT:    [[AAA_ADDR:%.*]] = alloca i32, align 4
2128 // CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
2129 // CHECK3-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
2130 // CHECK3-NEXT:    store i32 [[AA]], i32* [[AA_ADDR]], align 4
2131 // CHECK3-NEXT:    store i32 [[AAA]], i32* [[AAA_ADDR]], align 4
2132 // CHECK3-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
2133 // CHECK3-NEXT:    [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
2134 // CHECK3-NEXT:    [[CONV1:%.*]] = bitcast i32* [[AAA_ADDR]] to i8*
2135 // CHECK3-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
2136 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2137 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2138 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2139 // CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
2140 // CHECK3-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
2141 // CHECK3-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
2142 // CHECK3:       .worker:
2143 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker() #[[ATTR2]]
2144 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
2145 // CHECK3:       .mastercheck:
2146 // CHECK3-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2147 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2148 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2149 // CHECK3-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
2150 // CHECK3-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
2151 // CHECK3-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
2152 // CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
2153 // CHECK3-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
2154 // CHECK3-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
2155 // CHECK3:       .master:
2156 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2157 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2158 // CHECK3-NEXT:    [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
2159 // CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
2160 // CHECK3-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
2161 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
2162 // CHECK3-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
2163 // CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4
2164 // CHECK3-NEXT:    [[CONV8:%.*]] = sext i16 [[TMP7]] to i32
2165 // CHECK3-NEXT:    [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
2166 // CHECK3-NEXT:    [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
2167 // CHECK3-NEXT:    store i16 [[CONV10]], i16* [[CONV]], align 4
2168 // CHECK3-NEXT:    [[TMP8:%.*]] = load i8, i8* [[CONV1]], align 4
2169 // CHECK3-NEXT:    [[CONV11:%.*]] = sext i8 [[TMP8]] to i32
2170 // CHECK3-NEXT:    [[ADD12:%.*]] = add nsw i32 [[CONV11]], 1
2171 // CHECK3-NEXT:    [[CONV13:%.*]] = trunc i32 [[ADD12]] to i8
2172 // CHECK3-NEXT:    store i8 [[CONV13]], i8* [[CONV1]], align 4
2173 // CHECK3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
2174 // CHECK3-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
2175 // CHECK3-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP9]], 1
2176 // CHECK3-NEXT:    store i32 [[ADD14]], i32* [[ARRAYIDX]], align 4
2177 // CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
2178 // CHECK3:       .termination.notifier:
2179 // CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
2180 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2181 // CHECK3-NEXT:    br label [[DOTEXIT]]
2182 // CHECK3:       .exit:
2183 // CHECK3-NEXT:    ret void
2184 //
2185 //
2186 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker
2187 // CHECK3-SAME: () #[[ATTR3]] {
2188 // CHECK3-NEXT:  entry:
2189 // CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
2190 // CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
2191 // CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
2192 // CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
2193 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
2194 // CHECK3:       .await.work:
2195 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2196 // CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
2197 // CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
2198 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
2199 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2200 // CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
2201 // CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
2202 // CHECK3:       .select.workers:
2203 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
2204 // CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
2205 // CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
2206 // CHECK3:       .execute.parallel:
2207 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
2208 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
2209 // CHECK3-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
2210 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
2211 // CHECK3:       .terminate.parallel:
2212 // CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
2213 // CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
2214 // CHECK3:       .barrier.parallel:
2215 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2216 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
2217 // CHECK3:       .exit:
2218 // CHECK3-NEXT:    ret void
2219 //
2220 //
2221 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108
2222 // CHECK3-SAME: (%struct.S1* [[THIS:%.*]], i32 [[B:%.*]], i32 [[VLA:%.*]], i32 [[VLA1:%.*]], i16* nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] {
2223 // CHECK3-NEXT:  entry:
2224 // CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4
2225 // CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
2226 // CHECK3-NEXT:    [[VLA_ADDR:%.*]] = alloca i32, align 4
2227 // CHECK3-NEXT:    [[VLA_ADDR2:%.*]] = alloca i32, align 4
2228 // CHECK3-NEXT:    [[C_ADDR:%.*]] = alloca i16*, align 4
2229 // CHECK3-NEXT:    store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4
2230 // CHECK3-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
2231 // CHECK3-NEXT:    store i32 [[VLA]], i32* [[VLA_ADDR]], align 4
2232 // CHECK3-NEXT:    store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4
2233 // CHECK3-NEXT:    store i16* [[C]], i16** [[C_ADDR]], align 4
2234 // CHECK3-NEXT:    [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4
2235 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4
2236 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4
2237 // CHECK3-NEXT:    [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4
2238 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2239 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2240 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2241 // CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
2242 // CHECK3-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
2243 // CHECK3-NEXT:    br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
2244 // CHECK3:       .worker:
2245 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker() #[[ATTR2]]
2246 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
2247 // CHECK3:       .mastercheck:
2248 // CHECK3-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2249 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2250 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2251 // CHECK3-NEXT:    [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
2252 // CHECK3-NEXT:    [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
2253 // CHECK3-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], -1
2254 // CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP6]], [[TMP7]]
2255 // CHECK3-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
2256 // CHECK3-NEXT:    br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
2257 // CHECK3:       .master:
2258 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2259 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2260 // CHECK3-NEXT:    [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
2261 // CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
2262 // CHECK3-NEXT:    [[TMP9:%.*]] = load i32, i32* [[B_ADDR]], align 4
2263 // CHECK3-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP9]] to double
2264 // CHECK3-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
2265 // CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0
2266 // CHECK3-NEXT:    store double [[ADD]], double* [[A]], align 8
2267 // CHECK3-NEXT:    [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
2268 // CHECK3-NEXT:    [[TMP10:%.*]] = load double, double* [[A9]], align 8
2269 // CHECK3-NEXT:    [[INC:%.*]] = fadd double [[TMP10]], 1.000000e+00
2270 // CHECK3-NEXT:    store double [[INC]], double* [[A9]], align 8
2271 // CHECK3-NEXT:    [[CONV10:%.*]] = fptosi double [[INC]] to i16
2272 // CHECK3-NEXT:    [[TMP11:%.*]] = mul nsw i32 1, [[TMP2]]
2273 // CHECK3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[TMP11]]
2274 // CHECK3-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1
2275 // CHECK3-NEXT:    store i16 [[CONV10]], i16* [[ARRAYIDX11]], align 2
2276 // CHECK3-NEXT:    [[A12:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
2277 // CHECK3-NEXT:    [[TMP12:%.*]] = load double, double* [[A12]], align 8
2278 // CHECK3-NEXT:    [[CONV13:%.*]] = fptosi double [[TMP12]] to i32
2279 // CHECK3-NEXT:    [[A14:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
2280 // CHECK3-NEXT:    [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV13]], double* nonnull align 8 dereferenceable(8) [[A14]]) #[[ATTR7]]
2281 // CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
2282 // CHECK3:       .termination.notifier:
2283 // CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
2284 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2285 // CHECK3-NEXT:    br label [[DOTEXIT]]
2286 // CHECK3:       .exit:
2287 // CHECK3-NEXT:    ret void
2288 //
2289 //
2290 // CHECK3-LABEL: define {{[^@]+}}@_Z3baziRd
2291 // CHECK3-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
2292 // CHECK3-NEXT:  entry:
2293 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca double*, align 4
2294 // CHECK3-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
2295 // CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
2296 // CHECK3-NEXT:    [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
2297 // CHECK3-NEXT:    [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
2298 // CHECK3-NEXT:    store i32 [[F1]], i32* [[F_ON_STACK]], align 4
2299 // CHECK3-NEXT:    store double* [[A]], double** [[A_ADDR]], align 4
2300 // CHECK3-NEXT:    [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4
2301 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
2302 // CHECK3-NEXT:    [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8*
2303 // CHECK3-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 4
2304 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
2305 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8*
2306 // CHECK3-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
2307 // CHECK3-NEXT:    [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
2308 // CHECK3-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i32 2)
2309 // CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4
2310 // CHECK3-NEXT:    call void @__kmpc_free_shared(i8* [[F]])
2311 // CHECK3-NEXT:    ret i32 [[TMP7]]
2312 //
2313 //
2314 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker
2315 // CHECK3-SAME: () #[[ATTR3]] {
2316 // CHECK3-NEXT:  entry:
2317 // CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
2318 // CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
2319 // CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
2320 // CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
2321 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
2322 // CHECK3:       .await.work:
2323 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2324 // CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
2325 // CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
2326 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
2327 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2328 // CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
2329 // CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
2330 // CHECK3:       .select.workers:
2331 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
2332 // CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
2333 // CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
2334 // CHECK3:       .execute.parallel:
2335 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
2336 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
2337 // CHECK3-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
2338 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
2339 // CHECK3:       .terminate.parallel:
2340 // CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
2341 // CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
2342 // CHECK3:       .barrier.parallel:
2343 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2344 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
2345 // CHECK3:       .exit:
2346 // CHECK3-NEXT:    ret void
2347 //
2348 //
2349 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142
2350 // CHECK3-SAME: () #[[ATTR0]] {
2351 // CHECK3-NEXT:  entry:
2352 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2353 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2354 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2355 // CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
2356 // CHECK3-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
2357 // CHECK3-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
2358 // CHECK3:       .worker:
2359 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker() #[[ATTR2]]
2360 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
2361 // CHECK3:       .mastercheck:
2362 // CHECK3-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2363 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2364 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2365 // CHECK3-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
2366 // CHECK3-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
2367 // CHECK3-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
2368 // CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
2369 // CHECK3-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
2370 // CHECK3-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
2371 // CHECK3:       .master:
2372 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2373 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2374 // CHECK3-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
2375 // CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
2376 // CHECK3-NEXT:    call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]]
2377 // CHECK3-NEXT:    unreachable
2378 // CHECK3:       5:
2379 // CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
2380 // CHECK3:       .termination.notifier:
2381 // CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
2382 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2383 // CHECK3-NEXT:    br label [[DOTEXIT]]
2384 // CHECK3:       .exit:
2385 // CHECK3-NEXT:    ret void
2386 //
2387 //
2388 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker
2389 // CHECK3-SAME: () #[[ATTR3]] {
2390 // CHECK3-NEXT:  entry:
2391 // CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
2392 // CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
2393 // CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
2394 // CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
2395 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
2396 // CHECK3:       .await.work:
2397 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2398 // CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
2399 // CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
2400 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
2401 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2402 // CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
2403 // CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
2404 // CHECK3:       .select.workers:
2405 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
2406 // CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
2407 // CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
2408 // CHECK3:       .execute.parallel:
2409 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
2410 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
2411 // CHECK3-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
2412 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
2413 // CHECK3:       .terminate.parallel:
2414 // CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
2415 // CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
2416 // CHECK3:       .barrier.parallel:
2417 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2418 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
2419 // CHECK3:       .exit:
2420 // CHECK3-NEXT:    ret void
2421 //
2422 //
2423 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
2424 // CHECK3-SAME: (i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
2425 // CHECK3-NEXT:  entry:
2426 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
2427 // CHECK3-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
2428 // CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
2429 // CHECK3-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
2430 // CHECK3-NEXT:    store i32 [[AA]], i32* [[AA_ADDR]], align 4
2431 // CHECK3-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
2432 // CHECK3-NEXT:    [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
2433 // CHECK3-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
2434 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2435 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2436 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2437 // CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
2438 // CHECK3-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
2439 // CHECK3-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
2440 // CHECK3:       .worker:
2441 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker() #[[ATTR2]]
2442 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
2443 // CHECK3:       .mastercheck:
2444 // CHECK3-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2445 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2446 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2447 // CHECK3-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
2448 // CHECK3-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
2449 // CHECK3-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
2450 // CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
2451 // CHECK3-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
2452 // CHECK3-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
2453 // CHECK3:       .master:
2454 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2455 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2456 // CHECK3-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
2457 // CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
2458 // CHECK3-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
2459 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
2460 // CHECK3-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
2461 // CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4
2462 // CHECK3-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP7]] to i32
2463 // CHECK3-NEXT:    [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
2464 // CHECK3-NEXT:    [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
2465 // CHECK3-NEXT:    store i16 [[CONV9]], i16* [[CONV]], align 4
2466 // CHECK3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
2467 // CHECK3-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
2468 // CHECK3-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP8]], 1
2469 // CHECK3-NEXT:    store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4
2470 // CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
2471 // CHECK3:       .termination.notifier:
2472 // CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
2473 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2474 // CHECK3-NEXT:    br label [[DOTEXIT]]
2475 // CHECK3:       .exit:
2476 // CHECK3-NEXT:    ret void
2477 //
2478 //
2479 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
2480 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[F:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR0]] {
2481 // CHECK3-NEXT:  entry:
2482 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
2483 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
2484 // CHECK3-NEXT:    [[F_ADDR:%.*]] = alloca i32*, align 4
2485 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca double*, align 4
2486 // CHECK3-NEXT:    [[TMP:%.*]] = alloca double*, align 4
2487 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
2488 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
2489 // CHECK3-NEXT:    store i32* [[F]], i32** [[F_ADDR]], align 4
2490 // CHECK3-NEXT:    store double* [[A]], double** [[A_ADDR]], align 4
2491 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[F_ADDR]], align 4
2492 // CHECK3-NEXT:    [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4
2493 // CHECK3-NEXT:    store double* [[TMP1]], double** [[TMP]], align 4
2494 // CHECK3-NEXT:    [[TMP2:%.*]] = load double*, double** [[TMP]], align 4
2495 // CHECK3-NEXT:    [[TMP3:%.*]] = load double, double* [[TMP2]], align 8
2496 // CHECK3-NEXT:    [[ADD:%.*]] = fadd double 2.000000e+00, [[TMP3]]
2497 // CHECK3-NEXT:    [[CONV:%.*]] = fptosi double [[ADD]] to i32
2498 // CHECK3-NEXT:    store i32 [[CONV]], i32* [[TMP0]], align 4
2499 // CHECK3-NEXT:    ret void
2500 //
2501 //
2502 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
2503 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3]] {
2504 // CHECK3-NEXT:  entry:
2505 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
2506 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2507 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2508 // CHECK3-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
2509 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
2510 // CHECK3-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
2511 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2512 // CHECK3-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
2513 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
2514 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
2515 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
2516 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
2517 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 1
2518 // CHECK3-NEXT:    [[TMP7:%.*]] = bitcast i8** [[TMP6]] to double**
2519 // CHECK3-NEXT:    [[TMP8:%.*]] = load double*, double** [[TMP7]], align 4
2520 // CHECK3-NEXT:    call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], double* [[TMP8]]) #[[ATTR2]]
2521 // CHECK3-NEXT:    ret void
2522 //
2523