1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
2 // Test target codegen - host bc file has to be created first.
3 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
4 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
5 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2
6 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
7 // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -disable-O0-optnone | FileCheck %s --check-prefix=CHECK3
8 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK4
9 // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK5
10 // expected-no-diagnostics
11 #ifndef HEADER
12 #define HEADER
13 
template<typename tx>
tx ftemplate(int n) {                                   // NB: kernel names below encode pragma line numbers (_l29/_l46/_l58) -- do not add/remove lines here
  tx a = 0;                                             // instantiated only as tx = int (see bar)
  short aa = 0;
  tx b[10];

  #pragma omp target if(0)                              // if(0): region runs on the host; no device kernel is generated for it
  {
    #pragma omp parallel
    {
      int a = 41;                                       // shadows outer 'a'; host-only outlined body
    }
    a += 1;
  }

  #pragma omp target                                    // kernel ..._l29: non-SPMD (generic) mode with worker loop
  {
    #pragma omp parallel                                // unconditional -> __kmpc_parallel_51 with if_expr = 1
    {
      int a = 42;
    }
    #pragma omp parallel if(0)                          // compile-time false -> serialized: __kmpc_parallel_51 with if_expr = 0
    {
      int a = 43;
    }
    #pragma omp parallel if(1)                          // compile-time true -> same emission as the unconditional case
    {
      int a = 44;
    }
    a += 1;                                             // master-only update of the mapped scalar
  }

  #pragma omp target if(n>40)                           // kernel ..._l46: target guarded by a runtime condition
  {
    #pragma omp parallel if(n>1000)                     // runtime if -> zext'ed i1 passed to __kmpc_parallel_51
    {
      int a = 45;
#pragma omp barrier
    }
    a += 1;
    aa += 1;                                            // 'short' path: checks sext/trunc codegen around the add
    b[2] += 1;                                          // array element update via mapped [10 x i32]
  }

  #pragma omp target                                    // kernel ..._l58: 'a' is globalized (team static memory) since the parallel region captures it
  {
    #pragma omp parallel
    {
    #pragma omp critical                                // critical section around the shared increment
    ++a;
    }
    ++a;
  }
  return a;
}
69 
// Host entry point: instantiating ftemplate<int> here is what triggers
// emission of all the target kernels whose IR the CHECK lines verify.
int bar(int n){
  int a = 0;

  a += ftemplate<int>(n);

  return a;
}
77 
78 #endif
79 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker
80 // CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
81 // CHECK1-NEXT:  entry:
82 // CHECK1-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
83 // CHECK1-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
84 // CHECK1-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
85 // CHECK1-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
86 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
87 // CHECK1:       .await.work:
88 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
89 // CHECK1-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
90 // CHECK1-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
91 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
92 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
93 // CHECK1-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
94 // CHECK1-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
95 // CHECK1:       .select.workers:
96 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
97 // CHECK1-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
98 // CHECK1-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
99 // CHECK1:       .execute.parallel:
100 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
101 // CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
102 // CHECK1-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
103 // CHECK1-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
104 // CHECK1:       .execute.fn:
105 // CHECK1-NEXT:    call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
106 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
107 // CHECK1:       .check.next:
108 // CHECK1-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8
109 // CHECK1-NEXT:    [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
110 // CHECK1-NEXT:    br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
111 // CHECK1:       .execute.fn2:
112 // CHECK1-NEXT:    call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
113 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
114 // CHECK1:       .check.next3:
115 // CHECK1-NEXT:    [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 8
116 // CHECK1-NEXT:    [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
117 // CHECK1-NEXT:    br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]]
118 // CHECK1:       .execute.fn5:
119 // CHECK1-NEXT:    call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
120 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
121 // CHECK1:       .check.next6:
122 // CHECK1-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
123 // CHECK1-NEXT:    call void [[TMP8]](i16 0, i32 [[TMP4]])
124 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
125 // CHECK1:       .terminate.parallel:
126 // CHECK1-NEXT:    call void @__kmpc_kernel_end_parallel()
127 // CHECK1-NEXT:    br label [[DOTBARRIER_PARALLEL]]
128 // CHECK1:       .barrier.parallel:
129 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
130 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK]]
131 // CHECK1:       .exit:
132 // CHECK1-NEXT:    ret void
133 //
134 //
135 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29
136 // CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
137 // CHECK1-NEXT:  entry:
138 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
139 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
140 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 8
141 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 8
142 // CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
143 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
144 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
145 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
146 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
147 // CHECK1-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
148 // CHECK1-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
149 // CHECK1-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
150 // CHECK1:       .worker:
151 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]]
152 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
153 // CHECK1:       .mastercheck:
154 // CHECK1-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
155 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
156 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
157 // CHECK1-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
158 // CHECK1-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
159 // CHECK1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
160 // CHECK1-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
161 // CHECK1-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
162 // CHECK1-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
163 // CHECK1:       .master:
164 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
165 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
166 // CHECK1-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
167 // CHECK1-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
168 // CHECK1-NEXT:    call void @__kmpc_data_sharing_init_stack()
169 // CHECK1-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
170 // CHECK1-NEXT:    [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
171 // CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i64 0)
172 // CHECK1-NEXT:    [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
173 // CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i64 0)
174 // CHECK1-NEXT:    [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8**
175 // CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i64 0)
176 // CHECK1-NEXT:    [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8
177 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], 1
178 // CHECK1-NEXT:    store i32 [[ADD]], i32* [[CONV]], align 8
179 // CHECK1-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
180 // CHECK1:       .termination.notifier:
181 // CHECK1-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
182 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
183 // CHECK1-NEXT:    br label [[DOTEXIT]]
184 // CHECK1:       .exit:
185 // CHECK1-NEXT:    ret void
186 //
187 //
188 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
189 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
190 // CHECK1-NEXT:  entry:
191 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
192 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
193 // CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
194 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
195 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
196 // CHECK1-NEXT:    store i32 42, i32* [[A]], align 4
197 // CHECK1-NEXT:    ret void
198 //
199 //
200 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
201 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
202 // CHECK1-NEXT:  entry:
203 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
204 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
205 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
206 // CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
207 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
208 // CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
209 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
210 // CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
211 // CHECK1-NEXT:    call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
212 // CHECK1-NEXT:    ret void
213 //
214 //
215 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
216 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
217 // CHECK1-NEXT:  entry:
218 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
219 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
220 // CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
221 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
222 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
223 // CHECK1-NEXT:    store i32 43, i32* [[A]], align 4
224 // CHECK1-NEXT:    ret void
225 //
226 //
227 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
228 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
229 // CHECK1-NEXT:  entry:
230 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
231 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
232 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
233 // CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
234 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
235 // CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
236 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
237 // CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
238 // CHECK1-NEXT:    call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
239 // CHECK1-NEXT:    ret void
240 //
241 //
242 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2
243 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
244 // CHECK1-NEXT:  entry:
245 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
246 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
247 // CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
248 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
249 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
250 // CHECK1-NEXT:    store i32 44, i32* [[A]], align 4
251 // CHECK1-NEXT:    ret void
252 //
253 //
254 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
255 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
256 // CHECK1-NEXT:  entry:
257 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
258 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
259 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
260 // CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
261 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
262 // CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
263 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
264 // CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
265 // CHECK1-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
266 // CHECK1-NEXT:    ret void
267 //
268 //
269 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker
270 // CHECK1-SAME: () #[[ATTR0]] {
271 // CHECK1-NEXT:  entry:
272 // CHECK1-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
273 // CHECK1-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
274 // CHECK1-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
275 // CHECK1-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
276 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
277 // CHECK1:       .await.work:
278 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
279 // CHECK1-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
280 // CHECK1-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
281 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
282 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
283 // CHECK1-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
284 // CHECK1-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
285 // CHECK1:       .select.workers:
286 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
287 // CHECK1-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
288 // CHECK1-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
289 // CHECK1:       .execute.parallel:
290 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
291 // CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
292 // CHECK1-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
293 // CHECK1-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
294 // CHECK1:       .execute.fn:
295 // CHECK1-NEXT:    call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
296 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
297 // CHECK1:       .check.next:
298 // CHECK1-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
299 // CHECK1-NEXT:    call void [[TMP6]](i16 0, i32 [[TMP4]])
300 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
301 // CHECK1:       .terminate.parallel:
302 // CHECK1-NEXT:    call void @__kmpc_kernel_end_parallel()
303 // CHECK1-NEXT:    br label [[DOTBARRIER_PARALLEL]]
304 // CHECK1:       .barrier.parallel:
305 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
306 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK]]
307 // CHECK1:       .exit:
308 // CHECK1-NEXT:    ret void
309 //
310 //
311 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
312 // CHECK1-SAME: (i64 [[N:%.*]], i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
313 // CHECK1-NEXT:  entry:
314 // CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
315 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
316 // CHECK1-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
317 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
318 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
319 // CHECK1-NEXT:    store i64 [[N]], i64* [[N_ADDR]], align 8
320 // CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
321 // CHECK1-NEXT:    store i64 [[AA]], i64* [[AA_ADDR]], align 8
322 // CHECK1-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
323 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
324 // CHECK1-NEXT:    [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32*
325 // CHECK1-NEXT:    [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
326 // CHECK1-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
327 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
328 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
329 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
330 // CHECK1-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
331 // CHECK1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
332 // CHECK1-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
333 // CHECK1:       .worker:
334 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]]
335 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
336 // CHECK1:       .mastercheck:
337 // CHECK1-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
338 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
339 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
340 // CHECK1-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
341 // CHECK1-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
342 // CHECK1-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
343 // CHECK1-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
344 // CHECK1-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
345 // CHECK1-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
346 // CHECK1:       .master:
347 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
348 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
349 // CHECK1-NEXT:    [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
350 // CHECK1-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
351 // CHECK1-NEXT:    call void @__kmpc_data_sharing_init_stack()
352 // CHECK1-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
353 // CHECK1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8
354 // CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000
355 // CHECK1-NEXT:    [[TMP8:%.*]] = zext i1 [[CMP]] to i32
356 // CHECK1-NEXT:    [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
357 // CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i64 0)
358 // CHECK1-NEXT:    [[TMP10:%.*]] = load i32, i32* [[CONV1]], align 8
359 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP10]], 1
360 // CHECK1-NEXT:    store i32 [[ADD]], i32* [[CONV1]], align 8
361 // CHECK1-NEXT:    [[TMP11:%.*]] = load i16, i16* [[CONV2]], align 8
362 // CHECK1-NEXT:    [[CONV9:%.*]] = sext i16 [[TMP11]] to i32
363 // CHECK1-NEXT:    [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1
364 // CHECK1-NEXT:    [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
365 // CHECK1-NEXT:    store i16 [[CONV11]], i16* [[CONV2]], align 8
366 // CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2
367 // CHECK1-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
368 // CHECK1-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP12]], 1
369 // CHECK1-NEXT:    store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4
370 // CHECK1-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
371 // CHECK1:       .termination.notifier:
372 // CHECK1-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
373 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
374 // CHECK1-NEXT:    br label [[DOTEXIT]]
375 // CHECK1:       .exit:
376 // CHECK1-NEXT:    ret void
377 //
378 //
379 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3
380 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
381 // CHECK1-NEXT:  entry:
382 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
383 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
384 // CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
385 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
386 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
387 // CHECK1-NEXT:    store i32 45, i32* [[A]], align 4
388 // CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
389 // CHECK1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
390 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
391 // CHECK1-NEXT:    ret void
392 //
393 //
394 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
395 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
396 // CHECK1-NEXT:  entry:
397 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
398 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
399 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
400 // CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
401 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
402 // CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
403 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
404 // CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
405 // CHECK1-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
406 // CHECK1-NEXT:    ret void
407 //
408 //
409 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker
410 // CHECK1-SAME: () #[[ATTR0]] {
411 // CHECK1-NEXT:  entry:
412 // CHECK1-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
413 // CHECK1-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
414 // CHECK1-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
415 // CHECK1-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
416 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
417 // CHECK1:       .await.work:
418 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
419 // CHECK1-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
420 // CHECK1-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
421 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
422 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
423 // CHECK1-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
424 // CHECK1-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
425 // CHECK1:       .select.workers:
426 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
427 // CHECK1-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
428 // CHECK1-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
429 // CHECK1:       .execute.parallel:
430 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
431 // CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
432 // CHECK1-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*)
433 // CHECK1-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
434 // CHECK1:       .execute.fn:
435 // CHECK1-NEXT:    call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
436 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
437 // CHECK1:       .check.next:
438 // CHECK1-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
439 // CHECK1-NEXT:    call void [[TMP6]](i16 0, i32 [[TMP4]])
440 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
441 // CHECK1:       .terminate.parallel:
442 // CHECK1-NEXT:    call void @__kmpc_kernel_end_parallel()
443 // CHECK1-NEXT:    br label [[DOTBARRIER_PARALLEL]]
444 // CHECK1:       .barrier.parallel:
445 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
446 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK]]
447 // CHECK1:       .exit:
448 // CHECK1-NEXT:    ret void
449 //
450 //
451 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58
452 // CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1]] {
453 // CHECK1-NEXT:  entry:
454 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
455 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
456 // CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
457 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
458 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
459 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
460 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
461 // CHECK1-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
462 // CHECK1-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
463 // CHECK1-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
464 // CHECK1:       .worker:
465 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]]
466 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
467 // CHECK1:       .mastercheck:
468 // CHECK1-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
469 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
470 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
471 // CHECK1-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
472 // CHECK1-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
473 // CHECK1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
474 // CHECK1-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
475 // CHECK1-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
476 // CHECK1-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
477 // CHECK1:       .master:
478 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
479 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
480 // CHECK1-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
481 // CHECK1-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
482 // CHECK1-NEXT:    call void @__kmpc_data_sharing_init_stack()
483 // CHECK1-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
484 // CHECK1-NEXT:    [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8
485 // CHECK1-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
486 // CHECK1-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
487 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0
488 // CHECK1-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty*
489 // CHECK1-NEXT:    [[TMP10:%.*]] = load i32, i32* [[CONV]], align 8
490 // CHECK1-NEXT:    [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0
491 // CHECK1-NEXT:    store i32 [[TMP10]], i32* [[A7]], align 4
492 // CHECK1-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
493 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
494 // CHECK1-NEXT:    [[TMP13:%.*]] = bitcast i32* [[A7]] to i8*
495 // CHECK1-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
496 // CHECK1-NEXT:    [[TMP14:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
497 // CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP14]], i64 1)
498 // CHECK1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A7]], align 4
499 // CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP15]], 1
500 // CHECK1-NEXT:    store i32 [[INC]], i32* [[A7]], align 4
501 // CHECK1-NEXT:    [[TMP16:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
502 // CHECK1-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP16]])
503 // CHECK1-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
504 // CHECK1:       .termination.notifier:
505 // CHECK1-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
506 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
507 // CHECK1-NEXT:    br label [[DOTEXIT]]
508 // CHECK1:       .exit:
509 // CHECK1-NEXT:    ret void
510 //
511 //
512 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4
513 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
514 // CHECK1-NEXT:  entry:
515 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
516 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
517 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 8
518 // CHECK1-NEXT:    [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
519 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
520 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
521 // CHECK1-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
522 // CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
523 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
524 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
525 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
526 // CHECK1-NEXT:    store i32 0, i32* [[CRITICAL_COUNTER]], align 4
527 // CHECK1-NEXT:    br label [[OMP_CRITICAL_LOOP:%.*]]
528 // CHECK1:       omp.critical.loop:
529 // CHECK1-NEXT:    [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
530 // CHECK1-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
531 // CHECK1-NEXT:    br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
532 // CHECK1:       omp.critical.test:
533 // CHECK1-NEXT:    [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
534 // CHECK1-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
535 // CHECK1-NEXT:    br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
536 // CHECK1:       omp.critical.body:
537 // CHECK1-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
538 // CHECK1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
539 // CHECK1-NEXT:    call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
540 // CHECK1-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
541 // CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP8]], 1
542 // CHECK1-NEXT:    store i32 [[INC]], i32* [[TMP0]], align 4
543 // CHECK1-NEXT:    call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
544 // CHECK1-NEXT:    br label [[OMP_CRITICAL_SYNC]]
545 // CHECK1:       omp.critical.sync:
546 // CHECK1-NEXT:    call void @__kmpc_syncwarp(i32 [[TMP1]])
547 // CHECK1-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
548 // CHECK1-NEXT:    store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
549 // CHECK1-NEXT:    br label [[OMP_CRITICAL_LOOP]]
550 // CHECK1:       omp.critical.exit:
551 // CHECK1-NEXT:    ret void
552 //
553 //
554 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
555 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
556 // CHECK1-NEXT:  entry:
557 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
558 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
559 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
560 // CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
561 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
562 // CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
563 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
564 // CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
565 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
566 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
567 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
568 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
569 // CHECK1-NEXT:    call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
570 // CHECK1-NEXT:    ret void
571 //
572 //
573 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker
574 // CHECK2-SAME: () #[[ATTR0:[0-9]+]] {
575 // CHECK2-NEXT:  entry:
576 // CHECK2-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
577 // CHECK2-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
578 // CHECK2-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
579 // CHECK2-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
580 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
581 // CHECK2:       .await.work:
582 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
583 // CHECK2-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
584 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
585 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
586 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
587 // CHECK2-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
588 // CHECK2-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
589 // CHECK2:       .select.workers:
590 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
591 // CHECK2-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
592 // CHECK2-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
593 // CHECK2:       .execute.parallel:
594 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
595 // CHECK2-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
596 // CHECK2-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
597 // CHECK2-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
598 // CHECK2:       .execute.fn:
599 // CHECK2-NEXT:    call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
600 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
601 // CHECK2:       .check.next:
602 // CHECK2-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8
603 // CHECK2-NEXT:    [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
604 // CHECK2-NEXT:    br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
605 // CHECK2:       .execute.fn2:
606 // CHECK2-NEXT:    call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
607 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
608 // CHECK2:       .check.next3:
609 // CHECK2-NEXT:    [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 8
610 // CHECK2-NEXT:    [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
611 // CHECK2-NEXT:    br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]]
612 // CHECK2:       .execute.fn5:
613 // CHECK2-NEXT:    call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
614 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
615 // CHECK2:       .check.next6:
616 // CHECK2-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
617 // CHECK2-NEXT:    call void [[TMP8]](i16 0, i32 [[TMP4]])
618 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
619 // CHECK2:       .terminate.parallel:
620 // CHECK2-NEXT:    call void @__kmpc_kernel_end_parallel()
621 // CHECK2-NEXT:    br label [[DOTBARRIER_PARALLEL]]
622 // CHECK2:       .barrier.parallel:
623 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
624 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK]]
625 // CHECK2:       .exit:
626 // CHECK2-NEXT:    ret void
627 //
628 //
629 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29
630 // CHECK2-SAME: (i64 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
631 // CHECK2-NEXT:  entry:
632 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
633 // CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
634 // CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 8
635 // CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 8
636 // CHECK2-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
637 // CHECK2-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
638 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
639 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
640 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
641 // CHECK2-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
642 // CHECK2-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
643 // CHECK2-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
644 // CHECK2:       .worker:
645 // CHECK2-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]]
646 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
647 // CHECK2:       .mastercheck:
648 // CHECK2-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
649 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
650 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
651 // CHECK2-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
652 // CHECK2-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
653 // CHECK2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
654 // CHECK2-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
655 // CHECK2-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
656 // CHECK2-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
657 // CHECK2:       .master:
658 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
659 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
660 // CHECK2-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
661 // CHECK2-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
662 // CHECK2-NEXT:    call void @__kmpc_data_sharing_init_stack()
663 // CHECK2-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
664 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
665 // CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i64 0)
666 // CHECK2-NEXT:    [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
667 // CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i64 0)
668 // CHECK2-NEXT:    [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8**
669 // CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i64 0)
670 // CHECK2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8
671 // CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], 1
672 // CHECK2-NEXT:    store i32 [[ADD]], i32* [[CONV]], align 8
673 // CHECK2-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
674 // CHECK2:       .termination.notifier:
675 // CHECK2-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
676 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
677 // CHECK2-NEXT:    br label [[DOTEXIT]]
678 // CHECK2:       .exit:
679 // CHECK2-NEXT:    ret void
680 //
681 //
682 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
683 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
684 // CHECK2-NEXT:  entry:
685 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
686 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
687 // CHECK2-NEXT:    [[A:%.*]] = alloca i32, align 4
688 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
689 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
690 // CHECK2-NEXT:    store i32 42, i32* [[A]], align 4
691 // CHECK2-NEXT:    ret void
692 //
693 //
694 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
695 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
696 // CHECK2-NEXT:  entry:
697 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
698 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
699 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
700 // CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
701 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
702 // CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
703 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
704 // CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
705 // CHECK2-NEXT:    call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
706 // CHECK2-NEXT:    ret void
707 //
708 //
709 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
710 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
711 // CHECK2-NEXT:  entry:
712 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
713 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
714 // CHECK2-NEXT:    [[A:%.*]] = alloca i32, align 4
715 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
716 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
717 // CHECK2-NEXT:    store i32 43, i32* [[A]], align 4
718 // CHECK2-NEXT:    ret void
719 //
720 //
721 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
722 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
723 // CHECK2-NEXT:  entry:
724 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
725 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
726 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
727 // CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
728 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
729 // CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
730 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
731 // CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
732 // CHECK2-NEXT:    call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
733 // CHECK2-NEXT:    ret void
734 //
735 //
736 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2
737 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
738 // CHECK2-NEXT:  entry:
739 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
740 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
741 // CHECK2-NEXT:    [[A:%.*]] = alloca i32, align 4
742 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
743 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
744 // CHECK2-NEXT:    store i32 44, i32* [[A]], align 4
745 // CHECK2-NEXT:    ret void
746 //
747 //
748 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
749 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
750 // CHECK2-NEXT:  entry:
751 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
752 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
753 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
754 // CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
755 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
756 // CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
757 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
758 // CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
759 // CHECK2-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
760 // CHECK2-NEXT:    ret void
761 //
762 //
763 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker
764 // CHECK2-SAME: () #[[ATTR0]] {
765 // CHECK2-NEXT:  entry:
766 // CHECK2-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
767 // CHECK2-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
768 // CHECK2-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
769 // CHECK2-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
770 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
771 // CHECK2:       .await.work:
772 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
773 // CHECK2-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
774 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
775 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
776 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
777 // CHECK2-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
778 // CHECK2-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
779 // CHECK2:       .select.workers:
780 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
781 // CHECK2-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
782 // CHECK2-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
783 // CHECK2:       .execute.parallel:
784 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
785 // CHECK2-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
786 // CHECK2-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
787 // CHECK2-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
788 // CHECK2:       .execute.fn:
789 // CHECK2-NEXT:    call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
790 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
791 // CHECK2:       .check.next:
792 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
793 // CHECK2-NEXT:    call void [[TMP6]](i16 0, i32 [[TMP4]])
794 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
795 // CHECK2:       .terminate.parallel:
796 // CHECK2-NEXT:    call void @__kmpc_kernel_end_parallel()
797 // CHECK2-NEXT:    br label [[DOTBARRIER_PARALLEL]]
798 // CHECK2:       .barrier.parallel:
799 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
800 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK]]
801 // CHECK2:       .exit:
802 // CHECK2-NEXT:    ret void
803 //
804 //
805 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
806 // CHECK2-SAME: (i64 [[N:%.*]], i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
807 // CHECK2-NEXT:  entry:
808 // CHECK2-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
809 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
810 // CHECK2-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
811 // CHECK2-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
812 // CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
813 // CHECK2-NEXT:    store i64 [[N]], i64* [[N_ADDR]], align 8
814 // CHECK2-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
815 // CHECK2-NEXT:    store i64 [[AA]], i64* [[AA_ADDR]], align 8
816 // CHECK2-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
817 // CHECK2-NEXT:    [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
818 // CHECK2-NEXT:    [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32*
819 // CHECK2-NEXT:    [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
820 // CHECK2-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
821 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
822 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
823 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
824 // CHECK2-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
825 // CHECK2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
826 // CHECK2-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
827 // CHECK2:       .worker:
828 // CHECK2-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]]
829 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
830 // CHECK2:       .mastercheck:
831 // CHECK2-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
832 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
833 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
834 // CHECK2-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
835 // CHECK2-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
836 // CHECK2-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
837 // CHECK2-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
838 // CHECK2-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
839 // CHECK2-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
840 // CHECK2:       .master:
841 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
842 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
843 // CHECK2-NEXT:    [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
844 // CHECK2-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
845 // CHECK2-NEXT:    call void @__kmpc_data_sharing_init_stack()
846 // CHECK2-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
847 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8
848 // CHECK2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000
849 // CHECK2-NEXT:    [[TMP8:%.*]] = zext i1 [[CMP]] to i32
850 // CHECK2-NEXT:    [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
851 // CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i64 0)
852 // CHECK2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[CONV1]], align 8
853 // CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP10]], 1
854 // CHECK2-NEXT:    store i32 [[ADD]], i32* [[CONV1]], align 8
855 // CHECK2-NEXT:    [[TMP11:%.*]] = load i16, i16* [[CONV2]], align 8
856 // CHECK2-NEXT:    [[CONV9:%.*]] = sext i16 [[TMP11]] to i32
857 // CHECK2-NEXT:    [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1
858 // CHECK2-NEXT:    [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
859 // CHECK2-NEXT:    store i16 [[CONV11]], i16* [[CONV2]], align 8
860 // CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2
861 // CHECK2-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
862 // CHECK2-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP12]], 1
863 // CHECK2-NEXT:    store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4
864 // CHECK2-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
865 // CHECK2:       .termination.notifier:
866 // CHECK2-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
867 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
868 // CHECK2-NEXT:    br label [[DOTEXIT]]
869 // CHECK2:       .exit:
870 // CHECK2-NEXT:    ret void
871 //
872 //
873 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3
874 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
875 // CHECK2-NEXT:  entry:
876 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
877 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
878 // CHECK2-NEXT:    [[A:%.*]] = alloca i32, align 4
879 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
880 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
881 // CHECK2-NEXT:    store i32 45, i32* [[A]], align 4
882 // CHECK2-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
883 // CHECK2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
884 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
885 // CHECK2-NEXT:    ret void
886 //
887 //
888 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
889 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
890 // CHECK2-NEXT:  entry:
891 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
892 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
893 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
894 // CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
895 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
896 // CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
897 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
898 // CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
899 // CHECK2-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
900 // CHECK2-NEXT:    ret void
901 //
902 //
903 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker
904 // CHECK2-SAME: () #[[ATTR0]] {
905 // CHECK2-NEXT:  entry:
906 // CHECK2-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
907 // CHECK2-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
908 // CHECK2-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
909 // CHECK2-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
910 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
911 // CHECK2:       .await.work:
912 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
913 // CHECK2-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
914 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
915 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
916 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
917 // CHECK2-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
918 // CHECK2-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
919 // CHECK2:       .select.workers:
920 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
921 // CHECK2-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
922 // CHECK2-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
923 // CHECK2:       .execute.parallel:
924 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
925 // CHECK2-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
926 // CHECK2-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*)
927 // CHECK2-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
928 // CHECK2:       .execute.fn:
929 // CHECK2-NEXT:    call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
930 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
931 // CHECK2:       .check.next:
932 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
933 // CHECK2-NEXT:    call void [[TMP6]](i16 0, i32 [[TMP4]])
934 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
935 // CHECK2:       .terminate.parallel:
936 // CHECK2-NEXT:    call void @__kmpc_kernel_end_parallel()
937 // CHECK2-NEXT:    br label [[DOTBARRIER_PARALLEL]]
938 // CHECK2:       .barrier.parallel:
939 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
940 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK]]
941 // CHECK2:       .exit:
942 // CHECK2-NEXT:    ret void
943 //
944 //
945 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58
946 // CHECK2-SAME: (i64 [[A:%.*]]) #[[ATTR1]] {
947 // CHECK2-NEXT:  entry:
948 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
949 // CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
950 // CHECK2-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
951 // CHECK2-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
952 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
953 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
954 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
955 // CHECK2-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
956 // CHECK2-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
957 // CHECK2-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
958 // CHECK2:       .worker:
959 // CHECK2-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]]
960 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
961 // CHECK2:       .mastercheck:
962 // CHECK2-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
963 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
964 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
965 // CHECK2-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
966 // CHECK2-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
967 // CHECK2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
968 // CHECK2-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
969 // CHECK2-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
970 // CHECK2-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
971 // CHECK2:       .master:
972 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
973 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
974 // CHECK2-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
975 // CHECK2-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
976 // CHECK2-NEXT:    call void @__kmpc_data_sharing_init_stack()
977 // CHECK2-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1)
978 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
979 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8
980 // CHECK2-NEXT:    [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
981 // CHECK2-NEXT:    store i32 [[TMP7]], i32* [[A7]], align 4
982 // CHECK2-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
983 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
984 // CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i32* [[A7]] to i8*
985 // CHECK2-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 8
986 // CHECK2-NEXT:    [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
987 // CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i64 1)
988 // CHECK2-NEXT:    [[TMP12:%.*]] = load i32, i32* [[A7]], align 4
989 // CHECK2-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP12]], 1
990 // CHECK2-NEXT:    store i32 [[INC]], i32* [[A7]], align 4
991 // CHECK2-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
992 // CHECK2-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
993 // CHECK2:       .termination.notifier:
994 // CHECK2-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
995 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
996 // CHECK2-NEXT:    br label [[DOTEXIT]]
997 // CHECK2:       .exit:
998 // CHECK2-NEXT:    ret void
999 //
1000 //
1001 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4
1002 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
1003 // CHECK2-NEXT:  entry:
1004 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
1005 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
1006 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 8
1007 // CHECK2-NEXT:    [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
1008 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
1009 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
1010 // CHECK2-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
1011 // CHECK2-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
1012 // CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
1013 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1014 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1015 // CHECK2-NEXT:    store i32 0, i32* [[CRITICAL_COUNTER]], align 4
1016 // CHECK2-NEXT:    br label [[OMP_CRITICAL_LOOP:%.*]]
1017 // CHECK2:       omp.critical.loop:
1018 // CHECK2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
1019 // CHECK2-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
1020 // CHECK2-NEXT:    br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
1021 // CHECK2:       omp.critical.test:
1022 // CHECK2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
1023 // CHECK2-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
1024 // CHECK2-NEXT:    br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
1025 // CHECK2:       omp.critical.body:
1026 // CHECK2-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
1027 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
1028 // CHECK2-NEXT:    call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
1029 // CHECK2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
1030 // CHECK2-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP8]], 1
1031 // CHECK2-NEXT:    store i32 [[INC]], i32* [[TMP0]], align 4
1032 // CHECK2-NEXT:    call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
1033 // CHECK2-NEXT:    br label [[OMP_CRITICAL_SYNC]]
1034 // CHECK2:       omp.critical.sync:
1035 // CHECK2-NEXT:    call void @__kmpc_syncwarp(i32 [[TMP1]])
1036 // CHECK2-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
1037 // CHECK2-NEXT:    store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
1038 // CHECK2-NEXT:    br label [[OMP_CRITICAL_LOOP]]
1039 // CHECK2:       omp.critical.exit:
1040 // CHECK2-NEXT:    ret void
1041 //
1042 //
1043 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
1044 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1045 // CHECK2-NEXT:  entry:
1046 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
1047 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1048 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1049 // CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
1050 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1051 // CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
1052 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1053 // CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1054 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
1055 // CHECK2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
1056 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
1057 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
1058 // CHECK2-NEXT:    call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
1059 // CHECK2-NEXT:    ret void
1060 //
1061 //
1062 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker
1063 // CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
1064 // CHECK3-NEXT:  entry:
1065 // CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1066 // CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1067 // CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1068 // CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1069 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1070 // CHECK3:       .await.work:
1071 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1072 // CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1073 // CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1074 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1075 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1076 // CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1077 // CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1078 // CHECK3:       .select.workers:
1079 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1080 // CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1081 // CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1082 // CHECK3:       .execute.parallel:
1083 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
1084 // CHECK3-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1085 // CHECK3-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
1086 // CHECK3-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
1087 // CHECK3:       .execute.fn:
1088 // CHECK3-NEXT:    call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2:[0-9]+]]
1089 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1090 // CHECK3:       .check.next:
1091 // CHECK3-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1092 // CHECK3-NEXT:    [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
1093 // CHECK3-NEXT:    br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
1094 // CHECK3:       .execute.fn2:
1095 // CHECK3-NEXT:    call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]]
1096 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
1097 // CHECK3:       .check.next3:
1098 // CHECK3-NEXT:    [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1099 // CHECK3-NEXT:    [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
1100 // CHECK3-NEXT:    br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]]
1101 // CHECK3:       .execute.fn5:
1102 // CHECK3-NEXT:    call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]]
1103 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
1104 // CHECK3:       .check.next6:
1105 // CHECK3-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1106 // CHECK3-NEXT:    call void [[TMP8]](i16 0, i32 [[TMP4]])
1107 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
1108 // CHECK3:       .terminate.parallel:
1109 // CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
1110 // CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1111 // CHECK3:       .barrier.parallel:
1112 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1113 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
1114 // CHECK3:       .exit:
1115 // CHECK3-NEXT:    ret void
1116 //
1117 //
1118 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29
1119 // CHECK3-SAME: (i32 [[A:%.*]]) #[[ATTR0]] {
1120 // CHECK3-NEXT:  entry:
1121 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1122 // CHECK3-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
1123 // CHECK3-NEXT:    [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4
1124 // CHECK3-NEXT:    [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4
1125 // CHECK3-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1126 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1127 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1128 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1129 // CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1130 // CHECK3-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1131 // CHECK3-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1132 // CHECK3:       .worker:
1133 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR2]]
1134 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
1135 // CHECK3:       .mastercheck:
1136 // CHECK3-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1137 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1138 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1139 // CHECK3-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1140 // CHECK3-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1141 // CHECK3-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1142 // CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1143 // CHECK3-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1144 // CHECK3-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1145 // CHECK3:       .master:
1146 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1147 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1148 // CHECK3-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1149 // CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1150 // CHECK3-NEXT:    call void @__kmpc_data_sharing_init_stack()
1151 // CHECK3-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1152 // CHECK3-NEXT:    [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1153 // CHECK3-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0)
1154 // CHECK3-NEXT:    [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
1155 // CHECK3-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0)
1156 // CHECK3-NEXT:    [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8**
1157 // CHECK3-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0)
1158 // CHECK3-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4
1159 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], 1
1160 // CHECK3-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
1161 // CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1162 // CHECK3:       .termination.notifier:
1163 // CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1164 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1165 // CHECK3-NEXT:    br label [[DOTEXIT]]
1166 // CHECK3:       .exit:
1167 // CHECK3-NEXT:    ret void
1168 //
1169 //
1170 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
1171 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1172 // CHECK3-NEXT:  entry:
1173 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1174 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1175 // CHECK3-NEXT:    [[A:%.*]] = alloca i32, align 4
1176 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1177 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1178 // CHECK3-NEXT:    store i32 42, i32* [[A]], align 4
1179 // CHECK3-NEXT:    ret void
1180 //
1181 //
1182 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
1183 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1184 // CHECK3-NEXT:  entry:
1185 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
1186 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1187 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1188 // CHECK3-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
1189 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1190 // CHECK3-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
1191 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1192 // CHECK3-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1193 // CHECK3-NEXT:    call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
1194 // CHECK3-NEXT:    ret void
1195 //
1196 //
1197 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
1198 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1199 // CHECK3-NEXT:  entry:
1200 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1201 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1202 // CHECK3-NEXT:    [[A:%.*]] = alloca i32, align 4
1203 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1204 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1205 // CHECK3-NEXT:    store i32 43, i32* [[A]], align 4
1206 // CHECK3-NEXT:    ret void
1207 //
1208 //
1209 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
1210 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1211 // CHECK3-NEXT:  entry:
1212 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
1213 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1214 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1215 // CHECK3-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
1216 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1217 // CHECK3-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
1218 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1219 // CHECK3-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1220 // CHECK3-NEXT:    call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
1221 // CHECK3-NEXT:    ret void
1222 //
1223 //
1224 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2
1225 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1226 // CHECK3-NEXT:  entry:
1227 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1228 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1229 // CHECK3-NEXT:    [[A:%.*]] = alloca i32, align 4
1230 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1231 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1232 // CHECK3-NEXT:    store i32 44, i32* [[A]], align 4
1233 // CHECK3-NEXT:    ret void
1234 //
1235 //
1236 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
1237 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1238 // CHECK3-NEXT:  entry:
1239 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
1240 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1241 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1242 // CHECK3-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
1243 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1244 // CHECK3-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
1245 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1246 // CHECK3-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1247 // CHECK3-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
1248 // CHECK3-NEXT:    ret void
1249 //
1250 //
1251 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker
1252 // CHECK3-SAME: () #[[ATTR0]] {
1253 // CHECK3-NEXT:  entry:
1254 // CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1255 // CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1256 // CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1257 // CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1258 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1259 // CHECK3:       .await.work:
1260 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1261 // CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1262 // CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1263 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1264 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1265 // CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1266 // CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1267 // CHECK3:       .select.workers:
1268 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1269 // CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1270 // CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1271 // CHECK3:       .execute.parallel:
1272 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1273 // CHECK3-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1274 // CHECK3-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
1275 // CHECK3-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
1276 // CHECK3:       .execute.fn:
1277 // CHECK3-NEXT:    call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]]
1278 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1279 // CHECK3:       .check.next:
1280 // CHECK3-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1281 // CHECK3-NEXT:    call void [[TMP6]](i16 0, i32 [[TMP4]])
1282 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
1283 // CHECK3:       .terminate.parallel:
1284 // CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
1285 // CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1286 // CHECK3:       .barrier.parallel:
1287 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1288 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
1289 // CHECK3:       .exit:
1290 // CHECK3-NEXT:    ret void
1291 //
1292 //
1293 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
1294 // CHECK3-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
1295 // CHECK3-NEXT:  entry:
1296 // CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
1297 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1298 // CHECK3-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
1299 // CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
1300 // CHECK3-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
1301 // CHECK3-NEXT:    store i32 [[N]], i32* [[N_ADDR]], align 4
1302 // CHECK3-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1303 // CHECK3-NEXT:    store i32 [[AA]], i32* [[AA_ADDR]], align 4
1304 // CHECK3-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
1305 // CHECK3-NEXT:    [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
1306 // CHECK3-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
1307 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1308 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1309 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1310 // CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1311 // CHECK3-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1312 // CHECK3-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1313 // CHECK3:       .worker:
1314 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR2]]
1315 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
1316 // CHECK3:       .mastercheck:
1317 // CHECK3-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1318 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1319 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1320 // CHECK3-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1321 // CHECK3-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1322 // CHECK3-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
1323 // CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
1324 // CHECK3-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1325 // CHECK3-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1326 // CHECK3:       .master:
1327 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1328 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1329 // CHECK3-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1330 // CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1331 // CHECK3-NEXT:    call void @__kmpc_data_sharing_init_stack()
1332 // CHECK3-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1333 // CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4
1334 // CHECK3-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000
1335 // CHECK3-NEXT:    [[TMP8:%.*]] = zext i1 [[CMP]] to i32
1336 // CHECK3-NEXT:    [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1337 // CHECK3-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0)
1338 // CHECK3-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4
1339 // CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP10]], 1
1340 // CHECK3-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
1341 // CHECK3-NEXT:    [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4
1342 // CHECK3-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP11]] to i32
1343 // CHECK3-NEXT:    [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
1344 // CHECK3-NEXT:    [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
1345 // CHECK3-NEXT:    store i16 [[CONV9]], i16* [[CONV]], align 4
1346 // CHECK3-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
1347 // CHECK3-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
1348 // CHECK3-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1
1349 // CHECK3-NEXT:    store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4
1350 // CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1351 // CHECK3:       .termination.notifier:
1352 // CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1353 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1354 // CHECK3-NEXT:    br label [[DOTEXIT]]
1355 // CHECK3:       .exit:
1356 // CHECK3-NEXT:    ret void
1357 //
1358 //
1359 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3
1360 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1361 // CHECK3-NEXT:  entry:
1362 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1363 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1364 // CHECK3-NEXT:    [[A:%.*]] = alloca i32, align 4
1365 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1366 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1367 // CHECK3-NEXT:    store i32 45, i32* [[A]], align 4
1368 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
1369 // CHECK3-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
1370 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
1371 // CHECK3-NEXT:    ret void
1372 //
1373 //
1374 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
1375 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1376 // CHECK3-NEXT:  entry:
1377 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
1378 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1379 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1380 // CHECK3-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
1381 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1382 // CHECK3-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
1383 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1384 // CHECK3-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1385 // CHECK3-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
1386 // CHECK3-NEXT:    ret void
1387 //
1388 //
1389 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker
1390 // CHECK3-SAME: () #[[ATTR0]] {
1391 // CHECK3-NEXT:  entry:
1392 // CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1393 // CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1394 // CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1395 // CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1396 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1397 // CHECK3:       .await.work:
1398 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1399 // CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1400 // CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1401 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1402 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1403 // CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1404 // CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1405 // CHECK3:       .select.workers:
1406 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1407 // CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1408 // CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1409 // CHECK3:       .execute.parallel:
1410 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1411 // CHECK3-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1412 // CHECK3-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*)
1413 // CHECK3-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
1414 // CHECK3:       .execute.fn:
1415 // CHECK3-NEXT:    call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]]
1416 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1417 // CHECK3:       .check.next:
1418 // CHECK3-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1419 // CHECK3-NEXT:    call void [[TMP6]](i16 0, i32 [[TMP4]])
1420 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
1421 // CHECK3:       .terminate.parallel:
1422 // CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
1423 // CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1424 // CHECK3:       .barrier.parallel:
1425 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1426 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
1427 // CHECK3:       .exit:
1428 // CHECK3-NEXT:    ret void
1429 //
1430 //
1431 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58
1432 // CHECK3-SAME: (i32 [[A:%.*]]) #[[ATTR0]] {
1433 // CHECK3-NEXT:  entry:
1434 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1435 // CHECK3-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
1436 // CHECK3-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1437 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1438 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1439 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1440 // CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1441 // CHECK3-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1442 // CHECK3-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1443 // CHECK3:       .worker:
1444 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR2]]
1445 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
1446 // CHECK3:       .mastercheck:
1447 // CHECK3-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1448 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1449 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1450 // CHECK3-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1451 // CHECK3-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1452 // CHECK3-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1453 // CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1454 // CHECK3-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1455 // CHECK3-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1456 // CHECK3:       .master:
1457 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1458 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1459 // CHECK3-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1460 // CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1461 // CHECK3-NEXT:    call void @__kmpc_data_sharing_init_stack()
1462 // CHECK3-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
1463 // CHECK3-NEXT:    [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
1464 // CHECK3-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
1465 // CHECK3-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
1466 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
1467 // CHECK3-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty*
1468 // CHECK3-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4
1469 // CHECK3-NEXT:    [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0
1470 // CHECK3-NEXT:    store i32 [[TMP10]], i32* [[A7]], align 4
1471 // CHECK3-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1472 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
1473 // CHECK3-NEXT:    [[TMP13:%.*]] = bitcast i32* [[A7]] to i8*
1474 // CHECK3-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 4
1475 // CHECK3-NEXT:    [[TMP14:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1476 // CHECK3-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP14]], i32 1)
1477 // CHECK3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A7]], align 4
1478 // CHECK3-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP15]], 1
1479 // CHECK3-NEXT:    store i32 [[INC]], i32* [[A7]], align 4
1480 // CHECK3-NEXT:    [[TMP16:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
1481 // CHECK3-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP16]])
1482 // CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1483 // CHECK3:       .termination.notifier:
1484 // CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1485 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1486 // CHECK3-NEXT:    br label [[DOTEXIT]]
1487 // CHECK3:       .exit:
1488 // CHECK3-NEXT:    ret void
1489 //
1490 //
1491 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4
1492 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] {
1493 // CHECK3-NEXT:  entry:
1494 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1495 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1496 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
1497 // CHECK3-NEXT:    [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
1498 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1499 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1500 // CHECK3-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
1501 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
1502 // CHECK3-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
1503 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1504 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1505 // CHECK3-NEXT:    store i32 0, i32* [[CRITICAL_COUNTER]], align 4
1506 // CHECK3-NEXT:    br label [[OMP_CRITICAL_LOOP:%.*]]
1507 // CHECK3:       omp.critical.loop:
1508 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
1509 // CHECK3-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
1510 // CHECK3-NEXT:    br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
1511 // CHECK3:       omp.critical.test:
1512 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
1513 // CHECK3-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
1514 // CHECK3-NEXT:    br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
1515 // CHECK3:       omp.critical.body:
1516 // CHECK3-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
1517 // CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
1518 // CHECK3-NEXT:    call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
1519 // CHECK3-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
1520 // CHECK3-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP8]], 1
1521 // CHECK3-NEXT:    store i32 [[INC]], i32* [[TMP0]], align 4
1522 // CHECK3-NEXT:    call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
1523 // CHECK3-NEXT:    br label [[OMP_CRITICAL_SYNC]]
1524 // CHECK3:       omp.critical.sync:
1525 // CHECK3-NEXT:    call void @__kmpc_syncwarp(i32 [[TMP1]])
1526 // CHECK3-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
1527 // CHECK3-NEXT:    store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
1528 // CHECK3-NEXT:    br label [[OMP_CRITICAL_LOOP]]
1529 // CHECK3:       omp.critical.exit:
1530 // CHECK3-NEXT:    ret void
1531 //
1532 //
1533 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
1534 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1535 // CHECK3-NEXT:  entry:
1536 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
1537 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1538 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1539 // CHECK3-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
1540 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1541 // CHECK3-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
1542 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1543 // CHECK3-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1544 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
1545 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
1546 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
1547 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
1548 // CHECK3-NEXT:    call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR2]]
1549 // CHECK3-NEXT:    ret void
1550 //
1551 //
1552 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker
1553 // CHECK4-SAME: () #[[ATTR0:[0-9]+]] {
1554 // CHECK4-NEXT:  entry:
1555 // CHECK4-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1556 // CHECK4-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1557 // CHECK4-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1558 // CHECK4-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1559 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1560 // CHECK4:       .await.work:
1561 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1562 // CHECK4-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1563 // CHECK4-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1564 // CHECK4-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1565 // CHECK4-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1566 // CHECK4-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1567 // CHECK4-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1568 // CHECK4:       .select.workers:
1569 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1570 // CHECK4-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1571 // CHECK4-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1572 // CHECK4:       .execute.parallel:
1573 // CHECK4-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
1574 // CHECK4-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1575 // CHECK4-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
1576 // CHECK4-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
1577 // CHECK4:       .execute.fn:
1578 // CHECK4-NEXT:    call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
1579 // CHECK4-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1580 // CHECK4:       .check.next:
1581 // CHECK4-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1582 // CHECK4-NEXT:    [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
1583 // CHECK4-NEXT:    br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
1584 // CHECK4:       .execute.fn2:
1585 // CHECK4-NEXT:    call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
1586 // CHECK4-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
1587 // CHECK4:       .check.next3:
1588 // CHECK4-NEXT:    [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1589 // CHECK4-NEXT:    [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
1590 // CHECK4-NEXT:    br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]]
1591 // CHECK4:       .execute.fn5:
1592 // CHECK4-NEXT:    call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
1593 // CHECK4-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
1594 // CHECK4:       .check.next6:
1595 // CHECK4-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1596 // CHECK4-NEXT:    call void [[TMP8]](i16 0, i32 [[TMP4]])
1597 // CHECK4-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
1598 // CHECK4:       .terminate.parallel:
1599 // CHECK4-NEXT:    call void @__kmpc_kernel_end_parallel()
1600 // CHECK4-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1601 // CHECK4:       .barrier.parallel:
1602 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1603 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK]]
1604 // CHECK4:       .exit:
1605 // CHECK4-NEXT:    ret void
1606 //
1607 //
1608 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29
1609 // CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
1610 // CHECK4-NEXT:  entry:
1611 // CHECK4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1612 // CHECK4-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
1613 // CHECK4-NEXT:    [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4
1614 // CHECK4-NEXT:    [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4
1615 // CHECK4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1616 // CHECK4-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1617 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1618 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1619 // CHECK4-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1620 // CHECK4-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1621 // CHECK4-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1622 // CHECK4:       .worker:
1623 // CHECK4-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]]
1624 // CHECK4-NEXT:    br label [[DOTEXIT:%.*]]
1625 // CHECK4:       .mastercheck:
1626 // CHECK4-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1627 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1628 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1629 // CHECK4-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1630 // CHECK4-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1631 // CHECK4-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1632 // CHECK4-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1633 // CHECK4-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1634 // CHECK4-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1635 // CHECK4:       .master:
1636 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1637 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1638 // CHECK4-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1639 // CHECK4-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1640 // CHECK4-NEXT:    call void @__kmpc_data_sharing_init_stack()
1641 // CHECK4-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1642 // CHECK4-NEXT:    [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1643 // CHECK4-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0)
1644 // CHECK4-NEXT:    [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
1645 // CHECK4-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0)
1646 // CHECK4-NEXT:    [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8**
1647 // CHECK4-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0)
1648 // CHECK4-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4
1649 // CHECK4-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], 1
1650 // CHECK4-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
1651 // CHECK4-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1652 // CHECK4:       .termination.notifier:
1653 // CHECK4-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1654 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1655 // CHECK4-NEXT:    br label [[DOTEXIT]]
1656 // CHECK4:       .exit:
1657 // CHECK4-NEXT:    ret void
1658 //
1659 //
1660 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
1661 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
1662 // CHECK4-NEXT:  entry:
1663 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1664 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1665 // CHECK4-NEXT:    [[A:%.*]] = alloca i32, align 4
1666 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1667 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1668 // CHECK4-NEXT:    store i32 42, i32* [[A]], align 4
1669 // CHECK4-NEXT:    ret void
1670 //
1671 //
1672 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
1673 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1674 // CHECK4-NEXT:  entry:
1675 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
1676 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1677 // CHECK4-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1678 // CHECK4-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
1679 // CHECK4-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1680 // CHECK4-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
1681 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1682 // CHECK4-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1683 // CHECK4-NEXT:    call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
1684 // CHECK4-NEXT:    ret void
1685 //
1686 //
1687 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1
1688 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
1689 // CHECK4-NEXT:  entry:
1690 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1691 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1692 // CHECK4-NEXT:    [[A:%.*]] = alloca i32, align 4
1693 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1694 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1695 // CHECK4-NEXT:    store i32 43, i32* [[A]], align 4
1696 // CHECK4-NEXT:    ret void
1697 //
1698 //
1699 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
1700 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1701 // CHECK4-NEXT:  entry:
1702 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
1703 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1704 // CHECK4-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1705 // CHECK4-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
1706 // CHECK4-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1707 // CHECK4-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
1708 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1709 // CHECK4-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1710 // CHECK4-NEXT:    call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
1711 // CHECK4-NEXT:    ret void
1712 //
1713 //
1714 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2
1715 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
1716 // CHECK4-NEXT:  entry:
1717 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1718 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1719 // CHECK4-NEXT:    [[A:%.*]] = alloca i32, align 4
1720 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1721 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1722 // CHECK4-NEXT:    store i32 44, i32* [[A]], align 4
1723 // CHECK4-NEXT:    ret void
1724 //
1725 //
1726 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
1727 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1728 // CHECK4-NEXT:  entry:
1729 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
1730 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1731 // CHECK4-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1732 // CHECK4-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
1733 // CHECK4-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1734 // CHECK4-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
1735 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1736 // CHECK4-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1737 // CHECK4-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
1738 // CHECK4-NEXT:    ret void
1739 //
1740 //
1741 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker
1742 // CHECK4-SAME: () #[[ATTR0]] {
1743 // CHECK4-NEXT:  entry:
1744 // CHECK4-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1745 // CHECK4-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1746 // CHECK4-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1747 // CHECK4-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1748 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1749 // CHECK4:       .await.work:
1750 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1751 // CHECK4-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1752 // CHECK4-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1753 // CHECK4-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1754 // CHECK4-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1755 // CHECK4-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1756 // CHECK4-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1757 // CHECK4:       .select.workers:
1758 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1759 // CHECK4-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1760 // CHECK4-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1761 // CHECK4:       .execute.parallel:
1762 // CHECK4-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1763 // CHECK4-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1764 // CHECK4-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
1765 // CHECK4-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
1766 // CHECK4:       .execute.fn:
1767 // CHECK4-NEXT:    call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
1768 // CHECK4-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1769 // CHECK4:       .check.next:
1770 // CHECK4-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1771 // CHECK4-NEXT:    call void [[TMP6]](i16 0, i32 [[TMP4]])
1772 // CHECK4-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
1773 // CHECK4:       .terminate.parallel:
1774 // CHECK4-NEXT:    call void @__kmpc_kernel_end_parallel()
1775 // CHECK4-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1776 // CHECK4:       .barrier.parallel:
1777 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1778 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK]]
1779 // CHECK4:       .exit:
1780 // CHECK4-NEXT:    ret void
1781 //
1782 //
1783 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
1784 // CHECK4-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
1785 // CHECK4-NEXT:  entry:
1786 // CHECK4-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
1787 // CHECK4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1788 // CHECK4-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
1789 // CHECK4-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
1790 // CHECK4-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
1791 // CHECK4-NEXT:    store i32 [[N]], i32* [[N_ADDR]], align 4
1792 // CHECK4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1793 // CHECK4-NEXT:    store i32 [[AA]], i32* [[AA_ADDR]], align 4
1794 // CHECK4-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
1795 // CHECK4-NEXT:    [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
1796 // CHECK4-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
1797 // CHECK4-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1798 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1799 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1800 // CHECK4-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1801 // CHECK4-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1802 // CHECK4-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1803 // CHECK4:       .worker:
1804 // CHECK4-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]]
1805 // CHECK4-NEXT:    br label [[DOTEXIT:%.*]]
1806 // CHECK4:       .mastercheck:
1807 // CHECK4-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1808 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1809 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1810 // CHECK4-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1811 // CHECK4-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1812 // CHECK4-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
1813 // CHECK4-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
1814 // CHECK4-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1815 // CHECK4-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1816 // CHECK4:       .master:
1817 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1818 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1819 // CHECK4-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1820 // CHECK4-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1821 // CHECK4-NEXT:    call void @__kmpc_data_sharing_init_stack()
1822 // CHECK4-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1823 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4
1824 // CHECK4-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000
1825 // CHECK4-NEXT:    [[TMP8:%.*]] = zext i1 [[CMP]] to i32
1826 // CHECK4-NEXT:    [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1827 // CHECK4-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0)
1828 // CHECK4-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4
1829 // CHECK4-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP10]], 1
1830 // CHECK4-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
1831 // CHECK4-NEXT:    [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4
1832 // CHECK4-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP11]] to i32
1833 // CHECK4-NEXT:    [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
1834 // CHECK4-NEXT:    [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
1835 // CHECK4-NEXT:    store i16 [[CONV9]], i16* [[CONV]], align 4
1836 // CHECK4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
1837 // CHECK4-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
1838 // CHECK4-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1
1839 // CHECK4-NEXT:    store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4
1840 // CHECK4-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1841 // CHECK4:       .termination.notifier:
1842 // CHECK4-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1843 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1844 // CHECK4-NEXT:    br label [[DOTEXIT]]
1845 // CHECK4:       .exit:
1846 // CHECK4-NEXT:    ret void
1847 //
1848 //
1849 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3
1850 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
1851 // CHECK4-NEXT:  entry:
1852 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1853 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1854 // CHECK4-NEXT:    [[A:%.*]] = alloca i32, align 4
1855 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1856 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1857 // CHECK4-NEXT:    store i32 45, i32* [[A]], align 4
1858 // CHECK4-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
1859 // CHECK4-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
1860 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
1861 // CHECK4-NEXT:    ret void
1862 //
1863 //
1864 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
1865 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1866 // CHECK4-NEXT:  entry:
1867 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
1868 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1869 // CHECK4-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1870 // CHECK4-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
1871 // CHECK4-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1872 // CHECK4-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
1873 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1874 // CHECK4-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1875 // CHECK4-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
1876 // CHECK4-NEXT:    ret void
1877 //
1878 //
1879 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker
1880 // CHECK4-SAME: () #[[ATTR0]] {
1881 // CHECK4-NEXT:  entry:
1882 // CHECK4-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1883 // CHECK4-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1884 // CHECK4-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1885 // CHECK4-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1886 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1887 // CHECK4:       .await.work:
1888 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1889 // CHECK4-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1890 // CHECK4-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1891 // CHECK4-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1892 // CHECK4-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1893 // CHECK4-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1894 // CHECK4-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1895 // CHECK4:       .select.workers:
1896 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1897 // CHECK4-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1898 // CHECK4-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1899 // CHECK4:       .execute.parallel:
1900 // CHECK4-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1901 // CHECK4-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1902 // CHECK4-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*)
1903 // CHECK4-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
1904 // CHECK4:       .execute.fn:
1905 // CHECK4-NEXT:    call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
1906 // CHECK4-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1907 // CHECK4:       .check.next:
1908 // CHECK4-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1909 // CHECK4-NEXT:    call void [[TMP6]](i16 0, i32 [[TMP4]])
1910 // CHECK4-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
1911 // CHECK4:       .terminate.parallel:
1912 // CHECK4-NEXT:    call void @__kmpc_kernel_end_parallel()
1913 // CHECK4-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1914 // CHECK4:       .barrier.parallel:
1915 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1916 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK]]
1917 // CHECK4:       .exit:
1918 // CHECK4-NEXT:    ret void
1919 //
1920 //
1921 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58
1922 // CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR1]] {
1923 // CHECK4-NEXT:  entry:
1924 // CHECK4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1925 // CHECK4-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
1926 // CHECK4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1927 // CHECK4-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1928 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1929 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1930 // CHECK4-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1931 // CHECK4-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1932 // CHECK4-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1933 // CHECK4:       .worker:
1934 // CHECK4-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]]
1935 // CHECK4-NEXT:    br label [[DOTEXIT:%.*]]
1936 // CHECK4:       .mastercheck:
1937 // CHECK4-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1938 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1939 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1940 // CHECK4-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1941 // CHECK4-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1942 // CHECK4-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1943 // CHECK4-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1944 // CHECK4-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1945 // CHECK4-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1946 // CHECK4:       .master:
1947 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1948 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1949 // CHECK4-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1950 // CHECK4-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1951 // CHECK4-NEXT:    call void @__kmpc_data_sharing_init_stack()
1952 // CHECK4-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
1953 // CHECK4-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
1954 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4
1955 // CHECK4-NEXT:    [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
1956 // CHECK4-NEXT:    store i32 [[TMP7]], i32* [[A7]], align 4
1957 // CHECK4-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1958 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
1959 // CHECK4-NEXT:    [[TMP10:%.*]] = bitcast i32* [[A7]] to i8*
1960 // CHECK4-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
1961 // CHECK4-NEXT:    [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1962 // CHECK4-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1)
1963 // CHECK4-NEXT:    [[TMP12:%.*]] = load i32, i32* [[A7]], align 4
1964 // CHECK4-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP12]], 1
1965 // CHECK4-NEXT:    store i32 [[INC]], i32* [[A7]], align 4
1966 // CHECK4-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
1967 // CHECK4-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1968 // CHECK4:       .termination.notifier:
1969 // CHECK4-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1970 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1971 // CHECK4-NEXT:    br label [[DOTEXIT]]
1972 // CHECK4:       .exit:
1973 // CHECK4-NEXT:    ret void
1974 //
1975 //
1976 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4
1977 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
1978 // CHECK4-NEXT:  entry:
1979 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1980 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1981 // CHECK4-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
1982 // CHECK4-NEXT:    [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
1983 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1984 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1985 // CHECK4-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
1986 // CHECK4-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
1987 // CHECK4-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
1988 // CHECK4-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1989 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1990 // CHECK4-NEXT:    store i32 0, i32* [[CRITICAL_COUNTER]], align 4
1991 // CHECK4-NEXT:    br label [[OMP_CRITICAL_LOOP:%.*]]
1992 // CHECK4:       omp.critical.loop:
1993 // CHECK4-NEXT:    [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
1994 // CHECK4-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
1995 // CHECK4-NEXT:    br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
1996 // CHECK4:       omp.critical.test:
1997 // CHECK4-NEXT:    [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
1998 // CHECK4-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
1999 // CHECK4-NEXT:    br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
2000 // CHECK4:       omp.critical.body:
2001 // CHECK4-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
2002 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
2003 // CHECK4-NEXT:    call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
2004 // CHECK4-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
2005 // CHECK4-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP8]], 1
2006 // CHECK4-NEXT:    store i32 [[INC]], i32* [[TMP0]], align 4
2007 // CHECK4-NEXT:    call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
2008 // CHECK4-NEXT:    br label [[OMP_CRITICAL_SYNC]]
2009 // CHECK4:       omp.critical.sync:
2010 // CHECK4-NEXT:    call void @__kmpc_syncwarp(i32 [[TMP1]])
2011 // CHECK4-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
2012 // CHECK4-NEXT:    store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
2013 // CHECK4-NEXT:    br label [[OMP_CRITICAL_LOOP]]
2014 // CHECK4:       omp.critical.exit:
2015 // CHECK4-NEXT:    ret void
2016 //
2017 //
2018 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
2019 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
2020 // CHECK4-NEXT:  entry:
2021 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
2022 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2023 // CHECK4-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2024 // CHECK4-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
2025 // CHECK4-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
2026 // CHECK4-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
2027 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2028 // CHECK4-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
2029 // CHECK4-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
2030 // CHECK4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
2031 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
2032 // CHECK4-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
2033 // CHECK4-NEXT:    call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
2034 // CHECK4-NEXT:    ret void
2035 //
2036 //
2037 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker
2038 // CHECK5-SAME: () #[[ATTR0:[0-9]+]] {
2039 // CHECK5-NEXT:  entry:
2040 // CHECK5-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
2041 // CHECK5-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
2042 // CHECK5-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
2043 // CHECK5-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
2044 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
2045 // CHECK5:       .await.work:
2046 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2047 // CHECK5-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
2048 // CHECK5-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
2049 // CHECK5-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
2050 // CHECK5-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2051 // CHECK5-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
2052 // CHECK5-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
2053 // CHECK5:       .select.workers:
2054 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
2055 // CHECK5-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
2056 // CHECK5-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
2057 // CHECK5:       .execute.parallel:
2058 // CHECK5-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
2059 // CHECK5-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2060 // CHECK5-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
2061 // CHECK5-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
2062 // CHECK5:       .execute.fn:
2063 // CHECK5-NEXT:    call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
2064 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
2065 // CHECK5:       .check.next:
2066 // CHECK5-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2067 // CHECK5-NEXT:    [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
2068 // CHECK5-NEXT:    br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
2069 // CHECK5:       .execute.fn2:
2070 // CHECK5-NEXT:    call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
2071 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
2072 // CHECK5:       .check.next3:
2073 // CHECK5-NEXT:    [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2074 // CHECK5-NEXT:    [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
2075 // CHECK5-NEXT:    br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]]
2076 // CHECK5:       .execute.fn5:
2077 // CHECK5-NEXT:    call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
2078 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
2079 // CHECK5:       .check.next6:
2080 // CHECK5-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
2081 // CHECK5-NEXT:    call void [[TMP8]](i16 0, i32 [[TMP4]])
2082 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
2083 // CHECK5:       .terminate.parallel:
2084 // CHECK5-NEXT:    call void @__kmpc_kernel_end_parallel()
2085 // CHECK5-NEXT:    br label [[DOTBARRIER_PARALLEL]]
2086 // CHECK5:       .barrier.parallel:
2087 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2088 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK]]
2089 // CHECK5:       .exit:
2090 // CHECK5-NEXT:    ret void
2091 //
2092 //
2093 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29
2094 // CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
2095 // CHECK5-NEXT:  entry:
2096 // CHECK5-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
2097 // CHECK5-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
2098 // CHECK5-NEXT:    [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4
2099 // CHECK5-NEXT:    [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4
2100 // CHECK5-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
2101 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2102 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2103 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2104 // CHECK5-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
2105 // CHECK5-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
2106 // CHECK5-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
2107 // CHECK5:       .worker:
2108 // CHECK5-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]]
2109 // CHECK5-NEXT:    br label [[DOTEXIT:%.*]]
2110 // CHECK5:       .mastercheck:
2111 // CHECK5-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2112 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2113 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2114 // CHECK5-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
2115 // CHECK5-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
2116 // CHECK5-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
2117 // CHECK5-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
2118 // CHECK5-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
2119 // CHECK5-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
2120 // CHECK5:       .master:
2121 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2122 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2123 // CHECK5-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
2124 // CHECK5-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
2125 // CHECK5-NEXT:    call void @__kmpc_data_sharing_init_stack()
2126 // CHECK5-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
2127 // CHECK5-NEXT:    [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
2128 // CHECK5-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0)
2129 // CHECK5-NEXT:    [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
2130 // CHECK5-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0)
2131 // CHECK5-NEXT:    [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8**
2132 // CHECK5-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0)
2133 // CHECK5-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4
2134 // CHECK5-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], 1
2135 // CHECK5-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
2136 // CHECK5-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
2137 // CHECK5:       .termination.notifier:
2138 // CHECK5-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
2139 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2140 // CHECK5-NEXT:    br label [[DOTEXIT]]
2141 // CHECK5:       .exit:
2142 // CHECK5-NEXT:    ret void
2143 //
2144 //
2145 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
2146 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
2147 // CHECK5-NEXT:  entry:
2148 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
2149 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
2150 // CHECK5-NEXT:    [[A:%.*]] = alloca i32, align 4
2151 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
2152 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
2153 // CHECK5-NEXT:    store i32 42, i32* [[A]], align 4
2154 // CHECK5-NEXT:    ret void
2155 //
2156 //
2157 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
2158 // CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
2159 // CHECK5-NEXT:  entry:
2160 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
2161 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2162 // CHECK5-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2163 // CHECK5-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
2164 // CHECK5-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
2165 // CHECK5-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
2166 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2167 // CHECK5-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
2168 // CHECK5-NEXT:    call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
2169 // CHECK5-NEXT:    ret void
2170 //
2171 //
2172 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1
2173 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
2174 // CHECK5-NEXT:  entry:
2175 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
2176 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
2177 // CHECK5-NEXT:    [[A:%.*]] = alloca i32, align 4
2178 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
2179 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
2180 // CHECK5-NEXT:    store i32 43, i32* [[A]], align 4
2181 // CHECK5-NEXT:    ret void
2182 //
2183 //
2184 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
2185 // CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
2186 // CHECK5-NEXT:  entry:
2187 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
2188 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2189 // CHECK5-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2190 // CHECK5-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
2191 // CHECK5-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
2192 // CHECK5-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
2193 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2194 // CHECK5-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
2195 // CHECK5-NEXT:    call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
2196 // CHECK5-NEXT:    ret void
2197 //
2198 //
2199 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2
2200 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
2201 // CHECK5-NEXT:  entry:
2202 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
2203 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
2204 // CHECK5-NEXT:    [[A:%.*]] = alloca i32, align 4
2205 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
2206 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
2207 // CHECK5-NEXT:    store i32 44, i32* [[A]], align 4
2208 // CHECK5-NEXT:    ret void
2209 //
2210 //
2211 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
2212 // CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
2213 // CHECK5-NEXT:  entry:
2214 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
2215 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2216 // CHECK5-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2217 // CHECK5-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
2218 // CHECK5-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
2219 // CHECK5-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
2220 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2221 // CHECK5-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
2222 // CHECK5-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
2223 // CHECK5-NEXT:    ret void
2224 //
2225 //
2226 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker
2227 // CHECK5-SAME: () #[[ATTR0]] {
2228 // CHECK5-NEXT:  entry:
2229 // CHECK5-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
2230 // CHECK5-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
2231 // CHECK5-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
2232 // CHECK5-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
2233 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
2234 // CHECK5:       .await.work:
2235 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2236 // CHECK5-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
2237 // CHECK5-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
2238 // CHECK5-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
2239 // CHECK5-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2240 // CHECK5-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
2241 // CHECK5-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
2242 // CHECK5:       .select.workers:
2243 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
2244 // CHECK5-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
2245 // CHECK5-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
2246 // CHECK5:       .execute.parallel:
2247 // CHECK5-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
2248 // CHECK5-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2249 // CHECK5-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
2250 // CHECK5-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
2251 // CHECK5:       .execute.fn:
2252 // CHECK5-NEXT:    call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
2253 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
2254 // CHECK5:       .check.next:
2255 // CHECK5-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
2256 // CHECK5-NEXT:    call void [[TMP6]](i16 0, i32 [[TMP4]])
2257 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
2258 // CHECK5:       .terminate.parallel:
2259 // CHECK5-NEXT:    call void @__kmpc_kernel_end_parallel()
2260 // CHECK5-NEXT:    br label [[DOTBARRIER_PARALLEL]]
2261 // CHECK5:       .barrier.parallel:
2262 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2263 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK]]
2264 // CHECK5:       .exit:
2265 // CHECK5-NEXT:    ret void
2266 //
2267 //
2268 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
2269 // CHECK5-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
2270 // CHECK5-NEXT:  entry:
2271 // CHECK5-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
2272 // CHECK5-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
2273 // CHECK5-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
2274 // CHECK5-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
2275 // CHECK5-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
2276 // CHECK5-NEXT:    store i32 [[N]], i32* [[N_ADDR]], align 4
2277 // CHECK5-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
2278 // CHECK5-NEXT:    store i32 [[AA]], i32* [[AA_ADDR]], align 4
2279 // CHECK5-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
2280 // CHECK5-NEXT:    [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
2281 // CHECK5-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
2282 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2283 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2284 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2285 // CHECK5-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
2286 // CHECK5-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
2287 // CHECK5-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
2288 // CHECK5:       .worker:
2289 // CHECK5-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]]
2290 // CHECK5-NEXT:    br label [[DOTEXIT:%.*]]
2291 // CHECK5:       .mastercheck:
2292 // CHECK5-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2293 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2294 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2295 // CHECK5-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
2296 // CHECK5-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
2297 // CHECK5-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
2298 // CHECK5-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
2299 // CHECK5-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
2300 // CHECK5-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
2301 // CHECK5:       .master:
2302 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2303 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2304 // CHECK5-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
2305 // CHECK5-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
2306 // CHECK5-NEXT:    call void @__kmpc_data_sharing_init_stack()
2307 // CHECK5-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
2308 // CHECK5-NEXT:    [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4
2309 // CHECK5-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000
2310 // CHECK5-NEXT:    [[TMP8:%.*]] = zext i1 [[CMP]] to i32
2311 // CHECK5-NEXT:    [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
2312 // CHECK5-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0)
2313 // CHECK5-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4
2314 // CHECK5-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP10]], 1
2315 // CHECK5-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
2316 // CHECK5-NEXT:    [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4
2317 // CHECK5-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP11]] to i32
2318 // CHECK5-NEXT:    [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
2319 // CHECK5-NEXT:    [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
2320 // CHECK5-NEXT:    store i16 [[CONV9]], i16* [[CONV]], align 4
2321 // CHECK5-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
2322 // CHECK5-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
2323 // CHECK5-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1
2324 // CHECK5-NEXT:    store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4
2325 // CHECK5-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
2326 // CHECK5:       .termination.notifier:
2327 // CHECK5-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
2328 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2329 // CHECK5-NEXT:    br label [[DOTEXIT]]
2330 // CHECK5:       .exit:
2331 // CHECK5-NEXT:    ret void
2332 //
2333 //
2334 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3
2335 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
2336 // CHECK5-NEXT:  entry:
2337 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
2338 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
2339 // CHECK5-NEXT:    [[A:%.*]] = alloca i32, align 4
2340 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
2341 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
2342 // CHECK5-NEXT:    store i32 45, i32* [[A]], align 4
2343 // CHECK5-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
2344 // CHECK5-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
2345 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
2346 // CHECK5-NEXT:    ret void
2347 //
2348 //
2349 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
2350 // CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
2351 // CHECK5-NEXT:  entry:
2352 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
2353 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2354 // CHECK5-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2355 // CHECK5-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
2356 // CHECK5-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
2357 // CHECK5-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
2358 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2359 // CHECK5-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
2360 // CHECK5-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
2361 // CHECK5-NEXT:    ret void
2362 //
2363 //
2364 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker
2365 // CHECK5-SAME: () #[[ATTR0]] {
2366 // CHECK5-NEXT:  entry:
2367 // CHECK5-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
2368 // CHECK5-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
2369 // CHECK5-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
2370 // CHECK5-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
2371 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
2372 // CHECK5:       .await.work:
2373 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2374 // CHECK5-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
2375 // CHECK5-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
2376 // CHECK5-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
2377 // CHECK5-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2378 // CHECK5-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
2379 // CHECK5-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
2380 // CHECK5:       .select.workers:
2381 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
2382 // CHECK5-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
2383 // CHECK5-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
2384 // CHECK5:       .execute.parallel:
2385 // CHECK5-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
2386 // CHECK5-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2387 // CHECK5-NEXT:    [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*)
2388 // CHECK5-NEXT:    br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
2389 // CHECK5:       .execute.fn:
2390 // CHECK5-NEXT:    call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
2391 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
2392 // CHECK5:       .check.next:
2393 // CHECK5-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
2394 // CHECK5-NEXT:    call void [[TMP6]](i16 0, i32 [[TMP4]])
2395 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL]]
2396 // CHECK5:       .terminate.parallel:
2397 // CHECK5-NEXT:    call void @__kmpc_kernel_end_parallel()
2398 // CHECK5-NEXT:    br label [[DOTBARRIER_PARALLEL]]
2399 // CHECK5:       .barrier.parallel:
2400 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2401 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK]]
2402 // CHECK5:       .exit:
2403 // CHECK5-NEXT:    ret void
2404 //
2405 //
2406 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58
2407 // CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1]] {
2408 // CHECK5-NEXT:  entry:
2409 // CHECK5-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
2410 // CHECK5-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
2411 // CHECK5-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
2412 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2413 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2414 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2415 // CHECK5-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
2416 // CHECK5-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
2417 // CHECK5-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
2418 // CHECK5:       .worker:
2419 // CHECK5-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]]
2420 // CHECK5-NEXT:    br label [[DOTEXIT:%.*]]
2421 // CHECK5:       .mastercheck:
2422 // CHECK5-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2423 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2424 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2425 // CHECK5-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
2426 // CHECK5-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
2427 // CHECK5-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
2428 // CHECK5-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
2429 // CHECK5-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
2430 // CHECK5-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
2431 // CHECK5:       .master:
2432 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2433 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2434 // CHECK5-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
2435 // CHECK5-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
2436 // CHECK5-NEXT:    call void @__kmpc_data_sharing_init_stack()
2437 // CHECK5-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
2438 // CHECK5-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
2439 // CHECK5-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4
2440 // CHECK5-NEXT:    [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
2441 // CHECK5-NEXT:    store i32 [[TMP7]], i32* [[A7]], align 4
2442 // CHECK5-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
2443 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
2444 // CHECK5-NEXT:    [[TMP10:%.*]] = bitcast i32* [[A7]] to i8*
2445 // CHECK5-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
2446 // CHECK5-NEXT:    [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
2447 // CHECK5-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1)
2448 // CHECK5-NEXT:    [[TMP12:%.*]] = load i32, i32* [[A7]], align 4
2449 // CHECK5-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP12]], 1
2450 // CHECK5-NEXT:    store i32 [[INC]], i32* [[A7]], align 4
2451 // CHECK5-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
2452 // CHECK5-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
2453 // CHECK5:       .termination.notifier:
2454 // CHECK5-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
2455 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2456 // CHECK5-NEXT:    br label [[DOTEXIT]]
2457 // CHECK5:       .exit:
2458 // CHECK5-NEXT:    ret void
2459 //
2460 //
2461 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4
2462 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
2463 // CHECK5-NEXT:  entry:
2464 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
2465 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
2466 // CHECK5-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
2467 // CHECK5-NEXT:    [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
2468 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
2469 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
2470 // CHECK5-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
2471 // CHECK5-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
2472 // CHECK5-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
2473 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2474 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2475 // CHECK5-NEXT:    store i32 0, i32* [[CRITICAL_COUNTER]], align 4
2476 // CHECK5-NEXT:    br label [[OMP_CRITICAL_LOOP:%.*]]
2477 // CHECK5:       omp.critical.loop:
2478 // CHECK5-NEXT:    [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
2479 // CHECK5-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
2480 // CHECK5-NEXT:    br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
2481 // CHECK5:       omp.critical.test:
2482 // CHECK5-NEXT:    [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
2483 // CHECK5-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
2484 // CHECK5-NEXT:    br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
2485 // CHECK5:       omp.critical.body:
2486 // CHECK5-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
2487 // CHECK5-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
2488 // CHECK5-NEXT:    call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
2489 // CHECK5-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
2490 // CHECK5-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP8]], 1
2491 // CHECK5-NEXT:    store i32 [[INC]], i32* [[TMP0]], align 4
2492 // CHECK5-NEXT:    call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
2493 // CHECK5-NEXT:    br label [[OMP_CRITICAL_SYNC]]
2494 // CHECK5:       omp.critical.sync:
2495 // CHECK5-NEXT:    call void @__kmpc_syncwarp(i32 [[TMP1]])
2496 // CHECK5-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
2497 // CHECK5-NEXT:    store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
2498 // CHECK5-NEXT:    br label [[OMP_CRITICAL_LOOP]]
2499 // CHECK5:       omp.critical.exit:
2500 // CHECK5-NEXT:    ret void
2501 //
2502 //
2503 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
2504 // CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
2505 // CHECK5-NEXT:  entry:
2506 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
2507 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2508 // CHECK5-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2509 // CHECK5-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
2510 // CHECK5-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
2511 // CHECK5-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
2512 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2513 // CHECK5-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
2514 // CHECK5-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
2515 // CHECK5-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
2516 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
2517 // CHECK5-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
2518 // CHECK5-NEXT:    call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
2519 // CHECK5-NEXT:    ret void
2520 //
2521