1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
2 // Test target codegen - host bc file has to be created first.
3 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
4 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1
5 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2
6 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
7 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3
8 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK4
9 // expected-no-diagnostics
10 #ifndef HEADER
11 #define HEADER
12 
13 #ifdef CK1
14 
15 template <typename T>
16 int tmain(T argc) {
17 #pragma omp target
18 #pragma omp teams
19   argc = 0;
20   return 0;
21 }
22 
23 
24 int main (int argc, char **argv) {
25 #pragma omp target
26 #pragma omp teams
27   {
28   argc = 0;
29   }
30   return tmain(argv);
31 }
32 
33 
34 // only nvptx side: do not outline teams region and do not call fork_teams
35 
36 
37 // target region in template
38 
39 
40 
41 #endif // CK1
42 
43 // Test target codegen - host bc file has to be created first.
44 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
45 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK5
46 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK6
47 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
48 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK7
49 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK8
50 // expected-no-diagnostics
51 #ifdef CK2
52 
53 template <typename T>
54 int tmain(T argc) {
55   int a = 10;
56   int b = 5;
57 #pragma omp target
58 #pragma omp teams num_teams(a) thread_limit(b)
59   {
60   argc = 0;
61   }
62   return 0;
63 }
64 
65 int main (int argc, char **argv) {
66   int a = 20;
67   int b = 5;
68 #pragma omp target
69 #pragma omp teams num_teams(a) thread_limit(b)
70   {
71   argc = 0;
72   }
73   return tmain(argv);
74 }
75 
76 
77 
78 
79 
80 
81 #endif // CK2
82 #endif
83 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker
84 // CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
85 // CHECK1-NEXT:  entry:
86 // CHECK1-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
87 // CHECK1-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
88 // CHECK1-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
89 // CHECK1-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
90 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
91 // CHECK1:       .await.work:
92 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
93 // CHECK1-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
94 // CHECK1-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
95 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
96 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
97 // CHECK1-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
98 // CHECK1-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
99 // CHECK1:       .select.workers:
100 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
101 // CHECK1-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
102 // CHECK1-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
103 // CHECK1:       .execute.parallel:
104 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
105 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
106 // CHECK1-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
107 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
108 // CHECK1:       .terminate.parallel:
109 // CHECK1-NEXT:    call void @__kmpc_kernel_end_parallel()
110 // CHECK1-NEXT:    br label [[DOTBARRIER_PARALLEL]]
111 // CHECK1:       .barrier.parallel:
112 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
113 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK]]
114 // CHECK1:       .exit:
115 // CHECK1-NEXT:    ret void
116 //
117 //
118 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25
119 // CHECK1-SAME: (i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
120 // CHECK1-NEXT:  entry:
121 // CHECK1-NEXT:    [[ARGC_ADDR:%.*]] = alloca i64, align 8
122 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
123 // CHECK1-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
124 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
125 // CHECK1-NEXT:    store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
126 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
127 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
128 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
129 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
130 // CHECK1-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
131 // CHECK1-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
132 // CHECK1-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
133 // CHECK1:       .worker:
134 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR3:[0-9]+]]
135 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
136 // CHECK1:       .mastercheck:
137 // CHECK1-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
138 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
139 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
140 // CHECK1-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
141 // CHECK1-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
142 // CHECK1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
143 // CHECK1-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
144 // CHECK1-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
145 // CHECK1-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
146 // CHECK1:       .master:
147 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
148 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
149 // CHECK1-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
150 // CHECK1-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
151 // CHECK1-NEXT:    call void @__kmpc_data_sharing_init_stack()
152 // CHECK1-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
153 // CHECK1-NEXT:    [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8
154 // CHECK1-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
155 // CHECK1-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
156 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0
157 // CHECK1-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty*
158 // CHECK1-NEXT:    [[TMP10:%.*]] = load i32, i32* [[CONV]], align 8
159 // CHECK1-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0
160 // CHECK1-NEXT:    store i32 [[TMP10]], i32* [[ARGC7]], align 4
161 // CHECK1-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
162 // CHECK1-NEXT:    store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
163 // CHECK1-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]]
164 // CHECK1-NEXT:    [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
165 // CHECK1-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
166 // CHECK1-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
167 // CHECK1:       .termination.notifier:
168 // CHECK1-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
169 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
170 // CHECK1-NEXT:    br label [[DOTEXIT]]
171 // CHECK1:       .exit:
172 // CHECK1-NEXT:    ret void
173 //
174 //
175 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
176 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
177 // CHECK1-NEXT:  entry:
178 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
179 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
180 // CHECK1-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 8
181 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
182 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
183 // CHECK1-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
184 // CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
185 // CHECK1-NEXT:    store i32 0, i32* [[TMP0]], align 4
186 // CHECK1-NEXT:    ret void
187 //
188 //
189 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker
190 // CHECK1-SAME: () #[[ATTR0]] {
191 // CHECK1-NEXT:  entry:
192 // CHECK1-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
193 // CHECK1-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
194 // CHECK1-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
195 // CHECK1-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
196 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
197 // CHECK1:       .await.work:
198 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
199 // CHECK1-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
200 // CHECK1-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
201 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
202 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
203 // CHECK1-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
204 // CHECK1-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
205 // CHECK1:       .select.workers:
206 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
207 // CHECK1-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
208 // CHECK1-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
209 // CHECK1:       .execute.parallel:
210 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
211 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
212 // CHECK1-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
213 // CHECK1-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
214 // CHECK1:       .terminate.parallel:
215 // CHECK1-NEXT:    call void @__kmpc_kernel_end_parallel()
216 // CHECK1-NEXT:    br label [[DOTBARRIER_PARALLEL]]
217 // CHECK1:       .barrier.parallel:
218 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
219 // CHECK1-NEXT:    br label [[DOTAWAIT_WORK]]
220 // CHECK1:       .exit:
221 // CHECK1-NEXT:    ret void
222 //
223 //
224 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17
225 // CHECK1-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] {
226 // CHECK1-NEXT:  entry:
227 // CHECK1-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 8
228 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
229 // CHECK1-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
230 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
231 // CHECK1-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8
232 // CHECK1-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
233 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
234 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
235 // CHECK1-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
236 // CHECK1-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
237 // CHECK1-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
238 // CHECK1:       .worker:
239 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker() #[[ATTR3]]
240 // CHECK1-NEXT:    br label [[DOTEXIT:%.*]]
241 // CHECK1:       .mastercheck:
242 // CHECK1-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
243 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
244 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
245 // CHECK1-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
246 // CHECK1-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
247 // CHECK1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
248 // CHECK1-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
249 // CHECK1-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
250 // CHECK1-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
251 // CHECK1:       .master:
252 // CHECK1-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
253 // CHECK1-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
254 // CHECK1-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
255 // CHECK1-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
256 // CHECK1-NEXT:    call void @__kmpc_data_sharing_init_stack()
257 // CHECK1-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
258 // CHECK1-NEXT:    [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size2", align 8
259 // CHECK1-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
260 // CHECK1-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
261 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0
262 // CHECK1-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0*
263 // CHECK1-NEXT:    [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8
264 // CHECK1-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0
265 // CHECK1-NEXT:    store i8** [[TMP10]], i8*** [[ARGC7]], align 8
266 // CHECK1-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
267 // CHECK1-NEXT:    store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
268 // CHECK1-NEXT:    call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]]
269 // CHECK1-NEXT:    [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
270 // CHECK1-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
271 // CHECK1-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
272 // CHECK1:       .termination.notifier:
273 // CHECK1-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
274 // CHECK1-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
275 // CHECK1-NEXT:    br label [[DOTEXIT]]
276 // CHECK1:       .exit:
277 // CHECK1-NEXT:    ret void
278 //
279 //
280 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3
281 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] {
282 // CHECK1-NEXT:  entry:
283 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
284 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
285 // CHECK1-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 8
286 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
287 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
288 // CHECK1-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8
289 // CHECK1-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8
290 // CHECK1-NEXT:    store i8** null, i8*** [[TMP0]], align 8
291 // CHECK1-NEXT:    ret void
292 //
293 //
294 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker
295 // CHECK2-SAME: () #[[ATTR0:[0-9]+]] {
296 // CHECK2-NEXT:  entry:
297 // CHECK2-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
298 // CHECK2-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
299 // CHECK2-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
300 // CHECK2-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
301 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
302 // CHECK2:       .await.work:
303 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
304 // CHECK2-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
305 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
306 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
307 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
308 // CHECK2-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
309 // CHECK2-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
310 // CHECK2:       .select.workers:
311 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
312 // CHECK2-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
313 // CHECK2-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
314 // CHECK2:       .execute.parallel:
315 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
316 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
317 // CHECK2-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
318 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
319 // CHECK2:       .terminate.parallel:
320 // CHECK2-NEXT:    call void @__kmpc_kernel_end_parallel()
321 // CHECK2-NEXT:    br label [[DOTBARRIER_PARALLEL]]
322 // CHECK2:       .barrier.parallel:
323 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
324 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK]]
325 // CHECK2:       .exit:
326 // CHECK2-NEXT:    ret void
327 //
328 //
329 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25
330 // CHECK2-SAME: (i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
331 // CHECK2-NEXT:  entry:
332 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i64, align 8
333 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
334 // CHECK2-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
335 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
336 // CHECK2-NEXT:    store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
337 // CHECK2-NEXT:    [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
338 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
339 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
340 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
341 // CHECK2-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
342 // CHECK2-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
343 // CHECK2-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
344 // CHECK2:       .worker:
345 // CHECK2-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR3:[0-9]+]]
346 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
347 // CHECK2:       .mastercheck:
348 // CHECK2-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
349 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
350 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
351 // CHECK2-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
352 // CHECK2-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
353 // CHECK2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
354 // CHECK2-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
355 // CHECK2-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
356 // CHECK2-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
357 // CHECK2:       .master:
358 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
359 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
360 // CHECK2-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
361 // CHECK2-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
362 // CHECK2-NEXT:    call void @__kmpc_data_sharing_init_stack()
363 // CHECK2-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1)
364 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
365 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8
366 // CHECK2-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
367 // CHECK2-NEXT:    store i32 [[TMP7]], i32* [[ARGC7]], align 4
368 // CHECK2-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
369 // CHECK2-NEXT:    store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
370 // CHECK2-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]]
371 // CHECK2-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
372 // CHECK2-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
373 // CHECK2:       .termination.notifier:
374 // CHECK2-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
375 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
376 // CHECK2-NEXT:    br label [[DOTEXIT]]
377 // CHECK2:       .exit:
378 // CHECK2-NEXT:    ret void
379 //
380 //
381 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
382 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
383 // CHECK2-NEXT:  entry:
384 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
385 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
386 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 8
387 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
388 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
389 // CHECK2-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
390 // CHECK2-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
391 // CHECK2-NEXT:    store i32 0, i32* [[TMP0]], align 4
392 // CHECK2-NEXT:    ret void
393 //
394 //
395 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker
396 // CHECK2-SAME: () #[[ATTR0]] {
397 // CHECK2-NEXT:  entry:
398 // CHECK2-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
399 // CHECK2-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
400 // CHECK2-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
401 // CHECK2-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
402 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
403 // CHECK2:       .await.work:
404 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
405 // CHECK2-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
406 // CHECK2-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
407 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
408 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
409 // CHECK2-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
410 // CHECK2-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
411 // CHECK2:       .select.workers:
412 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
413 // CHECK2-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
414 // CHECK2-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
415 // CHECK2:       .execute.parallel:
416 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
417 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
418 // CHECK2-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
419 // CHECK2-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
420 // CHECK2:       .terminate.parallel:
421 // CHECK2-NEXT:    call void @__kmpc_kernel_end_parallel()
422 // CHECK2-NEXT:    br label [[DOTBARRIER_PARALLEL]]
423 // CHECK2:       .barrier.parallel:
424 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
425 // CHECK2-NEXT:    br label [[DOTAWAIT_WORK]]
426 // CHECK2:       .exit:
427 // CHECK2-NEXT:    ret void
428 //
429 //
430 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17
431 // CHECK2-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] {
432 // CHECK2-NEXT:  entry:
433 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 8
434 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
435 // CHECK2-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
436 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
437 // CHECK2-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8
438 // CHECK2-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
439 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
440 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
441 // CHECK2-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
442 // CHECK2-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
443 // CHECK2-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
444 // CHECK2:       .worker:
445 // CHECK2-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker() #[[ATTR3]]
446 // CHECK2-NEXT:    br label [[DOTEXIT:%.*]]
447 // CHECK2:       .mastercheck:
448 // CHECK2-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
449 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
450 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
451 // CHECK2-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
452 // CHECK2-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
453 // CHECK2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
454 // CHECK2-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
455 // CHECK2-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
456 // CHECK2-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
457 // CHECK2:       .master:
458 // CHECK2-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
459 // CHECK2-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
460 // CHECK2-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
461 // CHECK2-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
462 // CHECK2-NEXT:    call void @__kmpc_data_sharing_init_stack()
463 // CHECK2-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1)
464 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
465 // CHECK2-NEXT:    [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8
466 // CHECK2-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
467 // CHECK2-NEXT:    store i8** [[TMP7]], i8*** [[ARGC7]], align 8
468 // CHECK2-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
469 // CHECK2-NEXT:    store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
470 // CHECK2-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]]
471 // CHECK2-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
472 // CHECK2-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
473 // CHECK2:       .termination.notifier:
474 // CHECK2-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
475 // CHECK2-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
476 // CHECK2-NEXT:    br label [[DOTEXIT]]
477 // CHECK2:       .exit:
478 // CHECK2-NEXT:    ret void
479 //
480 //
481 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
482 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] {
483 // CHECK2-NEXT:  entry:
484 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
485 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
486 // CHECK2-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 8
487 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
488 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
489 // CHECK2-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8
490 // CHECK2-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8
491 // CHECK2-NEXT:    store i8** null, i8*** [[TMP0]], align 8
492 // CHECK2-NEXT:    ret void
493 //
494 //
495 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker
496 // CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
497 // CHECK3-NEXT:  entry:
498 // CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
499 // CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
500 // CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
501 // CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
502 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
503 // CHECK3:       .await.work:
504 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
505 // CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
506 // CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
507 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
508 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
509 // CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
510 // CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
511 // CHECK3:       .select.workers:
512 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
513 // CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
514 // CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
515 // CHECK3:       .execute.parallel:
516 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
517 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
518 // CHECK3-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
519 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
520 // CHECK3:       .terminate.parallel:
521 // CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
522 // CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
523 // CHECK3:       .barrier.parallel:
524 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
525 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
526 // CHECK3:       .exit:
527 // CHECK3-NEXT:    ret void
528 //
529 //
530 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25
531 // CHECK3-SAME: (i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
532 // CHECK3-NEXT:  entry:
533 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
534 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
535 // CHECK3-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
536 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
537 // CHECK3-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
538 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
539 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
540 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
541 // CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
542 // CHECK3-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
543 // CHECK3-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
544 // CHECK3:       .worker:
545 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR3:[0-9]+]]
546 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
547 // CHECK3:       .mastercheck:
548 // CHECK3-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
549 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
550 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
551 // CHECK3-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
552 // CHECK3-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
553 // CHECK3-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
554 // CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
555 // CHECK3-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
556 // CHECK3-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
557 // CHECK3:       .master:
558 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
559 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
560 // CHECK3-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
561 // CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
562 // CHECK3-NEXT:    call void @__kmpc_data_sharing_init_stack()
563 // CHECK3-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
564 // CHECK3-NEXT:    [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
565 // CHECK3-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
566 // CHECK3-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
567 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
568 // CHECK3-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty*
569 // CHECK3-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
570 // CHECK3-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0
571 // CHECK3-NEXT:    store i32 [[TMP10]], i32* [[ARGC7]], align 4
572 // CHECK3-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
573 // CHECK3-NEXT:    store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
574 // CHECK3-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]]
575 // CHECK3-NEXT:    [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
576 // CHECK3-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
577 // CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
578 // CHECK3:       .termination.notifier:
579 // CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
580 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
581 // CHECK3-NEXT:    br label [[DOTEXIT]]
582 // CHECK3:       .exit:
583 // CHECK3-NEXT:    ret void
584 //
585 //
586 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
587 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
588 // CHECK3-NEXT:  entry:
589 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
590 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
591 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 4
592 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
593 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
594 // CHECK3-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
595 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
596 // CHECK3-NEXT:    store i32 0, i32* [[TMP0]], align 4
597 // CHECK3-NEXT:    ret void
598 //
599 //
600 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker
601 // CHECK3-SAME: () #[[ATTR0]] {
602 // CHECK3-NEXT:  entry:
603 // CHECK3-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
604 // CHECK3-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
605 // CHECK3-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
606 // CHECK3-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
607 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
608 // CHECK3:       .await.work:
609 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
610 // CHECK3-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
611 // CHECK3-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
612 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
613 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
614 // CHECK3-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
615 // CHECK3-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
616 // CHECK3:       .select.workers:
617 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
618 // CHECK3-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
619 // CHECK3-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
620 // CHECK3:       .execute.parallel:
621 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
622 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
623 // CHECK3-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
624 // CHECK3-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
625 // CHECK3:       .terminate.parallel:
626 // CHECK3-NEXT:    call void @__kmpc_kernel_end_parallel()
627 // CHECK3-NEXT:    br label [[DOTBARRIER_PARALLEL]]
628 // CHECK3:       .barrier.parallel:
629 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
630 // CHECK3-NEXT:    br label [[DOTAWAIT_WORK]]
631 // CHECK3:       .exit:
632 // CHECK3-NEXT:    ret void
633 //
634 //
635 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17
636 // CHECK3-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] {
637 // CHECK3-NEXT:  entry:
638 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 4
639 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
640 // CHECK3-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
641 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
642 // CHECK3-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4
643 // CHECK3-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
644 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
645 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
646 // CHECK3-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
647 // CHECK3-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
648 // CHECK3-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
649 // CHECK3:       .worker:
650 // CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker() #[[ATTR3]]
651 // CHECK3-NEXT:    br label [[DOTEXIT:%.*]]
652 // CHECK3:       .mastercheck:
653 // CHECK3-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
654 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
655 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
656 // CHECK3-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
657 // CHECK3-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
658 // CHECK3-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
659 // CHECK3-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
660 // CHECK3-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
661 // CHECK3-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
662 // CHECK3:       .master:
663 // CHECK3-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
664 // CHECK3-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
665 // CHECK3-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
666 // CHECK3-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
667 // CHECK3-NEXT:    call void @__kmpc_data_sharing_init_stack()
668 // CHECK3-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
669 // CHECK3-NEXT:    [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4
670 // CHECK3-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
671 // CHECK3-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
672 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
673 // CHECK3-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0*
674 // CHECK3-NEXT:    [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
675 // CHECK3-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0
676 // CHECK3-NEXT:    store i8** [[TMP10]], i8*** [[ARGC7]], align 4
677 // CHECK3-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
678 // CHECK3-NEXT:    store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
679 // CHECK3-NEXT:    call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]]
680 // CHECK3-NEXT:    [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
681 // CHECK3-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
682 // CHECK3-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
683 // CHECK3:       .termination.notifier:
684 // CHECK3-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
685 // CHECK3-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
686 // CHECK3-NEXT:    br label [[DOTEXIT]]
687 // CHECK3:       .exit:
688 // CHECK3-NEXT:    ret void
689 //
690 //
691 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3
692 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
693 // CHECK3-NEXT:  entry:
694 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
695 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
696 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 4
697 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
698 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
699 // CHECK3-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4
700 // CHECK3-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4
701 // CHECK3-NEXT:    store i8** null, i8*** [[TMP0]], align 4
702 // CHECK3-NEXT:    ret void
703 //
704 //
705 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker
706 // CHECK4-SAME: () #[[ATTR0:[0-9]+]] {
707 // CHECK4-NEXT:  entry:
708 // CHECK4-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
709 // CHECK4-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
710 // CHECK4-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
711 // CHECK4-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
712 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
713 // CHECK4:       .await.work:
714 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
715 // CHECK4-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
716 // CHECK4-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
717 // CHECK4-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
718 // CHECK4-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
719 // CHECK4-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
720 // CHECK4-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
721 // CHECK4:       .select.workers:
722 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
723 // CHECK4-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
724 // CHECK4-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
725 // CHECK4:       .execute.parallel:
726 // CHECK4-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
727 // CHECK4-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
728 // CHECK4-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
729 // CHECK4-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
730 // CHECK4:       .terminate.parallel:
731 // CHECK4-NEXT:    call void @__kmpc_kernel_end_parallel()
732 // CHECK4-NEXT:    br label [[DOTBARRIER_PARALLEL]]
733 // CHECK4:       .barrier.parallel:
734 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
735 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK]]
736 // CHECK4:       .exit:
737 // CHECK4-NEXT:    ret void
738 //
739 //
740 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25
741 // CHECK4-SAME: (i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
742 // CHECK4-NEXT:  entry:
743 // CHECK4-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
744 // CHECK4-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
745 // CHECK4-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
746 // CHECK4-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
747 // CHECK4-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
748 // CHECK4-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
749 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
750 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
751 // CHECK4-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
752 // CHECK4-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
753 // CHECK4-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
754 // CHECK4:       .worker:
755 // CHECK4-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR3:[0-9]+]]
756 // CHECK4-NEXT:    br label [[DOTEXIT:%.*]]
757 // CHECK4:       .mastercheck:
758 // CHECK4-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
759 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
760 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
761 // CHECK4-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
762 // CHECK4-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
763 // CHECK4-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
764 // CHECK4-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
765 // CHECK4-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
766 // CHECK4-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
767 // CHECK4:       .master:
768 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
769 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
770 // CHECK4-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
771 // CHECK4-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
772 // CHECK4-NEXT:    call void @__kmpc_data_sharing_init_stack()
773 // CHECK4-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
774 // CHECK4-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
775 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
776 // CHECK4-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
777 // CHECK4-NEXT:    store i32 [[TMP7]], i32* [[ARGC7]], align 4
778 // CHECK4-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
779 // CHECK4-NEXT:    store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
780 // CHECK4-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]]
781 // CHECK4-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
782 // CHECK4-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
783 // CHECK4:       .termination.notifier:
784 // CHECK4-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
785 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
786 // CHECK4-NEXT:    br label [[DOTEXIT]]
787 // CHECK4:       .exit:
788 // CHECK4-NEXT:    ret void
789 //
790 //
791 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
792 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
793 // CHECK4-NEXT:  entry:
794 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
795 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
796 // CHECK4-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 4
797 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
798 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
799 // CHECK4-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
800 // CHECK4-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
801 // CHECK4-NEXT:    store i32 0, i32* [[TMP0]], align 4
802 // CHECK4-NEXT:    ret void
803 //
804 //
805 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker
806 // CHECK4-SAME: () #[[ATTR0]] {
807 // CHECK4-NEXT:  entry:
808 // CHECK4-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
809 // CHECK4-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
810 // CHECK4-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
811 // CHECK4-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
812 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
813 // CHECK4:       .await.work:
814 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
815 // CHECK4-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
816 // CHECK4-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
817 // CHECK4-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
818 // CHECK4-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
819 // CHECK4-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
820 // CHECK4-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
821 // CHECK4:       .select.workers:
822 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
823 // CHECK4-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
824 // CHECK4-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
825 // CHECK4:       .execute.parallel:
826 // CHECK4-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
827 // CHECK4-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
828 // CHECK4-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
829 // CHECK4-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
830 // CHECK4:       .terminate.parallel:
831 // CHECK4-NEXT:    call void @__kmpc_kernel_end_parallel()
832 // CHECK4-NEXT:    br label [[DOTBARRIER_PARALLEL]]
833 // CHECK4:       .barrier.parallel:
834 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
835 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK]]
836 // CHECK4:       .exit:
837 // CHECK4-NEXT:    ret void
838 //
839 //
840 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17
841 // CHECK4-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] {
842 // CHECK4-NEXT:  entry:
843 // CHECK4-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 4
844 // CHECK4-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
845 // CHECK4-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
846 // CHECK4-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
847 // CHECK4-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4
848 // CHECK4-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
849 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
850 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
851 // CHECK4-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
852 // CHECK4-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
853 // CHECK4-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
854 // CHECK4:       .worker:
855 // CHECK4-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker() #[[ATTR3]]
856 // CHECK4-NEXT:    br label [[DOTEXIT:%.*]]
857 // CHECK4:       .mastercheck:
858 // CHECK4-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
859 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
860 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
861 // CHECK4-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
862 // CHECK4-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
863 // CHECK4-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
864 // CHECK4-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
865 // CHECK4-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
866 // CHECK4-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
867 // CHECK4:       .master:
868 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
869 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
870 // CHECK4-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
871 // CHECK4-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
872 // CHECK4-NEXT:    call void @__kmpc_data_sharing_init_stack()
873 // CHECK4-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
874 // CHECK4-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
875 // CHECK4-NEXT:    [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
876 // CHECK4-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
877 // CHECK4-NEXT:    store i8** [[TMP7]], i8*** [[ARGC7]], align 4
878 // CHECK4-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
879 // CHECK4-NEXT:    store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
880 // CHECK4-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]]
881 // CHECK4-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
882 // CHECK4-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
883 // CHECK4:       .termination.notifier:
884 // CHECK4-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
885 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
886 // CHECK4-NEXT:    br label [[DOTEXIT]]
887 // CHECK4:       .exit:
888 // CHECK4-NEXT:    ret void
889 //
890 //
891 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1
892 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
893 // CHECK4-NEXT:  entry:
894 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
895 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
896 // CHECK4-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 4
897 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
898 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
899 // CHECK4-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4
900 // CHECK4-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4
901 // CHECK4-NEXT:    store i8** null, i8*** [[TMP0]], align 4
902 // CHECK4-NEXT:    ret void
903 //
904 //
905 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker
906 // CHECK5-SAME: () #[[ATTR0:[0-9]+]] {
907 // CHECK5-NEXT:  entry:
908 // CHECK5-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
909 // CHECK5-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
910 // CHECK5-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
911 // CHECK5-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
912 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
913 // CHECK5:       .await.work:
914 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
915 // CHECK5-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
916 // CHECK5-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
917 // CHECK5-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
918 // CHECK5-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
919 // CHECK5-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
920 // CHECK5-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
921 // CHECK5:       .select.workers:
922 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
923 // CHECK5-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
924 // CHECK5-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
925 // CHECK5:       .execute.parallel:
926 // CHECK5-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
927 // CHECK5-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
928 // CHECK5-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
929 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
930 // CHECK5:       .terminate.parallel:
931 // CHECK5-NEXT:    call void @__kmpc_kernel_end_parallel()
932 // CHECK5-NEXT:    br label [[DOTBARRIER_PARALLEL]]
933 // CHECK5:       .barrier.parallel:
934 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
935 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK]]
936 // CHECK5:       .exit:
937 // CHECK5-NEXT:    ret void
938 //
939 //
940 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68
941 // CHECK5-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
942 // CHECK5-NEXT:  entry:
943 // CHECK5-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
944 // CHECK5-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
945 // CHECK5-NEXT:    [[ARGC_ADDR:%.*]] = alloca i64, align 8
946 // CHECK5-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
947 // CHECK5-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
948 // CHECK5-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
949 // CHECK5-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
950 // CHECK5-NEXT:    store i64 [[B]], i64* [[B_ADDR]], align 8
951 // CHECK5-NEXT:    store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
952 // CHECK5-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
953 // CHECK5-NEXT:    [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
954 // CHECK5-NEXT:    [[CONV2:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
955 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
956 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
957 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
958 // CHECK5-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
959 // CHECK5-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
960 // CHECK5-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
961 // CHECK5:       .worker:
962 // CHECK5-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]]
963 // CHECK5-NEXT:    br label [[DOTEXIT:%.*]]
964 // CHECK5:       .mastercheck:
965 // CHECK5-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
966 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
967 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
968 // CHECK5-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
969 // CHECK5-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
970 // CHECK5-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
971 // CHECK5-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
972 // CHECK5-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
973 // CHECK5-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
974 // CHECK5:       .master:
975 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
976 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
977 // CHECK5-NEXT:    [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
978 // CHECK5-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
979 // CHECK5-NEXT:    call void @__kmpc_data_sharing_init_stack()
980 // CHECK5-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
981 // CHECK5-NEXT:    [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8
982 // CHECK5-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
983 // CHECK5-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
984 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0
985 // CHECK5-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty*
986 // CHECK5-NEXT:    [[TMP10:%.*]] = load i32, i32* [[CONV2]], align 8
987 // CHECK5-NEXT:    [[ARGC9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0
988 // CHECK5-NEXT:    store i32 [[TMP10]], i32* [[ARGC9]], align 4
989 // CHECK5-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
990 // CHECK5-NEXT:    store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
991 // CHECK5-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC9]]) #[[ATTR3]]
992 // CHECK5-NEXT:    [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
993 // CHECK5-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
994 // CHECK5-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
995 // CHECK5:       .termination.notifier:
996 // CHECK5-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
997 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
998 // CHECK5-NEXT:    br label [[DOTEXIT]]
999 // CHECK5:       .exit:
1000 // CHECK5-NEXT:    ret void
1001 //
1002 //
1003 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
1004 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
1005 // CHECK5-NEXT:  entry:
1006 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
1007 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
1008 // CHECK5-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 8
1009 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
1010 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
1011 // CHECK5-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
1012 // CHECK5-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
1013 // CHECK5-NEXT:    store i32 0, i32* [[TMP0]], align 4
1014 // CHECK5-NEXT:    ret void
1015 //
1016 //
1017 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker
1018 // CHECK5-SAME: () #[[ATTR0]] {
1019 // CHECK5-NEXT:  entry:
1020 // CHECK5-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
1021 // CHECK5-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1022 // CHECK5-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
1023 // CHECK5-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1024 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1025 // CHECK5:       .await.work:
1026 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1027 // CHECK5-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1028 // CHECK5-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1029 // CHECK5-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1030 // CHECK5-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
1031 // CHECK5-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1032 // CHECK5-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1033 // CHECK5:       .select.workers:
1034 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1035 // CHECK5-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1036 // CHECK5-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1037 // CHECK5:       .execute.parallel:
1038 // CHECK5-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1039 // CHECK5-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1040 // CHECK5-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1041 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1042 // CHECK5:       .terminate.parallel:
1043 // CHECK5-NEXT:    call void @__kmpc_kernel_end_parallel()
1044 // CHECK5-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1045 // CHECK5:       .barrier.parallel:
1046 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1047 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK]]
1048 // CHECK5:       .exit:
1049 // CHECK5-NEXT:    ret void
1050 //
1051 //
1052 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57
1053 // CHECK5-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] {
1054 // CHECK5-NEXT:  entry:
1055 // CHECK5-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
1056 // CHECK5-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
1057 // CHECK5-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 8
1058 // CHECK5-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1059 // CHECK5-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1060 // CHECK5-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1061 // CHECK5-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
1062 // CHECK5-NEXT:    store i64 [[B]], i64* [[B_ADDR]], align 8
1063 // CHECK5-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8
1064 // CHECK5-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
1065 // CHECK5-NEXT:    [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
1066 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1067 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1068 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1069 // CHECK5-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1070 // CHECK5-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1071 // CHECK5-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1072 // CHECK5:       .worker:
1073 // CHECK5-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]]
1074 // CHECK5-NEXT:    br label [[DOTEXIT:%.*]]
1075 // CHECK5:       .mastercheck:
1076 // CHECK5-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1077 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1078 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1079 // CHECK5-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
1080 // CHECK5-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
1081 // CHECK5-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1082 // CHECK5-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1083 // CHECK5-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
1084 // CHECK5-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1085 // CHECK5:       .master:
1086 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1087 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1088 // CHECK5-NEXT:    [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
1089 // CHECK5-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
1090 // CHECK5-NEXT:    call void @__kmpc_data_sharing_init_stack()
1091 // CHECK5-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
1092 // CHECK5-NEXT:    [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size2", align 8
1093 // CHECK5-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
1094 // CHECK5-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
1095 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0
1096 // CHECK5-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0*
1097 // CHECK5-NEXT:    [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8
1098 // CHECK5-NEXT:    [[ARGC8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0
1099 // CHECK5-NEXT:    store i8** [[TMP10]], i8*** [[ARGC8]], align 8
1100 // CHECK5-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1101 // CHECK5-NEXT:    store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
1102 // CHECK5-NEXT:    call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC8]]) #[[ATTR3]]
1103 // CHECK5-NEXT:    [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
1104 // CHECK5-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
1105 // CHECK5-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1106 // CHECK5:       .termination.notifier:
1107 // CHECK5-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1108 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1109 // CHECK5-NEXT:    br label [[DOTEXIT]]
1110 // CHECK5:       .exit:
1111 // CHECK5-NEXT:    ret void
1112 //
1113 //
1114 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3
1115 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] {
1116 // CHECK5-NEXT:  entry:
1117 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
1118 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
1119 // CHECK5-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 8
1120 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
1121 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
1122 // CHECK5-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8
1123 // CHECK5-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8
1124 // CHECK5-NEXT:    store i8** null, i8*** [[TMP0]], align 8
1125 // CHECK5-NEXT:    ret void
1126 //
1127 //
1128 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker
1129 // CHECK6-SAME: () #[[ATTR0:[0-9]+]] {
1130 // CHECK6-NEXT:  entry:
1131 // CHECK6-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
1132 // CHECK6-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1133 // CHECK6-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
1134 // CHECK6-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1135 // CHECK6-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1136 // CHECK6:       .await.work:
1137 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1138 // CHECK6-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1139 // CHECK6-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1140 // CHECK6-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1141 // CHECK6-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
1142 // CHECK6-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1143 // CHECK6-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1144 // CHECK6:       .select.workers:
1145 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1146 // CHECK6-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1147 // CHECK6-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1148 // CHECK6:       .execute.parallel:
1149 // CHECK6-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
1150 // CHECK6-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1151 // CHECK6-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1152 // CHECK6-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1153 // CHECK6:       .terminate.parallel:
1154 // CHECK6-NEXT:    call void @__kmpc_kernel_end_parallel()
1155 // CHECK6-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1156 // CHECK6:       .barrier.parallel:
1157 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1158 // CHECK6-NEXT:    br label [[DOTAWAIT_WORK]]
1159 // CHECK6:       .exit:
1160 // CHECK6-NEXT:    ret void
1161 //
1162 //
1163 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68
1164 // CHECK6-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
1165 // CHECK6-NEXT:  entry:
1166 // CHECK6-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
1167 // CHECK6-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
1168 // CHECK6-NEXT:    [[ARGC_ADDR:%.*]] = alloca i64, align 8
1169 // CHECK6-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1170 // CHECK6-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1171 // CHECK6-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1172 // CHECK6-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
1173 // CHECK6-NEXT:    store i64 [[B]], i64* [[B_ADDR]], align 8
1174 // CHECK6-NEXT:    store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
1175 // CHECK6-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
1176 // CHECK6-NEXT:    [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
1177 // CHECK6-NEXT:    [[CONV2:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
1178 // CHECK6-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1179 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1180 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1181 // CHECK6-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1182 // CHECK6-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1183 // CHECK6-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1184 // CHECK6:       .worker:
1185 // CHECK6-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]]
1186 // CHECK6-NEXT:    br label [[DOTEXIT:%.*]]
1187 // CHECK6:       .mastercheck:
1188 // CHECK6-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1189 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1190 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1191 // CHECK6-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
1192 // CHECK6-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
1193 // CHECK6-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1194 // CHECK6-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1195 // CHECK6-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
1196 // CHECK6-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1197 // CHECK6:       .master:
1198 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1199 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1200 // CHECK6-NEXT:    [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
1201 // CHECK6-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
1202 // CHECK6-NEXT:    call void @__kmpc_data_sharing_init_stack()
1203 // CHECK6-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1)
1204 // CHECK6-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
1205 // CHECK6-NEXT:    [[TMP7:%.*]] = load i32, i32* [[CONV2]], align 8
1206 // CHECK6-NEXT:    [[ARGC9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
1207 // CHECK6-NEXT:    store i32 [[TMP7]], i32* [[ARGC9]], align 4
1208 // CHECK6-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1209 // CHECK6-NEXT:    store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
1210 // CHECK6-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC9]]) #[[ATTR3]]
1211 // CHECK6-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
1212 // CHECK6-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1213 // CHECK6:       .termination.notifier:
1214 // CHECK6-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1215 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1216 // CHECK6-NEXT:    br label [[DOTEXIT]]
1217 // CHECK6:       .exit:
1218 // CHECK6-NEXT:    ret void
1219 //
1220 //
1221 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__
1222 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
1223 // CHECK6-NEXT:  entry:
1224 // CHECK6-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
1225 // CHECK6-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
1226 // CHECK6-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 8
1227 // CHECK6-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
1228 // CHECK6-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
1229 // CHECK6-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
1230 // CHECK6-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
1231 // CHECK6-NEXT:    store i32 0, i32* [[TMP0]], align 4
1232 // CHECK6-NEXT:    ret void
1233 //
1234 //
1235 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker
1236 // CHECK6-SAME: () #[[ATTR0]] {
1237 // CHECK6-NEXT:  entry:
1238 // CHECK6-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 8
1239 // CHECK6-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1240 // CHECK6-NEXT:    store i8* null, i8** [[WORK_FN]], align 8
1241 // CHECK6-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1242 // CHECK6-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1243 // CHECK6:       .await.work:
1244 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1245 // CHECK6-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1246 // CHECK6-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1247 // CHECK6-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1248 // CHECK6-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
1249 // CHECK6-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1250 // CHECK6-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1251 // CHECK6:       .select.workers:
1252 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1253 // CHECK6-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1254 // CHECK6-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1255 // CHECK6:       .execute.parallel:
1256 // CHECK6-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1257 // CHECK6-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1258 // CHECK6-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1259 // CHECK6-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1260 // CHECK6:       .terminate.parallel:
1261 // CHECK6-NEXT:    call void @__kmpc_kernel_end_parallel()
1262 // CHECK6-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1263 // CHECK6:       .barrier.parallel:
1264 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1265 // CHECK6-NEXT:    br label [[DOTAWAIT_WORK]]
1266 // CHECK6:       .exit:
1267 // CHECK6-NEXT:    ret void
1268 //
1269 //
1270 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57
1271 // CHECK6-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] {
1272 // CHECK6-NEXT:  entry:
1273 // CHECK6-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
1274 // CHECK6-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
1275 // CHECK6-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 8
1276 // CHECK6-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1277 // CHECK6-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1278 // CHECK6-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1279 // CHECK6-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
1280 // CHECK6-NEXT:    store i64 [[B]], i64* [[B_ADDR]], align 8
1281 // CHECK6-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8
1282 // CHECK6-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
1283 // CHECK6-NEXT:    [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
1284 // CHECK6-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1285 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1286 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1287 // CHECK6-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1288 // CHECK6-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1289 // CHECK6-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1290 // CHECK6:       .worker:
1291 // CHECK6-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]]
1292 // CHECK6-NEXT:    br label [[DOTEXIT:%.*]]
1293 // CHECK6:       .mastercheck:
1294 // CHECK6-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1295 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1296 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1297 // CHECK6-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
1298 // CHECK6-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
1299 // CHECK6-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1300 // CHECK6-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1301 // CHECK6-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
1302 // CHECK6-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1303 // CHECK6:       .master:
1304 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1305 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1306 // CHECK6-NEXT:    [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
1307 // CHECK6-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
1308 // CHECK6-NEXT:    call void @__kmpc_data_sharing_init_stack()
1309 // CHECK6-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1)
1310 // CHECK6-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
1311 // CHECK6-NEXT:    [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8
1312 // CHECK6-NEXT:    [[ARGC8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
1313 // CHECK6-NEXT:    store i8** [[TMP7]], i8*** [[ARGC8]], align 8
1314 // CHECK6-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1315 // CHECK6-NEXT:    store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
1316 // CHECK6-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC8]]) #[[ATTR3]]
1317 // CHECK6-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
1318 // CHECK6-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1319 // CHECK6:       .termination.notifier:
1320 // CHECK6-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1321 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1322 // CHECK6-NEXT:    br label [[DOTEXIT]]
1323 // CHECK6:       .exit:
1324 // CHECK6-NEXT:    ret void
1325 //
1326 //
1327 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1
1328 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] {
1329 // CHECK6-NEXT:  entry:
1330 // CHECK6-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
1331 // CHECK6-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
1332 // CHECK6-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 8
1333 // CHECK6-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
1334 // CHECK6-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
1335 // CHECK6-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8
1336 // CHECK6-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8
1337 // CHECK6-NEXT:    store i8** null, i8*** [[TMP0]], align 8
1338 // CHECK6-NEXT:    ret void
1339 //
1340 //
1341 // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker
1342 // CHECK7-SAME: () #[[ATTR0:[0-9]+]] {
1343 // CHECK7-NEXT:  entry:
1344 // CHECK7-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1345 // CHECK7-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1346 // CHECK7-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1347 // CHECK7-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1348 // CHECK7-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1349 // CHECK7:       .await.work:
1350 // CHECK7-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1351 // CHECK7-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1352 // CHECK7-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1353 // CHECK7-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1354 // CHECK7-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1355 // CHECK7-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1356 // CHECK7-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1357 // CHECK7:       .select.workers:
1358 // CHECK7-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1359 // CHECK7-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1360 // CHECK7-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1361 // CHECK7:       .execute.parallel:
1362 // CHECK7-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
1363 // CHECK7-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1364 // CHECK7-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1365 // CHECK7-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1366 // CHECK7:       .terminate.parallel:
1367 // CHECK7-NEXT:    call void @__kmpc_kernel_end_parallel()
1368 // CHECK7-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1369 // CHECK7:       .barrier.parallel:
1370 // CHECK7-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1371 // CHECK7-NEXT:    br label [[DOTAWAIT_WORK]]
1372 // CHECK7:       .exit:
1373 // CHECK7-NEXT:    ret void
1374 //
1375 //
1376 // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68
1377 // CHECK7-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
1378 // CHECK7-NEXT:  entry:
1379 // CHECK7-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1380 // CHECK7-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
1381 // CHECK7-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
1382 // CHECK7-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1383 // CHECK7-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1384 // CHECK7-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1385 // CHECK7-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1386 // CHECK7-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
1387 // CHECK7-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
1388 // CHECK7-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1389 // CHECK7-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1390 // CHECK7-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1391 // CHECK7-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1392 // CHECK7-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1393 // CHECK7-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1394 // CHECK7:       .worker:
1395 // CHECK7-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]]
1396 // CHECK7-NEXT:    br label [[DOTEXIT:%.*]]
1397 // CHECK7:       .mastercheck:
1398 // CHECK7-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1399 // CHECK7-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1400 // CHECK7-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1401 // CHECK7-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1402 // CHECK7-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1403 // CHECK7-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1404 // CHECK7-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1405 // CHECK7-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1406 // CHECK7-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1407 // CHECK7:       .master:
1408 // CHECK7-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1409 // CHECK7-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1410 // CHECK7-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1411 // CHECK7-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1412 // CHECK7-NEXT:    call void @__kmpc_data_sharing_init_stack()
1413 // CHECK7-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
1414 // CHECK7-NEXT:    [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
1415 // CHECK7-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
1416 // CHECK7-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
1417 // CHECK7-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
1418 // CHECK7-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty*
1419 // CHECK7-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
1420 // CHECK7-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0
1421 // CHECK7-NEXT:    store i32 [[TMP10]], i32* [[ARGC7]], align 4
1422 // CHECK7-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1423 // CHECK7-NEXT:    store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
1424 // CHECK7-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]]
1425 // CHECK7-NEXT:    [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
1426 // CHECK7-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
1427 // CHECK7-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1428 // CHECK7:       .termination.notifier:
1429 // CHECK7-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1430 // CHECK7-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1431 // CHECK7-NEXT:    br label [[DOTEXIT]]
1432 // CHECK7:       .exit:
1433 // CHECK7-NEXT:    ret void
1434 //
1435 //
1436 // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__
1437 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
1438 // CHECK7-NEXT:  entry:
1439 // CHECK7-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1440 // CHECK7-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1441 // CHECK7-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 4
1442 // CHECK7-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1443 // CHECK7-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1444 // CHECK7-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
1445 // CHECK7-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
1446 // CHECK7-NEXT:    store i32 0, i32* [[TMP0]], align 4
1447 // CHECK7-NEXT:    ret void
1448 //
1449 //
1450 // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker
1451 // CHECK7-SAME: () #[[ATTR0]] {
1452 // CHECK7-NEXT:  entry:
1453 // CHECK7-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1454 // CHECK7-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1455 // CHECK7-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1456 // CHECK7-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1457 // CHECK7-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1458 // CHECK7:       .await.work:
1459 // CHECK7-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1460 // CHECK7-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1461 // CHECK7-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1462 // CHECK7-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1463 // CHECK7-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1464 // CHECK7-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1465 // CHECK7-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1466 // CHECK7:       .select.workers:
1467 // CHECK7-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1468 // CHECK7-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1469 // CHECK7-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1470 // CHECK7:       .execute.parallel:
1471 // CHECK7-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1472 // CHECK7-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1473 // CHECK7-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1474 // CHECK7-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1475 // CHECK7:       .terminate.parallel:
1476 // CHECK7-NEXT:    call void @__kmpc_kernel_end_parallel()
1477 // CHECK7-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1478 // CHECK7:       .barrier.parallel:
1479 // CHECK7-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1480 // CHECK7-NEXT:    br label [[DOTAWAIT_WORK]]
1481 // CHECK7:       .exit:
1482 // CHECK7-NEXT:    ret void
1483 //
1484 //
1485 // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57
1486 // CHECK7-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] {
1487 // CHECK7-NEXT:  entry:
1488 // CHECK7-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1489 // CHECK7-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
1490 // CHECK7-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 4
1491 // CHECK7-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1492 // CHECK7-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1493 // CHECK7-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1494 // CHECK7-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1495 // CHECK7-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
1496 // CHECK7-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4
1497 // CHECK7-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1498 // CHECK7-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1499 // CHECK7-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1500 // CHECK7-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1501 // CHECK7-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1502 // CHECK7-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1503 // CHECK7:       .worker:
1504 // CHECK7-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]]
1505 // CHECK7-NEXT:    br label [[DOTEXIT:%.*]]
1506 // CHECK7:       .mastercheck:
1507 // CHECK7-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1508 // CHECK7-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1509 // CHECK7-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1510 // CHECK7-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1511 // CHECK7-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1512 // CHECK7-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1513 // CHECK7-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1514 // CHECK7-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1515 // CHECK7-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1516 // CHECK7:       .master:
1517 // CHECK7-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1518 // CHECK7-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1519 // CHECK7-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1520 // CHECK7-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1521 // CHECK7-NEXT:    call void @__kmpc_data_sharing_init_stack()
1522 // CHECK7-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
1523 // CHECK7-NEXT:    [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4
1524 // CHECK7-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
1525 // CHECK7-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
1526 // CHECK7-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
1527 // CHECK7-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0*
1528 // CHECK7-NEXT:    [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
1529 // CHECK7-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0
1530 // CHECK7-NEXT:    store i8** [[TMP10]], i8*** [[ARGC7]], align 4
1531 // CHECK7-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1532 // CHECK7-NEXT:    store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
1533 // CHECK7-NEXT:    call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]]
1534 // CHECK7-NEXT:    [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
1535 // CHECK7-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
1536 // CHECK7-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1537 // CHECK7:       .termination.notifier:
1538 // CHECK7-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1539 // CHECK7-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1540 // CHECK7-NEXT:    br label [[DOTEXIT]]
1541 // CHECK7:       .exit:
1542 // CHECK7-NEXT:    ret void
1543 //
1544 //
1545 // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__3
1546 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
1547 // CHECK7-NEXT:  entry:
1548 // CHECK7-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1549 // CHECK7-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1550 // CHECK7-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 4
1551 // CHECK7-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1552 // CHECK7-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1553 // CHECK7-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4
1554 // CHECK7-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4
1555 // CHECK7-NEXT:    store i8** null, i8*** [[TMP0]], align 4
1556 // CHECK7-NEXT:    ret void
1557 //
1558 //
1559 // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker
1560 // CHECK8-SAME: () #[[ATTR0:[0-9]+]] {
1561 // CHECK8-NEXT:  entry:
1562 // CHECK8-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1563 // CHECK8-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1564 // CHECK8-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1565 // CHECK8-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1566 // CHECK8-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1567 // CHECK8:       .await.work:
1568 // CHECK8-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1569 // CHECK8-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1570 // CHECK8-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1571 // CHECK8-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1572 // CHECK8-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1573 // CHECK8-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1574 // CHECK8-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1575 // CHECK8:       .select.workers:
1576 // CHECK8-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1577 // CHECK8-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1578 // CHECK8-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1579 // CHECK8:       .execute.parallel:
1580 // CHECK8-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
1581 // CHECK8-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1582 // CHECK8-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1583 // CHECK8-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1584 // CHECK8:       .terminate.parallel:
1585 // CHECK8-NEXT:    call void @__kmpc_kernel_end_parallel()
1586 // CHECK8-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1587 // CHECK8:       .barrier.parallel:
1588 // CHECK8-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1589 // CHECK8-NEXT:    br label [[DOTAWAIT_WORK]]
1590 // CHECK8:       .exit:
1591 // CHECK8-NEXT:    ret void
1592 //
1593 //
1594 // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68
1595 // CHECK8-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
1596 // CHECK8-NEXT:  entry:
1597 // CHECK8-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1598 // CHECK8-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
1599 // CHECK8-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
1600 // CHECK8-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1601 // CHECK8-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1602 // CHECK8-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1603 // CHECK8-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1604 // CHECK8-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
1605 // CHECK8-NEXT:    store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
1606 // CHECK8-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1607 // CHECK8-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1608 // CHECK8-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1609 // CHECK8-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1610 // CHECK8-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1611 // CHECK8-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1612 // CHECK8:       .worker:
1613 // CHECK8-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]]
1614 // CHECK8-NEXT:    br label [[DOTEXIT:%.*]]
1615 // CHECK8:       .mastercheck:
1616 // CHECK8-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1617 // CHECK8-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1618 // CHECK8-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1619 // CHECK8-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1620 // CHECK8-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1621 // CHECK8-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1622 // CHECK8-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1623 // CHECK8-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1624 // CHECK8-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1625 // CHECK8:       .master:
1626 // CHECK8-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1627 // CHECK8-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1628 // CHECK8-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1629 // CHECK8-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1630 // CHECK8-NEXT:    call void @__kmpc_data_sharing_init_stack()
1631 // CHECK8-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
1632 // CHECK8-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
1633 // CHECK8-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
1634 // CHECK8-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
1635 // CHECK8-NEXT:    store i32 [[TMP7]], i32* [[ARGC7]], align 4
1636 // CHECK8-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1637 // CHECK8-NEXT:    store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
1638 // CHECK8-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]]
1639 // CHECK8-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
1640 // CHECK8-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1641 // CHECK8:       .termination.notifier:
1642 // CHECK8-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1643 // CHECK8-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1644 // CHECK8-NEXT:    br label [[DOTEXIT]]
1645 // CHECK8:       .exit:
1646 // CHECK8-NEXT:    ret void
1647 //
1648 //
1649 // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__
1650 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
1651 // CHECK8-NEXT:  entry:
1652 // CHECK8-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1653 // CHECK8-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1654 // CHECK8-NEXT:    [[ARGC_ADDR:%.*]] = alloca i32*, align 4
1655 // CHECK8-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1656 // CHECK8-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1657 // CHECK8-NEXT:    store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
1658 // CHECK8-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
1659 // CHECK8-NEXT:    store i32 0, i32* [[TMP0]], align 4
1660 // CHECK8-NEXT:    ret void
1661 //
1662 //
1663 // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker
1664 // CHECK8-SAME: () #[[ATTR0]] {
1665 // CHECK8-NEXT:  entry:
1666 // CHECK8-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1667 // CHECK8-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1668 // CHECK8-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1669 // CHECK8-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1670 // CHECK8-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1671 // CHECK8:       .await.work:
1672 // CHECK8-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1673 // CHECK8-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1674 // CHECK8-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1675 // CHECK8-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1676 // CHECK8-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1677 // CHECK8-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1678 // CHECK8-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1679 // CHECK8:       .select.workers:
1680 // CHECK8-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1681 // CHECK8-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1682 // CHECK8-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1683 // CHECK8:       .execute.parallel:
1684 // CHECK8-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1685 // CHECK8-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1686 // CHECK8-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1687 // CHECK8-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1688 // CHECK8:       .terminate.parallel:
1689 // CHECK8-NEXT:    call void @__kmpc_kernel_end_parallel()
1690 // CHECK8-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1691 // CHECK8:       .barrier.parallel:
1692 // CHECK8-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1693 // CHECK8-NEXT:    br label [[DOTAWAIT_WORK]]
1694 // CHECK8:       .exit:
1695 // CHECK8-NEXT:    ret void
1696 //
1697 //
1698 // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57
1699 // CHECK8-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] {
1700 // CHECK8-NEXT:  entry:
1701 // CHECK8-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
1702 // CHECK8-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
1703 // CHECK8-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8**, align 4
1704 // CHECK8-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1705 // CHECK8-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1706 // CHECK8-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1707 // CHECK8-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
1708 // CHECK8-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
1709 // CHECK8-NEXT:    store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4
1710 // CHECK8-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1711 // CHECK8-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1712 // CHECK8-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1713 // CHECK8-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1714 // CHECK8-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1715 // CHECK8-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1716 // CHECK8:       .worker:
1717 // CHECK8-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]]
1718 // CHECK8-NEXT:    br label [[DOTEXIT:%.*]]
1719 // CHECK8:       .mastercheck:
1720 // CHECK8-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1721 // CHECK8-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1722 // CHECK8-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1723 // CHECK8-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1724 // CHECK8-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1725 // CHECK8-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1726 // CHECK8-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1727 // CHECK8-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1728 // CHECK8-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1729 // CHECK8:       .master:
1730 // CHECK8-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1731 // CHECK8-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1732 // CHECK8-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1733 // CHECK8-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1734 // CHECK8-NEXT:    call void @__kmpc_data_sharing_init_stack()
1735 // CHECK8-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
1736 // CHECK8-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
1737 // CHECK8-NEXT:    [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
1738 // CHECK8-NEXT:    [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
1739 // CHECK8-NEXT:    store i8** [[TMP7]], i8*** [[ARGC7]], align 4
1740 // CHECK8-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1741 // CHECK8-NEXT:    store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
1742 // CHECK8-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]]
1743 // CHECK8-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
1744 // CHECK8-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1745 // CHECK8:       .termination.notifier:
1746 // CHECK8-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1747 // CHECK8-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1748 // CHECK8-NEXT:    br label [[DOTEXIT]]
1749 // CHECK8:       .exit:
1750 // CHECK8-NEXT:    ret void
1751 //
1752 //
1753 // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1
1754 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
1755 // CHECK8-NEXT:  entry:
1756 // CHECK8-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1757 // CHECK8-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1758 // CHECK8-NEXT:    [[ARGC_ADDR:%.*]] = alloca i8***, align 4
1759 // CHECK8-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1760 // CHECK8-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1761 // CHECK8-NEXT:    store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4
1762 // CHECK8-NEXT:    [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4
1763 // CHECK8-NEXT:    store i8** null, i8*** [[TMP0]], align 4
1764 // CHECK8-NEXT:    ret void
1765 //
1766