// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -disable-O0-optnone | FileCheck %s --check-prefix=CHECK2
// expected-no-diagnostics
#ifndef HEADER
#define HEADER

template<typename tx>
tx ftemplate(int n) {
  tx a = 0;
  short aa = 0;
  tx b[10];

  #pragma omp target if(0)
  {
    #pragma omp parallel
    {
      int a = 41;
    }
    a += 1;
  }

  #pragma omp target
  {
    #pragma omp parallel
    {
      int a = 42;
    }
    #pragma omp parallel if(0)
    {
      int a = 43;
    }
    #pragma omp parallel if(1)
    {
      int a = 44;
    }
    a += 1;
  }

  #pragma omp target if(n>40)
  {
    #pragma omp parallel if(n>1000)
    {
      int a = 45;
#pragma omp barrier
    }
    a += 1;
    aa += 1;
    b[2] += 1;
  }

  #pragma omp target
  {
    #pragma omp parallel
    {
    #pragma omp critical
    ++a;
    }
    ++a;
  }
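  // Note: per the CHECK lines for __omp_outlined__4 below, the critical
  // construct above lowers on the device to a counter loop that admits one
  // hardware thread at a time into __kmpc_critical/__kmpc_end_critical,
  // resynchronizing the warp with __kmpc_syncwarp after each step.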
  return a;
}

int bar(int n){
  int a = 0;

  a += ftemplate<int>(n);

  return a;
}

#endif
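// Summary of the `if` clause lowering asserted below: __kmpc_parallel_51
// receives 1 as its if_expr argument for a parallel region with no clause or
// if(1), 0 for if(0), and the zero-extended i1 result of the comparison for
// if(n>1000); with if(0) on the target construct itself, no device kernel is
// emitted for that region at all.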
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
// CHECK1-SAME: (i64 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS2:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1:       user_code.entry:
// CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP2]], i64 0)
// CHECK1-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
// CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i64 0)
// CHECK1-NEXT:    [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS2]] to i8**
// CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP4]], i64 0)
// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], 1
// CHECK1-NEXT:    store i32 [[ADD]], i32* [[CONV]], align 4
// CHECK1-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT:    ret void
// CHECK1:       worker.exit:
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32 42, i32* [[A]], align 4
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK1-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT:    call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3:[0-9]+]]
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32 43, i32* [[A]], align 4
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK1-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT:    call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32 44, i32* [[A]], align 4
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
// CHECK1-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
// CHECK1-SAME: (i64 noundef [[N:%.*]], i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
// CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT:    store i64 [[N]], i64* [[N_ADDR]], align 8
// CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT:    store i64 [[AA]], i64* [[AA_ADDR]], align 8
// CHECK1-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
// CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
// CHECK1-NEXT:    [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT:    [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
// CHECK1-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
// CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1:       user_code.entry:
// CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1000
// CHECK1-NEXT:    [[TMP4:%.*]] = zext i1 [[CMP]] to i32
// CHECK1-NEXT:    [[TMP5:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 [[TMP4]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP5]], i64 0)
// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, i32* [[CONV1]], align 4
// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK1-NEXT:    store i32 [[ADD]], i32* [[CONV1]], align 4
// CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[CONV2]], align 2
// CHECK1-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP7]] to i32
// CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[CONV3]], 1
// CHECK1-NEXT:    [[CONV5:%.*]] = trunc i32 [[ADD4]] to i16
// CHECK1-NEXT:    store i16 [[CONV5]], i16* [[CONV2]], align 2
// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2
// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK1-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK1-NEXT:    store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4
// CHECK1-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT:    ret void
// CHECK1:       worker.exit:
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32 45, i32* [[A]], align 4
// CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
// CHECK1-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
// CHECK1-SAME: (i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1:       user_code.entry:
// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK1-NEXT:    [[A1:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT:    [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[A_ON_STACK]], align 4
// CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
// CHECK1-NEXT:    store i8* [[TMP4]], i8** [[TMP3]], align 8
// CHECK1-NEXT:    [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP5]], i64 1)
// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ON_STACK]], align 4
// CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK1-NEXT:    store i32 [[INC]], i32* [[A_ON_STACK]], align 4
// CHECK1-NEXT:    call void @__kmpc_free_shared(i8* [[A1]], i64 4)
// CHECK1-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT:    ret void
// CHECK1:       worker.exit:
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT:    [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
// CHECK1-NEXT:    store i32 0, i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT:    br label [[OMP_CRITICAL_LOOP:%.*]]
// CHECK1:       omp.critical.loop:
// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[TMP3]], [[NVPTX_NUM_THREADS]]
// CHECK1-NEXT:    br i1 [[TMP4]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
// CHECK1:       omp.critical.test:
// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP2]], [[TMP5]]
// CHECK1-NEXT:    br i1 [[TMP6]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
// CHECK1:       omp.critical.body:
// CHECK1-NEXT:    [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
// CHECK1-NEXT:    call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK1-NEXT:    store i32 [[INC]], i32* [[TMP0]], align 4
// CHECK1-NEXT:    call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK1-NEXT:    br label [[OMP_CRITICAL_SYNC]]
// CHECK1:       omp.critical.sync:
// CHECK1-NEXT:    call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK1-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP5]], 1
// CHECK1-NEXT:    store i32 [[TMP10]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT:    br label [[OMP_CRITICAL_LOOP]]
// CHECK1:       omp.critical.exit:
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
// CHECK1-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
// CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
// CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK1-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
// CHECK1-NEXT:    call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
// CHECK1-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
// CHECK2-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
// CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 4
// CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS2:%.*]] = alloca [0 x i8*], align 4
// CHECK2-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
// CHECK2-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK2-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2:       user_code.entry:
// CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP2]], i32 0)
// CHECK2-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
// CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i32 0)
// CHECK2-NEXT:    [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS2]] to i8**
// CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP4]], i32 0)
// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], 1
// CHECK2-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK2-NEXT:    ret void
// CHECK2:       worker.exit:
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32 42, i32* [[A]], align 4
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK2-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT:    call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2:[0-9]+]]
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32 43, i32* [[A]], align 4
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK2-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT:    call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32 44, i32* [[A]], align 4
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
// CHECK2-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
// CHECK2-SAME: (i32 noundef [[N:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
// CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
// CHECK2-NEXT:    store i32 [[N]], i32* [[N_ADDR]], align 4
// CHECK2-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    store i32 [[AA]], i32* [[AA_ADDR]], align 4
// CHECK2-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT:    [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK2-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK2-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK2-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2:       user_code.entry:
// CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4
// CHECK2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1000
// CHECK2-NEXT:    [[TMP4:%.*]] = zext i1 [[CMP]] to i32
// CHECK2-NEXT:    [[TMP5:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 [[TMP4]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP5]], i32 0)
// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK2-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[CONV]], align 2
// CHECK2-NEXT:    [[CONV1:%.*]] = sext i16 [[TMP7]] to i32
// CHECK2-NEXT:    [[ADD2:%.*]] = add nsw i32 [[CONV1]], 1
// CHECK2-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD2]] to i16
// CHECK2-NEXT:    store i16 [[CONV3]], i16* [[CONV]], align 2
// CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK2-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK2-NEXT:    store i32 [[ADD4]], i32* [[ARRAYIDX]], align 4
// CHECK2-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK2-NEXT:    ret void
// CHECK2:       worker.exit:
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32 45, i32* [[A]], align 4
// CHECK2-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
// CHECK2-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
// CHECK2-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
// CHECK2-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK2-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK2-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2:       user_code.entry:
// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    [[A1:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT:    [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[A_ON_STACK]], align 4
// CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
// CHECK2-NEXT:    store i8* [[TMP4]], i8** [[TMP3]], align 4
// CHECK2-NEXT:    [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP5]], i32 1)
// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ON_STACK]], align 4
// CHECK2-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK2-NEXT:    store i32 [[INC]], i32* [[A_ON_STACK]], align 4
// CHECK2-NEXT:    call void @__kmpc_free_shared(i8* [[A1]], i32 4)
// CHECK2-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK2-NEXT:    ret void
// CHECK2:       worker.exit:
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK2-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK2-NEXT:    [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
// CHECK2-NEXT:    store i32 0, i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT:    br label [[OMP_CRITICAL_LOOP:%.*]]
// CHECK2:       omp.critical.loop:
// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[TMP3]], [[NVPTX_NUM_THREADS]]
// CHECK2-NEXT:    br i1 [[TMP4]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
// CHECK2:       omp.critical.test:
// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP2]], [[TMP5]]
// CHECK2-NEXT:    br i1 [[TMP6]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
// CHECK2:       omp.critical.body:
// CHECK2-NEXT:    [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
// CHECK2-NEXT:    call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK2-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK2-NEXT:    store i32 [[INC]], i32* [[TMP0]], align 4
// CHECK2-NEXT:    call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK2-NEXT:    br label [[OMP_CRITICAL_SYNC]]
// CHECK2:       omp.critical.sync:
// CHECK2-NEXT:    call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK2-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP5]], 1
// CHECK2-NEXT:    store i32 [[TMP10]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT:    br label [[OMP_CRITICAL_LOOP]]
// CHECK2:       omp.critical.exit:
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
// CHECK2-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
// CHECK2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
// CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK2-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
// CHECK2-NEXT:    call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR2]]
// CHECK2-NEXT:    ret void
//