// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -disable-O0-optnone | FileCheck %s --check-prefix=CHECK2
// expected-no-diagnostics
#ifndef HEADER
#define HEADER

template<typename tx>
tx ftemplate(int n) {
  tx a = 0;
  short aa = 0;
  tx b[10];

#pragma omp target if(0)
  {
#pragma omp parallel
    {
      int a = 41;
    }
    a += 1;
  }

#pragma omp target
  {
#pragma omp parallel
    {
      int a = 42;
    }
#pragma omp parallel if(0)
    {
      int a = 43;
    }
#pragma omp parallel if(1)
    {
      int a = 44;
    }
    a += 1;
  }

#pragma omp target if(n>40)
  {
#pragma omp parallel if(n>1000)
    {
      int a = 45;
#pragma omp barrier
    }
    a += 1;
    aa += 1;
    b[2] += 1;
  }

#pragma omp target
  {
#pragma omp parallel
    {
#pragma omp critical
      ++a;
    }
    ++a;
  }
  return a;
}

int bar(int n){
  int a = 0;

  a += ftemplate<int>(n);

  return a;
}

#endif
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
// CHECK1-SAME: (i64 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS2:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1:       user_code.entry:
// CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP2]], i64 0)
// CHECK1-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
// CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i64 0)
// CHECK1-NEXT:    [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS2]] to i8**
// CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP4]], i64 0)
// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], 1
// CHECK1-NEXT:    store i32 [[ADD]], i32* [[CONV]], align 4
// CHECK1-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT:    ret void
// CHECK1:       worker.exit:
// CHECK1-NEXT:    ret void
//
//
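// Note: in the l26 kernel above, each of the three inner parallel regions is
// lowered to its own __kmpc_parallel_51 call; the third argument is the
// evaluated if-clause (1, 0, and 1 respectively), so the if(0) region still
// reaches the same runtime entry point, just with a false condition that the
// runtime is expected to serialize.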
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32 42, i32* [[A]], align 4
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK1-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT:    call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3:[0-9]+]]
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32 43, i32* [[A]], align 4
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK1-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT:    call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32 44, i32* [[A]], align 4
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
// CHECK1-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
// CHECK1-SAME: (i64 noundef [[N:%.*]], i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
// CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT:    store i64 [[N]], i64* [[N_ADDR]], align 8
// CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT:    store i64 [[AA]], i64* [[AA_ADDR]], align 8
// CHECK1-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
// CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
// CHECK1-NEXT:    [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT:    [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
// CHECK1-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
// CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1:       user_code.entry:
// CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1000
// CHECK1-NEXT:    [[TMP4:%.*]] = zext i1 [[CMP]] to i32
// CHECK1-NEXT:    [[TMP5:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 [[TMP4]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP5]], i64 0)
// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, i32* [[CONV1]], align 4
// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK1-NEXT:    store i32 [[ADD]], i32* [[CONV1]], align 4
// CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[CONV2]], align 2
// CHECK1-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP7]] to i32
// CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[CONV3]], 1
// CHECK1-NEXT:    [[CONV5:%.*]] = trunc i32 [[ADD4]] to i16
// CHECK1-NEXT:    store i16 [[CONV5]], i16* [[CONV2]], align 2
// CHECK1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2
// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK1-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK1-NEXT:    store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4
// CHECK1-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT:    ret void
// CHECK1:       worker.exit:
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32 45, i32* [[A]], align 4
// CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
// CHECK1-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
// CHECK1-NEXT:    ret void
//
//
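// Note: in the l55 kernel below, 'a' is captured by the inner parallel
// region, so it is globalized via __kmpc_alloc_shared/__kmpc_free_shared
// rather than kept in a plain alloca, and its address is forwarded through
// the captured-variables array.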
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
// CHECK1-SAME: (i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1:       user_code.entry:
// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4
// CHECK1-NEXT:    [[A1:%.*]] = call align 16 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT:    [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[A_ON_STACK]], align 4
// CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
// CHECK1-NEXT:    store i8* [[TMP4]], i8** [[TMP3]], align 8
// CHECK1-NEXT:    [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP5]], i64 1)
// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ON_STACK]], align 4
// CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK1-NEXT:    store i32 [[INC]], i32* [[A_ON_STACK]], align 4
// CHECK1-NEXT:    call void @__kmpc_free_shared(i8* [[A1]], i64 4)
// CHECK1-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT:    ret void
// CHECK1:       worker.exit:
// CHECK1-NEXT:    ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT:    [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT:    [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
// CHECK1-NEXT:    store i32 0, i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT:    br label [[OMP_CRITICAL_LOOP:%.*]]
// CHECK1:       omp.critical.loop:
// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[TMP3]], [[NVPTX_NUM_THREADS]]
// CHECK1-NEXT:    br i1 [[TMP4]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
// CHECK1:       omp.critical.test:
// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP2]], [[TMP5]]
// CHECK1-NEXT:    br i1 [[TMP6]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
// CHECK1:       omp.critical.body:
// CHECK1-NEXT:    [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
// CHECK1-NEXT:    call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK1-NEXT:    store i32 [[INC]], i32* [[TMP0]], align 4
// CHECK1-NEXT:    call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK1-NEXT:    br label [[OMP_CRITICAL_SYNC]]
// CHECK1:       omp.critical.sync:
// CHECK1-NEXT:    call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK1-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP5]], 1
// CHECK1-NEXT:    store i32 [[TMP10]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT:    br label [[OMP_CRITICAL_LOOP]]
// CHECK1:       omp.critical.exit:
// CHECK1-NEXT:    ret void
//
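// Note: __omp_outlined__4_wrapper below differs from the earlier wrappers in
// that it has one captured variable to forward: it retrieves the argument
// array with __kmpc_get_shared_variables and loads the pointer to 'a' from
// slot 0 before calling __omp_outlined__4.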
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
// CHECK1-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT:  entry:
// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
// CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
// CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK1-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
// CHECK1-NEXT:    call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
// CHECK1-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
// CHECK2-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
// CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 4
// CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS2:%.*]] = alloca [0 x i8*], align 4
// CHECK2-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
// CHECK2-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK2-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2:       user_code.entry:
// CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP2]], i32 0)
// CHECK2-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
// CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i32 0)
// CHECK2-NEXT:    [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS2]] to i8**
// CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP4]], i32 0)
// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], 1
// CHECK2-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK2-NEXT:    ret void
// CHECK2:       worker.exit:
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32 42, i32* [[A]], align 4
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK2-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT:    call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2:[0-9]+]]
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32 43, i32* [[A]], align 4
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK2-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT:    call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32 44, i32* [[A]], align 4
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
// CHECK2-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT:    call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
// CHECK2-SAME: (i32 noundef [[N:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], [10 x i32]* noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
// CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
// CHECK2-NEXT:    store i32 [[N]], i32* [[N_ADDR]], align 4
// CHECK2-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    store i32 [[AA]], i32* [[AA_ADDR]], align 4
// CHECK2-NEXT:    store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT:    [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK2-NEXT:    [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK2-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK2-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2:       user_code.entry:
// CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4
// CHECK2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1000
// CHECK2-NEXT:    [[TMP4:%.*]] = zext i1 [[CMP]] to i32
// CHECK2-NEXT:    [[TMP5:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 [[TMP4]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP5]], i32 0)
// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK2-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[CONV]], align 2
// CHECK2-NEXT:    [[CONV1:%.*]] = sext i16 [[TMP7]] to i32
// CHECK2-NEXT:    [[ADD2:%.*]] = add nsw i32 [[CONV1]], 1
// CHECK2-NEXT:    [[CONV3:%.*]] = trunc i32 [[ADD2]] to i16
// CHECK2-NEXT:    store i16 [[CONV3]], i16* [[CONV]], align 2
// CHECK2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK2-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK2-NEXT:    store i32 [[ADD4]], i32* [[ARRAYIDX]], align 4
// CHECK2-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK2-NEXT:    ret void
// CHECK2:       worker.exit:
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[A:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32 45, i32* [[A]], align 4
// CHECK2-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
// CHECK2-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT:    call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
// CHECK2-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
// CHECK2-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK2-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK2-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2:       user_code.entry:
// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT:    [[A1:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT:    [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[A_ON_STACK]], align 4
// CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
// CHECK2-NEXT:    store i8* [[TMP4]], i8** [[TMP3]], align 4
// CHECK2-NEXT:    [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP5]], i32 1)
// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ON_STACK]], align 4
// CHECK2-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK2-NEXT:    store i32 [[INC]], i32* [[A_ON_STACK]], align 4
// CHECK2-NEXT:    call void @__kmpc_free_shared(i8* [[A1]], i32 4)
// CHECK2-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK2-NEXT:    ret void
// CHECK2:       worker.exit:
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT:    [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK2-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK2-NEXT:    [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK2-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
// CHECK2-NEXT:    store i32 0, i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT:    br label [[OMP_CRITICAL_LOOP:%.*]]
// CHECK2:       omp.critical.loop:
// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[TMP3]], [[NVPTX_NUM_THREADS]]
// CHECK2-NEXT:    br i1 [[TMP4]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
// CHECK2:       omp.critical.test:
// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP2]], [[TMP5]]
// CHECK2-NEXT:    br i1 [[TMP6]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
// CHECK2:       omp.critical.body:
// CHECK2-NEXT:    [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
// CHECK2-NEXT:    call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK2-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK2-NEXT:    store i32 [[INC]], i32* [[TMP0]], align 4
// CHECK2-NEXT:    call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK2-NEXT:    br label [[OMP_CRITICAL_SYNC]]
// CHECK2:       omp.critical.sync:
// CHECK2-NEXT:    call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK2-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP5]], 1
// CHECK2-NEXT:    store i32 [[TMP10]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT:    br label [[OMP_CRITICAL_LOOP]]
// CHECK2:       omp.critical.exit:
// CHECK2-NEXT:    ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
// CHECK2-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT:  entry:
// CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
// CHECK2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
// CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK2-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
// CHECK2-NEXT:    call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR2]]
// CHECK2-NEXT:    ret void
//