1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ 2 // Test target codegen - host bc file has to be created first. 3 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc 4 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1 5 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc 6 // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -disable-O0-optnone | FileCheck %s --check-prefix=CHECK2 7 // expected-no-diagnostics 8 #ifndef HEADER 9 #define HEADER 10 11 template<typename tx> 12 tx ftemplate(int n) { 13 tx a = 0; 14 short aa = 0; 15 tx b[10]; 16 17 #pragma omp target if(0) 18 { 19 #pragma omp parallel 20 { 21 int a = 41; 22 } 23 a += 1; 24 } 25 26 #pragma omp target 27 { 28 #pragma omp parallel 29 { 30 int a = 42; 31 } 32 #pragma omp parallel if(0) 33 { 34 int a = 43; 35 } 36 #pragma omp parallel if(1) 37 { 38 int a = 44; 39 } 40 a += 1; 41 } 42 43 #pragma omp target if(n>40) 44 { 45 #pragma omp parallel if(n>1000) 46 { 47 int a = 45; 48 #pragma omp barrier 49 } 50 a += 1; 51 aa += 1; 52 b[2] += 1; 53 } 54 55 #pragma omp target 56 { 57 #pragma omp parallel 58 { 59 #pragma omp critical 60 ++a; 61 } 62 ++a; 63 } 64 return a; 65 } 66 67 int bar(int n){ 68 int a = 0; 69 70 a += ftemplate<int>(n); 71 72 return a; 73 } 74 75 #endif 76 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker 77 // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { 78 // CHECK3-NEXT: entry: 79 // CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 80 // CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 81 // CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 82 // CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 83 // CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] 84 // CHECK3: .await.work: 85 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 86 // CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 87 // CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 88 // CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 89 // CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 90 // CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 91 // CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 92 // CHECK3: .select.workers: 93 // CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 94 // CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 95 // CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 96 // CHECK3: .execute.parallel: 97 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* 
@[[GLOB1:[0-9]+]]) 98 // CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 99 // CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) 100 // CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 101 // CHECK3: .execute.fn: 102 // CHECK3-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2:[0-9]+]] 103 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 104 // CHECK3: .check.next: 105 // CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 106 // CHECK3-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) 107 // CHECK3-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] 108 // CHECK3: .execute.fn2: 109 // CHECK3-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] 110 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] 111 // CHECK3: .check.next3: 112 // CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 113 // CHECK3-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) 114 // CHECK3-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] 115 // CHECK3: .execute.fn5: 116 // CHECK3-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] 117 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] 118 // CHECK3: .check.next6: 119 // CHECK3-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 120 // CHECK3-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) 121 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] 122 // CHECK3: .terminate.parallel: 123 // CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() 124 // CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] 125 // CHECK3: .barrier.parallel: 126 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 127 // CHECK3-NEXT: br label [[DOTAWAIT_WORK]] 128 // CHECK3: .exit: 129 // CHECK3-NEXT: ret void 130 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 131 // CHECK3-SAME: (i32 [[A:%.*]]) #[[ATTR0]] { 132 // CHECK3-NEXT: entry: 133 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 134 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 135 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 136 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 137 // CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 138 // CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 139 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 140 // CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 141 // CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 142 // CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 143 // CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 144 // CHECK3: .worker: 145 // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR2]] 146 // CHECK3-NEXT: br label [[DOTEXIT:%.*]] 147 // CHECK3: .mastercheck: 148 // CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 149 // CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 150 // CHECK3-NEXT: 
[[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 151 // CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 152 // CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 153 // CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 154 // CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 155 // CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 156 // CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 157 // CHECK3: .master: 158 // CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 159 // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 160 // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 161 // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 162 // CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() 163 // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 164 // CHECK3-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 165 // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) 166 // CHECK3-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** 167 // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) 168 // CHECK3-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** 169 // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) 170 // CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 171 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 172 // CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 173 // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 174 // CHECK3: .termination.notifier: 175 // CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) 176 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 177 // CHECK3-NEXT: br label [[DOTEXIT]] 178 // CHECK3: .exit: 179 // CHECK3-NEXT: ret void 180 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ 181 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 182 // CHECK3-NEXT: entry: 183 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 184 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 185 // CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 186 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 187 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 188 // CHECK3-NEXT: store i32 42, i32* [[A]], align 4 189 // CHECK3-NEXT: ret void 190 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined___wrapper 191 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 192 // CHECK3-NEXT: entry: 193 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 194 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 
4 195 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 196 // CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 197 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 198 // CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 199 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 200 // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 201 // CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] 202 // CHECK3-NEXT: ret void 203 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 204 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 205 // CHECK3-NEXT: entry: 206 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 207 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 208 // CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 209 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 210 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 211 // CHECK3-NEXT: store i32 43, i32* [[A]], align 4 212 // CHECK3-NEXT: ret void 213 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper 214 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 215 // CHECK3-NEXT: entry: 216 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 217 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 218 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 219 // CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 220 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 221 // CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 222 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 223 // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 224 // CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] 225 // CHECK3-NEXT: ret void 226 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 227 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 228 // CHECK3-NEXT: entry: 229 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 230 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 231 // CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 232 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 233 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 234 // CHECK3-NEXT: store i32 44, i32* [[A]], align 4 235 // CHECK3-NEXT: ret void 236 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper 237 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 238 // CHECK3-NEXT: entry: 239 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 240 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 241 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 242 // CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 243 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 244 // CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 245 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 246 // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 247 // CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] 248 // CHECK3-NEXT: ret void 249 // CHECK3-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker 250 // CHECK3-SAME: () #[[ATTR0]] { 251 // CHECK3-NEXT: entry: 252 // CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 253 // CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 254 // CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 255 // CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 256 // CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] 257 // CHECK3: .await.work: 258 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 259 // CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 260 // CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 261 // CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 262 // CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 263 // CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 264 // CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 265 // CHECK3: .select.workers: 266 // CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 267 // CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 268 // CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 269 // CHECK3: .execute.parallel: 270 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 271 // CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 272 // CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) 273 // CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 274 // CHECK3: .execute.fn: 275 // CHECK3-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] 276 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 277 // CHECK3: .check.next: 278 // CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 279 // CHECK3-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 280 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] 281 // CHECK3: .terminate.parallel: 282 // CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() 283 // CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] 284 // CHECK3: .barrier.parallel: 285 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 286 // CHECK3-NEXT: br label [[DOTAWAIT_WORK]] 287 // CHECK3: .exit: 288 // CHECK3-NEXT: ret void 289 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 290 // CHECK3-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 291 // CHECK3-NEXT: entry: 292 // CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 293 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 294 // CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 295 // CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 296 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 297 // CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 298 // CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 299 // CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 300 // CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 301 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* 302 // CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 303 // CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.tid.x() 304 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 305 // CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 306 // CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 307 // CHECK3-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 308 // CHECK3-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 309 // CHECK3: .worker: 310 // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR2]] 311 // CHECK3-NEXT: br label [[DOTEXIT:%.*]] 312 // CHECK3: .mastercheck: 313 // CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 314 // CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 315 // CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 316 // CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 317 // CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 318 // CHECK3-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 319 // CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] 320 // CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 321 // CHECK3-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 322 // CHECK3: .master: 323 // CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 324 // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 325 // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 326 // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 327 // CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() 328 // CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 329 // CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 330 // CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 331 // CHECK3-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 332 // CHECK3-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 333 // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) 334 // CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 335 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 336 // CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 337 // CHECK3-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 338 // CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 339 // CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 340 // CHECK3-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 341 // CHECK3-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 342 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 343 // CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 344 // CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 345 // CHECK3-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 346 // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 347 // CHECK3: .termination.notifier: 348 // CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) 349 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 
350 // CHECK3-NEXT: br label [[DOTEXIT]] 351 // CHECK3: .exit: 352 // CHECK3-NEXT: ret void 353 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 354 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 355 // CHECK3-NEXT: entry: 356 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 357 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 358 // CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 359 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 360 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 361 // CHECK3-NEXT: store i32 45, i32* [[A]], align 4 362 // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 363 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 364 // CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) 365 // CHECK3-NEXT: ret void 366 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper 367 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 368 // CHECK3-NEXT: entry: 369 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 370 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 371 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 372 // CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 373 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 374 // CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 375 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 376 // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 377 // CHECK3-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] 378 // CHECK3-NEXT: ret void 379 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker 380 // CHECK3-SAME: () #[[ATTR0]] { 381 // CHECK3-NEXT: entry: 382 // CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 383 // CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 384 // CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 385 // CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 386 // CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] 387 // CHECK3: .await.work: 388 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 389 // CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 390 // CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 391 // CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 392 // CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 393 // CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 394 // CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 395 // CHECK3: .select.workers: 396 // CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 397 // CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 398 // CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 399 // CHECK3: .execute.parallel: 400 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 401 // CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 402 // CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) 403 // CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 
404 // CHECK3: .execute.fn: 405 // CHECK3-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] 406 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 407 // CHECK3: .check.next: 408 // CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 409 // CHECK3-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 410 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] 411 // CHECK3: .terminate.parallel: 412 // CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() 413 // CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] 414 // CHECK3: .barrier.parallel: 415 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 416 // CHECK3-NEXT: br label [[DOTAWAIT_WORK]] 417 // CHECK3: .exit: 418 // CHECK3-NEXT: ret void 419 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 420 // CHECK3-SAME: (i32 [[A:%.*]]) #[[ATTR0]] { 421 // CHECK3-NEXT: entry: 422 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 423 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 424 // CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 425 // CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 426 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 427 // CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 428 // CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 429 // CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 430 // CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 431 // CHECK3: .worker: 432 // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR2]] 433 // CHECK3-NEXT: br label [[DOTEXIT:%.*]] 434 // CHECK3: .mastercheck: 435 // CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 436 // CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 437 // CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 438 // CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 439 // CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 440 // CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 441 // CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 442 // CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 443 // CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 444 // CHECK3: .master: 445 // CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 446 // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 447 // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 448 // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 449 // CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() 450 // CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 451 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 452 // CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** 
addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) 453 // CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 454 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 455 // CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* 456 // CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 457 // CHECK3-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 458 // CHECK3-NEXT: store i32 [[TMP10]], i32* [[A7]], align 4 459 // CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 460 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 461 // CHECK3-NEXT: [[TMP13:%.*]] = bitcast i32* [[A7]] to i8* 462 // CHECK3-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 4 463 // CHECK3-NEXT: [[TMP14:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 464 // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP14]], i32 1) 465 // CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[A7]], align 4 466 // CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP15]], 1 467 // CHECK3-NEXT: store i32 [[INC]], i32* [[A7]], align 4 468 // CHECK3-NEXT: [[TMP16:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 469 // CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP16]]) 470 // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 471 // CHECK3: .termination.notifier: 472 // CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) 473 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 474 // CHECK3-NEXT: br label [[DOTEXIT]] 475 // CHECK3: .exit: 476 // CHECK3-NEXT: ret void 477 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4 478 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] { 479 // CHECK3-NEXT: entry: 480 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 481 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 482 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 483 // CHECK3-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 484 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 485 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 486 // CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 487 // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 488 // CHECK3-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask() 489 // CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 490 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 491 // CHECK3-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 492 // CHECK3-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] 493 // CHECK3: omp.critical.loop: 494 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 495 // CHECK3-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] 496 // CHECK3-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] 497 // 
CHECK3: omp.critical.test: 498 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 499 // CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] 500 // CHECK3-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] 501 // CHECK3: omp.critical.body: 502 // CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 503 // CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 504 // CHECK3-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 505 // CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 506 // CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 507 // CHECK3-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 508 // CHECK3-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 509 // CHECK3-NEXT: br label [[OMP_CRITICAL_SYNC]] 510 // CHECK3: omp.critical.sync: 511 // CHECK3-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]]) 512 // CHECK3-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 513 // CHECK3-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 514 // CHECK3-NEXT: br label [[OMP_CRITICAL_LOOP]] 515 // CHECK3: omp.critical.exit: 516 // CHECK3-NEXT: ret void 517 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper 518 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 519 // CHECK3-NEXT: entry: 520 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 521 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 522 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 523 // CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 524 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 525 // CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 526 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 527 // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 528 // CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 529 // CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 530 // CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** 531 // CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 532 // CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR2]] 533 // CHECK3-NEXT: ret void 534 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker 535 // CHECK4-SAME: () #[[ATTR0:[0-9]+]] { 536 // CHECK4-NEXT: entry: 537 // CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 538 // CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 539 // CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 540 // CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 541 // CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] 542 // CHECK4: .await.work: 543 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 544 // CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 545 // CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 546 // CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 547 // CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 548 // CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 549 // CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 550 // CHECK4: .select.workers: 551 // CHECK4-NEXT: 
[[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 552 // CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 553 // CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 554 // CHECK4: .execute.parallel: 555 // CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 556 // CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 557 // CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) 558 // CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 559 // CHECK4: .execute.fn: 560 // CHECK4-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] 561 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 562 // CHECK4: .check.next: 563 // CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 564 // CHECK4-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) 565 // CHECK4-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] 566 // CHECK4: .execute.fn2: 567 // CHECK4-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 568 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]] 569 // CHECK4: .check.next3: 570 // CHECK4-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 571 // CHECK4-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) 572 // CHECK4-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] 573 // CHECK4: .execute.fn5: 574 // CHECK4-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 575 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]] 576 // CHECK4: .check.next6: 577 // CHECK4-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 578 // CHECK4-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) 579 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]] 580 // CHECK4: .terminate.parallel: 581 // CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() 582 // CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] 583 // CHECK4: .barrier.parallel: 584 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 585 // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] 586 // CHECK4: .exit: 587 // CHECK4-NEXT: ret void 588 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 589 // CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] { 590 // CHECK4-NEXT: entry: 591 // CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 592 // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 593 // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 594 // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 595 // CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 596 // CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 597 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 598 // CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 599 // CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 600 // CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 601 // CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 602 // CHECK4: .worker: 603 // CHECK4-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] 604 // CHECK4-NEXT: br label [[DOTEXIT:%.*]] 605 // CHECK4: .mastercheck: 606 // CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 607 // CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 608 // CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 609 // CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 610 // CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 611 // CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 612 // CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 613 // CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 614 // CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 615 // CHECK4: .master: 616 // CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 617 // CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 618 // CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 619 // CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 620 // CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() 621 // CHECK4-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 622 // CHECK4-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 623 // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) 624 // CHECK4-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** 625 // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) 626 // CHECK4-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** 627 // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) 628 // CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 629 // CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 630 // CHECK4-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 631 // CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 632 // CHECK4: .termination.notifier: 633 // CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) 634 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 635 // CHECK4-NEXT: br label [[DOTEXIT]] 636 // CHECK4: .exit: 637 // CHECK4-NEXT: ret void 638 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ 639 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 640 // CHECK4-NEXT: entry: 641 // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 642 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 643 // CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4 644 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 645 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 646 // 
CHECK4-NEXT: store i32 42, i32* [[A]], align 4 647 // CHECK4-NEXT: ret void 648 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined___wrapper 649 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 650 // CHECK4-NEXT: entry: 651 // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 652 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 653 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 654 // CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 655 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 656 // CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 657 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 658 // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 659 // CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 660 // CHECK4-NEXT: ret void 661 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 662 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 663 // CHECK4-NEXT: entry: 664 // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 665 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 666 // CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4 667 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 668 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 669 // CHECK4-NEXT: store i32 43, i32* [[A]], align 4 670 // CHECK4-NEXT: ret void 671 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper 672 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 673 // CHECK4-NEXT: entry: 674 // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 675 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 676 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 677 // CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 678 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 679 // CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 680 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 681 // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 682 // CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 683 // CHECK4-NEXT: ret void 684 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 685 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 686 // CHECK4-NEXT: entry: 687 // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 688 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 689 // CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4 690 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 691 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 692 // CHECK4-NEXT: store i32 44, i32* [[A]], align 4 693 // CHECK4-NEXT: ret void 694 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper 695 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 696 // CHECK4-NEXT: entry: 697 // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 698 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 699 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 700 // CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 701 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 702 // CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 703 // 
CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 704 // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 705 // CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 706 // CHECK4-NEXT: ret void 707 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker 708 // CHECK4-SAME: () #[[ATTR0]] { 709 // CHECK4-NEXT: entry: 710 // CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 711 // CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 712 // CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 713 // CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 714 // CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] 715 // CHECK4: .await.work: 716 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 717 // CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 718 // CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 719 // CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 720 // CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 721 // CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 722 // CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 723 // CHECK4: .select.workers: 724 // CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 725 // CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 726 // CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 727 // CHECK4: .execute.parallel: 728 // CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 729 // CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 730 // CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) 731 // CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 732 // CHECK4: .execute.fn: 733 // CHECK4-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 734 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 735 // CHECK4: .check.next: 736 // CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 737 // CHECK4-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 738 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]] 739 // CHECK4: .terminate.parallel: 740 // CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() 741 // CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] 742 // CHECK4: .barrier.parallel: 743 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 744 // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] 745 // CHECK4: .exit: 746 // CHECK4-NEXT: ret void 747 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 748 // CHECK4-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { 749 // CHECK4-NEXT: entry: 750 // CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 751 // CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 752 // CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 753 // CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 754 // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 755 // CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 756 // CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 757 // CHECK4-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 758 
// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 759 // CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* 760 // CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 761 // CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 762 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 763 // CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 764 // CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 765 // CHECK4-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 766 // CHECK4-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 767 // CHECK4: .worker: 768 // CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] 769 // CHECK4-NEXT: br label [[DOTEXIT:%.*]] 770 // CHECK4: .mastercheck: 771 // CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 772 // CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 773 // CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 774 // CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 775 // CHECK4-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 776 // CHECK4-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 777 // CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] 778 // CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 779 // CHECK4-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 780 // CHECK4: .master: 781 // CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 782 // CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 783 // CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 784 // CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 785 // CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() 786 // CHECK4-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 787 // CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 788 // CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 789 // CHECK4-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 790 // CHECK4-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 791 // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) 792 // CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 793 // CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 794 // CHECK4-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 795 // CHECK4-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 796 // CHECK4-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 797 // CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 798 // CHECK4-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 799 // CHECK4-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 800 // CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 801 // CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 802 // CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 803 // CHECK4-NEXT: store i32 
[[ADD10]], i32* [[ARRAYIDX]], align 4
// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK4: .termination.notifier:
// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK4-NEXT: br label [[DOTEXIT]]
// CHECK4: .exit:
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3
// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK4-NEXT: store i32 45, i32* [[A]], align 4
// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK4-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker
// CHECK4-SAME: () #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK4: .await.work:
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK4: .select.workers:
// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK4: .execute.parallel:
// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*)
// CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
// CHECK4: .execute.fn:
// CHECK4-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK4: .check.next:
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK4-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]]
// CHECK4: .terminate.parallel:
// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK4: .barrier.parallel:
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK4-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK4: .exit:
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58
// CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR1]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK4: .worker:
// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]]
// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
// CHECK4: .mastercheck:
// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK4: .master:
// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK4-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK4-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
// CHECK4-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8*
// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1)
// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4
// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1
// CHECK4-NEXT: store i32 [[INC]], i32* [[A7]], align 4
// CHECK4-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK4: .termination.notifier:
// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK4-NEXT: br label [[DOTEXIT]]
// CHECK4: .exit:
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4
// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK4-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
// CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]]
// CHECK4: omp.critical.loop:
// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
// CHECK4-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
// CHECK4: omp.critical.test:
// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
// CHECK4-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
// CHECK4: omp.critical.body:
// CHECK4-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
// CHECK4-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK4-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4
// CHECK4-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK4-NEXT: br label [[OMP_CRITICAL_SYNC]]
// CHECK4: omp.critical.sync:
// CHECK4-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK4-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
// CHECK4-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP]]
// CHECK4: omp.critical.exit:
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK4-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
// CHECK4-NEXT: ret void
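// NOTE: In the generic-mode (non-SPMD) codegen checked by the CHECK4/CHECK5
// prefixes, each target region has a companion "_worker" function implementing a
// work-distribution state machine: workers park on __kmpc_barrier_simple_spmd,
// fetch an outlined parallel region published by the master through
// __kmpc_kernel_parallel, run it if they are active, and loop until a null work
// function signals kernel termination. A minimal C++ sketch of that loop, kept
// as an explanatory comment only (barrier, kernel_parallel, end_parallel, and
// gtid are stand-ins for the __kmpc_* entry points asserted above):
//
//   void worker() {
//     void (*WorkFn)(short, int) = nullptr;
//     for (;;) {
//       barrier();                                // wait for master to publish work
//       bool IsActive = kernel_parallel(&WorkFn); // load the work function, if any
//       if (!WorkFn) return;                      // null work fn: kernel is done
//       if (IsActive) { WorkFn(0, gtid()); end_parallel(); }
//       barrier();                                // release the master to continue
//     }
//   }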
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker
// CHECK5-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK5: .await.work:
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK5: .select.workers:
// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK5: .execute.parallel:
// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
// CHECK5: .execute.fn:
// CHECK5-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK5: .check.next:
// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
// CHECK5-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
// CHECK5: .execute.fn2:
// CHECK5-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
// CHECK5: .check.next3:
// CHECK5-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
// CHECK5-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]]
// CHECK5: .execute.fn5:
// CHECK5-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
// CHECK5: .check.next6:
// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK5-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]])
// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
// CHECK5: .terminate.parallel:
// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK5: .barrier.parallel:
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK5: .exit:
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29
// CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4
// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4
// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK5: .worker:
// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]]
// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
// CHECK5: .mastercheck:
// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK5: .master:
// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK5-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK5-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0)
// CHECK5-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0)
// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8**
// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0)
// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK5: .termination.notifier:
// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK5-NEXT: br label [[DOTEXIT]]
// CHECK5: .exit:
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK5-NEXT: store i32 42, i32* [[A]], align 4
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK5-NEXT: store i32 43, i32* [[A]], align 4
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2
// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK5-NEXT: store i32 44, i32* [[A]], align 4
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
// CHECK5-NEXT: ret void
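// NOTE: The .mastercheck blocks in the kernels above and below pick the first
// thread of the last full warp as the team master. The sub/sub/xor/and sequence
// asserted via [[TMP1]]..[[MASTER_TID]] computes (ntid - 1) & ~(warpsize - 1);
// for example, ntid = 128 and warpsize = 32 give master tid 96. Equivalent C++,
// as an explanatory sketch only:
//
//   unsigned master_tid(unsigned ntid, unsigned warpsize) {
//     return (ntid - 1) & ~(warpsize - 1); // first lane of the last warp
//   }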
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker
// CHECK5-SAME: () #[[ATTR0]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK5: .await.work:
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK5: .select.workers:
// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK5: .execute.parallel:
// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
// CHECK5: .execute.fn:
// CHECK5-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK5: .check.next:
// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
// CHECK5: .terminate.parallel:
// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK5: .barrier.parallel:
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK5: .exit:
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
// CHECK5-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK5-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK5: .worker:
// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]]
// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
// CHECK5: .mastercheck:
// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK5: .master:
// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4
// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000
// CHECK5-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32
// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0)
// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1
// CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK5-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32
// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
// CHECK5-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
// CHECK5-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4
// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1
// CHECK5-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4
// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK5: .termination.notifier:
// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK5-NEXT: br label [[DOTEXIT]]
// CHECK5: .exit:
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3
// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK5-NEXT: store i32 45, i32* [[A]], align 4
// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK5-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
// CHECK5-NEXT: ret void
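// NOTE: For the "#pragma omp parallel if(n>1000)" region in the l46 kernel above,
// the condition is evaluated inside the target region: the zext'ed i1 comparison
// result ([[TMP8]]) is forwarded as the if-condition operand of
// __kmpc_parallel_51, so the runtime chooses between parallel and serialized
// execution at kernel run time. Roughly, as an explanatory sketch only:
//
//   int if_cond = (n > 1000) ? 1 : 0; // zext i1 %cmp to i32
//   // __kmpc_parallel_51(&loc, gtid, if_cond, /*num_threads=*/-1, ...);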
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker
// CHECK5-SAME: () #[[ATTR0]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK5: .await.work:
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK5: .select.workers:
// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK5: .execute.parallel:
// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*)
// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
// CHECK5: .execute.fn:
// CHECK5-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK5: .check.next:
// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
// CHECK5: .terminate.parallel:
// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK5: .barrier.parallel:
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK5: .exit:
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58
// CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK5: .worker:
// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]]
// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
// CHECK5: .mastercheck:
// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK5: .master:
// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK5-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
// CHECK5-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4
// CHECK5-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8*
// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1)
// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4
// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1
// CHECK5-NEXT: store i32 [[INC]], i32* [[A7]], align 4
// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK5: .termination.notifier:
// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK5-NEXT: br label [[DOTEXIT]]
// CHECK5: .exit:
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4
// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK5-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]]
// CHECK5: omp.critical.loop:
// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK5-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
// CHECK5-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
// CHECK5: omp.critical.test:
// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
// CHECK5-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
// CHECK5: omp.critical.body:
// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
// CHECK5-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK5-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4
// CHECK5-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK5-NEXT: br label [[OMP_CRITICAL_SYNC]]
// CHECK5: omp.critical.sync:
// CHECK5-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK5-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
// CHECK5-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP]]
// CHECK5: omp.critical.exit:
// CHECK5-NEXT: ret void
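// NOTE: The omp.critical.* blocks above lower "#pragma omp critical" by
// serializing the team: each thread waits in omp.critical.loop until the counter
// matches its thread id, runs the body bracketed by __kmpc_critical and
// __kmpc_end_critical, and re-converges through __kmpc_syncwarp on the mask
// captured earlier by __kmpc_warp_active_thread_mask. In outline (explanatory
// only; the lowercase helpers stand in for the asserted __kmpc_* calls):
//
//   for (int turn = 0; turn < nthreads; ++turn) {
//     if (tid == turn) { critical_enter(); ++a; critical_exit(); } // one at a time
//     syncwarp(mask); // keep divergent lanes of the warp together
//   }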
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK5-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
// CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
// CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
// CHECK5-NEXT: ret void
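// NOTE: The CHECK1 and CHECK2 prefixes below check the same source built against
// the newer device runtime interface: instead of the explicit worker/master
// handshake asserted above, each kernel brackets its body with
// __kmpc_target_init and __kmpc_target_deinit, and enters user_code.entry only
// when init returns -1.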
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS2:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP2]], i64 0)
// CHECK1-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i64 0)
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS2]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP4]], i64 0)
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], 1
// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32 42, i32* [[A]], align 4
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2:[0-9]+]]
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32 43, i32* [[A]], align 4
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32 44, i32* [[A]], align 4
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
// CHECK1-SAME: (i64 [[N:%.*]], i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8
// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1000
// CHECK1-NEXT: [[TMP4:%.*]] = zext i1 [[CMP]] to i32
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 [[TMP4]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP5]], i64 0)
// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV1]], align 8
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV1]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV2]], align 8
// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP7]] to i32
// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 1
// CHECK1-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i16
// CHECK1-NEXT: store i16 [[CONV5]], i16* [[CONV2]], align 8
// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2
// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK1-NEXT: store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32 45, i32* [[A]], align 4
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
// CHECK1-NEXT: ret void
//
//
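// NOTE: In the l55 kernel below, the local 'a' that escapes into the parallel
// region is globalized with __kmpc_alloc_shared/__kmpc_free_shared (accessed
// through the [[A_ON_STACK]] view of the allocation), replacing the
// __kmpc_data_sharing_push_stack/pop_stack scheme asserted for the older
// runtime above.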
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK1-NEXT: [[A1:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[A_ON_STACK]], align 4
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
// CHECK1-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP5]], i64 1)
// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ON_STACK]], align 4
// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK1-NEXT: store i32 [[INC]], i32* [[A_ON_STACK]], align 4
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[A1]], i64 4)
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
// CHECK1-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]]
// CHECK1: omp.critical.loop:
// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
// CHECK1-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
// CHECK1: omp.critical.test:
// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
// CHECK1-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
// CHECK1: omp.critical.body:
// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
// CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4
// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK1-NEXT: br label [[OMP_CRITICAL_SYNC]]
// CHECK1: omp.critical.sync:
// CHECK1-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK1-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
// CHECK1-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP]]
// CHECK1: omp.critical.exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR2]]
// CHECK1-NEXT: ret void
//
//
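// NOTE: The CHECK2 prefix below corresponds to the 32-bit nvptx run line; the IR
// mirrors CHECK1 apart from pointer width (align 4 allocas and i32 captured-vars
// counts instead of align 8 and i64).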
@__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP2]], i32 0) 1731 // CHECK2-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** 1732 // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i32 0) 1733 // CHECK2-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS2]] to i8** 1734 // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP4]], i32 0) 1735 // CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[A_ADDR]], align 4 1736 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], 1 1737 // CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 1738 // CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) 1739 // CHECK2-NEXT: ret void 1740 // CHECK2: worker.exit: 1741 // CHECK2-NEXT: ret void 1742 // 1743 // 1744 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ 1745 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 1746 // CHECK2-NEXT: entry: 1747 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 1748 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 1749 // CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 1750 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 1751 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 1752 // CHECK2-NEXT: store i32 42, i32* [[A]], align 4 1753 // CHECK2-NEXT: ret void 1754 // 1755 // 1756 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper 1757 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 1758 // CHECK2-NEXT: entry: 1759 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 1760 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1761 // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1762 // CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 1763 // CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 1764 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1765 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1766 // CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 1767 // CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR1:[0-9]+]] 1768 // CHECK2-NEXT: ret void 1769 // 1770 // 1771 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 1772 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 1773 // CHECK2-NEXT: entry: 1774 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 1775 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 1776 // CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 1777 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 1778 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 1779 // CHECK2-NEXT: store i32 43, i32* [[A]], align 4 1780 // CHECK2-NEXT: ret void 1781 // 1782 // 1783 // CHECK2-LABEL: 
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store i32 42, i32* [[A]], align 4
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR1:[0-9]+]]
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store i32 43, i32* [[A]], align 4
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR1]]
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store i32 44, i32* [[A]], align 4
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR1]]
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
// CHECK2-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2: user_code.entry:
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4
// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1000
// CHECK2-NEXT: [[TMP4:%.*]] = zext i1 [[CMP]] to i32
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 [[TMP4]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP5]], i32 0)
// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK2-NEXT: [[CONV1:%.*]] = sext i16 [[TMP7]] to i32
// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[CONV1]], 1
// CHECK2-NEXT: [[CONV3:%.*]] = trunc i32 [[ADD2]] to i16
// CHECK2-NEXT: store i16 [[CONV3]], i16* [[CONV]], align 4
// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK2-NEXT: store i32 [[ADD4]], i32* [[ARRAYIDX]], align 4
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK2-NEXT: ret void
// CHECK2: worker.exit:
// CHECK2-NEXT: ret void
//
//
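// NOTE: For 'parallel if(n>1000)' the condition is not known at compile time,
// so the checks above expect it to be materialized with icmp/zext and passed
// through as the if-clause operand of __kmpc_parallel_51, letting the runtime
// choose between parallel and serialized execution. Roughly, with
// illustrative names:
//
//   int cond = (n > 1000);                     // icmp sgt + zext
//   __kmpc_parallel_51(loc, gtid, cond, -1, -1,
//                      fn, wrapper, args, 0);  // cond == 0 -> serialized
//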
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store i32 45, i32* [[A]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
// CHECK2-NEXT: ret void
//
//
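// NOTE: The '#pragma omp barrier' in this parallel body lowers to a plain
// __kmpc_barrier call with its own ident (GLOB2 above), distinct from the
// __kmpc_barrier_simple_spmd used by the worker state machine; only the
// threads of the innermost parallel team synchronize here. In outline:
//
//   { int a = 45; __kmpc_barrier(loc, gtid); }  // body of __omp_outlined__3
//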
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR1]]
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
// CHECK2-SAME: (i32 [[A:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2: user_code.entry:
// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[A1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[A_ON_STACK]], align 4
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
// CHECK2-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP5]], i32 1)
// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ON_STACK]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK2-NEXT: store i32 [[INC]], i32* [[A_ON_STACK]], align 4
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[A1]], i32 4)
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK2-NEXT: ret void
// CHECK2: worker.exit:
// CHECK2-NEXT: ret void
//
//
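// NOTE: Because 'a' is captured by reference in the enclosed parallel region,
// it cannot stay in a thread-private stack slot; the target region above is
// expected to "globalize" it with __kmpc_alloc_shared/__kmpc_free_shared and
// publish its address through the captured-variable array. A sketch with
// illustrative names only:
//
//   int *a = (int *)__kmpc_alloc_shared(4); // device-shared storage
//   args[0] = a;                            // visible to all team threads
//   __kmpc_parallel_51(..., args, /*nargs=*/1);
//   ++*a;                                   // sequential '++a' after the region
//   __kmpc_free_shared(a, 4);
//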
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
// CHECK2-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]]
// CHECK2: omp.critical.loop:
// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
// CHECK2-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
// CHECK2: omp.critical.test:
// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
// CHECK2-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
// CHECK2: omp.critical.body:
// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
// CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4
// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK2-NEXT: br label [[OMP_CRITICAL_SYNC]]
// CHECK2: omp.critical.sync:
// CHECK2-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK2-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
// CHECK2-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP]]
// CHECK2: omp.critical.exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR1]]
// CHECK2-NEXT: ret void
//
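// NOTE: On the device, '#pragma omp critical' is lowered as the counter loop
// checked in __omp_outlined__4 above: every thread walks the same counter,
// only the thread whose id equals the counter enters the guarded body
// (bracketed by __kmpc_critical/__kmpc_end_critical), and __kmpc_syncwarp
// with the saved active-thread mask keeps the warp converged between turns.
// As pseudo-code for the expected control flow:
//
//   uint64_t mask = __kmpc_warp_active_thread_mask();
//   for (int counter = 0; counter < num_threads; ++counter) {
//     if (tid == counter) {          // threads take turns, one at a time
//       __kmpc_critical(loc, gtid, &crit_var);
//       ++*a;
//       __kmpc_end_critical(loc, gtid, &crit_var);
//     }
//     __kmpc_syncwarp(mask);         // reconverge before the next turn
//   }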