1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ 2 // Test target codegen - host bc file has to be created first. 3 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc 4 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1 5 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc 6 // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -disable-O0-optnone | FileCheck %s --check-prefix=CHECK2 7 // expected-no-diagnostics 8 #ifndef HEADER /* NOTE(review): the RUN lines use --check-prefix=CHECK1/CHECK2, but every assertion visible in this chunk is CHECK3/CHECK4-prefixed; confirm that CHECK1/CHECK2 bodies exist further down or regenerate the checks with utils/update_cc_test_checks.py and drop the stale prefixes. */ 9 #define HEADER 10 11 template<typename tx> 12 tx ftemplate(int n) { /* Function under test (instantiated as ftemplate<int> by bar): each '#pragma omp target' below is emitted as a separate offload kernel whose mangled name encodes a source line number (see _l29/_l46/_l58 in the CHECK labels) - do NOT add or remove lines in this file without regenerating the checks. */ 13 tx a = 0; 14 short aa = 0; 15 tx b[10]; /* b[2] is updated inside the third target region */ 16 17 #pragma omp target if(0) /* target region guarded by a constant-false if clause */ 18 { 19 #pragma omp parallel 20 { 21 int a = 41; 22 } 23 a += 1; 24 } 25 26 #pragma omp target /* unconditional target region containing three parallel regions: plain, if(0), and if(1) */ 27 { 28 #pragma omp parallel 29 { 30 int a = 42; 31 } 32 #pragma omp parallel if(0) 33 { 34 int a = 43; 35 } 36 #pragma omp parallel if(1) 37 { 38 int a = 44; 39 } 40 a += 1; 41 } 42 43 #pragma omp target if(n>40) /* target and nested parallel both guarded by runtime conditions */ 44 { 45 #pragma omp parallel if(n>1000) 46 { 47 int a = 45; 48 #pragma omp barrier 49 } 50 a += 1; 51 aa += 1; 52 b[2] += 1; 53 } 54 55 #pragma omp target /* target region whose parallel body updates the captured 'a' under a critical section */ 56 { 57 #pragma omp parallel 58 { 59 #pragma omp critical 60 ++a; 61 } 62 ++a; 63 } 64 
return a; 65 } 66 67 int bar(int n){ /* Host entry point: instantiates ftemplate<int>, which triggers emission of the device kernels checked below. */ 68 int a = 0; 69 70 a += ftemplate<int>(n); 71 72 return a; 73 } 74 75 #endif 76 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker 77 // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { 78 // CHECK3-NEXT: entry: 79 // CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 80 // CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 81 // CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 82 // CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 83 // CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] 84 // CHECK3: .await.work: 85 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 86 // CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 87 // CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 88 // CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 89 // CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 90 // CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 91 // CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 92 // CHECK3: .select.workers: 93 // CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 94 // CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 95 // CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 96 // CHECK3: .execute.parallel: 97 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 98 // CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 99 // CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) 100 // CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 101 // CHECK3: .execute.fn: 102 // CHECK3-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2:[0-9]+]] 103 // CHECK3-NEXT: br 
label [[DOTTERMINATE_PARALLEL:%.*]] 104 // CHECK3: .check.next: 105 // CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 106 // CHECK3-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) 107 // CHECK3-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] 108 // CHECK3: .execute.fn2: 109 // CHECK3-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] 110 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] 111 // CHECK3: .check.next3: 112 // CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 113 // CHECK3-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) 114 // CHECK3-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] 115 // CHECK3: .execute.fn5: 116 // CHECK3-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] 117 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] 118 // CHECK3: .check.next6: 119 // CHECK3-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 120 // CHECK3-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) 121 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] 122 // CHECK3: .terminate.parallel: 123 // CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() 124 // CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] 125 // CHECK3: .barrier.parallel: 126 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 127 // CHECK3-NEXT: br label [[DOTAWAIT_WORK]] 128 // CHECK3: .exit: 129 // CHECK3-NEXT: ret void 130 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 131 // CHECK3-SAME: (i32 [[A:%.*]]) #[[ATTR0]] { 132 // CHECK3-NEXT: entry: 133 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 134 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 135 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x 
i8*], align 4 136 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 137 // CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 138 // CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 139 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 140 // CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 141 // CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 142 // CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 143 // CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 144 // CHECK3: .worker: 145 // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR2]] 146 // CHECK3-NEXT: br label [[DOTEXIT:%.*]] 147 // CHECK3: .mastercheck: 148 // CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 149 // CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 150 // CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 151 // CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 152 // CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 153 // CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 154 // CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 155 // CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 156 // CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 157 // CHECK3: .master: 158 // CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 159 // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 160 // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 161 // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 162 // CHECK3-NEXT: 
call void @__kmpc_data_sharing_init_stack() 163 // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 164 // CHECK3-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 165 // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) 166 // CHECK3-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** 167 // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) 168 // CHECK3-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** 169 // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) 170 // CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 171 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 172 // CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 173 // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 174 // CHECK3: .termination.notifier: 175 // CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) 176 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 177 // CHECK3-NEXT: br label [[DOTEXIT]] 178 // CHECK3: .exit: 179 // CHECK3-NEXT: ret void 180 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ 181 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 182 // CHECK3-NEXT: entry: 183 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 184 // 
CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 185 // CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 186 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 187 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 188 // CHECK3-NEXT: store i32 42, i32* [[A]], align 4 189 // CHECK3-NEXT: ret void 190 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined___wrapper 191 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 192 // CHECK3-NEXT: entry: 193 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 194 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 195 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 196 // CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 197 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 198 // CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 199 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 200 // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 201 // CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] 202 // CHECK3-NEXT: ret void 203 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 204 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 205 // CHECK3-NEXT: entry: 206 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 207 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 208 // CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 209 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 210 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 211 // CHECK3-NEXT: store i32 43, i32* [[A]], align 4 212 // CHECK3-NEXT: ret void 213 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper 214 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 215 // 
CHECK3-NEXT: entry: 216 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 217 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 218 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 219 // CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 220 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 221 // CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 222 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 223 // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 224 // CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] 225 // CHECK3-NEXT: ret void 226 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 227 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 228 // CHECK3-NEXT: entry: 229 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 230 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 231 // CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 232 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 233 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 234 // CHECK3-NEXT: store i32 44, i32* [[A]], align 4 235 // CHECK3-NEXT: ret void 236 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper 237 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 238 // CHECK3-NEXT: entry: 239 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 240 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 241 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 242 // CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 243 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 244 // CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 245 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 246 // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** 
[[GLOBAL_ARGS]]) 247 // CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] 248 // CHECK3-NEXT: ret void 249 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker 250 // CHECK3-SAME: () #[[ATTR0]] { 251 // CHECK3-NEXT: entry: 252 // CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 253 // CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 254 // CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 255 // CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 256 // CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] 257 // CHECK3: .await.work: 258 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 259 // CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 260 // CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 261 // CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 262 // CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 263 // CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 264 // CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 265 // CHECK3: .select.workers: 266 // CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 267 // CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 268 // CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 269 // CHECK3: .execute.parallel: 270 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 271 // CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 272 // CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) 273 // CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 274 // CHECK3: .execute.fn: 275 // CHECK3-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) 
#[[ATTR2]] 276 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 277 // CHECK3: .check.next: 278 // CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 279 // CHECK3-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 280 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] 281 // CHECK3: .terminate.parallel: 282 // CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() 283 // CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] 284 // CHECK3: .barrier.parallel: 285 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 286 // CHECK3-NEXT: br label [[DOTAWAIT_WORK]] 287 // CHECK3: .exit: 288 // CHECK3-NEXT: ret void 289 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 290 // CHECK3-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 291 // CHECK3-NEXT: entry: 292 // CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 293 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 294 // CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 295 // CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 296 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 297 // CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 298 // CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 299 // CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 300 // CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 301 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* 302 // CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 303 // CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 304 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 305 // CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 306 // CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 
[[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 307 // CHECK3-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 308 // CHECK3-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 309 // CHECK3: .worker: 310 // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR2]] 311 // CHECK3-NEXT: br label [[DOTEXIT:%.*]] 312 // CHECK3: .mastercheck: 313 // CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 314 // CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 315 // CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 316 // CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 317 // CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 318 // CHECK3-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 319 // CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] 320 // CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 321 // CHECK3-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 322 // CHECK3: .master: 323 // CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 324 // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 325 // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 326 // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 327 // CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() 328 // CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 329 // CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 330 // CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 331 // CHECK3-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 332 // CHECK3-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 333 // CHECK3-NEXT: call void 
@__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) 334 // CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 335 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 336 // CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 337 // CHECK3-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 338 // CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 339 // CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 340 // CHECK3-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 341 // CHECK3-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 342 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 343 // CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 344 // CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 345 // CHECK3-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 346 // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 347 // CHECK3: .termination.notifier: 348 // CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) 349 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 350 // CHECK3-NEXT: br label [[DOTEXIT]] 351 // CHECK3: .exit: 352 // CHECK3-NEXT: ret void 353 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 354 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 355 // CHECK3-NEXT: entry: 356 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 357 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 358 // CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 359 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 360 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 361 // CHECK3-NEXT: store i32 45, 
i32* [[A]], align 4 362 // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 363 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 364 // CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) 365 // CHECK3-NEXT: ret void 366 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper 367 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 368 // CHECK3-NEXT: entry: 369 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 370 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 371 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 372 // CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 373 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 374 // CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 375 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 376 // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 377 // CHECK3-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] 378 // CHECK3-NEXT: ret void 379 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker 380 // CHECK3-SAME: () #[[ATTR0]] { 381 // CHECK3-NEXT: entry: 382 // CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 383 // CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 384 // CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 385 // CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 386 // CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] 387 // CHECK3: .await.work: 388 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 389 // CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 390 // CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 391 // CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 392 // CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 393 // 
CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 394 // CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 395 // CHECK3: .select.workers: 396 // CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 397 // CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 398 // CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 399 // CHECK3: .execute.parallel: 400 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 401 // CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 402 // CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) 403 // CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 404 // CHECK3: .execute.fn: 405 // CHECK3-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] 406 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 407 // CHECK3: .check.next: 408 // CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 409 // CHECK3-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 410 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] 411 // CHECK3: .terminate.parallel: 412 // CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() 413 // CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] 414 // CHECK3: .barrier.parallel: 415 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 416 // CHECK3-NEXT: br label [[DOTAWAIT_WORK]] 417 // CHECK3: .exit: 418 // CHECK3-NEXT: ret void 419 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 420 // CHECK3-SAME: (i32 [[A:%.*]]) #[[ATTR0]] { 421 // CHECK3-NEXT: entry: 422 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 423 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 424 // CHECK3-NEXT: store i32 [[A]], i32* 
[[A_ADDR]], align 4 425 // CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 426 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 427 // CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 428 // CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 429 // CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 430 // CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 431 // CHECK3: .worker: 432 // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR2]] 433 // CHECK3-NEXT: br label [[DOTEXIT:%.*]] 434 // CHECK3: .mastercheck: 435 // CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 436 // CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 437 // CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 438 // CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 439 // CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 440 // CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 441 // CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 442 // CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 443 // CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 444 // CHECK3: .master: 445 // CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 446 // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 447 // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 448 // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 449 // CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() 450 // CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", 
align 2 451 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 452 // CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) 453 // CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 454 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 455 // CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* 456 // CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 457 // CHECK3-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 458 // CHECK3-NEXT: store i32 [[TMP10]], i32* [[A7]], align 4 459 // CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 460 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 461 // CHECK3-NEXT: [[TMP13:%.*]] = bitcast i32* [[A7]] to i8* 462 // CHECK3-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 4 463 // CHECK3-NEXT: [[TMP14:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 464 // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP14]], i32 1) 465 // CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[A7]], align 4 466 // CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP15]], 1 467 // CHECK3-NEXT: store i32 [[INC]], i32* [[A7]], align 4 468 // CHECK3-NEXT: [[TMP16:%.*]] 
= load i16, i16* @"_openmp_static_kernel$is_shared", align 2 469 // CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP16]]) 470 // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 471 // CHECK3: .termination.notifier: 472 // CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) 473 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 474 // CHECK3-NEXT: br label [[DOTEXIT]] 475 // CHECK3: .exit: 476 // CHECK3-NEXT: ret void 477 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4 478 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] { 479 // CHECK3-NEXT: entry: 480 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 481 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 482 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 483 // CHECK3-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 484 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 485 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 486 // CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 487 // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 488 // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() 489 // CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 490 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 491 // CHECK3-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 492 // CHECK3-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] 493 // CHECK3: omp.critical.loop: 494 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 495 // CHECK3-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] 496 // CHECK3-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] 497 // CHECK3: 
omp.critical.test: 498 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 499 // CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] 500 // CHECK3-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] 501 // CHECK3: omp.critical.body: 502 // CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 503 // CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 504 // CHECK3-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 505 // CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 506 // CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 507 // CHECK3-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 508 // CHECK3-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 509 // CHECK3-NEXT: br label [[OMP_CRITICAL_SYNC]] 510 // CHECK3: omp.critical.sync: 511 // CHECK3-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) 512 // CHECK3-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 513 // CHECK3-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 514 // CHECK3-NEXT: br label [[OMP_CRITICAL_LOOP]] 515 // CHECK3: omp.critical.exit: 516 // CHECK3-NEXT: ret void 517 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper 518 // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 519 // CHECK3-NEXT: entry: 520 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 521 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 522 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 523 // CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 524 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 525 // CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 526 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 527 // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 528 
// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 529 // CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 530 // CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** 531 // CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 532 // CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR2]] 533 // CHECK3-NEXT: ret void 534 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker 535 // CHECK4-SAME: () #[[ATTR0:[0-9]+]] { 536 // CHECK4-NEXT: entry: 537 // CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 538 // CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 539 // CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 540 // CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 541 // CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] 542 // CHECK4: .await.work: 543 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 544 // CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 545 // CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 546 // CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 547 // CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 548 // CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 549 // CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 550 // CHECK4: .select.workers: 551 // CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 552 // CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 553 // CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 554 // CHECK4: .execute.parallel: 555 // CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 556 // CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 557 // CHECK4-NEXT: 
[[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) 558 // CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 559 // CHECK4: .execute.fn: 560 // CHECK4-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] 561 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 562 // CHECK4: .check.next: 563 // CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 564 // CHECK4-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) 565 // CHECK4-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] 566 // CHECK4: .execute.fn2: 567 // CHECK4-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 568 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]] 569 // CHECK4: .check.next3: 570 // CHECK4-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 571 // CHECK4-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) 572 // CHECK4-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] 573 // CHECK4: .execute.fn5: 574 // CHECK4-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 575 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]] 576 // CHECK4: .check.next6: 577 // CHECK4-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 578 // CHECK4-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) 579 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]] 580 // CHECK4: .terminate.parallel: 581 // CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() 582 // CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] 583 // CHECK4: .barrier.parallel: 584 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 585 // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] 586 // CHECK4: .exit: 587 // CHECK4-NEXT: ret void 588 // CHECK4-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 589 // CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] { 590 // CHECK4-NEXT: entry: 591 // CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 592 // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 593 // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 594 // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 595 // CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 596 // CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 597 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 598 // CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 599 // CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 600 // CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 601 // CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 602 // CHECK4: .worker: 603 // CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] 604 // CHECK4-NEXT: br label [[DOTEXIT:%.*]] 605 // CHECK4: .mastercheck: 606 // CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 607 // CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 608 // CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 609 // CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 610 // CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 611 // CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 612 // CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 613 // CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 614 // CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 615 // CHECK4: .master: 616 // CHECK4-NEXT: 
[[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 617 // CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 618 // CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 619 // CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 620 // CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() 621 // CHECK4-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 622 // CHECK4-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 623 // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) 624 // CHECK4-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** 625 // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) 626 // CHECK4-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** 627 // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) 628 // CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 629 // CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 630 // CHECK4-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 631 // CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 632 // CHECK4: .termination.notifier: 633 // CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) 634 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 635 
// CHECK4-NEXT: br label [[DOTEXIT]] 636 // CHECK4: .exit: 637 // CHECK4-NEXT: ret void 638 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ 639 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 640 // CHECK4-NEXT: entry: 641 // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 642 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 643 // CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4 644 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 645 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 646 // CHECK4-NEXT: store i32 42, i32* [[A]], align 4 647 // CHECK4-NEXT: ret void 648 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined___wrapper 649 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 650 // CHECK4-NEXT: entry: 651 // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 652 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 653 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 654 // CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 655 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 656 // CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 657 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 658 // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 659 // CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 660 // CHECK4-NEXT: ret void 661 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 662 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 663 // CHECK4-NEXT: entry: 664 // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 665 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 666 // CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4 667 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** 
[[DOTGLOBAL_TID__ADDR]], align 4 668 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 669 // CHECK4-NEXT: store i32 43, i32* [[A]], align 4 670 // CHECK4-NEXT: ret void 671 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper 672 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 673 // CHECK4-NEXT: entry: 674 // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 675 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 676 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 677 // CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 678 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 679 // CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 680 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 681 // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 682 // CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 683 // CHECK4-NEXT: ret void 684 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 685 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 686 // CHECK4-NEXT: entry: 687 // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 688 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 689 // CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4 690 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 691 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 692 // CHECK4-NEXT: store i32 44, i32* [[A]], align 4 693 // CHECK4-NEXT: ret void 694 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper 695 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 696 // CHECK4-NEXT: entry: 697 // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 698 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 699 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = 
alloca i32, align 4 700 // CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 701 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 702 // CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 703 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 704 // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 705 // CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 706 // CHECK4-NEXT: ret void 707 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker 708 // CHECK4-SAME: () #[[ATTR0]] { 709 // CHECK4-NEXT: entry: 710 // CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 711 // CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 712 // CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 713 // CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 714 // CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] 715 // CHECK4: .await.work: 716 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 717 // CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 718 // CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 719 // CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 720 // CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 721 // CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 722 // CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 723 // CHECK4: .select.workers: 724 // CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 725 // CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 726 // CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 727 // CHECK4: .execute.parallel: 728 // CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 729 // CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** 
[[WORK_FN]], align 4 730 // CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) 731 // CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 732 // CHECK4: .execute.fn: 733 // CHECK4-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 734 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 735 // CHECK4: .check.next: 736 // CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 737 // CHECK4-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 738 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]] 739 // CHECK4: .terminate.parallel: 740 // CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() 741 // CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] 742 // CHECK4: .barrier.parallel: 743 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 744 // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] 745 // CHECK4: .exit: 746 // CHECK4-NEXT: ret void 747 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 748 // CHECK4-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { 749 // CHECK4-NEXT: entry: 750 // CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 751 // CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 752 // CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 753 // CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 754 // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 755 // CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 756 // CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 757 // CHECK4-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 758 // CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 759 // CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* 760 // CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** 
[[B_ADDR]], align 4 761 // CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 762 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 763 // CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 764 // CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 765 // CHECK4-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 766 // CHECK4-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 767 // CHECK4: .worker: 768 // CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] 769 // CHECK4-NEXT: br label [[DOTEXIT:%.*]] 770 // CHECK4: .mastercheck: 771 // CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 772 // CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 773 // CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 774 // CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 775 // CHECK4-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 776 // CHECK4-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 777 // CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] 778 // CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 779 // CHECK4-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 780 // CHECK4: .master: 781 // CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 782 // CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 783 // CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 784 // CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 785 // CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() 786 // CHECK4-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* 
@[[GLOB1]]) 787 // CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 788 // CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 789 // CHECK4-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 790 // CHECK4-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 791 // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) 792 // CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 793 // CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 794 // CHECK4-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 795 // CHECK4-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 796 // CHECK4-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 797 // CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 798 // CHECK4-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 799 // CHECK4-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 800 // CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 801 // CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 802 // CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 803 // CHECK4-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 804 // CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 805 // CHECK4: .termination.notifier: 806 // CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) 807 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 808 // CHECK4-NEXT: br label [[DOTEXIT]] 809 // CHECK4: .exit: 810 // CHECK4-NEXT: ret void 811 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 812 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 813 // CHECK4-NEXT: entry: 814 // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 815 // CHECK4-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 816 // CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4 817 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 818 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 819 // CHECK4-NEXT: store i32 45, i32* [[A]], align 4 820 // CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 821 // CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 822 // CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) 823 // CHECK4-NEXT: ret void 824 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper 825 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 826 // CHECK4-NEXT: entry: 827 // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 828 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 829 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 830 // CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 831 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 832 // CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 833 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 834 // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 835 // CHECK4-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 836 // CHECK4-NEXT: ret void 837 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker 838 // CHECK4-SAME: () #[[ATTR0]] { 839 // CHECK4-NEXT: entry: 840 // CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 841 // CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 842 // CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 843 // CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 844 // CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] 845 // CHECK4: .await.work: 846 // CHECK4-NEXT: call void 
@__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 847 // CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 848 // CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 849 // CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 850 // CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 851 // CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 852 // CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 853 // CHECK4: .select.workers: 854 // CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 855 // CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 856 // CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 857 // CHECK4: .execute.parallel: 858 // CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 859 // CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 860 // CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) 861 // CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 862 // CHECK4: .execute.fn: 863 // CHECK4-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 864 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 865 // CHECK4: .check.next: 866 // CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 867 // CHECK4-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 868 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]] 869 // CHECK4: .terminate.parallel: 870 // CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() 871 // CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] 872 // CHECK4: .barrier.parallel: 873 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 874 // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] 875 // CHECK4: .exit: 876 // CHECK4-NEXT: ret void 877 // 
CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 878 // CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { 879 // CHECK4-NEXT: entry: 880 // CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 881 // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 882 // CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 883 // CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 884 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 885 // CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 886 // CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 887 // CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 888 // CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 889 // CHECK4: .worker: 890 // CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] 891 // CHECK4-NEXT: br label [[DOTEXIT:%.*]] 892 // CHECK4: .mastercheck: 893 // CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 894 // CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 895 // CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 896 // CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 897 // CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 898 // CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 899 // CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 900 // CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 901 // CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 902 // CHECK4: .master: 903 // CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 904 // CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.warpsize() 905 // CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 906 // CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 907 // CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() 908 // CHECK4-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) 909 // CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* 910 // CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4 911 // CHECK4-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 912 // CHECK4-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4 913 // CHECK4-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 914 // CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 915 // CHECK4-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8* 916 // CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 917 // CHECK4-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 918 // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1) 919 // CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4 920 // CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 921 // CHECK4-NEXT: store i32 [[INC]], i32* [[A7]], align 4 922 // CHECK4-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) 923 // CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 924 // CHECK4: .termination.notifier: 925 // CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) 926 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 927 // CHECK4-NEXT: br label [[DOTEXIT]] 928 // 
CHECK4: .exit: 929 // CHECK4-NEXT: ret void 930 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4 931 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { 932 // CHECK4-NEXT: entry: 933 // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 934 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 935 // CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 936 // CHECK4-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 937 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 938 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 939 // CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 940 // CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 941 // CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() 942 // CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 943 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 944 // CHECK4-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 945 // CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] 946 // CHECK4: omp.critical.loop: 947 // CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 948 // CHECK4-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] 949 // CHECK4-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] 950 // CHECK4: omp.critical.test: 951 // CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 952 // CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] 953 // CHECK4-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] 954 // CHECK4: omp.critical.body: 955 // CHECK4-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 956 // CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], 
align 4 957 // CHECK4-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 958 // CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 959 // CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 960 // CHECK4-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 961 // CHECK4-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 962 // CHECK4-NEXT: br label [[OMP_CRITICAL_SYNC]] 963 // CHECK4: omp.critical.sync: 964 // CHECK4-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) 965 // CHECK4-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 966 // CHECK4-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 967 // CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP]] 968 // CHECK4: omp.critical.exit: 969 // CHECK4-NEXT: ret void 970 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper 971 // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 972 // CHECK4-NEXT: entry: 973 // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 974 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 975 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 976 // CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 977 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 978 // CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 979 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 980 // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 981 // CHECK4-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 982 // CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 983 // CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** 984 // CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 985 // CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] 986 // CHECK4-NEXT: ret void 987 // 
CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker 988 // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { 989 // CHECK5-NEXT: entry: 990 // CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 991 // CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 992 // CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 993 // CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 994 // CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] 995 // CHECK5: .await.work: 996 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 997 // CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 998 // CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 999 // CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1000 // CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1001 // CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1002 // CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1003 // CHECK5: .select.workers: 1004 // CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1005 // CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1006 // CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1007 // CHECK5: .execute.parallel: 1008 // CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 1009 // CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1010 // CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) 1011 // CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 1012 // CHECK5: .execute.fn: 1013 // CHECK5-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] 1014 // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1015 // CHECK5: .check.next: 1016 // CHECK5-NEXT: 
[[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1017 // CHECK5-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) 1018 // CHECK5-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] 1019 // CHECK5: .execute.fn2: 1020 // CHECK5-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 1021 // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1022 // CHECK5: .check.next3: 1023 // CHECK5-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1024 // CHECK5-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) 1025 // CHECK5-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] 1026 // CHECK5: .execute.fn5: 1027 // CHECK5-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 1028 // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1029 // CHECK5: .check.next6: 1030 // CHECK5-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1031 // CHECK5-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) 1032 // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1033 // CHECK5: .terminate.parallel: 1034 // CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() 1035 // CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] 1036 // CHECK5: .barrier.parallel: 1037 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1038 // CHECK5-NEXT: br label [[DOTAWAIT_WORK]] 1039 // CHECK5: .exit: 1040 // CHECK5-NEXT: ret void 1041 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 1042 // CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] { 1043 // CHECK5-NEXT: entry: 1044 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 1045 // CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 1046 // CHECK5-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 1047 // CHECK5-NEXT: 
[[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 1048 // CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 1049 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1050 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1051 // CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1052 // CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1053 // CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1054 // CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1055 // CHECK5: .worker: 1056 // CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] 1057 // CHECK5-NEXT: br label [[DOTEXIT:%.*]] 1058 // CHECK5: .mastercheck: 1059 // CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1060 // CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1061 // CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1062 // CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 1063 // CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 1064 // CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 1065 // CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 1066 // CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 1067 // CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1068 // CHECK5: .master: 1069 // CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1070 // CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1071 // CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 1072 // CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 1073 // CHECK5-NEXT: call 
void @__kmpc_data_sharing_init_stack() 1074 // CHECK5-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1075 // CHECK5-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 1076 // CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) 1077 // CHECK5-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** 1078 // CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) 1079 // CHECK5-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** 1080 // CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) 1081 // CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 1082 // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 1083 // CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 1084 // CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1085 // CHECK5: .termination.notifier: 1086 // CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1087 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1088 // CHECK5-NEXT: br label [[DOTEXIT]] 1089 // CHECK5: .exit: 1090 // CHECK5-NEXT: ret void 1091 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ 1092 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 1093 // CHECK5-NEXT: entry: 1094 // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 
1095 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 1096 // CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 1097 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 1098 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 1099 // CHECK5-NEXT: store i32 42, i32* [[A]], align 4 1100 // CHECK5-NEXT: ret void 1101 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined___wrapper 1102 // CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 1103 // CHECK5-NEXT: entry: 1104 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 1105 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1106 // CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1107 // CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 1108 // CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1109 // CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 1110 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1111 // CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 1112 // CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 1113 // CHECK5-NEXT: ret void 1114 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 1115 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 1116 // CHECK5-NEXT: entry: 1117 // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 1118 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 1119 // CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 1120 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 1121 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 1122 // CHECK5-NEXT: store i32 43, i32* [[A]], align 4 1123 // CHECK5-NEXT: ret void 1124 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper 1125 // CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 
[[TMP1:%.*]]) #[[ATTR0]] { 1126 // CHECK5-NEXT: entry: 1127 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 1128 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1129 // CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1130 // CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 1131 // CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1132 // CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 1133 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1134 // CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 1135 // CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 1136 // CHECK5-NEXT: ret void 1137 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2 1138 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 1139 // CHECK5-NEXT: entry: 1140 // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 1141 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 1142 // CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 1143 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 1144 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 1145 // CHECK5-NEXT: store i32 44, i32* [[A]], align 4 1146 // CHECK5-NEXT: ret void 1147 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper 1148 // CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 1149 // CHECK5-NEXT: entry: 1150 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 1151 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1152 // CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1153 // CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 1154 // CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1155 // CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 1156 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1157 // 
CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 1158 // CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 1159 // CHECK5-NEXT: ret void 1160 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker 1161 // CHECK5-SAME: () #[[ATTR0]] { 1162 // CHECK5-NEXT: entry: 1163 // CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 1164 // CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1165 // CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 1166 // CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1167 // CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1168 // CHECK5: .await.work: 1169 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1170 // CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1171 // CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1172 // CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1173 // CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1174 // CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1175 // CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1176 // CHECK5: .select.workers: 1177 // CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1178 // CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1179 // CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1180 // CHECK5: .execute.parallel: 1181 // CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1182 // CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1183 // CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) 1184 // CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 1185 // CHECK5: 
.execute.fn: 1186 // CHECK5-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 1187 // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1188 // CHECK5: .check.next: 1189 // CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1190 // CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 1191 // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1192 // CHECK5: .terminate.parallel: 1193 // CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() 1194 // CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] 1195 // CHECK5: .barrier.parallel: 1196 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1197 // CHECK5-NEXT: br label [[DOTAWAIT_WORK]] 1198 // CHECK5: .exit: 1199 // CHECK5-NEXT: ret void 1200 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 1201 // CHECK5-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { 1202 // CHECK5-NEXT: entry: 1203 // CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 1204 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 1205 // CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 1206 // CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 1207 // CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 1208 // CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 1209 // CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 1210 // CHECK5-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 1211 // CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 1212 // CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* 1213 // CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 1214 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1215 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1216 // CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = 
call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1217 // CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1218 // CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1219 // CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1220 // CHECK5: .worker: 1221 // CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] 1222 // CHECK5-NEXT: br label [[DOTEXIT:%.*]] 1223 // CHECK5: .mastercheck: 1224 // CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1225 // CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1226 // CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1227 // CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 1228 // CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 1229 // CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 1230 // CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] 1231 // CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 1232 // CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1233 // CHECK5: .master: 1234 // CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1235 // CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1236 // CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 1237 // CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 1238 // CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() 1239 // CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1240 // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 1241 // CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 1242 // CHECK5-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 1243 // 
CHECK5-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 1244 // CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) 1245 // CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 1246 // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 1247 // CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 1248 // CHECK5-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 1249 // CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 1250 // CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 1251 // CHECK5-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 1252 // CHECK5-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 1253 // CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 1254 // CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 1255 // CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 1256 // CHECK5-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 1257 // CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1258 // CHECK5: .termination.notifier: 1259 // CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1260 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1261 // CHECK5-NEXT: br label [[DOTEXIT]] 1262 // CHECK5: .exit: 1263 // CHECK5-NEXT: ret void 1264 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3 1265 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 1266 // CHECK5-NEXT: entry: 1267 // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 1268 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 1269 // CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 1270 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** 
[[DOTGLOBAL_TID__ADDR]], align 4 1271 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 1272 // CHECK5-NEXT: store i32 45, i32* [[A]], align 4 1273 // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 1274 // CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 1275 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) 1276 // CHECK5-NEXT: ret void 1277 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper 1278 // CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 1279 // CHECK5-NEXT: entry: 1280 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 1281 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1282 // CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1283 // CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 1284 // CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1285 // CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 1286 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1287 // CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 1288 // CHECK5-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 1289 // CHECK5-NEXT: ret void 1290 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker 1291 // CHECK5-SAME: () #[[ATTR0]] { 1292 // CHECK5-NEXT: entry: 1293 // CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 1294 // CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1295 // CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 1296 // CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1297 // CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1298 // CHECK5: .await.work: 1299 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1300 // CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1301 // CHECK5-NEXT: 
[[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1302 // CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1303 // CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1304 // CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1305 // CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1306 // CHECK5: .select.workers: 1307 // CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1308 // CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1309 // CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1310 // CHECK5: .execute.parallel: 1311 // CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1312 // CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1313 // CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) 1314 // CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 1315 // CHECK5: .execute.fn: 1316 // CHECK5-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 1317 // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1318 // CHECK5: .check.next: 1319 // CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1320 // CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 1321 // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1322 // CHECK5: .terminate.parallel: 1323 // CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() 1324 // CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] 1325 // CHECK5: .barrier.parallel: 1326 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1327 // CHECK5-NEXT: br label [[DOTAWAIT_WORK]] 1328 // CHECK5: .exit: 1329 // CHECK5-NEXT: ret void 1330 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 1331 // CHECK5-SAME: (i32 [[A:%.*]]) 
#[[ATTR1]] { 1332 // CHECK5-NEXT: entry: 1333 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 1334 // CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 1335 // CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 1336 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1337 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1338 // CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1339 // CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1340 // CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1341 // CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1342 // CHECK5: .worker: 1343 // CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] 1344 // CHECK5-NEXT: br label [[DOTEXIT:%.*]] 1345 // CHECK5: .mastercheck: 1346 // CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1347 // CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1348 // CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1349 // CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 1350 // CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 1351 // CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 1352 // CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 1353 // CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 1354 // CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1355 // CHECK5: .master: 1356 // CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1357 // CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1358 // CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 
[[NVPTX_WARP_SIZE5]] 1359 // CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 1360 // CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() 1361 // CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) 1362 // CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* 1363 // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4 1364 // CHECK5-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 1365 // CHECK5-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4 1366 // CHECK5-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1367 // CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 1368 // CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8* 1369 // CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 1370 // CHECK5-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 1371 // CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1) 1372 // CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4 1373 // CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 1374 // CHECK5-NEXT: store i32 [[INC]], i32* [[A7]], align 4 1375 // CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) 1376 // CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1377 // CHECK5: .termination.notifier: 1378 // CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1379 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1380 // CHECK5-NEXT: br label [[DOTEXIT]] 1381 // CHECK5: .exit: 1382 // CHECK5-NEXT: ret void 1383 // CHECK5-LABEL: define 
{{[^@]+}}@__omp_outlined__4 1384 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { 1385 // CHECK5-NEXT: entry: 1386 // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 1387 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 1388 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 1389 // CHECK5-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 1390 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 1391 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 1392 // CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 1393 // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 1394 // CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() 1395 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1396 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1397 // CHECK5-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 1398 // CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] 1399 // CHECK5: omp.critical.loop: 1400 // CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 1401 // CHECK5-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] 1402 // CHECK5-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] 1403 // CHECK5: omp.critical.test: 1404 // CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 1405 // CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] 1406 // CHECK5-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] 1407 // CHECK5: omp.critical.body: 1408 // CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 1409 // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 1410 // CHECK5-NEXT: call void 
@__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 1411 // CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 1412 // CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 1413 // CHECK5-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 1414 // CHECK5-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 1415 // CHECK5-NEXT: br label [[OMP_CRITICAL_SYNC]] 1416 // CHECK5: omp.critical.sync: 1417 // CHECK5-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) 1418 // CHECK5-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 1419 // CHECK5-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 1420 // CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP]] 1421 // CHECK5: omp.critical.exit: 1422 // CHECK5-NEXT: ret void 1423 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper 1424 // CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 1425 // CHECK5-NEXT: entry: 1426 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 1427 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1428 // CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1429 // CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 1430 // CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1431 // CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 1432 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1433 // CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 1434 // CHECK5-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 1435 // CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 1436 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** 1437 // CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 1438 // CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] 1439 // CHECK5-NEXT: ret void 1440 // 
CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26_worker 1441 // CHECK1-SAME: () #[[ATTR0:[0-9]+]] { 1442 // CHECK1-NEXT: entry: 1443 // CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 1444 // CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1445 // CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 1446 // CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1447 // CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1448 // CHECK1: .await.work: 1449 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1450 // CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1451 // CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1452 // CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1453 // CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 1454 // CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1455 // CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1456 // CHECK1: .select.workers: 1457 // CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1458 // CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1459 // CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1460 // CHECK1: .execute.parallel: 1461 // CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 1462 // CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 1463 // CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) 1464 // CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 1465 // CHECK1: .execute.fn: 1466 // CHECK1-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] 1467 // CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1468 // CHECK1: .check.next: 1469 // 
CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8 1470 // CHECK1-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) 1471 // CHECK1-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] 1472 // CHECK1: .execute.fn2: 1473 // CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 1474 // CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1475 // CHECK1: .check.next3: 1476 // CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 8 1477 // CHECK1-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) 1478 // CHECK1-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] 1479 // CHECK1: .execute.fn5: 1480 // CHECK1-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 1481 // CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1482 // CHECK1: .check.next6: 1483 // CHECK1-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1484 // CHECK1-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) 1485 // CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1486 // CHECK1: .terminate.parallel: 1487 // CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() 1488 // CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] 1489 // CHECK1: .barrier.parallel: 1490 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1491 // CHECK1-NEXT: br label [[DOTAWAIT_WORK]] 1492 // CHECK1: .exit: 1493 // CHECK1-NEXT: ret void 1494 // 1495 // 1496 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26 1497 // CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1:[0-9]+]] { 1498 // CHECK1-NEXT: entry: 1499 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 1500 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 1501 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 8 1502 
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 8 1503 // CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 1504 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* 1505 // CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1506 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1507 // CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1508 // CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1509 // CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1510 // CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1511 // CHECK1: .worker: 1512 // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26_worker() #[[ATTR3]] 1513 // CHECK1-NEXT: br label [[DOTEXIT:%.*]] 1514 // CHECK1: .mastercheck: 1515 // CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1516 // CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1517 // CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1518 // CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 1519 // CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 1520 // CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 1521 // CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 1522 // CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 1523 // CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1524 // CHECK1: .master: 1525 // CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1526 // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1527 // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 1528 // CHECK1-NEXT: call 
void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 1529 // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1530 // CHECK1-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 1531 // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i64 0) 1532 // CHECK1-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** 1533 // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i64 0) 1534 // CHECK1-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** 1535 // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i64 0) 1536 // CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 1537 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 1538 // CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 1539 // CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1540 // CHECK1: .termination.notifier: 1541 // CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1542 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1543 // CHECK1-NEXT: br label [[DOTEXIT]] 1544 // CHECK1: .exit: 1545 // CHECK1-NEXT: ret void 1546 // 1547 // 1548 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ 1549 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 1550 // CHECK1-NEXT: entry: 1551 // CHECK1-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 1552 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 1553 // CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 1554 // CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 1555 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 1556 // CHECK1-NEXT: store i32 42, i32* [[A]], align 4 1557 // CHECK1-NEXT: ret void 1558 // 1559 // 1560 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper 1561 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 1562 // CHECK1-NEXT: entry: 1563 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 1564 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1565 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1566 // CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 1567 // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1568 // CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 1569 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1570 // CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 1571 // CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 1572 // CHECK1-NEXT: ret void 1573 // 1574 // 1575 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 1576 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 1577 // CHECK1-NEXT: entry: 1578 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 1579 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 1580 // CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 1581 // CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 1582 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 1583 // CHECK1-NEXT: store i32 43, i32* [[A]], align 4 1584 // CHECK1-NEXT: ret void 1585 // 1586 // 1587 // CHECK1-LABEL: 
define {{[^@]+}}@__omp_outlined__1_wrapper 1588 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 1589 // CHECK1-NEXT: entry: 1590 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 1591 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1592 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1593 // CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 1594 // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1595 // CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 1596 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1597 // CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 1598 // CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 1599 // CHECK1-NEXT: ret void 1600 // 1601 // 1602 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 1603 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 1604 // CHECK1-NEXT: entry: 1605 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 1606 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 1607 // CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 1608 // CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 1609 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 1610 // CHECK1-NEXT: store i32 44, i32* [[A]], align 4 1611 // CHECK1-NEXT: ret void 1612 // 1613 // 1614 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper 1615 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 1616 // CHECK1-NEXT: entry: 1617 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 1618 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1619 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1620 // CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 1621 // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1622 // 
CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 1623 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1624 // CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 1625 // CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 1626 // CHECK1-NEXT: ret void 1627 // 1628 // 1629 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_worker 1630 // CHECK1-SAME: () #[[ATTR0]] { 1631 // CHECK1-NEXT: entry: 1632 // CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 1633 // CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1634 // CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 1635 // CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1636 // CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1637 // CHECK1: .await.work: 1638 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1639 // CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1640 // CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1641 // CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1642 // CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 1643 // CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1644 // CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1645 // CHECK1: .select.workers: 1646 // CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1647 // CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1648 // CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1649 // CHECK1: .execute.parallel: 1650 // CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1651 // CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 1652 // CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* 
@__omp_outlined__3_wrapper to i8*) 1653 // CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 1654 // CHECK1: .execute.fn: 1655 // CHECK1-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 1656 // CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1657 // CHECK1: .check.next: 1658 // CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1659 // CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 1660 // CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1661 // CHECK1: .terminate.parallel: 1662 // CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() 1663 // CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] 1664 // CHECK1: .barrier.parallel: 1665 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1666 // CHECK1-NEXT: br label [[DOTAWAIT_WORK]] 1667 // CHECK1: .exit: 1668 // CHECK1-NEXT: ret void 1669 // 1670 // 1671 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 1672 // CHECK1-SAME: (i64 [[N:%.*]], i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { 1673 // CHECK1-NEXT: entry: 1674 // CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 1675 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 1676 // CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 1677 // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 1678 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 1679 // CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 1680 // CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 1681 // CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 1682 // CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 1683 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* 1684 // CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32* 1685 // CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16* 
1686 // CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 1687 // CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1688 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1689 // CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1690 // CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1691 // CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1692 // CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1693 // CHECK1: .worker: 1694 // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_worker() #[[ATTR3]] 1695 // CHECK1-NEXT: br label [[DOTEXIT:%.*]] 1696 // CHECK1: .mastercheck: 1697 // CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1698 // CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1699 // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1700 // CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 1701 // CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 1702 // CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 1703 // CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] 1704 // CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] 1705 // CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1706 // CHECK1: .master: 1707 // CHECK1-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1708 // CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1709 // CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] 1710 // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) 1711 // CHECK1-NEXT: [[TMP6:%.*]] = call i32 
@__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1712 // CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 1713 // CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 1714 // CHECK1-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 1715 // CHECK1-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 1716 // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i64 0) 1717 // CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV1]], align 8 1718 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 1719 // CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV1]], align 8 1720 // CHECK1-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV2]], align 8 1721 // CHECK1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP11]] to i32 1722 // CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1 1723 // CHECK1-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16 1724 // CHECK1-NEXT: store i16 [[CONV11]], i16* [[CONV2]], align 8 1725 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2 1726 // CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 1727 // CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP12]], 1 1728 // CHECK1-NEXT: store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4 1729 // CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1730 // CHECK1: .termination.notifier: 1731 // CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1732 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1733 // CHECK1-NEXT: br label [[DOTEXIT]] 1734 // CHECK1: .exit: 1735 // CHECK1-NEXT: ret void 1736 // 1737 // 1738 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 1739 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { 1740 // CHECK1-NEXT: entry: 
1741 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 1742 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 1743 // CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 1744 // CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 1745 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 1746 // CHECK1-NEXT: store i32 45, i32* [[A]], align 4 1747 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 1748 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 1749 // CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) 1750 // CHECK1-NEXT: ret void 1751 // 1752 // 1753 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper 1754 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 1755 // CHECK1-NEXT: entry: 1756 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 1757 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1758 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1759 // CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 1760 // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1761 // CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 1762 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1763 // CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 1764 // CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] 1765 // CHECK1-NEXT: ret void 1766 // 1767 // 1768 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55_worker 1769 // CHECK1-SAME: () #[[ATTR0]] { 1770 // CHECK1-NEXT: entry: 1771 // CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 1772 // CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1773 // CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 1774 // CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 
1 1775 // CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1776 // CHECK1: .await.work: 1777 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1778 // CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1779 // CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1780 // CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1781 // CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 1782 // CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1783 // CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1784 // CHECK1: .select.workers: 1785 // CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1786 // CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1787 // CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1788 // CHECK1: .execute.parallel: 1789 // CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1790 // CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 1791 // CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) 1792 // CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 1793 // CHECK1: .execute.fn: 1794 // CHECK1-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] 1795 // CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1796 // CHECK1: .check.next: 1797 // CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1798 // CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 1799 // CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1800 // CHECK1: .terminate.parallel: 1801 // CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() 1802 // CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] 1803 // CHECK1: .barrier.parallel: 1804 // CHECK1-NEXT: call void 
@__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1805 // CHECK1-NEXT: br label [[DOTAWAIT_WORK]] 1806 // CHECK1: .exit: 1807 // CHECK1-NEXT: ret void 1808 // 1809 // 1810 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55 1811 // CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1]] { 1812 // CHECK1-NEXT: entry: 1813 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 1814 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 1815 // CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 1816 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* 1817 // CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1818 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1819 // CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1820 // CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1821 // CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1822 // CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1823 // CHECK1: .worker: 1824 // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55_worker() #[[ATTR3]] 1825 // CHECK1-NEXT: br label [[DOTEXIT:%.*]] 1826 // CHECK1: .mastercheck: 1827 // CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1828 // CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1829 // CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1830 // CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 1831 // CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 1832 // CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 1833 // CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 1834 // CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 
1835 // CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1836 // CHECK1: .master: 1837 // CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1838 // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1839 // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 1840 // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 1841 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 1842 // CHECK1-NEXT: [[A7:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) 1843 // CHECK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A7]] to i32* 1844 // CHECK1-NEXT: store i32 [[TMP5]], i32* [[A_ON_STACK]], align 4 1845 // CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1846 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 1847 // CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[A_ON_STACK]] to i8* 1848 // CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 1849 // CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 1850 // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP9]], i64 1) 1851 // CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ON_STACK]], align 4 1852 // CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP10]], 1 1853 // CHECK1-NEXT: store i32 [[INC]], i32* [[A_ON_STACK]], align 4 1854 // CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[A7]]) 1855 // CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1856 // CHECK1: .termination.notifier: 1857 // CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1858 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1859 // 
CHECK1-NEXT: br label [[DOTEXIT]] 1860 // CHECK1: .exit: 1861 // CHECK1-NEXT: ret void 1862 // 1863 // 1864 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4 1865 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { 1866 // CHECK1-NEXT: entry: 1867 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 1868 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 1869 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 1870 // CHECK1-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 1871 // CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 1872 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 1873 // CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 1874 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 1875 // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() 1876 // CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1877 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1878 // CHECK1-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 1879 // CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] 1880 // CHECK1: omp.critical.loop: 1881 // CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 1882 // CHECK1-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] 1883 // CHECK1-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] 1884 // CHECK1: omp.critical.test: 1885 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 1886 // CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] 1887 // CHECK1-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] 1888 // CHECK1: omp.critical.body: 1889 // CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** 
[[DOTGLOBAL_TID__ADDR]], align 8 1890 // CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 1891 // CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 1892 // CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 1893 // CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 1894 // CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 1895 // CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 1896 // CHECK1-NEXT: br label [[OMP_CRITICAL_SYNC]] 1897 // CHECK1: omp.critical.sync: 1898 // CHECK1-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) 1899 // CHECK1-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 1900 // CHECK1-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 1901 // CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP]] 1902 // CHECK1: omp.critical.exit: 1903 // CHECK1-NEXT: ret void 1904 // 1905 // 1906 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper 1907 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 1908 // CHECK1-NEXT: entry: 1909 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 1910 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1911 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1912 // CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 1913 // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1914 // CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 1915 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1916 // CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 1917 // CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 1918 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 1919 // CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** 1920 // CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 1921 // 
CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] 1922 // CHECK1-NEXT: ret void 1923 // 1924 // 1925 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26_worker 1926 // CHECK2-SAME: () #[[ATTR0:[0-9]+]] { 1927 // CHECK2-NEXT: entry: 1928 // CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 1929 // CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1930 // CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 1931 // CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1932 // CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1933 // CHECK2: .await.work: 1934 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1935 // CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1936 // CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1937 // CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1938 // CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1939 // CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1940 // CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1941 // CHECK2: .select.workers: 1942 // CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1943 // CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1944 // CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1945 // CHECK2: .execute.parallel: 1946 // CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 1947 // CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1948 // CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) 1949 // CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 1950 // CHECK2: .execute.fn: 1951 // CHECK2-NEXT: call void 
@__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2:[0-9]+]] 1952 // CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1953 // CHECK2: .check.next: 1954 // CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1955 // CHECK2-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) 1956 // CHECK2-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] 1957 // CHECK2: .execute.fn2: 1958 // CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] 1959 // CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1960 // CHECK2: .check.next3: 1961 // CHECK2-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1962 // CHECK2-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) 1963 // CHECK2-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] 1964 // CHECK2: .execute.fn5: 1965 // CHECK2-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] 1966 // CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1967 // CHECK2: .check.next6: 1968 // CHECK2-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1969 // CHECK2-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) 1970 // CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] 1971 // CHECK2: .terminate.parallel: 1972 // CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() 1973 // CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] 1974 // CHECK2: .barrier.parallel: 1975 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1976 // CHECK2-NEXT: br label [[DOTAWAIT_WORK]] 1977 // CHECK2: .exit: 1978 // CHECK2-NEXT: ret void 1979 // 1980 // 1981 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26 1982 // CHECK2-SAME: (i32 [[A:%.*]]) #[[ATTR0]] { 1983 // CHECK2-NEXT: entry: 1984 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 1985 
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 1986 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 1987 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 1988 // CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 1989 // CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1990 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1991 // CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1992 // CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1993 // CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1994 // CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1995 // CHECK2: .worker: 1996 // CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26_worker() #[[ATTR2]] 1997 // CHECK2-NEXT: br label [[DOTEXIT:%.*]] 1998 // CHECK2: .mastercheck: 1999 // CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 2000 // CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 2001 // CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 2002 // CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 2003 // CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 2004 // CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 2005 // CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 2006 // CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 2007 // CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 2008 // CHECK2: .master: 2009 // CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 2010 // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 2011 // CHECK2-NEXT: 
[[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 2012 // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 2013 // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 2014 // CHECK2-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 2015 // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) 2016 // CHECK2-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** 2017 // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) 2018 // CHECK2-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** 2019 // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) 2020 // CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 2021 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 2022 // CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 2023 // CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 2024 // CHECK2: .termination.notifier: 2025 // CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) 2026 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 2027 // CHECK2-NEXT: br label [[DOTEXIT]] 2028 // CHECK2: .exit: 2029 // CHECK2-NEXT: ret void 2030 // 2031 // 2032 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ 2033 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* 
noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 2034 // CHECK2-NEXT: entry: 2035 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 2036 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 2037 // CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 2038 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 2039 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 2040 // CHECK2-NEXT: store i32 42, i32* [[A]], align 4 2041 // CHECK2-NEXT: ret void 2042 // 2043 // 2044 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper 2045 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 2046 // CHECK2-NEXT: entry: 2047 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 2048 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 2049 // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 2050 // CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 2051 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 2052 // CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 2053 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 2054 // CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 2055 // CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] 2056 // CHECK2-NEXT: ret void 2057 // 2058 // 2059 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 2060 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 2061 // CHECK2-NEXT: entry: 2062 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 2063 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 2064 // CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 2065 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 2066 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 2067 // CHECK2-NEXT: store i32 
43, i32* [[A]], align 4 2068 // CHECK2-NEXT: ret void 2069 // 2070 // 2071 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper 2072 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 2073 // CHECK2-NEXT: entry: 2074 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 2075 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 2076 // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 2077 // CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 2078 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 2079 // CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 2080 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 2081 // CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 2082 // CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] 2083 // CHECK2-NEXT: ret void 2084 // 2085 // 2086 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 2087 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 2088 // CHECK2-NEXT: entry: 2089 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 2090 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 2091 // CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 2092 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 2093 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 2094 // CHECK2-NEXT: store i32 44, i32* [[A]], align 4 2095 // CHECK2-NEXT: ret void 2096 // 2097 // 2098 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper 2099 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 2100 // CHECK2-NEXT: entry: 2101 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 2102 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 2103 // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 2104 // CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca 
i8**, align 4 2105 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 2106 // CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 2107 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 2108 // CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 2109 // CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] 2110 // CHECK2-NEXT: ret void 2111 // 2112 // 2113 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_worker 2114 // CHECK2-SAME: () #[[ATTR0]] { 2115 // CHECK2-NEXT: entry: 2116 // CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 2117 // CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 2118 // CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 2119 // CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 2120 // CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] 2121 // CHECK2: .await.work: 2122 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 2123 // CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 2124 // CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 2125 // CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 2126 // CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 2127 // CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 2128 // CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 2129 // CHECK2: .select.workers: 2130 // CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 2131 // CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 2132 // CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 2133 // CHECK2: .execute.parallel: 2134 // CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 2135 // CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 2136 // 
CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) 2137 // CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 2138 // CHECK2: .execute.fn: 2139 // CHECK2-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] 2140 // CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 2141 // CHECK2: .check.next: 2142 // CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 2143 // CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 2144 // CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] 2145 // CHECK2: .terminate.parallel: 2146 // CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() 2147 // CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] 2148 // CHECK2: .barrier.parallel: 2149 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 2150 // CHECK2-NEXT: br label [[DOTAWAIT_WORK]] 2151 // CHECK2: .exit: 2152 // CHECK2-NEXT: ret void 2153 // 2154 // 2155 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 2156 // CHECK2-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 2157 // CHECK2-NEXT: entry: 2158 // CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 2159 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 2160 // CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 2161 // CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 2162 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 2163 // CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 2164 // CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 2165 // CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 2166 // CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 2167 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* 2168 // CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x 
i32]** [[B_ADDR]], align 4 2169 // CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 2170 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 2171 // CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 2172 // CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 2173 // CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 2174 // CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 2175 // CHECK2: .worker: 2176 // CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_worker() #[[ATTR2]] 2177 // CHECK2-NEXT: br label [[DOTEXIT:%.*]] 2178 // CHECK2: .mastercheck: 2179 // CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 2180 // CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 2181 // CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 2182 // CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 2183 // CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 2184 // CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 2185 // CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] 2186 // CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 2187 // CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 2188 // CHECK2: .master: 2189 // CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 2190 // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 2191 // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 2192 // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 2193 // CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 2194 // CHECK2-NEXT: 
[[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 2195 // CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 2196 // CHECK2-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 2197 // CHECK2-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 2198 // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) 2199 // CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 2200 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 2201 // CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 2202 // CHECK2-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 2203 // CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 2204 // CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 2205 // CHECK2-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 2206 // CHECK2-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 2207 // CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 2208 // CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 2209 // CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 2210 // CHECK2-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 2211 // CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 2212 // CHECK2: .termination.notifier: 2213 // CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) 2214 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 2215 // CHECK2-NEXT: br label [[DOTEXIT]] 2216 // CHECK2: .exit: 2217 // CHECK2-NEXT: ret void 2218 // 2219 // 2220 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 2221 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { 2222 // CHECK2-NEXT: entry: 2223 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 2224 // 
CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 2225 // CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 2226 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 2227 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 2228 // CHECK2-NEXT: store i32 45, i32* [[A]], align 4 2229 // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 2230 // CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 2231 // CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) 2232 // CHECK2-NEXT: ret void 2233 // 2234 // 2235 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper 2236 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 2237 // CHECK2-NEXT: entry: 2238 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 2239 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 2240 // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 2241 // CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 2242 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 2243 // CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 2244 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 2245 // CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 2246 // CHECK2-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] 2247 // CHECK2-NEXT: ret void 2248 // 2249 // 2250 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55_worker 2251 // CHECK2-SAME: () #[[ATTR0]] { 2252 // CHECK2-NEXT: entry: 2253 // CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 2254 // CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 2255 // CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 2256 // CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 2257 // CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] 2258 // CHECK2: 
.await.work: 2259 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 2260 // CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 2261 // CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 2262 // CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 2263 // CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 2264 // CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 2265 // CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 2266 // CHECK2: .select.workers: 2267 // CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 2268 // CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 2269 // CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 2270 // CHECK2: .execute.parallel: 2271 // CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 2272 // CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 2273 // CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) 2274 // CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] 2275 // CHECK2: .execute.fn: 2276 // CHECK2-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] 2277 // CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 2278 // CHECK2: .check.next: 2279 // CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 2280 // CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) 2281 // CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] 2282 // CHECK2: .terminate.parallel: 2283 // CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() 2284 // CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] 2285 // CHECK2: .barrier.parallel: 2286 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 2287 // CHECK2-NEXT: br label 
[[DOTAWAIT_WORK]] 2288 // CHECK2: .exit: 2289 // CHECK2-NEXT: ret void 2290 // 2291 // 2292 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55 2293 // CHECK2-SAME: (i32 [[A:%.*]]) #[[ATTR0]] { 2294 // CHECK2-NEXT: entry: 2295 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 2296 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 2297 // CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 2298 // CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 2299 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 2300 // CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 2301 // CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 2302 // CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 2303 // CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 2304 // CHECK2: .worker: 2305 // CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55_worker() #[[ATTR2]] 2306 // CHECK2-NEXT: br label [[DOTEXIT:%.*]] 2307 // CHECK2: .mastercheck: 2308 // CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 2309 // CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 2310 // CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 2311 // CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 2312 // CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 2313 // CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 2314 // CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 2315 // CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 2316 // CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 2317 // CHECK2: .master: 2318 // CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = 
call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 2319 // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 2320 // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 2321 // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 2322 // CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[A_ADDR]], align 4 2323 // CHECK2-NEXT: [[A7:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) 2324 // CHECK2-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A7]] to i32* 2325 // CHECK2-NEXT: store i32 [[TMP5]], i32* [[A_ON_STACK]], align 4 2326 // CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 2327 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 2328 // CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[A_ON_STACK]] to i8* 2329 // CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 2330 // CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 2331 // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP9]], i32 1) 2332 // CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ON_STACK]], align 4 2333 // CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP10]], 1 2334 // CHECK2-NEXT: store i32 [[INC]], i32* [[A_ON_STACK]], align 4 2335 // CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[A7]]) 2336 // CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 2337 // CHECK2: .termination.notifier: 2338 // CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) 2339 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 2340 // CHECK2-NEXT: br label [[DOTEXIT]] 2341 // CHECK2: .exit: 2342 // CHECK2-NEXT: ret void 2343 // 2344 // 2345 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 
2346 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] { 2347 // CHECK2-NEXT: entry: 2348 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 2349 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 2350 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 2351 // CHECK2-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 2352 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 2353 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 2354 // CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 2355 // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 2356 // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() 2357 // CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 2358 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 2359 // CHECK2-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 2360 // CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] 2361 // CHECK2: omp.critical.loop: 2362 // CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 2363 // CHECK2-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] 2364 // CHECK2-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] 2365 // CHECK2: omp.critical.test: 2366 // CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 2367 // CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] 2368 // CHECK2-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] 2369 // CHECK2: omp.critical.body: 2370 // CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 2371 // CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 2372 // CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* 
@[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 2373 // CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 2374 // CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 2375 // CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 2376 // CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 2377 // CHECK2-NEXT: br label [[OMP_CRITICAL_SYNC]] 2378 // CHECK2: omp.critical.sync: 2379 // CHECK2-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) 2380 // CHECK2-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 2381 // CHECK2-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 2382 // CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP]] 2383 // CHECK2: omp.critical.exit: 2384 // CHECK2-NEXT: ret void 2385 // 2386 // 2387 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper 2388 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 2389 // CHECK2-NEXT: entry: 2390 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 2391 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 2392 // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 2393 // CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 2394 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 2395 // CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 2396 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 2397 // CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) 2398 // CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 2399 // CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 2400 // CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** 2401 // CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 2402 // CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR2]] 2403 // CHECK2-NEXT: ret void 2404 // 2405