1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ 2 // Test target codegen - host bc file has to be created first. 3 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc 4 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1 5 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2 6 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc 7 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3 8 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK4 9 // expected-no-diagnostics 10 #ifndef HEADER 11 #define HEADER 12 13 #ifdef CK1 14 15 template <typename T> 16 int tmain(T argc) { 17 #pragma omp target 18 #pragma omp teams 19 argc = 0; 20 return 0; 21 } 22 23 24 int main (int argc, char **argv) { 25 #pragma omp target 26 #pragma omp teams 27 { 28 argc = 0; 29 } 30 return tmain(argv); 31 } 32 33 34 // only nvptx side: do not outline teams region and do not call fork_teams 35 36 37 // target region in template 38 39 40 41 #endif // CK1 42 43 // Test target codegen - host bc file has to be created first. 44 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc 45 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK5 46 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK6 47 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc 48 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK7 49 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK8 50 // expected-no-diagnostics 51 #ifdef CK2 52 53 template <typename T> 54 int tmain(T argc) { 55 int a = 10; 56 int b = 5; 57 #pragma omp target 58 #pragma omp teams num_teams(a) thread_limit(b) 59 { 60 argc = 0; 61 } 62 return 0; 63 } 64 65 int main (int argc, char **argv) { 66 int a = 20; 67 int b = 5; 68 #pragma omp target 69 #pragma omp teams num_teams(a) thread_limit(b) 70 { 71 argc = 0; 72 } 73 return tmain(argv); 74 } 75 76 77 78 79 80 81 #endif // CK2 82 #endif 83 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker 84 // CHECK1-SAME: () #[[ATTR0:[0-9]+]] { 85 // CHECK1-NEXT: entry: 86 // CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 87 // CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 88 // CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 89 // CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 90 // CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] 91 // CHECK1: .await.work: 92 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 93 // CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 94 // CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 95 // CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 96 // CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 97 // CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 98 // CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 99 // CHECK1: .select.workers: 100 // CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 101 // CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 102 // CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 103 // CHECK1: .execute.parallel: 104 // CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 105 // CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 106 // CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 107 // CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 108 // CHECK1: .terminate.parallel: 109 // CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() 110 // CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] 111 // CHECK1: .barrier.parallel: 112 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 113 // CHECK1-NEXT: br label [[DOTAWAIT_WORK]] 114 // CHECK1: .exit: 115 // CHECK1-NEXT: ret void 116 // 117 // 118 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 119 // CHECK1-SAME: (i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { 120 // CHECK1-NEXT: entry: 121 // CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 122 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 123 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 124 // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 125 // CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 126 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* 127 // CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 128 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 129 // CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 130 // CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 131 // CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 132 // CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 133 // CHECK1: .worker: 134 // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR3:[0-9]+]] 135 // CHECK1-NEXT: br label [[DOTEXIT:%.*]] 136 // CHECK1: .mastercheck: 137 // CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 138 // CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 139 // CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 140 // CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 141 // CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 142 // CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 143 // CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 144 // CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 145 // CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 146 // CHECK1: .master: 147 // CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 148 // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 149 // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 150 // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 151 // CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() 152 // CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 153 // CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 154 // CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) 155 // CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 156 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 157 // CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* 158 // CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 8 159 // CHECK1-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 160 // CHECK1-NEXT: store i32 [[TMP10]], i32* [[ARGC7]], align 4 161 // CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 162 // CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 163 // CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] 164 // CHECK1-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 165 // CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) 166 // CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 167 // CHECK1: .termination.notifier: 168 // CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) 169 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 170 // CHECK1-NEXT: br label [[DOTEXIT]] 171 // CHECK1: .exit: 172 // CHECK1-NEXT: ret void 173 // 174 // 175 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ 176 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { 177 // CHECK1-NEXT: entry: 178 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 179 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 180 // CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 181 // CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 182 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 183 // CHECK1-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 184 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 185 // CHECK1-NEXT: store i32 0, i32* [[TMP0]], align 4 186 // CHECK1-NEXT: ret void 187 // 188 // 189 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker 190 // CHECK1-SAME: () #[[ATTR0]] { 191 // CHECK1-NEXT: entry: 192 // CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 193 // CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 194 // CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 195 // CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 196 // CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] 197 // CHECK1: .await.work: 198 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 199 // CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 200 // CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 201 // CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 202 // CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 203 // CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 204 // CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 205 // CHECK1: .select.workers: 206 // CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 207 // CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 208 // CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 209 // CHECK1: .execute.parallel: 210 // CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 211 // CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 212 // CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 213 // CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 214 // CHECK1: .terminate.parallel: 215 // CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() 216 // CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] 217 // CHECK1: .barrier.parallel: 218 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 219 // CHECK1-NEXT: br label [[DOTAWAIT_WORK]] 220 // CHECK1: .exit: 221 // CHECK1-NEXT: ret void 222 // 223 // 224 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17 225 // CHECK1-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] { 226 // CHECK1-NEXT: entry: 227 // CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 8 228 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 229 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 230 // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 231 // CHECK1-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8 232 // CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 233 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 234 // CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 235 // CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 236 // CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 237 // CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 238 // CHECK1: .worker: 239 // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker() #[[ATTR3]] 240 // CHECK1-NEXT: br label [[DOTEXIT:%.*]] 241 // CHECK1: .mastercheck: 242 // CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 243 // CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 244 // CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 245 // CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 246 // CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 247 // CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 248 // CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 249 // CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 250 // CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 251 // CHECK1: .master: 252 // CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 253 // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 254 // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 255 // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 256 // CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() 257 // CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 258 // CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size2", align 8 259 // CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) 260 // CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 261 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 262 // CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0* 263 // CHECK1-NEXT: [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8 264 // CHECK1-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0 265 // CHECK1-NEXT: store i8** [[TMP10]], i8*** [[ARGC7]], align 8 266 // CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 267 // CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 268 // CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]] 269 // CHECK1-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 270 // CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) 271 // CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 272 // CHECK1: .termination.notifier: 273 // CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) 274 // CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 275 // CHECK1-NEXT: br label [[DOTEXIT]] 276 // CHECK1: .exit: 277 // CHECK1-NEXT: ret void 278 // 279 // 280 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 281 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] { 282 // CHECK1-NEXT: entry: 283 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 284 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 285 // CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 8 286 // CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 287 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 288 // CHECK1-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8 289 // CHECK1-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8 290 // CHECK1-NEXT: store i8** null, i8*** [[TMP0]], align 8 291 // CHECK1-NEXT: ret void 292 // 293 // 294 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker 295 // CHECK2-SAME: () #[[ATTR0:[0-9]+]] { 296 // CHECK2-NEXT: entry: 297 // CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 298 // CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 299 // CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 300 // CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 301 // CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] 302 // CHECK2: .await.work: 303 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 304 // CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 305 // CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 306 // CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 307 // CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 308 // CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 309 // CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 310 // CHECK2: .select.workers: 311 // CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 312 // CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 313 // CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 314 // CHECK2: .execute.parallel: 315 // CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 316 // CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 317 // CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 318 // CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 319 // CHECK2: .terminate.parallel: 320 // CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() 321 // CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] 322 // CHECK2: .barrier.parallel: 323 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 324 // CHECK2-NEXT: br label [[DOTAWAIT_WORK]] 325 // CHECK2: .exit: 326 // CHECK2-NEXT: ret void 327 // 328 // 329 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 330 // CHECK2-SAME: (i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { 331 // CHECK2-NEXT: entry: 332 // CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 333 // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 334 // CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 335 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 336 // CHECK2-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 337 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* 338 // CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 339 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 340 // CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 341 // CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 342 // CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 343 // CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 344 // CHECK2: .worker: 345 // CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR3:[0-9]+]] 346 // CHECK2-NEXT: br label [[DOTEXIT:%.*]] 347 // CHECK2: .mastercheck: 348 // CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 349 // CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 350 // CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 351 // CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 352 // CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 353 // CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 354 // CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 355 // CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 356 // CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 357 // CHECK2: .master: 358 // CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 359 // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 360 // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 361 // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 362 // CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() 363 // CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) 364 // CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* 365 // CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 366 // CHECK2-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 367 // CHECK2-NEXT: store i32 [[TMP7]], i32* [[ARGC7]], align 4 368 // CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 369 // CHECK2-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 370 // CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] 371 // CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) 372 // CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 373 // CHECK2: .termination.notifier: 374 // CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) 375 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 376 // CHECK2-NEXT: br label [[DOTEXIT]] 377 // CHECK2: .exit: 378 // CHECK2-NEXT: ret void 379 // 380 // 381 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ 382 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { 383 // CHECK2-NEXT: entry: 384 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 385 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 386 // CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 387 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 388 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 389 // CHECK2-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 390 // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 391 // CHECK2-NEXT: store i32 0, i32* [[TMP0]], align 4 392 // CHECK2-NEXT: ret void 393 // 394 // 395 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker 396 // CHECK2-SAME: () #[[ATTR0]] { 397 // CHECK2-NEXT: entry: 398 // CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 399 // CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 400 // CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 401 // CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 402 // CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] 403 // CHECK2: .await.work: 404 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 405 // CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 406 // CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 407 // CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 408 // CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 409 // CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 410 // CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 411 // CHECK2: .select.workers: 412 // CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 413 // CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 414 // CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 415 // CHECK2: .execute.parallel: 416 // CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 417 // CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 418 // CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 419 // CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 420 // CHECK2: .terminate.parallel: 421 // CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() 422 // CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] 423 // CHECK2: .barrier.parallel: 424 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 425 // CHECK2-NEXT: br label [[DOTAWAIT_WORK]] 426 // CHECK2: .exit: 427 // CHECK2-NEXT: ret void 428 // 429 // 430 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17 431 // CHECK2-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] { 432 // CHECK2-NEXT: entry: 433 // CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 8 434 // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 435 // CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 436 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 437 // CHECK2-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8 438 // CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 439 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 440 // CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 441 // CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 442 // CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 443 // CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 444 // CHECK2: .worker: 445 // CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker() #[[ATTR3]] 446 // CHECK2-NEXT: br label [[DOTEXIT:%.*]] 447 // CHECK2: .mastercheck: 448 // CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 449 // CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 450 // CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 451 // CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 452 // CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 453 // CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 454 // CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 455 // CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 456 // CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 457 // CHECK2: .master: 458 // CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 459 // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 460 // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 461 // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 462 // CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() 463 // CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) 464 // CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* 465 // CHECK2-NEXT: [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8 466 // CHECK2-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 467 // CHECK2-NEXT: store i8** [[TMP7]], i8*** [[ARGC7]], align 8 468 // CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 469 // CHECK2-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 470 // CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]] 471 // CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) 472 // CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 473 // CHECK2: .termination.notifier: 474 // CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) 475 // CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 476 // CHECK2-NEXT: br label [[DOTEXIT]] 477 // CHECK2: .exit: 478 // CHECK2-NEXT: ret void 479 // 480 // 481 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 482 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] { 483 // CHECK2-NEXT: entry: 484 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 485 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 486 // CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 8 487 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 488 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 489 // CHECK2-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8 490 // CHECK2-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8 491 // CHECK2-NEXT: store i8** null, i8*** [[TMP0]], align 8 492 // CHECK2-NEXT: ret void 493 // 494 // 495 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker 496 // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { 497 // CHECK3-NEXT: entry: 498 // CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 499 // CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 500 // CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 501 // CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 502 // CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] 503 // CHECK3: .await.work: 504 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 505 // CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 506 // CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 507 // CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 508 // CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 509 // CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 510 // CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 511 // CHECK3: .select.workers: 512 // CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 513 // CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 514 // CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 515 // CHECK3: .execute.parallel: 516 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 517 // CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 518 // CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 519 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 520 // CHECK3: .terminate.parallel: 521 // CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() 522 // CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] 523 // CHECK3: .barrier.parallel: 524 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 525 // CHECK3-NEXT: br label [[DOTAWAIT_WORK]] 526 // CHECK3: .exit: 527 // CHECK3-NEXT: ret void 528 // 529 // 530 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 531 // CHECK3-SAME: (i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { 532 // CHECK3-NEXT: entry: 533 // CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 534 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 535 // CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 536 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 537 // CHECK3-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 538 // CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 539 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 540 // CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 541 // CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 542 // CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 543 // CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 544 // CHECK3: .worker: 545 // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR3:[0-9]+]] 546 // CHECK3-NEXT: br label [[DOTEXIT:%.*]] 547 // CHECK3: .mastercheck: 548 // CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 549 // CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 550 // CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 551 // CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 552 // CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 553 // CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 554 // CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 555 // CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 556 // CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 557 // CHECK3: .master: 558 // CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 559 // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 560 // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 561 // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 562 // CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() 563 // CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 564 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 565 // CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) 566 // CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 567 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 568 // CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* 569 // CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 570 // CHECK3-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 571 // CHECK3-NEXT: store i32 [[TMP10]], i32* [[ARGC7]], align 4 572 // CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 573 // CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 574 // CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] 575 // CHECK3-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 576 // CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) 577 // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 578 // CHECK3: .termination.notifier: 579 // CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) 580 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 581 // CHECK3-NEXT: br label [[DOTEXIT]] 582 // CHECK3: .exit: 583 // CHECK3-NEXT: ret void 584 // 585 // 586 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ 587 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { 588 // CHECK3-NEXT: entry: 589 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 590 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 591 // CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 592 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 593 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 594 // CHECK3-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 595 // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 596 // CHECK3-NEXT: store i32 0, i32* [[TMP0]], align 4 597 // CHECK3-NEXT: ret void 598 // 599 // 600 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker 601 // CHECK3-SAME: () #[[ATTR0]] { 602 // CHECK3-NEXT: entry: 603 // CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 604 // CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 605 // CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 606 // CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 607 // CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] 608 // CHECK3: .await.work: 609 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 610 // CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 611 // CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 612 // CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 613 // CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 614 // CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 615 // CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 616 // CHECK3: .select.workers: 617 // CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 618 // CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 619 // CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 620 // CHECK3: .execute.parallel: 621 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 622 // CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 623 // CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 624 // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 625 // CHECK3: .terminate.parallel: 626 // CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() 627 // CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] 628 // CHECK3: .barrier.parallel: 629 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 630 // CHECK3-NEXT: br label [[DOTAWAIT_WORK]] 631 // CHECK3: .exit: 632 // CHECK3-NEXT: ret void 633 // 634 // 635 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17 636 // CHECK3-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] { 637 // CHECK3-NEXT: entry: 638 // CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 4 639 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 640 // CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 641 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 642 // CHECK3-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4 643 // CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 644 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 645 // CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 646 // CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 647 // CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 648 // CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 649 // CHECK3: .worker: 650 // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker() #[[ATTR3]] 651 // CHECK3-NEXT: br label [[DOTEXIT:%.*]] 652 // CHECK3: .mastercheck: 653 // CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 654 // CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 655 // CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 656 // CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 657 // CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 658 // CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 659 // CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 660 // CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 661 // CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 662 // CHECK3: .master: 663 // CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 664 // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 665 // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 666 // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 667 // CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() 668 // CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 669 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4 670 // CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) 671 // CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 672 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 673 // CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0* 674 // CHECK3-NEXT: [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4 675 // CHECK3-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0 676 // CHECK3-NEXT: store i8** [[TMP10]], i8*** [[ARGC7]], align 4 677 // CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 678 // CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 679 // CHECK3-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]] 680 // CHECK3-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 681 // CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) 682 // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 683 // CHECK3: .termination.notifier: 684 // CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) 685 // CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 686 // CHECK3-NEXT: br label [[DOTEXIT]] 687 // CHECK3: .exit: 688 // CHECK3-NEXT: ret void 689 // 690 // 691 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 692 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { 693 // CHECK3-NEXT: entry: 694 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 695 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 696 // CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 4 697 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 698 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 699 // CHECK3-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4 700 // CHECK3-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4 701 // CHECK3-NEXT: store i8** null, i8*** [[TMP0]], align 4 702 // CHECK3-NEXT: ret void 703 // 704 // 705 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker 706 // CHECK4-SAME: () #[[ATTR0:[0-9]+]] { 707 // CHECK4-NEXT: entry: 708 // CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 709 // CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 710 // CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 711 // CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 712 // CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] 713 // CHECK4: .await.work: 714 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 715 // CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 716 // CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 717 // CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 718 // CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 719 // CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 720 // CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 721 // CHECK4: .select.workers: 722 // CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 723 // CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 724 // CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 725 // CHECK4: .execute.parallel: 726 // CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 727 // CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 728 // CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 729 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 730 // CHECK4: .terminate.parallel: 731 // CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() 732 // CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] 733 // CHECK4: .barrier.parallel: 734 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 735 // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] 736 // CHECK4: .exit: 737 // CHECK4-NEXT: ret void 738 // 739 // 740 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 741 // CHECK4-SAME: (i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { 742 // CHECK4-NEXT: entry: 743 // CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 744 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 745 // CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 746 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 747 // CHECK4-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 748 // CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 749 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 750 // CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 751 // CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 752 // CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 753 // CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 754 // CHECK4: .worker: 755 // CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR3:[0-9]+]] 756 // CHECK4-NEXT: br label [[DOTEXIT:%.*]] 757 // CHECK4: .mastercheck: 758 // CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 759 // CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 760 // CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 761 // CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 762 // CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 763 // CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 764 // CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 765 // CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 766 // CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 767 // CHECK4: .master: 768 // CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 769 // CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 770 // CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 771 // CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 772 // CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() 773 // CHECK4-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) 774 // CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* 775 // CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 776 // CHECK4-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 777 // CHECK4-NEXT: store i32 [[TMP7]], i32* [[ARGC7]], align 4 778 // CHECK4-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 779 // CHECK4-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 780 // CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] 781 // CHECK4-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) 782 // CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 783 // CHECK4: .termination.notifier: 784 // CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) 785 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 786 // CHECK4-NEXT: br label [[DOTEXIT]] 787 // CHECK4: .exit: 788 // CHECK4-NEXT: ret void 789 // 790 // 791 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ 792 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { 793 // CHECK4-NEXT: entry: 794 // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 795 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 796 // CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 797 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 798 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 799 // CHECK4-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 800 // CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 801 // CHECK4-NEXT: store i32 0, i32* [[TMP0]], align 4 802 // CHECK4-NEXT: ret void 803 // 804 // 805 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker 806 // CHECK4-SAME: () #[[ATTR0]] { 807 // CHECK4-NEXT: entry: 808 // CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 809 // CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 810 // CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 811 // CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 812 // CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] 813 // CHECK4: .await.work: 814 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 815 // CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 816 // CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 817 // CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 818 // CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 819 // CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 820 // CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 821 // CHECK4: .select.workers: 822 // CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 823 // CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 824 // CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 825 // CHECK4: .execute.parallel: 826 // CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 827 // CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 828 // CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 829 // CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 830 // CHECK4: .terminate.parallel: 831 // CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() 832 // CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] 833 // CHECK4: .barrier.parallel: 834 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 835 // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] 836 // CHECK4: .exit: 837 // CHECK4-NEXT: ret void 838 // 839 // 840 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17 841 // CHECK4-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] { 842 // CHECK4-NEXT: entry: 843 // CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 4 844 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 845 // CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 846 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 847 // CHECK4-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4 848 // CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 849 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 850 // CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 851 // CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 852 // CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 853 // CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 854 // CHECK4: .worker: 855 // CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker() #[[ATTR3]] 856 // CHECK4-NEXT: br label [[DOTEXIT:%.*]] 857 // CHECK4: .mastercheck: 858 // CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 859 // CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 860 // CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 861 // CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 862 // CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 863 // CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 864 // CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 865 // CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 866 // CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 867 // CHECK4: .master: 868 // CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 869 // CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 870 // CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 871 // CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 872 // CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() 873 // CHECK4-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) 874 // CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* 875 // CHECK4-NEXT: [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4 876 // CHECK4-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 877 // CHECK4-NEXT: store i8** [[TMP7]], i8*** [[ARGC7]], align 4 878 // CHECK4-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 879 // CHECK4-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 880 // CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]] 881 // CHECK4-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) 882 // CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 883 // CHECK4: .termination.notifier: 884 // CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) 885 // CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 886 // CHECK4-NEXT: br label [[DOTEXIT]] 887 // CHECK4: .exit: 888 // CHECK4-NEXT: ret void 889 // 890 // 891 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 892 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { 893 // CHECK4-NEXT: entry: 894 // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 895 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 896 // CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 4 897 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 898 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 899 // CHECK4-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4 900 // CHECK4-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4 901 // CHECK4-NEXT: store i8** null, i8*** [[TMP0]], align 4 902 // CHECK4-NEXT: ret void 903 // 904 // 905 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker 906 // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { 907 // CHECK5-NEXT: entry: 908 // CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 909 // CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 910 // CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 8 911 // CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 912 // CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] 913 // CHECK5: .await.work: 914 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 915 // CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 916 // CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 917 // CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 918 // CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 919 // CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 920 // CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 921 // CHECK5: .select.workers: 922 // CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 923 // CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 924 // CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 925 // CHECK5: .execute.parallel: 926 // CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 927 // CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 928 // CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 929 // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 930 // CHECK5: .terminate.parallel: 931 // CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() 932 // CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] 933 // CHECK5: .barrier.parallel: 934 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 935 // CHECK5-NEXT: br label [[DOTAWAIT_WORK]] 936 // CHECK5: .exit: 937 // CHECK5-NEXT: ret void 938 // 939 // 940 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 941 // CHECK5-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { 942 // CHECK5-NEXT: entry: 943 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 944 // CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 945 // CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 946 // CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 947 // CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 948 // CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 949 // CHECK5-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 950 // CHECK5-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 951 // CHECK5-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 952 // CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* 953 // CHECK5-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32* 954 // CHECK5-NEXT: [[CONV2:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* 955 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 956 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 957 // CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 958 // CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 959 // CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 960 // CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 961 // CHECK5: .worker: 962 // CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]] 963 // CHECK5-NEXT: br label [[DOTEXIT:%.*]] 964 // CHECK5: .mastercheck: 965 // CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 966 // CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 967 // CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 968 // CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 969 // CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 970 // CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 971 // CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 972 // CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] 973 // CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 974 // CHECK5: .master: 975 // CHECK5-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 976 // CHECK5-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 977 // CHECK5-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] 978 // CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) 979 // CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() 980 // CHECK5-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 981 // CHECK5-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 982 // CHECK5-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) 983 // CHECK5-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 984 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 985 // CHECK5-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* 986 // CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV2]], align 8 987 // CHECK5-NEXT: [[ARGC9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 988 // CHECK5-NEXT: store i32 [[TMP10]], i32* [[ARGC9]], align 4 989 // CHECK5-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 990 // CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 991 // CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC9]]) #[[ATTR3]] 992 // CHECK5-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 993 // CHECK5-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) 994 // CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 995 // CHECK5: .termination.notifier: 996 // CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) 997 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 998 // CHECK5-NEXT: br label [[DOTEXIT]] 999 // CHECK5: .exit: 1000 // CHECK5-NEXT: ret void 1001 // 1002 // 1003 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ 1004 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { 1005 // CHECK5-NEXT: entry: 1006 // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 1007 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 1008 // CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 1009 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 1010 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 1011 // CHECK5-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 1012 // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 1013 // CHECK5-NEXT: store i32 0, i32* [[TMP0]], align 4 1014 // CHECK5-NEXT: ret void 1015 // 1016 // 1017 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker 1018 // CHECK5-SAME: () #[[ATTR0]] { 1019 // CHECK5-NEXT: entry: 1020 // CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 1021 // CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1022 // CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 8 1023 // CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1024 // CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1025 // CHECK5: .await.work: 1026 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1027 // CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1028 // CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1029 // CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1030 // CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 1031 // CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1032 // CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1033 // CHECK5: .select.workers: 1034 // CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1035 // CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1036 // CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1037 // CHECK5: .execute.parallel: 1038 // CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1039 // CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1040 // CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 1041 // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1042 // CHECK5: .terminate.parallel: 1043 // CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() 1044 // CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] 1045 // CHECK5: .barrier.parallel: 1046 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1047 // CHECK5-NEXT: br label [[DOTAWAIT_WORK]] 1048 // CHECK5: .exit: 1049 // CHECK5-NEXT: ret void 1050 // 1051 // 1052 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57 1053 // CHECK5-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] { 1054 // CHECK5-NEXT: entry: 1055 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 1056 // CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 1057 // CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 8 1058 // CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1059 // CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 1060 // CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1061 // CHECK5-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 1062 // CHECK5-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 1063 // CHECK5-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8 1064 // CHECK5-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* 1065 // CHECK5-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32* 1066 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1067 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1068 // CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1069 // CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1070 // CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1071 // CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1072 // CHECK5: .worker: 1073 // CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]] 1074 // CHECK5-NEXT: br label [[DOTEXIT:%.*]] 1075 // CHECK5: .mastercheck: 1076 // CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1077 // CHECK5-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1078 // CHECK5-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1079 // CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 1080 // CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 1081 // CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 1082 // CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 1083 // CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] 1084 // CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1085 // CHECK5: .master: 1086 // CHECK5-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1087 // CHECK5-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1088 // CHECK5-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] 1089 // CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) 1090 // CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() 1091 // CHECK5-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 1092 // CHECK5-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size2", align 8 1093 // CHECK5-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) 1094 // CHECK5-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 1095 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 1096 // CHECK5-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0* 1097 // CHECK5-NEXT: [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8 1098 // CHECK5-NEXT: [[ARGC8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0 1099 // CHECK5-NEXT: store i8** [[TMP10]], i8*** [[ARGC8]], align 8 1100 // CHECK5-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1101 // CHECK5-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 1102 // CHECK5-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC8]]) #[[ATTR3]] 1103 // CHECK5-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 1104 // CHECK5-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) 1105 // CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1106 // CHECK5: .termination.notifier: 1107 // CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1108 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1109 // CHECK5-NEXT: br label [[DOTEXIT]] 1110 // CHECK5: .exit: 1111 // CHECK5-NEXT: ret void 1112 // 1113 // 1114 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3 1115 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] { 1116 // CHECK5-NEXT: entry: 1117 // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 1118 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 1119 // CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 8 1120 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 1121 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 1122 // CHECK5-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8 1123 // CHECK5-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8 1124 // CHECK5-NEXT: store i8** null, i8*** [[TMP0]], align 8 1125 // CHECK5-NEXT: ret void 1126 // 1127 // 1128 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker 1129 // CHECK6-SAME: () #[[ATTR0:[0-9]+]] { 1130 // CHECK6-NEXT: entry: 1131 // CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 1132 // CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1133 // CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 8 1134 // CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1135 // CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1136 // CHECK6: .await.work: 1137 // CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1138 // CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1139 // CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1140 // CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1141 // CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 1142 // CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1143 // CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1144 // CHECK6: .select.workers: 1145 // CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1146 // CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1147 // CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1148 // CHECK6: .execute.parallel: 1149 // CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 1150 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1151 // CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 1152 // CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1153 // CHECK6: .terminate.parallel: 1154 // CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() 1155 // CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] 1156 // CHECK6: .barrier.parallel: 1157 // CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1158 // CHECK6-NEXT: br label [[DOTAWAIT_WORK]] 1159 // CHECK6: .exit: 1160 // CHECK6-NEXT: ret void 1161 // 1162 // 1163 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 1164 // CHECK6-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { 1165 // CHECK6-NEXT: entry: 1166 // CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 1167 // CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 1168 // CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 1169 // CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1170 // CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 1171 // CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1172 // CHECK6-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 1173 // CHECK6-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 1174 // CHECK6-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 1175 // CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* 1176 // CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32* 1177 // CHECK6-NEXT: [[CONV2:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* 1178 // CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1179 // CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1180 // CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1181 // CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1182 // CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1183 // CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1184 // CHECK6: .worker: 1185 // CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]] 1186 // CHECK6-NEXT: br label [[DOTEXIT:%.*]] 1187 // CHECK6: .mastercheck: 1188 // CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1189 // CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1190 // CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1191 // CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 1192 // CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 1193 // CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 1194 // CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 1195 // CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] 1196 // CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1197 // CHECK6: .master: 1198 // CHECK6-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1199 // CHECK6-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1200 // CHECK6-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] 1201 // CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) 1202 // CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() 1203 // CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) 1204 // CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* 1205 // CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV2]], align 8 1206 // CHECK6-NEXT: [[ARGC9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 1207 // CHECK6-NEXT: store i32 [[TMP7]], i32* [[ARGC9]], align 4 1208 // CHECK6-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1209 // CHECK6-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 1210 // CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC9]]) #[[ATTR3]] 1211 // CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) 1212 // CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1213 // CHECK6: .termination.notifier: 1214 // CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1215 // CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1216 // CHECK6-NEXT: br label [[DOTEXIT]] 1217 // CHECK6: .exit: 1218 // CHECK6-NEXT: ret void 1219 // 1220 // 1221 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ 1222 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { 1223 // CHECK6-NEXT: entry: 1224 // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 1225 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 1226 // CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 1227 // CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 1228 // CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 1229 // CHECK6-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 1230 // CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 1231 // CHECK6-NEXT: store i32 0, i32* [[TMP0]], align 4 1232 // CHECK6-NEXT: ret void 1233 // 1234 // 1235 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker 1236 // CHECK6-SAME: () #[[ATTR0]] { 1237 // CHECK6-NEXT: entry: 1238 // CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 1239 // CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1240 // CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 8 1241 // CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1242 // CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1243 // CHECK6: .await.work: 1244 // CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1245 // CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1246 // CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1247 // CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1248 // CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 1249 // CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1250 // CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1251 // CHECK6: .select.workers: 1252 // CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1253 // CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1254 // CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1255 // CHECK6: .execute.parallel: 1256 // CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1257 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1258 // CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 1259 // CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1260 // CHECK6: .terminate.parallel: 1261 // CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() 1262 // CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] 1263 // CHECK6: .barrier.parallel: 1264 // CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1265 // CHECK6-NEXT: br label [[DOTAWAIT_WORK]] 1266 // CHECK6: .exit: 1267 // CHECK6-NEXT: ret void 1268 // 1269 // 1270 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57 1271 // CHECK6-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] { 1272 // CHECK6-NEXT: entry: 1273 // CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 1274 // CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 1275 // CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 8 1276 // CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1277 // CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 1278 // CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1279 // CHECK6-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 1280 // CHECK6-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 1281 // CHECK6-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8 1282 // CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* 1283 // CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32* 1284 // CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1285 // CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1286 // CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1287 // CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1288 // CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1289 // CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1290 // CHECK6: .worker: 1291 // CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]] 1292 // CHECK6-NEXT: br label [[DOTEXIT:%.*]] 1293 // CHECK6: .mastercheck: 1294 // CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1295 // CHECK6-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1296 // CHECK6-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1297 // CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 1298 // CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 1299 // CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 1300 // CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 1301 // CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] 1302 // CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1303 // CHECK6: .master: 1304 // CHECK6-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1305 // CHECK6-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1306 // CHECK6-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] 1307 // CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) 1308 // CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() 1309 // CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) 1310 // CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* 1311 // CHECK6-NEXT: [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8 1312 // CHECK6-NEXT: [[ARGC8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 1313 // CHECK6-NEXT: store i8** [[TMP7]], i8*** [[ARGC8]], align 8 1314 // CHECK6-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1315 // CHECK6-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 1316 // CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC8]]) #[[ATTR3]] 1317 // CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) 1318 // CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1319 // CHECK6: .termination.notifier: 1320 // CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1321 // CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1322 // CHECK6-NEXT: br label [[DOTEXIT]] 1323 // CHECK6: .exit: 1324 // CHECK6-NEXT: ret void 1325 // 1326 // 1327 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 1328 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] { 1329 // CHECK6-NEXT: entry: 1330 // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 1331 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 1332 // CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 8 1333 // CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 1334 // CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 1335 // CHECK6-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8 1336 // CHECK6-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8 1337 // CHECK6-NEXT: store i8** null, i8*** [[TMP0]], align 8 1338 // CHECK6-NEXT: ret void 1339 // 1340 // 1341 // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker 1342 // CHECK7-SAME: () #[[ATTR0:[0-9]+]] { 1343 // CHECK7-NEXT: entry: 1344 // CHECK7-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 1345 // CHECK7-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1346 // CHECK7-NEXT: store i8* null, i8** [[WORK_FN]], align 4 1347 // CHECK7-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1348 // CHECK7-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1349 // CHECK7: .await.work: 1350 // CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1351 // CHECK7-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1352 // CHECK7-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1353 // CHECK7-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1354 // CHECK7-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1355 // CHECK7-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1356 // CHECK7-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1357 // CHECK7: .select.workers: 1358 // CHECK7-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1359 // CHECK7-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1360 // CHECK7-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1361 // CHECK7: .execute.parallel: 1362 // CHECK7-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 1363 // CHECK7-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1364 // CHECK7-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 1365 // CHECK7-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1366 // CHECK7: .terminate.parallel: 1367 // CHECK7-NEXT: call void @__kmpc_kernel_end_parallel() 1368 // CHECK7-NEXT: br label [[DOTBARRIER_PARALLEL]] 1369 // CHECK7: .barrier.parallel: 1370 // CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1371 // CHECK7-NEXT: br label [[DOTAWAIT_WORK]] 1372 // CHECK7: .exit: 1373 // CHECK7-NEXT: ret void 1374 // 1375 // 1376 // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 1377 // CHECK7-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { 1378 // CHECK7-NEXT: entry: 1379 // CHECK7-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 1380 // CHECK7-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 1381 // CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 1382 // CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1383 // CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 1384 // CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1385 // CHECK7-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 1386 // CHECK7-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 1387 // CHECK7-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 1388 // CHECK7-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1389 // CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1390 // CHECK7-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1391 // CHECK7-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1392 // CHECK7-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1393 // CHECK7-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1394 // CHECK7: .worker: 1395 // CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]] 1396 // CHECK7-NEXT: br label [[DOTEXIT:%.*]] 1397 // CHECK7: .mastercheck: 1398 // CHECK7-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1399 // CHECK7-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1400 // CHECK7-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1401 // CHECK7-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 1402 // CHECK7-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 1403 // CHECK7-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 1404 // CHECK7-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 1405 // CHECK7-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 1406 // CHECK7-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1407 // CHECK7: .master: 1408 // CHECK7-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1409 // CHECK7-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1410 // CHECK7-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 1411 // CHECK7-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 1412 // CHECK7-NEXT: call void @__kmpc_data_sharing_init_stack() 1413 // CHECK7-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 1414 // CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 1415 // CHECK7-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) 1416 // CHECK7-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 1417 // CHECK7-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 1418 // CHECK7-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* 1419 // CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 1420 // CHECK7-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 1421 // CHECK7-NEXT: store i32 [[TMP10]], i32* [[ARGC7]], align 4 1422 // CHECK7-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1423 // CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 1424 // CHECK7-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] 1425 // CHECK7-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 1426 // CHECK7-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) 1427 // CHECK7-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1428 // CHECK7: .termination.notifier: 1429 // CHECK7-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1430 // CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1431 // CHECK7-NEXT: br label [[DOTEXIT]] 1432 // CHECK7: .exit: 1433 // CHECK7-NEXT: ret void 1434 // 1435 // 1436 // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__ 1437 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { 1438 // CHECK7-NEXT: entry: 1439 // CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 1440 // CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 1441 // CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 1442 // CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 1443 // CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 1444 // CHECK7-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 1445 // CHECK7-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 1446 // CHECK7-NEXT: store i32 0, i32* [[TMP0]], align 4 1447 // CHECK7-NEXT: ret void 1448 // 1449 // 1450 // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker 1451 // CHECK7-SAME: () #[[ATTR0]] { 1452 // CHECK7-NEXT: entry: 1453 // CHECK7-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 1454 // CHECK7-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1455 // CHECK7-NEXT: store i8* null, i8** [[WORK_FN]], align 4 1456 // CHECK7-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1457 // CHECK7-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1458 // CHECK7: .await.work: 1459 // CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1460 // CHECK7-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1461 // CHECK7-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1462 // CHECK7-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1463 // CHECK7-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1464 // CHECK7-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1465 // CHECK7-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1466 // CHECK7: .select.workers: 1467 // CHECK7-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1468 // CHECK7-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1469 // CHECK7-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1470 // CHECK7: .execute.parallel: 1471 // CHECK7-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1472 // CHECK7-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1473 // CHECK7-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 1474 // CHECK7-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1475 // CHECK7: .terminate.parallel: 1476 // CHECK7-NEXT: call void @__kmpc_kernel_end_parallel() 1477 // CHECK7-NEXT: br label [[DOTBARRIER_PARALLEL]] 1478 // CHECK7: .barrier.parallel: 1479 // CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1480 // CHECK7-NEXT: br label [[DOTAWAIT_WORK]] 1481 // CHECK7: .exit: 1482 // CHECK7-NEXT: ret void 1483 // 1484 // 1485 // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57 1486 // CHECK7-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] { 1487 // CHECK7-NEXT: entry: 1488 // CHECK7-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 1489 // CHECK7-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 1490 // CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 4 1491 // CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1492 // CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 1493 // CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1494 // CHECK7-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 1495 // CHECK7-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 1496 // CHECK7-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4 1497 // CHECK7-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1498 // CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1499 // CHECK7-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1500 // CHECK7-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1501 // CHECK7-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1502 // CHECK7-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1503 // CHECK7: .worker: 1504 // CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]] 1505 // CHECK7-NEXT: br label [[DOTEXIT:%.*]] 1506 // CHECK7: .mastercheck: 1507 // CHECK7-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1508 // CHECK7-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1509 // CHECK7-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1510 // CHECK7-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 1511 // CHECK7-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 1512 // CHECK7-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 1513 // CHECK7-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 1514 // CHECK7-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 1515 // CHECK7-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1516 // CHECK7: .master: 1517 // CHECK7-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1518 // CHECK7-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1519 // CHECK7-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 1520 // CHECK7-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 1521 // CHECK7-NEXT: call void @__kmpc_data_sharing_init_stack() 1522 // CHECK7-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 1523 // CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4 1524 // CHECK7-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) 1525 // CHECK7-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 1526 // CHECK7-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 1527 // CHECK7-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0* 1528 // CHECK7-NEXT: [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4 1529 // CHECK7-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0 1530 // CHECK7-NEXT: store i8** [[TMP10]], i8*** [[ARGC7]], align 4 1531 // CHECK7-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1532 // CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 1533 // CHECK7-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]] 1534 // CHECK7-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 1535 // CHECK7-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) 1536 // CHECK7-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1537 // CHECK7: .termination.notifier: 1538 // CHECK7-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1539 // CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1540 // CHECK7-NEXT: br label [[DOTEXIT]] 1541 // CHECK7: .exit: 1542 // CHECK7-NEXT: ret void 1543 // 1544 // 1545 // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__3 1546 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { 1547 // CHECK7-NEXT: entry: 1548 // CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 1549 // CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 1550 // CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 4 1551 // CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 1552 // CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 1553 // CHECK7-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4 1554 // CHECK7-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4 1555 // CHECK7-NEXT: store i8** null, i8*** [[TMP0]], align 4 1556 // CHECK7-NEXT: ret void 1557 // 1558 // 1559 // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker 1560 // CHECK8-SAME: () #[[ATTR0:[0-9]+]] { 1561 // CHECK8-NEXT: entry: 1562 // CHECK8-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 1563 // CHECK8-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1564 // CHECK8-NEXT: store i8* null, i8** [[WORK_FN]], align 4 1565 // CHECK8-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1566 // CHECK8-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1567 // CHECK8: .await.work: 1568 // CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1569 // CHECK8-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1570 // CHECK8-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1571 // CHECK8-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1572 // CHECK8-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1573 // CHECK8-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1574 // CHECK8-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1575 // CHECK8: .select.workers: 1576 // CHECK8-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1577 // CHECK8-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1578 // CHECK8-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1579 // CHECK8: .execute.parallel: 1580 // CHECK8-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 1581 // CHECK8-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1582 // CHECK8-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 1583 // CHECK8-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1584 // CHECK8: .terminate.parallel: 1585 // CHECK8-NEXT: call void @__kmpc_kernel_end_parallel() 1586 // CHECK8-NEXT: br label [[DOTBARRIER_PARALLEL]] 1587 // CHECK8: .barrier.parallel: 1588 // CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1589 // CHECK8-NEXT: br label [[DOTAWAIT_WORK]] 1590 // CHECK8: .exit: 1591 // CHECK8-NEXT: ret void 1592 // 1593 // 1594 // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 1595 // CHECK8-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { 1596 // CHECK8-NEXT: entry: 1597 // CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 1598 // CHECK8-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 1599 // CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 1600 // CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1601 // CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 1602 // CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1603 // CHECK8-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 1604 // CHECK8-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 1605 // CHECK8-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 1606 // CHECK8-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1607 // CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1608 // CHECK8-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1609 // CHECK8-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1610 // CHECK8-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1611 // CHECK8-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1612 // CHECK8: .worker: 1613 // CHECK8-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]] 1614 // CHECK8-NEXT: br label [[DOTEXIT:%.*]] 1615 // CHECK8: .mastercheck: 1616 // CHECK8-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1617 // CHECK8-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1618 // CHECK8-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1619 // CHECK8-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 1620 // CHECK8-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 1621 // CHECK8-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 1622 // CHECK8-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 1623 // CHECK8-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 1624 // CHECK8-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1625 // CHECK8: .master: 1626 // CHECK8-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1627 // CHECK8-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1628 // CHECK8-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 1629 // CHECK8-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 1630 // CHECK8-NEXT: call void @__kmpc_data_sharing_init_stack() 1631 // CHECK8-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) 1632 // CHECK8-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* 1633 // CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 1634 // CHECK8-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 1635 // CHECK8-NEXT: store i32 [[TMP7]], i32* [[ARGC7]], align 4 1636 // CHECK8-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1637 // CHECK8-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 1638 // CHECK8-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] 1639 // CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) 1640 // CHECK8-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1641 // CHECK8: .termination.notifier: 1642 // CHECK8-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1643 // CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1644 // CHECK8-NEXT: br label [[DOTEXIT]] 1645 // CHECK8: .exit: 1646 // CHECK8-NEXT: ret void 1647 // 1648 // 1649 // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ 1650 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { 1651 // CHECK8-NEXT: entry: 1652 // CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 1653 // CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 1654 // CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 1655 // CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 1656 // CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 1657 // CHECK8-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 1658 // CHECK8-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 1659 // CHECK8-NEXT: store i32 0, i32* [[TMP0]], align 4 1660 // CHECK8-NEXT: ret void 1661 // 1662 // 1663 // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker 1664 // CHECK8-SAME: () #[[ATTR0]] { 1665 // CHECK8-NEXT: entry: 1666 // CHECK8-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 1667 // CHECK8-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1668 // CHECK8-NEXT: store i8* null, i8** [[WORK_FN]], align 4 1669 // CHECK8-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1670 // CHECK8-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1671 // CHECK8: .await.work: 1672 // CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1673 // CHECK8-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1674 // CHECK8-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1675 // CHECK8-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1676 // CHECK8-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1677 // CHECK8-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1678 // CHECK8-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1679 // CHECK8: .select.workers: 1680 // CHECK8-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1681 // CHECK8-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1682 // CHECK8-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1683 // CHECK8: .execute.parallel: 1684 // CHECK8-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1685 // CHECK8-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1686 // CHECK8-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 1687 // CHECK8-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1688 // CHECK8: .terminate.parallel: 1689 // CHECK8-NEXT: call void @__kmpc_kernel_end_parallel() 1690 // CHECK8-NEXT: br label [[DOTBARRIER_PARALLEL]] 1691 // CHECK8: .barrier.parallel: 1692 // CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1693 // CHECK8-NEXT: br label [[DOTAWAIT_WORK]] 1694 // CHECK8: .exit: 1695 // CHECK8-NEXT: ret void 1696 // 1697 // 1698 // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57 1699 // CHECK8-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] { 1700 // CHECK8-NEXT: entry: 1701 // CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 1702 // CHECK8-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 1703 // CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 4 1704 // CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1705 // CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 1706 // CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1707 // CHECK8-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 1708 // CHECK8-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 1709 // CHECK8-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4 1710 // CHECK8-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1711 // CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1712 // CHECK8-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1713 // CHECK8-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1714 // CHECK8-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1715 // CHECK8-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1716 // CHECK8: .worker: 1717 // CHECK8-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]] 1718 // CHECK8-NEXT: br label [[DOTEXIT:%.*]] 1719 // CHECK8: .mastercheck: 1720 // CHECK8-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1721 // CHECK8-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1722 // CHECK8-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1723 // CHECK8-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 1724 // CHECK8-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 1725 // CHECK8-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 1726 // CHECK8-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 1727 // CHECK8-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] 1728 // CHECK8-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1729 // CHECK8: .master: 1730 // CHECK8-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1731 // CHECK8-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1732 // CHECK8-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] 1733 // CHECK8-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) 1734 // CHECK8-NEXT: call void @__kmpc_data_sharing_init_stack() 1735 // CHECK8-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) 1736 // CHECK8-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* 1737 // CHECK8-NEXT: [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4 1738 // CHECK8-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 1739 // CHECK8-NEXT: store i8** [[TMP7]], i8*** [[ARGC7]], align 4 1740 // CHECK8-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1741 // CHECK8-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 1742 // CHECK8-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]] 1743 // CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) 1744 // CHECK8-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1745 // CHECK8: .termination.notifier: 1746 // CHECK8-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1747 // CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1748 // CHECK8-NEXT: br label [[DOTEXIT]] 1749 // CHECK8: .exit: 1750 // CHECK8-NEXT: ret void 1751 // 1752 // 1753 // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1 1754 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { 1755 // CHECK8-NEXT: entry: 1756 // CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 1757 // CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 1758 // CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 4 1759 // CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 1760 // CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 1761 // CHECK8-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4 1762 // CHECK8-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4 1763 // CHECK8-NEXT: store i8** null, i8*** [[TMP0]], align 4 1764 // CHECK8-NEXT: ret void 1765 // 1766