// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3
// expected-no-diagnostics
#ifndef HEADER
#define HEADER

template<typename tx>
tx ftemplate(int n) {
  int a;
  short b;
  tx c;
  float d;
  double e;

#pragma omp target
#pragma omp teams reduction(+: e)
  {
    e += 5;
  }

#pragma omp target
#pragma omp teams reduction(^: c) reduction(*: d)
  {
    c ^= 2;
    d *= 33;
  }

#pragma omp target
#pragma omp teams reduction(|: a) reduction(max: b)
#pragma omp parallel reduction(|: a) reduction(max: b)
  {
    a |= 1;
    b = 99 > b ? 99 : b;
  }

  return a+b+c+d+e;
}

int bar(int n){
  int a = 0;

  a += ftemplate<char>(n);

  return a;
}

#endif
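// The assertions below were generated for a 32-bit nvptx device compilation
// in generic (non-SPMD) mode. For each target region they pin down the
// master/worker kernel skeleton, the globalization of the reduction
// variables into team-static memory, and the call to
// __kmpc_nvptx_teams_reduce_nowait_v2 together with its six helper
// functions: warp shuffle-and-reduce, inter-warp copy through shared memory,
// and the list<->global buffer copy/reduce pairs operating on a 2048-record
// buffer. The last region (teams + parallel) is emitted in SPMD mode instead.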
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
// CHECK4-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK4: .await.work:
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK4: .select.workers:
// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK4: .execute.parallel:
// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK4: .terminate.parallel:
// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK4: .barrier.parallel:
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK4-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK4: .exit:
// CHECK4-NEXT: ret void
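// Kernel for the first target region (reduction(+: e)). All threads except
// the last warp run the worker loop above; lane 0 of the last warp is
// selected as master, initializes the runtime and the data-sharing stack,
// copies 'e' in, and invokes the outlined teams region.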
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
// CHECK4-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
// CHECK4-NEXT: [[E7:%.*]] = alloca double, align 8
// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK4-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
// CHECK4-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK4-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK4-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK4: .worker:
// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
// CHECK4: .mastercheck:
// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK4-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK4-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
// CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK4-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK4: .master:
// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK4-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
// CHECK4-NEXT: store double [[TMP7]], double* [[E7]], align 8
// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK4: .termination.notifier:
// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK4-NEXT: br label [[DOTEXIT]]
// CHECK4: .exit:
// CHECK4-NEXT: ret void
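// Outlined teams region for 'e'. The private copy is placed in team-static
// memory obtained from __kmpc_get_team_static_memory, initialized to the
// additive identity, and updated locally; the cross-team combination goes
// through __kmpc_nvptx_teams_reduce_nowait_v2 with the 2048-slot buffer,
// and the master folds the result back into the mapped variable.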
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK4-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
// CHECK4-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
// CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
// CHECK4-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
// CHECK4-NEXT: store double 0.000000e+00, double* [[E1]], align 8
// CHECK4-NEXT: [[TMP6:%.*]] = load double, double* [[E1]], align 8
// CHECK4-NEXT: [[ADD:%.*]] = fadd double [[TMP6]], 5.000000e+00
// CHECK4-NEXT: store double [[ADD]], double* [[E1]], align 8
// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast double* [[E1]] to i8*
// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK4-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
// CHECK4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
// CHECK4-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK4: .omp.reduction.then:
// CHECK4-NEXT: [[TMP15:%.*]] = load double, double* [[TMP0]], align 8
// CHECK4-NEXT: [[TMP16:%.*]] = load double, double* [[E1]], align 8
// CHECK4-NEXT: [[ADD2:%.*]] = fadd double [[TMP15]], [[TMP16]]
// CHECK4-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]])
// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK4: .omp.reduction.done:
// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]])
// CHECK4-NEXT: ret void
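// Warp-level shuffle-and-reduce for 'e'. The remote lane's element is
// fetched as one 64-bit chunk via __kmpc_shuffle_int64; whether the pair is
// reduced or merely copied depends on the algorithm-version/lane-offset
// predicate chain computed below.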
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
// CHECK4-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
// CHECK4-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
// CHECK4-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
// CHECK4-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8
// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
// CHECK4-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK4-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4
// CHECK4-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK4-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK4-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK4-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
// CHECK4-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK4-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
// CHECK4-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
// CHECK4-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
// CHECK4-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK4-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
// CHECK4-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
// CHECK4-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
// CHECK4-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK4: then:
// CHECK4-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
// CHECK4-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
// CHECK4-NEXT: br label [[IFCONT:%.*]]
// CHECK4: else:
// CHECK4-NEXT: br label [[IFCONT]]
// CHECK4: ifcont:
// CHECK4-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK4-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK4-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
// CHECK4-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
// CHECK4: then4:
// CHECK4-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
// CHECK4-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK4-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
// CHECK4-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
// CHECK4-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
// CHECK4-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
// CHECK4-NEXT: store double [[TMP46]], double* [[TMP45]], align 8
// CHECK4-NEXT: br label [[IFCONT6:%.*]]
// CHECK4: else5:
// CHECK4-NEXT: br label [[IFCONT6]]
// CHECK4: ifcont6:
// CHECK4-NEXT: ret void
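// Inter-warp copy for 'e'. Lane 0 of each warp publishes the double in two
// 32-bit chunks through the shared-memory transfer array (one
// barrier-delimited round per chunk), and the threads below the active-warp
// count read the chunks back into their reduce list.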
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
// CHECK4-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
// CHECK4-NEXT: br label [[PRECOND:%.*]]
// CHECK4: precond:
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
// CHECK4-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
// CHECK4: body:
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK4: then:
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4
// CHECK4-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
// CHECK4-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
// CHECK4-NEXT: br label [[IFCONT:%.*]]
// CHECK4: else:
// CHECK4-NEXT: br label [[IFCONT]]
// CHECK4: ifcont:
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
// CHECK4: then4:
// CHECK4-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
// CHECK4-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
// CHECK4-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
// CHECK4-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4
// CHECK4-NEXT: br label [[IFCONT6:%.*]]
// CHECK4: else5:
// CHECK4-NEXT: br label [[IFCONT6]]
// CHECK4: ifcont6:
// CHECK4-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
// CHECK4-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
// CHECK4-NEXT: br label [[PRECOND]]
// CHECK4: exit:
// CHECK4-NEXT: ret void
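// Buffer helpers for 'e'. list_to_global_copy stores the team's value into
// its slot of the [2048 x double] buffer column; list_to_global_reduce
// builds a reduce list pointing at that slot and dispatches to the common
// "_omp$reduction$reduction_func".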
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
// CHECK4-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
// CHECK4-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
// CHECK4-NEXT: ret void
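// The global_to_list pair runs in the opposite direction: the copy reads a
// buffer slot back into the reduce list, and the reduce variant accumulates
// the slot into the list (note the swapped operand order in the call).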
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
// CHECK4-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
// CHECK4-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
// CHECK4-NEXT: ret void
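// Second target region (reduction(^: c) reduction(*: d)). Same generic-mode
// worker/master skeleton; 'c' and 'd' are passed as i32 kernel arguments and
// recovered through bitcasts of their alloca slots.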
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
// CHECK4-SAME: () #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK4: .await.work:
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK4: .select.workers:
// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK4: .execute.parallel:
// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK4: .terminate.parallel:
// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK4: .barrier.parallel:
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK4-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK4: .exit:
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
// CHECK4-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK4-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
// CHECK4-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK4: .worker:
// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
// CHECK4: .mastercheck:
// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK4: .master:
// CHECK4-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK4-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4
// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
// CHECK4-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
// CHECK4-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.1*
// CHECK4-NEXT: [[TMP10:%.*]] = load i8, i8* [[CONV]], align 4
// CHECK4-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 1
// CHECK4-NEXT: store i8 [[TMP10]], i8* [[C8]], align 4
// CHECK4-NEXT: [[TMP11:%.*]] = load float, float* [[CONV1]], align 4
// CHECK4-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 0
// CHECK4-NEXT: store float [[TMP11]], float* [[D9]], align 4
// CHECK4-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-NEXT: store i32 [[TMP12]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK4-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
// CHECK4-NEXT: [[TMP13:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP13]])
// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK4: .termination.notifier:
// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK4-NEXT: br label [[DOTEXIT]]
// CHECK4: .exit:
// CHECK4-NEXT: ret void
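// Outlined teams region for 'c' and 'd'. Both privates are globalized in
// team-static memory and initialized to the xor and multiplication
// identities (0 and 1.0) before the two-element reduce list is handed to
// the runtime.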
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3
// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK4-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
// CHECK4-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
// CHECK4-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
// CHECK4-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 8
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.2*
// CHECK4-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: store i8 0, i8* [[C1]], align 4
// CHECK4-NEXT: store float 1.000000e+00, float* [[D2]], align 4
// CHECK4-NEXT: [[TMP5:%.*]] = load i8, i8* [[C1]], align 4
// CHECK4-NEXT: [[CONV:%.*]] = sext i8 [[TMP5]] to i32
// CHECK4-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
// CHECK4-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
// CHECK4-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = load float, float* [[D2]], align 4
// CHECK4-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 3.300000e+01
// CHECK4-NEXT: store float [[MUL]], float* [[D2]], align 4
// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK4-NEXT: store i8* [[C1]], i8** [[TMP9]], align 4
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast float* [[D2]] to i8*
// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK4-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP13]], i32 2048, i8* [[TMP12]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func8, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func9, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func10)
// CHECK4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1
// CHECK4-NEXT: br i1 [[TMP15]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK4: .omp.reduction.then:
// CHECK4-NEXT: [[TMP16:%.*]] = load i8, i8* [[TMP0]], align 1
// CHECK4-NEXT: [[CONV4:%.*]] = sext i8 [[TMP16]] to i32
// CHECK4-NEXT: [[TMP17:%.*]] = load i8, i8* [[C1]], align 4
// CHECK4-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32
// CHECK4-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
// CHECK4-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
// CHECK4-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
// CHECK4-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4
// CHECK4-NEXT: [[TMP19:%.*]] = load float, float* [[D2]], align 4
// CHECK4-NEXT: [[MUL8:%.*]] = fmul float [[TMP18]], [[TMP19]]
// CHECK4-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]])
// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK4: .omp.reduction.done:
// CHECK4-NEXT: ret void
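// Shuffle-and-reduce for the (c, d) pair. The i8 element is sign-extended to
// i32 for __kmpc_shuffle_int32 and truncated back; the float is moved as its
// i32 bit pattern. The lane-selection predicate chain matches the
// single-element helper above.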
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5
// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
// CHECK4-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
// CHECK4-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
// CHECK4-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
// CHECK4-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
// CHECK4-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
// CHECK4-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK4-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4
// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1
// CHECK4-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
// CHECK4-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
// CHECK4-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
// CHECK4-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4
// CHECK4-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1
// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
// CHECK4-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
// CHECK4-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4
// CHECK4-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK4-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK4-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
// CHECK4-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK4-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1
// CHECK4-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
// CHECK4-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
// CHECK4-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK4-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
// CHECK4-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
// CHECK4-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK4: then:
// CHECK4-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
// CHECK4-NEXT: br label [[IFCONT:%.*]]
// CHECK4: else:
// CHECK4-NEXT: br label [[IFCONT]]
// CHECK4: ifcont:
// CHECK4-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK4-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK4-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
// CHECK4-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
// CHECK4: then6:
// CHECK4-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4
// CHECK4-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK4-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
// CHECK4-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
// CHECK4-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1
// CHECK4-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
// CHECK4-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK4-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
// CHECK4-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
// CHECK4-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
// CHECK4-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
// CHECK4-NEXT: store float [[TMP62]], float* [[TMP61]], align 4
// CHECK4-NEXT: br label [[IFCONT8:%.*]]
// CHECK4: else7:
// CHECK4-NEXT: br label [[IFCONT8]]
// CHECK4: ifcont8:
// CHECK4-NEXT: ret void
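// Inter-warp copy for (c, d): one barrier-delimited round per list element,
// the i8 travelling through a byte view of its 32-bit scratchpad slot and
// the float through the full slot.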
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK4: then:
// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
// CHECK4-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
// CHECK4-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
// CHECK4-NEXT: br label [[IFCONT:%.*]]
// CHECK4: else:
// CHECK4-NEXT: br label [[IFCONT]]
// CHECK4: ifcont:
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
// CHECK4: then4:
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
// CHECK4-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1
// CHECK4-NEXT: br label [[IFCONT6:%.*]]
// CHECK4: else5:
// CHECK4-NEXT: br label [[IFCONT6]]
// CHECK4: ifcont6:
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
// CHECK4: then8:
// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
// CHECK4-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
// CHECK4-NEXT: br label [[IFCONT10:%.*]]
// CHECK4: else9:
// CHECK4-NEXT: br label [[IFCONT10]]
// CHECK4: ifcont10:
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
// CHECK4: then12:
// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4
// CHECK4-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
// CHECK4-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
// CHECK4-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4
// CHECK4-NEXT: br label [[IFCONT14:%.*]]
// CHECK4: else13:
// CHECK4-NEXT: br label [[IFCONT14]]
// CHECK4: ifcont14:
// CHECK4-NEXT: ret void
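// Buffer helpers for (c, d). The global buffer row has a [2048 x i8] column
// for 'c' and a [2048 x float] column for 'd', indexed by the same team slot
// in all four copy/reduce functions below.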
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
// CHECK4-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
// CHECK4-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
// CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
// CHECK4-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
// CHECK4-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
// CHECK4-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
// CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
// CHECK4-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
// CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
// CHECK4-NEXT: ret void
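// Third target region (teams + parallel, reduction(|: a) reduction(max: b)).
// This one is emitted in SPMD mode: the kernel calls __kmpc_spmd_kernel_init
// instead of spawning a worker loop and jumps straight into the outlined
// teams region.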
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
// CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK4-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK4-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK4-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
// CHECK4: .omp.deinit:
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
// CHECK4: .exit:
// CHECK4-NEXT: ret void
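// Outlined teams region for 'a' and 'b'. In SPMD mode the privates stay in
// plain allocas, initialized to the identities for | and max (0 and
// SHRT_MIN); the inner parallel region is launched via __kmpc_parallel_51
// before the teams reduction is performed.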
@"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] 770 // CHECK4-NEXT: ret void 771 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9 772 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 773 // CHECK4-NEXT: entry: 774 // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 775 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 776 // CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 777 // CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 778 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 779 // CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 780 // CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 781 // CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* 782 // CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 783 // CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* 784 // CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 785 // CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 786 // CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 787 // CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 788 // CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] 789 // CHECK4-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 790 // CHECK4-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 791 // CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 792 // CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 793 // CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* 794 // CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 795 // CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] 796 // CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 797 // CHECK4-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 798 // CHECK4-NEXT: ret void 799 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10 800 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 801 // CHECK4-NEXT: entry: 802 // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 803 // CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 804 // CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 805 // CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 806 // CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 807 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 808 // CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 809 // CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 810 // CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* 811 // CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 812 // CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 813 // CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 814 // CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], 
i32 0, i32 [[TMP5]] 815 // CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 816 // CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 817 // CHECK4-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 818 // CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] 819 // CHECK4-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* 820 // CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 821 // CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 822 // CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 823 // CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] 824 // CHECK4-NEXT: ret void 825 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 826 // CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { 827 // CHECK4-NEXT: entry: 828 // CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 829 // CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 830 // CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 831 // CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 832 // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 833 // CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 834 // CHECK4-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 835 // CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* 836 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 837 // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 838 // CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() 839 // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] 840 // CHECK4: .execute: 841 // CHECK4-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) 842 // CHECK4-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 843 // CHECK4-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] 844 // CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] 845 // CHECK4: .omp.deinit: 846 // CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 847 // CHECK4-NEXT: br label [[DOTEXIT:%.*]] 848 // CHECK4: .exit: 849 // CHECK4-NEXT: ret void 850 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11 851 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { 852 // CHECK4-NEXT: entry: 853 // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 854 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 855 // CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 856 // CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 857 // CHECK4-NEXT: [[A1:%.*]] = alloca i32, align 4 858 // CHECK4-NEXT: [[B2:%.*]] = alloca i16, align 2 859 // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 860 // CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 861 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 862 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 863 // CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 864 // 
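//
// The CHECK4 lines below cover the l36 target region ('teams' with a nested
// 'parallel', both carrying reduction(|: a) reduction(max: b)). Judging from
// the __kmpc_spmd_kernel_init call, this kernel runs in SPMD mode, so the
// teams reduction in __omp_outlined__11 is layered on top of the parallel
// reduction performed inside __omp_outlined__12.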
// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
// CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK4-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK4-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK4-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
// CHECK4: .omp.deinit:
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
// CHECK4: .exit:
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11
// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
// CHECK4-NEXT: [[A1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[B2:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK4-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
// CHECK4-NEXT: store i32 0, i32* [[A1]], align 4
// CHECK4-NEXT: store i16 -32768, i16* [[B2]], align 2
// CHECK4-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK4-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
// CHECK4-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
// CHECK4-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP8]], i32 2)
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
// CHECK4-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func17, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func18, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func20, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func21, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func22)
// CHECK4-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
// CHECK4-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK4: .omp.reduction.then:
// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
// CHECK4-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
// CHECK4-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
// CHECK4-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
// CHECK4-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
// CHECK4-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK4: cond.true:
// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK4-NEXT: br label [[COND_END:%.*]]
// CHECK4: cond.false:
// CHECK4-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
// CHECK4-NEXT: br label [[COND_END]]
// CHECK4: cond.end:
// CHECK4-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
// CHECK4-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK4: .omp.reduction.done:
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__12
// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
// CHECK4-NEXT: [[A1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[B2:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK4-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
// CHECK4-NEXT: store i32 0, i32* [[A1]], align 4
// CHECK4-NEXT: store i16 -32768, i16* [[B2]], align 2
// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
// CHECK4-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
// CHECK4-NEXT: store i32 [[OR]], i32* [[A1]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK4: cond.true:
// CHECK4-NEXT: br label [[COND_END:%.*]]
// CHECK4: cond.false:
// CHECK4-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
// CHECK4-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
// CHECK4-NEXT: br label [[COND_END]]
// CHECK4: cond.end:
// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
// CHECK4-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
// CHECK4-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK4-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func14, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func15)
// CHECK4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
// CHECK4-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK4: .omp.reduction.then:
// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
// CHECK4-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
// CHECK4-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
// CHECK4-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK4-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
// CHECK4-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
// CHECK4-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
// CHECK4-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
// CHECK4: cond.true9:
// CHECK4-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK4-NEXT: br label [[COND_END11:%.*]]
// CHECK4: cond.false10:
// CHECK4-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
// CHECK4-NEXT: br label [[COND_END11]]
// CHECK4: cond.end11:
// CHECK4-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
// CHECK4-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
// CHECK4-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
// CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK4: .omp.reduction.done:
// CHECK4-NEXT: ret void
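//
// _omp_reduction_shuffle_and_reduce_func14 implements the warp-level step of
// the parallel reduction: each element of the reduce list is fetched from a
// remote lane with __kmpc_shuffle_int32 (the i16 element is widened to i32
// for the shuffle and truncated back). The last i16 argument appears to pick
// the variant: 0 reduces unconditionally, 1 reduces on lanes below the
// offset and afterwards copies the remote values back on the other lanes,
// and 2 reduces on even lanes when the offset is positive.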
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14
// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
// CHECK4-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
// CHECK4-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK4-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
// CHECK4-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
// CHECK4-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
// CHECK4-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
// CHECK4-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
// CHECK4-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
// CHECK4-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK4-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK4-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK4-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
// CHECK4-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK4-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
// CHECK4-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
// CHECK4-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
// CHECK4-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK4-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
// CHECK4-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
// CHECK4-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK4: then:
// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
// CHECK4-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func13"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
// CHECK4-NEXT: br label [[IFCONT:%.*]]
// CHECK4: else:
// CHECK4-NEXT: br label [[IFCONT]]
// CHECK4: ifcont:
// CHECK4-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK4-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK4-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
// CHECK4-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
// CHECK4: then6:
// CHECK4-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
// CHECK4-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK4-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
// CHECK4-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
// CHECK4-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
// CHECK4-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
// CHECK4-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
// CHECK4-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK4-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
// CHECK4-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
// CHECK4-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
// CHECK4-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
// CHECK4-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
// CHECK4-NEXT: br label [[IFCONT8:%.*]]
// CHECK4: else7:
// CHECK4-NEXT: br label [[IFCONT8]]
// CHECK4: ifcont8:
// CHECK4-NEXT: ret void
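//
// The inter-warp copy helper below stages one 32-bit value per warp through
// the shared-memory scratchpad @__openmp_nvptx_data_transfer_temporary_storage:
// lane 0 of each warp publishes its warp's partial result, a barrier follows,
// and the first threads of the block (one per active warp) read the values
// back into their reduce list. The sequence is repeated per reduction
// variable, with the i16 element going through the same i32 slots.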
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func15
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK4: then:
// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
// CHECK4-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
// CHECK4-NEXT: br label [[IFCONT:%.*]]
// CHECK4: else:
// CHECK4-NEXT: br label [[IFCONT]]
// CHECK4: ifcont:
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
// CHECK4: then4:
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
// CHECK4-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
// CHECK4-NEXT: br label [[IFCONT6:%.*]]
// CHECK4: else5:
// CHECK4-NEXT: br label [[IFCONT6]]
// CHECK4: ifcont6:
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
// CHECK4: then8:
// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
// CHECK4-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
// CHECK4-NEXT: br label [[IFCONT10:%.*]]
// CHECK4: else9:
// CHECK4-NEXT: br label [[IFCONT10]]
// CHECK4: ifcont10:
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
// CHECK4: then12:
// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
// CHECK4-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
// CHECK4-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
// CHECK4-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
// CHECK4-NEXT: br label [[IFCONT14:%.*]]
// CHECK4: else13:
// CHECK4-NEXT: br label [[IFCONT14]]
// CHECK4: ifcont14:
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17
// CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK4-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK4-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK4-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK4-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
// CHECK4-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK4-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK4-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
// CHECK4-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
// CHECK4-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK4-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
// CHECK4-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
// CHECK4-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
// CHECK4-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
// CHECK4-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
// CHECK4-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
// CHECK4-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
// CHECK4-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK4-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK4-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK4-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
// CHECK4-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK4-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
// CHECK4-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
// CHECK4-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
// CHECK4-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK4-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
// CHECK4-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
// CHECK4-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
// CHECK4-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK4: then:
// CHECK4-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
// CHECK4-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
// CHECK4-NEXT: br label [[IFCONT:%.*]]
// CHECK4: else:
// CHECK4-NEXT: br label [[IFCONT]]
// CHECK4: ifcont:
// CHECK4-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK4-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK4-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
// CHECK4-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
// CHECK4: then6:
// CHECK4-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
// CHECK4-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK4-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
// CHECK4-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
// CHECK4-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
// CHECK4-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
// CHECK4-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
// CHECK4-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK4-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
// CHECK4-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
// CHECK4-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
// CHECK4-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
// CHECK4-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
// CHECK4-NEXT: br label [[IFCONT8:%.*]]
// CHECK4: else7:
// CHECK4-NEXT: br label [[IFCONT8]]
// CHECK4: ifcont8:
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK4: then:
// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
// CHECK4-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
// CHECK4-NEXT: br label [[IFCONT:%.*]]
// CHECK4: else:
// CHECK4-NEXT: br label [[IFCONT]]
// CHECK4: ifcont:
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
// CHECK4: then4:
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
// CHECK4-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
// CHECK4-NEXT: br label [[IFCONT6:%.*]]
// CHECK4: else5:
// CHECK4-NEXT: br label [[IFCONT6]]
// CHECK4: ifcont6:
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
// CHECK4: then8:
// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
// CHECK4-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
// CHECK4-NEXT: br label [[IFCONT10:%.*]]
// CHECK4: else9:
// CHECK4-NEXT: br label [[IFCONT10]]
// CHECK4: ifcont10:
// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
// CHECK4: then12:
// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
// CHECK4-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
// CHECK4-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
// CHECK4-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
// CHECK4-NEXT: br label [[IFCONT14:%.*]]
// CHECK4: else13:
// CHECK4-NEXT: br label [[IFCONT14]]
// CHECK4: ifcont14:
// CHECK4-NEXT: ret void
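//
// The list_to_global/global_to_list helpers below move data between a team's
// reduce list and its slot in the global teams-reduction buffer (indexed by
// the incoming i32 argument, 2048 entries per variable in this
// configuration); the *_reduce variants materialize a reduce list over the
// buffer slot and forward to "_omp$reduction$reduction_func16".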
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
// CHECK4-NEXT: ret void
// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22
// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK4-NEXT: entry:
// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
// CHECK4-NEXT: ret void
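//
// The CHECK5 prefix re-checks the l23 codegen for a configuration that keeps
// the default teams-reduction buffer size: the
// __kmpc_nvptx_teams_reduce_nowait_v2 call below passes i32 1024 where the
// CHECK4 run above passed 2048.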
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
// CHECK5-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK5: .await.work:
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK5: .select.workers:
// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK5: .execute.parallel:
// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK5: .terminate.parallel:
// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK5: .barrier.parallel:
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK5: .exit:
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
// CHECK5-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
// CHECK5-NEXT: [[E7:%.*]] = alloca double, align 8
// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK5: .worker:
// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
// CHECK5: .mastercheck:
// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK5: .master:
// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK5-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
// CHECK5-NEXT: store double [[TMP7]], double* [[E7]], align 8
// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK5: .termination.notifier:
// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK5-NEXT: br label [[DOTEXIT]]
// CHECK5: .exit:
// CHECK5-NEXT: ret void
// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
// CHECK5-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
// CHECK5-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
// CHECK5-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
// CHECK5-NEXT: store double 0.000000e+00, double* [[E1]], align 8
// CHECK5-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8
// CHECK5-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00
// CHECK5-NEXT: store double [[ADD]], double* [[E1]], align 8
// CHECK5-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK5-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8*
// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK5-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 1024, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
// CHECK5-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
// CHECK5-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK5: .omp.reduction.then:
// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8
// CHECK5-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8
// CHECK5-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]]
// CHECK5-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK5: .omp.reduction.done:
// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
// CHECK5-NEXT: ret void
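//
// In the CHECK5 shuffle helper below the reduction element is a double, so
// the value is bitcast to i64 and exchanged with __kmpc_shuffle_int64; the
// shuffle entry points used here only traffic in 32- and 64-bit integers.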
// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
// CHECK5-NEXT: entry:
// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
// CHECK5-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
// CHECK5-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
// CHECK5-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK5-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
// CHECK5-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
// CHECK5-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8
// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
// CHECK5-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4
// CHECK5-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK5-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK5-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK5-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
// CHECK5-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK5-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
// CHECK5-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
// CHECK5-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
// CHECK5-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK5-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
// CHECK5-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
// CHECK5-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
// CHECK5-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK5: then:
// CHECK5-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
// CHECK5-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
// CHECK5-NEXT: br label [[IFCONT:%.*]]
// CHECK5: else:
// CHECK5-NEXT: br label [[IFCONT]]
// CHECK5: ifcont:
// CHECK5-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK5-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK5-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
// CHECK5-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
// CHECK5: then4:
// CHECK5-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK5-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
// CHECK5-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK5-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
// CHECK5-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
// CHECK5-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
// CHECK5-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
// CHECK5-NEXT: store double [[TMP46]], double* [[TMP45]], align 8
// CHECK5-NEXT: br label [[IFCONT6:%.*]]
// CHECK5: else5:
// CHECK5-NEXT: br label [[IFCONT6]]
// CHECK5: ifcont6:
// CHECK5-NEXT: ret void
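//
// Because the double spans two 32-bit transfer slots, the inter-warp copy
// below loops twice (precond/body/exit with a count of 2), moving one word
// per round through the shared-memory scratchpad instead of using the
// straight-line copies seen for the i32/i16 cases above.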
CHECK5: else5: 1634 // CHECK5-NEXT: br label [[IFCONT6]] 1635 // CHECK5: ifcont6: 1636 // CHECK5-NEXT: ret void 1637 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func 1638 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 1639 // CHECK5-NEXT: entry: 1640 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 1641 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1642 // CHECK5-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 1643 // CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1644 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 1645 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1646 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1647 // CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1648 // CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 1649 // CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1650 // CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 1651 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 1652 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* 1653 // CHECK5-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 1654 // CHECK5-NEXT: br label [[PRECOND:%.*]] 1655 // CHECK5: precond: 1656 // CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 1657 // CHECK5-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 1658 // CHECK5-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] 1659 // CHECK5: body: 1660 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) 1661 // CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 1662 // CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] 1663 // CHECK5: then: 1664 // CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 1665 // CHECK5-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 1666 // CHECK5-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* 1667 // CHECK5-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] 1668 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] 1669 // CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 1670 // CHECK5-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 1671 // CHECK5-NEXT: br label [[IFCONT:%.*]] 1672 // CHECK5: else: 1673 // CHECK5-NEXT: br label [[IFCONT]] 1674 // CHECK5: ifcont: 1675 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 1676 // CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 1677 // CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] 1678 // CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] 1679 // CHECK5: then4: 1680 // CHECK5-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] 1681 // CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 1682 // CHECK5-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 1683 // CHECK5-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* 1684 // CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] 1685 // 
CHECK5-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 1686 // CHECK5-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 1687 // CHECK5-NEXT: br label [[IFCONT6:%.*]] 1688 // CHECK5: else5: 1689 // CHECK5-NEXT: br label [[IFCONT6]] 1690 // CHECK5: ifcont6: 1691 // CHECK5-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 1692 // CHECK5-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 1693 // CHECK5-NEXT: br label [[PRECOND]] 1694 // CHECK5: exit: 1695 // CHECK5-NEXT: ret void 1696 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func 1697 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 1698 // CHECK5-NEXT: entry: 1699 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 1700 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1701 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 1702 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 1703 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1704 // CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 1705 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 1706 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* 1707 // CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 1708 // CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* 1709 // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 1710 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 1711 // CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 1712 // CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* 1713 // CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 1714 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] 1715 // CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 1716 // CHECK5-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 1717 // CHECK5-NEXT: ret void 1718 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func 1719 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 1720 // CHECK5-NEXT: entry: 1721 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 1722 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1723 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 1724 // CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 1725 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 1726 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1727 // CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 1728 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 1729 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* 1730 // CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 1731 // CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 1732 // CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 1733 // CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] 1734 // CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* 1735 // 
CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 1736 // CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 1737 // CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 1738 // CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] 1739 // CHECK5-NEXT: ret void 1740 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func 1741 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 1742 // CHECK5-NEXT: entry: 1743 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 1744 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1745 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 1746 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 1747 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1748 // CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 1749 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 1750 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* 1751 // CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 1752 // CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* 1753 // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 1754 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 1755 // CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 1756 // CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* 1757 // CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 1758 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] 1759 // CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 1760 // CHECK5-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 1761 // CHECK5-NEXT: ret void 1762 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func 1763 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 1764 // CHECK5-NEXT: entry: 1765 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 1766 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 1767 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 1768 // CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 1769 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 1770 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 1771 // CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 1772 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 1773 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* 1774 // CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 1775 // CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 1776 // CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 1777 // CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] 1778 // CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* 1779 // CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 1780 // CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 1781 // CHECK5-NEXT: 
[[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 1782 // CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] 1783 // CHECK5-NEXT: ret void 1784 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker 1785 // CHECK5-SAME: () #[[ATTR0]] { 1786 // CHECK5-NEXT: entry: 1787 // CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 1788 // CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 1789 // CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 1790 // CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 1791 // CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] 1792 // CHECK5: .await.work: 1793 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1794 // CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) 1795 // CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 1796 // CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 1797 // CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 1798 // CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null 1799 // CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] 1800 // CHECK5: .select.workers: 1801 // CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 1802 // CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 1803 // CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] 1804 // CHECK5: .execute.parallel: 1805 // CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1806 // CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* 1807 // CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) 1808 // CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 1809 // CHECK5: .terminate.parallel: 1810 // CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() 1811 // CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] 1812 // CHECK5: .barrier.parallel: 1813 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1814 // CHECK5-NEXT: br label [[DOTAWAIT_WORK]] 1815 // CHECK5: .exit: 1816 // CHECK5-NEXT: ret void 1817 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 1818 // CHECK5-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { 1819 // CHECK5-NEXT: entry: 1820 // CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 1821 // CHECK5-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 1822 // CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 1823 // CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 1824 // CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 1825 // CHECK5-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 1826 // CHECK5-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 1827 // CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* 1828 // CHECK5-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* 1829 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1830 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1831 // CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1832 // CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] 1833 // CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] 1834 // CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] 1835 // 
CHECK5: .worker: 1836 // CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] 1837 // CHECK5-NEXT: br label [[DOTEXIT:%.*]] 1838 // CHECK5: .mastercheck: 1839 // CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 1840 // CHECK5-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1841 // CHECK5-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1842 // CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 1843 // CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 1844 // CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 1845 // CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] 1846 // CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] 1847 // CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] 1848 // CHECK5: .master: 1849 // CHECK5-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 1850 // CHECK5-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1851 // CHECK5-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] 1852 // CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) 1853 // CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() 1854 // CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) 1855 // CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* 1856 // CHECK5-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4 1857 // CHECK5-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 1858 // CHECK5-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4 1859 // CHECK5-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4 1860 // CHECK5-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 1861 // CHECK5-NEXT: store float [[TMP8]], float* [[D9]], align 4 1862 // CHECK5-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 1863 // CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4 1864 // CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] 1865 // CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) 1866 // CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] 1867 // CHECK5: .termination.notifier: 1868 // CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) 1869 // CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) 1870 // CHECK5-NEXT: br label [[DOTEXIT]] 1871 // CHECK5: .exit: 1872 // CHECK5-NEXT: ret void 1873 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 1874 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { 1875 // CHECK5-NEXT: entry: 1876 // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 1877 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 1878 // CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 1879 // CHECK5-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 1880 // CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 1881 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], 
i32** [[DOTGLOBAL_TID__ADDR]], align 4 1882 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 1883 // CHECK5-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 1884 // CHECK5-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 1885 // CHECK5-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 1886 // CHECK5-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 1887 // CHECK5-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) 1888 // CHECK5-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2* 1889 // CHECK5-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1 1890 // CHECK5-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0 1891 // CHECK5-NEXT: store i8 0, i8* [[C1]], align 4 1892 // CHECK5-NEXT: store float 1.000000e+00, float* [[D2]], align 4 1893 // CHECK5-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4 1894 // CHECK5-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32 1895 // CHECK5-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 1896 // CHECK5-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 1897 // CHECK5-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 1898 // CHECK5-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4 1899 // CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01 1900 // CHECK5-NEXT: store float [[MUL]], float* [[D2]], align 4 1901 // CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 1902 // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 1903 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 1904 // CHECK5-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4 1905 // CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 1906 // CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8* 1907 // CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 1908 // CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 1909 // CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 1910 // CHECK5-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) 1911 // CHECK5-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 1912 // CHECK5-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] 1913 // CHECK5: .omp.reduction.then: 1914 // CHECK5-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1 1915 // CHECK5-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32 1916 // CHECK5-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4 1917 // CHECK5-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32 1918 // CHECK5-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] 1919 // CHECK5-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 1920 // CHECK5-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 1921 // CHECK5-NEXT: 
[[TMP17:%.*]] = load float, float* [[TMP1]], align 4 1922 // CHECK5-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4 1923 // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]] 1924 // CHECK5-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 1925 // CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) 1926 // CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] 1927 // CHECK5: .omp.reduction.done: 1928 // CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]]) 1929 // CHECK5-NEXT: ret void 1930 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 1931 // CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { 1932 // CHECK5-NEXT: entry: 1933 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 1934 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 1935 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 1936 // CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 1937 // CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 1938 // CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 1939 // CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 1940 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 1941 // CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 1942 // CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 1943 // CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 1944 // CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 1945 // CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* 1946 // CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 1947 // CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 1948 // CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 1949 // CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 1950 // CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 1951 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 1952 // CHECK5-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 1953 // CHECK5-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 1954 // CHECK5-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 1955 // CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1956 // CHECK5-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 1957 // CHECK5-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) 1958 // CHECK5-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 1959 // CHECK5-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 1960 // CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 1961 // CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 1962 // CHECK5-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 1963 // CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 1964 // CHECK5-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 1965 // CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 1966 // CHECK5-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* 1967 // CHECK5-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 
1968 // CHECK5-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* 1969 // CHECK5-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* 1970 // CHECK5-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* 1971 // CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 1972 // CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 1973 // CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 1974 // CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) 1975 // CHECK5-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 1976 // CHECK5-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 1977 // CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 1978 // CHECK5-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* 1979 // CHECK5-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 1980 // CHECK5-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 1981 // CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 1982 // CHECK5-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] 1983 // CHECK5-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] 1984 // CHECK5-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 1985 // CHECK5-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 1986 // CHECK5-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 1987 // CHECK5-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] 1988 // CHECK5-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 1989 // CHECK5-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] 1990 // CHECK5-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] 1991 // CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] 1992 // CHECK5-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] 1993 // CHECK5: then: 1994 // CHECK5-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* 1995 // CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* 1996 // CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] 1997 // CHECK5-NEXT: br label [[IFCONT:%.*]] 1998 // CHECK5: else: 1999 // CHECK5-NEXT: br label [[IFCONT]] 2000 // CHECK5: ifcont: 2001 // CHECK5-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 2002 // CHECK5-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] 2003 // CHECK5-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] 2004 // CHECK5-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] 2005 // CHECK5: then6: 2006 // CHECK5-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 2007 // CHECK5-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 2008 // CHECK5-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 2009 // CHECK5-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 2010 // CHECK5-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 2011 // CHECK5-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 2012 // CHECK5-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 2013 // CHECK5-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 2014 // CHECK5-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 2015 // CHECK5-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 2016 // CHECK5-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* 2017 // CHECK5-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to 
float* 2018 // CHECK5-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 2019 // CHECK5-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 2020 // CHECK5-NEXT: br label [[IFCONT8:%.*]] 2021 // CHECK5: else7: 2022 // CHECK5-NEXT: br label [[IFCONT8]] 2023 // CHECK5: ifcont8: 2024 // CHECK5-NEXT: ret void 2025 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 2026 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 2027 // CHECK5-NEXT: entry: 2028 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 2029 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 2030 // CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 2031 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 2032 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 2033 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 2034 // CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 2035 // CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 2036 // CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 2037 // CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 2038 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 2039 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* 2040 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 2041 // CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 2042 // CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] 2043 // CHECK5: then: 2044 // CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 2045 // CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 2046 // CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] 2047 // CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* 2048 // CHECK5-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 2049 // CHECK5-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 2050 // CHECK5-NEXT: br label [[IFCONT:%.*]] 2051 // CHECK5: else: 2052 // CHECK5-NEXT: br label [[IFCONT]] 2053 // CHECK5: ifcont: 2054 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 2055 // CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 2056 // CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] 2057 // CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] 2058 // CHECK5: then4: 2059 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] 2060 // CHECK5-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* 2061 // CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 2062 // CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 2063 // CHECK5-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 2064 // CHECK5-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 2065 // CHECK5-NEXT: br label [[IFCONT6:%.*]] 2066 // CHECK5: else5: 2067 // CHECK5-NEXT: br label [[IFCONT6]] 2068 // CHECK5: ifcont6: 2069 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 
[[TMP2]]) 2070 // CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 2071 // CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] 2072 // CHECK5: then8: 2073 // CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 2074 // CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 2075 // CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* 2076 // CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] 2077 // CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 2078 // CHECK5-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 2079 // CHECK5-NEXT: br label [[IFCONT10:%.*]] 2080 // CHECK5: else9: 2081 // CHECK5-NEXT: br label [[IFCONT10]] 2082 // CHECK5: ifcont10: 2083 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 2084 // CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 2085 // CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] 2086 // CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] 2087 // CHECK5: then12: 2088 // CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] 2089 // CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 2090 // CHECK5-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 2091 // CHECK5-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* 2092 // CHECK5-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 2093 // CHECK5-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 2094 // CHECK5-NEXT: br label [[IFCONT14:%.*]] 2095 // CHECK5: else13: 2096 // CHECK5-NEXT: br label [[IFCONT14]] 2097 // CHECK5: ifcont14: 2098 // CHECK5-NEXT: ret void 2099 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 2100 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 2101 // CHECK5-NEXT: entry: 2102 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 2103 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 2104 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 2105 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 2106 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 2107 // CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 2108 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 2109 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* 2110 // CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 2111 // CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* 2112 // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 2113 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 2114 // CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 2115 // CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 2116 // CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] 2117 // CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 2118 // CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 
128 2119 // CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 2120 // CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 2121 // CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* 2122 // CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 2123 // CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] 2124 // CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 2125 // CHECK5-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 2126 // CHECK5-NEXT: ret void 2127 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 2128 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 2129 // CHECK5-NEXT: entry: 2130 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 2131 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 2132 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 2133 // CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 2134 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 2135 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 2136 // CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 2137 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 2138 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* 2139 // CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 2140 // CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 2141 // CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 2142 // CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] 2143 // CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 2144 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 2145 // CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 2146 // CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] 2147 // CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* 2148 // CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 2149 // CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 2150 // CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 2151 // CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] 2152 // CHECK5-NEXT: ret void 2153 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 2154 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 2155 // CHECK5-NEXT: entry: 2156 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 2157 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 2158 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 2159 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 2160 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 2161 // CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 2162 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 2163 // CHECK5-NEXT: 
[[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* 2164 // CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 2165 // CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* 2166 // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 2167 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 2168 // CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 2169 // CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 2170 // CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] 2171 // CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 2172 // CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 2173 // CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 2174 // CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 2175 // CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* 2176 // CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 2177 // CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] 2178 // CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 2179 // CHECK5-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 2180 // CHECK5-NEXT: ret void 2181 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 2182 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 2183 // CHECK5-NEXT: entry: 2184 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 2185 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 2186 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 2187 // CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 2188 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 2189 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 2190 // CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 2191 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 2192 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* 2193 // CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 2194 // CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 2195 // CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 2196 // CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] 2197 // CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 2198 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 2199 // CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 2200 // CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] 2201 // CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* 2202 // CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 2203 // CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 2204 // CHECK5-NEXT: 
[[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 2205 // CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] 2206 // CHECK5-NEXT: ret void 2207 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 2208 // CHECK5-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { 2209 // CHECK5-NEXT: entry: 2210 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 2211 // CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 2212 // CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 2213 // CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 2214 // CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 2215 // CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 2216 // CHECK5-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 2217 // CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* 2218 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 2219 // CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 2220 // CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() 2221 // CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] 2222 // CHECK5: .execute: 2223 // CHECK5-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) 2224 // CHECK5-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 2225 // CHECK5-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] 2226 // CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] 2227 // CHECK5: .omp.deinit: 2228 // CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 2229 // CHECK5-NEXT: br label [[DOTEXIT:%.*]] 2230 // CHECK5: .exit: 2231 // CHECK5-NEXT: ret void 2232 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__9 2233 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { 2234 // CHECK5-NEXT: entry: 2235 // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 2236 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 2237 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 2238 // CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 2239 // CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4 2240 // CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2 2241 // CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 2242 // CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 2243 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 2244 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 2245 // CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 2246 // CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 2247 // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 2248 // CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 2249 // CHECK5-NEXT: store i32 0, i32* [[A1]], align 4 2250 // CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2 2251 // CHECK5-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 2252 // CHECK5-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* 2253 // CHECK5-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 2254 // CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 
1 2255 // CHECK5-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* 2256 // CHECK5-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 2257 // CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 2258 // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 2259 // CHECK5-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 2260 // CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) 2261 // CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 2262 // CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* 2263 // CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 2264 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 2265 // CHECK5-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* 2266 // CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 2267 // CHECK5-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 2268 // CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 2269 // CHECK5-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) 2270 // CHECK5-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 2271 // CHECK5-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] 2272 // CHECK5: .omp.reduction.then: 2273 // CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 2274 // CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 2275 // CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] 2276 // CHECK5-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 2277 // CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 2278 // CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 2279 // CHECK5-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 2280 // CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 2281 // CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] 2282 // CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] 2283 // CHECK5: cond.true: 2284 // CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 2285 // CHECK5-NEXT: br label [[COND_END:%.*]] 2286 // CHECK5: cond.false: 2287 // CHECK5-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 2288 // CHECK5-NEXT: br label [[COND_END]] 2289 // CHECK5: cond.end: 2290 // CHECK5-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] 2291 // CHECK5-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 2292 // CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) 2293 // CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] 2294 // CHECK5: .omp.reduction.done: 2295 // CHECK5-NEXT: ret void 2296 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__10 2297 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* 
noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { 2298 // CHECK5-NEXT: entry: 2299 // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 2300 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 2301 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 2302 // CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 2303 // CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4 2304 // CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2 2305 // CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 2306 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 2307 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 2308 // CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 2309 // CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 2310 // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 2311 // CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 2312 // CHECK5-NEXT: store i32 0, i32* [[A1]], align 4 2313 // CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2 2314 // CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 2315 // CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 2316 // CHECK5-NEXT: store i32 [[OR]], i32* [[A1]], align 4 2317 // CHECK5-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 2318 // CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 2319 // CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] 2320 // CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] 2321 // CHECK5: cond.true: 2322 // CHECK5-NEXT: br label [[COND_END:%.*]] 2323 // CHECK5: cond.false: 2324 // CHECK5-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 2325 // CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 2326 // CHECK5-NEXT: br label [[COND_END]] 2327 // CHECK5: cond.end: 2328 // CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] 2329 // CHECK5-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 2330 // CHECK5-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 2331 // CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 2332 // CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 2333 // CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 2334 // CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* 2335 // CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 2336 // CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 2337 // CHECK5-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* 2338 // CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 2339 // CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 2340 // CHECK5-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) 2341 // CHECK5-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 2342 // CHECK5-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] 2343 // CHECK5: .omp.reduction.then: 2344 // CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 2345 // CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], 
align 4 2346 // CHECK5-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] 2347 // CHECK5-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 2348 // CHECK5-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 2349 // CHECK5-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 2350 // CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 2351 // CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 2352 // CHECK5-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] 2353 // CHECK5-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] 2354 // CHECK5: cond.true9: 2355 // CHECK5-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 2356 // CHECK5-NEXT: br label [[COND_END11:%.*]] 2357 // CHECK5: cond.false10: 2358 // CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 2359 // CHECK5-NEXT: br label [[COND_END11]] 2360 // CHECK5: cond.end11: 2361 // CHECK5-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] 2362 // CHECK5-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 2363 // CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) 2364 // CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] 2365 // CHECK5: .omp.reduction.done: 2366 // CHECK5-NEXT: ret void 2367 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 2368 // CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { 2369 // CHECK5-NEXT: entry: 2370 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 2371 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 2372 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 2373 // CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 2374 // CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 2375 // CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 2376 // CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 2377 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 2378 // CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 2379 // CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 2380 // CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 2381 // CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 2382 // CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* 2383 // CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 2384 // CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 2385 // CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 2386 // CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 2387 // CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 2388 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 2389 // CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* 2390 // CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 2391 // CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* 2392 // CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 2393 // CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 2394 // CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 2395 // CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) 2396 // CHECK5-NEXT: store i32 [[TMP17]], i32* 
[[DOTOMP_REDUCTION_ELEMENT]], align 4 2397 // CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 2398 // CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 2399 // CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* 2400 // CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 2401 // CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 2402 // CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 2403 // CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 2404 // CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* 2405 // CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 2406 // CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* 2407 // CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 2408 // CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 2409 // CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 2410 // CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 2411 // CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) 2412 // CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 2413 // CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 2414 // CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 2415 // CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 2416 // CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* 2417 // CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 2418 // CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 2419 // CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 2420 // CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] 2421 // CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] 2422 // CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 2423 // CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 2424 // CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 2425 // CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] 2426 // CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 2427 // CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] 2428 // CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] 2429 // CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] 2430 // CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] 2431 // CHECK5: then: 2432 // CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* 2433 // CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* 2434 // CHECK5-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] 2435 // CHECK5-NEXT: br label [[IFCONT:%.*]] 2436 // CHECK5: else: 2437 // CHECK5-NEXT: br label [[IFCONT]] 2438 // CHECK5: ifcont: 2439 // CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 2440 // CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] 2441 // CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] 2442 // CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] 2443 // CHECK5: then6: 2444 // CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 2445 // CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 
2446 // CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
2447 // CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
2448 // CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
2449 // CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
2450 // CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
2451 // CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
2452 // CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2453 // CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
2454 // CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
2455 // CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
2456 // CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
2457 // CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
2458 // CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
2459 // CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
2460 // CHECK5-NEXT: br label [[IFCONT8:%.*]]
2461 // CHECK5: else7:
2462 // CHECK5-NEXT: br label [[IFCONT8]]
2463 // CHECK5: ifcont8:
2464 // CHECK5-NEXT: ret void
2465 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
2466 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
2467 // CHECK5-NEXT: entry:
2468 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
2469 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2470 // CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
2471 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2472 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2473 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2474 // CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2475 // CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
2476 // CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2477 // CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
2478 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2479 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
2480 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
2481 // CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2482 // CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
2483 // CHECK5: then:
2484 // CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2485 // CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
2486 // CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
2487 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2488 // CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
2489 // CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
2490 // CHECK5-NEXT: br label [[IFCONT:%.*]]
2491 // CHECK5: else:
2492 // CHECK5-NEXT: br label [[IFCONT]]
2493 // CHECK5: ifcont:
2494 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2495 // CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2496 // CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
2497 // CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
2498 // CHECK5: then4:
2499 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
2500 // CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2501 // CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
2502 // CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
2503 // CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
2504 // CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
2505 // CHECK5-NEXT: br label [[IFCONT6:%.*]]
2506 // CHECK5: else5:
2507 // CHECK5-NEXT: br label [[IFCONT6]]
2508 // CHECK5: ifcont6:
2509 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2510 // CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2511 // CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
2512 // CHECK5: then8:
2513 // CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2514 // CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
2515 // CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
2516 // CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2517 // CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
2518 // CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
2519 // CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
2520 // CHECK5-NEXT: br label [[IFCONT10:%.*]]
2521 // CHECK5: else9:
2522 // CHECK5-NEXT: br label [[IFCONT10]]
2523 // CHECK5: ifcont10:
2524 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2525 // CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2526 // CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
2527 // CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
2528 // CHECK5: then12:
2529 // CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
2530 // CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
2531 // CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2532 // CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
2533 // CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
2534 // CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
2535 // CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
2536 // CHECK5-NEXT: br label [[IFCONT14:%.*]]
2537 // CHECK5: else13:
2538 // CHECK5-NEXT: br label [[IFCONT14]]
2539 // CHECK5: ifcont14:
2540 // CHECK5-NEXT: ret void
2541 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
2542 // CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
2543 // CHECK5-NEXT: entry:
2544 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
2545 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
2546 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
2547 // CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
2548 // CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
2549 // CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
2550 // CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
2551 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2552 // CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
2553 // CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
2554 // CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
2555 // CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2556 // CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
2557 // CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
2558 // CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
2559 // CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
2560 // CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
2561 // CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
2562 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2563 // CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
2564 // CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
2565 // CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
2566 // CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
2567 // CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2568 // CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
2569 // CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
2570 // CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
2571 // CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
2572 // CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
2573 // CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
2574 // CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
2575 // CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
2576 // CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
2577 // CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2578 // CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
2579 // CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
2580 // CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
2581 // CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
2582 // CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
2583 // CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2584 // CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
2585 // CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
2586 // CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
2587 // CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
2588 // CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
2589 // CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
2590 // CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
2591 // CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
2592 // CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
2593 // CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
2594 // CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
2595 // CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
2596 // CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
2597 // CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
2598 // CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
2599 // CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
2600 // CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
2601 // CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
2602 // CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
2603 // CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
2604 // CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
2605 // CHECK5: then:
2606 // CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
2607 // CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
2608 // CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
2609 // CHECK5-NEXT: br label [[IFCONT:%.*]]
2610 // CHECK5: else:
2611 // CHECK5-NEXT: br label [[IFCONT]]
2612 // CHECK5: ifcont:
2613 // CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
2614 // CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
2615 // CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
2616 // CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
2617 // CHECK5: then6:
2618 // CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2619 // CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
2620 // CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
2621 // CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
2622 // CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
2623 // CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
2624 // CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
2625 // CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
2626 // CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2627 // CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
2628 // CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
2629 // CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
2630 // CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
2631 // CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
2632 // CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
2633 // CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
2634 // CHECK5-NEXT: br label [[IFCONT8:%.*]]
2635 // CHECK5: else7:
2636 // CHECK5-NEXT: br label [[IFCONT8]]
2637 // CHECK5: ifcont8:
2638 // CHECK5-NEXT: ret void
2639 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
2640 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
2641 // CHECK5-NEXT: entry:
2642 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
2643 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2644 // CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
2645 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2646 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2647 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2648 // CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2649 // CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
2650 // CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2651 // CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
2652 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2653 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
2654 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2655 // CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2656 // CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
2657 // CHECK5: then:
2658 // CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2659 // CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
2660 // CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
2661 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2662 // CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
2663 // CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
2664 // CHECK5-NEXT: br label [[IFCONT:%.*]]
2665 // CHECK5: else:
2666 // CHECK5-NEXT: br label [[IFCONT]]
2667 // CHECK5: ifcont:
2668 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2669 // CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2670 // CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
2671 // CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
2672 // CHECK5: then4:
2673 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
2674 // CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2675 // CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
2676 // CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
2677 // CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
2678 // CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
2679 // CHECK5-NEXT: br label [[IFCONT6:%.*]]
2680 // CHECK5: else5:
2681 // CHECK5-NEXT: br label [[IFCONT6]]
2682 // CHECK5: ifcont6:
2683 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2684 // CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2685 // CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
2686 // CHECK5: then8:
2687 // CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2688 // CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
2689 // CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
2690 // CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2691 // CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
2692 // CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
2693 // CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
2694 // CHECK5-NEXT: br label [[IFCONT10:%.*]]
2695 // CHECK5: else9:
2696 // CHECK5-NEXT: br label [[IFCONT10]]
2697 // CHECK5: ifcont10:
2698 // CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2699 // CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2700 // CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
2701 // CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
2702 // CHECK5: then12:
2703 // CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
2704 // CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
2705 // CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2706 // CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
2707 // CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
2708 // CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
2709 // CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
2710 // CHECK5-NEXT: br label [[IFCONT14:%.*]]
2711 // CHECK5: else13:
2712 // CHECK5-NEXT: br label [[IFCONT14]]
2713 // CHECK5: ifcont14:
2714 // CHECK5-NEXT: ret void
2715 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
2716 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
2717 // CHECK5-NEXT: entry:
2718 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
2719 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2720 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
2721 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2722 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2723 // CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
2724 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
2725 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
2726 // CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2727 // CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
2728 // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2729 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2730 // CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
2731 // CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
2732 // CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
2733 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
2734 // CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
2735 // CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
2736 // CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2737 // CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
2738 // CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
2739 // CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
2740 // CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
2741 // CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
2742 // CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
2743 // CHECK5-NEXT: ret void
2744 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
2745 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
2746 // CHECK5-NEXT: entry:
2747 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
2748 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2749 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
2750 // CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
2751 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2752 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2753 // CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
2754 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2755 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
2756 // CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2757 // CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2758 // CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
2759 // CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
2760 // CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
2761 // CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
2762 // CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
2763 // CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
2764 // CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
2765 // CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
2766 // CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
2767 // CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
2768 // CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
2769 // CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
2770 // CHECK5-NEXT: ret void
2771 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
2772 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
2773 // CHECK5-NEXT: entry:
2774 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
2775 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2776 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
2777 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2778 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2779 // CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
2780 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
2781 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
2782 // CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2783 // CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
2784 // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2785 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2786 // CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
2787 // CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
2788 // CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
2789 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
2790 // CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
2791 // CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
2792 // CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2793 // CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
2794 // CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
2795 // CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
2796 // CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
2797 // CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
2798 // CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
2799 // CHECK5-NEXT: ret void
2800 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
2801 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
2802 // CHECK5-NEXT: entry:
2803 // CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
2804 // CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2805 // CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
2806 // CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
2807 // CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2808 // CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2809 // CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
2810 // CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2811 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
2812 // CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2813 // CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2814 // CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
2815 // CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
2816 // CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
2817 // CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
2818 // CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
2819 // CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
2820 // CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
2821 // CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
2822 // CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
2823 // CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
2824 // CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
2825 // CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
2826 // CHECK5-NEXT: ret void
2827 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
2828 // CHECK6-SAME: () #[[ATTR0:[0-9]+]] {
2829 // CHECK6-NEXT: entry:
2830 // CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
2831 // CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
2832 // CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4
2833 // CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
2834 // CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]]
2835 // CHECK6: .await.work:
2836 // CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2837 // CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
2838 // CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
2839 // CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
2840 // CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2841 // CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
2842 // CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
2843 // CHECK6: .select.workers:
2844 // CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
2845 // CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
2846 // CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
2847 // CHECK6: .execute.parallel:
2848 // CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
2849 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
2850 // CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
2851 // CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
2852 // CHECK6: .terminate.parallel:
2853 // CHECK6-NEXT: call void @__kmpc_kernel_end_parallel()
2854 // CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]]
2855 // CHECK6: .barrier.parallel:
2856 // CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2857 // CHECK6-NEXT: br label [[DOTAWAIT_WORK]]
2858 // CHECK6: .exit:
2859 // CHECK6-NEXT: ret void
2860 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
2861 // CHECK6-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
2862 // CHECK6-NEXT: entry:
2863 // CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
2864 // CHECK6-NEXT: [[E7:%.*]] = alloca double, align 8
2865 // CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2866 // CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
2867 // CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
2868 // CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
2869 // CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
2870 // CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2871 // CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2872 // CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2873 // CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
2874 // CHECK6-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
2875 // CHECK6-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
2876 // CHECK6: .worker:
2877 // CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
2878 // CHECK6-NEXT: br label [[DOTEXIT:%.*]]
2879 // CHECK6: .mastercheck:
2880 // CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2881 // CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2882 // CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2883 // CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
2884 // CHECK6-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
2885 // CHECK6-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
2886 // CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
2887 // CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
2888 // CHECK6-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
2889 // CHECK6: .master:
2890 // CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2891 // CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2892 // CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
2893 // CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
2894 // CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack()
2895 // CHECK6-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
2896 // CHECK6-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
2897 // CHECK6-NEXT: store double [[TMP7]], double* [[E7]], align 8
2898 // CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
2899 // CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
2900 // CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
2901 // CHECK6: .termination.notifier:
2902 // CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1)
2903 // CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2904 // CHECK6-NEXT: br label [[DOTEXIT]]
2905 // CHECK6: .exit:
2906 // CHECK6-NEXT: ret void
2907 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__
2908 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
2909 // CHECK6-NEXT: entry:
2910 // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
2911 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
2912 // CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
2913 // CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
2914 // CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
2915 // CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
2916 // CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
2917 // CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
2918 // CHECK6-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
2919 // CHECK6-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
2920 // CHECK6-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
2921 // CHECK6-NEXT: store double 0.000000e+00, double* [[E1]], align 8
2922 // CHECK6-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8
2923 // CHECK6-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00
2924 // CHECK6-NEXT: store double [[ADD]], double* [[E1]], align 8
2925 // CHECK6-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
2926 // CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
2927 // CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2928 // CHECK6-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8*
2929 // CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
2930 // CHECK6-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
2931 // CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
2932 // CHECK6-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 2048, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
2933 // CHECK6-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
2934 // CHECK6-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
2935 // CHECK6: .omp.reduction.then:
2936 // CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8
2937 // CHECK6-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8
2938 // CHECK6-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]]
2939 // CHECK6-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
2940 // CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
2941 // CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
2942 // CHECK6: .omp.reduction.done:
2943 // CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
2944 // CHECK6-NEXT: ret void
2945 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
2946 // CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
2947 // CHECK6-NEXT: entry:
2948 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
2949 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
2950 // CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
2951 // CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
2952 // CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
2953 // CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
2954 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2955 // CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
2956 // CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
2957 // CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
2958 // CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2959 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
2960 // CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
2961 // CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
2962 // CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
2963 // CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
2964 // CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
2965 // CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2966 // CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
2967 // CHECK6-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
2968 // CHECK6-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
2969 // CHECK6-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
2970 // CHECK6-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
2971 // CHECK6-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
2972 // CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2973 // CHECK6-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
2974 // CHECK6-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
2975 // CHECK6-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8
2976 // CHECK6-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
2977 // CHECK6-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
2978 // CHECK6-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
2979 // CHECK6-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4
2980 // CHECK6-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
2981 // CHECK6-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
2982 // CHECK6-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
2983 // CHECK6-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
2984 // CHECK6-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
2985 // CHECK6-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
2986 // CHECK6-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
2987 // CHECK6-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
2988 // CHECK6-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
2989 // CHECK6-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
2990 // CHECK6-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
2991 // CHECK6-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
2992 // CHECK6-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
2993 // CHECK6: then:
2994 // CHECK6-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
2995 // CHECK6-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
2996 // CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
2997 // CHECK6-NEXT: br label [[IFCONT:%.*]]
2998 // CHECK6: else:
2999 // CHECK6-NEXT: br label [[IFCONT]]
3000 // CHECK6: ifcont:
3001 // CHECK6-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
3002 // CHECK6-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
3003 // CHECK6-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
3004 // CHECK6-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
3005 // CHECK6: then4:
3006 // CHECK6-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
3007 // CHECK6-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
3008 // CHECK6-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
3009 // CHECK6-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
3010 // CHECK6-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
3011 // CHECK6-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
3012 // CHECK6-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
3013 // CHECK6-NEXT: store double [[TMP46]], double* [[TMP45]], align 8
3014 // CHECK6-NEXT: br label [[IFCONT6:%.*]]
3015 // CHECK6: else5:
3016 // CHECK6-NEXT: br label [[IFCONT6]]
3017 // CHECK6: ifcont6:
3018 // CHECK6-NEXT: ret void
3019 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
3020 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
3021 // CHECK6-NEXT: entry:
3022 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
3023 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3024 // CHECK6-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
3025 // CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
3026 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3027 // CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3028 // CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3029 // CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3030 // CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
3031 // CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3032 // CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
3033 // CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3034 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
3035 // CHECK6-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
3036 // CHECK6-NEXT: br label [[PRECOND:%.*]]
3037 // CHECK6: precond:
3038 // CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
3039 // CHECK6-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
3040 // CHECK6-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
3041 // CHECK6: body:
3042 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
3043 // CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
3044 // CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
3045 // CHECK6: then:
3046 // CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
3047 // CHECK6-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4
3048 // CHECK6-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
3049 // CHECK6-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
3050 // CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
3051 // CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
3052 // CHECK6-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
3053 // CHECK6-NEXT: br label [[IFCONT:%.*]]
3054 // CHECK6: else:
3055 // CHECK6-NEXT: br label [[IFCONT]]
3056 // CHECK6: ifcont:
3057 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
3058 // CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3059 // CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
3060 // CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
3061 // CHECK6: then4:
3062 // CHECK6-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
3063 // CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
3064 // CHECK6-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
3065 // CHECK6-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
3066 // CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
3067 // CHECK6-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
3068 // CHECK6-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4
3069 // CHECK6-NEXT: br label [[IFCONT6:%.*]]
3070 // CHECK6: else5:
3071 // CHECK6-NEXT: br label [[IFCONT6]]
3072 // CHECK6: ifcont6:
3073 // CHECK6-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
3074 // CHECK6-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
3075 // CHECK6-NEXT: br label [[PRECOND]]
3076 // CHECK6: exit:
3077 // CHECK6-NEXT: ret void
3078 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
3079 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
3080 // CHECK6-NEXT: entry:
3081 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
3082 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3083 // CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
3084 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3085 // CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3086 // CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
3087 // CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
3088 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
3089 // CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3090 // CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
3091 // CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3092 // CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
3093 // CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
3094 // CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
3095 // CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
3096 // CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
3097 // CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
3098 // CHECK6-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
3099 // CHECK6-NEXT: ret void
3100 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
3101 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
3102 // CHECK6-NEXT: entry:
3103 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
3104 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3105 // CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
3106 // CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
3107 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3108 // CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3109 // CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
3110 // CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3111 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
3112 // CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3113 // CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3114 // CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
3115 // CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
3116 // CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
3117 // CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
3118 // CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
3119 // CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
3120 // CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
3121 // CHECK6-NEXT: ret void
3122 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
3123 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
3124 // CHECK6-NEXT: entry:
3125 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
3126 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3127 // CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
3128 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3129 // CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3130 // CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
3131 // CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
3132 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
3133 // CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3134 // CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
3135 // CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3136 // CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
3137 // CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
3138 // CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
3139 // CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
3140 // CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
3141 // CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
3142 // CHECK6-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
3143 // CHECK6-NEXT: ret void
3144 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
3145 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
3146 // CHECK6-NEXT: entry:
3147 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
3148 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3149 // CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
3150 // CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
3151 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3152 // CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3153 // CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
3154 // CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3155 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
3156 // CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3157 // CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3158 // CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
3159 // CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
3160 // CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
3161 // CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
3162 // CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
3163 // CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
3164 // CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
3165 // CHECK6-NEXT: ret void
3166 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
3167 // CHECK6-SAME: () #[[ATTR0]] {
3168 // CHECK6-NEXT: entry:
3169 // CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
3170 // CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
3171 // CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4
3172 // CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
3173 // CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]]
3174 // CHECK6: .await.work:
3175 // CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
3176 // CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
3177 // CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
3178 // CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
3179 // CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
3180 // CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
3181 // CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
3182 // CHECK6: .select.workers:
3183 // CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
3184 // CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
3185 // CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
3186 // CHECK6: .execute.parallel:
3187 // CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
3188 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
3189 // CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
3190 // CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
3191 // CHECK6: .terminate.parallel:
3192 // CHECK6-NEXT: call void @__kmpc_kernel_end_parallel()
3193 // CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]]
3194 // CHECK6: .barrier.parallel:
3195 // CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
3196 // CHECK6-NEXT: br label [[DOTAWAIT_WORK]]
3197 // CHECK6: .exit:
3198 // CHECK6-NEXT: ret void
3199 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
3200 // CHECK6-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
3201 // CHECK6-NEXT: entry:
3202 // CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
3203 // CHECK6-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
3204 // CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
3205 // CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
3206 // CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
3207 // CHECK6-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
3208 // CHECK6-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
3209 // CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
3210 // CHECK6-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
3211 // CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3212 // CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
3213 // CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3214 // CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
3215 // CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
3216 // CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
3217 // CHECK6: .worker:
3218 // CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
3219 // CHECK6-NEXT: br label [[DOTEXIT:%.*]]
3220 // CHECK6: .mastercheck:
3221 // CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3222 // CHECK6-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
3223 // CHECK6-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3224 // CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
3225 // CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
3226 // CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
3227 // CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
3228 // CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
3229 // CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
3230 // CHECK6: .master:
3231 // CHECK6-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
3232 // CHECK6-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3233 // CHECK6-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
3234 // CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
3235 // CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack()
3236 // CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
3237 // CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
3238 // CHECK6-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4
3239 // CHECK6-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
3240 // CHECK6-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4
3241 // CHECK6-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4
3242 // CHECK6-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
3243 // CHECK6-NEXT: store float [[TMP8]], float* [[D9]], align 4
3244 // CHECK6-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
3245 // CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4
3246 // CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
3247 // CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
3248 // CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
3249 // CHECK6: .termination.notifier:
3250 // CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1)
3251 // CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
3252 // CHECK6-NEXT: br label [[DOTEXIT]]
3253 // CHECK6: .exit:
3254 // CHECK6-NEXT: ret void
3255 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1
3256 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
3257 // CHECK6-NEXT: entry:
3258 // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
3259 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
3260 // CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
3261 // CHECK6-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
3262 // CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
3263 // CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
3264 // CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
3265 // CHECK6-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
3266 // CHECK6-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
3267 // CHECK6-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
3268 // CHECK6-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
3269 // CHECK6-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
3270 // CHECK6-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2*
3271 // CHECK6-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1
3272 // CHECK6-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0
3273 // CHECK6-NEXT: store i8 0, i8* [[C1]], align 4
3274 // CHECK6-NEXT: store float 1.000000e+00, float* [[D2]], align 4
3275 // CHECK6-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4
3276 // CHECK6-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32
3277 // CHECK6-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
3278 // CHECK6-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
3279 // CHECK6-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4
3280 // CHECK6-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4
3281 // CHECK6-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01
3282 // CHECK6-NEXT: store float [[MUL]], float* [[D2]], align 4
3283 // CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
3284 // CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
3285 // CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3286 // CHECK6-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4
3287 // CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
3288 // CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8*
3289 // CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
3290 // CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
3291 // CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
3292 // CHECK6-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
3293 // CHECK6-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
3294 // CHECK6-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
3295 // CHECK6: .omp.reduction.then:
3296 // CHECK6-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1
3297 // CHECK6-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32
3298 // CHECK6-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4
3299 // CHECK6-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32
3300 // CHECK6-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
3301 // CHECK6-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
3302 // CHECK6-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
3303 // CHECK6-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4
3304 // CHECK6-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4
3305 // CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]]
3306 // CHECK6-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
3307 // CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
3308 // CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
3309 // CHECK6: .omp.reduction.done:
3310 // CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]])
3311 // CHECK6-NEXT: ret void
3312 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
3313 // CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
3314 // CHECK6-NEXT: entry:
3315 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
3316 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
3317 // CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
3318 // CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
3319 // CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
3320 // CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
3321 // CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
3322 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3323 // CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
3324 // CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
3325 // CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
3326 // CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3327 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
3328 // CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
3329 // CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
3330 // CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
3331 // CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
3332 // CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
3333 // CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
3334 // CHECK6-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
3335 // CHECK6-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
3336 // CHECK6-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
3337 // CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3338 // CHECK6-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
3339 // CHECK6-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
3340 // CHECK6-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
3341 // CHECK6-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
3342 // CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
3343 // CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
3344 // CHECK6-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
3345 // CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
3346 // CHECK6-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4
3347 // CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
3348 // CHECK6-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
3349 // CHECK6-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1
3350 // CHECK6-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
3351 // CHECK6-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
3352 // CHECK6-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
3353 // CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
3354 // CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3355 // CHECK6-NEXT: [[TMP29:%.*]] = trunc i32
[[NVPTX_WARP_SIZE5]] to i16 3356 // CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) 3357 // CHECK6-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 3358 // CHECK6-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 3359 // CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 3360 // CHECK6-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* 3361 // CHECK6-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 3362 // CHECK6-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 3363 // CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 3364 // CHECK6-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] 3365 // CHECK6-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] 3366 // CHECK6-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 3367 // CHECK6-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 3368 // CHECK6-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 3369 // CHECK6-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] 3370 // CHECK6-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 3371 // CHECK6-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] 3372 // CHECK6-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] 3373 // CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] 3374 // CHECK6-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] 3375 // CHECK6: then: 3376 // CHECK6-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* 3377 // CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* 3378 // CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] 3379 // CHECK6-NEXT: br label [[IFCONT:%.*]] 3380 // CHECK6: else: 3381 // CHECK6-NEXT: br label [[IFCONT]] 3382 // CHECK6: ifcont: 3383 // CHECK6-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 3384 // CHECK6-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] 3385 // CHECK6-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] 3386 // CHECK6-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] 3387 // CHECK6: then6: 3388 // CHECK6-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 3389 // CHECK6-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 3390 // CHECK6-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 3391 // CHECK6-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 3392 // CHECK6-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 3393 // CHECK6-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 3394 // CHECK6-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 3395 // CHECK6-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 3396 // CHECK6-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 3397 // CHECK6-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 3398 // CHECK6-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* 3399 // CHECK6-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* 3400 // CHECK6-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 3401 // CHECK6-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 3402 // CHECK6-NEXT: br label [[IFCONT8:%.*]] 3403 // CHECK6: else7: 3404 // CHECK6-NEXT: br label [[IFCONT8]] 3405 // CHECK6: ifcont8: 3406 // CHECK6-NEXT: ret void 3407 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 3408 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 
[[TMP1:%.*]]) #[[ATTR0]] { 3409 // CHECK6-NEXT: entry: 3410 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 3411 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 3412 // CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 3413 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 3414 // CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 3415 // CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 3416 // CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 3417 // CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 3418 // CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 3419 // CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 3420 // CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 3421 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* 3422 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 3423 // CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 3424 // CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] 3425 // CHECK6: then: 3426 // CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 3427 // CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 3428 // CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] 3429 // CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* 3430 // CHECK6-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 3431 // CHECK6-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 3432 // CHECK6-NEXT: br label [[IFCONT:%.*]] 3433 // CHECK6: else: 3434 // CHECK6-NEXT: br label [[IFCONT]] 3435 // CHECK6: ifcont: 3436 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 3437 // CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 3438 // CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] 3439 // CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] 3440 // CHECK6: then4: 3441 // CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] 3442 // CHECK6-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* 3443 // CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 3444 // CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 3445 // CHECK6-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 3446 // CHECK6-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 3447 // CHECK6-NEXT: br label [[IFCONT6:%.*]] 3448 // CHECK6: else5: 3449 // CHECK6-NEXT: br label [[IFCONT6]] 3450 // CHECK6: ifcont6: 3451 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 3452 // CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 3453 // CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] 3454 // CHECK6: then8: 3455 // CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 3456 // CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 3457 // CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* 
3458 // CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] 3459 // CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 3460 // CHECK6-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 3461 // CHECK6-NEXT: br label [[IFCONT10:%.*]] 3462 // CHECK6: else9: 3463 // CHECK6-NEXT: br label [[IFCONT10]] 3464 // CHECK6: ifcont10: 3465 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 3466 // CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 3467 // CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] 3468 // CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] 3469 // CHECK6: then12: 3470 // CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] 3471 // CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 3472 // CHECK6-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 3473 // CHECK6-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* 3474 // CHECK6-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 3475 // CHECK6-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 3476 // CHECK6-NEXT: br label [[IFCONT14:%.*]] 3477 // CHECK6: else13: 3478 // CHECK6-NEXT: br label [[IFCONT14]] 3479 // CHECK6: ifcont14: 3480 // CHECK6-NEXT: ret void 3481 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 3482 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 3483 // CHECK6-NEXT: entry: 3484 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 3485 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 3486 // CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 3487 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 3488 // CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 3489 // CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 3490 // CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 3491 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* 3492 // CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 3493 // CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* 3494 // CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 3495 // CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 3496 // CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 3497 // CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 3498 // CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] 3499 // CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 3500 // CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 3501 // CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 3502 // CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 3503 // CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* 3504 // CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 3505 // CHECK6-NEXT: [[TMP15:%.*]] = 
getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] 3506 // CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 3507 // CHECK6-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 3508 // CHECK6-NEXT: ret void 3509 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 3510 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 3511 // CHECK6-NEXT: entry: 3512 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 3513 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 3514 // CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 3515 // CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 3516 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 3517 // CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 3518 // CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 3519 // CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 3520 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* 3521 // CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 3522 // CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 3523 // CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 3524 // CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] 3525 // CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 3526 // CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 3527 // CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 3528 // CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] 3529 // CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* 3530 // CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 3531 // CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 3532 // CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 3533 // CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] 3534 // CHECK6-NEXT: ret void 3535 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 3536 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 3537 // CHECK6-NEXT: entry: 3538 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 3539 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 3540 // CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 3541 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 3542 // CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 3543 // CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 3544 // CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 3545 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* 3546 // CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 3547 // CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* 3548 // CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 3549 // CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 3550 // CHECK6-NEXT: [[TMP9:%.*]] = load i8*, 
i8** [[TMP8]], align 4 3551 // CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 3552 // CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]] 3553 // CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 3554 // CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 3555 // CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 3556 // CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 3557 // CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* 3558 // CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 3559 // CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]] 3560 // CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 3561 // CHECK6-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 3562 // CHECK6-NEXT: ret void 3563 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 3564 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { 3565 // CHECK6-NEXT: entry: 3566 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 3567 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 3568 // CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 3569 // CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 3570 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 3571 // CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 3572 // CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 3573 // CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 3574 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* 3575 // CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 3576 // CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 3577 // CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 3578 // CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]] 3579 // CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 3580 // CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 3581 // CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 3582 // CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]] 3583 // CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* 3584 // CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 3585 // CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 3586 // CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 3587 // CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] 3588 // CHECK6-NEXT: ret void 3589 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 3590 // CHECK6-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { 3591 // CHECK6-NEXT: entry: 3592 // CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 3593 // 
CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 3594 // CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 3595 // CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 3596 // CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 3597 // CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 3598 // CHECK6-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 3599 // CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* 3600 // CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 3601 // CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 3602 // CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() 3603 // CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] 3604 // CHECK6: .execute: 3605 // CHECK6-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) 3606 // CHECK6-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 3607 // CHECK6-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] 3608 // CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] 3609 // CHECK6: .omp.deinit: 3610 // CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 3611 // CHECK6-NEXT: br label [[DOTEXIT:%.*]] 3612 // CHECK6: .exit: 3613 // CHECK6-NEXT: ret void 3614 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__9 3615 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { 3616 // CHECK6-NEXT: entry: 3617 // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 3618 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 3619 // CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 3620 // CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 3621 // CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4 3622 // CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2 3623 // CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 3624 // CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 3625 // CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 3626 // CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 3627 // CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 3628 // CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 3629 // CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 3630 // CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 3631 // CHECK6-NEXT: store i32 0, i32* [[A1]], align 4 3632 // CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2 3633 // CHECK6-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 3634 // CHECK6-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* 3635 // CHECK6-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 3636 // CHECK6-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 3637 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* 3638 // CHECK6-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 3639 // CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 3640 // CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 3641 // CHECK6-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 3642 // CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], 
i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) 3643 // CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 3644 // CHECK6-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* 3645 // CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 3646 // CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 3647 // CHECK6-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* 3648 // CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 3649 // CHECK6-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 3650 // CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 3651 // CHECK6-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) 3652 // CHECK6-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 3653 // CHECK6-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] 3654 // CHECK6: .omp.reduction.then: 3655 // CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 3656 // CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 3657 // CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] 3658 // CHECK6-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 3659 // CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 3660 // CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 3661 // CHECK6-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 3662 // CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 3663 // CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] 3664 // CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] 3665 // CHECK6: cond.true: 3666 // CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 3667 // CHECK6-NEXT: br label [[COND_END:%.*]] 3668 // CHECK6: cond.false: 3669 // CHECK6-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 3670 // CHECK6-NEXT: br label [[COND_END]] 3671 // CHECK6: cond.end: 3672 // CHECK6-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] 3673 // CHECK6-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 3674 // CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) 3675 // CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] 3676 // CHECK6: .omp.reduction.done: 3677 // CHECK6-NEXT: ret void 3678 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__10 3679 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { 3680 // CHECK6-NEXT: entry: 3681 // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 3682 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 3683 // CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 3684 // CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 3685 // CHECK6-NEXT: 
[[A1:%.*]] = alloca i32, align 4 3686 // CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2 3687 // CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 3688 // CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 3689 // CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 3690 // CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 3691 // CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 3692 // CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 3693 // CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 3694 // CHECK6-NEXT: store i32 0, i32* [[A1]], align 4 3695 // CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2 3696 // CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 3697 // CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 3698 // CHECK6-NEXT: store i32 [[OR]], i32* [[A1]], align 4 3699 // CHECK6-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 3700 // CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 3701 // CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] 3702 // CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] 3703 // CHECK6: cond.true: 3704 // CHECK6-NEXT: br label [[COND_END:%.*]] 3705 // CHECK6: cond.false: 3706 // CHECK6-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 3707 // CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 3708 // CHECK6-NEXT: br label [[COND_END]] 3709 // CHECK6: cond.end: 3710 // CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] 3711 // CHECK6-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 3712 // CHECK6-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 3713 // CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 3714 // CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 3715 // CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 3716 // CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* 3717 // CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 3718 // CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 3719 // CHECK6-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* 3720 // CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 3721 // CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 3722 // CHECK6-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) 3723 // CHECK6-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 3724 // CHECK6-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] 3725 // CHECK6: .omp.reduction.then: 3726 // CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 3727 // CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 3728 // CHECK6-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] 3729 // CHECK6-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 3730 // CHECK6-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 3731 // CHECK6-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 3732 // CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 3733 // CHECK6-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 3734 // CHECK6-NEXT: [[CMP8:%.*]] = icmp sgt i32 
[[CONV6]], [[CONV7]] 3735 // CHECK6-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] 3736 // CHECK6: cond.true9: 3737 // CHECK6-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 3738 // CHECK6-NEXT: br label [[COND_END11:%.*]] 3739 // CHECK6: cond.false10: 3740 // CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 3741 // CHECK6-NEXT: br label [[COND_END11]] 3742 // CHECK6: cond.end11: 3743 // CHECK6-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] 3744 // CHECK6-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 3745 // CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) 3746 // CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] 3747 // CHECK6: .omp.reduction.done: 3748 // CHECK6-NEXT: ret void 3749 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 3750 // CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { 3751 // CHECK6-NEXT: entry: 3752 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 3753 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 3754 // CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 3755 // CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 3756 // CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 3757 // CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 3758 // CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 3759 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 3760 // CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 3761 // CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 3762 // CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 3763 // CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 3764 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* 3765 // CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 3766 // CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 3767 // CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 3768 // CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 3769 // CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 3770 // CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 3771 // CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* 3772 // CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 3773 // CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* 3774 // CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 3775 // CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 3776 // CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 3777 // CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) 3778 // CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 3779 // CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 3780 // CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 3781 // CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* 3782 // CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 3783 // CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 3784 
// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 3785 // CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 3786 // CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* 3787 // CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 3788 // CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* 3789 // CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 3790 // CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 3791 // CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 3792 // CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 3793 // CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) 3794 // CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 3795 // CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 3796 // CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 3797 // CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 3798 // CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* 3799 // CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 3800 // CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 3801 // CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 3802 // CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] 3803 // CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] 3804 // CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 3805 // CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 3806 // CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 3807 // CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] 3808 // CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 3809 // CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] 3810 // CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] 3811 // CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] 3812 // CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] 3813 // CHECK6: then: 3814 // CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* 3815 // CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* 3816 // CHECK6-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] 3817 // CHECK6-NEXT: br label [[IFCONT:%.*]] 3818 // CHECK6: else: 3819 // CHECK6-NEXT: br label [[IFCONT]] 3820 // CHECK6: ifcont: 3821 // CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 3822 // CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] 3823 // CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] 3824 // CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] 3825 // CHECK6: then6: 3826 // CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 3827 // CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 3828 // CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 3829 // CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 3830 // CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* 3831 // CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* 3832 // CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 3833 // CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 3834 // CHECK6-NEXT: 
[[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 3835 // CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 3836 // CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 3837 // CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 3838 // CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* 3839 // CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* 3840 // CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 3841 // CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 3842 // CHECK6-NEXT: br label [[IFCONT8:%.*]] 3843 // CHECK6: else7: 3844 // CHECK6-NEXT: br label [[IFCONT8]] 3845 // CHECK6: ifcont8: 3846 // CHECK6-NEXT: ret void 3847 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 3848 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 3849 // CHECK6-NEXT: entry: 3850 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 3851 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 3852 // CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) 3853 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 3854 // CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 3855 // CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 3856 // CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 3857 // CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 3858 // CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 3859 // CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 3860 // CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 3861 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* 3862 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) 3863 // CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 3864 // CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] 3865 // CHECK6: then: 3866 // CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 3867 // CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 3868 // CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* 3869 // CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] 3870 // CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 3871 // CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 3872 // CHECK6-NEXT: br label [[IFCONT:%.*]] 3873 // CHECK6: else: 3874 // CHECK6-NEXT: br label [[IFCONT]] 3875 // CHECK6: ifcont: 3876 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) 3877 // CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 3878 // CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] 3879 // CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] 3880 // CHECK6: then4: 3881 // CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] 3882 // CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 3883 // CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 
3884 // CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* 3885 // CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 3886 // CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 3887 // CHECK6-NEXT: br label [[IFCONT6:%.*]] 3888 // CHECK6: else5: 3889 // CHECK6-NEXT: br label [[IFCONT6]] 3890 // CHECK6: ifcont6: 3891 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) 3892 // CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 3893 // CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] 3894 // CHECK6: then8: 3895 // CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 3896 // CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 3897 // CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* 3898 // CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] 3899 // CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* 3900 // CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 3901 // CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 3902 // CHECK6-NEXT: br label [[IFCONT10:%.*]] 3903 // CHECK6: else9: 3904 // CHECK6-NEXT: br label [[IFCONT10]] 3905 // CHECK6: ifcont10: 3906 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) 3907 // CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 3908 // CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] 3909 // CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] 3910 // CHECK6: then12: 3911 // CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] 3912 // CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* 3913 // CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 3914 // CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 3915 // CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* 3916 // CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 3917 // CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 3918 // CHECK6-NEXT: br label [[IFCONT14:%.*]] 3919 // CHECK6: else13: 3920 // CHECK6-NEXT: br label [[IFCONT14]] 3921 // CHECK6: ifcont14: 3922 // CHECK6-NEXT: ret void 3923 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 3924 // CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { 3925 // CHECK6-NEXT: entry: 3926 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 3927 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 3928 // CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 3929 // CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 3930 // CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 3931 // CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 3932 // CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 3933 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 3934 // CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 3935 // CHECK6-NEXT: store i16 [[TMP2]], 
i16* [[DOTADDR2]], align 2 3936 // CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 3937 // CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 3938 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* 3939 // CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 3940 // CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 3941 // CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 3942 // CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 3943 // CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 3944 // CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 3945 // CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* 3946 // CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 3947 // CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* 3948 // CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 3949 // CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 3950 // CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 3951 // CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) 3952 // CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 3953 // CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 3954 // CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 3955 // CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* 3956 // CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 3957 // CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 3958 // CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 3959 // CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 3960 // CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* 3961 // CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 3962 // CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* 3963 // CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 3964 // CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 3965 // CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() 3966 // CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 3967 // CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) 3968 // CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 3969 // CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 3970 // CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 3971 // CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 3972 // CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* 3973 // CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 3974 // CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 3975 // CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 3976 // CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] 3977 // CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] 3978 // CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 3979 // CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 3980 // CHECK6-NEXT: 
[[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 3981 // CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] 3982 // CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 3983 // CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] 3984 // CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] 3985 // CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] 3986 // CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] 3987 // CHECK6: then: 3988 // CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* 3989 // CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* 3990 // CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] 3991 // CHECK6-NEXT: br label [[IFCONT:%.*]] 3992 // CHECK6: else: 3993 // CHECK6-NEXT: br label [[IFCONT]] 3994 // CHECK6: ifcont: 3995 // CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 3996 // CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] 3997 // CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] 3998 // CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] 3999 // CHECK6: then6: 4000 // CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 4001 // CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 4002 // CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 4003 // CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 4004 // CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* 4005 // CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* 4006 // CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 4007 // CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 4008 // CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 4009 // CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 4010 // CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 4011 // CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 4012 // CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* 4013 // CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* 4014 // CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 4015 // CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 4016 // CHECK6-NEXT: br label [[IFCONT8:%.*]] 4017 // CHECK6: else7: 4018 // CHECK6-NEXT: br label [[IFCONT8]] 4019 // CHECK6: ifcont8: 4020 // CHECK6-NEXT: ret void 4021 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 4022 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { 4023 // CHECK6-NEXT: entry: 4024 // CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 4025 // CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 4026 // CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) 4027 // CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 4028 // CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 4029 // CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 4030 // CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 4031 // CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 4032 // CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 4033 // CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 
[[NVPTX_TID3]], 5 4034 // CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 4035 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* 4036 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) 4037 // CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 4038 // CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] 4039 // CHECK6: then: 4040 // CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 4041 // CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 4042 // CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* 4043 // CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] 4044 // CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 4045 // CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 4046 // CHECK6-NEXT: br label [[IFCONT:%.*]] 4047 // CHECK6: else: 4048 // CHECK6-NEXT: br label [[IFCONT]] 4049 // CHECK6: ifcont: 4050 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) 4051 // CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 4052 // CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] 4053 // CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] 4054 // CHECK6: then4: 4055 // CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] 4056 // CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 4057 // CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 4058 // CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* 4059 // CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 4060 // CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 4061 // CHECK6-NEXT: br label [[IFCONT6:%.*]] 4062 // CHECK6: else5: 4063 // CHECK6-NEXT: br label [[IFCONT6]] 4064 // CHECK6: ifcont6: 4065 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) 4066 // CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 4067 // CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] 4068 // CHECK6: then8: 4069 // CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 4070 // CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 4071 // CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* 4072 // CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] 4073 // CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* 4074 // CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 4075 // CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 4076 // CHECK6-NEXT: br label [[IFCONT10:%.*]] 4077 // CHECK6: else9: 4078 // CHECK6-NEXT: br label [[IFCONT10]] 4079 // CHECK6: ifcont10: 4080 // CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) 4081 // CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 4082 // CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] 4083 // 
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK6-NEXT: entry:
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
// CHECK6-NEXT: ret void
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK6-NEXT: entry:
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
// CHECK6-NEXT: ret void
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK6-NEXT: entry:
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
// CHECK6-NEXT: ret void
// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
// CHECK6-NEXT: entry:
// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
// CHECK6-NEXT: ret void
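// CHECK1 assertions (nvptx64 device compile) for the first target region,
// "#pragma omp teams reduction(+: e)": the kernel privatizes e via
// __kmpc_alloc_shared, and the outlined function finishes with
// __kmpc_nvptx_teams_reduce_nowait_v2 plus the six generated helpers.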
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20
// CHECK1-SAME: (i64 noundef [[E:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store i64 [[E]], i64* [[E_ADDR]], align 8
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[E_ADDR]] to double*
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[CONV]], align 8
// CHECK1-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8)
// CHECK1-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
// CHECK1-NEXT: store double [[TMP1]], double* [[E_ON_STACK]], align 8
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E_ON_STACK]]) #[[ATTR3:[0-9]+]]
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[E1]], i64 8)
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca double*, align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store double* [[E]], double** [[E_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8
// CHECK1-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8)
// CHECK1-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
// CHECK1-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8
// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
// CHECK1-NEXT: store double [[ADD]], double* [[E_ON_STACK]], align 8
// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8*
// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 1024, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
// CHECK1-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
// CHECK1-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
// CHECK1-NEXT: [[TMP10:%.*]] = load double, double* [[TMP0]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8
// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]]
// CHECK1-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK1: .omp.reduction.done:
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[E1]], i64 8)
// CHECK1-NEXT: ret void
//
//
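// The shuffle-and-reduce helper below pulls the remote lane's double across
// the warp with __kmpc_shuffle_int64, then reduces it into the local element
// when the i16 algorithm-version argument selects this lane (0: always,
// 1: lane id below the shuffle offset, 2: even lane ids while the offset is
// positive); under version 1 the remaining lanes instead copy the remote
// element over their own.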
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
// CHECK1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
// CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK1-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
// CHECK1-NEXT: [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
// CHECK1-NEXT: store i64 [[TMP20]], i64* [[TMP16]], align 8
// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
// CHECK1-NEXT: [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP11]], align 8
// CHECK1-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK1-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK1-NEXT: [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK1-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
// CHECK1-NEXT: [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK1-NEXT: [[TMP29:%.*]] = and i16 [[TMP6]], 1
// CHECK1-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
// CHECK1-NEXT: [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
// CHECK1-NEXT: [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK1-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
// CHECK1-NEXT: [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
// CHECK1-NEXT: [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
// CHECK1-NEXT: br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
// CHECK1-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
// CHECK1-NEXT: [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR3]]
// CHECK1-NEXT: br label [[IFCONT:%.*]]
// CHECK1: else:
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK1-NEXT: [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK1-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
// CHECK1-NEXT: br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
// CHECK1: then4:
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK1-NEXT: [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 8
// CHECK1-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
// CHECK1-NEXT: [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
// CHECK1-NEXT: [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
// CHECK1-NEXT: store double [[TMP47]], double* [[TMP46]], align 8
// CHECK1-NEXT: br label [[IFCONT6:%.*]]
// CHECK1: else5:
// CHECK1-NEXT: br label [[IFCONT6]]
// CHECK1: ifcont6:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
// CHECK1-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
// CHECK1-NEXT: br label [[PRECOND:%.*]]
// CHECK1: precond:
// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
// CHECK1-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2
// CHECK1-NEXT: br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
// CHECK1: body:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0
// CHECK1-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 8
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
// CHECK1-NEXT: store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
// CHECK1-NEXT: br label [[IFCONT:%.*]]
// CHECK1: else:
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK1: then2:
// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
// CHECK1-NEXT: [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4
// CHECK1-NEXT: store i32 [[TMP22]], i32* [[TMP21]], align 4
// CHECK1-NEXT: br label [[IFCONT4:%.*]]
// CHECK1: else3:
// CHECK1-NEXT: br label [[IFCONT4]]
// CHECK1: ifcont4:
// CHECK1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK1-NEXT: store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4
// CHECK1-NEXT: br label [[PRECOND]]
// CHECK1: exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
// CHECK1-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
// CHECK1-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
// CHECK1-NEXT: ret void
//
//
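// CHECK1 assertions for the second target region,
// "#pragma omp teams reduction(^: c) reduction(*: d)": the same six helper
// shapes are regenerated (suffixes 3 through 8) for a two-element
// {i8, float} reduce list.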
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
// CHECK1-SAME: (i64 noundef [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store i64 [[C]], i64* [[C_ADDR]], align 8
// CHECK1-NEXT: store i64 [[D]], i64* [[D_ADDR]], align 8
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[C_ADDR]] to i8*
// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[D_ADDR]] to float*
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1
// CHECK1-NEXT: [[C2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 1)
// CHECK1-NEXT: store i8 [[TMP1]], i8* [[C2]], align 1
// CHECK1-NEXT: [[TMP2:%.*]] = load float, float* [[CONV1]], align 4
// CHECK1-NEXT: [[D3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float*
// CHECK1-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: store i32 [[TMP3]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR3]]
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[D3]], i64 4)
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[C2]], i64 1)
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i8* noundef nonnull align 1 dereferenceable(1) [[C:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 8
// CHECK1-NEXT: store float* [[D]], float** [[D_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8
// CHECK1-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 1)
// CHECK1-NEXT: [[D2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
// CHECK1-NEXT: store i8 0, i8* [[C1]], align 1
// CHECK1-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4
// CHECK1-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
// CHECK1-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32
// CHECK1-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
// CHECK1-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
// CHECK1-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1
// CHECK1-NEXT: [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4
// CHECK1-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
// CHECK1-NEXT: store float [[MUL]], float* [[D_ON_STACK]], align 4
// CHECK1-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: store i8* [[C1]], i8** [[TMP6]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8*
// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8
// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
// CHECK1-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1
// CHECK1-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
// CHECK1-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1
// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP13]] to i32
// CHECK1-NEXT: [[TMP14:%.*]] = load i8, i8* [[C1]], align 1
// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP14]] to i32
// CHECK1-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
// CHECK1-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
// CHECK1-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
// CHECK1-NEXT: [[TMP15:%.*]] = load float, float* [[TMP1]], align 4
// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4
// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]]
// CHECK1-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK1: .omp.reduction.done:
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[D2]], i64 4)
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[C1]], i64 1)
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1
// CHECK1-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
// CHECK1-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK1-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16
// CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP16]])
// CHECK1-NEXT: [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
// CHECK1-NEXT: store i8 [[TMP18]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1
// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
// CHECK1-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8
// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to float*
// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[TMP24]], i64 1
// CHECK1-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
// CHECK1-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP24]] to i32*
// CHECK1-NEXT: [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
// CHECK1-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
// CHECK1-NEXT: store i32 [[TMP32]], i32* [[TMP28]], align 4
// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1
// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i64 1
// CHECK1-NEXT: [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
// CHECK1-NEXT: store i8* [[TMP35]], i8** [[TMP23]], align 8
// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK1-NEXT: [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
// CHECK1-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK1-NEXT: [[TMP41:%.*]] = and i16 [[TMP6]], 1
// CHECK1-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP41]], 0
// CHECK1-NEXT: [[TMP43:%.*]] = and i1 [[TMP40]], [[TMP42]]
// CHECK1-NEXT: [[TMP44:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK1-NEXT: [[TMP45:%.*]] = and i1 [[TMP43]], [[TMP44]]
// CHECK1-NEXT: [[TMP46:%.*]] = or i1 [[TMP36]], [[TMP39]]
// CHECK1-NEXT: [[TMP47:%.*]] = or i1 [[TMP46]], [[TMP45]]
// CHECK1-NEXT: br i1 [[TMP47]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
// CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
// CHECK1-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP48]], i8* [[TMP49]]) #[[ATTR3]]
// CHECK1-NEXT: br label [[IFCONT:%.*]]
// CHECK1: else:
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
// CHECK1-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK1-NEXT: [[TMP51:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK1-NEXT: [[TMP52:%.*]] = and i1 [[TMP50]], [[TMP51]]
// CHECK1-NEXT: br i1 [[TMP52]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
// CHECK1: then5:
// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8
// CHECK1-NEXT: [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK1-NEXT: [[TMP56:%.*]] = load i8*, i8** [[TMP55]], align 8
// CHECK1-NEXT: [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
// CHECK1-NEXT: store i8 [[TMP57]], i8* [[TMP56]], align 1
// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8
// CHECK1-NEXT: [[TMP60:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
// CHECK1-NEXT: [[TMP61:%.*]] = load i8*, i8** [[TMP60]], align 8
// CHECK1-NEXT: [[TMP62:%.*]] = bitcast i8* [[TMP59]] to float*
// CHECK1-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP61]] to float*
// CHECK1-NEXT: [[TMP64:%.*]] = load float, float* [[TMP62]], align 4
// CHECK1-NEXT: store float [[TMP64]], float* [[TMP63]], align 4
// CHECK1-NEXT: br label [[IFCONT7:%.*]]
// CHECK1: else6:
// CHECK1-NEXT: br label [[IFCONT7]]
// CHECK1: ifcont7:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(3)* [[TMP10]] to i8 addrspace(3)*
// CHECK1-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP9]], align 1
// CHECK1-NEXT: store volatile i8 [[TMP12]], i8 addrspace(3)* [[TMP11]], align 1
// CHECK1-NEXT: br label [[IFCONT:%.*]]
// CHECK1: else:
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK1: then2:
// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(3)* [[TMP14]] to i8 addrspace(3)*
// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP18:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP15]], align 1
// CHECK1-NEXT: store i8 [[TMP18]], i8* [[TMP17]], align 1
// CHECK1-NEXT: br label [[IFCONT4:%.*]]
// CHECK1: else3:
// CHECK1-NEXT: br label [[IFCONT4]]
// CHECK1: ifcont4:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK1-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK1-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
// CHECK1: then6:
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
// CHECK1-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 8
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i32*
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
// CHECK1-NEXT: store volatile i32 [[TMP23]], i32 addrspace(3)* [[TMP22]], align 4
// CHECK1-NEXT: br label [[IFCONT8:%.*]]
// CHECK1: else7:
// CHECK1-NEXT: br label [[IFCONT8]]
// CHECK1: ifcont8:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP24]]
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
// CHECK1: then10:
// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
// CHECK1-NEXT: [[TMP27:%.*]] = load i8*, i8** [[TMP26]], align 8
// CHECK1-NEXT: [[TMP28:%.*]] = bitcast i8* [[TMP27]] to i32*
// CHECK1-NEXT: [[TMP29:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP25]], align 4
// CHECK1-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 4
// CHECK1-NEXT: br label [[IFCONT12:%.*]]
// CHECK1: else11:
// CHECK1-NEXT: br label [[IFCONT12]]
// CHECK1: ifcont12:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
// CHECK1-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
// CHECK1-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
// CHECK1-NEXT: ret void
//
//
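// CHECK1 assertions for the third target region,
// "#pragma omp teams reduction(|: a) reduction(max: b)" with a nested
// "#pragma omp parallel": __kmpc_target_init is called with mode 2 (SPMD),
// and the parallel body is launched through __kmpc_parallel_51 before the
// teams reduction runs.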
align 128 4858 // CHECK1-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 4859 // CHECK1-NEXT: ret void 4860 // 4861 // 4862 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 4863 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] { 4864 // CHECK1-NEXT: entry: 4865 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 4866 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 4867 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 4868 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 4869 // CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 4870 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 4871 // CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 4872 // CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 4873 // CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* 4874 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 4875 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 4876 // CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 4877 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] 4878 // CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 4879 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 4880 // CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1 4881 // CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] 4882 // CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* 4883 // CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8 4884 // CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 4885 // CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 4886 // CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] 4887 // CHECK1-NEXT: ret void 4888 // 4889 // 4890 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33 4891 // CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { 4892 // CHECK1-NEXT: entry: 4893 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 4894 // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 4895 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 4896 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 4897 // CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 4898 // CHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 4899 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* 4900 // CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16* 4901 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) 4902 // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 4903 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] 4904 // CHECK1: user_code.entry: 4905 // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) 4906 // CHECK1-NEXT: store i32 0, i32* 
[[DOTZERO_ADDR]], align 4 4907 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 4908 // CHECK1-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR3]] 4909 // CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) 4910 // CHECK1-NEXT: ret void 4911 // CHECK1: worker.exit: 4912 // CHECK1-NEXT: ret void 4913 // 4914 // 4915 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__9 4916 // CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] { 4917 // CHECK1-NEXT: entry: 4918 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 4919 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 4920 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 4921 // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8 4922 // CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4 4923 // CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2 4924 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 4925 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 4926 // CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 4927 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 4928 // CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 4929 // CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8 4930 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 4931 // CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8 4932 // CHECK1-NEXT: store i32 0, i32* [[A1]], align 4 4933 // CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2 4934 // CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 4935 // CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* 4936 // CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 4937 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 4938 // CHECK1-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* 4939 // CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 4940 // CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 4941 // CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 4942 // CHECK1-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 4943 // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i64 2) 4944 // CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 4945 // CHECK1-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* 4946 // CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 4947 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 4948 // CHECK1-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* 4949 // CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 4950 // CHECK1-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* 4951 // CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 4952 // 
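// NOTE: Editorial sketch, not a generated assertion. The func7/func8 pair
// checked above is the global-to-list direction of the teams-reduction
// scratchpad protocol. Roughly, in C++ (GlobalizedLocals and reduction_func2
// stand in for the compiler-internal names; the layout is the one visible in
// the IR above):
//
//   struct GlobalizedLocals { char c[1024]; float d[1024]; }; // one slot per team
//   // copy_func7: pull a team's slot out of the global buffer into the
//   // private reduce list.
//   void global_to_list_copy(GlobalizedLocals *buf, int idx, void **red_list) {
//     *static_cast<char *>(red_list[0])  = buf->c[idx];
//     *static_cast<float *>(red_list[1]) = buf->d[idx];
//   }
//   // reduce_func8: fold a team's slot into the incoming list via the
//   // compiler-emitted combiner "_omp$reduction$reduction_func2".
//   void global_to_list_reduce(GlobalizedLocals *buf, int idx, void **list) {
//     void *slot[2] = { &buf->c[idx], &buf->d[idx] };
//     reduction_func2(list, slot); // list = list (op) slot
//   }
//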
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
// CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16*
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true)
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK1-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR3]]
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__9
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8
// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4
// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2
// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i64 2)
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
// CHECK1-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
// CHECK1-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
// CHECK1-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
// CHECK1-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
// CHECK1-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
// CHECK1-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
// CHECK1-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK1: .omp.reduction.done:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__10
// CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8
// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4
// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2
// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
// CHECK1-NEXT: store i32 [[OR]], i32* [[A1]], align 4
// CHECK1-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
// CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
// CHECK1-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
// CHECK1-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
// CHECK1-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
// CHECK1-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
// CHECK1-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
// CHECK1-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK1-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
// CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
// CHECK1-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
// CHECK1: cond.true9:
// CHECK1-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK1-NEXT: br label [[COND_END11:%.*]]
// CHECK1: cond.false10:
// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
// CHECK1-NEXT: br label [[COND_END11]]
// CHECK1: cond.end11:
// CHECK1-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
// CHECK1-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK1: .omp.reduction.done:
// CHECK1-NEXT: ret void
//
//
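// NOTE: Editorial sketch, not a generated assertion. The shuffle_and_reduce
// helpers checked below implement the usual warp tree reduction for
// __kmpc_nvptx_parallel_reduce_nowait_v2; the icmp/and/or chain on the three
// i16 arguments computes the "should this lane fold in the remote value"
// predicate. Restated in C++ (the function name is ours; the logic is read
// directly off the IR):
//
//   // algo 0: every lane reduces; algo 1: halving tree, lanes below the
//   // offset reduce; algo 2: even lanes reduce a partially active warp.
//   bool shouldReduce(short laneId, short offset, short algo) {
//     return algo == 0 ||
//            (algo == 1 && laneId < offset) ||
//            (algo == 2 && (laneId & 1) == 0 && offset > 0);
//   }
//   // For algo 1, lanes with laneId >= offset instead copy the shuffled
//   // remote element back (the icmp uge / and pair after "ifcont:").
//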
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK1-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
// CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
// CHECK1-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP11]], align 8
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
// CHECK1-NEXT: [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 8
// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
// CHECK1-NEXT: [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
// CHECK1-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
// CHECK1-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
// CHECK1-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
// CHECK1-NEXT: store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
// CHECK1-NEXT: [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
// CHECK1-NEXT: store i8* [[TMP36]], i8** [[TMP24]], align 8
// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK1-NEXT: [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK1-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK1-NEXT: [[TMP42:%.*]] = and i16 [[TMP6]], 1
// CHECK1-NEXT: [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
// CHECK1-NEXT: [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK1-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
// CHECK1-NEXT: [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
// CHECK1-NEXT: [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
// CHECK1-NEXT: br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
// CHECK1-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
// CHECK1-NEXT: [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
// CHECK1-NEXT: br label [[IFCONT:%.*]]
// CHECK1: else:
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
// CHECK1-NEXT: [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK1-NEXT: [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK1-NEXT: [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
// CHECK1-NEXT: br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
// CHECK1: then5:
// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK1-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8
// CHECK1-NEXT: [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
// CHECK1-NEXT: [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
// CHECK1-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
// CHECK1-NEXT: store i32 [[TMP60]], i32* [[TMP59]], align 4
// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
// CHECK1-NEXT: [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
// CHECK1-NEXT: [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 8
// CHECK1-NEXT: [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
// CHECK1-NEXT: [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
// CHECK1-NEXT: [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
// CHECK1-NEXT: store i16 [[TMP67]], i16* [[TMP66]], align 2
// CHECK1-NEXT: br label [[IFCONT7:%.*]]
// CHECK1: else6:
// CHECK1-NEXT: br label [[IFCONT7]]
// CHECK1: ifcont7:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
// CHECK1-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
// CHECK1-NEXT: br label [[IFCONT:%.*]]
// CHECK1: else:
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK1: then2:
// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
// CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8
// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
// CHECK1-NEXT: [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
// CHECK1-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 4
// CHECK1-NEXT: br label [[IFCONT4:%.*]]
// CHECK1: else3:
// CHECK1-NEXT: br label [[IFCONT4]]
// CHECK1: ifcont4:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK1-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK1-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
// CHECK1: then6:
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
// CHECK1-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 8
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
// CHECK1-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
// CHECK1-NEXT: store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
// CHECK1-NEXT: br label [[IFCONT8:%.*]]
// CHECK1: else7:
// CHECK1-NEXT: br label [[IFCONT8]]
// CHECK1: ifcont8:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
// CHECK1: then10:
// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
// CHECK1-NEXT: [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 8
// CHECK1-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
// CHECK1-NEXT: [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
// CHECK1-NEXT: store i16 [[TMP31]], i16* [[TMP30]], align 2
// CHECK1-NEXT: br label [[IFCONT12:%.*]]
// CHECK1: else11:
// CHECK1-NEXT: br label [[IFCONT12]]
// CHECK1: ifcont12:
// CHECK1-NEXT: ret void
//
//
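// NOTE: Editorial sketch, not a generated assertion. inter_warp_copy_func13
// moves one 32-bit element at a time through the 32-slot shared staging
// array @__openmp_nvptx_data_transfer_temporary_storage. In CUDA-flavored
// C++, assuming a 32-lane warp (names are ours):
//
//   __shared__ volatile int stage[32];
//   __device__ void interWarpCopy(int *elem, int numWarps, int tid) {
//     int lane = tid & 31, warp = tid >> 5;
//     __syncthreads();                        // __kmpc_barrier in the IR
//     if (lane == 0) stage[warp] = *elem;     // each warp master publishes
//     __syncthreads();
//     if (tid < numWarps) *elem = stage[tid]; // the first warp gathers
//   }
//
//   // The i16 element reuses the same i32 slots, which is why the second
//   // round bitcasts the addrspace(3) pointer to i16 addrspace(3)*.
//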
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK1-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
// CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
// CHECK1-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP11]], align 8
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
// CHECK1-NEXT: [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 8
// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
// CHECK1-NEXT: [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
// CHECK1-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
// CHECK1-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
// CHECK1-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
// CHECK1-NEXT: store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
// CHECK1-NEXT: [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
// CHECK1-NEXT: store i8* [[TMP36]], i8** [[TMP24]], align 8
// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK1-NEXT: [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK1-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK1-NEXT: [[TMP42:%.*]] = and i16 [[TMP6]], 1
// CHECK1-NEXT: [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
// CHECK1-NEXT: [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK1-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
// CHECK1-NEXT: [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
// CHECK1-NEXT: [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
// CHECK1-NEXT: br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
// CHECK1-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
// CHECK1-NEXT: [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
// CHECK1-NEXT: br label [[IFCONT:%.*]]
// CHECK1: else:
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
// CHECK1-NEXT: [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK1-NEXT: [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK1-NEXT: [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
// CHECK1-NEXT: br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
// CHECK1: then5:
// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK1-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8
// CHECK1-NEXT: [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
// CHECK1-NEXT: [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
// CHECK1-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
// CHECK1-NEXT: store i32 [[TMP60]], i32* [[TMP59]], align 4
// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
// CHECK1-NEXT: [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
// CHECK1-NEXT: [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 8
// CHECK1-NEXT: [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
// CHECK1-NEXT: [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
// CHECK1-NEXT: [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
// CHECK1-NEXT: store i16 [[TMP67]], i16* [[TMP66]], align 2
// CHECK1-NEXT: br label [[IFCONT7:%.*]]
// CHECK1: else6:
// CHECK1-NEXT: br label [[IFCONT7]]
// CHECK1: ifcont7:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK1: then:
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
// CHECK1-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
// CHECK1-NEXT: br label [[IFCONT:%.*]]
// CHECK1: else:
// CHECK1-NEXT: br label [[IFCONT]]
// CHECK1: ifcont:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK1: then2:
// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
// CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8
// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
// CHECK1-NEXT: [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
// CHECK1-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 4
// CHECK1-NEXT: br label [[IFCONT4:%.*]]
// CHECK1: else3:
// CHECK1-NEXT: br label [[IFCONT4]]
// CHECK1: ifcont4:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK1-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK1-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
// CHECK1: then6:
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
// CHECK1-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 8
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
// CHECK1-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
// CHECK1-NEXT: store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
// CHECK1-NEXT: br label [[IFCONT8:%.*]]
// CHECK1: else7:
// CHECK1-NEXT: br label [[IFCONT8]]
// CHECK1: ifcont8:
// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
// CHECK1: then10:
// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
// CHECK1-NEXT: [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 8
// CHECK1-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
// CHECK1-NEXT: [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
// CHECK1-NEXT: store i16 [[TMP31]], i16* [[TMP30]], align 2
// CHECK1-NEXT: br label [[IFCONT12:%.*]]
// CHECK1: else11:
// CHECK1-NEXT: br label [[IFCONT12]]
// CHECK1: ifcont12:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
// CHECK1-NEXT: ret void
//
//
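// NOTE: Editorial sketch, not a generated assertion. Every list_to_global /
// global_to_list helper for the l33 region indexes the same scratchpad
// layout: parallel arrays with one element per team record and 128-byte
// alignment on the buffer rows (hence the "align 128" accesses). Roughly:
//
//   struct _globalized_locals_ty_1 { // illustrative C++ rendering
//     int   a[1024]; // one reduction slot for 'a' per team record
//     short b[1024]; // one reduction slot for 'b' per team record
//   };
//   // list_to_global_copy_func17: buf->a[idx] = *a_priv; buf->b[idx] = *b_priv;
//   // global_to_list_copy_func19: the mirror image, reading from buf.
//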
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
// CHECK1-NEXT: ret void
//
//
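// NOTE: Editorial sketch, not a generated assertion. The call in
// __omp_outlined__9 shows the shape of the v2 teams-reduction entry point:
// global buffer, record count (i32 1024 here; the CHECK3 RUN line raises it
// to 2048 via -fopenmp-cuda-teams-reduction-recs-num), private reduce list,
// and six callbacks. Hedged C++ rendering of the signature as it appears in
// the IR (the typedef names are ours):
//
//   using ShuffleReduceFn = void (*)(void *list, short lane, short off, short algo);
//   using InterWarpCopyFn = void (*)(void *list, int numWarps);
//   using ListGlobalFn    = void (*)(void *buf, int idx, void *list);
//   extern "C" int __kmpc_nvptx_teams_reduce_nowait_v2(
//       ident_t *loc, int gtid, void *buf, int num_recs, void *reduce_list,
//       ShuffleReduceFn shuffle, InterWarpCopyFn interWarp,
//       ListGlobalFn l2gCopy, ListGlobalFn l2gReduce,
//       ListGlobalFn g2lCopy, ListGlobalFn g2lReduce);
//   // A return value of 1 sends the caller down ".omp.reduction.then" to
//   // fold its private copy into the original variable.
//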
// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
// CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
// CHECK1-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20
// CHECK2-SAME: (double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
// CHECK2-NEXT: [[E1:%.*]] = alloca double, align 8
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2: user_code.entry:
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[TMP0]], align 8
// CHECK2-NEXT: store double [[TMP3]], double* [[E1]], align 8
// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E1]]) #[[ATTR3:[0-9]+]]
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK2-NEXT: ret void
// CHECK2: worker.exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
// CHECK2-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 8)
// CHECK2-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
// CHECK2-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8
// CHECK2-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8
// CHECK2-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
// CHECK2-NEXT: store double [[ADD]], double* [[E_ON_STACK]], align 8
// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8*
// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK2-NEXT: [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 1024, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
// CHECK2-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
// CHECK2-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2: .omp.reduction.then:
// CHECK2-NEXT: [[TMP10:%.*]] = load double, double* [[TMP0]], align 8
// CHECK2-NEXT: [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8
// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]]
// CHECK2-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK2: .omp.reduction.done:
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[E1]], i32 8)
// CHECK2-NEXT: ret void
//
//
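// NOTE: Editorial sketch, not a generated assertion. In the CHECK2
// __omp_outlined__ above, the reduction variable e is globalized: it lives
// in runtime-managed memory from __kmpc_alloc_shared instead of a private
// alloca, so the reduction callbacks can safely take its address. Roughly:
//
//   void outlined(double *e_out) {
//     double *e = reinterpret_cast<double *>(__kmpc_alloc_shared(8));
//     *e = 0.0;                    // reduction init for '+'
//     *e += 5.0;                   // the teams-region body
//     // ... __kmpc_nvptx_teams_reduce_nowait_v2(...) over &e ...
//     __kmpc_free_shared(e, 8);    // paired release before returning
//   }
//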
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
// CHECK2-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
// CHECK2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
// CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
// CHECK2-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK2-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
// CHECK2-NEXT: [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
// CHECK2-NEXT: store i64 [[TMP20]], i64* [[TMP16]], align 8
// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
// CHECK2-NEXT: [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP11]], align 4
// CHECK2-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK2-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK2-NEXT: [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK2-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
// CHECK2-NEXT: [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK2-NEXT: [[TMP29:%.*]] = and i16 [[TMP6]], 1
// CHECK2-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
// CHECK2-NEXT: [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
// CHECK2-NEXT: [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK2-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
// CHECK2-NEXT: [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
// CHECK2-NEXT: [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
// CHECK2-NEXT: br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK2: then:
// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
// CHECK2-NEXT: [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR3]]
// CHECK2-NEXT: br label [[IFCONT:%.*]]
// CHECK2: else:
// CHECK2-NEXT: br label [[IFCONT]]
// CHECK2: ifcont:
// CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK2-NEXT: [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK2-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
// CHECK2-NEXT: br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
// CHECK2: then4:
// CHECK2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 4
// CHECK2-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK2-NEXT: [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 4
// CHECK2-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
// CHECK2-NEXT: [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
// CHECK2-NEXT: [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
// CHECK2-NEXT: store double [[TMP47]], double* [[TMP46]], align 8
// CHECK2-NEXT: br label [[IFCONT6:%.*]]
// CHECK2: else5:
// CHECK2-NEXT: br label [[IFCONT6]]
// CHECK2: ifcont6:
// CHECK2-NEXT: ret void
//
//
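// NOTE: Editorial sketch, not a generated assertion. Because the element is
// a double, the CHECK2 shuffle helper above transports it as one 64-bit
// integer (bitcast double* to i64*, __kmpc_shuffle_int64, store back), in
// contrast to the CHECK1 helpers, which use __kmpc_shuffle_int32 and widen
// the i16 element with sext/trunc. Equivalent C++ (the function name is
// ours; needs <cstring> for std::memcpy):
//
//   double shuffleDouble(double v, short delta, short width) {
//     long long bits;
//     std::memcpy(&bits, &v, sizeof bits);              // bitcast to i64
//     bits = __kmpc_shuffle_int64(bits, delta, width);  // runtime shuffle
//     std::memcpy(&v, &bits, sizeof v);
//     return v;
//   }
//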
CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0 5698 // CHECK2-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 4 5699 // CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32* 5700 // CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]] 5701 // CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] 5702 // CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4 5703 // CHECK2-NEXT: store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4 5704 // CHECK2-NEXT: br label [[IFCONT:%.*]] 5705 // CHECK2: else: 5706 // CHECK2-NEXT: br label [[IFCONT]] 5707 // CHECK2: ifcont: 5708 // CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 5709 // CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4 5710 // CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]] 5711 // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] 5712 // CHECK2: then2: 5713 // CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] 5714 // CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0 5715 // CHECK2-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 4 5716 // CHECK2-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32* 5717 // CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]] 5718 // CHECK2-NEXT: [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4 5719 // CHECK2-NEXT: store i32 [[TMP22]], i32* [[TMP21]], align 4 5720 // CHECK2-NEXT: br label [[IFCONT4:%.*]] 5721 // CHECK2: else3: 5722 // CHECK2-NEXT: br label [[IFCONT4]] 5723 // CHECK2: ifcont4: 5724 // CHECK2-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1 5725 // CHECK2-NEXT: store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4 5726 // CHECK2-NEXT: br label [[PRECOND]] 5727 // CHECK2: exit: 5728 // CHECK2-NEXT: ret void 5729 // 5730 // 5731 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func 5732 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] { 5733 // CHECK2-NEXT: entry: 5734 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 5735 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 5736 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 5737 // CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 5738 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 5739 // CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 5740 // CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 5741 // CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* 5742 // CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 5743 // CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* 5744 // CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 5745 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 5746 // CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 5747 // CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* 5748 // CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 5749 
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
// CHECK2-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
// CHECK2-NEXT: ret void
//
//
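// global_to_list direction: the same pair of helpers, reading the partial 'e'
// back out of the global buffer into a local reduce list.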
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
// CHECK2-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
// CHECK2-NEXT: ret void
//
//
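// Kernel for the target region at line 26 (teams reduction(^: c) reduction(*: d)),
// generic mode: 'c' and 'd' are globalized with __kmpc_alloc_shared before the
// outlined teams body runs and freed after it returns.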
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
// CHECK2-SAME: (i32 noundef [[C:%.*]], i32 noundef [[D:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
// CHECK2-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2: user_code.entry:
// CHECK2-NEXT: [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1
// CHECK2-NEXT: [[C2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1)
// CHECK2-NEXT: store i8 [[TMP1]], i8* [[C2]], align 1
// CHECK2-NEXT: [[TMP2:%.*]] = load float, float* [[CONV1]], align 4
// CHECK2-NEXT: [[D3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float*
// CHECK2-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP3]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR3]]
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[D3]], i32 4)
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[C2]], i32 1)
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK2-NEXT: ret void
// CHECK2: worker.exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i8* noundef nonnull align 1 dereferenceable(1) [[C:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
// CHECK2-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
// CHECK2-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1)
// CHECK2-NEXT: [[D2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
// CHECK2-NEXT: store i8 0, i8* [[C1]], align 1
// CHECK2-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4
// CHECK2-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
// CHECK2-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32
// CHECK2-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
// CHECK2-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
// CHECK2-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1
// CHECK2-NEXT: [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4
// CHECK2-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
// CHECK2-NEXT: store float [[MUL]], float* [[D_ON_STACK]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: store i8* [[C1]], i8** [[TMP6]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8*
// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK2-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
// CHECK2-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1
// CHECK2-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2: .omp.reduction.then:
// CHECK2-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1
// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP13]] to i32
// CHECK2-NEXT: [[TMP14:%.*]] = load i8, i8* [[C1]], align 1
// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP14]] to i32
// CHECK2-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
// CHECK2-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
// CHECK2-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
// CHECK2-NEXT: [[TMP15:%.*]] = load float, float* [[TMP1]], align 4
// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4
// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]]
// CHECK2-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK2: .omp.reduction.done:
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[D2]], i32 4)
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[C1]], i32 1)
// CHECK2-NEXT: ret void
//
//
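// Shuffle-and-reduce for the (c, d) pair: each element crosses lanes through
// __kmpc_shuffle_int32 (the i8 'c' sign-extended, the float 'd' bitcast to i32);
// the conditions on the algorithm flag decide which lanes reduce and which only
// copy the remote element.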
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
// CHECK2-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
// CHECK2-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
// CHECK2-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK2-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16
// CHECK2-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP16]])
// CHECK2-NEXT: [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
// CHECK2-NEXT: store i8 [[TMP18]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
// CHECK2-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK2-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to float*
// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[TMP24]], i32 1
// CHECK2-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
// CHECK2-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP24]] to i32*
// CHECK2-NEXT: [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
// CHECK2-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
// CHECK2-NEXT: store i32 [[TMP32]], i32* [[TMP28]], align 4
// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
// CHECK2-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i32 1
// CHECK2-NEXT: [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
// CHECK2-NEXT: store i8* [[TMP35]], i8** [[TMP23]], align 4
// CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK2-NEXT: [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK2-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
// CHECK2-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK2-NEXT: [[TMP41:%.*]] = and i16 [[TMP6]], 1
// CHECK2-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP41]], 0
// CHECK2-NEXT: [[TMP43:%.*]] = and i1 [[TMP40]], [[TMP42]]
// CHECK2-NEXT: [[TMP44:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK2-NEXT: [[TMP45:%.*]] = and i1 [[TMP43]], [[TMP44]]
// CHECK2-NEXT: [[TMP46:%.*]] = or i1 [[TMP36]], [[TMP39]]
// CHECK2-NEXT: [[TMP47:%.*]] = or i1 [[TMP46]], [[TMP45]]
// CHECK2-NEXT: br i1 [[TMP47]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK2: then:
// CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
// CHECK2-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP48]], i8* [[TMP49]]) #[[ATTR3]]
// CHECK2-NEXT: br label [[IFCONT:%.*]]
// CHECK2: else:
// CHECK2-NEXT: br label [[IFCONT]]
// CHECK2: ifcont:
// CHECK2-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK2-NEXT: [[TMP51:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK2-NEXT: [[TMP52:%.*]] = and i1 [[TMP50]], [[TMP51]]
// CHECK2-NEXT: br i1 [[TMP52]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
// CHECK2: then5:
// CHECK2-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
// CHECK2-NEXT: [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK2-NEXT: [[TMP56:%.*]] = load i8*, i8** [[TMP55]], align 4
// CHECK2-NEXT: [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
// CHECK2-NEXT: store i8 [[TMP57]], i8* [[TMP56]], align 1
// CHECK2-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
// CHECK2-NEXT: [[TMP60:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK2-NEXT: [[TMP61:%.*]] = load i8*, i8** [[TMP60]], align 4
// CHECK2-NEXT: [[TMP62:%.*]] = bitcast i8* [[TMP59]] to float*
// CHECK2-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP61]] to float*
// CHECK2-NEXT: [[TMP64:%.*]] = load float, float* [[TMP62]], align 4
// CHECK2-NEXT: store float [[TMP64]], float* [[TMP63]], align 4
// CHECK2-NEXT: br label [[IFCONT7:%.*]]
// CHECK2: else6:
// CHECK2-NEXT: br label [[IFCONT7]]
// CHECK2: ifcont7:
// CHECK2-NEXT: ret void
//
//
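// Inter-warp copy for (c, d): the i8 element travels through the shared scratch
// array as a single byte, the float as one 32-bit word.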
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK2: then:
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(3)* [[TMP10]] to i8 addrspace(3)*
// CHECK2-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP9]], align 1
// CHECK2-NEXT: store volatile i8 [[TMP12]], i8 addrspace(3)* [[TMP11]], align 1
// CHECK2-NEXT: br label [[IFCONT:%.*]]
// CHECK2: else:
// CHECK2-NEXT: br label [[IFCONT]]
// CHECK2: ifcont:
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK2: then2:
// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(3)* [[TMP14]] to i8 addrspace(3)*
// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
// CHECK2-NEXT: [[TMP18:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP15]], align 1
// CHECK2-NEXT: store i8 [[TMP18]], i8* [[TMP17]], align 1
// CHECK2-NEXT: br label [[IFCONT4:%.*]]
// CHECK2: else3:
// CHECK2-NEXT: br label [[IFCONT4]]
// CHECK2: ifcont4:
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK2-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK2-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
// CHECK2: then6:
// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
// CHECK2-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
// CHECK2-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i32*
// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
// CHECK2-NEXT: store volatile i32 [[TMP23]], i32 addrspace(3)* [[TMP22]], align 4
// CHECK2-NEXT: br label [[IFCONT8:%.*]]
// CHECK2: else7:
// CHECK2-NEXT: br label [[IFCONT8]]
// CHECK2: ifcont8:
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP24]]
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
// CHECK2: then10:
// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
// CHECK2-NEXT: [[TMP27:%.*]] = load i8*, i8** [[TMP26]], align 4
// CHECK2-NEXT: [[TMP28:%.*]] = bitcast i8* [[TMP27]] to i32*
// CHECK2-NEXT: [[TMP29:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP25]], align 4
// CHECK2-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 4
// CHECK2-NEXT: br label [[IFCONT12:%.*]]
// CHECK2: else11:
// CHECK2-NEXT: br label [[IFCONT12]]
// CHECK2: ifcont12:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
// CHECK2-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
// CHECK2-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
// CHECK2-NEXT: ret void
//
//
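// Kernel for the target region at line 33 (teams reduction(|: a) reduction(max: b)),
// initialized in SPMD mode (mode argument 2) rather than generic mode.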
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
// CHECK2-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true)
// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2: user_code.entry:
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK2-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK2-NEXT: ret void
// CHECK2: worker.exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4
// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2
// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
// CHECK2-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK2-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
// CHECK2-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
// CHECK2-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2: .omp.reduction.then:
// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
// CHECK2-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
// CHECK2-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK2: cond.true:
// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK2-NEXT: br label [[COND_END:%.*]]
// CHECK2: cond.false:
// CHECK2-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
// CHECK2-NEXT: br label [[COND_END]]
// CHECK2: cond.end:
// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
// CHECK2-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK2: .omp.reduction.done:
// CHECK2-NEXT: ret void
//
//
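// Parallel-level outlined body for line 33: each thread performs a |= 1 and
// max(b, 99), then combines across the parallel region via
// __kmpc_nvptx_parallel_reduce_nowait_v2.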
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10
// CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4
// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2
// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
// CHECK2-NEXT: store i32 [[OR]], i32* [[A1]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK2: cond.true:
// CHECK2-NEXT: br label [[COND_END:%.*]]
// CHECK2: cond.false:
// CHECK2-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
// CHECK2-NEXT: br label [[COND_END]]
// CHECK2: cond.end:
// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
// CHECK2-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
// CHECK2-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK2-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
// CHECK2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
// CHECK2-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2: .omp.reduction.then:
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
// CHECK2-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
// CHECK2-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
// CHECK2-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK2-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
// CHECK2-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
// CHECK2: cond.true9:
// CHECK2-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK2-NEXT: br label [[COND_END11:%.*]]
// CHECK2: cond.false10:
// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
// CHECK2-NEXT: br label [[COND_END11]]
// CHECK2: cond.end11:
// CHECK2-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
// CHECK2-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK2: .omp.reduction.done:
// CHECK2-NEXT: ret void
//
//
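// Shuffle-and-reduce for (a, b) at the parallel level: the i32 shuffles directly;
// the i16 is sign-extended through a 32-bit shuffle and truncated back.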
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK2-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
// CHECK2-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
// CHECK2-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
// CHECK2-NEXT: [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP11]], align 4
// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK2-NEXT: [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
// CHECK2-NEXT: [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
// CHECK2-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
// CHECK2-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
// CHECK2-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
// CHECK2-NEXT: store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
// CHECK2-NEXT: [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
// CHECK2-NEXT: [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
// CHECK2-NEXT: [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
// CHECK2-NEXT: store i8* [[TMP36]], i8** [[TMP24]], align 4
// CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK2-NEXT: [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK2-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
// CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK2-NEXT: [[TMP42:%.*]] = and i16 [[TMP6]], 1
// CHECK2-NEXT: [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
// CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
// CHECK2-NEXT: [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK2-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
// CHECK2-NEXT: [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
// CHECK2-NEXT: [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
// CHECK2-NEXT: br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK2: then:
// CHECK2-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
// CHECK2-NEXT: [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
// CHECK2-NEXT: br label [[IFCONT:%.*]]
// CHECK2: else:
// CHECK2-NEXT: br label [[IFCONT]]
// CHECK2: ifcont:
// CHECK2-NEXT: [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK2-NEXT: [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK2-NEXT: [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
// CHECK2-NEXT: br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
// CHECK2: then5:
// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
// CHECK2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK2-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
// CHECK2-NEXT: [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
// CHECK2-NEXT: [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
// CHECK2-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
// CHECK2-NEXT: store i32 [[TMP60]], i32* [[TMP59]], align 4
// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
// CHECK2-NEXT: [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK2-NEXT: [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
// CHECK2-NEXT: [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
// CHECK2-NEXT: [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
// CHECK2-NEXT: [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
// CHECK2-NEXT: store i16 [[TMP67]], i16* [[TMP66]], align 2
// CHECK2-NEXT: br label [[IFCONT7:%.*]]
// CHECK2: else6:
// CHECK2-NEXT: br label [[IFCONT7]]
// CHECK2: ifcont7:
// CHECK2-NEXT: ret void
//
//
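// Inter-warp copy for (a, b) at the parallel level: the i32 goes through the
// shared scratch array as-is, the i16 through a bitcast half-word slot.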
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK2: then:
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
// CHECK2-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
// CHECK2-NEXT: br label [[IFCONT:%.*]]
// CHECK2: else:
// CHECK2-NEXT: br label [[IFCONT]]
// CHECK2: ifcont:
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK2: then2:
// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
// CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
// CHECK2-NEXT: [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
// CHECK2-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 4
// CHECK2-NEXT: br label [[IFCONT4:%.*]]
// CHECK2: else3:
// CHECK2-NEXT: br label [[IFCONT4]]
// CHECK2: ifcont4:
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK2-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK2-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
// CHECK2: then6:
// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
// CHECK2-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
// CHECK2-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK2-NEXT: [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
// CHECK2-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
// CHECK2-NEXT: store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
// CHECK2-NEXT: br label [[IFCONT8:%.*]]
// CHECK2: else7:
// CHECK2-NEXT: br label [[IFCONT8]]
// CHECK2: ifcont8:
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
// CHECK2: then10:
// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
// CHECK2-NEXT: [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
// CHECK2-NEXT: [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
// CHECK2-NEXT: store i16 [[TMP31]], i16* [[TMP30]], align 2
// CHECK2-NEXT: br label [[IFCONT12:%.*]]
// CHECK2: else11:
// CHECK2-NEXT: br label [[IFCONT12]]
// CHECK2: ifcont12:
// CHECK2-NEXT: ret void
//
//
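// Teams-level variant of the (a, b) shuffle-and-reduce: same shape, but wired
// to "_omp$reduction$reduction_func14".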
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK2-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
// CHECK2-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
// CHECK2-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
1 6582 // CHECK2-NEXT: [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* 6583 // CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP11]], align 4 6584 // CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 6585 // CHECK2-NEXT: [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4 6586 // CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 6587 // CHECK2-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16* 6588 // CHECK2-NEXT: [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1 6589 // CHECK2-NEXT: [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8* 6590 // CHECK2-NEXT: [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2 6591 // CHECK2-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32 6592 // CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size() 6593 // CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 6594 // CHECK2-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]]) 6595 // CHECK2-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16 6596 // CHECK2-NEXT: store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 6597 // CHECK2-NEXT: [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1 6598 // CHECK2-NEXT: [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 6599 // CHECK2-NEXT: [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* 6600 // CHECK2-NEXT: store i8* [[TMP36]], i8** [[TMP24]], align 4 6601 // CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0 6602 // CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1 6603 // CHECK2-NEXT: [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] 6604 // CHECK2-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]] 6605 // CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2 6606 // CHECK2-NEXT: [[TMP42:%.*]] = and i16 [[TMP6]], 1 6607 // CHECK2-NEXT: [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0 6608 // CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]] 6609 // CHECK2-NEXT: [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0 6610 // CHECK2-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]] 6611 // CHECK2-NEXT: [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]] 6612 // CHECK2-NEXT: [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]] 6613 // CHECK2-NEXT: br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]] 6614 // CHECK2: then: 6615 // CHECK2-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* 6616 // CHECK2-NEXT: [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* 6617 // CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]] 6618 // CHECK2-NEXT: br label [[IFCONT:%.*]] 6619 // CHECK2: else: 6620 // CHECK2-NEXT: br label [[IFCONT]] 6621 // CHECK2: ifcont: 6622 // CHECK2-NEXT: [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1 6623 // CHECK2-NEXT: [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] 6624 // CHECK2-NEXT: [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]] 6625 // CHECK2-NEXT: br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]] 6626 // CHECK2: then5: 6627 // CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 6628 // CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 6629 // CHECK2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 6630 // CHECK2-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 6631 // CHECK2-NEXT: [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32* 
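// The helper above follows the usual shuffle-and-reduce shape: each element
// of the reduce list is pulled from a remote lane with __kmpc_shuffle_int32
// (the i16 element is sign-extended first and truncated back after), and the
// or'd i1 chain decides, from the algorithm selector in [[TMP8]], whether
// this lane combines. A hedged sketch of that logic, with informal names
// that are not the runtime's API:
//
//   remote[i] = shuffle(local[i], offset);          // __kmpc_shuffle_int32/64
//   if (algo == 0                                   // full-warp reduce
//       || (algo == 1 && lane_id < offset)          // contiguous remainder
//       || (algo == 2 && (lane_id & 1) == 0 && offset > 0))  // dyadic partial
//     reduction_func(local, remote);                // user ops: | and max here
//   if (algo == 1 && lane_id >= offset)
//     local = remote;                               // copy step for upper lanes
//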
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK2: then:
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
// CHECK2-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
// CHECK2-NEXT: br label [[IFCONT:%.*]]
// CHECK2: else:
// CHECK2-NEXT: br label [[IFCONT]]
// CHECK2: ifcont:
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK2: then2:
// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
// CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
// CHECK2-NEXT: [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
// CHECK2-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 4
// CHECK2-NEXT: br label [[IFCONT4:%.*]]
// CHECK2: else3:
// CHECK2-NEXT: br label [[IFCONT4]]
// CHECK2: ifcont4:
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK2-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK2-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
// CHECK2: then6:
// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
// CHECK2-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
// CHECK2-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK2-NEXT: [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
// CHECK2-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
// CHECK2-NEXT: store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
// CHECK2-NEXT: br label [[IFCONT8:%.*]]
// CHECK2: else7:
// CHECK2-NEXT: br label [[IFCONT8]]
// CHECK2: ifcont8:
// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
// CHECK2: then10:
// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
// CHECK2-NEXT: [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
// CHECK2-NEXT: [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
// CHECK2-NEXT: store i16 [[TMP31]], i16* [[TMP30]], align 2
// CHECK2-NEXT: br label [[IFCONT12:%.*]]
// CHECK2: else11:
// CHECK2-NEXT: br label [[IFCONT12]]
// CHECK2: ifcont12:
// CHECK2-NEXT: ret void
//
//
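// Inter-warp copy for the (i32, i16) pair: after a barrier, lane 0 of every
// warp publishes its partial into one slot of the 32-entry addrspace(3)
// scratchpad, then the first <num-active-warps> threads read the slots back;
// the i16 element is staged through the same i32 slots via a bitcast, which
// is why those accesses are volatile. Roughly (a sketch, informal names):
//
//   __shared__ int32_t scratch[32];              // one slot per warp
//   barrier();
//   if (lane_id == 0) scratch[warp_id] = part;   // warp masters publish
//   barrier();
//   if (tid < num_warps) part = scratch[tid];    // leading threads gather
//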
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
// CHECK2-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
// CHECK2-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
// CHECK2-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
// CHECK2-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
// CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
// CHECK2-NEXT: ret void
//
//
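// The four list_to_global/global_to_list helpers above marshal team partials
// through the teams-reduction buffer, laid out as a struct of arrays with one
// 128-byte-aligned slot per team; the copy variants move the scalars, while
// the reduce variants only build an i8* list over slot [[TMP5]] and forward
// both lists to the shared "_omp$reduction$reduction_func14". The buffer
// shape, sketched as C (field names taken from the IR above):
//
//   struct _globalized_locals_ty_1 { int32_t a[1024]; int16_t b[1024]; };
//
// Note the argument order flips between the pairs: list_to_global_reduce
// folds the team's list into the buffer slot, while global_to_list_reduce
// folds the buffer slot into the list.
//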
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20
// CHECK3-SAME: (double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
// CHECK3-NEXT: [[E1:%.*]] = alloca double, align 8
// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK3: user_code.entry:
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: [[TMP3:%.*]] = load double, double* [[TMP0]], align 8
// CHECK3-NEXT: store double [[TMP3]], double* [[E1]], align 8
// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E1]]) #[[ATTR3:[0-9]+]]
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK3-NEXT: ret void
// CHECK3: worker.exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK3-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
// CHECK3-NEXT: [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 8)
// CHECK3-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
// CHECK3-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8
// CHECK3-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8
// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
// CHECK3-NEXT: store double [[ADD]], double* [[E_ON_STACK]], align 8
// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8*
// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 2048, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
// CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
// CHECK3-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3: .omp.reduction.then:
// CHECK3-NEXT: [[TMP10:%.*]] = load double, double* [[TMP0]], align 8
// CHECK3-NEXT: [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8
// CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]]
// CHECK3-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK3: .omp.reduction.done:
// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[E1]], i32 8)
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
// CHECK3-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
// CHECK3-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
// CHECK3-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
// CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
// CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK3-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
// CHECK3-NEXT: [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
// CHECK3-NEXT: store i64 [[TMP20]], i64* [[TMP16]], align 8
// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
// CHECK3-NEXT: [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP11]], align 4
// CHECK3-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK3-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK3-NEXT: [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK3-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
// CHECK3-NEXT: [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK3-NEXT: [[TMP29:%.*]] = and i16 [[TMP6]], 1
// CHECK3-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
// CHECK3-NEXT: [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
// CHECK3-NEXT: [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK3-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
// CHECK3-NEXT: [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
// CHECK3-NEXT: [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
// CHECK3-NEXT: br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK3: then:
// CHECK3-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
// CHECK3-NEXT: [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR3]]
// CHECK3-NEXT: br label [[IFCONT:%.*]]
// CHECK3: else:
// CHECK3-NEXT: br label [[IFCONT]]
// CHECK3: ifcont:
// CHECK3-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK3-NEXT: [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK3-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
// CHECK3-NEXT: br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
// CHECK3: then4:
// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 4
// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK3-NEXT: [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 4
// CHECK3-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
// CHECK3-NEXT: [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
// CHECK3-NEXT: [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
// CHECK3-NEXT: store double [[TMP47]], double* [[TMP46]], align 8
// CHECK3-NEXT: br label [[IFCONT6:%.*]]
// CHECK3: else5:
// CHECK3-NEXT: br label [[IFCONT6]]
// CHECK3: ifcont6:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
// CHECK3-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
// CHECK3-NEXT: br label [[PRECOND:%.*]]
// CHECK3: precond:
// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2
// CHECK3-NEXT: br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
// CHECK3: body:
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK3: then:
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
// CHECK3-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 4
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
// CHECK3-NEXT: store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
// CHECK3-NEXT: br label [[IFCONT:%.*]]
// CHECK3: else:
// CHECK3-NEXT: br label [[IFCONT]]
// CHECK3: ifcont:
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK3: then2:
// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
// CHECK3-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 4
// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
// CHECK3-NEXT: [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4
// CHECK3-NEXT: store i32 [[TMP22]], i32* [[TMP21]], align 4
// CHECK3-NEXT: br label [[IFCONT4:%.*]]
// CHECK3: else3:
// CHECK3-NEXT: br label [[IFCONT4]]
// CHECK3: ifcont4:
// CHECK3-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK3-NEXT: store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4
// CHECK3-NEXT: br label [[PRECOND]]
// CHECK3: exit:
// CHECK3-NEXT: ret void
//
//
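// This CHECK3 inter-warp copy handles the 8-byte double, so the publish/
// gather sequence runs inside the precond loop bounded by 2, staging one
// 32-bit word per iteration through the shared scratchpad. A sketch of the
// idea (informal, not the runtime's code):
//
//   for (int w = 0; w < 2; ++w) {                // 2 x i32 = one double
//     barrier();
//     if (lane_id == 0) scratch[warp_id] = ((int32_t *)&e)[w];
//     barrier();
//     if (tid < num_warps) ((int32_t *)&e)[w] = scratch[tid];
//   }
//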
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
// CHECK3-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
// CHECK3-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
// CHECK3-SAME: (i32 noundef [[C:%.*]], i32 noundef [[D:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK3-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
// CHECK3-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK3: user_code.entry:
// CHECK3-NEXT: [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1
// CHECK3-NEXT: [[C2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1)
// CHECK3-NEXT: store i8 [[TMP1]], i8* [[C2]], align 1
// CHECK3-NEXT: [[TMP2:%.*]] = load float, float* [[CONV1]], align 4
// CHECK3-NEXT: [[D3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float*
// CHECK3-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP3]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR3]]
// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[D3]], i32 4)
// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[C2]], i32 1)
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK3-NEXT: ret void
// CHECK3: worker.exit:
// CHECK3-NEXT: ret void
//
//
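// Kernel prologue for the l26 target region: c and d arrive by value in i32
// slots, are reinterpreted in place (the CONV/CONV1 bitcasts), and are then
// globalized so the teams reduction can see them. In source terms, roughly
// (a sketch; c_arg/d_arg are informal names for the by-value parameters):
//
//   char  *c2 = (char *)__kmpc_alloc_shared(1);    // team-visible copies
//   float *d3 = (float *)__kmpc_alloc_shared(4);
//   *c2 = *(char *)&c_arg; *d3 = *(float *)&d_arg;
//   /* ...outlined teams region... */
//   __kmpc_free_shared(d3, 4); __kmpc_free_shared(c2, 1);  // reverse order
//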
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK3-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i8* noundef nonnull align 1 dereferenceable(1) [[C:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK3-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
// CHECK3-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
// CHECK3-NEXT: [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1)
// CHECK3-NEXT: [[D2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
// CHECK3-NEXT: store i8 0, i8* [[C1]], align 1
// CHECK3-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4
// CHECK3-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
// CHECK3-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32
// CHECK3-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
// CHECK3-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
// CHECK3-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1
// CHECK3-NEXT: [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4
// CHECK3-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
// CHECK3-NEXT: store float [[MUL]], float* [[D_ON_STACK]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: store i8* [[C1]], i8** [[TMP6]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8*
// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 2048, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
// CHECK3-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1
// CHECK3-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3: .omp.reduction.then:
// CHECK3-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1
// CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP13]] to i32
// CHECK3-NEXT: [[TMP14:%.*]] = load i8, i8* [[C1]], align 1
// CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP14]] to i32
// CHECK3-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
// CHECK3-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
// CHECK3-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
// CHECK3-NEXT: [[TMP15:%.*]] = load float, float* [[TMP1]], align 4
// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4
// CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]]
// CHECK3-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK3: .omp.reduction.done:
// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[D2]], i32 4)
// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[C1]], i32 1)
// CHECK3-NEXT: ret void
//
//
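// Reduction epilogue pattern: __kmpc_nvptx_teams_reduce_nowait_v2 is expected
// to return 1 on the single thread that must fold the final partials into the
// original variables with the user's operators. Sketched (informal names):
//
//   if (__kmpc_nvptx_teams_reduce_nowait_v2(/* ... */) == 1) {
//     *c ^= c_partial;                  // the xor6/conv7 sequence above
//     *d *= d_partial;                  // the mul8 sequence above
//     __kmpc_nvptx_end_reduce_nowait(tid);
//   }
//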
[[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 7271 // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 7272 // CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 7273 // CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 7274 // CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to float* 7275 // CHECK3-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[TMP24]], i32 1 7276 // CHECK3-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8* 7277 // CHECK3-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP24]] to i32* 7278 // CHECK3-NEXT: [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* 7279 // CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4 7280 // CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size() 7281 // CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 7282 // CHECK3-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]]) 7283 // CHECK3-NEXT: store i32 [[TMP32]], i32* [[TMP28]], align 4 7284 // CHECK3-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 7285 // CHECK3-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i32 1 7286 // CHECK3-NEXT: [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* 7287 // CHECK3-NEXT: store i8* [[TMP35]], i8** [[TMP23]], align 4 7288 // CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0 7289 // CHECK3-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 7290 // CHECK3-NEXT: [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] 7291 // CHECK3-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] 7292 // CHECK3-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP8]], 2 7293 // CHECK3-NEXT: [[TMP41:%.*]] = and i16 [[TMP6]], 1 7294 // CHECK3-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP41]], 0 7295 // CHECK3-NEXT: [[TMP43:%.*]] = and i1 [[TMP40]], [[TMP42]] 7296 // CHECK3-NEXT: [[TMP44:%.*]] = icmp sgt i16 [[TMP7]], 0 7297 // CHECK3-NEXT: [[TMP45:%.*]] = and i1 [[TMP43]], [[TMP44]] 7298 // CHECK3-NEXT: [[TMP46:%.*]] = or i1 [[TMP36]], [[TMP39]] 7299 // CHECK3-NEXT: [[TMP47:%.*]] = or i1 [[TMP46]], [[TMP45]] 7300 // CHECK3-NEXT: br i1 [[TMP47]], label [[THEN:%.*]], label [[ELSE:%.*]] 7301 // CHECK3: then: 7302 // CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* 7303 // CHECK3-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* 7304 // CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP48]], i8* [[TMP49]]) #[[ATTR3]] 7305 // CHECK3-NEXT: br label [[IFCONT:%.*]] 7306 // CHECK3: else: 7307 // CHECK3-NEXT: br label [[IFCONT]] 7308 // CHECK3: ifcont: 7309 // CHECK3-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP8]], 1 7310 // CHECK3-NEXT: [[TMP51:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] 7311 // CHECK3-NEXT: [[TMP52:%.*]] = and i1 [[TMP50]], [[TMP51]] 7312 // CHECK3-NEXT: br i1 [[TMP52]], label [[THEN5:%.*]], label [[ELSE6:%.*]] 7313 // CHECK3: then5: 7314 // CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 7315 // CHECK3-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 7316 // CHECK3-NEXT: [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 7317 // CHECK3-NEXT: [[TMP56:%.*]] = load i8*, i8** [[TMP55]], align 4 7318 // CHECK3-NEXT: [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1 7319 // CHECK3-NEXT: store i8 [[TMP57]], i8* [[TMP56]], align 1 7320 // CHECK3-NEXT: 
[[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 7321 // CHECK3-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 7322 // CHECK3-NEXT: [[TMP60:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 7323 // CHECK3-NEXT: [[TMP61:%.*]] = load i8*, i8** [[TMP60]], align 4 7324 // CHECK3-NEXT: [[TMP62:%.*]] = bitcast i8* [[TMP59]] to float* 7325 // CHECK3-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP61]] to float* 7326 // CHECK3-NEXT: [[TMP64:%.*]] = load float, float* [[TMP62]], align 4 7327 // CHECK3-NEXT: store float [[TMP64]], float* [[TMP63]], align 4 7328 // CHECK3-NEXT: br label [[IFCONT7:%.*]] 7329 // CHECK3: else6: 7330 // CHECK3-NEXT: br label [[IFCONT7]] 7331 // CHECK3: ifcont7: 7332 // CHECK3-NEXT: ret void 7333 // 7334 // 7335 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 7336 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] { 7337 // CHECK3-NEXT: entry: 7338 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 7339 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 7340 // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 7341 // CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 7342 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 7343 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() 7344 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() 7345 // CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31 7346 // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() 7347 // CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5 7348 // CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4 7349 // CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]* 7350 // CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 7351 // CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 7352 // CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] 7353 // CHECK3: then: 7354 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0 7355 // CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 7356 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] 7357 // CHECK3-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(3)* [[TMP10]] to i8 addrspace(3)* 7358 // CHECK3-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP9]], align 1 7359 // CHECK3-NEXT: store volatile i8 [[TMP12]], i8 addrspace(3)* [[TMP11]], align 1 7360 // CHECK3-NEXT: br label [[IFCONT:%.*]] 7361 // CHECK3: else: 7362 // CHECK3-NEXT: br label [[IFCONT]] 7363 // CHECK3: ifcont: 7364 // CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 7365 // CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 7366 // CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]] 7367 // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] 7368 // CHECK3: then2: 7369 // CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] 7370 // CHECK3-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(3)* [[TMP14]] to i8 addrspace(3)* 7371 // CHECK3-NEXT: 
[[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0 7372 // CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 7373 // CHECK3-NEXT: [[TMP18:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP15]], align 1 7374 // CHECK3-NEXT: store i8 [[TMP18]], i8* [[TMP17]], align 1 7375 // CHECK3-NEXT: br label [[IFCONT4:%.*]] 7376 // CHECK3: else3: 7377 // CHECK3-NEXT: br label [[IFCONT4]] 7378 // CHECK3: ifcont4: 7379 // CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 7380 // CHECK3-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 7381 // CHECK3-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]] 7382 // CHECK3: then6: 7383 // CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1 7384 // CHECK3-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4 7385 // CHECK3-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i32* 7386 // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] 7387 // CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4 7388 // CHECK3-NEXT: store volatile i32 [[TMP23]], i32 addrspace(3)* [[TMP22]], align 4 7389 // CHECK3-NEXT: br label [[IFCONT8:%.*]] 7390 // CHECK3: else7: 7391 // CHECK3-NEXT: br label [[IFCONT8]] 7392 // CHECK3: ifcont8: 7393 // CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) 7394 // CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTADDR1]], align 4 7395 // CHECK3-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP24]] 7396 // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]] 7397 // CHECK3: then10: 7398 // CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] 7399 // CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1 7400 // CHECK3-NEXT: [[TMP27:%.*]] = load i8*, i8** [[TMP26]], align 4 7401 // CHECK3-NEXT: [[TMP28:%.*]] = bitcast i8* [[TMP27]] to i32* 7402 // CHECK3-NEXT: [[TMP29:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP25]], align 4 7403 // CHECK3-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 4 7404 // CHECK3-NEXT: br label [[IFCONT12:%.*]] 7405 // CHECK3: else11: 7406 // CHECK3-NEXT: br label [[IFCONT12]] 7407 // CHECK3: ifcont12: 7408 // CHECK3-NEXT: ret void 7409 // 7410 // 7411 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 7412 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] { 7413 // CHECK3-NEXT: entry: 7414 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 7415 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 7416 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 7417 // CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 7418 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 7419 // CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 7420 // CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 7421 // CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* 7422 // CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 7423 // CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* 7424 // CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 7425 // 
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
// CHECK3-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
// CHECK3-NEXT: ret void
//
//
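// NOTE (editorial sketch, not a FileCheck directive): this list_to_global pair
// services the teams reduction buffer. copy_func5 writes one team's partial
// {char c, float d} into its record of the globalized buffer, which has 2048
// records to match -fopenmp-cuda-teams-reduction-recs-num=2048 and is laid out
// as a struct of arrays, roughly:
//
//   struct _globalized_locals_ty { char c[2048]; float d[2048]; };
//
// reduce_func6 instead builds a reduce list whose entries point into that
// record and hands it to "_omp$reduction$reduction_func2", so the generic
// combiner (^ for c, * for d) folds an incoming partial into the buffer slot
// in place.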
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
// CHECK3-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
// CHECK3-NEXT: ret void
//
//
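// NOTE (editorial, not a FileCheck directive): global_to_list_copy_func7 and
// global_to_list_reduce_func8 run the opposite direction of the pair above:
// func7 copies a buffer record back into a thread-private reduce list, and
// func8 folds a buffer record into the local list by invoking the same
// combiner with the operand order swapped relative to func6. Together the four
// hooks let __kmpc_nvptx_teams_reduce_nowait_v2 tree-reduce partials across
// teams through the global buffer.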
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
// CHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK3-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true)
// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK3: user_code.entry:
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
// CHECK3-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK3-NEXT: ret void
// CHECK3: worker.exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__9
// CHECK3-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4
// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2
// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
// CHECK3-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
// CHECK3-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
// CHECK3-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3: .omp.reduction.then:
// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
// CHECK3-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
// CHECK3-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK3: cond.true:
// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK3-NEXT: br label [[COND_END:%.*]]
// CHECK3: cond.false:
// CHECK3-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
// CHECK3-NEXT: br label [[COND_END]]
// CHECK3: cond.end:
// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
// CHECK3-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK3: .omp.reduction.done:
// CHECK3-NEXT: ret void
//
//
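// NOTE (editorial sketch, not a FileCheck directive): __omp_outlined__9 is the
// teams region of '#pragma omp teams reduction(|: a) reduction(max: b)'. It
// launches the parallel body through __kmpc_parallel_51, packs the team-private
// partials {a, b} into DOTOMP_REDUCTION_RED_LIST, and calls
// __kmpc_nvptx_teams_reduce_nowait_v2 with the 2048-record buffer and the six
// helpers (shuffle/inter-warp for intra-team, list_to_global/global_to_list for
// inter-team). Only when that call returns 1, i.e. this team owns the final
// result, does it apply the user combiners, roughly:
//
//   *a |= a_private;                       // reduction(|: a)
//   *b = *b > b_private ? *b : b_private;  // reduction(max: b), via sext i16
//
// and then calls __kmpc_nvptx_end_reduce_nowait.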
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__10
// CHECK3-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4
// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2
// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
// CHECK3-NEXT: store i32 [[OR]], i32* [[A1]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK3: cond.true:
// CHECK3-NEXT: br label [[COND_END:%.*]]
// CHECK3: cond.false:
// CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
// CHECK3-NEXT: br label [[COND_END]]
// CHECK3: cond.end:
// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
// CHECK3-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
// CHECK3-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK3-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
// CHECK3-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
// CHECK3-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3: .omp.reduction.then:
// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
// CHECK3-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
// CHECK3-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
// CHECK3-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK3-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
// CHECK3-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
// CHECK3-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
// CHECK3: cond.true9:
// CHECK3-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK3-NEXT: br label [[COND_END11:%.*]]
// CHECK3: cond.false10:
// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
// CHECK3-NEXT: br label [[COND_END11]]
// CHECK3: cond.end11:
// CHECK3-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
// CHECK3-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK3: .omp.reduction.done:
// CHECK3-NEXT: ret void
//
//
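// NOTE (editorial, not a FileCheck directive): __omp_outlined__10 is the nested
// '#pragma omp parallel' body. Each thread computes a |= 1 and b = max(b, 99)
// on its privates, then calls __kmpc_nvptx_parallel_reduce_nowait_v2 with 2
// reduction elements and an 8-byte reduce-data size (i32 plus i16, padded) and
// the shuffle/inter-warp pair func12/func13. As in the teams case, only the
// winning thread (return value 1) merges its privates into the enclosing
// team's reduction variables.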
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
// CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK3-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
// CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
// CHECK3-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
// CHECK3-NEXT: [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP11]], align 4
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK3-NEXT: [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
// CHECK3-NEXT: [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
// CHECK3-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
// CHECK3-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
// CHECK3-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
// CHECK3-NEXT: store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
// CHECK3-NEXT: [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
// CHECK3-NEXT: store i8* [[TMP36]], i8** [[TMP24]], align 4
// CHECK3-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK3-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK3-NEXT: [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK3-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK3-NEXT: [[TMP42:%.*]] = and i16 [[TMP6]], 1
// CHECK3-NEXT: [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
// CHECK3-NEXT: [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK3-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
// CHECK3-NEXT: [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
// CHECK3-NEXT: [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
// CHECK3-NEXT: br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK3: then:
// CHECK3-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
// CHECK3-NEXT: [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
// CHECK3-NEXT: br label [[IFCONT:%.*]]
// CHECK3: else:
// CHECK3-NEXT: br label [[IFCONT]]
// CHECK3: ifcont:
// CHECK3-NEXT: [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK3-NEXT: [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK3-NEXT: [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
// CHECK3-NEXT: br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
// CHECK3: then5:
// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK3-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
// CHECK3-NEXT: [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
// CHECK3-NEXT: [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
// CHECK3-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
// CHECK3-NEXT: store i32 [[TMP60]], i32* [[TMP59]], align 4
// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
// CHECK3-NEXT: [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK3-NEXT: [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
// CHECK3-NEXT: [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
// CHECK3-NEXT: [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
// CHECK3-NEXT: [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
// CHECK3-NEXT: store i16 [[TMP67]], i16* [[TMP66]], align 2
// CHECK3-NEXT: br label [[IFCONT7:%.*]]
// CHECK3: else6:
// CHECK3-NEXT: br label [[IFCONT7]]
// CHECK3: ifcont7:
// CHECK3-NEXT: ret void
//
//
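// NOTE (editorial, not a FileCheck directive): _omp_reduction_shuffle_and_reduce_func12
// performs one step of the warp-level tree reduction. It fetches the remote
// lane's {i32, i16} element pair with __kmpc_shuffle_int32 (the i16 value is
// sign-extended to 32 bits for the shuffle and truncated back), then the i1
// chain decides whether this lane reduces, according to the algorithm selector
// [[TMP8]]:
//   0: every lane reduces;
//   1: lanes with lane_id < remote_offset reduce (tree halving);
//   2: even lane_ids reduce while the offset is positive (final gather).
// In mode 1, lanes that did not reduce instead copy the remote element over
// their own (the then5 block), which keeps the surviving partial moving toward
// lane 0.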
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK3: then:
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
// CHECK3-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
// CHECK3-NEXT: br label [[IFCONT:%.*]]
// CHECK3: else:
// CHECK3-NEXT: br label [[IFCONT]]
// CHECK3: ifcont:
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK3: then2:
// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
// CHECK3-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
// CHECK3-NEXT: [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
// CHECK3-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 4
// CHECK3-NEXT: br label [[IFCONT4:%.*]]
// CHECK3: else3:
// CHECK3-NEXT: br label [[IFCONT4]]
// CHECK3: ifcont4:
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK3-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK3-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
// CHECK3: then6:
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
// CHECK3-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
// CHECK3-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK3-NEXT: [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
// CHECK3-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
// CHECK3-NEXT: store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
// CHECK3-NEXT: br label [[IFCONT8:%.*]]
// CHECK3: else7:
// CHECK3-NEXT: br label [[IFCONT8]]
// CHECK3: ifcont8:
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
// CHECK3: then10:
// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
// CHECK3-NEXT: [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
// CHECK3-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
// CHECK3-NEXT: [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
// CHECK3-NEXT: store i16 [[TMP31]], i16* [[TMP30]], align 2
// CHECK3-NEXT: br label [[IFCONT12:%.*]]
// CHECK3: else11:
// CHECK3-NEXT: br label [[IFCONT12]]
// CHECK3: ifcont12:
// CHECK3-NEXT: ret void
//
//
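// NOTE (editorial, not a FileCheck directive): _omp_reduction_inter_warp_copy_func13
// repeats the shared-memory staging of func4 for the {i32 a, i16 b} pair: one
// barrier-delimited round per element, where warp masters write into the
// 32-slot transfer array and the first [[TMP1]] threads read the slots back.
// The i16 element reuses an i32 slot through a bitcast to i16 addrspace(3)*,
// so the same [32 x i32] buffer serves both element types.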
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
// CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK3-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
// CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
// CHECK3-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
// CHECK3-NEXT: [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP11]], align 4
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK3-NEXT: [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
// CHECK3-NEXT: [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
// CHECK3-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
// CHECK3-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
// CHECK3-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
// CHECK3-NEXT: store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
// CHECK3-NEXT: [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
// CHECK3-NEXT: store i8* [[TMP36]], i8** [[TMP24]], align 4
// CHECK3-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK3-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK3-NEXT: [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
// CHECK3-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
// CHECK3-NEXT: [[TMP42:%.*]] = and i16 [[TMP6]], 1
// CHECK3-NEXT: [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
// CHECK3-NEXT: [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
// CHECK3-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
// CHECK3-NEXT: [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
// CHECK3-NEXT: [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
// CHECK3-NEXT: br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK3: then:
// CHECK3-NEXT: [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
// CHECK3-NEXT: [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
// CHECK3-NEXT: br label [[IFCONT:%.*]]
// CHECK3: else:
// CHECK3-NEXT: br label [[IFCONT]]
// CHECK3: ifcont:
// CHECK3-NEXT: [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
// CHECK3-NEXT: [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
// CHECK3-NEXT: [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
// CHECK3-NEXT: br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
// CHECK3: then5:
// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
// CHECK3-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
// CHECK3-NEXT: [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
// CHECK3-NEXT: [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
// CHECK3-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
// CHECK3-NEXT: store i32 [[TMP60]], i32* [[TMP59]], align 4
// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
// CHECK3-NEXT: [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
// CHECK3-NEXT: [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
// CHECK3-NEXT: [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
// CHECK3-NEXT: [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
// CHECK3-NEXT: [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
// CHECK3-NEXT: store i16 [[TMP67]], i16* [[TMP66]], align 2
// CHECK3-NEXT: br label [[IFCONT7:%.*]]
// CHECK3: else6:
// CHECK3-NEXT: br label [[IFCONT7]]
// CHECK3: ifcont7:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
// CHECK3: then:
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
// CHECK3-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
// CHECK3-NEXT: br label [[IFCONT:%.*]]
// CHECK3: else:
// CHECK3-NEXT: br label [[IFCONT]]
// CHECK3: ifcont:
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
// CHECK3: then2:
// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
// CHECK3-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
// CHECK3-NEXT: [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
// CHECK3-NEXT: store i32 [[TMP18]], i32* [[TMP17]], align 4
// CHECK3-NEXT: br label [[IFCONT4:%.*]]
// CHECK3: else3:
// CHECK3-NEXT: br label [[IFCONT4]]
// CHECK3: ifcont4:
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK3-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
// CHECK3-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
// CHECK3: then6:
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
// CHECK3-NEXT: [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
// CHECK3-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
// CHECK3-NEXT: [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
// CHECK3-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
// CHECK3-NEXT: store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
// CHECK3-NEXT: br label [[IFCONT8:%.*]]
// CHECK3: else7:
// CHECK3-NEXT: br label [[IFCONT8]]
// CHECK3: ifcont8:
// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
// CHECK3: then10:
// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
// CHECK3-NEXT: [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
// CHECK3-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
// CHECK3-NEXT: [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
// CHECK3-NEXT: store i16 [[TMP31]], i16* [[TMP30]], align 2
// CHECK3-NEXT: br label [[IFCONT12:%.*]]
// CHECK3: else11:
// CHECK3-NEXT: br label [[IFCONT12]]
// CHECK3: ifcont12:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
// CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
// CHECK3-NEXT: ret void
//