// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK2
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK3
// The invocations above pin -fopenmp-version=45; the ones below repeat the same host/device pairs without that flag.
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK4
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK5
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK6
// Layout note: the kernel names matched below embed the source lines of the target pragmas (_l30, _l35), so no line may be added or removed above them; explanatory comments sit only on formerly blank lines.
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
// ftemplate<tx>(n): runs three `target parallel` regions over the locals a (tx), aa (short) and b (tx[10]), then returns a.
template<typename tx>
tx ftemplate(int n) {
  tx a = 0;
  short aa = 0;
  tx b[10]; // never initialised before use; this test only inspects the emitted IR
  // Region 1 (line 25): if(target: 0); the device assertions visible in this file contain no kernel for this line.
  #pragma omp target parallel if(target: 0)
  {
    a += 1;
  }
  // Region 2 (line 30): maps aa tofrom and increments it; matched below as the ..._l30 kernel.
  #pragma omp target parallel map(tofrom: aa)
  {
    aa += 1;
  }
  // Region 3 (line 35): maps a, aa and b tofrom with if(target: n>40); matched below as the ..._l35 kernel.
  #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40)
  {
    a += 1;
    aa += 1;
    b[2] += 1;
  }

  return a;
}
// bar(n): instantiates ftemplate<int> (the only instantiation in this file) and returns its result accumulated into a zero-initialised local.
int bar(int n){
  int a = 0;

  a += ftemplate<int>(n);

  return a;
}

#endif
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
// CHECK1-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16
1) 63 // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] 64 // CHECK1: .execute: 65 // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 66 // CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 67 // CHECK1-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* 68 // CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 69 // CHECK1-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 70 // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i64 1) 71 // CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] 72 // CHECK1: .omp.deinit: 73 // CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 74 // CHECK1-NEXT: br label [[DOTEXIT:%.*]] 75 // CHECK1: .exit: 76 // CHECK1-NEXT: ret void 77 // 78 // 79 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ 80 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { 81 // CHECK1-NEXT: entry: 82 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 83 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 84 // CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 85 // CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 86 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 87 // CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 88 // CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 89 // CHECK1-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 90 // CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 91 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 92 // CHECK1-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 93 // CHECK1-NEXT: store i16 [[CONV1]], 
i16* [[TMP0]], align 2 94 // CHECK1-NEXT: ret void 95 // 96 // 97 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 98 // CHECK1-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 99 // CHECK1-NEXT: entry: 100 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 101 // CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 102 // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 103 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 104 // CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 105 // CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 106 // CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 107 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 108 // CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 109 // CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 110 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 111 // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 112 // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] 113 // CHECK1: .execute: 114 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 115 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 116 // CHECK1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* 117 // CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 118 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 119 // CHECK1-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* 120 // CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 121 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x 
i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 122 // CHECK1-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* 123 // CHECK1-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8 124 // CHECK1-NEXT: [[TMP10:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 125 // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP10]], i64 3) 126 // CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] 127 // CHECK1: .omp.deinit: 128 // CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 129 // CHECK1-NEXT: br label [[DOTEXIT:%.*]] 130 // CHECK1: .exit: 131 // CHECK1-NEXT: ret void 132 // 133 // 134 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 135 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 136 // CHECK1-NEXT: entry: 137 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 138 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 139 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 140 // CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 141 // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 142 // CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 143 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 144 // CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 145 // CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 146 // CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 147 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 148 // CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 149 // CHECK1-NEXT: [[TMP2:%.*]] = 
load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 150 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 151 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 152 // CHECK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 153 // CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 154 // CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 155 // CHECK1-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 156 // CHECK1-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 157 // CHECK1-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 158 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i64 0, i64 2 159 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 160 // CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 161 // CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 162 // CHECK1-NEXT: ret void 163 // 164 // 165 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 166 // CHECK2-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { 167 // CHECK2-NEXT: entry: 168 // CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 169 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 170 // CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 171 // CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 172 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 173 // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 174 // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] 175 // CHECK2: .execute: 176 // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 177 // CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 178 // CHECK2-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* 179 // CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 
4 180 // CHECK2-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 181 // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i32 1) 182 // CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] 183 // CHECK2: .omp.deinit: 184 // CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 185 // CHECK2-NEXT: br label [[DOTEXIT:%.*]] 186 // CHECK2: .exit: 187 // CHECK2-NEXT: ret void 188 // 189 // 190 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ 191 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { 192 // CHECK2-NEXT: entry: 193 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 194 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 195 // CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 196 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 197 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 198 // CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 199 // CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 200 // CHECK2-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 201 // CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 202 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 203 // CHECK2-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 204 // CHECK2-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 205 // CHECK2-NEXT: ret void 206 // 207 // 208 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 209 // CHECK2-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 210 // CHECK2-NEXT: entry: 211 
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 212 // CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 213 // CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 214 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 215 // CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 216 // CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 217 // CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 218 // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 219 // CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 220 // CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 221 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 222 // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 223 // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] 224 // CHECK2: .execute: 225 // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 226 // CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 227 // CHECK2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* 228 // CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 229 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 230 // CHECK2-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* 231 // CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 232 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 233 // CHECK2-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* 234 // CHECK2-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4 235 // CHECK2-NEXT: [[TMP10:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 236 // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* 
bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP10]], i32 3) 237 // CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] 238 // CHECK2: .omp.deinit: 239 // CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 240 // CHECK2-NEXT: br label [[DOTEXIT:%.*]] 241 // CHECK2: .exit: 242 // CHECK2-NEXT: ret void 243 // 244 // 245 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 246 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 247 // CHECK2-NEXT: entry: 248 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 249 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 250 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 251 // CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 252 // CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 253 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 254 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 255 // CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 256 // CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 257 // CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 258 // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 259 // CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 260 // CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 261 // CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 262 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 263 // CHECK2-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 264 // CHECK2-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 265 // CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 266 // CHECK2-NEXT: 
[[ADD1:%.*]] = add nsw i32 [[CONV]], 1 267 // CHECK2-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 268 // CHECK2-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 269 // CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2 270 // CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 271 // CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 272 // CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 273 // CHECK2-NEXT: ret void 274 // 275 // 276 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 277 // CHECK3-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { 278 // CHECK3-NEXT: entry: 279 // CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 280 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 281 // CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 282 // CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 283 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 284 // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 285 // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] 286 // CHECK3: .execute: 287 // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 288 // CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 289 // CHECK3-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* 290 // CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 291 // CHECK3-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 292 // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i32 1) 293 // CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] 294 // CHECK3: .omp.deinit: 295 // 
CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 296 // CHECK3-NEXT: br label [[DOTEXIT:%.*]] 297 // CHECK3: .exit: 298 // CHECK3-NEXT: ret void 299 // 300 // 301 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ 302 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { 303 // CHECK3-NEXT: entry: 304 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 305 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 306 // CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 307 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 308 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 309 // CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 310 // CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 311 // CHECK3-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 312 // CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 313 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 314 // CHECK3-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 315 // CHECK3-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 316 // CHECK3-NEXT: ret void 317 // 318 // 319 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 320 // CHECK3-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 321 // CHECK3-NEXT: entry: 322 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 323 // CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 324 // CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 325 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 326 // CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 327 // CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 328 
// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 329 // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 330 // CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 331 // CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 332 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 333 // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 334 // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] 335 // CHECK3: .execute: 336 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 337 // CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 338 // CHECK3-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* 339 // CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 340 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 341 // CHECK3-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* 342 // CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 343 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 344 // CHECK3-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* 345 // CHECK3-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4 346 // CHECK3-NEXT: [[TMP10:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 347 // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP10]], i32 3) 348 // CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] 349 // CHECK3: .omp.deinit: 350 // CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 351 // CHECK3-NEXT: br label [[DOTEXIT:%.*]] 352 // CHECK3: .exit: 353 // CHECK3-NEXT: ret void 354 // 355 // 356 // CHECK3-LABEL: 
define {{[^@]+}}@__omp_outlined__1 357 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 358 // CHECK3-NEXT: entry: 359 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 360 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 361 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 362 // CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 363 // CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 364 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 365 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 366 // CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 367 // CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 368 // CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 369 // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 370 // CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 371 // CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 372 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 373 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 374 // CHECK3-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 375 // CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 376 // CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 377 // CHECK3-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 378 // CHECK3-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 379 // CHECK3-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 380 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2 381 // CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 382 // CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 
[[TMP5]], 1 383 // CHECK3-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 384 // CHECK3-NEXT: ret void 385 // 386 // 387 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 388 // CHECK4-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { 389 // CHECK4-NEXT: entry: 390 // CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 391 // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 392 // CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 393 // CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 394 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 395 // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 396 // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] 397 // CHECK4: .execute: 398 // CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 399 // CHECK4-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 400 // CHECK4-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* 401 // CHECK4-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 402 // CHECK4-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 403 // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i64 1) 404 // CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] 405 // CHECK4: .omp.deinit: 406 // CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 407 // CHECK4-NEXT: br label [[DOTEXIT:%.*]] 408 // CHECK4: .exit: 409 // CHECK4-NEXT: ret void 410 // 411 // 412 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ 413 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { 414 // 
CHECK4-NEXT: entry: 415 // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 416 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 417 // CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 418 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 419 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 420 // CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 421 // CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 422 // CHECK4-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 423 // CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 424 // CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 425 // CHECK4-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 426 // CHECK4-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 427 // CHECK4-NEXT: ret void 428 // 429 // 430 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 431 // CHECK4-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 432 // CHECK4-NEXT: entry: 433 // CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 434 // CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 435 // CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 436 // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 437 // CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 438 // CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 439 // CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 440 // CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 441 // CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 442 // CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 443 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.ntid.x() 444 // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 445 // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] 446 // CHECK4: .execute: 447 // CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 448 // CHECK4-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 449 // CHECK4-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* 450 // CHECK4-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 451 // CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 452 // CHECK4-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* 453 // CHECK4-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 454 // CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 455 // CHECK4-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* 456 // CHECK4-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8 457 // CHECK4-NEXT: [[TMP10:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 458 // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP10]], i64 3) 459 // CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] 460 // CHECK4: .omp.deinit: 461 // CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 462 // CHECK4-NEXT: br label [[DOTEXIT:%.*]] 463 // CHECK4: .exit: 464 // CHECK4-NEXT: ret void 465 // 466 // 467 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 468 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 469 // CHECK4-NEXT: entry: 470 // CHECK4-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 471 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 472 // CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 473 // CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 8 474 // CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 475 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 476 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 477 // CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 478 // CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 479 // CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 480 // CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 481 // CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 482 // CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 483 // CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 484 // CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 485 // CHECK4-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 486 // CHECK4-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 487 // CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 488 // CHECK4-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 489 // CHECK4-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 490 // CHECK4-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 491 // CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i64 0, i64 2 492 // CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 493 // CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 494 // CHECK4-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 495 // CHECK4-NEXT: ret void 496 // 497 // 498 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 499 // CHECK5-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { 500 // CHECK5-NEXT: entry: 501 // 
CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 502 // CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 503 // CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 504 // CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 505 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 506 // CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 507 // CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] 508 // CHECK5: .execute: 509 // CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 510 // CHECK5-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 511 // CHECK5-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* 512 // CHECK5-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 513 // CHECK5-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 514 // CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i32 1) 515 // CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] 516 // CHECK5: .omp.deinit: 517 // CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 518 // CHECK5-NEXT: br label [[DOTEXIT:%.*]] 519 // CHECK5: .exit: 520 // CHECK5-NEXT: ret void 521 // 522 // 523 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ 524 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { 525 // CHECK5-NEXT: entry: 526 // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 527 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 528 // CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 529 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 530 // CHECK5-NEXT: store i32* 
[[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 531 // CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 532 // CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 533 // CHECK5-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 534 // CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 535 // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1 536 // CHECK5-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 537 // CHECK5-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 538 // CHECK5-NEXT: ret void 539 // 540 // 541 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 542 // CHECK5-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 543 // CHECK5-NEXT: entry: 544 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 545 // CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 546 // CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 547 // CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 548 // CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 549 // CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 550 // CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 551 // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 552 // CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 553 // CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 554 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 555 // CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 556 // CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] 557 // CHECK5: .execute: 558 // CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 559 // CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x 
i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 560 // CHECK5-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* 561 // CHECK5-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 562 // CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 563 // CHECK5-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* 564 // CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 565 // CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 566 // CHECK5-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* 567 // CHECK5-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4 568 // CHECK5-NEXT: [[TMP10:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 569 // CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP10]], i32 3) 570 // CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] 571 // CHECK5: .omp.deinit: 572 // CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 573 // CHECK5-NEXT: br label [[DOTEXIT:%.*]] 574 // CHECK5: .exit: 575 // CHECK5-NEXT: ret void 576 // 577 // 578 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 579 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 580 // CHECK5-NEXT: entry: 581 // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 582 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 583 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 584 // CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 585 // CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 586 // CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** 
[[DOTGLOBAL_TID__ADDR]], align 4 587 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 588 // CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 589 // CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 590 // CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 591 // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 592 // CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 593 // CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 594 // CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 595 // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 596 // CHECK5-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 597 // CHECK5-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 598 // CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 599 // CHECK5-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 600 // CHECK5-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 601 // CHECK5-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 602 // CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2 603 // CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 604 // CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 605 // CHECK5-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 606 // CHECK5-NEXT: ret void 607 // 608 // 609 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30 610 // CHECK6-SAME: (i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] { 611 // CHECK6-NEXT: entry: 612 // CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 613 // CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 614 // CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 615 // CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 616 // CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 
617 // CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 618 // CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] 619 // CHECK6: .execute: 620 // CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) 621 // CHECK6-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 622 // CHECK6-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP0]] to i8* 623 // CHECK6-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 624 // CHECK6-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 625 // CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i32 1) 626 // CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] 627 // CHECK6: .omp.deinit: 628 // CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 629 // CHECK6-NEXT: br label [[DOTEXIT:%.*]] 630 // CHECK6: .exit: 631 // CHECK6-NEXT: ret void 632 // 633 // 634 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ 635 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0]] { 636 // CHECK6-NEXT: entry: 637 // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 638 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 639 // CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 640 // CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 641 // CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 642 // CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 643 // CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 644 // CHECK6-NEXT: [[TMP1:%.*]] = load i16, i16* [[TMP0]], align 2 645 // CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 646 // CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 
[[CONV]], 1 647 // CHECK6-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 648 // CHECK6-NEXT: store i16 [[CONV1]], i16* [[TMP0]], align 2 649 // CHECK6-NEXT: ret void 650 // 651 // 652 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35 653 // CHECK6-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 654 // CHECK6-NEXT: entry: 655 // CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 656 // CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 657 // CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 658 // CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 659 // CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 660 // CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 661 // CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 662 // CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 663 // CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 664 // CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 665 // CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 666 // CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) 667 // CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] 668 // CHECK6: .execute: 669 // CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) 670 // CHECK6-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 671 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* 672 // CHECK6-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 673 // CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 674 // CHECK6-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* 675 // 
CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 676 // CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 677 // CHECK6-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* 678 // CHECK6-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4 679 // CHECK6-NEXT: [[TMP10:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** 680 // CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP10]], i32 3) 681 // CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] 682 // CHECK6: .omp.deinit: 683 // CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) 684 // CHECK6-NEXT: br label [[DOTEXIT:%.*]] 685 // CHECK6: .exit: 686 // CHECK6-NEXT: ret void 687 // 688 // 689 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 690 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { 691 // CHECK6-NEXT: entry: 692 // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 693 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 694 // CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 695 // CHECK6-NEXT: [[AA_ADDR:%.*]] = alloca i16*, align 4 696 // CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 697 // CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 698 // CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 699 // CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 700 // CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 701 // CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 702 // CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** 
[[A_ADDR]], align 4 703 // CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 704 // CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 705 // CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP0]], align 4 706 // CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1 707 // CHECK6-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4 708 // CHECK6-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP1]], align 2 709 // CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32 710 // CHECK6-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1 711 // CHECK6-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16 712 // CHECK6-NEXT: store i16 [[CONV2]], i16* [[TMP1]], align 2 713 // CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP2]], i32 0, i32 2 714 // CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 715 // CHECK6-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1 716 // CHECK6-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 717 // CHECK6-NEXT: ret void 718 // 719